aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore3
-rw-r--r--Documentation/00-INDEX2
-rw-r--r--Documentation/ABI/obsolete/sysfs-block-zram119
-rw-r--r--Documentation/ABI/testing/procfs-concurrent_time16
-rw-r--r--Documentation/ABI/testing/procfs-smaps_rollup31
-rw-r--r--Documentation/ABI/testing/sysfs-block-zram109
-rw-r--r--Documentation/ABI/testing/sysfs-class-dual-role-usb71
-rw-r--r--Documentation/ABI/testing/sysfs-fs-f2fs126
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-wakeup_reasons16
-rw-r--r--Documentation/android.txt121
-rw-r--r--Documentation/block/00-INDEX6
-rw-r--r--Documentation/block/mmc-max-speed.txt38
-rw-r--r--Documentation/blockdev/zram.txt110
-rw-r--r--Documentation/cpu-freq/governors.txt86
-rw-r--r--Documentation/device-mapper/boot.txt42
-rw-r--r--Documentation/device-mapper/dm-crypt.txt14
-rw-r--r--Documentation/device-mapper/verity.txt11
-rw-r--r--Documentation/devicetree/bindings/arm/firmware/linaro,optee-tz.txt31
-rw-r--r--Documentation/devicetree/bindings/misc/memory-state-time.txt8
-rw-r--r--Documentation/devicetree/bindings/vendor-prefixes.txt1
-rw-r--r--Documentation/filesystems/f2fs.txt68
-rw-r--r--Documentation/filesystems/fscrypt.rst641
-rw-r--r--Documentation/filesystems/proc.txt6
-rw-r--r--Documentation/gpu/drm-kms.rst6
-rw-r--r--Documentation/input/event-codes.txt5
-rw-r--r--Documentation/ioctl/ioctl-number.txt1
-rw-r--r--Documentation/kernel-parameters.txt20
-rw-r--r--Documentation/networking/ip-sysctl.txt23
-rw-r--r--Documentation/scheduler/sched-energy.txt362
-rw-r--r--Documentation/scheduler/sched-tune.txt366
-rw-r--r--Documentation/security/keys.txt17
-rw-r--r--Documentation/sync.txt75
-rw-r--r--Documentation/sysctl/kernel.txt4
-rw-r--r--Documentation/sysctl/vm.txt16
-rw-r--r--Documentation/tee.txt127
-rw-r--r--Documentation/trace/events-power.txt1
-rw-r--r--Documentation/trace/ftrace.txt49
-rw-r--r--MAINTAINERS13
-rw-r--r--Makefile95
-rw-r--r--arch/Kconfig69
-rw-r--r--arch/alpha/include/uapi/asm/socket.h2
-rw-r--r--arch/arm/Kconfig24
-rw-r--r--arch/arm/Kconfig.debug8
-rw-r--r--arch/arm/Makefile5
-rw-r--r--arch/arm/boot/.gitignore1
-rw-r--r--arch/arm/boot/Makefile13
-rw-r--r--arch/arm/boot/compressed/head.S2
-rw-r--r--arch/arm/boot/dts/Makefile12
-rw-r--r--arch/arm/common/Kconfig4
-rw-r--r--arch/arm/common/Makefile1
-rw-r--r--arch/arm/common/fiq_glue.S118
-rw-r--r--arch/arm/common/fiq_glue_setup.c147
-rw-r--r--arch/arm/configs/ranchu_defconfig316
-rw-r--r--arch/arm/crypto/Kconfig36
-rw-r--r--arch/arm/crypto/Makefile8
-rw-r--r--arch/arm/crypto/aes-cipher-core.S264
-rw-r--r--arch/arm/crypto/aes-cipher-glue.c74
-rw-r--r--arch/arm/crypto/aes_glue.c98
-rw-r--r--arch/arm/crypto/chacha-neon-core.S560
-rw-r--r--arch/arm/crypto/chacha-neon-glue.c226
-rw-r--r--arch/arm/crypto/nh-neon-core.S116
-rw-r--r--arch/arm/crypto/nhpoly1305-neon-glue.c77
-rw-r--r--arch/arm/include/asm/elf.h8
-rw-r--r--arch/arm/include/asm/fiq_glue.h33
-rw-r--r--arch/arm/include/asm/topology.h9
-rw-r--r--arch/arm/include/asm/traps.h7
-rw-r--r--arch/arm/kernel/entry-common.S10
-rw-r--r--arch/arm/kernel/kgdb.c4
-rw-r--r--arch/arm/kernel/process.c75
-rw-r--r--arch/arm/kernel/reboot.c30
-rw-r--r--arch/arm/kernel/signal.c7
-rw-r--r--arch/arm/kernel/topology.c143
-rw-r--r--arch/arm/mm/cache-v6.S17
-rw-r--r--arch/arm/mm/fault.c4
-rw-r--r--arch/arm64/Kconfig67
-rw-r--r--arch/arm64/Makefile35
-rw-r--r--arch/arm64/boot/.gitignore2
-rw-r--r--arch/arm64/boot/Makefile16
-rw-r--r--arch/arm64/boot/dts/Makefile14
-rw-r--r--arch/arm64/configs/cuttlefish_defconfig435
-rw-r--r--arch/arm64/configs/ranchu64_defconfig312
-rw-r--r--arch/arm64/crypto/Makefile3
-rw-r--r--arch/arm64/crypto/aes-ce-cipher-core.c (renamed from arch/arm64/crypto/aes-ce-cipher.c)57
-rw-r--r--arch/arm64/crypto/aes-ce-cipher-glue.c83
-rw-r--r--arch/arm64/crypto/sha1-ce-glue.c8
-rw-r--r--arch/arm64/crypto/sha2-ce-glue.c8
-rw-r--r--arch/arm64/include/asm/Kbuild1
-rw-r--r--arch/arm64/include/asm/arch_gicv3.h66
-rw-r--r--arch/arm64/include/asm/assembler.h16
-rw-r--r--arch/arm64/include/asm/cpufeature.h6
-rw-r--r--arch/arm64/include/asm/current.h30
-rw-r--r--arch/arm64/include/asm/efi.h28
-rw-r--r--arch/arm64/include/asm/elf.h2
-rw-r--r--arch/arm64/include/asm/esr.h6
-rw-r--r--arch/arm64/include/asm/futex.h17
-rw-r--r--arch/arm64/include/asm/hw_breakpoint.h6
-rw-r--r--arch/arm64/include/asm/kernel-pgtable.h7
-rw-r--r--arch/arm64/include/asm/kvm_hyp.h8
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h4
-rw-r--r--arch/arm64/include/asm/memory.h6
-rw-r--r--arch/arm64/include/asm/mmu.h2
-rw-r--r--arch/arm64/include/asm/mmu_context.h63
-rw-r--r--arch/arm64/include/asm/percpu.h1
-rw-r--r--arch/arm64/include/asm/perf_event.h2
-rw-r--r--arch/arm64/include/asm/pgtable.h2
-rw-r--r--arch/arm64/include/asm/signal32.h2
-rw-r--r--arch/arm64/include/asm/smp.h14
-rw-r--r--arch/arm64/include/asm/stack_pointer.h9
-rw-r--r--arch/arm64/include/asm/suspend.h2
-rw-r--r--arch/arm64/include/asm/sysreg.h59
-rw-r--r--arch/arm64/include/asm/thread_info.h40
-rw-r--r--arch/arm64/include/asm/topology.h11
-rw-r--r--arch/arm64/include/asm/traps.h7
-rw-r--r--arch/arm64/include/asm/uaccess.h206
-rw-r--r--arch/arm64/include/uapi/asm/hwcap.h5
-rw-r--r--arch/arm64/kernel/acpi_parking_protocol.c3
-rw-r--r--arch/arm64/kernel/armv8_deprecated.c11
-rw-r--r--arch/arm64/kernel/asm-offsets.c13
-rw-r--r--arch/arm64/kernel/cpu-reset.h2
-rw-r--r--arch/arm64/kernel/cpufeature.c15
-rw-r--r--arch/arm64/kernel/cpuinfo.c11
-rw-r--r--arch/arm64/kernel/entry.S126
-rw-r--r--arch/arm64/kernel/head.S17
-rw-r--r--arch/arm64/kernel/hibernate.c20
-rw-r--r--arch/arm64/kernel/hw_breakpoint.c153
-rw-r--r--arch/arm64/kernel/insn.c2
-rw-r--r--arch/arm64/kernel/io.c12
-rw-r--r--arch/arm64/kernel/module.lds2
-rw-r--r--arch/arm64/kernel/process.c102
-rw-r--r--arch/arm64/kernel/psci.c3
-rw-r--r--arch/arm64/kernel/ptrace.c7
-rw-r--r--arch/arm64/kernel/return_address.c1
-rw-r--r--arch/arm64/kernel/setup.c26
-rw-r--r--arch/arm64/kernel/signal.c5
-rw-r--r--arch/arm64/kernel/sleep.S3
-rw-r--r--arch/arm64/kernel/smp.c14
-rw-r--r--arch/arm64/kernel/smp_spin_table.c3
-rw-r--r--arch/arm64/kernel/stacktrace.c7
-rw-r--r--arch/arm64/kernel/suspend.c6
-rw-r--r--arch/arm64/kernel/topology.c80
-rw-r--r--arch/arm64/kernel/traps.c60
-rw-r--r--arch/arm64/kernel/vdso.c14
-rw-r--r--arch/arm64/kernel/vdso/Makefile1
-rw-r--r--arch/arm64/kernel/vdso/gettimeofday.S2
-rw-r--r--arch/arm64/kernel/vmlinux.lds.S11
-rw-r--r--arch/arm64/kvm/hyp/Makefile4
-rw-r--r--arch/arm64/lib/clear_user.S11
-rw-r--r--arch/arm64/lib/copy_from_user.S11
-rw-r--r--arch/arm64/lib/copy_in_user.S11
-rw-r--r--arch/arm64/lib/copy_to_user.S11
-rw-r--r--arch/arm64/mm/cache.S6
-rw-r--r--arch/arm64/mm/context.c7
-rw-r--r--arch/arm64/mm/dma-mapping.c2
-rw-r--r--arch/arm64/mm/fault.c14
-rw-r--r--arch/arm64/mm/init.c12
-rw-r--r--arch/arm64/mm/kasan_init.c22
-rw-r--r--arch/arm64/mm/mmu.c33
-rw-r--r--arch/arm64/mm/proc.S9
-rw-r--r--arch/arm64/xen/hypercall.S15
-rw-r--r--arch/avr32/include/uapi/asm/socket.h2
-rw-r--r--arch/frv/include/uapi/asm/socket.h2
-rw-r--r--arch/ia64/include/uapi/asm/socket.h2
-rw-r--r--arch/m32r/include/uapi/asm/socket.h2
-rw-r--r--arch/mips/include/uapi/asm/socket.h2
-rw-r--r--arch/mn10300/include/uapi/asm/socket.h2
-rw-r--r--arch/parisc/include/uapi/asm/socket.h2
-rw-r--r--arch/powerpc/include/uapi/asm/socket.h2
-rw-r--r--arch/powerpc/platforms/pseries/cmm.c2
-rw-r--r--arch/s390/include/uapi/asm/socket.h2
-rw-r--r--arch/sparc/include/uapi/asm/socket.h2
-rw-r--r--arch/x86/Makefile2
-rw-r--r--arch/x86/configs/i386_ranchu_defconfig424
-rw-r--r--arch/x86/configs/x86_64_cuttlefish_defconfig478
-rw-r--r--arch/x86/configs/x86_64_ranchu_defconfig419
-rw-r--r--arch/x86/crypto/chacha20_glue.c50
-rw-r--r--arch/x86/crypto/poly1305_glue.c20
-rw-r--r--arch/x86/entry/common.c3
-rw-r--r--arch/x86/entry/entry_64.S9
-rw-r--r--arch/x86/entry/vdso/Makefile3
-rw-r--r--arch/x86/include/asm/alternative.h3
-rw-r--r--arch/x86/include/asm/idle.h7
-rw-r--r--arch/x86/include/asm/paravirt_types.h14
-rw-r--r--arch/x86/include/asm/preempt.h15
-rw-r--r--arch/x86/include/asm/rwsem.h4
-rw-r--r--arch/x86/include/asm/thread_info.h4
-rw-r--r--arch/x86/include/asm/uaccess.h12
-rw-r--r--arch/x86/include/asm/xen/hypercall.h5
-rw-r--r--arch/x86/kernel/process.c17
-rw-r--r--arch/x86/kvm/vmx.c3
-rw-r--r--arch/x86/mm/fault.c3
-rw-r--r--arch/x86/oprofile/nmi_int.c2
-rw-r--r--arch/xtensa/include/uapi/asm/socket.h2
-rw-r--r--block/blk-core.c51
-rw-r--r--build.config.cuttlefish.aarch6416
-rw-r--r--build.config.cuttlefish.x86_6416
-rw-r--r--build.config.goldfish.arm13
-rw-r--r--build.config.goldfish.arm6413
-rw-r--r--build.config.goldfish.mips12
-rw-r--r--build.config.goldfish.mips6412
-rw-r--r--build.config.goldfish.x8613
-rw-r--r--build.config.goldfish.x86_6413
-rw-r--r--certs/system_keyring.c43
-rw-r--r--crypto/Kconfig55
-rw-r--r--crypto/Makefile5
-rw-r--r--crypto/adiantum.c658
-rw-r--r--crypto/aes_generic.c9
-rw-r--r--crypto/api.c13
-rw-r--r--crypto/chacha20_generic.c151
-rw-r--r--crypto/chacha20poly1305.c10
-rw-r--r--crypto/chacha_generic.c243
-rw-r--r--crypto/nhpoly1305.c254
-rw-r--r--crypto/poly1305_generic.c191
-rw-r--r--crypto/tcrypt.c10
-rw-r--r--crypto/testmgr.c87
-rw-r--r--crypto/testmgr.h3415
-rw-r--r--crypto/zstd.c209
-rw-r--r--drivers/Kconfig2
-rw-r--r--drivers/Makefile1
-rw-r--r--drivers/acpi/button.c6
-rw-r--r--drivers/acpi/ec.c6
-rw-r--r--drivers/acpi/sysfs.c8
-rw-r--r--drivers/android/Kconfig29
-rw-r--r--drivers/android/Makefile3
-rw-r--r--drivers/android/binder.c5047
-rw-r--r--drivers/android/binder_alloc.c1020
-rw-r--r--drivers/android/binder_alloc.h187
-rw-r--r--drivers/android/binder_alloc_selftest.c310
-rw-r--r--drivers/android/binder_trace.h120
-rw-r--r--drivers/base/power/main.c5
-rw-r--r--drivers/base/power/opp/core.c2
-rw-r--r--drivers/base/power/wakeup.c36
-rw-r--r--drivers/base/syscore.c3
-rw-r--r--drivers/block/loop.c26
-rw-r--r--drivers/block/zram/Kconfig24
-rw-r--r--drivers/block/zram/zcomp.c3
-rw-r--r--drivers/block/zram/zram_drv.c1285
-rw-r--r--drivers/block/zram/zram_drv.h55
-rw-r--r--drivers/char/ipmi/ipmi_poweroff.c2
-rw-r--r--drivers/char/ipmi/ipmi_si_intf.c4
-rw-r--r--drivers/char/random.c45
-rw-r--r--drivers/clocksource/Kconfig8
-rw-r--r--drivers/clocksource/arm_arch_timer.c5
-rw-r--r--drivers/cpufreq/Kconfig39
-rw-r--r--drivers/cpufreq/Makefile6
-rw-r--r--drivers/cpufreq/cpufreq-dt.c7
-rw-r--r--drivers/cpufreq/cpufreq.c111
-rw-r--r--drivers/cpufreq/cpufreq_conservative.c5
-rw-r--r--drivers/cpufreq/cpufreq_interactive.c1411
-rw-r--r--drivers/cpufreq/cpufreq_performance.c5
-rw-r--r--drivers/cpufreq/cpufreq_powersave.c5
-rw-r--r--drivers/cpufreq/cpufreq_times.c650
-rw-r--r--drivers/cpufreq/cpufreq_userspace.c5
-rw-r--r--drivers/cpuidle/Kconfig.arm1
-rw-r--r--drivers/cpuidle/cpuidle-arm.c76
-rw-r--r--drivers/cpuidle/cpuidle.c4
-rw-r--r--drivers/cpuidle/governors/menu.c7
-rw-r--r--drivers/dma-buf/fence.c11
-rw-r--r--drivers/dma-buf/reservation.c117
-rw-r--r--drivers/dma-buf/sw_sync.c10
-rw-r--r--drivers/dma-buf/sync_file.c8
-rw-r--r--drivers/edac/edac_mc_sysfs.c2
-rw-r--r--drivers/edac/edac_module.c3
-rw-r--r--drivers/firmware/efi/libstub/Makefile3
-rw-r--r--drivers/firmware/efi/libstub/arm-stub.c24
-rw-r--r--drivers/firmware/efi/libstub/arm64-stub.c4
-rw-r--r--drivers/firmware/efi/libstub/efi-stub-helper.c13
-rw-r--r--drivers/firmware/efi/libstub/efistub.h2
-rw-r--r--drivers/gpu/drm/Kconfig1
-rw-r--r--drivers/gpu/drm/drm_atomic.c328
-rw-r--r--drivers/gpu/drm/drm_atomic_helper.c3
-rw-r--r--drivers/gpu/drm/drm_crtc.c68
-rw-r--r--drivers/gpu/drm/drm_crtc_internal.h2
-rw-r--r--drivers/gpu/drm/drm_fb_cma_helper.c35
-rw-r--r--drivers/gpu/drm/drm_fops.c4
-rw-r--r--drivers/gpu/drm/drm_plane.c1
-rw-r--r--drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c5
-rw-r--r--drivers/gpu/drm/ttm/ttm_bo.c9
-rw-r--r--drivers/hid/hid-magicmouse.c3
-rw-r--r--drivers/hid/hid-sony.c1990
-rw-r--r--drivers/hid/uhid.c17
-rw-r--r--drivers/hwtracing/coresight/coresight-etm-perf.c33
-rw-r--r--drivers/hwtracing/coresight/coresight-etm4x.c20
-rw-r--r--drivers/hwtracing/coresight/coresight-priv.h4
-rw-r--r--drivers/hwtracing/coresight/coresight-tmc-etf.c48
-rw-r--r--drivers/hwtracing/coresight/coresight-tmc-etr.c286
-rw-r--r--drivers/hwtracing/coresight/coresight-tmc.h2
-rw-r--r--drivers/hwtracing/coresight/coresight.c74
-rw-r--r--drivers/ide/ide.c4
-rw-r--r--drivers/infiniband/hw/qib/qib_iba7322.c4
-rw-r--r--drivers/infiniband/ulp/srpt/ib_srpt.c2
-rw-r--r--drivers/input/Kconfig13
-rw-r--r--drivers/input/Makefile2
-rw-r--r--drivers/input/keyboard/goldfish_events.c28
-rw-r--r--drivers/input/keycombo.c261
-rw-r--r--drivers/input/keyreset.c144
-rw-r--r--drivers/input/misc/Kconfig16
-rw-r--r--drivers/input/misc/Makefile2
-rw-r--r--drivers/input/misc/gpio_axis.c192
-rw-r--r--drivers/input/misc/gpio_event.c228
-rw-r--r--drivers/input/misc/gpio_input.c390
-rw-r--r--drivers/input/misc/gpio_matrix.c440
-rw-r--r--drivers/input/misc/gpio_output.c97
-rw-r--r--drivers/input/misc/keychord.c467
-rw-r--r--drivers/isdn/hardware/mISDN/avmfritz.c2
-rw-r--r--drivers/isdn/hardware/mISDN/mISDNinfineon.c2
-rw-r--r--drivers/isdn/hardware/mISDN/netjet.c2
-rw-r--r--drivers/isdn/hardware/mISDN/speedfax.c2
-rw-r--r--drivers/isdn/hardware/mISDN/w6692.c2
-rw-r--r--drivers/md/Kconfig48
-rw-r--r--drivers/md/Makefile5
-rw-r--r--drivers/md/dm-android-verity.c923
-rw-r--r--drivers/md/dm-android-verity.h123
-rw-r--r--drivers/md/dm-crypt.c108
-rw-r--r--drivers/md/dm-ioctl.c39
-rw-r--r--drivers/md/dm-linear.c35
-rw-r--r--drivers/md/dm-table.c11
-rw-r--r--drivers/md/dm-verity-avb.c229
-rw-r--r--drivers/md/dm-verity-fec.c46
-rw-r--r--drivers/md/dm-verity-fec.h4
-rw-r--r--drivers/md/dm-verity-target.c109
-rw-r--r--drivers/md/dm-verity.h12
-rw-r--r--drivers/md/dm.h2
-rw-r--r--drivers/md/md.c6
-rw-r--r--drivers/media/pci/tw686x/tw686x-core.c4
-rw-r--r--drivers/media/usb/uvc/uvc_driver.c4
-rw-r--r--drivers/media/v4l2-core/v4l2-ioctl.c71
-rw-r--r--drivers/message/fusion/mptbase.c4
-rw-r--r--drivers/misc/Kconfig21
-rw-r--r--drivers/misc/Makefile4
-rw-r--r--drivers/misc/kgdbts.c3
-rw-r--r--drivers/misc/lkdtm.h3
-rw-r--r--drivers/misc/lkdtm_bugs.c78
-rw-r--r--drivers/misc/lkdtm_core.c3
-rw-r--r--drivers/misc/memory_state_time.c462
-rw-r--r--drivers/misc/uid_sys_stats.c706
-rw-r--r--drivers/mmc/card/Kconfig12
-rw-r--r--drivers/mmc/card/block.c300
-rw-r--r--drivers/mmc/card/queue.c6
-rw-r--r--drivers/mmc/card/queue.h8
-rw-r--r--drivers/mmc/core/Kconfig15
-rw-r--r--drivers/mmc/core/core.c99
-rw-r--r--drivers/mmc/core/host.c16
-rw-r--r--drivers/mmc/core/host.h5
-rw-r--r--drivers/mmc/core/mmc.c14
-rw-r--r--drivers/mmc/core/sd.c83
-rw-r--r--drivers/mmc/core/sdio.c115
-rw-r--r--drivers/mmc/core/sdio_bus.c13
-rw-r--r--drivers/mmc/core/sdio_io.c33
-rw-r--r--drivers/mtd/devices/block2mtd.c2
-rw-r--r--drivers/mtd/devices/phram.c2
-rw-r--r--drivers/mtd/nand/Kconfig10
-rw-r--r--drivers/mtd/ubi/build.c2
-rw-r--r--drivers/net/ppp/Kconfig17
-rw-r--r--drivers/net/ppp/Makefile2
-rw-r--r--drivers/net/ppp/pppolac.c450
-rw-r--r--drivers/net/ppp/pppopns.c429
-rw-r--r--drivers/net/tun.c6
-rw-r--r--drivers/net/wireless/Kconfig7
-rw-r--r--drivers/net/wireless/Makefile2
-rw-r--r--drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c26
-rw-r--r--drivers/net/wireless/ti/wlcore/init.c5
-rw-r--r--drivers/net/wireless/virt_wifi.c631
-rw-r--r--drivers/nfc/fdp/i2c.c10
-rw-r--r--drivers/nfc/st21nfca/dep.c3
-rw-r--r--drivers/nfc/st21nfca/se.c18
-rw-r--r--drivers/of/fdt.c74
-rw-r--r--drivers/pci/pcie/aspm.c5
-rw-r--r--drivers/perf/arm_pmu.c2
-rw-r--r--drivers/platform/goldfish/Makefile3
-rw-r--r--drivers/platform/goldfish/goldfish_pipe.c273
-rw-r--r--drivers/platform/goldfish/goldfish_pipe.h92
-rw-r--r--drivers/platform/goldfish/goldfish_pipe_v2.c889
-rw-r--r--drivers/platform/x86/thinkpad_acpi.c2
-rw-r--r--drivers/power/supply/power_supply_sysfs.c11
-rw-r--r--drivers/rtc/rtc-palmas.c44
-rw-r--r--drivers/scsi/fcoe/fcoe_transport.c20
-rw-r--r--drivers/scsi/mpt3sas/mpt3sas_base.c2
-rw-r--r--drivers/scsi/mpt3sas/mpt3sas_scsih.c2
-rw-r--r--drivers/scsi/ufs/ufshcd.c88
-rw-r--r--drivers/scsi/ufs/ufshcd.h4
-rw-r--r--drivers/staging/android/Kconfig20
-rw-r--r--drivers/staging/android/Makefile2
-rw-r--r--drivers/staging/android/TODO9
-rw-r--r--drivers/staging/android/ashmem.c26
-rw-r--r--drivers/staging/android/fiq_debugger/Kconfig58
-rw-r--r--drivers/staging/android/fiq_debugger/Makefile4
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger.c1246
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger.h64
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger_arm.c240
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c202
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger_priv.h37
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h94
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_watchdog.c56
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_watchdog.h20
-rw-r--r--drivers/staging/android/ion/ion.c11
-rw-r--r--drivers/staging/android/ion/ion_cma_heap.c4
-rw-r--r--drivers/staging/android/lowmemorykiller.c102
-rw-r--r--drivers/staging/android/trace/lowmemorykiller.h41
-rw-r--r--drivers/staging/android/uapi/vsoc_shm.h303
-rw-r--r--drivers/staging/android/vsoc.c1165
-rw-r--r--drivers/staging/goldfish/Kconfig8
-rw-r--r--drivers/staging/goldfish/Makefile6
-rw-r--r--drivers/staging/goldfish/goldfish_audio.c13
-rw-r--r--drivers/staging/goldfish/goldfish_sync_timeline.c962
-rw-r--r--drivers/staging/goldfish/goldfish_sync_timeline_fence.c254
-rw-r--r--drivers/staging/goldfish/goldfish_sync_timeline_fence.h58
-rw-r--r--drivers/staging/lustre/lustre/mdc/mdc_request.c4
-rw-r--r--drivers/tee/Kconfig19
-rw-r--r--drivers/tee/Makefile5
-rw-r--r--drivers/tee/optee/Kconfig7
-rw-r--r--drivers/tee/optee/Makefile6
-rw-r--r--drivers/tee/optee/call.c662
-rw-r--r--drivers/tee/optee/core.c705
-rw-r--r--drivers/tee/optee/optee_msg.h444
-rw-r--r--drivers/tee/optee/optee_private.h199
-rw-r--r--drivers/tee/optee/optee_smc.h457
-rw-r--r--drivers/tee/optee/rpc.c452
-rw-r--r--drivers/tee/optee/shm_pool.c75
-rw-r--r--drivers/tee/optee/shm_pool.h23
-rw-r--r--drivers/tee/optee/supp.c382
-rw-r--r--drivers/tee/tee_core.c949
-rw-r--r--drivers/tee/tee_private.h79
-rw-r--r--drivers/tee/tee_shm.c510
-rw-r--r--drivers/tee/tee_shm_pool.c195
-rw-r--r--drivers/thermal/hisi_thermal.c602
-rw-r--r--drivers/tty/serial/kgdboc.c3
-rw-r--r--drivers/tty/serial/serial_core.c3
-rw-r--r--drivers/usb/gadget/Kconfig50
-rw-r--r--drivers/usb/gadget/composite.c6
-rw-r--r--drivers/usb/gadget/configfs.c268
-rw-r--r--drivers/usb/gadget/function/Makefile8
-rw-r--r--drivers/usb/gadget/function/f_accessory.c1352
-rw-r--r--drivers/usb/gadget/function/f_audio_source.c1071
-rw-r--r--drivers/usb/gadget/function/f_midi.c66
-rw-r--r--drivers/usb/gadget/function/f_mtp.c1564
-rw-r--r--drivers/usb/gadget/function/f_mtp.h18
-rw-r--r--drivers/usb/gadget/function/f_ptp.c38
-rw-r--r--drivers/usb/phy/Kconfig17
-rw-r--r--drivers/usb/phy/Makefile2
-rw-r--r--drivers/usb/phy/class-dual-role.c529
-rw-r--r--drivers/usb/phy/otg-wakelock.c170
-rw-r--r--drivers/video/console/dummycon.c69
-rw-r--r--drivers/video/fbdev/goldfishfb.c18
-rw-r--r--drivers/w1/masters/ds2482.c47
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Makefile5
-rw-r--r--fs/afs/file.c14
-rw-r--r--fs/afs/internal.h2
-rw-r--r--fs/attr.c14
-rw-r--r--fs/btrfs/extent_io.c19
-rw-r--r--fs/ceph/addr.c18
-rw-r--r--fs/cifs/connect.c2
-rw-r--r--fs/coredump.c2
-rw-r--r--fs/crypto/Kconfig3
-rw-r--r--fs/crypto/Makefile3
-rw-r--r--fs/crypto/bio.c154
-rw-r--r--fs/crypto/crypto.c336
-rw-r--r--fs/crypto/fname.c298
-rw-r--r--fs/crypto/fscrypt_private.h174
-rw-r--r--fs/crypto/hooks.c271
-rw-r--r--fs/crypto/keyinfo.c629
-rw-r--r--fs/crypto/policy.c156
-rw-r--r--fs/dcache.c1
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/eventpoll.c4
-rw-r--r--fs/exec.c2
-rw-r--r--fs/exofs/inode.c12
-rw-r--r--fs/ext4/ext4.h58
-rw-r--r--fs/ext4/ialloc.c18
-rw-r--r--fs/ext4/inline.c16
-rw-r--r--fs/ext4/inode.c80
-rw-r--r--fs/ext4/ioctl.c44
-rw-r--r--fs/ext4/mballoc.c28
-rw-r--r--fs/ext4/namei.c209
-rw-r--r--fs/ext4/page-io.c4
-rw-r--r--fs/ext4/readpage.c49
-rw-r--r--fs/ext4/super.c91
-rw-r--r--fs/ext4/symlink.c43
-rw-r--r--fs/f2fs/Makefile2
-rw-r--r--fs/f2fs/acl.c44
-rw-r--r--fs/f2fs/acl.h5
-rw-r--r--fs/f2fs/checkpoint.c683
-rw-r--r--fs/f2fs/data.c1875
-rw-r--r--fs/f2fs/debug.c164
-rw-r--r--fs/f2fs/dir.c274
-rw-r--r--fs/f2fs/extent_cache.c429
-rw-r--r--fs/f2fs/f2fs.h2300
-rw-r--r--fs/f2fs/file.c1292
-rw-r--r--fs/f2fs/gc.c723
-rw-r--r--fs/f2fs/gc.h34
-rw-r--r--fs/f2fs/hash.c5
-rw-r--r--fs/f2fs/inline.c291
-rw-r--r--fs/f2fs/inode.c456
-rw-r--r--fs/f2fs/namei.c675
-rw-r--r--fs/f2fs/node.c1823
-rw-r--r--fs/f2fs/node.h116
-rw-r--r--fs/f2fs/recovery.c387
-rw-r--r--fs/f2fs/segment.c2759
-rw-r--r--fs/f2fs/segment.h350
-rw-r--r--fs/f2fs/shrinker.c21
-rw-r--r--fs/f2fs/super.c2437
-rw-r--r--fs/f2fs/sysfs.c742
-rw-r--r--fs/f2fs/trace.c21
-rw-r--r--fs/f2fs/trace.h5
-rw-r--r--fs/f2fs/xattr.c337
-rw-r--r--fs/f2fs/xattr.h16
-rw-r--r--fs/fs-writeback.c2
-rw-r--r--fs/fs_struct.c3
-rw-r--r--fs/fscache/object-list.c2
-rw-r--r--fs/fuse/dev.c12
-rw-r--r--fs/fuse/dir.c46
-rw-r--r--fs/fuse/file.c4
-rw-r--r--fs/fuse/fuse_i.h3
-rw-r--r--fs/fuse/inode.c4
-rw-r--r--fs/gfs2/aops.c22
-rw-r--r--fs/inode.c6
-rw-r--r--fs/internal.h4
-rw-r--r--fs/lockd/svc.c2
-rw-r--r--fs/logfs/dev_bdev.c4
-rw-r--r--fs/logfs/dev_mtd.c4
-rw-r--r--fs/logfs/dir.c4
-rw-r--r--fs/logfs/logfs.h2
-rw-r--r--fs/mpage.c36
-rw-r--r--fs/namei.c176
-rw-r--r--fs/namespace.c30
-rw-r--r--fs/nfs/nfs4idmap.c2
-rw-r--r--fs/nfs/read.c2
-rw-r--r--fs/nilfs2/btree.c4
-rw-r--r--fs/nilfs2/page.c7
-rw-r--r--fs/nilfs2/segment.c12
-rw-r--r--fs/notify/fanotify/fanotify_user.c2
-rw-r--r--fs/notify/inotify/inotify_user.c17
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c4
-rw-r--r--fs/open.c37
-rw-r--r--fs/pnode.c34
-rw-r--r--fs/pnode.h1
-rw-r--r--fs/proc/Kconfig7
-rw-r--r--fs/proc/Makefile1
-rw-r--r--fs/proc/base.c9
-rw-r--r--fs/proc/inode.c9
-rw-r--r--fs/proc/internal.h15
-rw-r--r--fs/proc/root.c54
-rw-r--r--fs/proc/task_mmu.c258
-rw-r--r--fs/proc/uid.c295
-rw-r--r--fs/proc_namespace.c8
-rw-r--r--fs/pstore/ram.c6
-rw-r--r--fs/read_write.c54
-rw-r--r--fs/sdcardfs/Kconfig13
-rw-r--r--fs/sdcardfs/Makefile7
-rw-r--r--fs/sdcardfs/dentry.c196
-rw-r--r--fs/sdcardfs/derived_perm.c477
-rw-r--r--fs/sdcardfs/file.c467
-rw-r--r--fs/sdcardfs/inode.c818
-rw-r--r--fs/sdcardfs/lookup.c469
-rw-r--r--fs/sdcardfs/main.c511
-rw-r--r--fs/sdcardfs/mmap.c88
-rw-r--r--fs/sdcardfs/multiuser.h53
-rw-r--r--fs/sdcardfs/packagelist.c882
-rw-r--r--fs/sdcardfs/sdcardfs.h654
-rw-r--r--fs/sdcardfs/super.c333
-rw-r--r--fs/super.c30
-rw-r--r--fs/sync.c1
-rw-r--r--fs/userfaultfd.c9
-rw-r--r--fs/utimes.c2
-rw-r--r--fs/xattr.c2
-rw-r--r--include/asm-generic/sections.h4
-rw-r--r--include/asm-generic/vmlinux.lds.h54
-rw-r--r--include/crypto/chacha.h54
-rw-r--r--include/crypto/chacha20.h26
-rw-r--r--include/crypto/nhpoly1305.h74
-rw-r--r--include/crypto/poly1305.h28
-rw-r--r--include/drm/drm_atomic.h3
-rw-r--r--include/drm/drm_crtc.h40
-rw-r--r--include/drm/drm_fb_cma_helper.h5
-rw-r--r--include/drm/drm_plane.h2
-rw-r--r--include/keys/user-type.h9
-rw-r--r--include/linux/Kbuild2
-rw-r--r--include/linux/amba/mmci.h10
-rw-r--r--include/linux/android_aid.h28
-rw-r--r--include/linux/arm-smccc.h42
-rw-r--r--include/linux/blk_types.h1
-rw-r--r--include/linux/blkdev.h59
-rw-r--r--include/linux/bpf-cgroup.h77
-rw-r--r--include/linux/bpf.h28
-rw-r--r--include/linux/bug.h19
-rw-r--r--include/linux/cfi.h38
-rw-r--r--include/linux/cgroup-defs.h4
-rw-r--r--include/linux/cgroup_subsys.h4
-rw-r--r--include/linux/clocksource.h2
-rw-r--r--include/linux/compiler-clang.h17
-rw-r--r--include/linux/compiler.h8
-rw-r--r--include/linux/cpu.h7
-rw-r--r--include/linux/cpufreq.h48
-rw-r--r--include/linux/cpufreq_times.h46
-rw-r--r--include/linux/cpuidle.h2
-rw-r--r--include/linux/crypto.h40
-rw-r--r--include/linux/dcache.h1
-rw-r--r--include/linux/device-mapper.h13
-rw-r--r--include/linux/efi.h2
-rw-r--r--include/linux/f2fs_fs.h100
-rw-r--r--include/linux/fence.h58
-rw-r--r--include/linux/fs.h45
-rw-r--r--include/linux/fscrypt.h250
-rw-r--r--include/linux/fscrypt_notsupp.h230
-rw-r--r--include/linux/fscrypt_supp.h203
-rw-r--r--include/linux/fscrypto.h409
-rw-r--r--include/linux/fsnotify.h8
-rw-r--r--include/linux/ftrace.h11
-rw-r--r--include/linux/gfp.h2
-rw-r--r--include/linux/gpio_event.h170
-rw-r--r--include/linux/if_pppolac.h23
-rw-r--r--include/linux/if_pppopns.h23
-rw-r--r--include/linux/if_pppox.h21
-rw-r--r--include/linux/init.h13
-rw-r--r--include/linux/initramfs.h32
-rw-r--r--include/linux/interrupt.h14
-rw-r--r--include/linux/ipv6.h2
-rw-r--r--include/linux/kasan.h17
-rw-r--r--include/linux/kcov.h12
-rw-r--r--include/linux/key.h5
-rw-r--r--include/linux/keychord.h23
-rw-r--r--include/linux/keycombo.h36
-rw-r--r--include/linux/keyreset.h29
-rw-r--r--include/linux/list.h37
-rw-r--r--include/linux/lsm_hooks.h55
-rw-r--r--include/linux/memory-state-time.h42
-rw-r--r--include/linux/mm.h3
-rw-r--r--include/linux/mm_types.h24
-rw-r--r--include/linux/mmc/card.h3
-rw-r--r--include/linux/mmc/core.h4
-rw-r--r--include/linux/mmc/host.h24
-rw-r--r--include/linux/mmc/mmc.h3
-rw-r--r--include/linux/mmc/pm.h1
-rw-r--r--include/linux/mmc/sdio_func.h10
-rw-r--r--include/linux/module.h5
-rw-r--r--include/linux/moduleparam.h16
-rw-r--r--include/linux/mount.h1
-rw-r--r--include/linux/namei.h3
-rw-r--r--include/linux/netfilter/xt_qtaguid.h14
-rw-r--r--include/linux/netfilter/xt_quota2.h25
-rw-r--r--include/linux/of_fdt.h21
-rw-r--r--include/linux/overflow.h278
-rw-r--r--include/linux/pagemap.h16
-rw-r--r--include/linux/pagevec.h14
-rw-r--r--include/linux/perf_event.h5
-rw-r--r--include/linux/platform_data/ds2482.h21
-rw-r--r--include/linux/power_supply.h8
-rw-r--r--include/linux/proc_fs.h6
-rw-r--r--include/linux/pstore_ram.h2
-rw-r--r--include/linux/rculist.h8
-rw-r--r--include/linux/reservation.h15
-rw-r--r--include/linux/restart_block.h51
-rw-r--r--include/linux/sched.h161
-rw-r--r--include/linux/sched/sysctl.h24
-rw-r--r--include/linux/sched_energy.h44
-rw-r--r--include/linux/security.h48
-rw-r--r--include/linux/serial_core.h1
-rw-r--r--include/linux/sock_diag.h1
-rw-r--r--include/linux/suspend.h1
-rw-r--r--include/linux/sync_file.h3
-rw-r--r--include/linux/syscalls.h20
-rw-r--r--include/linux/sysfs.h2
-rw-r--r--include/linux/task_io_accounting.h2
-rw-r--r--include/linux/task_io_accounting_ops.h1
-rw-r--r--include/linux/tcp.h3
-rw-r--r--include/linux/tee_drv.h468
-rw-r--r--include/linux/thread_info.h45
-rw-r--r--include/linux/tick.h1
-rw-r--r--include/linux/timekeeper_internal.h4
-rw-r--r--include/linux/timekeeping.h1
-rw-r--r--include/linux/usb/class-dual-role.h129
-rw-r--r--include/linux/usb/composite.h1
-rw-r--r--include/linux/usb/f_accessory.h23
-rw-r--r--include/linux/usb/f_mtp.h23
-rw-r--r--include/linux/verification.h8
-rw-r--r--include/linux/wakeup_reason.h32
-rw-r--r--include/linux/wlan_plat.h30
-rw-r--r--include/linux/xxhash.h236
-rw-r--r--include/linux/zsmalloc.h2
-rw-r--r--include/linux/zstd.h1157
-rw-r--r--include/net/addrconf.h2
-rw-r--r--include/net/fib_rules.h9
-rw-r--r--include/net/flow.h18
-rw-r--r--include/net/inet_sock.h6
-rw-r--r--include/net/ip.h1
-rw-r--r--include/net/ip6_route.h5
-rw-r--r--include/net/netfilter/nf_conntrack.h2
-rw-r--r--include/net/route.h5
-rw-r--r--include/net/sock.h7
-rw-r--r--include/net/tcp.h4
-rw-r--r--include/net/xfrm.h73
-rw-r--r--include/trace/events/android_fs.h65
-rw-r--r--include/trace/events/android_fs_template.h64
-rw-r--r--include/trace/events/cpufreq_interactive.h112
-rw-r--r--include/trace/events/cpufreq_sched.h87
-rw-r--r--include/trace/events/f2fs.h443
-rw-r--r--include/trace/events/gpu.h143
-rw-r--r--include/trace/events/net.h8
-rw-r--r--include/trace/events/power.h51
-rw-r--r--include/trace/events/preemptirq.h73
-rw-r--r--include/trace/events/sched.h540
-rw-r--r--include/uapi/asm-generic/socket.h2
-rw-r--r--include/uapi/linux/android/binder.h179
-rw-r--r--include/uapi/linux/bpf.h90
-rw-r--r--include/uapi/linux/fcntl.h21
-rw-r--r--include/uapi/linux/fib_rules.h6
-rw-r--r--include/uapi/linux/fs.h37
-rw-r--r--include/uapi/linux/fuse.h1
-rw-r--r--include/uapi/linux/hw_breakpoint.h4
-rw-r--r--include/uapi/linux/if_link.h10
-rw-r--r--include/uapi/linux/if_pppolac.h33
-rw-r--r--include/uapi/linux/if_pppopns.h32
-rw-r--r--include/uapi/linux/if_pppox.h6
-rw-r--r--include/uapi/linux/input.h11
-rw-r--r--include/uapi/linux/ipv6.h7
-rw-r--r--include/uapi/linux/kcov.h24
-rw-r--r--include/uapi/linux/keychord.h52
-rw-r--r--include/uapi/linux/loop.h1
-rw-r--r--include/uapi/linux/magic.h2
-rw-r--r--include/uapi/linux/netfilter/xt_IDLETIMER.h8
-rw-r--r--include/uapi/linux/netfilter/xt_bpf.h22
-rw-r--r--include/uapi/linux/netfilter/xt_socket.h7
-rw-r--r--include/uapi/linux/prctl.h3
-rw-r--r--include/uapi/linux/rtnetlink.h1
-rw-r--r--include/uapi/linux/sysctl.h1
-rw-r--r--include/uapi/linux/tcp.h1
-rw-r--r--include/uapi/linux/tee.h384
-rw-r--r--include/uapi/linux/usb/f_accessory.h146
-rw-r--r--include/uapi/linux/usb/f_mtp.h61
-rw-r--r--include/uapi/linux/xfrm.h5
-rw-r--r--init/Kconfig155
-rw-r--r--init/Makefile4
-rw-r--r--init/do_mounts.c1
-rw-r--r--init/do_mounts.h10
-rw-r--r--init/do_mounts_dm.c470
-rw-r--r--init/initramfs.c22
-rw-r--r--init/noinitramfs.c9
-rw-r--r--ipc/mqueue.c10
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/bpf/Makefile1
-rw-r--r--kernel/bpf/arraymap.c6
-rw-r--r--kernel/bpf/cgroup.c205
-rw-r--r--kernel/bpf/core.c2
-rw-r--r--kernel/bpf/hashtab.c4
-rw-r--r--kernel/bpf/inode.c51
-rw-r--r--kernel/bpf/stackmap.c5
-rw-r--r--kernel/bpf/syscall.c207
-rw-r--r--kernel/cfi.c299
-rw-r--r--kernel/cgroup.c22
-rw-r--r--kernel/configs/android-base.config149
-rwxr-xr-xkernel/configs/android-fetch-configs.sh4
-rw-r--r--kernel/configs/android-recommended.config125
-rw-r--r--kernel/cpu.c27
-rw-r--r--kernel/cpuset.c33
-rw-r--r--kernel/debug/kdb/kdb_io.c12
-rw-r--r--kernel/events/core.c8
-rw-r--r--kernel/exit.c5
-rw-r--r--kernel/fork.c7
-rw-r--r--kernel/kallsyms.c53
-rw-r--r--kernel/kcov.c229
-rw-r--r--kernel/module.c27
-rw-r--r--kernel/power/Kconfig1
-rw-r--r--kernel/power/Makefile2
-rw-r--r--kernel/power/process.c34
-rw-r--r--kernel/power/suspend.c35
-rw-r--r--kernel/power/wakeup_reason.c225
-rw-r--r--kernel/printk/printk.c8
-rw-r--r--kernel/sched/Makefile4
-rw-r--r--kernel/sched/core.c264
-rw-r--r--kernel/sched/cpufreq_schedutil.c371
-rw-r--r--kernel/sched/cputime.c23
-rw-r--r--kernel/sched/deadline.c8
-rw-r--r--kernel/sched/debug.c95
-rw-r--r--kernel/sched/energy.c124
-rw-r--r--kernel/sched/fair.c2413
-rw-r--r--kernel/sched/features.h24
-rw-r--r--kernel/sched/idle.c3
-rw-r--r--kernel/sched/rt.c61
-rw-r--r--kernel/sched/sched.h236
-rw-r--r--kernel/sched/stats.c26
-rw-r--r--kernel/sched/stop_task.c3
-rw-r--r--kernel/sched/tune.c1027
-rw-r--r--kernel/sched/tune.h55
-rw-r--r--kernel/sched/walt.c903
-rw-r--r--kernel/sched/walt.h69
-rw-r--r--kernel/sys.c152
-rw-r--r--kernel/sysctl.c68
-rw-r--r--kernel/time/tick-sched.c12
-rw-r--r--kernel/time/timekeeping.c72
-rw-r--r--kernel/trace/Kconfig14
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/ftrace.c23
-rw-r--r--kernel/trace/gpu-traces.c23
-rw-r--r--kernel/trace/trace.c169
-rw-r--r--kernel/trace/trace.h4
-rw-r--r--kernel/trace/trace_functions_graph.c43
-rw-r--r--kernel/trace/trace_irqsoff.c133
-rw-r--r--kernel/trace/trace_output.c184
-rw-r--r--kernel/user.c3
-rw-r--r--lib/Kconfig11
-rw-r--r--lib/Kconfig.debug19
-rw-r--r--lib/Makefile5
-rw-r--r--lib/chacha.c117
-rw-r--r--lib/chacha20.c79
-rw-r--r--lib/digsig.c2
-rw-r--r--lib/list_debug.c110
-rw-r--r--lib/test_kasan.c32
-rw-r--r--lib/xxhash.c500
-rw-r--r--lib/zstd/Makefile18
-rw-r--r--lib/zstd/bitstream.h374
-rw-r--r--lib/zstd/compress.c3484
-rw-r--r--lib/zstd/decompress.c2528
-rw-r--r--lib/zstd/entropy_common.c243
-rw-r--r--lib/zstd/error_private.h53
-rw-r--r--lib/zstd/fse.h575
-rw-r--r--lib/zstd/fse_compress.c795
-rw-r--r--lib/zstd/fse_decompress.c332
-rw-r--r--lib/zstd/huf.h212
-rw-r--r--lib/zstd/huf_compress.c770
-rw-r--r--lib/zstd/huf_decompress.c960
-rw-r--r--lib/zstd/mem.h151
-rw-r--r--lib/zstd/zstd_common.c75
-rw-r--r--lib/zstd/zstd_internal.h263
-rw-r--r--lib/zstd/zstd_opt.h1014
-rw-r--r--mm/filemap.c53
-rw-r--r--mm/kasan/kasan.c64
-rw-r--r--mm/kasan/kasan.h59
-rw-r--r--mm/kasan/quarantine.c144
-rw-r--r--mm/kasan/report.c229
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/mempolicy.c3
-rw-r--r--mm/mlock.c2
-rw-r--r--mm/mmap.c40
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/page-writeback.c20
-rw-r--r--mm/page_alloc.c40
-rw-r--r--mm/readahead.c2
-rw-r--r--mm/shmem.c13
-rw-r--r--mm/slab_common.c4
-rw-r--r--mm/slub.c26
-rw-r--r--mm/swap.c20
-rw-r--r--mm/zsmalloc.c41
-rw-r--r--net/Kconfig11
-rw-r--r--net/bluetooth/af_bluetooth.c29
-rw-r--r--net/bridge/br_device.c11
-rw-r--r--net/core/fib_rules.c82
-rw-r--r--net/core/filter.c69
-rw-r--r--net/core/netpoll.c2
-rw-r--r--net/core/pktgen.c2
-rw-r--r--net/core/sock.c13
-rw-r--r--net/core/sock_diag.c2
-rw-r--r--net/dns_resolver/dns_query.c4
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c54
-rw-r--r--net/ipv4/fib_frontend.c7
-rw-r--r--net/ipv4/icmp.c2
-rw-r--r--net/ipv4/inet_connection_sock.c4
-rw-r--r--net/ipv4/ip_output.c29
-rw-r--r--net/ipv4/ping.c3
-rw-r--r--net/ipv4/raw.c2
-rw-r--r--net/ipv4/route.c40
-rw-r--r--net/ipv4/syncookies.c2
-rw-r--r--net/ipv4/sysctl_net_ipv4.c22
-rw-r--r--net/ipv4/sysfs_net_ipv4.c88
-rw-r--r--net/ipv4/tcp.c49
-rw-r--r--net/ipv4/tcp_fastopen.c54
-rw-r--r--net/ipv4/tcp_input.c1
-rw-r--r--net/ipv4/tcp_ipv4.c20
-rw-r--r--net/ipv4/tcp_output.c18
-rw-r--r--net/ipv4/udp.c3
-rw-r--r--net/ipv4/xfrm4_policy.c14
-rw-r--r--net/ipv6/addrconf.c49
-rw-r--r--net/ipv6/af_inet6.c21
-rw-r--r--net/ipv6/ah6.c5
-rw-r--r--net/ipv6/datagram.c1
-rw-r--r--net/ipv6/esp6.c5
-rw-r--r--net/ipv6/exthdrs_core.c13
-rw-r--r--net/ipv6/icmp.c7
-rw-r--r--net/ipv6/inet6_connection_sock.c2
-rw-r--r--net/ipv6/ip6_fib.c2
-rw-r--r--net/ipv6/ip6_gre.c4
-rw-r--r--net/ipv6/ip6_output.c16
-rw-r--r--net/ipv6/ip6_tunnel.c4
-rw-r--r--net/ipv6/ip6_vti.c5
-rw-r--r--net/ipv6/ipcomp6.c5
-rw-r--r--net/ipv6/ndisc.c2
-rw-r--r--net/ipv6/netfilter.c1
-rw-r--r--net/ipv6/ping.c1
-rw-r--r--net/ipv6/raw.c1
-rw-r--r--net/ipv6/route.c70
-rw-r--r--net/ipv6/syncookies.c1
-rw-r--r--net/ipv6/tcp_ipv6.c15
-rw-r--r--net/ipv6/udp.c1
-rw-r--r--net/ipv6/xfrm6_policy.c9
-rw-r--r--net/key/af_key.c6
-rw-r--r--net/l2tp/l2tp_ip6.c1
-rw-r--r--net/netfilter/Kconfig41
-rw-r--r--net/netfilter/Makefile2
-rw-r--r--net/netfilter/nf_conntrack_core.c6
-rw-r--r--net/netfilter/nf_nat_ftp.c2
-rw-r--r--net/netfilter/nf_nat_irc.c2
-rw-r--r--net/netfilter/xt_IDLETIMER.c246
-rw-r--r--net/netfilter/xt_bpf.c111
-rw-r--r--net/netfilter/xt_qtaguid.c3021
-rw-r--r--net/netfilter/xt_qtaguid_internal.h350
-rw-r--r--net/netfilter/xt_qtaguid_print.c566
-rw-r--r--net/netfilter/xt_qtaguid_print.h120
-rw-r--r--net/netfilter/xt_quota2.c401
-rw-r--r--net/netfilter/xt_socket.c31
-rw-r--r--net/rfkill/Kconfig5
-rw-r--r--net/rfkill/core.c11
-rw-r--r--net/socket.c30
-rw-r--r--net/sunrpc/svc.c4
-rw-r--r--net/wireless/scan.c2
-rw-r--r--net/xfrm/Kconfig8
-rw-r--r--net/xfrm/Makefile1
-rw-r--r--net/xfrm/xfrm_algo.c2
-rw-r--r--net/xfrm/xfrm_input.c5
-rw-r--r--net/xfrm/xfrm_interface.c974
-rw-r--r--net/xfrm/xfrm_output.c2
-rw-r--r--net/xfrm/xfrm_policy.c81
-rw-r--r--net/xfrm/xfrm_state.c33
-rw-r--r--net/xfrm/xfrm_user.c93
-rw-r--r--samples/bpf/Makefile2
-rw-r--r--samples/bpf/libbpf.c23
-rw-r--r--samples/bpf/libbpf.h4
-rw-r--r--samples/bpf/test_cgrp2_attach.c147
-rw-r--r--scripts/Kbuild.include53
-rw-r--r--scripts/Makefile.build121
-rw-r--r--scripts/Makefile.clean2
-rw-r--r--scripts/Makefile.kasan30
-rw-r--r--scripts/Makefile.lib6
-rw-r--r--scripts/Makefile.modinst2
-rw-r--r--scripts/Makefile.modpost44
-rwxr-xr-xscripts/checkpatch.pl1
-rwxr-xr-xscripts/clang-android.sh4
-rwxr-xr-xscripts/clang-version.sh33
-rwxr-xr-xscripts/link-vmlinux.sh97
-rw-r--r--scripts/mod/Makefile1
-rw-r--r--scripts/mod/modpost.c67
-rw-r--r--scripts/recordmcount.c3
-rw-r--r--security/Kconfig9
-rw-r--r--security/apparmor/lsm.c16
-rw-r--r--security/commoncap.c29
-rw-r--r--security/inode.c2
-rw-r--r--security/keys/dh.c2
-rw-r--r--security/keys/encrypted-keys/encrypted.c4
-rw-r--r--security/keys/trusted.c4
-rw-r--r--security/keys/user_defined.c6
-rw-r--r--security/security.c49
-rw-r--r--security/selinux/hooks.c162
-rw-r--r--security/selinux/include/classmap.h2
-rw-r--r--security/selinux/include/objsec.h4
-rw-r--r--tools/include/uapi/linux/bpf.h90
-rw-r--r--tools/include/uapi/linux/hw_breakpoint.h4
-rw-r--r--verity_dev_keys.x50924
956 files changed, 107333 insertions, 13138 deletions
diff --git a/.gitignore b/.gitignore
index 0c39aa20b6ba..4105cfbd6f26 100644
--- a/.gitignore
+++ b/.gitignore
@@ -115,3 +115,6 @@ all.config
# Kdevelop4
*.kdev4
+
+# fetched Android config fragments
+kernel/configs/android-*.cfg
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 3acc4f1a6f84..4a5a887181c3 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -436,6 +436,8 @@ sysrq.txt
- info on the magic SysRq key.
target/
- directory with info on generating TCM v4 fabric .ko modules
+tee.txt
+ - info on the TEE subsystem and drivers
this_cpu_ops.txt
- List rationale behind and the way to use this_cpu operations.
thermal/
diff --git a/Documentation/ABI/obsolete/sysfs-block-zram b/Documentation/ABI/obsolete/sysfs-block-zram
deleted file mode 100644
index 720ea92cfb2e..000000000000
--- a/Documentation/ABI/obsolete/sysfs-block-zram
+++ /dev/null
@@ -1,119 +0,0 @@
-What: /sys/block/zram<id>/num_reads
-Date: August 2015
-Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
-Description:
- The num_reads file is read-only and specifies the number of
- reads (failed or successful) done on this device.
- Now accessible via zram<id>/stat node.
-
-What: /sys/block/zram<id>/num_writes
-Date: August 2015
-Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
-Description:
- The num_writes file is read-only and specifies the number of
- writes (failed or successful) done on this device.
- Now accessible via zram<id>/stat node.
-
-What: /sys/block/zram<id>/invalid_io
-Date: August 2015
-Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
-Description:
- The invalid_io file is read-only and specifies the number of
- non-page-size-aligned I/O requests issued to this device.
- Now accessible via zram<id>/io_stat node.
-
-What: /sys/block/zram<id>/failed_reads
-Date: August 2015
-Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
-Description:
- The failed_reads file is read-only and specifies the number of
- failed reads happened on this device.
- Now accessible via zram<id>/io_stat node.
-
-What: /sys/block/zram<id>/failed_writes
-Date: August 2015
-Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
-Description:
- The failed_writes file is read-only and specifies the number of
- failed writes happened on this device.
- Now accessible via zram<id>/io_stat node.
-
-What: /sys/block/zram<id>/notify_free
-Date: August 2015
-Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
-Description:
- The notify_free file is read-only. Depending on device usage
- scenario it may account a) the number of pages freed because
- of swap slot free notifications or b) the number of pages freed
- because of REQ_DISCARD requests sent by bio. The former ones
- are sent to a swap block device when a swap slot is freed, which
- implies that this disk is being used as a swap disk. The latter
- ones are sent by filesystem mounted with discard option,
- whenever some data blocks are getting discarded.
- Now accessible via zram<id>/io_stat node.
-
-What: /sys/block/zram<id>/zero_pages
-Date: August 2015
-Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
-Description:
- The zero_pages file is read-only and specifies number of zero
- filled pages written to this disk. No memory is allocated for
- such pages.
- Now accessible via zram<id>/mm_stat node.
-
-What: /sys/block/zram<id>/orig_data_size
-Date: August 2015
-Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
-Description:
- The orig_data_size file is read-only and specifies uncompressed
- size of data stored in this disk. This excludes zero-filled
- pages (zero_pages) since no memory is allocated for them.
- Unit: bytes
- Now accessible via zram<id>/mm_stat node.
-
-What: /sys/block/zram<id>/compr_data_size
-Date: August 2015
-Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
-Description:
- The compr_data_size file is read-only and specifies compressed
- size of data stored in this disk. So, compression ratio can be
- calculated using orig_data_size and this statistic.
- Unit: bytes
- Now accessible via zram<id>/mm_stat node.
-
-What: /sys/block/zram<id>/mem_used_total
-Date: August 2015
-Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
-Description:
- The mem_used_total file is read-only and specifies the amount
- of memory, including allocator fragmentation and metadata
- overhead, allocated for this disk. So, allocator space
- efficiency can be calculated using compr_data_size and this
- statistic.
- Unit: bytes
- Now accessible via zram<id>/mm_stat node.
-
-What: /sys/block/zram<id>/mem_used_max
-Date: August 2015
-Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
-Description:
- The mem_used_max file is read/write and specifies the amount
- of maximum memory zram have consumed to store compressed data.
- For resetting the value, you should write "0". Otherwise,
- you could see -EINVAL.
- Unit: bytes
- Downgraded to write-only node: so it's possible to set new
- value only; its current value is stored in zram<id>/mm_stat
- node.
-
-What: /sys/block/zram<id>/mem_limit
-Date: August 2015
-Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
-Description:
- The mem_limit file is read/write and specifies the maximum
- amount of memory ZRAM can use to store the compressed data.
- The limit could be changed in run time and "0" means disable
- the limit. No limit is the initial state. Unit: bytes
- Downgraded to write-only node: so it's possible to set new
- value only; its current value is stored in zram<id>/mm_stat
- node.
diff --git a/Documentation/ABI/testing/procfs-concurrent_time b/Documentation/ABI/testing/procfs-concurrent_time
new file mode 100644
index 000000000000..55b414289b40
--- /dev/null
+++ b/Documentation/ABI/testing/procfs-concurrent_time
@@ -0,0 +1,16 @@
+What: /proc/uid_concurrent_active_time
+Date: December 2018
+Contact: Connor O'Brien <connoro@google.com>
+Description:
+ The /proc/uid_concurrent_active_time file displays aggregated cputime
+ numbers for each uid, broken down by the total number of cores that were
+ active while the uid's task was running.
+
+What: /proc/uid_concurrent_policy_time
+Date: December 2018
+Contact: Connor O'Brien <connoro@google.com>
+Description:
+ The /proc/uid_concurrent_policy_time file displays aggregated cputime
+ numbers for each uid, broken down based on the cpufreq policy
+ of the core used by the uid's task and the number of cores associated
+ with that policy that were active while the uid's task was running.
diff --git a/Documentation/ABI/testing/procfs-smaps_rollup b/Documentation/ABI/testing/procfs-smaps_rollup
new file mode 100644
index 000000000000..0a54ed0d63c9
--- /dev/null
+++ b/Documentation/ABI/testing/procfs-smaps_rollup
@@ -0,0 +1,31 @@
+What: /proc/pid/smaps_rollup
+Date: August 2017
+Contact: Daniel Colascione <dancol@google.com>
+Description:
+ This file provides pre-summed memory information for a
+ process. The format is identical to /proc/pid/smaps,
+ except instead of an entry for each VMA in a process,
+ smaps_rollup has a single entry (tagged "[rollup]")
+ for which each field is the sum of the corresponding
+ fields from all the maps in /proc/pid/smaps.
+ For more details, see the procfs man page.
+
+ Typical output looks like this:
+
+ 00100000-ff709000 ---p 00000000 00:00 0 [rollup]
+ Rss: 884 kB
+ Pss: 385 kB
+ Shared_Clean: 696 kB
+ Shared_Dirty: 0 kB
+ Private_Clean: 120 kB
+ Private_Dirty: 68 kB
+ Referenced: 884 kB
+ Anonymous: 68 kB
+ LazyFree: 0 kB
+ AnonHugePages: 0 kB
+ ShmemPmdMapped: 0 kB
+ Shared_Hugetlb: 0 kB
+ Private_Hugetlb: 0 kB
+ Swap: 0 kB
+ SwapPss: 0 kB
+ Locked: 385 kB
diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram
index 4518d30b8c2e..c1513c756af1 100644
--- a/Documentation/ABI/testing/sysfs-block-zram
+++ b/Documentation/ABI/testing/sysfs-block-zram
@@ -22,41 +22,6 @@ Description:
device. The reset operation frees all the memory associated
with this device.
-What: /sys/block/zram<id>/num_reads
-Date: August 2010
-Contact: Nitin Gupta <ngupta@vflare.org>
-Description:
- The num_reads file is read-only and specifies the number of
- reads (failed or successful) done on this device.
-
-What: /sys/block/zram<id>/num_writes
-Date: August 2010
-Contact: Nitin Gupta <ngupta@vflare.org>
-Description:
- The num_writes file is read-only and specifies the number of
- writes (failed or successful) done on this device.
-
-What: /sys/block/zram<id>/invalid_io
-Date: August 2010
-Contact: Nitin Gupta <ngupta@vflare.org>
-Description:
- The invalid_io file is read-only and specifies the number of
- non-page-size-aligned I/O requests issued to this device.
-
-What: /sys/block/zram<id>/failed_reads
-Date: February 2014
-Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
-Description:
- The failed_reads file is read-only and specifies the number of
- failed reads happened on this device.
-
-What: /sys/block/zram<id>/failed_writes
-Date: February 2014
-Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
-Description:
- The failed_writes file is read-only and specifies the number of
- failed writes happened on this device.
-
What: /sys/block/zram<id>/max_comp_streams
Date: February 2014
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
@@ -73,74 +38,24 @@ Description:
available and selected compression algorithms, change
compression algorithm selection.
-What: /sys/block/zram<id>/notify_free
-Date: August 2010
-Contact: Nitin Gupta <ngupta@vflare.org>
-Description:
- The notify_free file is read-only. Depending on device usage
- scenario it may account a) the number of pages freed because
- of swap slot free notifications or b) the number of pages freed
- because of REQ_DISCARD requests sent by bio. The former ones
- are sent to a swap block device when a swap slot is freed, which
- implies that this disk is being used as a swap disk. The latter
- ones are sent by filesystem mounted with discard option,
- whenever some data blocks are getting discarded.
-
-What: /sys/block/zram<id>/zero_pages
-Date: August 2010
-Contact: Nitin Gupta <ngupta@vflare.org>
-Description:
- The zero_pages file is read-only and specifies number of zero
- filled pages written to this disk. No memory is allocated for
- such pages.
-
-What: /sys/block/zram<id>/orig_data_size
-Date: August 2010
-Contact: Nitin Gupta <ngupta@vflare.org>
-Description:
- The orig_data_size file is read-only and specifies uncompressed
- size of data stored in this disk. This excludes zero-filled
- pages (zero_pages) since no memory is allocated for them.
- Unit: bytes
-
-What: /sys/block/zram<id>/compr_data_size
-Date: August 2010
-Contact: Nitin Gupta <ngupta@vflare.org>
-Description:
- The compr_data_size file is read-only and specifies compressed
- size of data stored in this disk. So, compression ratio can be
- calculated using orig_data_size and this statistic.
- Unit: bytes
-
-What: /sys/block/zram<id>/mem_used_total
-Date: August 2010
-Contact: Nitin Gupta <ngupta@vflare.org>
-Description:
- The mem_used_total file is read-only and specifies the amount
- of memory, including allocator fragmentation and metadata
- overhead, allocated for this disk. So, allocator space
- efficiency can be calculated using compr_data_size and this
- statistic.
- Unit: bytes
-
What: /sys/block/zram<id>/mem_used_max
Date: August 2014
Contact: Minchan Kim <minchan@kernel.org>
Description:
- The mem_used_max file is read/write and specifies the amount
- of maximum memory zram have consumed to store compressed data.
- For resetting the value, you should write "0". Otherwise,
- you could see -EINVAL.
+ The mem_used_max file is write-only and is used to reset
+ the counter of maximum memory zram have consumed to store
+ compressed data. For resetting the value, you should write
+ "0". Otherwise, you could see -EINVAL.
Unit: bytes
What: /sys/block/zram<id>/mem_limit
Date: August 2014
Contact: Minchan Kim <minchan@kernel.org>
Description:
- The mem_limit file is read/write and specifies the maximum
- amount of memory ZRAM can use to store the compressed data. The
- limit could be changed in run time and "0" means disable the
- limit. No limit is the initial state. Unit: bytes
+ The mem_limit file is write-only and specifies the maximum
+ amount of memory ZRAM can use to store the compressed data.
+ The limit could be changed in run time and "0" means disable
+ the limit. No limit is the initial state. Unit: bytes
What: /sys/block/zram<id>/compact
Date: August 2015
@@ -175,3 +90,11 @@ Description:
device's debugging info useful for kernel developers. Its
format is not documented intentionally and may change
anytime without any notice.
+
+What: /sys/block/zram<id>/backing_dev
+Date: June 2017
+Contact: Minchan Kim <minchan@kernel.org>
+Description:
+ The backing_dev file is read-write and set up backing
+ device for zram to write incompressible pages.
+ For using, user should enable CONFIG_ZRAM_WRITEBACK.
diff --git a/Documentation/ABI/testing/sysfs-class-dual-role-usb b/Documentation/ABI/testing/sysfs-class-dual-role-usb
new file mode 100644
index 000000000000..a900fd75430c
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-dual-role-usb
@@ -0,0 +1,71 @@
+What: /sys/class/dual_role_usb/.../
+Date: June 2015
+Contact: Badhri Jagan Sridharan<badhri@google.com>
+Description:
+ Provide a generic interface to monitor and change
+ the state of dual role usb ports. The name here
+ refers to the name mentioned in the
+ dual_role_phy_desc that is passed while registering
+ the dual_role_phy_intstance through
+ devm_dual_role_instance_register.
+
+What: /sys/class/dual_role_usb/.../supported_modes
+Date: June 2015
+Contact: Badhri Jagan Sridharan<badhri@google.com>
+Description:
+ This is a static node, once initialized this
+ is not expected to change during runtime. "dfp"
+ refers to "downstream facing port" i.e. port can
+ only act as host. "ufp" refers to "upstream
+ facing port" i.e. port can only act as device.
+ "dfp ufp" refers to "dual role port" i.e. the port
+ can either be a host port or a device port.
+
+What: /sys/class/dual_role_usb/.../mode
+Date: June 2015
+Contact: Badhri Jagan Sridharan<badhri@google.com>
+Description:
+ The mode node refers to the current mode in which the
+ port is operating. "dfp" for host ports. "ufp" for device
+ ports and "none" when cable is not connected.
+
+ On devices where the USB mode is software-controllable,
+ userspace can change the mode by writing "dfp" or "ufp".
+ On devices where the USB mode is fixed in hardware,
+ this attribute is read-only.
+
+What: /sys/class/dual_role_usb/.../power_role
+Date: June 2015
+Contact: Badhri Jagan Sridharan<badhri@google.com>
+Description:
+ The power_role node mentions whether the port
+ is "sink"ing or "source"ing power. "none" if
+ they are not connected.
+
+ On devices implementing USB Power Delivery,
+ userspace can control the power role by writing "sink" or
+ "source". On devices without USB-PD, this attribute is
+ read-only.
+
+What: /sys/class/dual_role_usb/.../data_role
+Date: June 2015
+Contact: Badhri Jagan Sridharan<badhri@google.com>
+Description:
+ The data_role node mentions whether the port
+ is acting as "host" or "device" for USB data connection.
+ "none" if there is no active data link.
+
+ On devices implementing USB Power Delivery, userspace
+ can control the data role by writing "host" or "device".
+ On devices without USB-PD, this attribute is read-only
+
+What: /sys/class/dual_role_usb/.../powers_vconn
+Date: June 2015
+Contact: Badhri Jagan Sridharan<badhri@google.com>
+Description:
+ The powers_vconn node mentions whether the port
+ is supplying power for VCONN pin.
+
+ On devices with software control of VCONN,
+ userspace can disable the power supply to VCONN by writing "n",
+ or enable the power supply by writing "y".
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index a809f6005f14..e09db014830f 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -51,18 +51,56 @@ Description:
Controls the dirty page count condition for the in-place-update
policies.
+What: /sys/fs/f2fs/<disk>/min_seq_blocks
+Date: August 2018
+Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description:
+ Controls the dirty page count condition for batched sequential
+ writes in ->writepages.
+
+
+What: /sys/fs/f2fs/<disk>/min_hot_blocks
+Date: March 2017
+Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description:
+ Controls the dirty page count condition for redefining hot data.
+
+What: /sys/fs/f2fs/<disk>/min_ssr_sections
+Date: October 2017
+Contact: "Chao Yu" <yuchao0@huawei.com>
+Description:
+ Controls the fee section threshold to trigger SSR allocation.
+
What: /sys/fs/f2fs/<disk>/max_small_discards
Date: November 2013
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
Description:
Controls the issue rate of small discard commands.
+What: /sys/fs/f2fs/<disk>/discard_granularity
+Date: July 2017
+Contact: "Chao Yu" <yuchao0@huawei.com>
+Description:
+ Controls discard granularity of inner discard thread, inner thread
+ will not issue discards with size that is smaller than granularity.
+ The unit size is one block, now only support configuring in range
+ of [1, 512].
+
What: /sys/fs/f2fs/<disk>/max_victim_search
Date: January 2014
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
Description:
Controls the number of trials to find a victim segment.
+What: /sys/fs/f2fs/<disk>/migration_granularity
+Date: October 2018
+Contact: "Chao Yu" <yuchao0@huawei.com>
+Description:
+ Controls migration granularity of garbage collection on large
+ section, it can let GC move partial segment{s} of one section
+ in one GC cycle, so that dispersing heavy overhead GC to
+ multiple lightweight one.
+
What: /sys/fs/f2fs/<disk>/dir_level
Date: March 2014
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
@@ -80,6 +118,7 @@ Date: February 2015
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description:
Controls the trimming rate in batch mode.
+ <deprecated>
What: /sys/fs/f2fs/<disk>/cp_interval
Date: October 2015
@@ -91,7 +130,28 @@ What: /sys/fs/f2fs/<disk>/idle_interval
Date: January 2016
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description:
- Controls the idle timing.
+ Controls the idle timing for all paths other than
+ discard and gc path.
+
+What: /sys/fs/f2fs/<disk>/discard_idle_interval
+Date: September 2018
+Contact: "Chao Yu" <yuchao0@huawei.com>
+Contact: "Sahitya Tummala" <stummala@codeaurora.org>
+Description:
+ Controls the idle timing for discard path.
+
+What: /sys/fs/f2fs/<disk>/gc_idle_interval
+Date: September 2018
+Contact: "Chao Yu" <yuchao0@huawei.com>
+Contact: "Sahitya Tummala" <stummala@codeaurora.org>
+Description:
+ Controls the idle timing for gc path.
+
+What: /sys/fs/f2fs/<disk>/iostat_enable
+Date: August 2017
+Contact: "Chao Yu" <yuchao0@huawei.com>
+Description:
+ Controls to enable/disable IO stat.
What: /sys/fs/f2fs/<disk>/ra_nid_pages
Date: October 2015
@@ -112,3 +172,67 @@ Date: January 2016
Contact: "Shuoran Liu" <liushuoran@huawei.com>
Description:
Shows total written kbytes issued to disk.
+
+What: /sys/fs/f2fs/<disk>/feature
+Date: July 2017
+Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description:
+ Shows all enabled features in current device.
+
+What: /sys/fs/f2fs/<disk>/inject_rate
+Date: May 2016
+Contact: "Sheng Yong" <shengyong1@huawei.com>
+Description:
+ Controls the injection rate.
+
+What: /sys/fs/f2fs/<disk>/inject_type
+Date: May 2016
+Contact: "Sheng Yong" <shengyong1@huawei.com>
+Description:
+ Controls the injection type.
+
+What: /sys/fs/f2fs/<disk>/reserved_blocks
+Date: June 2017
+Contact: "Chao Yu" <yuchao0@huawei.com>
+Description:
+ Controls target reserved blocks in system, the threshold
+ is soft, it could exceed current available user space.
+
+What: /sys/fs/f2fs/<disk>/current_reserved_blocks
+Date: October 2017
+Contact: "Yunlong Song" <yunlong.song@huawei.com>
+Contact: "Chao Yu" <yuchao0@huawei.com>
+Description:
+ Shows current reserved blocks in system, it may be temporarily
+ smaller than target_reserved_blocks, but will gradually
+ increase to target_reserved_blocks when more free blocks are
+ freed by user later.
+
+What: /sys/fs/f2fs/<disk>/gc_urgent
+Date: August 2017
+Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description:
+ Do background GC agressively
+
+What: /sys/fs/f2fs/<disk>/gc_urgent_sleep_time
+Date: August 2017
+Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description:
+ Controls sleep time of GC urgent mode
+
+What: /sys/fs/f2fs/<disk>/readdir_ra
+Date: November 2017
+Contact: "Sheng Yong" <shengyong1@huawei.com>
+Description:
+ Controls readahead inode block in readdir.
+
+What: /sys/fs/f2fs/<disk>/extension_list
+Date: Feburary 2018
+Contact: "Chao Yu" <yuchao0@huawei.com>
+Description:
+ Used to control configure extension list:
+ - Query: cat /sys/fs/f2fs/<disk>/extension_list
+ - Add: echo '[h/c]extension' > /sys/fs/f2fs/<disk>/extension_list
+ - Del: echo '[h/c]!extension' > /sys/fs/f2fs/<disk>/extension_list
+ - [h] means add/del hot file extension
+ - [c] means add/del cold file extension
diff --git a/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons b/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons
new file mode 100644
index 000000000000..acb19b91c192
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons
@@ -0,0 +1,16 @@
+What: /sys/kernel/wakeup_reasons/last_resume_reason
+Date: February 2014
+Contact: Ruchi Kandoi <kandoiruchi@google.com>
+Description:
+ The /sys/kernel/wakeup_reasons/last_resume_reason is
+ used to report wakeup reasons after system exited suspend.
+
+What: /sys/kernel/wakeup_reasons/last_suspend_time
+Date: March 2015
+Contact: jinqian <jinqian@google.com>
+Description:
+ The /sys/kernel/wakeup_reasons/last_suspend_time is
+ used to report time spent in last suspend cycle. It contains
+ two numbers (in seconds) separated by space. First number is
+ the time spent in suspend and resume processes. Second number
+ is the time spent in sleep state. \ No newline at end of file
diff --git a/Documentation/android.txt b/Documentation/android.txt
new file mode 100644
index 000000000000..0f40a78b045f
--- /dev/null
+++ b/Documentation/android.txt
@@ -0,0 +1,121 @@
+ =============
+ A N D R O I D
+ =============
+
+Copyright (C) 2009 Google, Inc.
+Written by Mike Chan <mike@android.com>
+
+CONTENTS:
+---------
+
+1. Android
+ 1.1 Required enabled config options
+ 1.2 Required disabled config options
+ 1.3 Recommended enabled config options
+2. Contact
+
+
+1. Android
+==========
+
+Android (www.android.com) is an open source operating system for mobile devices.
+This document describes configurations needed to run the Android framework on
+top of the Linux kernel.
+
+To see a working defconfig look at msm_defconfig or goldfish_defconfig
+which can be found at http://android.git.kernel.org in kernel/common.git
+and kernel/msm.git
+
+
+1.1 Required enabled config options
+-----------------------------------
+After building a standard defconfig, ensure that these options are enabled in
+your .config or defconfig if they are not already. Based off the msm_defconfig.
+You should keep the rest of the default options enabled in the defconfig
+unless you know what you are doing.
+
+ANDROID_PARANOID_NETWORK
+ASHMEM
+CONFIG_FB_MODE_HELPERS
+CONFIG_FONT_8x16
+CONFIG_FONT_8x8
+CONFIG_YAFFS_SHORT_NAMES_IN_RAM
+DAB
+EARLYSUSPEND
+FB
+FB_CFB_COPYAREA
+FB_CFB_FILLRECT
+FB_CFB_IMAGEBLIT
+FB_DEFERRED_IO
+FB_TILEBLITTING
+HIGH_RES_TIMERS
+INOTIFY
+INOTIFY_USER
+INPUT_EVDEV
+INPUT_GPIO
+INPUT_MISC
+LEDS_CLASS
+LEDS_GPIO
+LOCK_KERNEL
+LkOGGER
+LOW_MEMORY_KILLER
+MISC_DEVICES
+NEW_LEDS
+NO_HZ
+POWER_SUPPLY
+PREEMPT
+RAMFS
+RTC_CLASS
+RTC_LIB
+SWITCH
+SWITCH_GPIO
+TMPFS
+UID_STAT
+UID16
+USB_FUNCTION
+USB_FUNCTION_ADB
+USER_WAKELOCK
+VIDEO_OUTPUT_CONTROL
+WAKELOCK
+YAFFS_AUTO_YAFFS2
+YAFFS_FS
+YAFFS_YAFFS1
+YAFFS_YAFFS2
+
+
+1.2 Required disabled config options
+------------------------------------
+CONFIG_YAFFS_DISABLE_LAZY_LOAD
+DNOTIFY
+
+
+1.3 Recommended enabled config options
+------------------------------
+ANDROID_PMEM
+PSTORE_CONSOLE
+PSTORE_RAM
+SCHEDSTATS
+DEBUG_PREEMPT
+DEBUG_MUTEXES
+DEBUG_SPINLOCK_SLEEP
+DEBUG_INFO
+FRAME_POINTER
+CPU_FREQ
+CPU_FREQ_TABLE
+CPU_FREQ_DEFAULT_GOV_ONDEMAND
+CPU_FREQ_GOV_ONDEMAND
+CRC_CCITT
+EMBEDDED
+INPUT_TOUCHSCREEN
+I2C
+I2C_BOARDINFO
+LOG_BUF_SHIFT=17
+SERIAL_CORE
+SERIAL_CORE_CONSOLE
+
+
+2. Contact
+==========
+website: http://android.git.kernel.org
+
+mailing-lists: android-kernel@googlegroups.com
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
index e55103ace382..a542b9f2a30d 100644
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -30,3 +30,9 @@ switching-sched.txt
- Switching I/O schedulers at runtime
writeback_cache_control.txt
- Control of volatile write back caches
+mmc-max-speed.txt
+ - eMMC layer speed simulation, related to /sys/block/mmcblk*/
+ attributes:
+ max_read_speed
+ max_write_speed
+ cache_size
diff --git a/Documentation/block/mmc-max-speed.txt b/Documentation/block/mmc-max-speed.txt
new file mode 100644
index 000000000000..3f052b9fb999
--- /dev/null
+++ b/Documentation/block/mmc-max-speed.txt
@@ -0,0 +1,38 @@
+eMMC Block layer simulation speed controls in /sys/block/mmcblk*/
+===============================================
+
+Turned on with CONFIG_MMC_SIMULATE_MAX_SPEED which enables MMC device speed
+limiting. Used to test and simulate the behavior of the system when
+confronted with a slow MMC.
+
+Enables max_read_speed, max_write_speed and cache_size attributes and module
+default parameters to control the write or read maximum KB/second speed
+behaviors.
+
+NB: There is room for improving the algorithm for aspects tied directly to
+eMMC specific behavior. For instance, wear leveling and stalls from an
+exhausted erase pool. We would expect that if there was a need to provide
+similar speed simulation controls to other types of block devices, aspects of
+their behavior are modelled separately (e.g. head seek times, heat assist,
+shingling and rotational latency).
+
+/sys/block/mmcblk0/max_read_speed:
+
+Number of KB/second reads allowed to the block device. Used to test and
+simulate the behavior of the system when confronted with a slow reading MMC.
+Set to 0 or "off" to place no speed limit.
+
+/sys/block/mmcblk0/max_write_speed:
+
+Number of KB/second writes allowed to the block device. Used to test and
+simulate the behavior of the system when confronted with a slow writing MMC.
+Set to 0 or "off" to place no speed limit.
+
+/sys/block/mmcblk0/cache_size:
+
+Number of MB of high speed memory or high speed SLC cache expected on the
+eMMC device being simulated. Used to help simulate the write-back behavior
+more accurately. The assumption is the cache has no delay, but draws down
+in the background to the MLC/TLC primary store at the max_write_speed rate.
+Any write speed delays will show up when the cache is full, or when an I/O
+request to flush is issued.
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 0535ae1f73e5..875b2b56b87f 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -161,42 +161,15 @@ Name access description
disksize RW show and set the device's disk size
initstate RO shows the initialization state of the device
reset WO trigger device reset
-num_reads RO the number of reads
-failed_reads RO the number of failed reads
-num_write RO the number of writes
-failed_writes RO the number of failed writes
-invalid_io RO the number of non-page-size-aligned I/O requests
+mem_used_max WO reset the `mem_used_max' counter (see later)
+mem_limit WO specifies the maximum amount of memory ZRAM can use
+ to store the compressed data
max_comp_streams RW the number of possible concurrent compress operations
comp_algorithm RW show and change the compression algorithm
-notify_free RO the number of notifications to free pages (either
- slot free notifications or REQ_DISCARD requests)
-zero_pages RO the number of zero filled pages written to this disk
-orig_data_size RO uncompressed size of data stored in this disk
-compr_data_size RO compressed size of data stored in this disk
-mem_used_total RO the amount of memory allocated for this disk
-mem_used_max RW the maximum amount of memory zram have consumed to
- store the data (to reset this counter to the actual
- current value, write 1 to this attribute)
-mem_limit RW the maximum amount of memory ZRAM can use to store
- the compressed data
-pages_compacted RO the number of pages freed during compaction
- (available only via zram<id>/mm_stat node)
compact WO trigger memory compaction
debug_stat RO this file is used for zram debugging purposes
+backing_dev RW set up backend storage for zram to write out
-WARNING
-=======
-per-stat sysfs attributes are considered to be deprecated.
-The basic strategy is:
--- the existing RW nodes will be downgraded to WO nodes (in linux 4.11)
--- deprecated RO sysfs nodes will eventually be removed (in linux 4.11)
-
-The list of deprecated attributes can be found here:
-Documentation/ABI/obsolete/sysfs-block-zram
-
-Basically, every attribute that has its own read accessible sysfs node
-(e.g. num_reads) *AND* is accessible via one of the stat files (zram<id>/stat
-or zram<id>/io_stat or zram<id>/mm_stat) is considered to be deprecated.
User space is advised to use the following files to read the device statistics.
@@ -211,22 +184,41 @@ The stat file represents device's I/O statistics not accounted by block
layer and, thus, not available in zram<id>/stat file. It consists of a
single line of text and contains the following stats separated by
whitespace:
- failed_reads
- failed_writes
- invalid_io
- notify_free
+ failed_reads the number of failed reads
+ failed_writes the number of failed writes
+ invalid_io the number of non-page-size-aligned I/O requests
+ notify_free Depending on device usage scenario it may account
+ a) the number of pages freed because of swap slot free
+ notifications or b) the number of pages freed because of
+ REQ_DISCARD requests sent by bio. The former ones are
+ sent to a swap block device when a swap slot is freed,
+ which implies that this disk is being used as a swap disk.
+ The latter ones are sent by filesystem mounted with
+ discard option, whenever some data blocks are getting
+ discarded.
File /sys/block/zram<id>/mm_stat
The stat file represents device's mm statistics. It consists of a single
line of text and contains the following stats separated by whitespace:
- orig_data_size
- compr_data_size
- mem_used_total
- mem_limit
- mem_used_max
- zero_pages
- num_migrated
+ orig_data_size uncompressed size of data stored in this disk.
+ This excludes same-element-filled pages (same_pages) since
+ no memory is allocated for them.
+ Unit: bytes
+ compr_data_size compressed size of data stored in this disk
+ mem_used_total the amount of memory allocated for this disk. This
+ includes allocator fragmentation and metadata overhead,
+ allocated for this disk. So, allocator space efficiency
+ can be calculated using compr_data_size and this statistic.
+ Unit: bytes
+ mem_limit the maximum amount of memory ZRAM can use to store
+ the compressed data
+ mem_used_max the maximum amount of memory zram have consumed to
+ store the data
+ same_pages the number of same element filled pages written to this disk.
+ No memory is allocated for such pages.
+ pages_compacted the number of pages freed during compaction
+ huge_pages the number of incompressible pages
9) Deactivate:
swapoff /dev/zram0
@@ -241,5 +233,39 @@ line of text and contains the following stats separated by whitespace:
resets the disksize to zero. You must set the disksize again
before reusing the device.
+* Optional Feature
+
+= writeback
+
+With incompressible pages, there is no memory saving with zram.
+Instead, with CONFIG_ZRAM_WRITEBACK, zram can write incompressible page
+to backing storage rather than keeping it in memory.
+User should set up backing device via /sys/block/zramX/backing_dev
+before disksize setting.
+
+= memory tracking
+
+With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the
+zram block. It could be useful to catch cold or incompressible
+pages of the process with*pagemap.
+If you enable the feature, you could see block state via
+/sys/kernel/debug/zram/zram0/block_state". The output is as follows,
+
+ 300 75.033841 .wh
+ 301 63.806904 s..
+ 302 63.806919 ..h
+
+First column is zram's block index.
+Second column is access time since the system was booted
+Third column is state of the block.
+(s: same page
+w: written page to backing store
+h: huge page)
+
+First line of above example says 300th block is accessed at 75.033841sec
+and the block's state is huge so it is written back to the backing
+storage. It's a debugging feature so anyone shouldn't rely on it to work
+properly.
+
Nitin Gupta
ngupta@vflare.org
diff --git a/Documentation/cpu-freq/governors.txt b/Documentation/cpu-freq/governors.txt
index c15aa75f5227..0cf9a6bff6a5 100644
--- a/Documentation/cpu-freq/governors.txt
+++ b/Documentation/cpu-freq/governors.txt
@@ -28,6 +28,7 @@ Contents:
2.3 Userspace
2.4 Ondemand
2.5 Conservative
+2.6 Interactive
3. The Governor Interface in the CPUfreq Core
@@ -218,6 +219,91 @@ a decision on when to decrease the frequency while running in any
speed. Load for frequency increase is still evaluated every
sampling rate.
+2.6 Interactive
+---------------
+
+The CPUfreq governor "interactive" is designed for latency-sensitive,
+interactive workloads. This governor sets the CPU speed depending on
+usage, similar to "ondemand" and "conservative" governors, but with a
+different set of configurable behaviors.
+
+The tunable values for this governor are:
+
+above_hispeed_delay: When speed is at or above hispeed_freq, wait for
+this long before raising speed in response to continued high load.
+The format is a single delay value, optionally followed by pairs of
+CPU speeds and the delay to use at or above those speeds. Colons can
+be used between the speeds and associated delays for readability. For
+example:
+
+ 80000 1300000:200000 1500000:40000
+
+uses delay 80000 uS until CPU speed 1.3 GHz, at which speed delay
+200000 uS is used until speed 1.5 GHz, at which speed (and above)
+delay 40000 uS is used. If speeds are specified these must appear in
+ascending order. Default is 20000 uS.
+
+boost: If non-zero, immediately boost speed of all CPUs to at least
+hispeed_freq until zero is written to this attribute. If zero, allow
+CPU speeds to drop below hispeed_freq according to load as usual.
+Default is zero.
+
+boostpulse: On each write, immediately boost speed of all CPUs to
+hispeed_freq for at least the period of time specified by
+boostpulse_duration, after which speeds are allowed to drop below
+hispeed_freq according to load as usual. Its a write-only file.
+
+boostpulse_duration: Length of time to hold CPU speed at hispeed_freq
+on a write to boostpulse, before allowing speed to drop according to
+load as usual. Default is 80000 uS.
+
+go_hispeed_load: The CPU load at which to ramp to hispeed_freq.
+Default is 99%.
+
+hispeed_freq: An intermediate "high speed" at which to initially ramp
+when CPU load hits the value specified in go_hispeed_load. If load
+stays high for the amount of time specified in above_hispeed_delay,
+then speed may be bumped higher. Default is the maximum speed allowed
+by the policy at governor initialization time.
+
+io_is_busy: If set, the governor accounts IO time as CPU busy time.
+
+min_sample_time: The minimum amount of time to spend at the current
+frequency before ramping down. Default is 80000 uS.
+
+target_loads: CPU load values used to adjust speed to influence the
+current CPU load toward that value. In general, the lower the target
+load, the more often the governor will raise CPU speeds to bring load
+below the target. The format is a single target load, optionally
+followed by pairs of CPU speeds and CPU loads to target at or above
+those speeds. Colons can be used between the speeds and associated
+target loads for readability. For example:
+
+ 85 1000000:90 1700000:99
+
+targets CPU load 85% below speed 1GHz, 90% at or above 1GHz, until
+1.7GHz and above, at which load 99% is targeted. If speeds are
+specified these must appear in ascending order. Higher target load
+values are typically specified for higher speeds, that is, target load
+values also usually appear in an ascending order. The default is
+target load 90% for all speeds.
+
+timer_rate: Sample rate for reevaluating CPU load when the CPU is not
+idle. A deferrable timer is used, such that the CPU will not be woken
+from idle to service this timer until something else needs to run.
+(The maximum time to allow deferring this timer when not running at
+minimum speed is configurable via timer_slack.) Default is 20000 uS.
+
+timer_slack: Maximum additional time to defer handling the governor
+sampling timer beyond timer_rate when running at speeds above the
+minimum. For platforms that consume additional power at idle when
+CPUs are running at speeds greater than minimum, this places an upper
+bound on how long the timer will be deferred prior to re-evaluating
+load and dropping speed. For example, if timer_rate is 20000uS and
+timer_slack is 10000uS then timers will be deferred for up to 30msec
+when not at lowest speed. A value of -1 means defer timers
+indefinitely at all speeds. Default is 80000 uS.
+
3. The Governor Interface in the CPUfreq Core
=============================================
diff --git a/Documentation/device-mapper/boot.txt b/Documentation/device-mapper/boot.txt
new file mode 100644
index 000000000000..adcaad5e5e32
--- /dev/null
+++ b/Documentation/device-mapper/boot.txt
@@ -0,0 +1,42 @@
+Boot time creation of mapped devices
+===================================
+
+It is possible to configure a device mapper device to act as the root
+device for your system in two ways.
+
+The first is to build an initial ramdisk which boots to a minimal
+userspace which configures the device, then pivot_root(8) in to it.
+
+For simple device mapper configurations, it is possible to boot directly
+using the following kernel command line:
+
+dm="<name> <uuid> <ro>,table line 1,...,table line n"
+
+name = the name to associate with the device
+ after boot, udev, if used, will use that name to label
+ the device node.
+uuid = may be 'none' or the UUID desired for the device.
+ro = may be "ro" or "rw". If "ro", the device and device table will be
+ marked read-only.
+
+Each table line may be as normal when using the dmsetup tool except for
+two variations:
+1. Any use of commas will be interpreted as a newline
+2. Quotation marks cannot be escaped and cannot be used without
+ terminating the dm= argument.
+
+Unless renamed by udev, the device node created will be dm-0 as the
+first minor number for the device-mapper is used during early creation.
+
+Example
+=======
+
+- Booting to a linear array made up of user-mode linux block devices:
+
+ dm="lroot none 0, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" \
+ root=/dev/dm-0
+
+Will boot to a rw dm-linear target of 8192 sectors split across two
+block devices identified by their major:minor numbers. After boot, udev
+will rename this target to /dev/mapper/lroot (depending on the rules).
+No uuid was assigned.
diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
index 692171fe9da0..ae9a8f7e59b3 100644
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -76,6 +76,20 @@ submit_from_crypt_cpus
thread because it benefits CFQ to have writes submitted using the
same context.
+sector_size:<bytes>
+ Use <bytes> as the encryption unit instead of 512 bytes sectors.
+ This option can be in range 512 - 4096 bytes and must be power of two.
+ Virtual device will announce this size as a minimal IO and logical sector.
+
+iv_large_sectors
+ IV generators will use sector number counted in <sector_size> units
+ instead of default 512 bytes sectors.
+
+ For example, if <sector_size> is 4096 bytes, plain64 IV for the second
+ sector will be 8 (without flag) and 1 if iv_large_sectors is present.
+ The <iv_offset> must be multiple of <sector_size> (in 512 bytes units)
+ if this flag is specified.
+
Example scripts
===============
LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt
index 89fd8f9a259f..b3d2e4a42255 100644
--- a/Documentation/device-mapper/verity.txt
+++ b/Documentation/device-mapper/verity.txt
@@ -109,6 +109,17 @@ fec_start <offset>
This is the offset, in <data_block_size> blocks, from the start of the
FEC device to the beginning of the encoding data.
+check_at_most_once
+ Verify data blocks only the first time they are read from the data device,
+ rather than every time. This reduces the overhead of dm-verity so that it
+ can be used on systems that are memory and/or CPU constrained. However, it
+ provides a reduced level of security because only offline tampering of the
+ data device's content will be detected, not online tampering.
+
+ Hash blocks are still verified each time they are read from the hash device,
+ since verification of hash blocks is less performance critical than data
+ blocks, and a hash block will not be verified any more after all the data
+ blocks it covers have been verified anyway.
Theory of operation
===================
diff --git a/Documentation/devicetree/bindings/arm/firmware/linaro,optee-tz.txt b/Documentation/devicetree/bindings/arm/firmware/linaro,optee-tz.txt
new file mode 100644
index 000000000000..d38834c67dff
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/firmware/linaro,optee-tz.txt
@@ -0,0 +1,31 @@
+OP-TEE Device Tree Bindings
+
+OP-TEE is a piece of software using hardware features to provide a Trusted
+Execution Environment. The security can be provided with ARM TrustZone, but
+also by virtualization or a separate chip.
+
+We're using "linaro" as the first part of the compatible property for
+the reference implementation maintained by Linaro.
+
+* OP-TEE based on ARM TrustZone required properties:
+
+- compatible : should contain "linaro,optee-tz"
+
+- method : The method of calling the OP-TEE Trusted OS. Permitted
+ values are:
+
+ "smc" : SMC #0, with the register assignments specified
+ in drivers/tee/optee/optee_smc.h
+
+ "hvc" : HVC #0, with the register assignments specified
+ in drivers/tee/optee/optee_smc.h
+
+
+
+Example:
+ firmware {
+ optee {
+ compatible = "linaro,optee-tz";
+ method = "smc";
+ };
+ };
diff --git a/Documentation/devicetree/bindings/misc/memory-state-time.txt b/Documentation/devicetree/bindings/misc/memory-state-time.txt
new file mode 100644
index 000000000000..c99a506c030d
--- /dev/null
+++ b/Documentation/devicetree/bindings/misc/memory-state-time.txt
@@ -0,0 +1,8 @@
+Memory bandwidth and frequency state tracking
+
+Required properties:
+- compatible : should be:
+ "memory-state-time"
+- freq-tbl: Should contain entries with each frequency in Hz.
+- bw-buckets: Should contain upper-bound limits for each bandwidth bucket in Mbps.
+ Must match the framework power_profile.xml for the device.
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt
index bceffffb7502..d0526e05aea3 100644
--- a/Documentation/devicetree/bindings/vendor-prefixes.txt
+++ b/Documentation/devicetree/bindings/vendor-prefixes.txt
@@ -156,6 +156,7 @@ lacie LaCie
lantiq Lantiq Semiconductor
lenovo Lenovo Group Ltd.
lg LG Corporation
+linaro Linaro Limited
linux Linux-specific binding
lltc Linear Technology Corporation
lsi LSI Corp. (LSI Logic)
diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index 753dd4f96afe..b640a3764cfa 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -154,9 +154,68 @@ noinline_data Disable the inline data feature, inline data feature is
enabled by default.
data_flush Enable data flushing before checkpoint in order to
persist data of regular and symlink.
+fault_injection=%d Enable fault injection in all supported types with
+ specified injection rate.
+fault_type=%d Support configuring fault injection type, should be
+ enabled with fault_injection option, fault type value
+ is shown below, it supports single or combined type.
+ Type_Name Type_Value
+ FAULT_KMALLOC 0x000000001
+ FAULT_KVMALLOC 0x000000002
+ FAULT_PAGE_ALLOC 0x000000004
+ FAULT_PAGE_GET 0x000000008
+ FAULT_ALLOC_BIO 0x000000010
+ FAULT_ALLOC_NID 0x000000020
+ FAULT_ORPHAN 0x000000040
+ FAULT_BLOCK 0x000000080
+ FAULT_DIR_DEPTH 0x000000100
+ FAULT_EVICT_INODE 0x000000200
+ FAULT_TRUNCATE 0x000000400
+ FAULT_READ_IO 0x000000800
+ FAULT_CHECKPOINT 0x000001000
+ FAULT_DISCARD 0x000002000
+ FAULT_WRITE_IO 0x000004000
mode=%s Control block allocation mode which supports "adaptive"
and "lfs". In "lfs" mode, there should be no random
writes towards main area.
+io_bits=%u Set the bit size of write IO requests. It should be set
+ with "mode=lfs".
+usrquota Enable plain user disk quota accounting.
+grpquota Enable plain group disk quota accounting.
+prjquota Enable plain project quota accounting.
+usrjquota=<file> Appoint specified file and type during mount, so that quota
+grpjquota=<file> information can be properly updated during recovery flow,
+prjjquota=<file> <quota file>: must be in root directory;
+jqfmt=<quota type> <quota type>: [vfsold,vfsv0,vfsv1].
+offusrjquota Turn off user journelled quota.
+offgrpjquota Turn off group journelled quota.
+offprjjquota Turn off project journelled quota.
+quota Enable plain user disk quota accounting.
+noquota Disable all plain disk quota option.
+whint_mode=%s Control which write hints are passed down to block
+ layer. This supports "off", "user-based", and
+ "fs-based". In "off" mode (default), f2fs does not pass
+ down hints. In "user-based" mode, f2fs tries to pass
+ down hints given by users. And in "fs-based" mode, f2fs
+ passes down hints with its policy.
+alloc_mode=%s Adjust block allocation policy, which supports "reuse"
+ and "default".
+fsync_mode=%s Control the policy of fsync. Currently supports "posix",
+ "strict", and "nobarrier". In "posix" mode, which is
+ default, fsync will follow POSIX semantics and does a
+ light operation to improve the filesystem performance.
+ In "strict" mode, fsync will be heavy and behaves in line
+ with xfs, ext4 and btrfs, where xfstest generic/342 will
+ pass, but the performance will regress. "nobarrier" is
+ based on "posix", but doesn't issue flush command for
+ non-atomic files likewise "nobarrier" mount option.
+test_dummy_encryption Enable dummy encryption, which provides a fake fscrypt
+ context. The fake fscrypt context is used by xfstests.
+checkpoint=%s Set to "disable" to turn off checkpointing. Set to "enable"
+ to reenable checkpointing. Is enabled by default. While
+ disabled, any unmounting or unexpected shutdowns will cause
+ the filesystem contents to appear as they did when the
+ filesystem was mounted with that option.
================================================================================
DEBUGFS ENTRIES
@@ -202,6 +261,15 @@ Files in /sys/fs/f2fs/<devname>
gc_idle = 1 will select the Cost Benefit approach
& setting gc_idle = 2 will select the greedy approach.
+ gc_urgent This parameter controls triggering background GCs
+ urgently or not. Setting gc_urgent = 0 [default]
+ makes back to default behavior, while if it is set
+ to 1, background thread starts to do GC by given
+ gc_urgent_sleep_time interval.
+
+ gc_urgent_sleep_time This parameter controls sleep time for gc_urgent.
+ 500 ms is set by default. See above gc_urgent.
+
reclaim_segments This parameter controls the number of prefree
segments to be reclaimed. If the number of prefree
segments is larger than the number of segments
diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst
new file mode 100644
index 000000000000..3a7b60521b94
--- /dev/null
+++ b/Documentation/filesystems/fscrypt.rst
@@ -0,0 +1,641 @@
+=====================================
+Filesystem-level encryption (fscrypt)
+=====================================
+
+Introduction
+============
+
+fscrypt is a library which filesystems can hook into to support
+transparent encryption of files and directories.
+
+Note: "fscrypt" in this document refers to the kernel-level portion,
+implemented in ``fs/crypto/``, as opposed to the userspace tool
+`fscrypt <https://github.com/google/fscrypt>`_. This document only
+covers the kernel-level portion. For command-line examples of how to
+use encryption, see the documentation for the userspace tool `fscrypt
+<https://github.com/google/fscrypt>`_. Also, it is recommended to use
+the fscrypt userspace tool, or other existing userspace tools such as
+`fscryptctl <https://github.com/google/fscryptctl>`_ or `Android's key
+management system
+<https://source.android.com/security/encryption/file-based>`_, over
+using the kernel's API directly. Using existing tools reduces the
+chance of introducing your own security bugs. (Nevertheless, for
+completeness this documentation covers the kernel's API anyway.)
+
+Unlike dm-crypt, fscrypt operates at the filesystem level rather than
+at the block device level. This allows it to encrypt different files
+with different keys and to have unencrypted files on the same
+filesystem. This is useful for multi-user systems where each user's
+data-at-rest needs to be cryptographically isolated from the others.
+However, except for filenames, fscrypt does not encrypt filesystem
+metadata.
+
+Unlike eCryptfs, which is a stacked filesystem, fscrypt is integrated
+directly into supported filesystems --- currently ext4, F2FS, and
+UBIFS. This allows encrypted files to be read and written without
+caching both the decrypted and encrypted pages in the pagecache,
+thereby nearly halving the memory used and bringing it in line with
+unencrypted files. Similarly, half as many dentries and inodes are
+needed. eCryptfs also limits encrypted filenames to 143 bytes,
+causing application compatibility issues; fscrypt allows the full 255
+bytes (NAME_MAX). Finally, unlike eCryptfs, the fscrypt API can be
+used by unprivileged users, with no need to mount anything.
+
+fscrypt does not support encrypting files in-place. Instead, it
+supports marking an empty directory as encrypted. Then, after
+userspace provides the key, all regular files, directories, and
+symbolic links created in that directory tree are transparently
+encrypted.
+
+Threat model
+============
+
+Offline attacks
+---------------
+
+Provided that userspace chooses a strong encryption key, fscrypt
+protects the confidentiality of file contents and filenames in the
+event of a single point-in-time permanent offline compromise of the
+block device content. fscrypt does not protect the confidentiality of
+non-filename metadata, e.g. file sizes, file permissions, file
+timestamps, and extended attributes. Also, the existence and location
+of holes (unallocated blocks which logically contain all zeroes) in
+files is not protected.
+
+fscrypt is not guaranteed to protect confidentiality or authenticity
+if an attacker is able to manipulate the filesystem offline prior to
+an authorized user later accessing the filesystem.
+
+Online attacks
+--------------
+
+fscrypt (and storage encryption in general) can only provide limited
+protection, if any at all, against online attacks. In detail:
+
+fscrypt is only resistant to side-channel attacks, such as timing or
+electromagnetic attacks, to the extent that the underlying Linux
+Cryptographic API algorithms are. If a vulnerable algorithm is used,
+such as a table-based implementation of AES, it may be possible for an
+attacker to mount a side channel attack against the online system.
+Side channel attacks may also be mounted against applications
+consuming decrypted data.
+
+After an encryption key has been provided, fscrypt is not designed to
+hide the plaintext file contents or filenames from other users on the
+same system, regardless of the visibility of the keyring key.
+Instead, existing access control mechanisms such as file mode bits,
+POSIX ACLs, LSMs, or mount namespaces should be used for this purpose.
+Also note that as long as the encryption keys are *anywhere* in
+memory, an online attacker can necessarily compromise them by mounting
+a physical attack or by exploiting any kernel security vulnerability
+which provides an arbitrary memory read primitive.
+
+While it is ostensibly possible to "evict" keys from the system,
+recently accessed encrypted files will remain accessible at least
+until the filesystem is unmounted or the VFS caches are dropped, e.g.
+using ``echo 2 > /proc/sys/vm/drop_caches``. Even after that, if the
+RAM is compromised before being powered off, it will likely still be
+possible to recover portions of the plaintext file contents, if not
+some of the encryption keys as well. (Since Linux v4.12, all
+in-kernel keys related to fscrypt are sanitized before being freed.
+However, userspace would need to do its part as well.)
+
+Currently, fscrypt does not prevent a user from maliciously providing
+an incorrect key for another user's existing encrypted files. A
+protection against this is planned.
+
+Key hierarchy
+=============
+
+Master Keys
+-----------
+
+Each encrypted directory tree is protected by a *master key*. Master
+keys can be up to 64 bytes long, and must be at least as long as the
+greater of the key length needed by the contents and filenames
+encryption modes being used. For example, if AES-256-XTS is used for
+contents encryption, the master key must be 64 bytes (512 bits). Note
+that the XTS mode is defined to require a key twice as long as that
+required by the underlying block cipher.
+
+To "unlock" an encrypted directory tree, userspace must provide the
+appropriate master key. There can be any number of master keys, each
+of which protects any number of directory trees on any number of
+filesystems.
+
+Userspace should generate master keys either using a cryptographically
+secure random number generator, or by using a KDF (Key Derivation
+Function). Note that whenever a KDF is used to "stretch" a
+lower-entropy secret such as a passphrase, it is critical that a KDF
+designed for this purpose be used, such as scrypt, PBKDF2, or Argon2.
+
+Per-file keys
+-------------
+
+Since each master key can protect many files, it is necessary to
+"tweak" the encryption of each file so that the same plaintext in two
+files doesn't map to the same ciphertext, or vice versa. In most
+cases, fscrypt does this by deriving per-file keys. When a new
+encrypted inode (regular file, directory, or symlink) is created,
+fscrypt randomly generates a 16-byte nonce and stores it in the
+inode's encryption xattr. Then, it uses a KDF (Key Derivation
+Function) to derive the file's key from the master key and nonce.
+
+The Adiantum encryption mode (see `Encryption modes and usage`_) is
+special, since it accepts longer IVs and is suitable for both contents
+and filenames encryption. For it, a "direct key" option is offered
+where the file's nonce is included in the IVs and the master key is
+used for encryption directly. This improves performance; however,
+users must not use the same master key for any other encryption mode.
+
+Below, the KDF and design considerations are described in more detail.
+
+The current KDF works by encrypting the master key with AES-128-ECB,
+using the file's nonce as the AES key. The output is used as the
+derived key. If the output is longer than needed, then it is
+truncated to the needed length.
+
+Note: this KDF meets the primary security requirement, which is to
+produce unique derived keys that preserve the entropy of the master
+key, assuming that the master key is already a good pseudorandom key.
+However, it is nonstandard and has some problems such as being
+reversible, so it is generally considered to be a mistake! It may be
+replaced with HKDF or another more standard KDF in the future.
+
+Key derivation was chosen over key wrapping because wrapped keys would
+require larger xattrs which would be less likely to fit in-line in the
+filesystem's inode table, and there didn't appear to be any
+significant advantages to key wrapping. In particular, currently
+there is no requirement to support unlocking a file with multiple
+alternative master keys or to support rotating master keys. Instead,
+the master keys may be wrapped in userspace, e.g. as is done by the
+`fscrypt <https://github.com/google/fscrypt>`_ tool.
+
+Including the inode number in the IVs was considered. However, it was
+rejected as it would have prevented ext4 filesystems from being
+resized, and by itself still wouldn't have been sufficient to prevent
+the same key from being directly reused for both XTS and CTS-CBC.
+
+Encryption modes and usage
+==========================
+
+fscrypt allows one encryption mode to be specified for file contents
+and one encryption mode to be specified for filenames. Different
+directory trees are permitted to use different encryption modes.
+Currently, the following pairs of encryption modes are supported:
+
+- AES-256-XTS for contents and AES-256-CTS-CBC for filenames
+- AES-128-CBC for contents and AES-128-CTS-CBC for filenames
+- Adiantum for both contents and filenames
+
+If unsure, you should use the (AES-256-XTS, AES-256-CTS-CBC) pair.
+
+AES-128-CBC was added only for low-powered embedded devices with
+crypto accelerators such as CAAM or CESA that do not support XTS.
+
+Adiantum is a (primarily) stream cipher-based mode that is fast even
+on CPUs without dedicated crypto instructions. It's also a true
+wide-block mode, unlike XTS. It can also eliminate the need to derive
+per-file keys. However, it depends on the security of two primitives,
+XChaCha12 and AES-256, rather than just one. See the paper
+"Adiantum: length-preserving encryption for entry-level processors"
+(https://eprint.iacr.org/2018/720.pdf) for more details. To use
+Adiantum, CONFIG_CRYPTO_ADIANTUM must be enabled. Also, fast
+implementations of ChaCha and NHPoly1305 should be enabled, e.g.
+CONFIG_CRYPTO_CHACHA20_NEON and CONFIG_CRYPTO_NHPOLY1305_NEON for ARM.
+
+New encryption modes can be added relatively easily, without changes
+to individual filesystems. However, authenticated encryption (AE)
+modes are not currently supported because of the difficulty of dealing
+with ciphertext expansion.
+
+Contents encryption
+-------------------
+
+For file contents, each filesystem block is encrypted independently.
+Currently, only the case where the filesystem block size is equal to
+the system's page size (usually 4096 bytes) is supported.
+
+Each block's IV is set to the logical block number within the file as
+a little endian number, except that:
+
+- With CBC mode encryption, ESSIV is also used. Specifically, each IV
+ is encrypted with AES-256 where the AES-256 key is the SHA-256 hash
+ of the file's data encryption key.
+
+- In the "direct key" configuration (FS_POLICY_FLAG_DIRECT_KEY set in
+ the fscrypt_policy), the file's nonce is also appended to the IV.
+ Currently this is only allowed with the Adiantum encryption mode.
+
+Filenames encryption
+--------------------
+
+For filenames, each full filename is encrypted at once. Because of
+the requirements to retain support for efficient directory lookups and
+filenames of up to 255 bytes, the same IV is used for every filename
+in a directory.
+
+However, each encrypted directory still uses a unique key; or
+alternatively (for the "direct key" configuration) has the file's
+nonce included in the IVs. Thus, IV reuse is limited to within a
+single directory.
+
+With CTS-CBC, the IV reuse means that when the plaintext filenames
+share a common prefix at least as long as the cipher block size (16
+bytes for AES), the corresponding encrypted filenames will also share
+a common prefix. This is undesirable. Adiantum does not have this
+weakness, as it is a wide-block encryption mode.
+
+All supported filenames encryption modes accept any plaintext length
+>= 16 bytes; cipher block alignment is not required. However,
+filenames shorter than 16 bytes are NUL-padded to 16 bytes before
+being encrypted. In addition, to reduce leakage of filename lengths
+via their ciphertexts, all filenames are NUL-padded to the next 4, 8,
+16, or 32-byte boundary (configurable). 32 is recommended since this
+provides the best confidentiality, at the cost of making directory
+entries consume slightly more space. Note that since NUL (``\0``) is
+not otherwise a valid character in filenames, the padding will never
+produce duplicate plaintexts.
+
+Symbolic link targets are considered a type of filename and are
+encrypted in the same way as filenames in directory entries, except
+that IV reuse is not a problem as each symlink has its own inode.
+
+User API
+========
+
+Setting an encryption policy
+----------------------------
+
+The FS_IOC_SET_ENCRYPTION_POLICY ioctl sets an encryption policy on an
+empty directory or verifies that a directory or regular file already
+has the specified encryption policy. It takes in a pointer to a
+:c:type:`struct fscrypt_policy`, defined as follows::
+
+ #define FS_KEY_DESCRIPTOR_SIZE 8
+
+ struct fscrypt_policy {
+ __u8 version;
+ __u8 contents_encryption_mode;
+ __u8 filenames_encryption_mode;
+ __u8 flags;
+ __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+ };
+
+This structure must be initialized as follows:
+
+- ``version`` must be 0.
+
+- ``contents_encryption_mode`` and ``filenames_encryption_mode`` must
+ be set to constants from ``<linux/fs.h>`` which identify the
+ encryption modes to use. If unsure, use
+ FS_ENCRYPTION_MODE_AES_256_XTS (1) for ``contents_encryption_mode``
+ and FS_ENCRYPTION_MODE_AES_256_CTS (4) for
+ ``filenames_encryption_mode``.
+
+- ``flags`` must contain a value from ``<linux/fs.h>`` which
+ identifies the amount of NUL-padding to use when encrypting
+ filenames. If unsure, use FS_POLICY_FLAGS_PAD_32 (0x3).
+ In addition, if the chosen encryption modes are both
+ FS_ENCRYPTION_MODE_ADIANTUM, this can contain
+ FS_POLICY_FLAG_DIRECT_KEY to specify that the master key should be
+ used directly, without key derivation.
+
+- ``master_key_descriptor`` specifies how to find the master key in
+ the keyring; see `Adding keys`_. It is up to userspace to choose a
+ unique ``master_key_descriptor`` for each master key. The e4crypt
+ and fscrypt tools use the first 8 bytes of
+ ``SHA-512(SHA-512(master_key))``, but this particular scheme is not
+ required. Also, the master key need not be in the keyring yet when
+ FS_IOC_SET_ENCRYPTION_POLICY is executed. However, it must be added
+ before any files can be created in the encrypted directory.
+
+If the file is not yet encrypted, then FS_IOC_SET_ENCRYPTION_POLICY
+verifies that the file is an empty directory. If so, the specified
+encryption policy is assigned to the directory, turning it into an
+encrypted directory. After that, and after providing the
+corresponding master key as described in `Adding keys`_, all regular
+files, directories (recursively), and symlinks created in the
+directory will be encrypted, inheriting the same encryption policy.
+The filenames in the directory's entries will be encrypted as well.
+
+Alternatively, if the file is already encrypted, then
+FS_IOC_SET_ENCRYPTION_POLICY validates that the specified encryption
+policy exactly matches the actual one. If they match, then the ioctl
+returns 0. Otherwise, it fails with EEXIST. This works on both
+regular files and directories, including nonempty directories.
+
+Note that the ext4 filesystem does not allow the root directory to be
+encrypted, even if it is empty. Users who want to encrypt an entire
+filesystem with one key should consider using dm-crypt instead.
+
+FS_IOC_SET_ENCRYPTION_POLICY can fail with the following errors:
+
+- ``EACCES``: the file is not owned by the process's uid, nor does the
+ process have the CAP_FOWNER capability in a namespace with the file
+ owner's uid mapped
+- ``EEXIST``: the file is already encrypted with an encryption policy
+ different from the one specified
+- ``EINVAL``: an invalid encryption policy was specified (invalid
+ version, mode(s), or flags)
+- ``ENOTDIR``: the file is unencrypted and is a regular file, not a
+ directory
+- ``ENOTEMPTY``: the file is unencrypted and is a nonempty directory
+- ``ENOTTY``: this type of filesystem does not implement encryption
+- ``EOPNOTSUPP``: the kernel was not configured with encryption
+ support for this filesystem, or the filesystem superblock has not
+ had encryption enabled on it. (For example, to use encryption on an
+ ext4 filesystem, CONFIG_EXT4_ENCRYPTION must be enabled in the
+ kernel config, and the superblock must have had the "encrypt"
+ feature flag enabled using ``tune2fs -O encrypt`` or ``mkfs.ext4 -O
+ encrypt``.)
+- ``EPERM``: this directory may not be encrypted, e.g. because it is
+ the root directory of an ext4 filesystem
+- ``EROFS``: the filesystem is readonly
+
+Getting an encryption policy
+----------------------------
+
+The FS_IOC_GET_ENCRYPTION_POLICY ioctl retrieves the :c:type:`struct
+fscrypt_policy`, if any, for a directory or regular file. See above
+for the struct definition. No additional permissions are required
+beyond the ability to open the file.
+
+FS_IOC_GET_ENCRYPTION_POLICY can fail with the following errors:
+
+- ``EINVAL``: the file is encrypted, but it uses an unrecognized
+ encryption context format
+- ``ENODATA``: the file is not encrypted
+- ``ENOTTY``: this type of filesystem does not implement encryption
+- ``EOPNOTSUPP``: the kernel was not configured with encryption
+ support for this filesystem
+
+Note: if you only need to know whether a file is encrypted or not, on
+most filesystems it is also possible to use the FS_IOC_GETFLAGS ioctl
+and check for FS_ENCRYPT_FL, or to use the statx() system call and
+check for STATX_ATTR_ENCRYPTED in stx_attributes.
+
+Getting the per-filesystem salt
+-------------------------------
+
+Some filesystems, such as ext4 and F2FS, also support the deprecated
+ioctl FS_IOC_GET_ENCRYPTION_PWSALT. This ioctl retrieves a randomly
+generated 16-byte value stored in the filesystem superblock. This
+value is intended to used as a salt when deriving an encryption key
+from a passphrase or other low-entropy user credential.
+
+FS_IOC_GET_ENCRYPTION_PWSALT is deprecated. Instead, prefer to
+generate and manage any needed salt(s) in userspace.
+
+Adding keys
+-----------
+
+To provide a master key, userspace must add it to an appropriate
+keyring using the add_key() system call (see:
+``Documentation/security/keys/core.rst``). The key type must be
+"logon"; keys of this type are kept in kernel memory and cannot be
+read back by userspace. The key description must be "fscrypt:"
+followed by the 16-character lower case hex representation of the
+``master_key_descriptor`` that was set in the encryption policy. The
+key payload must conform to the following structure::
+
+ #define FS_MAX_KEY_SIZE 64
+
+ struct fscrypt_key {
+ u32 mode;
+ u8 raw[FS_MAX_KEY_SIZE];
+ u32 size;
+ };
+
+``mode`` is ignored; just set it to 0. The actual key is provided in
+``raw`` with ``size`` indicating its size in bytes. That is, the
+bytes ``raw[0..size-1]`` (inclusive) are the actual key.
+
+The key description prefix "fscrypt:" may alternatively be replaced
+with a filesystem-specific prefix such as "ext4:". However, the
+filesystem-specific prefixes are deprecated and should not be used in
+new programs.
+
+There are several different types of keyrings in which encryption keys
+may be placed, such as a session keyring, a user session keyring, or a
+user keyring. Each key must be placed in a keyring that is "attached"
+to all processes that might need to access files encrypted with it, in
+the sense that request_key() will find the key. Generally, if only
+processes belonging to a specific user need to access a given
+encrypted directory and no session keyring has been installed, then
+that directory's key should be placed in that user's user session
+keyring or user keyring. Otherwise, a session keyring should be
+installed if needed, and the key should be linked into that session
+keyring, or in a keyring linked into that session keyring.
+
+Note: introducing the complex visibility semantics of keyrings here
+was arguably a mistake --- especially given that by design, after any
+process successfully opens an encrypted file (thereby setting up the
+per-file key), possessing the keyring key is not actually required for
+any process to read/write the file until its in-memory inode is
+evicted. In the future there probably should be a way to provide keys
+directly to the filesystem instead, which would make the intended
+semantics clearer.
+
+Access semantics
+================
+
+With the key
+------------
+
+With the encryption key, encrypted regular files, directories, and
+symlinks behave very similarly to their unencrypted counterparts ---
+after all, the encryption is intended to be transparent. However,
+astute users may notice some differences in behavior:
+
+- Unencrypted files, or files encrypted with a different encryption
+ policy (i.e. different key, modes, or flags), cannot be renamed or
+ linked into an encrypted directory; see `Encryption policy
+ enforcement`_. Attempts to do so will fail with EPERM. However,
+ encrypted files can be renamed within an encrypted directory, or
+ into an unencrypted directory.
+
+- Direct I/O is not supported on encrypted files. Attempts to use
+ direct I/O on such files will fall back to buffered I/O.
+
+- The fallocate operations FALLOC_FL_COLLAPSE_RANGE,
+ FALLOC_FL_INSERT_RANGE, and FALLOC_FL_ZERO_RANGE are not supported
+ on encrypted files and will fail with EOPNOTSUPP.
+
+- Online defragmentation of encrypted files is not supported. The
+ EXT4_IOC_MOVE_EXT and F2FS_IOC_MOVE_RANGE ioctls will fail with
+ EOPNOTSUPP.
+
+- The ext4 filesystem does not support data journaling with encrypted
+ regular files. It will fall back to ordered data mode instead.
+
+- DAX (Direct Access) is not supported on encrypted files.
+
+- The st_size of an encrypted symlink will not necessarily give the
+ length of the symlink target as required by POSIX. It will actually
+ give the length of the ciphertext, which will be slightly longer
+ than the plaintext due to NUL-padding and an extra 2-byte overhead.
+
+- The maximum length of an encrypted symlink is 2 bytes shorter than
+ the maximum length of an unencrypted symlink. For example, on an
+ EXT4 filesystem with a 4K block size, unencrypted symlinks can be up
+ to 4095 bytes long, while encrypted symlinks can only be up to 4093
+ bytes long (both lengths excluding the terminating null).
+
+Note that mmap *is* supported. This is possible because the pagecache
+for an encrypted file contains the plaintext, not the ciphertext.
+
+Without the key
+---------------
+
+Some filesystem operations may be performed on encrypted regular
+files, directories, and symlinks even before their encryption key has
+been provided:
+
+- File metadata may be read, e.g. using stat().
+
+- Directories may be listed, in which case the filenames will be
+ listed in an encoded form derived from their ciphertext. The
+ current encoding algorithm is described in `Filename hashing and
+ encoding`_. The algorithm is subject to change, but it is
+ guaranteed that the presented filenames will be no longer than
+ NAME_MAX bytes, will not contain the ``/`` or ``\0`` characters, and
+ will uniquely identify directory entries.
+
+ The ``.`` and ``..`` directory entries are special. They are always
+ present and are not encrypted or encoded.
+
+- Files may be deleted. That is, nondirectory files may be deleted
+ with unlink() as usual, and empty directories may be deleted with
+ rmdir() as usual. Therefore, ``rm`` and ``rm -r`` will work as
+ expected.
+
+- Symlink targets may be read and followed, but they will be presented
+ in encrypted form, similar to filenames in directories. Hence, they
+ are unlikely to point to anywhere useful.
+
+Without the key, regular files cannot be opened or truncated.
+Attempts to do so will fail with ENOKEY. This implies that any
+regular file operations that require a file descriptor, such as
+read(), write(), mmap(), fallocate(), and ioctl(), are also forbidden.
+
+Also without the key, files of any type (including directories) cannot
+be created or linked into an encrypted directory, nor can a name in an
+encrypted directory be the source or target of a rename, nor can an
+O_TMPFILE temporary file be created in an encrypted directory. All
+such operations will fail with ENOKEY.
+
+It is not currently possible to backup and restore encrypted files
+without the encryption key. This would require special APIs which
+have not yet been implemented.
+
+Encryption policy enforcement
+=============================
+
+After an encryption policy has been set on a directory, all regular
+files, directories, and symbolic links created in that directory
+(recursively) will inherit that encryption policy. Special files ---
+that is, named pipes, device nodes, and UNIX domain sockets --- will
+not be encrypted.
+
+Except for those special files, it is forbidden to have unencrypted
+files, or files encrypted with a different encryption policy, in an
+encrypted directory tree. Attempts to link or rename such a file into
+an encrypted directory will fail with EPERM. This is also enforced
+during ->lookup() to provide limited protection against offline
+attacks that try to disable or downgrade encryption in known locations
+where applications may later write sensitive data. It is recommended
+that systems implementing a form of "verified boot" take advantage of
+this by validating all top-level encryption policies prior to access.
+
+Implementation details
+======================
+
+Encryption context
+------------------
+
+An encryption policy is represented on-disk by a :c:type:`struct
+fscrypt_context`. It is up to individual filesystems to decide where
+to store it, but normally it would be stored in a hidden extended
+attribute. It should *not* be exposed by the xattr-related system
+calls such as getxattr() and setxattr() because of the special
+semantics of the encryption xattr. (In particular, there would be
+much confusion if an encryption policy were to be added to or removed
+from anything other than an empty directory.) The struct is defined
+as follows::
+
+ #define FS_KEY_DESCRIPTOR_SIZE 8
+ #define FS_KEY_DERIVATION_NONCE_SIZE 16
+
+ struct fscrypt_context {
+ u8 format;
+ u8 contents_encryption_mode;
+ u8 filenames_encryption_mode;
+ u8 flags;
+ u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+ u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+ };
+
+Note that :c:type:`struct fscrypt_context` contains the same
+information as :c:type:`struct fscrypt_policy` (see `Setting an
+encryption policy`_), except that :c:type:`struct fscrypt_context`
+also contains a nonce. The nonce is randomly generated by the kernel
+and is used to derive the inode's encryption key as described in
+`Per-file keys`_.
+
+Data path changes
+-----------------
+
+For the read path (->readpage()) of regular files, filesystems can
+read the ciphertext into the page cache and decrypt it in-place. The
+page lock must be held until decryption has finished, to prevent the
+page from becoming visible to userspace prematurely.
+
+For the write path (->writepage()) of regular files, filesystems
+cannot encrypt data in-place in the page cache, since the cached
+plaintext must be preserved. Instead, filesystems must encrypt into a
+temporary buffer or "bounce page", then write out the temporary
+buffer. Some filesystems, such as UBIFS, already use temporary
+buffers regardless of encryption. Other filesystems, such as ext4 and
+F2FS, have to allocate bounce pages specially for encryption.
+
+Filename hashing and encoding
+-----------------------------
+
+Modern filesystems accelerate directory lookups by using indexed
+directories. An indexed directory is organized as a tree keyed by
+filename hashes. When a ->lookup() is requested, the filesystem
+normally hashes the filename being looked up so that it can quickly
+find the corresponding directory entry, if any.
+
+With encryption, lookups must be supported and efficient both with and
+without the encryption key. Clearly, it would not work to hash the
+plaintext filenames, since the plaintext filenames are unavailable
+without the key. (Hashing the plaintext filenames would also make it
+impossible for the filesystem's fsck tool to optimize encrypted
+directories.) Instead, filesystems hash the ciphertext filenames,
+i.e. the bytes actually stored on-disk in the directory entries. When
+asked to do a ->lookup() with the key, the filesystem just encrypts
+the user-supplied name to get the ciphertext.
+
+Lookups without the key are more complicated. The raw ciphertext may
+contain the ``\0`` and ``/`` characters, which are illegal in
+filenames. Therefore, readdir() must base64-encode the ciphertext for
+presentation. For most filenames, this works fine; on ->lookup(), the
+filesystem just base64-decodes the user-supplied name to get back to
+the raw ciphertext.
+
+However, for very long filenames, base64 encoding would cause the
+filename length to exceed NAME_MAX. To prevent this, readdir()
+actually presents long filenames in an abbreviated form which encodes
+a strong "hash" of the ciphertext filename, along with the optional
+filesystem-specific hash(es) needed for directory lookups. This
+allows the filesystem to still, with a high degree of confidence, map
+the filename given in ->lookup() back to a particular directory entry
+that was previously listed by readdir(). See :c:type:`struct
+fscrypt_digested_name` in the source for more details.
+
+Note that the precise way that filenames are presented to userspace
+without the key is subject to change in the future. It is only meant
+as a way to temporarily present valid filenames so that commands like
+``rm -r`` work as expected on encrypted directories.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 74329fd0add2..6e027ae50d7e 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -392,6 +392,8 @@ is not associated with a file:
[stack] = the stack of the main process
[vdso] = the "virtual dynamic shared object",
the kernel system call handler
+ [anon:<name>] = an anonymous mapping that has been
+ named by userspace
or if empty, the mapping is anonymous.
@@ -419,6 +421,7 @@ KernelPageSize: 4 kB
MMUPageSize: 4 kB
Locked: 0 kB
VmFlags: rd ex mr mw me dw
+Name: name from userspace
the first of these lines shows the same information as is displayed for the
mapping in /proc/PID/maps. The remaining lines show the size of the mapping
@@ -486,6 +489,9 @@ Note that there is no guarantee that every flag and associated mnemonic will
be present in all further kernel releases. Things get changed, the flags may
be vanished or the reverse -- new added.
+The "Name" field will only be present on a mapping that has been named by
+userspace, and will show the name passed in by userspace.
+
This file is only present if the CONFIG_MMU kernel configuration option is
enabled.
diff --git a/Documentation/gpu/drm-kms.rst b/Documentation/gpu/drm-kms.rst
index 53b872c105d2..db86cda6fa34 100644
--- a/Documentation/gpu/drm-kms.rst
+++ b/Documentation/gpu/drm-kms.rst
@@ -308,6 +308,12 @@ Color Management Properties
.. kernel-doc:: drivers/gpu/drm/drm_color_mgmt.c
:export:
+Explicit Fencing Properties
+---------------------------
+
+.. kernel-doc:: drivers/gpu/drm/drm_atomic.c
+ :doc: explicit fencing properties
+
Existing KMS Properties
-----------------------
diff --git a/Documentation/input/event-codes.txt b/Documentation/input/event-codes.txt
index 36ea940e5bb9..575415f4cef0 100644
--- a/Documentation/input/event-codes.txt
+++ b/Documentation/input/event-codes.txt
@@ -301,7 +301,10 @@ them as any other INPUT_PROP_BUTTONPAD device.
INPUT_PROP_ACCELEROMETER
-------------------------
Directional axes on this device (absolute and/or relative x, y, z) represent
-accelerometer data. All other axes retain their meaning. A device must not mix
+accelerometer data. Some devices also report gyroscope data, which devices
+can report through the rotational axes (absolute and/or relative rx, ry, rz).
+
+All other axes retain their meaning. A device must not mix
regular directional axes and accelerometer axes on the same event node.
Guidelines:
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 81c7f2bb7daf..efb38da700c8 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -308,6 +308,7 @@ Code Seq#(hex) Include File Comments
0xA3 80-8F Port ACL in development:
<mailto:tlewis@mindspring.com>
0xA3 90-9F linux/dtlk.h
+0xA4 00-1F uapi/linux/tee.h Generic TEE subsystem
0xAA 00-3F linux/uapi/linux/userfaultfd.h
0xAB 00-1F linux/nbd.h
0xAC 00-1F linux/raw.h
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c708a50b060e..26a55938f76f 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -87,6 +87,7 @@ parameter is applicable:
BLACKFIN Blackfin architecture is enabled.
CLK Common clock infrastructure is enabled.
CMA Contiguous Memory Area support is enabled.
+ DM Device mapper support is enabled.
DRM Direct Rendering Management support is enabled.
DYNAMIC_DEBUG Build in debug messages and enable them at runtime
EDD BIOS Enhanced Disk Drive Services (EDD) is enabled
@@ -1034,6 +1035,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
dis_ucode_ldr [X86] Disable the microcode loader.
+ dm= [DM] Allows early creation of a device-mapper device.
+ See Documentation/device-mapper/boot.txt.
+
+ dmasound= [HW,OSS] Sound subsystem buff
+
dma_debug=off If the kernel is compiled with DMA_API_DEBUG support,
this option disables the debugging code at boot.
@@ -1896,6 +1902,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
kernel and module base offset ASLR (Address Space
Layout Randomization).
+ kasan_multi_shot
+ [KNL] Enforce KASAN (Kernel Address Sanitizer) to print
+ report on every invalid memory access. Without this
+ parameter KASAN will print report only for the first
+ invalid access.
+
keepinitrd [HW,ARM]
kernelcore= [KNL,X86,IA-64,PPC]
@@ -3967,6 +3979,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
last alloc / free. For more information see
Documentation/vm/slub.txt.
+ slub_memcg_sysfs= [MM, SLUB]
+ Determines whether to enable sysfs directories for
+ memory cgroup sub-caches. 1 to enable, 0 to disable.
+ The default is determined by CONFIG_SLUB_MEMCG_SYSFS_ON.
+ Enabling this can lead to a very high number of debug
+ directories and files being created under
+ /sys/kernel/slub.
+
slub_max_order= [MM, SLUB]
Determines the maximum allowed order for slabs.
A high setting may cause OOMs due to memory
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index dbdc4130e149..8b93b3e79639 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -600,6 +600,16 @@ tcp_fastopen - INTEGER
Note that that additional client or server features are only
effective if the basic support (0x1 and 0x2) are enabled respectively.
+tcp_fwmark_accept - BOOLEAN
+ If set, incoming connections to listening sockets that do not have a
+ socket mark will set the mark of the accepting socket to the fwmark of
+ the incoming SYN packet. This will cause all packets on that connection
+ (starting from the first SYNACK) to be sent with that fwmark. The
+ listening socket's mark is unchanged. Listening sockets that already
+ have a fwmark set via setsockopt(SOL_SOCKET, SO_MARK, ...) are
+ unaffected.
+ Default: 0
+
tcp_syn_retries - INTEGER
Number of times initial SYNs for an active TCP connection attempt
will be retransmitted. Should not be higher than 127. Default value
@@ -1439,11 +1449,20 @@ accept_ra_pinfo - BOOLEAN
Functional default: enabled if accept_ra is enabled.
disabled if accept_ra is disabled.
+accept_ra_rt_info_min_plen - INTEGER
+ Minimum prefix length of Route Information in RA.
+
+ Route Information w/ prefix smaller than this variable shall
+ be ignored.
+
+ Functional default: 0 if accept_ra_rtr_pref is enabled.
+ -1 if accept_ra_rtr_pref is disabled.
+
accept_ra_rt_info_max_plen - INTEGER
Maximum prefix length of Route Information in RA.
- Route Information w/ prefix larger than or equal to this
- variable shall be ignored.
+ Route Information w/ prefix larger than this variable shall
+ be ignored.
Functional default: 0 if accept_ra_rtr_pref is enabled.
-1 if accept_ra_rtr_pref is disabled.
diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.txt
new file mode 100644
index 000000000000..dab2f9088b33
--- /dev/null
+++ b/Documentation/scheduler/sched-energy.txt
@@ -0,0 +1,362 @@
+Energy cost model for energy-aware scheduling (EXPERIMENTAL)
+
+Introduction
+=============
+
+The basic energy model uses platform energy data stored in sched_group_energy
+data structures attached to the sched_groups in the sched_domain hierarchy. The
+energy cost model offers two functions that can be used to guide scheduling
+decisions:
+
+1. static unsigned int sched_group_energy(struct energy_env *eenv)
+2. static int energy_diff(struct energy_env *eenv)
+
+sched_group_energy() estimates the energy consumed by all cpus in a specific
+sched_group including any shared resources owned exclusively by this group of
+cpus. Resources shared with other cpus are excluded (e.g. later level caches).
+
+energy_diff() estimates the total energy impact of a utilization change. That
+is, adding, removing, or migrating utilization (tasks).
+
+Both functions use a struct energy_env to specify the scenario to be evaluated:
+
+ struct energy_env {
+ struct sched_group *sg_top;
+ struct sched_group *sg_cap;
+ int cap_idx;
+ int util_delta;
+ int src_cpu;
+ int dst_cpu;
+ int energy;
+ };
+
+sg_top: sched_group to be evaluated. Not used by energy_diff().
+
+sg_cap: sched_group covering the cpus in the same frequency domain. Set by
+sched_group_energy().
+
+cap_idx: Capacity state to be used for energy calculations. Set by
+find_new_capacity().
+
+util_delta: Amount of utilization to be added, removed, or migrated.
+
+src_cpu: Source cpu from where 'util_delta' utilization is removed. Should be
+-1 if no source (e.g. task wake-up).
+
+dst_cpu: Destination cpu where 'util_delta' utilization is added. Should be -1
+if utilization is removed (e.g. terminating tasks).
+
+energy: Result of sched_group_energy().
+
+The metric used to represent utilization is the actual per-entity running time
+averaged over time using a geometric series. Very similar to the existing
+per-entity load-tracking, but _not_ scaled by task priority and capped by the
+capacity of the cpu. The latter property does mean that utilization may
+underestimate the compute requirements for task on fully/over utilized cpus.
+The greatest potential for energy savings without affecting performance too much
+is scenarios where the system isn't fully utilized. If the system is deemed
+fully utilized load-balancing should be done with task load (includes task
+priority) instead in the interest of fairness and performance.
+
+
+Background and Terminology
+===========================
+
+To make it clear from the start:
+
+energy = [joule] (resource like a battery on powered devices)
+power = energy/time = [joule/second] = [watt]
+
+The goal of energy-aware scheduling is to minimize energy, while still getting
+the job done. That is, we want to maximize:
+
+ performance [inst/s]
+ --------------------
+ power [W]
+
+which is equivalent to minimizing:
+
+ energy [J]
+ -----------
+ instruction
+
+while still getting 'good' performance. It is essentially an alternative
+optimization objective to the current performance-only objective for the
+scheduler. This alternative considers two objectives: energy-efficiency and
+performance. Hence, there needs to be a user controllable knob to switch the
+objective. Since it is early days, this is currently a sched_feature
+(ENERGY_AWARE).
+
+The idea behind introducing an energy cost model is to allow the scheduler to
+evaluate the implications of its decisions rather than applying energy-saving
+techniques blindly that may only have positive effects on some platforms. At
+the same time, the energy cost model must be as simple as possible to minimize
+the scheduler latency impact.
+
+Platform topology
+------------------
+
+The system topology (cpus, caches, and NUMA information, not peripherals) is
+represented in the scheduler by the sched_domain hierarchy which has
+sched_groups attached at each level that covers one or more cpus (see
+sched-domains.txt for more details). To add energy awareness to the scheduler
+we need to consider power and frequency domains.
+
+Power domain:
+
+A power domain is a part of the system that can be powered on/off
+independently. Power domains are typically organized in a hierarchy where you
+may be able to power down just a cpu or a group of cpus along with any
+associated resources (e.g. shared caches). Powering up a cpu means that all
+power domains it is a part of in the hierarchy must be powered up. Hence, it is
+more expensive to power up the first cpu that belongs to a higher level power
+domain than powering up additional cpus in the same high level domain. Two
+level power domain hierarchy example:
+
+ Power source
+ +-------------------------------+----...
+per group PD G G
+ | +----------+ |
+ +--------+-------| Shared | (other groups)
+per-cpu PD G G | resource |
+ | | +----------+
+ +-------+ +-------+
+ | CPU 0 | | CPU 1 |
+ +-------+ +-------+
+
+Frequency domain:
+
+Frequency domains (P-states) typically cover the same group of cpus as one of
+the power domain levels. That is, there might be several smaller power domains
+sharing the same frequency (P-state) or there might be a power domain spanning
+multiple frequency domains.
+
+From a scheduling point of view there is no need to know the actual frequencies
+[Hz]. All the scheduler cares about is the compute capacity available at the
+current state (P-state) the cpu is in and any other available states. For that
+reason, and to also factor in any cpu micro-architecture differences, compute
+capacity scaling states are called 'capacity states' in this document. For SMP
+systems this is equivalent to P-states. For mixed micro-architecture systems
+(like ARM big.LITTLE) it is P-states scaled according to the micro-architecture
+performance relative to the other cpus in the system.
+
+Energy modelling:
+------------------
+
+Due to the hierarchical nature of the power domains, the most obvious way to
+model energy costs is therefore to associate power and energy costs with
+domains (groups of cpus). Energy costs of shared resources are associated with
+the group of cpus that share the resources, only the cost of powering the
+cpu itself and any private resources (e.g. private L1 caches) is associated
+with the per-cpu groups (lowest level).
+
+For example, for an SMP system with per-cpu power domains and a cluster level
+(group of cpus) power domain we get the overall energy costs to be:
+
+ energy = energy_cluster + n * energy_cpu
+
+where 'n' is the number of cpus powered up and energy_cluster is the cost paid
+as soon as any cpu in the cluster is powered up.
+
+The power and frequency domains can naturally be mapped onto the existing
+sched_domain hierarchy and sched_groups by adding the necessary data to the
+existing data structures.
+
+The energy model considers energy consumption from two contributors (shown in
+the illustration below):
+
+1. Busy energy: Energy consumed while a cpu and the higher level groups that it
+belongs to are busy running tasks. Busy energy is associated with the state of
+the cpu, not an event. The time the cpu spends in this state varies. Thus, the
+most obvious platform parameter for this contribution is busy power
+(energy/time).
+
+2. Idle energy: Energy consumed while a cpu and higher level groups that it
+belongs to are idle (in a C-state). Like busy energy, idle energy is associated
+with the state of the cpu. Thus, the platform parameter for this contribution
+is idle power (energy/time).
+
+Energy consumed during transitions from an idle-state (C-state) to a busy state
+(P-state) or going the other way is ignored by the model to simplify the energy
+model calculations.
+
+
+ Power
+ ^
+ | busy->idle idle->busy
+ | transition transition
+ |
+ | _ __
+ | / \ / \__________________
+ |______________/ \ /
+ | \ /
+ | Busy \ Idle / Busy
+ | low P-state \____________/ high P-state
+ |
+ +------------------------------------------------------------> time
+
+Busy |--------------| |-----------------|
+
+Wakeup |------| |------|
+
+Idle |------------|
+
+
+The basic algorithm
+====================
+
+The basic idea is to determine the total energy impact when utilization is
+added or removed by estimating the impact at each level in the sched_domain
+hierarchy starting from the bottom (sched_group contains just a single cpu).
+The energy cost comes from busy time (sched_group is awake because one or more
+cpus are busy) and idle time (in an idle-state). Energy model numbers account
+for energy costs associated with all cpus in the sched_group as a group.
+
+ for_each_domain(cpu, sd) {
+ sg = sched_group_of(cpu)
+ energy_before = curr_util(sg) * busy_power(sg)
+ + (1-curr_util(sg)) * idle_power(sg)
+ energy_after = new_util(sg) * busy_power(sg)
+ + (1-new_util(sg)) * idle_power(sg)
+ energy_diff += energy_before - energy_after
+
+ }
+
+ return energy_diff
+
+{curr, new}_util: The cpu utilization at the lowest level and the overall
+non-idle time for the entire group for higher levels. Utilization is in the
+range 0.0 to 1.0 in the pseudo-code.
+
+busy_power: The power consumption of the sched_group.
+
+idle_power: The power consumption of the sched_group when idle.
+
+Note: It is a fundamental assumption that the utilization is (roughly) scale
+invariant. Task utilization tracking factors in any frequency scaling and
+performance scaling differences due to difference cpu microarchitectures such
+that task utilization can be used across the entire system.
+
+
+Platform energy data
+=====================
+
+struct sched_group_energy can be attached to sched_groups in the sched_domain
+hierarchy and has the following members:
+
+cap_states:
+ List of struct capacity_state representing the supported capacity states
+ (P-states). struct capacity_state has two members: cap and power, which
+ represents the compute capacity and the busy_power of the state. The
+ list must be ordered by capacity low->high.
+
+nr_cap_states:
+ Number of capacity states in cap_states list.
+
+idle_states:
+ List of struct idle_state containing idle_state power cost for each
+ idle-state supported by the system orderd by shallowest state first.
+ All states must be included at all level in the hierarchy, i.e. a
+ sched_group spanning just a single cpu must also include coupled
+ idle-states (cluster states). In addition to the cpuidle idle-states,
+ the list must also contain an entry for the idling using the arch
+ default idle (arch_idle_cpu()). Despite this state may not be a true
+ hardware idle-state it is considered the shallowest idle-state in the
+ energy model and must be the first entry. cpus may enter this state
+ (possibly 'active idling') if cpuidle decides not enter a cpuidle
+ idle-state. Default idle may not be used when cpuidle is enabled.
+ In this case, it should just be a copy of the first cpuidle idle-state.
+
+nr_idle_states:
+ Number of idle states in idle_states list.
+
+There are no unit requirements for the energy cost data. Data can be normalized
+with any reference, however, the normalization must be consistent across all
+energy cost data. That is, one bogo-joule/watt must be the same quantity for
+data, but we don't care what it is.
+
+A recipe for platform characterization
+=======================================
+
+Obtaining the actual model data for a particular platform requires some way of
+measuring power/energy. There isn't a tool to help with this (yet). This
+section provides a recipe for use as reference. It covers the steps used to
+characterize the ARM TC2 development platform. This sort of measurements is
+expected to be done anyway when tuning cpuidle and cpufreq for a given
+platform.
+
+The energy model needs two types of data (struct sched_group_energy holds
+these) for each sched_group where energy costs should be taken into account:
+
+1. Capacity state information
+
+A list containing the compute capacity and power consumption when fully
+utilized attributed to the group as a whole for each available capacity state.
+At the lowest level (group contains just a single cpu) this is the power of the
+cpu alone without including power consumed by resources shared with other cpus.
+It basically needs to fit the basic modelling approach described in "Background
+and Terminology" section:
+
+ energy_system = energy_shared + n * energy_cpu
+
+for a system containing 'n' busy cpus. Only 'energy_cpu' should be included at
+the lowest level. 'energy_shared' is included at the next level which
+represents the group of cpus among which the resources are shared.
+
+This model is, of course, a simplification of reality. Thus, power/energy
+attributions might not always exactly represent how the hardware is designed.
+Also, busy power is likely to depend on the workload. It is therefore
+recommended to use a representative mix of workloads when characterizing the
+capacity states.
+
+If the group has no capacity scaling support, the list will contain a single
+state where power is the busy power attributed to the group. The capacity
+should be set to a default value (1024).
+
+When frequency domains include multiple power domains, the group representing
+the frequency domain and all child groups share capacity states. This must be
+indicated by setting the SD_SHARE_CAP_STATES sched_domain flag. All groups at
+all levels that share the capacity state must have the list of capacity states
+with the power set to the contribution of the individual group.
+
+2. Idle power information
+
+Stored in the idle_states list. The power number is the group idle power
+consumption in each idle state as well when the group is idle but has not
+entered an idle-state ('active idle' as mentioned earlier). Due to the way the
+energy model is defined, the idle power of the deepest group idle state can
+alternatively be accounted for in the parent group busy power. In that case the
+group idle state power values are offset such that the idle power of the
+deepest state is zero. It is less intuitive, but it is easier to measure as
+idle power consumed by the group and the busy/idle power of the parent group
+cannot be distinguished without per group measurement points.
+
+Measuring capacity states and idle power:
+
+The capacity states' capacity and power can be estimated by running a benchmark
+workload at each available capacity state. By restricting the benchmark to run
+on subsets of cpus it is possible to extrapolate the power consumption of
+shared resources.
+
+ARM TC2 has two clusters of two and three cpus respectively. Each cluster has a
+shared L2 cache. TC2 has on-chip energy counters per cluster. Running a
+benchmark workload on just one cpu in a cluster means that power is consumed in
+the cluster (higher level group) and a single cpu (lowest level group). Adding
+another benchmark task to another cpu increases the power consumption by the
+amount consumed by the additional cpu. Hence, it is possible to extrapolate the
+cluster busy power.
+
+For platforms that don't have energy counters or equivalent instrumentation
+built-in, it may be possible to use an external DAQ to acquire similar data.
+
+If the benchmark includes some performance score (for example sysbench cpu
+benchmark), this can be used to record the compute capacity.
+
+Measuring idle power requires insight into the idle state implementation on the
+particular platform. Specifically, if the platform has coupled idle-states (or
+package states). To measure non-coupled per-cpu idle-states it is necessary to
+keep one cpu busy to keep any shared resources alive to isolate the idle power
+of the cpu from idle/busy power of the shared resources. The cpu can be tricked
+into different per-cpu idle states by disabling the other states. Based on
+various combinations of measurements with specific cpus busy and disabling
+idle-states it is possible to extrapolate the idle-state power.
diff --git a/Documentation/scheduler/sched-tune.txt b/Documentation/scheduler/sched-tune.txt
new file mode 100644
index 000000000000..9bd2231c01b1
--- /dev/null
+++ b/Documentation/scheduler/sched-tune.txt
@@ -0,0 +1,366 @@
+ Central, scheduler-driven, power-performance control
+ (EXPERIMENTAL)
+
+Abstract
+========
+
+The topic of a single simple power-performance tunable, that is wholly
+scheduler centric, and has well defined and predictable properties has come up
+on several occasions in the past [1,2]. With techniques such as a scheduler
+driven DVFS [3], we now have a good framework for implementing such a tunable.
+This document describes the overall ideas behind its design and implementation.
+
+
+Table of Contents
+=================
+
+1. Motivation
+2. Introduction
+3. Signal Boosting Strategy
+4. OPP selection using boosted CPU utilization
+5. Per task group boosting
+6. Question and Answers
+ - What about "auto" mode?
+ - What about boosting on a congested system?
+ - How CPUs are boosted when we have tasks with multiple boost values?
+7. References
+
+
+1. Motivation
+=============
+
+Sched-DVFS [3] is a new event-driven cpufreq governor which allows the
+scheduler to select the optimal DVFS operating point (OPP) for running a task
+allocated to a CPU. The introduction of sched-DVFS enables running workloads at
+the most energy efficient OPPs.
+
+However, sometimes it may be desired to intentionally boost the performance of
+a workload even if that could imply a reasonable increase in energy
+consumption. For example, in order to reduce the response time of a task, we
+may want to run the task at a higher OPP than the one that is actually required
+by it's CPU bandwidth demand.
+
+This last requirement is especially important if we consider that one of the
+main goals of the sched-DVFS component is to replace all currently available
+CPUFreq policies. Since sched-DVFS is event based, as opposed to the sampling
+driven governors we currently have, it is already more responsive at selecting
+the optimal OPP to run tasks allocated to a CPU. However, just tracking the
+actual task load demand may not be enough from a performance standpoint. For
+example, it is not possible to get behaviors similar to those provided by the
+"performance" and "interactive" CPUFreq governors.
+
+This document describes an implementation of a tunable, stacked on top of the
+sched-DVFS which extends its functionality to support task performance
+boosting.
+
+By "performance boosting" we mean the reduction of the time required to
+complete a task activation, i.e. the time elapsed from a task wakeup to its
+next deactivation (e.g. because it goes back to sleep or it terminates). For
+example, if we consider a simple periodic task which executes the same workload
+for 5[s] every 20[s] while running at a certain OPP, a boosted execution of
+that task must complete each of its activations in less than 5[s].
+
+A previous attempt [5] to introduce such a boosting feature has not been
+successful mainly because of the complexity of the proposed solution. The
+approach described in this document exposes a single simple interface to
+user-space. This single tunable knob allows the tuning of system wide
+scheduler behaviours ranging from energy efficiency at one end through to
+incremental performance boosting at the other end. This first tunable affects
+all tasks. However, a more advanced extension of the concept is also provided
+which uses CGroups to boost the performance of only selected tasks while using
+the energy efficient default for all others.
+
+The rest of this document introduces in more details the proposed solution
+which has been named SchedTune.
+
+
+2. Introduction
+===============
+
+SchedTune exposes a simple user-space interface with a single power-performance
+tunable:
+
+ /proc/sys/kernel/sched_cfs_boost
+
+This permits expressing a boost value as an integer in the range [0..100].
+
+A value of 0 (default) configures the CFS scheduler for maximum energy
+efficiency. This means that sched-DVFS runs the tasks at the minimum OPP
+required to satisfy their workload demand.
+A value of 100 configures scheduler for maximum performance, which translates
+to the selection of the maximum OPP on that CPU.
+
+The range between 0 and 100 can be set to satisfy other scenarios suitably. For
+example to satisfy interactive response or depending on other system events
+(battery level etc).
+
+A CGroup based extension is also provided, which permits further user-space
+defined task classification to tune the scheduler for different goals depending
+on the specific nature of the task, e.g. background vs interactive vs
+low-priority.
+
+The overall design of the SchedTune module is built on top of "Per-Entity Load
+Tracking" (PELT) signals and sched-DVFS by introducing a bias on the Operating
+Performance Point (OPP) selection.
+Each time a task is allocated on a CPU, sched-DVFS has the opportunity to tune
+the operating frequency of that CPU to better match the workload demand. The
+selection of the actual OPP being activated is influenced by the global boost
+value, or the boost value for the task CGroup when in use.
+
+This simple biasing approach leverages existing frameworks, which means minimal
+modifications to the scheduler, and yet it allows to achieve a range of
+different behaviours all from a single simple tunable knob.
+The only new concept introduced is that of signal boosting.
+
+
+3. Signal Boosting Strategy
+===========================
+
+The whole PELT machinery works based on the value of a few load tracking signals
+which basically track the CPU bandwidth requirements for tasks and the capacity
+of CPUs. The basic idea behind the SchedTune knob is to artificially inflate
+some of these load tracking signals to make a task or RQ appears more demanding
+that it actually is.
+
+Which signals have to be inflated depends on the specific "consumer". However,
+independently from the specific (signal, consumer) pair, it is important to
+define a simple and possibly consistent strategy for the concept of boosting a
+signal.
+
+A boosting strategy defines how the "abstract" user-space defined
+sched_cfs_boost value is translated into an internal "margin" value to be added
+to a signal to get its inflated value:
+
+ margin := boosting_strategy(sched_cfs_boost, signal)
+ boosted_signal := signal + margin
+
+Different boosting strategies were identified and analyzed before selecting the
+one found to be most effective.
+
+Signal Proportional Compensation (SPC)
+--------------------------------------
+
+In this boosting strategy the sched_cfs_boost value is used to compute a
+margin which is proportional to the complement of the original signal.
+When a signal has a maximum possible value, its complement is defined as
+the delta from the actual value and its possible maximum.
+
+Since the tunable implementation uses signals which have SCHED_LOAD_SCALE as
+the maximum possible value, the margin becomes:
+
+ margin := sched_cfs_boost * (SCHED_LOAD_SCALE - signal)
+
+Using this boosting strategy:
+- a 100% sched_cfs_boost means that the signal is scaled to the maximum value
+- each value in the range of sched_cfs_boost effectively inflates the signal in
+ question by a quantity which is proportional to the maximum value.
+
+For example, by applying the SPC boosting strategy to the selection of the OPP
+to run a task it is possible to achieve these behaviors:
+
+- 0% boosting: run the task at the minimum OPP required by its workload
+- 100% boosting: run the task at the maximum OPP available for the CPU
+- 50% boosting: run at the half-way OPP between minimum and maximum
+
+Which means that, at 50% boosting, a task will be scheduled to run at half of
+the maximum theoretically achievable performance on the specific target
+platform.
+
+A graphical representation of an SPC boosted signal is represented in the
+following figure where:
+ a) "-" represents the original signal
+ b) "b" represents a 50% boosted signal
+ c) "p" represents a 100% boosted signal
+
+
+ ^
+ | SCHED_LOAD_SCALE
+ +-----------------------------------------------------------------+
+ |pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
+ |
+ | boosted_signal
+ | bbbbbbbbbbbbbbbbbbbbbbbb
+ |
+ | original signal
+ | bbbbbbbbbbbbbbbbbbbbbbbb+----------------------+
+ | |
+ |bbbbbbbbbbbbbbbbbb |
+ | |
+ | |
+ | |
+ | +-----------------------+
+ | |
+ | |
+ | |
+ |------------------+
+ |
+ |
+ +----------------------------------------------------------------------->
+
+The plot above shows a ramped load signal (titled 'original_signal') and it's
+boosted equivalent. For each step of the original signal the boosted signal
+corresponding to a 50% boost is midway from the original signal and the upper
+bound. Boosting by 100% generates a boosted signal which is always saturated to
+the upper bound.
+
+
+4. OPP selection using boosted CPU utilization
+==============================================
+
+It is worth calling out that the implementation does not introduce any new load
+signals. Instead, it provides an API to tune existing signals. This tuning is
+done on demand and only in scheduler code paths where it is sensible to do so.
+The new API calls are defined to return either the default signal or a boosted
+one, depending on the value of sched_cfs_boost. This is a clean an non invasive
+modification of the existing existing code paths.
+
+The signal representing a CPU's utilization is boosted according to the
+previously described SPC boosting strategy. To sched-DVFS, this allows a CPU
+(ie CFS run-queue) to appear more used then it actually is.
+
+Thus, with the sched_cfs_boost enabled we have the following main functions to
+get the current utilization of a CPU:
+
+ cpu_util()
+ boosted_cpu_util()
+
+The new boosted_cpu_util() is similar to the first but returns a boosted
+utilization signal which is a function of the sched_cfs_boost value.
+
+This function is used in the CFS scheduler code paths where sched-DVFS needs to
+decide the OPP to run a CPU at.
+For example, this allows selecting the highest OPP for a CPU which has
+the boost value set to 100%.
+
+
+5. Per task group boosting
+==========================
+
+The availability of a single knob which is used to boost all tasks in the
+system is certainly a simple solution but it quite likely doesn't fit many
+utilization scenarios, especially in the mobile device space.
+
+For example, on battery powered devices there usually are many background
+services which are long running and need energy efficient scheduling. On the
+other hand, some applications are more performance sensitive and require an
+interactive response and/or maximum performance, regardless of the energy cost.
+To better service such scenarios, the SchedTune implementation has an extension
+that provides a more fine grained boosting interface.
+
+A new CGroup controller, namely "schedtune", could be enabled which allows to
+defined and configure task groups with different boosting values.
+Tasks that require special performance can be put into separate CGroups.
+The value of the boost associated with the tasks in this group can be specified
+using a single knob exposed by the CGroup controller:
+
+ schedtune.boost
+
+This knob allows the definition of a boost value that is to be used for
+SPC boosting of all tasks attached to this group.
+
+The current schedtune controller implementation is really simple and has these
+main characteristics:
+
+ 1) It is only possible to create 1 level depth hierarchies
+
+ The root control groups define the system-wide boost value to be applied
+ by default to all tasks. Its direct subgroups are named "boost groups" and
+ they define the boost value for specific set of tasks.
+ Further nested subgroups are not allowed since they do not have a sensible
+ meaning from a user-space standpoint.
+
+ 2) It is possible to define only a limited number of "boost groups"
+
+ This number is defined at compile time and by default configured to 16.
+ This is a design decision motivated by two main reasons:
+ a) In a real system we do not expect utilization scenarios with more then few
+ boost groups. For example, a reasonable collection of groups could be
+ just "background", "interactive" and "performance".
+ b) It simplifies the implementation considerably, especially for the code
+ which has to compute the per CPU boosting once there are multiple
+ RUNNABLE tasks with different boost values.
+
+Such a simple design should allow servicing the main utilization scenarios identified
+so far. It provides a simple interface which can be used to manage the
+power-performance of all tasks or only selected tasks.
+Moreover, this interface can be easily integrated by user-space run-times (e.g.
+Android, ChromeOS) to implement a QoS solution for task boosting based on tasks
+classification, which has been a long standing requirement.
+
+Setup and usage
+---------------
+
+0. Use a kernel with CGROUP_SCHEDTUNE support enabled
+
+1. Check that the "schedtune" CGroup controller is available:
+
+ root@linaro-nano:~# cat /proc/cgroups
+ #subsys_name hierarchy num_cgroups enabled
+ cpuset 0 1 1
+ cpu 0 1 1
+ schedtune 0 1 1
+
+2. Mount a tmpfs to create the CGroups mount point (Optional)
+
+ root@linaro-nano:~# sudo mount -t tmpfs cgroups /sys/fs/cgroup
+
+3. Mount the "schedtune" controller
+
+ root@linaro-nano:~# mkdir /sys/fs/cgroup/stune
+ root@linaro-nano:~# sudo mount -t cgroup -o schedtune stune /sys/fs/cgroup/stune
+
+4. Setup the system-wide boost value (Optional)
+
+ If not configured the root control group has a 0% boost value, which
+ basically disables boosting for all tasks in the system thus running in
+ an energy-efficient mode.
+
+ root@linaro-nano:~# echo $SYSBOOST > /sys/fs/cgroup/stune/schedtune.boost
+
+5. Create task groups and configure their specific boost value (Optional)
+
+ For example here we create a "performance" boost group configure to boost
+ all its tasks to 100%
+
+ root@linaro-nano:~# mkdir /sys/fs/cgroup/stune/performance
+ root@linaro-nano:~# echo 100 > /sys/fs/cgroup/stune/performance/schedtune.boost
+
+6. Move tasks into the boost group
+
+ For example, the following moves the tasks with PID $TASKPID (and all its
+ threads) into the "performance" boost group.
+
+ root@linaro-nano:~# echo "TASKPID > /sys/fs/cgroup/stune/performance/cgroup.procs
+
+This simple configuration allows only the threads of the $TASKPID task to run,
+when needed, at the highest OPP in the most capable CPU of the system.
+
+
+6. Question and Answers
+=======================
+
+What about "auto" mode?
+-----------------------
+
+The 'auto' mode as described in [5] can be implemented by interfacing SchedTune
+with some suitable user-space element. This element could use the exposed
+system-wide or cgroup based interface.
+
+How are multiple groups of tasks with different boost values managed?
+---------------------------------------------------------------------
+
+The current SchedTune implementation keeps track of the boosted RUNNABLE tasks
+on a CPU. Once sched-DVFS selects the OPP to run a CPU at, the CPU utilization
+is boosted with a value which is the maximum of the boost values of the
+currently RUNNABLE tasks in its RQ.
+
+This allows sched-DVFS to boost a CPU only while there are boosted tasks ready
+to run and switch back to the energy efficient mode as soon as the last boosted
+task is dequeued.
+
+
+7. References
+=============
+[1] http://lwn.net/Articles/552889
+[2] http://lkml.org/lkml/2012/5/18/91
+[3] http://lkml.org/lkml/2015/6/26/620
diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt
index 3849814bfe6d..0e03baf271bd 100644
--- a/Documentation/security/keys.txt
+++ b/Documentation/security/keys.txt
@@ -1151,8 +1151,21 @@ access the data:
usage. This is called key->payload.rcu_data0. The following accessors
wrap the RCU calls to this element:
- rcu_assign_keypointer(struct key *key, void *data);
- void *rcu_dereference_key(struct key *key);
+ (a) Set or change the first payload pointer:
+
+ rcu_assign_keypointer(struct key *key, void *data);
+
+ (b) Read the first payload pointer with the key semaphore held:
+
+ [const] void *dereference_key_locked([const] struct key *key);
+
+ Note that the return value will inherit its constness from the key
+ parameter. Static analysis will give an error if it things the lock
+ isn't held.
+
+ (c) Read the first payload pointer with the RCU read lock held:
+
+ const void *dereference_key_rcu(const struct key *key);
===================
diff --git a/Documentation/sync.txt b/Documentation/sync.txt
new file mode 100644
index 000000000000..a2d05e7fa193
--- /dev/null
+++ b/Documentation/sync.txt
@@ -0,0 +1,75 @@
+Motivation:
+
+In complicated DMA pipelines such as graphics (multimedia, camera, gpu, display)
+a consumer of a buffer needs to know when the producer has finished producing
+it. Likewise the producer needs to know when the consumer is finished with the
+buffer so it can reuse it. A particular buffer may be consumed by multiple
+consumers which will retain the buffer for different amounts of time. In
+addition, a consumer may consume multiple buffers atomically.
+The sync framework adds an API which allows synchronization between the
+producers and consumers in a generic way while also allowing platforms which
+have shared hardware synchronization primitives to exploit them.
+
+Goals:
+ * provide a generic API for expressing synchronization dependencies
+ * allow drivers to exploit hardware synchronization between hardware
+ blocks
+ * provide a userspace API that allows a compositor to manage
+ dependencies.
+ * provide rich telemetry data to allow debugging slowdowns and stalls of
+ the graphics pipeline.
+
+Objects:
+ * sync_timeline
+ * sync_pt
+ * sync_fence
+
+sync_timeline:
+
+A sync_timeline is an abstract monotonically increasing counter. In general,
+each driver/hardware block context will have one of these. They can be backed
+by the appropriate hardware or rely on the generic sw_sync implementation.
+Timelines are only ever created through their specific implementations
+(i.e. sw_sync.)
+
+sync_pt:
+
+A sync_pt is an abstract value which marks a point on a sync_timeline. Sync_pts
+have a single timeline parent. They have 3 states: active, signaled, and error.
+They start in active state and transition, once, to either signaled (when the
+timeline counter advances beyond the sync_pt’s value) or error state.
+
+sync_fence:
+
+Sync_fences are the primary primitives used by drivers to coordinate
+synchronization of their buffers. They are a collection of sync_pts which may
+or may not have the same timeline parent. A sync_pt can only exist in one fence
+and the fence's list of sync_pts is immutable once created. Fences can be
+waited on synchronously or asynchronously. Two fences can also be merged to
+create a third fence containing a copy of the two fences’ sync_pts. Fences are
+backed by file descriptors to allow userspace to coordinate the display pipeline
+dependencies.
+
+Use:
+
+A driver implementing sync support should have a work submission function which:
+ * takes a fence argument specifying when to begin work
+ * asynchronously queues that work to kick off when the fence is signaled
+ * returns a fence to indicate when its work will be done.
+ * signals the returned fence once the work is completed.
+
+Consider an imaginary display driver that has the following API:
+/*
+ * assumes buf is ready to be displayed.
+ * blocks until the buffer is on screen.
+ */
+ void display_buffer(struct dma_buf *buf);
+
+The new API will become:
+/*
+ * will display buf when fence is signaled.
+ * returns immediately with a fence that will signal when buf
+ * is no longer displayed.
+ */
+struct sync_fence* display_buffer(struct dma_buf *buf,
+ struct sync_fence *fence);
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index ffab8b5caa60..52daff6d09fb 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -659,12 +659,14 @@ allowed to execute.
perf_event_paranoid:
Controls use of the performance events system by unprivileged
-users (without CAP_SYS_ADMIN). The default value is 2.
+users (without CAP_SYS_ADMIN). The default value is 3 if
+CONFIG_SECURITY_PERF_EVENTS_RESTRICT is set, or 2 otherwise.
-1: Allow use of (almost) all events by all users
>=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
>=1: Disallow CPU event access by users without CAP_SYS_ADMIN
>=2: Disallow kernel profiling by users without CAP_SYS_ADMIN
+>=3: Disallow all event access by users without CAP_SYS_ADMIN
==============================================================
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 95ccbe6d79ce..206c9b071cf2 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -30,6 +30,7 @@ Currently, these files are in /proc/sys/vm:
- dirty_writeback_centisecs
- drop_caches
- extfrag_threshold
+- extra_free_kbytes
- hugepages_treat_as_movable
- hugetlb_shm_group
- laptop_mode
@@ -240,6 +241,21 @@ fragmentation index is <= extfrag_threshold. The default value is 500.
==============================================================
+extra_free_kbytes
+
+This parameter tells the VM to keep extra free memory between the threshold
+where background reclaim (kswapd) kicks in, and the threshold where direct
+reclaim (by allocating processes) kicks in.
+
+This is useful for workloads that require low latency memory allocations
+and have a bounded burstiness in memory allocations, for example a
+realtime application that receives and transmits network traffic
+(causing in-kernel memory allocations) with a maximum total message burst
+size of 200MB may need 200MB of extra free memory to avoid direct reclaim
+related latencies.
+
+==============================================================
+
hugepages_treat_as_movable
This parameter controls whether we can allocate hugepages from ZONE_MOVABLE
diff --git a/Documentation/tee.txt b/Documentation/tee.txt
new file mode 100644
index 000000000000..56ea85ffebf2
--- /dev/null
+++ b/Documentation/tee.txt
@@ -0,0 +1,127 @@
+=============
+TEE subsystem
+=============
+
+This document describes the TEE subsystem in Linux.
+
+A TEE (Trusted Execution Environment) is a trusted OS running in some
+secure environment, for example, TrustZone on ARM CPUs, or a separate
+secure co-processor etc. A TEE driver handles the details needed to
+communicate with the TEE.
+
+This subsystem deals with:
+
+- Registration of TEE drivers
+
+- Managing shared memory between Linux and the TEE
+
+- Providing a generic API to the TEE
+
+The TEE interface
+=================
+
+include/uapi/linux/tee.h defines the generic interface to a TEE.
+
+User space (the client) connects to the driver by opening /dev/tee[0-9]* or
+/dev/teepriv[0-9]*.
+
+- TEE_IOC_SHM_ALLOC allocates shared memory and returns a file descriptor
+ which user space can mmap. When user space doesn't need the file
+ descriptor any more, it should be closed. When shared memory isn't needed
+ any longer it should be unmapped with munmap() to allow the reuse of
+ memory.
+
+- TEE_IOC_VERSION lets user space know which TEE this driver handles and
+ the its capabilities.
+
+- TEE_IOC_OPEN_SESSION opens a new session to a Trusted Application.
+
+- TEE_IOC_INVOKE invokes a function in a Trusted Application.
+
+- TEE_IOC_CANCEL may cancel an ongoing TEE_IOC_OPEN_SESSION or TEE_IOC_INVOKE.
+
+- TEE_IOC_CLOSE_SESSION closes a session to a Trusted Application.
+
+There are two classes of clients, normal clients and supplicants. The latter is
+a helper process for the TEE to access resources in Linux, for example file
+system access. A normal client opens /dev/tee[0-9]* and a supplicant opens
+/dev/teepriv[0-9].
+
+Much of the communication between clients and the TEE is opaque to the
+driver. The main job for the driver is to receive requests from the
+clients, forward them to the TEE and send back the results. In the case of
+supplicants the communication goes in the other direction, the TEE sends
+requests to the supplicant which then sends back the result.
+
+OP-TEE driver
+=============
+
+The OP-TEE driver handles OP-TEE [1] based TEEs. Currently it is only the ARM
+TrustZone based OP-TEE solution that is supported.
+
+Lowest level of communication with OP-TEE builds on ARM SMC Calling
+Convention (SMCCC) [2], which is the foundation for OP-TEE's SMC interface
+[3] used internally by the driver. Stacked on top of that is OP-TEE Message
+Protocol [4].
+
+OP-TEE SMC interface provides the basic functions required by SMCCC and some
+additional functions specific for OP-TEE. The most interesting functions are:
+
+- OPTEE_SMC_FUNCID_CALLS_UID (part of SMCCC) returns the version information
+ which is then returned by TEE_IOC_VERSION
+
+- OPTEE_SMC_CALL_GET_OS_UUID returns the particular OP-TEE implementation, used
+ to tell, for instance, a TrustZone OP-TEE apart from an OP-TEE running on a
+ separate secure co-processor.
+
+- OPTEE_SMC_CALL_WITH_ARG drives the OP-TEE message protocol
+
+- OPTEE_SMC_GET_SHM_CONFIG lets the driver and OP-TEE agree on which memory
+ range to used for shared memory between Linux and OP-TEE.
+
+The GlobalPlatform TEE Client API [5] is implemented on top of the generic
+TEE API.
+
+Picture of the relationship between the different components in the
+OP-TEE architecture::
+
+ User space Kernel Secure world
+ ~~~~~~~~~~ ~~~~~~ ~~~~~~~~~~~~
+ +--------+ +-------------+
+ | Client | | Trusted |
+ +--------+ | Application |
+ /\ +-------------+
+ || +----------+ /\
+ || |tee- | ||
+ || |supplicant| \/
+ || +----------+ +-------------+
+ \/ /\ | TEE Internal|
+ +-------+ || | API |
+ + TEE | || +--------+--------+ +-------------+
+ | Client| || | TEE | OP-TEE | | OP-TEE |
+ | API | \/ | subsys | driver | | Trusted OS |
+ +-------+----------------+----+-------+----+-----------+-------------+
+ | Generic TEE API | | OP-TEE MSG |
+ | IOCTL (TEE_IOC_*) | | SMCCC (OPTEE_SMC_CALL_*) |
+ +-----------------------------+ +------------------------------+
+
+RPC (Remote Procedure Call) are requests from secure world to kernel driver
+or tee-supplicant. An RPC is identified by a special range of SMCCC return
+values from OPTEE_SMC_CALL_WITH_ARG. RPC messages which are intended for the
+kernel are handled by the kernel driver. Other RPC messages will be forwarded to
+tee-supplicant without further involvement of the driver, except switching
+shared memory buffer representation.
+
+References
+==========
+
+[1] https://github.com/OP-TEE/optee_os
+
+[2] http://infocenter.arm.com/help/topic/com.arm.doc.den0028a/index.html
+
+[3] drivers/tee/optee/optee_smc.h
+
+[4] drivers/tee/optee/optee_msg.h
+
+[5] http://www.globalplatform.org/specificationsdevice.asp look for
+ "TEE Client API Specification v1.0" and click download.
diff --git a/Documentation/trace/events-power.txt b/Documentation/trace/events-power.txt
index 21d514ced212..4d817d5acc40 100644
--- a/Documentation/trace/events-power.txt
+++ b/Documentation/trace/events-power.txt
@@ -25,6 +25,7 @@ cpufreq.
cpu_idle "state=%lu cpu_id=%lu"
cpu_frequency "state=%lu cpu_id=%lu"
+cpu_frequency_limits "min=%lu max=%lu cpu_id=%lu"
A suspend event is used to indicate the system going in and out of the
suspend mode:
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 185c39fea2a0..91723ed53470 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -362,6 +362,26 @@ of ftrace. Here is a list of some of the key files:
to correlate events across hypervisor/guest if
tb_offset is known.
+ mono: This uses the fast monotonic clock (CLOCK_MONOTONIC)
+ which is monotonic and is subject to NTP rate adjustments.
+
+ mono_raw:
+ This is the raw monotonic clock (CLOCK_MONOTONIC_RAW)
+ which is montonic but is not subject to any rate adjustments
+ and ticks at the same rate as the hardware clocksource.
+
+ boot: This is the boot clock (CLOCK_BOOTTIME) and is based on the
+ fast monotonic clock, but also accounts for time spent in
+ suspend. Since the clock access is designed for use in
+ tracing in the suspend path, some side effects are possible
+ if clock is accessed after the suspend time is accounted before
+ the fast mono clock is updated. In this case, the clock update
+ appears to happen slightly sooner than it normally would have.
+ Also on 32-bit systems, it's possible that the 64-bit boot offset
+ sees a partial update. These effects are rare and post
+ processing should be able to handle them. See comments in the
+ ktime_get_boot_fast_ns() function for more information.
+
To set a clock, simply echo the clock name into this file.
echo global > trace_clock
@@ -2102,6 +2122,35 @@ will produce:
1) 1.449 us | }
+You can disable the hierarchical function call formatting and instead print a
+flat list of function entry and return events. This uses the format described
+in the Output Formatting section and respects all the trace options that
+control that formatting. Hierarchical formatting is the default.
+
+ hierachical: echo nofuncgraph-flat > trace_options
+ flat: echo funcgraph-flat > trace_options
+
+ ie:
+
+ # tracer: function_graph
+ #
+ # entries-in-buffer/entries-written: 68355/68355 #P:2
+ #
+ # _-----=> irqs-off
+ # / _----=> need-resched
+ # | / _---=> hardirq/softirq
+ # || / _--=> preempt-depth
+ # ||| / delay
+ # TASK-PID CPU# |||| TIMESTAMP FUNCTION
+ # | | | |||| | |
+ sh-1806 [001] d... 198.843443: graph_ent: func=_raw_spin_lock
+ sh-1806 [001] d... 198.843445: graph_ent: func=__raw_spin_lock
+ sh-1806 [001] d..1 198.843447: graph_ret: func=__raw_spin_lock
+ sh-1806 [001] d..1 198.843449: graph_ret: func=_raw_spin_lock
+ sh-1806 [001] d..1 198.843451: graph_ent: func=_raw_spin_unlock_irqrestore
+ sh-1806 [001] d... 198.843453: graph_ret: func=_raw_spin_unlock_irqrestore
+
+
You might find other useful features for this tracer in the
following "dynamic ftrace" section such as tracing only specific
functions or tasks.
diff --git a/MAINTAINERS b/MAINTAINERS
index 4f559f5b3a89..a605955a6eef 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9010,6 +9010,11 @@ F: arch/*/oprofile/
F: drivers/oprofile/
F: include/linux/oprofile.h
+OP-TEE DRIVER
+M: Jens Wiklander <jens.wiklander@linaro.org>
+S: Maintained
+F: drivers/tee/optee/
+
ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
M: Mark Fasheh <mfasheh@versity.com>
M: Joel Becker <jlbec@evilplan.org>
@@ -10655,6 +10660,14 @@ F: drivers/hwtracing/stm/
F: include/linux/stm.h
F: include/uapi/linux/stm.h
+TEE SUBSYSTEM
+M: Jens Wiklander <jens.wiklander@linaro.org>
+S: Maintained
+F: include/linux/tee_drv.h
+F: include/uapi/linux/tee.h
+F: drivers/tee/
+F: Documentation/tee.txt
+
THUNDERBOLT DRIVER
M: Andreas Noever <andreas.noever@gmail.com>
S: Maintained
diff --git a/Makefile b/Makefile
index f1aeb98f9ace..569cf6f96cbf 100644
--- a/Makefile
+++ b/Makefile
@@ -348,6 +348,7 @@ include scripts/Kbuild.include
# Make variables (CC, etc...)
AS = $(CROSS_COMPILE)as
LD = $(CROSS_COMPILE)ld
+LDGOLD = $(CROSS_COMPILE)ld.gold
CC = $(CROSS_COMPILE)gcc
CPP = $(CC) -E
AR = $(CROSS_COMPILE)ar
@@ -511,13 +512,17 @@ endif
ifeq ($(cc-name),clang)
ifneq ($(CROSS_COMPILE),)
-CLANG_TARGET := -target $(notdir $(CROSS_COMPILE:%-=%))
+CLANG_TRIPLE ?= $(CROSS_COMPILE)
+CLANG_TARGET := --target=$(notdir $(CLANG_TRIPLE:%-=%))
+ifeq ($(shell $(srctree)/scripts/clang-android.sh $(CC) $(CLANG_TARGET)), y)
+$(error "Clang with Android --target detected. Did you specify CLANG_TRIPLE?")
+endif
GCC_TOOLCHAIN_DIR := $(dir $(shell which $(LD)))
CLANG_PREFIX := --prefix=$(GCC_TOOLCHAIN_DIR)
GCC_TOOLCHAIN := $(realpath $(GCC_TOOLCHAIN_DIR)/..)
endif
ifneq ($(GCC_TOOLCHAIN),)
-CLANG_GCC_TC := -gcc-toolchain $(GCC_TOOLCHAIN)
+CLANG_GCC_TC := --gcc-toolchain=$(GCC_TOOLCHAIN)
endif
KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC) $(CLANG_PREFIX)
KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC) $(CLANG_PREFIX)
@@ -526,6 +531,7 @@ KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable)
KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier)
KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
+KBUILD_CFLAGS += $(call cc-disable-warning, duplicate-decl-specifier)
# Quiet clang warning: comparison of unsigned expression < 0 is always false
KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare)
# CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as the
@@ -659,6 +665,20 @@ CFLAGS_GCOV := -fprofile-arcs -ftest-coverage -fno-tree-loop-im $(call cc-disabl
CFLAGS_KCOV := $(call cc-option,-fsanitize-coverage=trace-pc,)
export CFLAGS_GCOV CFLAGS_KCOV
+# Make toolchain changes before including arch/$(SRCARCH)/Makefile to ensure
+# ar/cc/ld-* macros return correct values.
+ifdef CONFIG_LTO_CLANG
+# use GNU gold with LLVMgold for LTO linking, and LD for vmlinux_link
+LDFINAL_vmlinux := $(LD)
+LD := $(LDGOLD)
+LDFLAGS += -plugin LLVMgold.so
+# use llvm-ar for building symbol tables from IR files, and llvm-dis instead
+# of objdump for processing symbol versions and exports
+LLVM_AR := llvm-ar
+LLVM_DIS := llvm-dis
+export LLVM_AR LLVM_DIS
+endif
+
# The arch Makefile can set ARCH_{CPP,A,C}FLAGS to override the default
# values of the respective KBUILD_* variables
ARCH_CPPFLAGS :=
@@ -678,6 +698,53 @@ KBUILD_CFLAGS += $(call cc-option,-ffunction-sections,)
KBUILD_CFLAGS += $(call cc-option,-fdata-sections,)
endif
+ifdef CONFIG_LTO_CLANG
+lto-clang-flags := -flto -fvisibility=hidden
+
+# allow disabling only clang LTO where needed
+DISABLE_LTO_CLANG := -fno-lto -fvisibility=default
+export DISABLE_LTO_CLANG
+endif
+
+ifdef CONFIG_LTO
+lto-flags := $(lto-clang-flags)
+KBUILD_CFLAGS += $(lto-flags)
+
+DISABLE_LTO := $(DISABLE_LTO_CLANG)
+export DISABLE_LTO
+
+# LDFINAL_vmlinux and LDFLAGS_FINAL_vmlinux can be set to override
+# the linker and flags for vmlinux_link.
+export LDFINAL_vmlinux LDFLAGS_FINAL_vmlinux
+endif
+
+ifdef CONFIG_CFI_CLANG
+cfi-clang-flags += -fsanitize=cfi
+DISABLE_CFI_CLANG := -fno-sanitize=cfi
+ifdef CONFIG_MODULES
+cfi-clang-flags += -fsanitize-cfi-cross-dso
+DISABLE_CFI_CLANG += -fno-sanitize-cfi-cross-dso
+endif
+ifdef CONFIG_CFI_PERMISSIVE
+cfi-clang-flags += -fsanitize-recover=cfi -fno-sanitize-trap=cfi
+endif
+
+# also disable CFI when LTO is disabled
+DISABLE_LTO_CLANG += $(DISABLE_CFI_CLANG)
+# allow disabling only clang CFI where needed
+export DISABLE_CFI_CLANG
+endif
+
+ifdef CONFIG_CFI
+# cfi-flags are re-tested in prepare-compiler-check
+cfi-flags := $(cfi-clang-flags)
+KBUILD_CFLAGS += $(cfi-flags)
+
+DISABLE_CFI := $(DISABLE_CFI_CLANG)
+DISABLE_LTO += $(DISABLE_CFI)
+export DISABLE_CFI
+endif
+
ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
KBUILD_CFLAGS += $(call cc-option,-Oz,-Os)
KBUILD_CFLAGS += $(call cc-disable-warning,maybe-uninitialized,)
@@ -1098,6 +1165,22 @@ prepare-objtool: $(objtool_target)
# CC_STACKPROTECTOR_STRONG! Why did it build with _REGULAR?!")
PHONY += prepare-compiler-check
prepare-compiler-check: FORCE
+# Make sure we're using a supported toolchain with LTO_CLANG
+ifdef CONFIG_LTO_CLANG
+ ifneq ($(call clang-ifversion, -ge, 0500, y), y)
+ @echo Cannot use CONFIG_LTO_CLANG: requires clang 5.0 or later >&2 && exit 1
+ endif
+ ifneq ($(call gold-ifversion, -ge, 112000000, y), y)
+ @echo Cannot use CONFIG_LTO_CLANG: requires GNU gold 1.12 or later >&2 && exit 1
+ endif
+endif
+# Make sure compiler supports LTO flags
+ifdef lto-flags
+ ifeq ($(call cc-option, $(lto-flags)),)
+ @echo Cannot use CONFIG_LTO: $(lto-flags) not supported by compiler \
+ >&2 && exit 1
+ endif
+endif
# Make sure compiler supports requested stack protector flag.
ifdef stackp-name
ifeq ($(call cc-option, $(stackp-flag)),)
@@ -1112,6 +1195,11 @@ ifdef stackp-check
$(stackp-flag) available but compiler is broken >&2 && exit 1
endif
endif
+ifdef cfi-flags
+ ifeq ($(call cc-option, $(cfi-flags)),)
+ @echo Cannot use CONFIG_CFI: $(cfi-flags) not supported by compiler >&2 && exit 1
+ endif
+endif
@:
# Generate some files
@@ -1584,7 +1672,8 @@ clean: $(clean-dirs)
-o -name modules.builtin -o -name '.tmp_*.o.*' \
-o -name '*.c.[012]*.*' \
-o -name '*.ll' \
- -o -name '*.gcno' \) -type f -print | xargs rm -f
+ -o -name '*.gcno' \
+ -o -name '*.*.symversions' \) -type f -print | xargs rm -f
# Generate tags for editors
# ---------------------------------------------------------------------------
diff --git a/arch/Kconfig b/arch/Kconfig
index b39d0f93c67b..c8e6cdffca85 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -492,6 +492,75 @@ config LD_DEAD_CODE_DATA_ELIMINATION
sections (e.g., '.text.init'). Typically '.' in section names
is used to distinguish them from label names / C identifiers.
+config LTO
+ def_bool n
+
+config ARCH_SUPPORTS_LTO_CLANG
+ bool
+ help
+ An architecture should select this option it supports:
+ - compiling with clang,
+ - compiling inline assembly with clang's integrated assembler,
+ - and linking with either lld or GNU gold w/ LLVMgold.
+
+choice
+ prompt "Link-Time Optimization (LTO) (EXPERIMENTAL)"
+ default LTO_NONE
+ help
+ This option turns on Link-Time Optimization (LTO).
+
+config LTO_NONE
+ bool "None"
+
+config LTO_CLANG
+ bool "Use clang Link Time Optimization (LTO) (EXPERIMENTAL)"
+ depends on ARCH_SUPPORTS_LTO_CLANG
+ depends on !FTRACE_MCOUNT_RECORD || HAVE_C_RECORDMCOUNT
+ depends on !KASAN
+ select LTO
+ select THIN_ARCHIVES
+ select LD_DEAD_CODE_DATA_ELIMINATION
+ help
+ This option enables clang's Link Time Optimization (LTO), which allows
+ the compiler to optimize the kernel globally at link time. If you
+ enable this option, the compiler generates LLVM IR instead of object
+ files, and the actual compilation from IR occurs at the LTO link step,
+ which may take several minutes.
+
+ If you select this option, you must compile the kernel with clang >=
+ 5.0 (make CC=clang) and GNU gold from binutils >= 2.27, and have the
+ LLVMgold plug-in in LD_LIBRARY_PATH.
+
+endchoice
+
+config CFI
+ bool
+
+config CFI_PERMISSIVE
+ bool "Use CFI in permissive mode"
+ depends on CFI
+ help
+ When selected, Control Flow Integrity (CFI) violations result in a
+ warning instead of a kernel panic. This option is useful for finding
+ CFI violations in drivers during development.
+
+config CFI_CLANG
+ bool "Use clang Control Flow Integrity (CFI) (EXPERIMENTAL)"
+ depends on LTO_CLANG
+ depends on KALLSYMS
+ select CFI
+ help
+ This option enables clang Control Flow Integrity (CFI), which adds
+ runtime checking for indirect function calls.
+
+config CFI_CLANG_SHADOW
+ bool "Use CFI shadow to speed up cross-module checks"
+ default y
+ depends on CFI_CLANG
+ help
+ If you select this option, the kernel builds a fast look-up table of
+ CFI check functions in loaded modules to reduce overhead.
+
config HAVE_ARCH_WITHIN_STACK_FRAMES
bool
help
diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 9e46d6e656d9..fa47df6a953a 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -97,4 +97,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index b5d529fdffab..00be82f3929f 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1836,6 +1836,15 @@ config XEN
help
Say Y if you want to run Linux in a Virtual Machine on Xen on ARM.
+config ARM_FLUSH_CONSOLE_ON_RESTART
+ bool "Force flush the console on restart"
+ help
+ If the console is locked while the system is rebooted, the messages
+ in the temporary logbuffer would not have propogated to all the
+ console drivers. This option forces the console lock to be
+ released if it failed to be acquired, which will cause all the
+ pending messages to be flushed.
+
endmenu
menu "Boot options"
@@ -1864,6 +1873,21 @@ config DEPRECATED_PARAM_STRUCT
This was deprecated in 2001 and announced to live on for 5 years.
Some old boot loaders still use this way.
+config BUILD_ARM_APPENDED_DTB_IMAGE
+ bool "Build a concatenated zImage/dtb by default"
+ depends on OF
+ help
+ Enabling this option will cause a concatenated zImage and list of
+ DTBs to be built by default (instead of a standalone zImage.)
+ The image will built in arch/arm/boot/zImage-dtb
+
+config BUILD_ARM_APPENDED_DTB_IMAGE_NAMES
+ string "Default dtb names"
+ depends on BUILD_ARM_APPENDED_DTB_IMAGE
+ help
+ Space separated list of names of dtbs to append when
+ building a concatenated zImage-dtb.
+
# Compressed boot loader in ROM. Yes, we really want to ask about
# TEXT and BSS so we preserve their values in the config files.
config ZBOOT_ROM_TEXT
diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug
index d83f7c369e51..17dcd9416db3 100644
--- a/arch/arm/Kconfig.debug
+++ b/arch/arm/Kconfig.debug
@@ -1723,6 +1723,14 @@ config EARLY_PRINTK
kernel low-level debugging functions. Add earlyprintk to your
kernel parameters to enable this console.
+config EARLY_PRINTK_DIRECT
+ bool "Early printk direct"
+ depends on DEBUG_LL
+ help
+ Say Y here if you want to have an early console using the
+ kernel low-level debugging functions and EARLY_PRINTK is
+ not early enough.
+
config ARM_KPROBES_TEST
tristate "Kprobes test module"
depends on KPROBES && MODULES
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index e14ddca59d02..9ced939c495d 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -298,6 +298,8 @@ libs-y := arch/arm/lib/ $(libs-y)
# Default target when executing plain make
ifeq ($(CONFIG_XIP_KERNEL),y)
KBUILD_IMAGE := xipImage
+else ifeq ($(CONFIG_BUILD_ARM_APPENDED_DTB_IMAGE),y)
+KBUILD_IMAGE := zImage-dtb
else
KBUILD_IMAGE := zImage
endif
@@ -349,6 +351,9 @@ ifeq ($(CONFIG_VDSO),y)
$(Q)$(MAKE) $(build)=arch/arm/vdso $@
endif
+zImage-dtb: vmlinux scripts dtbs
+ $(Q)$(MAKE) $(build)=$(boot) MACHINE=$(MACHINE) $(boot)/$@
+
# We use MRPROPER_FILES and CLEAN_FILES now
archclean:
$(Q)$(MAKE) $(clean)=$(boot)
diff --git a/arch/arm/boot/.gitignore b/arch/arm/boot/.gitignore
index 3c79f85975aa..ad7a0253ea96 100644
--- a/arch/arm/boot/.gitignore
+++ b/arch/arm/boot/.gitignore
@@ -4,3 +4,4 @@ xipImage
bootpImage
uImage
*.dtb
+zImage-dtb \ No newline at end of file
diff --git a/arch/arm/boot/Makefile b/arch/arm/boot/Makefile
index 50f8d1be7fcb..da75630c440d 100644
--- a/arch/arm/boot/Makefile
+++ b/arch/arm/boot/Makefile
@@ -16,6 +16,7 @@ OBJCOPYFLAGS :=-O binary -R .comment -S
ifneq ($(MACHINE),)
include $(MACHINE)/Makefile.boot
endif
+include $(srctree)/arch/arm/boot/dts/Makefile
# Note: the following conditions must always be true:
# ZRELADDR == virt_to_phys(PAGE_OFFSET + TEXT_OFFSET)
@@ -29,6 +30,14 @@ export ZRELADDR INITRD_PHYS PARAMS_PHYS
targets := Image zImage xipImage bootpImage uImage
+DTB_NAMES := $(subst $\",,$(CONFIG_BUILD_ARM_APPENDED_DTB_IMAGE_NAMES))
+ifneq ($(DTB_NAMES),)
+DTB_LIST := $(addsuffix .dtb,$(DTB_NAMES))
+else
+DTB_LIST := $(dtb-y)
+endif
+DTB_OBJS := $(addprefix $(obj)/dts/,$(DTB_LIST))
+
ifeq ($(CONFIG_XIP_KERNEL),y)
$(obj)/xipImage: vmlinux FORCE
@@ -55,6 +64,10 @@ $(obj)/compressed/vmlinux: $(obj)/Image FORCE
$(obj)/zImage: $(obj)/compressed/vmlinux FORCE
$(call if_changed,objcopy)
+$(obj)/zImage-dtb: $(obj)/zImage $(DTB_OBJS) FORCE
+ $(call if_changed,cat)
+ @echo ' Kernel: $@ is ready'
+
endif
ifneq ($(LOADADDR),)
diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index 2d7f2bb0d66a..b114faa4679c 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -784,6 +784,8 @@ __armv7_mmu_cache_on:
bic r6, r6, #1 << 31 @ 32-bit translation system
bic r6, r6, #(7 << 0) | (1 << 4) @ use only ttbr0
mcrne p15, 0, r3, c2, c0, 0 @ load page table pointer
+ mcrne p15, 0, r0, c8, c7, 0 @ flush I,D TLBs
+ mcr p15, 0, r0, c7, c5, 4 @ ISB
mcrne p15, 0, r1, c3, c0, 0 @ load domain access control
mcrne p15, 0, r6, c2, c0, 2 @ load ttb control
#endif
diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile
index 7037201c5e3a..54f95d365bf7 100644
--- a/arch/arm/boot/dts/Makefile
+++ b/arch/arm/boot/dts/Makefile
@@ -960,5 +960,15 @@ endif
dtstree := $(srctree)/$(src)
dtb-$(CONFIG_OF_ALL_DTBS) := $(patsubst $(dtstree)/%.dts,%.dtb, $(wildcard $(dtstree)/*.dts))
-always := $(dtb-y)
+DTB_NAMES := $(subst $\",,$(CONFIG_BUILD_ARM_APPENDED_DTB_IMAGE_NAMES))
+ifneq ($(DTB_NAMES),)
+DTB_LIST := $(addsuffix .dtb,$(DTB_NAMES))
+else
+DTB_LIST := $(dtb-y)
+endif
+
+targets += dtbs dtbs_install
+targets += $(DTB_LIST)
+
+always := $(DTB_LIST)
clean-files := *.dtb
diff --git a/arch/arm/common/Kconfig b/arch/arm/common/Kconfig
index 9353184d730d..ce01364a96e3 100644
--- a/arch/arm/common/Kconfig
+++ b/arch/arm/common/Kconfig
@@ -17,3 +17,7 @@ config SHARP_PARAM
config SHARP_SCOOP
bool
+
+config FIQ_GLUE
+ bool
+ select FIQ
diff --git a/arch/arm/common/Makefile b/arch/arm/common/Makefile
index 27f23b15b1ea..04aca896b338 100644
--- a/arch/arm/common/Makefile
+++ b/arch/arm/common/Makefile
@@ -4,6 +4,7 @@
obj-y += firmware.o
+obj-$(CONFIG_FIQ_GLUE) += fiq_glue.o fiq_glue_setup.o
obj-$(CONFIG_ICST) += icst.o
obj-$(CONFIG_SA1111) += sa1111.o
obj-$(CONFIG_DMABOUNCE) += dmabounce.o
diff --git a/arch/arm/common/fiq_glue.S b/arch/arm/common/fiq_glue.S
new file mode 100644
index 000000000000..24b42cec4813
--- /dev/null
+++ b/arch/arm/common/fiq_glue.S
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+
+ .global fiq_glue_end
+
+ /* fiq stack: r0-r15,cpsr,spsr of interrupted mode */
+
+ENTRY(fiq_glue)
+ /* store pc, cpsr from previous mode, reserve space for spsr */
+ mrs r12, spsr
+ sub lr, lr, #4
+ subs r10, #1
+ bne nested_fiq
+
+ str r12, [sp, #-8]!
+ str lr, [sp, #-4]!
+
+ /* store r8-r14 from previous mode */
+ sub sp, sp, #(7 * 4)
+ stmia sp, {r8-r14}^
+ nop
+
+ /* store r0-r7 from previous mode */
+ stmfd sp!, {r0-r7}
+
+ /* setup func(data,regs) arguments */
+ mov r0, r9
+ mov r1, sp
+ mov r3, r8
+
+ mov r7, sp
+
+ /* Get sp and lr from non-user modes */
+ and r4, r12, #MODE_MASK
+ cmp r4, #USR_MODE
+ beq fiq_from_usr_mode
+
+ mov r7, sp
+ orr r4, r4, #(PSR_I_BIT | PSR_F_BIT)
+ msr cpsr_c, r4
+ str sp, [r7, #(4 * 13)]
+ str lr, [r7, #(4 * 14)]
+ mrs r5, spsr
+ str r5, [r7, #(4 * 17)]
+
+ cmp r4, #(SVC_MODE | PSR_I_BIT | PSR_F_BIT)
+ /* use fiq stack if we reenter this mode */
+ subne sp, r7, #(4 * 3)
+
+fiq_from_usr_mode:
+ msr cpsr_c, #(SVC_MODE | PSR_I_BIT | PSR_F_BIT)
+ mov r2, sp
+ sub sp, r7, #12
+ stmfd sp!, {r2, ip, lr}
+ /* call func(data,regs) */
+ blx r3
+ ldmfd sp, {r2, ip, lr}
+ mov sp, r2
+
+ /* restore/discard saved state */
+ cmp r4, #USR_MODE
+ beq fiq_from_usr_mode_exit
+
+ msr cpsr_c, r4
+ ldr sp, [r7, #(4 * 13)]
+ ldr lr, [r7, #(4 * 14)]
+ msr spsr_cxsf, r5
+
+fiq_from_usr_mode_exit:
+ msr cpsr_c, #(FIQ_MODE | PSR_I_BIT | PSR_F_BIT)
+
+ ldmfd sp!, {r0-r7}
+ ldr lr, [sp, #(4 * 7)]
+ ldr r12, [sp, #(4 * 8)]
+ add sp, sp, #(10 * 4)
+exit_fiq:
+ msr spsr_cxsf, r12
+ add r10, #1
+ cmp r11, #0
+ moveqs pc, lr
+ bx r11 /* jump to custom fiq return function */
+
+nested_fiq:
+ orr r12, r12, #(PSR_F_BIT)
+ b exit_fiq
+
+fiq_glue_end:
+
+ENTRY(fiq_glue_setup) /* func, data, sp, smc call number */
+ stmfd sp!, {r4}
+ mrs r4, cpsr
+ msr cpsr_c, #(FIQ_MODE | PSR_I_BIT | PSR_F_BIT)
+ movs r8, r0
+ mov r9, r1
+ mov sp, r2
+ mov r11, r3
+ moveq r10, #0
+ movne r10, #1
+ msr cpsr_c, r4
+ ldmfd sp!, {r4}
+ bx lr
+
diff --git a/arch/arm/common/fiq_glue_setup.c b/arch/arm/common/fiq_glue_setup.c
new file mode 100644
index 000000000000..8cb1b611c6d5
--- /dev/null
+++ b/arch/arm/common/fiq_glue_setup.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <asm/fiq.h>
+#include <asm/fiq_glue.h>
+
+extern unsigned char fiq_glue, fiq_glue_end;
+extern void fiq_glue_setup(void *func, void *data, void *sp,
+ fiq_return_handler_t fiq_return_handler);
+
+static struct fiq_handler fiq_debbuger_fiq_handler = {
+ .name = "fiq_glue",
+};
+DEFINE_PER_CPU(void *, fiq_stack);
+static struct fiq_glue_handler *current_handler;
+static fiq_return_handler_t fiq_return_handler;
+static DEFINE_MUTEX(fiq_glue_lock);
+
+static void fiq_glue_setup_helper(void *info)
+{
+ struct fiq_glue_handler *handler = info;
+ fiq_glue_setup(handler->fiq, handler,
+ __get_cpu_var(fiq_stack) + THREAD_START_SP,
+ fiq_return_handler);
+}
+
+int fiq_glue_register_handler(struct fiq_glue_handler *handler)
+{
+ int ret;
+ int cpu;
+
+ if (!handler || !handler->fiq)
+ return -EINVAL;
+
+ mutex_lock(&fiq_glue_lock);
+ if (fiq_stack) {
+ ret = -EBUSY;
+ goto err_busy;
+ }
+
+ for_each_possible_cpu(cpu) {
+ void *stack;
+ stack = (void *)__get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
+ if (WARN_ON(!stack)) {
+ ret = -ENOMEM;
+ goto err_alloc_fiq_stack;
+ }
+ per_cpu(fiq_stack, cpu) = stack;
+ }
+
+ ret = claim_fiq(&fiq_debbuger_fiq_handler);
+ if (WARN_ON(ret))
+ goto err_claim_fiq;
+
+ current_handler = handler;
+ on_each_cpu(fiq_glue_setup_helper, handler, true);
+ set_fiq_handler(&fiq_glue, &fiq_glue_end - &fiq_glue);
+
+ mutex_unlock(&fiq_glue_lock);
+ return 0;
+
+err_claim_fiq:
+err_alloc_fiq_stack:
+ for_each_possible_cpu(cpu) {
+ __free_pages(per_cpu(fiq_stack, cpu), THREAD_SIZE_ORDER);
+ per_cpu(fiq_stack, cpu) = NULL;
+ }
+err_busy:
+ mutex_unlock(&fiq_glue_lock);
+ return ret;
+}
+
+static void fiq_glue_update_return_handler(void (*fiq_return)(void))
+{
+ fiq_return_handler = fiq_return;
+ if (current_handler)
+ on_each_cpu(fiq_glue_setup_helper, current_handler, true);
+}
+
+int fiq_glue_set_return_handler(void (*fiq_return)(void))
+{
+ int ret;
+
+ mutex_lock(&fiq_glue_lock);
+ if (fiq_return_handler) {
+ ret = -EBUSY;
+ goto err_busy;
+ }
+ fiq_glue_update_return_handler(fiq_return);
+ ret = 0;
+err_busy:
+ mutex_unlock(&fiq_glue_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(fiq_glue_set_return_handler);
+
+int fiq_glue_clear_return_handler(void (*fiq_return)(void))
+{
+ int ret;
+
+ mutex_lock(&fiq_glue_lock);
+ if (WARN_ON(fiq_return_handler != fiq_return)) {
+ ret = -EINVAL;
+ goto err_inval;
+ }
+ fiq_glue_update_return_handler(NULL);
+ ret = 0;
+err_inval:
+ mutex_unlock(&fiq_glue_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(fiq_glue_clear_return_handler);
+
+/**
+ * fiq_glue_resume - Restore fiqs after suspend or low power idle states
+ *
+ * This must be called before calling local_fiq_enable after returning from a
+ * power state where the fiq mode registers were lost. If a driver provided
+ * a resume hook when it registered the handler it will be called.
+ */
+
+void fiq_glue_resume(void)
+{
+ if (!current_handler)
+ return;
+ fiq_glue_setup(current_handler->fiq, current_handler,
+ __get_cpu_var(fiq_stack) + THREAD_START_SP,
+ fiq_return_handler);
+ if (current_handler->resume)
+ current_handler->resume(current_handler);
+}
+
diff --git a/arch/arm/configs/ranchu_defconfig b/arch/arm/configs/ranchu_defconfig
new file mode 100644
index 000000000000..49e7bbd5825a
--- /dev/null
+++ b/arch/arm/configs/ranchu_defconfig
@@ -0,0 +1,316 @@
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_LOG_BUF_SHIFT=14
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CPUSETS=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_EMBEDDED=y
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=y
+CONFIG_ARCH_MMAP_RND_BITS=16
+# CONFIG_BLK_DEV_BSG is not set
+# CONFIG_IOSCHED_DEADLINE is not set
+# CONFIG_IOSCHED_CFQ is not set
+CONFIG_ARCH_VIRT=y
+CONFIG_ARM_KERNMEM_PERMS=y
+CONFIG_SMP=y
+CONFIG_PREEMPT=y
+CONFIG_AEABI=y
+CONFIG_HIGHMEM=y
+CONFIG_KSM=y
+CONFIG_SECCOMP=y
+CONFIG_CMDLINE="console=ttyAMA0"
+CONFIG_VFP=y
+CONFIG_NEON=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+# CONFIG_PM_WAKELOCKS_GC is not set
+CONFIG_PM_DEBUG=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=y
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_INET_ESP=y
+# CONFIG_INET_LRO is not set
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_NETFILTER=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_QTAGUID=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA2=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_BRIDGE=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_CLS_ACT=y
+# CONFIG_WIRELESS is not set
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_MTD=y
+CONFIG_MTD_CMDLINE_PARTS=y
+CONFIG_MTD_BLOCK=y
+CONFIG_MTD_CFI=y
+CONFIG_MTD_CFI_INTELEXT=y
+CONFIG_MTD_CFI_AMDSTD=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_VIRTIO_BLK=y
+CONFIG_MD=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_NETDEVICES=y
+CONFIG_TUN=y
+CONFIG_VIRTIO_NET=y
+CONFIG_SMSC911X=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PPPOLAC=y
+CONFIG_PPPOPNS=y
+CONFIG_USB_USBNET=y
+# CONFIG_WLAN is not set
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_KEYRESET=y
+CONFIG_KEYBOARD_GOLDFISH_EVENTS=y
+# CONFIG_INPUT_MOUSE is not set
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_INPUT_TABLET=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_KEYCHORD=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_INPUT_GPIO=y
+# CONFIG_SERIO_SERPORT is not set
+CONFIG_SERIO_AMBAKMI=y
+# CONFIG_VT is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_DEVKMEM is not set
+CONFIG_SERIAL_AMBA_PL011=y
+CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
+CONFIG_VIRTIO_CONSOLE=y
+# CONFIG_HW_RANDOM is not set
+# CONFIG_HWMON is not set
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_FB=y
+CONFIG_FB_GOLDFISH=y
+CONFIG_FB_SIMPLE=y
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_LOGO=y
+# CONFIG_LOGO_LINUX_MONO is not set
+# CONFIG_LOGO_LINUX_VGA16 is not set
+CONFIG_SOUND=y
+CONFIG_SND=y
+CONFIG_HIDRAW=y
+CONFIG_UHID=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_LOGITECH_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGIG940_FF=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_GREENASIA=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_OTG_WAKELOCK=y
+CONFIG_RTC_CLASS=y
+CONFIG_RTC_DRV_PL031=y
+CONFIG_VIRTIO_MMIO=y
+CONFIG_STAGING=y
+CONFIG_ASHMEM=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_SYNC=y
+CONFIG_SW_SYNC=y
+CONFIG_SW_SYNC_USER=y
+CONFIG_ION=y
+CONFIG_GOLDFISH_AUDIO=y
+CONFIG_GOLDFISH=y
+CONFIG_GOLDFISH_PIPE=y
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_QUOTA=y
+CONFIG_FUSE_FS=y
+CONFIG_CUSE=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+CONFIG_NFS_FS=y
+CONFIG_ROOT_NFS=y
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_DEBUG_INFO=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DETECT_HUNG_TASK=y
+CONFIG_PANIC_TIMEOUT=5
+# CONFIG_SCHED_DEBUG is not set
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+CONFIG_ENABLE_DEFAULT_TRACERS=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_VIRTUALIZATION=y
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 27ed1b1cd1d7..27e2309978b0 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -62,34 +62,25 @@ config CRYPTO_SHA512_ARM
using optimized ARM assembler and NEON, when available.
config CRYPTO_AES_ARM
- tristate "AES cipher algorithms (ARM-asm)"
- depends on ARM
+ tristate "Scalar AES cipher for ARM"
select CRYPTO_ALGAPI
select CRYPTO_AES
help
Use optimized AES assembler routines for ARM platforms.
- AES cipher algorithms (FIPS-197). AES uses the Rijndael
- algorithm.
-
- Rijndael appears to be consistently a very good performer in
- both hardware and software across a wide range of computing
- environments regardless of its use in feedback or non-feedback
- modes. Its key setup time is excellent, and its key agility is
- good. Rijndael's very low memory requirements make it very well
- suited for restricted-space environments, in which it also
- demonstrates excellent performance. Rijndael's operations are
- among the easiest to defend against power and timing attacks.
+ On ARM processors without the Crypto Extensions, this is the
+ fastest AES implementation for single blocks. For multiple
+ blocks, the NEON bit-sliced implementation is usually faster.
- The AES specifies three key sizes: 128, 192 and 256 bits
-
- See <http://csrc.nist.gov/encryption/aes/> for more information.
+ This implementation may be vulnerable to cache timing attacks,
+ since it uses lookup tables. However, as countermeasures it
+ disables IRQs and preloads the tables; it is hoped this makes
+ such attacks very difficult.
config CRYPTO_AES_ARM_BS
tristate "Bit sliced AES using NEON instructions"
depends on KERNEL_MODE_NEON
select CRYPTO_ALGAPI
- select CRYPTO_AES_ARM
select CRYPTO_ABLK_HELPER
help
Use a faster and more secure NEON based implementation of AES in CBC,
@@ -120,4 +111,15 @@ config CRYPTO_GHASH_ARM_CE
that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
that is part of the ARMv8 Crypto Extensions
+config CRYPTO_CHACHA20_NEON
+ tristate "NEON accelerated ChaCha stream cipher algorithms"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_BLKCIPHER
+ select CRYPTO_CHACHA20
+
+config CRYPTO_NHPOLY1305_NEON
+ tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_NHPOLY1305
+
endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index fc5150702b64..ae2480b9561a 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -8,6 +8,8 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -24,8 +26,8 @@ $(warning $(ce-obj-y) $(ce-obj-m))
endif
endif
-aes-arm-y := aes-armv4.o aes_glue.o
-aes-arm-bs-y := aesbs-core.o aesbs-glue.o
+aes-arm-y := aes-cipher-core.o aes-cipher-glue.o
+aes-arm-bs-y := aes-armv4.o aesbs-core.o aesbs-glue.o
sha1-arm-y := sha1-armv4-large.o sha1_glue.o
sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o
sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
@@ -36,6 +38,8 @@ sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o
aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
+chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
+nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
quiet_cmd_perl = PERL $@
cmd_perl = $(PERL) $(<) > $(@)
diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S
new file mode 100644
index 000000000000..f2d67c095e59
--- /dev/null
+++ b/arch/arm/crypto/aes-cipher-core.S
@@ -0,0 +1,264 @@
+/*
+ * Scalar AES core transform
+ *
+ * Copyright (C) 2017 Linaro Ltd.
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/cache.h>
+
+ .text
+ .align 5
+
+ rk .req r0
+ rounds .req r1
+ in .req r2
+ out .req r3
+ ttab .req ip
+
+ t0 .req lr
+ t1 .req r2
+ t2 .req r3
+
+ .macro __select, out, in, idx
+ .if __LINUX_ARM_ARCH__ < 7
+ and \out, \in, #0xff << (8 * \idx)
+ .else
+ ubfx \out, \in, #(8 * \idx), #8
+ .endif
+ .endm
+
+ .macro __load, out, in, idx, sz, op
+ .if __LINUX_ARM_ARCH__ < 7 && \idx > 0
+ ldr\op \out, [ttab, \in, lsr #(8 * \idx) - \sz]
+ .else
+ ldr\op \out, [ttab, \in, lsl #\sz]
+ .endif
+ .endm
+
+ .macro __hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op, oldcpsr
+ __select \out0, \in0, 0
+ __select t0, \in1, 1
+ __load \out0, \out0, 0, \sz, \op
+ __load t0, t0, 1, \sz, \op
+
+ .if \enc
+ __select \out1, \in1, 0
+ __select t1, \in2, 1
+ .else
+ __select \out1, \in3, 0
+ __select t1, \in0, 1
+ .endif
+ __load \out1, \out1, 0, \sz, \op
+ __select t2, \in2, 2
+ __load t1, t1, 1, \sz, \op
+ __load t2, t2, 2, \sz, \op
+
+ eor \out0, \out0, t0, ror #24
+
+ __select t0, \in3, 3
+ .if \enc
+ __select \t3, \in3, 2
+ __select \t4, \in0, 3
+ .else
+ __select \t3, \in1, 2
+ __select \t4, \in2, 3
+ .endif
+ __load \t3, \t3, 2, \sz, \op
+ __load t0, t0, 3, \sz, \op
+ __load \t4, \t4, 3, \sz, \op
+
+ .ifnb \oldcpsr
+ /*
+ * This is the final round and we're done with all data-dependent table
+ * lookups, so we can safely re-enable interrupts.
+ */
+ restore_irqs \oldcpsr
+ .endif
+
+ eor \out1, \out1, t1, ror #24
+ eor \out0, \out0, t2, ror #16
+ ldm rk!, {t1, t2}
+ eor \out1, \out1, \t3, ror #16
+ eor \out0, \out0, t0, ror #8
+ eor \out1, \out1, \t4, ror #8
+ eor \out0, \out0, t1
+ eor \out1, \out1, t2
+ .endm
+
+ .macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr
+ __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op
+ __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op, \oldcpsr
+ .endm
+
+ .macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr
+ __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op
+ __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op, \oldcpsr
+ .endm
+
+ .macro __rev, out, in
+ .if __LINUX_ARM_ARCH__ < 6
+ lsl t0, \in, #24
+ and t1, \in, #0xff00
+ and t2, \in, #0xff0000
+ orr \out, t0, \in, lsr #24
+ orr \out, \out, t1, lsl #8
+ orr \out, \out, t2, lsr #8
+ .else
+ rev \out, \in
+ .endif
+ .endm
+
+ .macro __adrl, out, sym, c
+ .if __LINUX_ARM_ARCH__ < 7
+ ldr\c \out, =\sym
+ .else
+ movw\c \out, #:lower16:\sym
+ movt\c \out, #:upper16:\sym
+ .endif
+ .endm
+
+ .macro do_crypt, round, ttab, ltab, bsz
+ push {r3-r11, lr}
+
+ // Load keys first, to reduce latency in case they're not cached yet.
+ ldm rk!, {r8-r11}
+
+ ldr r4, [in]
+ ldr r5, [in, #4]
+ ldr r6, [in, #8]
+ ldr r7, [in, #12]
+
+#ifdef CONFIG_CPU_BIG_ENDIAN
+ __rev r4, r4
+ __rev r5, r5
+ __rev r6, r6
+ __rev r7, r7
+#endif
+
+ eor r4, r4, r8
+ eor r5, r5, r9
+ eor r6, r6, r10
+ eor r7, r7, r11
+
+ __adrl ttab, \ttab
+ /*
+ * Disable interrupts and prefetch the 1024-byte 'ft' or 'it' table into
+ * L1 cache, assuming cacheline size >= 32. This is a hardening measure
+ * intended to make cache-timing attacks more difficult. They may not
+ * be fully prevented, however; see the paper
+ * https://cr.yp.to/antiforgery/cachetiming-20050414.pdf
+ * ("Cache-timing attacks on AES") for a discussion of the many
+ * difficulties involved in writing truly constant-time AES software.
+ */
+ save_and_disable_irqs t0
+ .set i, 0
+ .rept 1024 / 128
+ ldr r8, [ttab, #i + 0]
+ ldr r9, [ttab, #i + 32]
+ ldr r10, [ttab, #i + 64]
+ ldr r11, [ttab, #i + 96]
+ .set i, i + 128
+ .endr
+ push {t0} // oldcpsr
+
+ tst rounds, #2
+ bne 1f
+
+0: \round r8, r9, r10, r11, r4, r5, r6, r7
+ \round r4, r5, r6, r7, r8, r9, r10, r11
+
+1: subs rounds, rounds, #4
+ \round r8, r9, r10, r11, r4, r5, r6, r7
+ bls 2f
+ \round r4, r5, r6, r7, r8, r9, r10, r11
+ b 0b
+
+2: .ifb \ltab
+ add ttab, ttab, #1
+ .else
+ __adrl ttab, \ltab
+ // Prefetch inverse S-box for final round; see explanation above
+ .set i, 0
+ .rept 256 / 64
+ ldr t0, [ttab, #i + 0]
+ ldr t1, [ttab, #i + 32]
+ .set i, i + 64
+ .endr
+ .endif
+
+ pop {rounds} // oldcpsr
+ \round r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b, rounds
+
+#ifdef CONFIG_CPU_BIG_ENDIAN
+ __rev r4, r4
+ __rev r5, r5
+ __rev r6, r6
+ __rev r7, r7
+#endif
+
+ ldr out, [sp]
+
+ str r4, [out]
+ str r5, [out, #4]
+ str r6, [out, #8]
+ str r7, [out, #12]
+
+ pop {r3-r11, pc}
+
+ .align 3
+ .ltorg
+ .endm
+
+ENTRY(__aes_arm_encrypt)
+ do_crypt fround, crypto_ft_tab,, 2
+ENDPROC(__aes_arm_encrypt)
+
+ .align 5
+ENTRY(__aes_arm_decrypt)
+ do_crypt iround, crypto_it_tab, __aes_arm_inverse_sbox, 0
+ENDPROC(__aes_arm_decrypt)
+
+ .section ".rodata", "a"
+ .align L1_CACHE_SHIFT
+ .type __aes_arm_inverse_sbox, %object
+__aes_arm_inverse_sbox:
+ .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+ .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+ .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+ .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+ .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+ .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+ .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+ .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+ .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+ .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+ .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+ .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+ .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+ .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+ .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+ .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+ .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+ .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+ .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+ .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+ .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+ .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+ .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+ .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+ .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+ .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+ .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+ .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+ .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+ .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+ .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+ .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+ .size __aes_arm_inverse_sbox, . - __aes_arm_inverse_sbox
diff --git a/arch/arm/crypto/aes-cipher-glue.c b/arch/arm/crypto/aes-cipher-glue.c
new file mode 100644
index 000000000000..c222f6e072ad
--- /dev/null
+++ b/arch/arm/crypto/aes-cipher-glue.c
@@ -0,0 +1,74 @@
+/*
+ * Scalar AES core transform
+ *
+ * Copyright (C) 2017 Linaro Ltd.
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <crypto/aes.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+asmlinkage void __aes_arm_encrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
+EXPORT_SYMBOL(__aes_arm_encrypt);
+
+asmlinkage void __aes_arm_decrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
+EXPORT_SYMBOL(__aes_arm_decrypt);
+
+static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+ struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+ int rounds = 6 + ctx->key_length / 4;
+
+ __aes_arm_encrypt(ctx->key_enc, rounds, in, out);
+}
+
+static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+ struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+ int rounds = 6 + ctx->key_length / 4;
+
+ __aes_arm_decrypt(ctx->key_dec, rounds, in, out);
+}
+
+static struct crypto_alg aes_alg = {
+ .cra_name = "aes",
+ .cra_driver_name = "aes-arm",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct crypto_aes_ctx),
+ .cra_module = THIS_MODULE,
+
+ .cra_cipher.cia_min_keysize = AES_MIN_KEY_SIZE,
+ .cra_cipher.cia_max_keysize = AES_MAX_KEY_SIZE,
+ .cra_cipher.cia_setkey = crypto_aes_set_key,
+ .cra_cipher.cia_encrypt = aes_encrypt,
+ .cra_cipher.cia_decrypt = aes_decrypt,
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ .cra_alignmask = 3,
+#endif
+};
+
+static int __init aes_init(void)
+{
+ return crypto_register_alg(&aes_alg);
+}
+
+static void __exit aes_fini(void)
+{
+ crypto_unregister_alg(&aes_alg);
+}
+
+module_init(aes_init);
+module_exit(aes_fini);
+
+MODULE_DESCRIPTION("Scalar AES cipher for ARM");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("aes");
diff --git a/arch/arm/crypto/aes_glue.c b/arch/arm/crypto/aes_glue.c
deleted file mode 100644
index 0409b8f89782..000000000000
--- a/arch/arm/crypto/aes_glue.c
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Glue Code for the asm optimized version of the AES Cipher Algorithm
- */
-
-#include <linux/module.h>
-#include <linux/crypto.h>
-#include <crypto/aes.h>
-
-#include "aes_glue.h"
-
-EXPORT_SYMBOL(AES_encrypt);
-EXPORT_SYMBOL(AES_decrypt);
-EXPORT_SYMBOL(private_AES_set_encrypt_key);
-EXPORT_SYMBOL(private_AES_set_decrypt_key);
-
-static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
- struct AES_CTX *ctx = crypto_tfm_ctx(tfm);
- AES_encrypt(src, dst, &ctx->enc_key);
-}
-
-static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
- struct AES_CTX *ctx = crypto_tfm_ctx(tfm);
- AES_decrypt(src, dst, &ctx->dec_key);
-}
-
-static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
- unsigned int key_len)
-{
- struct AES_CTX *ctx = crypto_tfm_ctx(tfm);
-
- switch (key_len) {
- case AES_KEYSIZE_128:
- key_len = 128;
- break;
- case AES_KEYSIZE_192:
- key_len = 192;
- break;
- case AES_KEYSIZE_256:
- key_len = 256;
- break;
- default:
- tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
- }
-
- if (private_AES_set_encrypt_key(in_key, key_len, &ctx->enc_key) == -1) {
- tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
- }
- /* private_AES_set_decrypt_key expects an encryption key as input */
- ctx->dec_key = ctx->enc_key;
- if (private_AES_set_decrypt_key(in_key, key_len, &ctx->dec_key) == -1) {
- tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
- }
- return 0;
-}
-
-static struct crypto_alg aes_alg = {
- .cra_name = "aes",
- .cra_driver_name = "aes-asm",
- .cra_priority = 200,
- .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct AES_CTX),
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT(aes_alg.cra_list),
- .cra_u = {
- .cipher = {
- .cia_min_keysize = AES_MIN_KEY_SIZE,
- .cia_max_keysize = AES_MAX_KEY_SIZE,
- .cia_setkey = aes_set_key,
- .cia_encrypt = aes_encrypt,
- .cia_decrypt = aes_decrypt
- }
- }
-};
-
-static int __init aes_init(void)
-{
- return crypto_register_alg(&aes_alg);
-}
-
-static void __exit aes_fini(void)
-{
- crypto_unregister_alg(&aes_alg);
-}
-
-module_init(aes_init);
-module_exit(aes_fini);
-
-MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm (ASM)");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_CRYPTO("aes");
-MODULE_ALIAS_CRYPTO("aes-asm");
-MODULE_AUTHOR("David McCullough <ucdevel@gmail.com>");
diff --git a/arch/arm/crypto/chacha-neon-core.S b/arch/arm/crypto/chacha-neon-core.S
new file mode 100644
index 000000000000..eb22926d4912
--- /dev/null
+++ b/arch/arm/crypto/chacha-neon-core.S
@@ -0,0 +1,560 @@
+/*
+ * ChaCha/XChaCha NEON helper functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+ /*
+ * NEON doesn't have a rotate instruction. The alternatives are, more or less:
+ *
+ * (a) vshl.u32 + vsri.u32 (needs temporary register)
+ * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register)
+ * (c) vrev32.16 (16-bit rotations only)
+ * (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only,
+ * needs index vector)
+ *
+ * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations,
+ * the only choices are (a) and (b). We use (a) since it takes two-thirds the
+ * cycles of (b) on both Cortex-A7 and Cortex-A53.
+ *
+ * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
+ * and doesn't need a temporary register.
+ *
+ * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence
+ * is twice as fast as (a), even when doing (a) on multiple registers
+ * simultaneously to eliminate the stall between vshl and vsri. Also, it
+ * parallelizes better when temporary registers are scarce.
+ *
+ * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
+ * (a), so the need to load the rotation table actually makes the vtbl method
+ * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it
+ * seems to be a good compromise to get a more significant speed boost on some
+ * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
+ */
+
+#include <linux/linkage.h>
+
+ .text
+ .fpu neon
+ .align 5
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers q0-q3. It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * The round count is given in r3.
+ *
+ * Clobbers: r3, ip, q4-q5
+ */
+chacha_permute:
+
+ adr ip, .Lrol8_table
+ vld1.8 {d10}, [ip, :64]
+
+.Ldoubleround:
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vadd.i32 q0, q0, q1
+ veor q3, q3, q0
+ vrev32.16 q3, q3
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #12
+ vsri.u32 q1, q4, #20
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vadd.i32 q0, q0, q1
+ veor q3, q3, q0
+ vtbl.8 d6, {d6}, d10
+ vtbl.8 d7, {d7}, d10
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #7
+ vsri.u32 q1, q4, #25
+
+ // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vext.8 q1, q1, q1, #4
+ // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vext.8 q2, q2, q2, #8
+ // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vext.8 q3, q3, q3, #12
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vadd.i32 q0, q0, q1
+ veor q3, q3, q0
+ vrev32.16 q3, q3
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #12
+ vsri.u32 q1, q4, #20
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vadd.i32 q0, q0, q1
+ veor q3, q3, q0
+ vtbl.8 d6, {d6}, d10
+ vtbl.8 d7, {d7}, d10
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #7
+ vsri.u32 q1, q4, #25
+
+ // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vext.8 q1, q1, q1, #12
+ // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vext.8 q2, q2, q2, #8
+ // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vext.8 q3, q3, q3, #4
+
+ subs r3, r3, #2
+ bne .Ldoubleround
+
+ bx lr
+ENDPROC(chacha_permute)
+
+ENTRY(chacha_block_xor_neon)
+ // r0: Input state matrix, s
+ // r1: 1 data block output, o
+ // r2: 1 data block input, i
+ // r3: nrounds
+ push {lr}
+
+ // x0..3 = s0..3
+ add ip, r0, #0x20
+ vld1.32 {q0-q1}, [r0]
+ vld1.32 {q2-q3}, [ip]
+
+ vmov q8, q0
+ vmov q9, q1
+ vmov q10, q2
+ vmov q11, q3
+
+ bl chacha_permute
+
+ add ip, r2, #0x20
+ vld1.8 {q4-q5}, [r2]
+ vld1.8 {q6-q7}, [ip]
+
+ // o0 = i0 ^ (x0 + s0)
+ vadd.i32 q0, q0, q8
+ veor q0, q0, q4
+
+ // o1 = i1 ^ (x1 + s1)
+ vadd.i32 q1, q1, q9
+ veor q1, q1, q5
+
+ // o2 = i2 ^ (x2 + s2)
+ vadd.i32 q2, q2, q10
+ veor q2, q2, q6
+
+ // o3 = i3 ^ (x3 + s3)
+ vadd.i32 q3, q3, q11
+ veor q3, q3, q7
+
+ add ip, r1, #0x20
+ vst1.8 {q0-q1}, [r1]
+ vst1.8 {q2-q3}, [ip]
+
+ pop {pc}
+ENDPROC(chacha_block_xor_neon)
+
+ENTRY(hchacha_block_neon)
+ // r0: Input state matrix, s
+ // r1: output (8 32-bit words)
+ // r2: nrounds
+ push {lr}
+
+ vld1.32 {q0-q1}, [r0]!
+ vld1.32 {q2-q3}, [r0]
+
+ mov r3, r2
+ bl chacha_permute
+
+ vst1.32 {q0}, [r1]!
+ vst1.32 {q3}, [r1]
+
+ pop {pc}
+ENDPROC(hchacha_block_neon)
+
+ .align 4
+.Lctrinc: .word 0, 1, 2, 3
+.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6
+
+ .align 5
+ENTRY(chacha_4block_xor_neon)
+ push {r4-r5}
+ mov r4, sp // preserve the stack pointer
+ sub ip, sp, #0x20 // allocate a 32 byte buffer
+ bic ip, ip, #0x1f // aligned to 32 bytes
+ mov sp, ip
+
+ // r0: Input state matrix, s
+ // r1: 4 data blocks output, o
+ // r2: 4 data blocks input, i
+ // r3: nrounds
+
+ //
+ // This function encrypts four consecutive ChaCha blocks by loading
+ // the state matrix in NEON registers four times. The algorithm performs
+ // each operation on the corresponding word of each state matrix, hence
+ // requires no word shuffling. The words are re-interleaved before the
+ // final addition of the original state and the XORing step.
+ //
+
+ // x0..15[0-3] = s0..15[0-3]
+ add ip, r0, #0x20
+ vld1.32 {q0-q1}, [r0]
+ vld1.32 {q2-q3}, [ip]
+
+ adr r5, .Lctrinc
+ vdup.32 q15, d7[1]
+ vdup.32 q14, d7[0]
+ vld1.32 {q4}, [r5, :128]
+ vdup.32 q13, d6[1]
+ vdup.32 q12, d6[0]
+ vdup.32 q11, d5[1]
+ vdup.32 q10, d5[0]
+ vadd.u32 q12, q12, q4 // x12 += counter values 0-3
+ vdup.32 q9, d4[1]
+ vdup.32 q8, d4[0]
+ vdup.32 q7, d3[1]
+ vdup.32 q6, d3[0]
+ vdup.32 q5, d2[1]
+ vdup.32 q4, d2[0]
+ vdup.32 q3, d1[1]
+ vdup.32 q2, d1[0]
+ vdup.32 q1, d0[1]
+ vdup.32 q0, d0[0]
+
+ adr ip, .Lrol8_table
+ b 1f
+
+.Ldoubleround4:
+ vld1.32 {q8-q9}, [sp, :256]
+1:
+ // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ vadd.i32 q0, q0, q4
+ vadd.i32 q1, q1, q5
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+
+ veor q12, q12, q0
+ veor q13, q13, q1
+ veor q14, q14, q2
+ veor q15, q15, q3
+
+ vrev32.16 q12, q12
+ vrev32.16 q13, q13
+ vrev32.16 q14, q14
+ vrev32.16 q15, q15
+
+ // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vadd.i32 q10, q10, q14
+ vadd.i32 q11, q11, q15
+
+ vst1.32 {q8-q9}, [sp, :256]
+
+ veor q8, q4, q8
+ veor q9, q5, q9
+ vshl.u32 q4, q8, #12
+ vshl.u32 q5, q9, #12
+ vsri.u32 q4, q8, #20
+ vsri.u32 q5, q9, #20
+
+ veor q8, q6, q10
+ veor q9, q7, q11
+ vshl.u32 q6, q8, #12
+ vshl.u32 q7, q9, #12
+ vsri.u32 q6, q8, #20
+ vsri.u32 q7, q9, #20
+
+ // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ vld1.8 {d16}, [ip, :64]
+ vadd.i32 q0, q0, q4
+ vadd.i32 q1, q1, q5
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+
+ veor q12, q12, q0
+ veor q13, q13, q1
+ veor q14, q14, q2
+ veor q15, q15, q3
+
+ vtbl.8 d24, {d24}, d16
+ vtbl.8 d25, {d25}, d16
+ vtbl.8 d26, {d26}, d16
+ vtbl.8 d27, {d27}, d16
+ vtbl.8 d28, {d28}, d16
+ vtbl.8 d29, {d29}, d16
+ vtbl.8 d30, {d30}, d16
+ vtbl.8 d31, {d31}, d16
+
+ vld1.32 {q8-q9}, [sp, :256]
+
+ // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vadd.i32 q10, q10, q14
+ vadd.i32 q11, q11, q15
+
+ vst1.32 {q8-q9}, [sp, :256]
+
+ veor q8, q4, q8
+ veor q9, q5, q9
+ vshl.u32 q4, q8, #7
+ vshl.u32 q5, q9, #7
+ vsri.u32 q4, q8, #25
+ vsri.u32 q5, q9, #25
+
+ veor q8, q6, q10
+ veor q9, q7, q11
+ vshl.u32 q6, q8, #7
+ vshl.u32 q7, q9, #7
+ vsri.u32 q6, q8, #25
+ vsri.u32 q7, q9, #25
+
+ vld1.32 {q8-q9}, [sp, :256]
+
+ // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ vadd.i32 q0, q0, q5
+ vadd.i32 q1, q1, q6
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q4
+
+ veor q15, q15, q0
+ veor q12, q12, q1
+ veor q13, q13, q2
+ veor q14, q14, q3
+
+ vrev32.16 q15, q15
+ vrev32.16 q12, q12
+ vrev32.16 q13, q13
+ vrev32.16 q14, q14
+
+ // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ vadd.i32 q10, q10, q15
+ vadd.i32 q11, q11, q12
+ vadd.i32 q8, q8, q13
+ vadd.i32 q9, q9, q14
+
+ vst1.32 {q8-q9}, [sp, :256]
+
+ veor q8, q7, q8
+ veor q9, q4, q9
+ vshl.u32 q7, q8, #12
+ vshl.u32 q4, q9, #12
+ vsri.u32 q7, q8, #20
+ vsri.u32 q4, q9, #20
+
+ veor q8, q5, q10
+ veor q9, q6, q11
+ vshl.u32 q5, q8, #12
+ vshl.u32 q6, q9, #12
+ vsri.u32 q5, q8, #20
+ vsri.u32 q6, q9, #20
+
+ // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ vld1.8 {d16}, [ip, :64]
+ vadd.i32 q0, q0, q5
+ vadd.i32 q1, q1, q6
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q4
+
+ veor q15, q15, q0
+ veor q12, q12, q1
+ veor q13, q13, q2
+ veor q14, q14, q3
+
+ vtbl.8 d30, {d30}, d16
+ vtbl.8 d31, {d31}, d16
+ vtbl.8 d24, {d24}, d16
+ vtbl.8 d25, {d25}, d16
+ vtbl.8 d26, {d26}, d16
+ vtbl.8 d27, {d27}, d16
+ vtbl.8 d28, {d28}, d16
+ vtbl.8 d29, {d29}, d16
+
+ vld1.32 {q8-q9}, [sp, :256]
+
+ // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ vadd.i32 q10, q10, q15
+ vadd.i32 q11, q11, q12
+ vadd.i32 q8, q8, q13
+ vadd.i32 q9, q9, q14
+
+ vst1.32 {q8-q9}, [sp, :256]
+
+ veor q8, q7, q8
+ veor q9, q4, q9
+ vshl.u32 q7, q8, #7
+ vshl.u32 q4, q9, #7
+ vsri.u32 q7, q8, #25
+ vsri.u32 q4, q9, #25
+
+ veor q8, q5, q10
+ veor q9, q6, q11
+ vshl.u32 q5, q8, #7
+ vshl.u32 q6, q9, #7
+ vsri.u32 q5, q8, #25
+ vsri.u32 q6, q9, #25
+
+ subs r3, r3, #2
+ bne .Ldoubleround4
+
+ // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
+ // x8..9[0-3] are on the stack.
+
+ // Re-interleave the words in the first two rows of each block (x0..7).
+ // Also add the counter values 0-3 to x12[0-3].
+ vld1.32 {q8}, [r5, :128] // load counter values 0-3
+ vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1)
+ vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3)
+ vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5)
+ vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7)
+ vadd.u32 q12, q8 // x12 += counter values 0-3
+ vswp d1, d4
+ vswp d3, d6
+ vld1.32 {q8-q9}, [r0]! // load s0..7
+ vswp d9, d12
+ vswp d11, d14
+
+ // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
+ // after XORing the first 32 bytes.
+ vswp q1, q4
+
+ // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
+
+ // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block)
+ vadd.u32 q0, q0, q8
+ vadd.u32 q2, q2, q8
+ vadd.u32 q4, q4, q8
+ vadd.u32 q3, q3, q8
+
+ // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block)
+ vadd.u32 q1, q1, q9
+ vadd.u32 q6, q6, q9
+ vadd.u32 q5, q5, q9
+ vadd.u32 q7, q7, q9
+
+ // XOR first 32 bytes using keystream from first two rows of first block
+ vld1.8 {q8-q9}, [r2]!
+ veor q8, q8, q0
+ veor q9, q9, q1
+ vst1.8 {q8-q9}, [r1]!
+
+ // Re-interleave the words in the last two rows of each block (x8..15).
+ vld1.32 {q8-q9}, [sp, :256]
+ vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13)
+ vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15)
+ vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9)
+ vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11)
+ vld1.32 {q0-q1}, [r0] // load s8..15
+ vswp d25, d28
+ vswp d27, d30
+ vswp d17, d20
+ vswp d19, d22
+
+ // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
+
+ // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block)
+ vadd.u32 q8, q8, q0
+ vadd.u32 q10, q10, q0
+ vadd.u32 q9, q9, q0
+ vadd.u32 q11, q11, q0
+
+ // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
+ vadd.u32 q12, q12, q1
+ vadd.u32 q14, q14, q1
+ vadd.u32 q13, q13, q1
+ vadd.u32 q15, q15, q1
+
+ // XOR the rest of the data with the keystream
+
+ vld1.8 {q0-q1}, [r2]!
+ veor q0, q0, q8
+ veor q1, q1, q12
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ veor q0, q0, q2
+ veor q1, q1, q6
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ veor q0, q0, q10
+ veor q1, q1, q14
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ veor q0, q0, q4
+ veor q1, q1, q5
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ veor q0, q0, q9
+ veor q1, q1, q13
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ veor q0, q0, q3
+ veor q1, q1, q7
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]
+ mov sp, r4 // restore original stack pointer
+ veor q0, q0, q11
+ veor q1, q1, q15
+ vst1.8 {q0-q1}, [r1]
+
+ pop {r4-r5}
+ bx lr
+ENDPROC(chacha_4block_xor_neon)
diff --git a/arch/arm/crypto/chacha-neon-glue.c b/arch/arm/crypto/chacha-neon-glue.c
new file mode 100644
index 000000000000..14cc6b08e4d3
--- /dev/null
+++ b/arch/arm/crypto/chacha-neon-glue.c
@@ -0,0 +1,226 @@
+/*
+ * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+ int nrounds);
+asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+ int nrounds);
+asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+
+static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ u8 buf[CHACHA_BLOCK_SIZE];
+
+ while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+ chacha_4block_xor_neon(state, dst, src, nrounds);
+ bytes -= CHACHA_BLOCK_SIZE * 4;
+ src += CHACHA_BLOCK_SIZE * 4;
+ dst += CHACHA_BLOCK_SIZE * 4;
+ state[12] += 4;
+ }
+ while (bytes >= CHACHA_BLOCK_SIZE) {
+ chacha_block_xor_neon(state, dst, src, nrounds);
+ bytes -= CHACHA_BLOCK_SIZE;
+ src += CHACHA_BLOCK_SIZE;
+ dst += CHACHA_BLOCK_SIZE;
+ state[12]++;
+ }
+ if (bytes) {
+ memcpy(buf, src, bytes);
+ chacha_block_xor_neon(state, buf, buf, nrounds);
+ memcpy(dst, buf, bytes);
+ }
+}
+
+static int chacha_neon_stream_xor(struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src,
+ unsigned int nbytes,
+ struct chacha_ctx *ctx, u8 *iv)
+{
+ struct blkcipher_walk walk;
+ u32 state[16];
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, CHACHA_BLOCK_SIZE);
+
+ crypto_chacha_init(state, ctx, iv);
+
+ while (walk.nbytes >= CHACHA_BLOCK_SIZE) {
+ kernel_neon_begin();
+ chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
+ rounddown(walk.nbytes, CHACHA_BLOCK_SIZE),
+ ctx->nrounds);
+ kernel_neon_end();
+ err = blkcipher_walk_done(desc, &walk,
+ walk.nbytes % CHACHA_BLOCK_SIZE);
+ }
+
+ if (walk.nbytes) {
+ kernel_neon_begin();
+ chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes, ctx->nrounds);
+ kernel_neon_end();
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+ return err;
+}
+
+static int chacha_neon(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct chacha_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ u8 *iv = desc->info;
+
+ if (nbytes <= CHACHA_BLOCK_SIZE || !may_use_simd())
+ return crypto_chacha_crypt(desc, dst, src, nbytes);
+
+ return chacha_neon_stream_xor(desc, dst, src, nbytes, ctx, iv);
+}
+
+static int xchacha_neon(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct chacha_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ u8 *iv = desc->info;
+ struct chacha_ctx subctx;
+ u32 state[16];
+ u8 real_iv[16];
+
+ if (nbytes <= CHACHA_BLOCK_SIZE || !may_use_simd())
+ return crypto_xchacha_crypt(desc, dst, src, nbytes);
+
+ crypto_chacha_init(state, ctx, iv);
+
+ kernel_neon_begin();
+ hchacha_block_neon(state, subctx.key, ctx->nrounds);
+ kernel_neon_end();
+ subctx.nrounds = ctx->nrounds;
+
+ memcpy(&real_iv[0], iv + 24, 8);
+ memcpy(&real_iv[8], iv + 16, 8);
+ return chacha_neon_stream_xor(desc, dst, src, nbytes, &subctx, real_iv);
+}
+
+static struct crypto_alg algs[] = {
+ {
+ .cra_name = "chacha20",
+ .cra_driver_name = "chacha20-neon",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = 1,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_ctxsize = sizeof(struct chacha_ctx),
+ .cra_alignmask = sizeof(u32) - 1,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = CHACHA_IV_SIZE,
+ .geniv = "seqiv",
+ .setkey = crypto_chacha20_setkey,
+ .encrypt = chacha_neon,
+ .decrypt = chacha_neon,
+ },
+ },
+ }, {
+ .cra_name = "xchacha20",
+ .cra_driver_name = "xchacha20-neon",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = 1,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_ctxsize = sizeof(struct chacha_ctx),
+ .cra_alignmask = sizeof(u32) - 1,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .geniv = "seqiv",
+ .setkey = crypto_chacha20_setkey,
+ .encrypt = xchacha_neon,
+ .decrypt = xchacha_neon,
+ },
+ },
+ }, {
+ .cra_name = "xchacha12",
+ .cra_driver_name = "xchacha12-neon",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = 1,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_ctxsize = sizeof(struct chacha_ctx),
+ .cra_alignmask = sizeof(u32) - 1,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .geniv = "seqiv",
+ .setkey = crypto_chacha12_setkey,
+ .encrypt = xchacha_neon,
+ .decrypt = xchacha_neon,
+ },
+ },
+ },
+};
+
+static int __init chacha_simd_mod_init(void)
+{
+ if (!(elf_hwcap & HWCAP_NEON))
+ return -ENODEV;
+
+ return crypto_register_algs(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit chacha_simd_mod_fini(void)
+{
+ crypto_unregister_algs(algs, ARRAY_SIZE(algs));
+}
+
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
+
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-neon");
diff --git a/arch/arm/crypto/nh-neon-core.S b/arch/arm/crypto/nh-neon-core.S
new file mode 100644
index 000000000000..434d80ab531c
--- /dev/null
+++ b/arch/arm/crypto/nh-neon-core.S
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, NEON accelerated version
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+ .text
+ .fpu neon
+
+ KEY .req r0
+ MESSAGE .req r1
+ MESSAGE_LEN .req r2
+ HASH .req r3
+
+ PASS0_SUMS .req q0
+ PASS0_SUM_A .req d0
+ PASS0_SUM_B .req d1
+ PASS1_SUMS .req q1
+ PASS1_SUM_A .req d2
+ PASS1_SUM_B .req d3
+ PASS2_SUMS .req q2
+ PASS2_SUM_A .req d4
+ PASS2_SUM_B .req d5
+ PASS3_SUMS .req q3
+ PASS3_SUM_A .req d6
+ PASS3_SUM_B .req d7
+ K0 .req q4
+ K1 .req q5
+ K2 .req q6
+ K3 .req q7
+ T0 .req q8
+ T0_L .req d16
+ T0_H .req d17
+ T1 .req q9
+ T1_L .req d18
+ T1_H .req d19
+ T2 .req q10
+ T2_L .req d20
+ T2_H .req d21
+ T3 .req q11
+ T3_L .req d22
+ T3_H .req d23
+
+.macro _nh_stride k0, k1, k2, k3
+
+ // Load next message stride
+ vld1.8 {T3}, [MESSAGE]!
+
+ // Load next key stride
+ vld1.32 {\k3}, [KEY]!
+
+ // Add message words to key words
+ vadd.u32 T0, T3, \k0
+ vadd.u32 T1, T3, \k1
+ vadd.u32 T2, T3, \k2
+ vadd.u32 T3, T3, \k3
+
+ // Multiply 32x32 => 64 and accumulate
+ vmlal.u32 PASS0_SUMS, T0_L, T0_H
+ vmlal.u32 PASS1_SUMS, T1_L, T1_H
+ vmlal.u32 PASS2_SUMS, T2_L, T2_H
+ vmlal.u32 PASS3_SUMS, T3_L, T3_H
+.endm
+
+/*
+ * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
+ * u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+ENTRY(nh_neon)
+
+ vld1.32 {K0,K1}, [KEY]!
+ vmov.u64 PASS0_SUMS, #0
+ vmov.u64 PASS1_SUMS, #0
+ vld1.32 {K2}, [KEY]!
+ vmov.u64 PASS2_SUMS, #0
+ vmov.u64 PASS3_SUMS, #0
+
+ subs MESSAGE_LEN, MESSAGE_LEN, #64
+ blt .Lloop4_done
+.Lloop4:
+ _nh_stride K0, K1, K2, K3
+ _nh_stride K1, K2, K3, K0
+ _nh_stride K2, K3, K0, K1
+ _nh_stride K3, K0, K1, K2
+ subs MESSAGE_LEN, MESSAGE_LEN, #64
+ bge .Lloop4
+
+.Lloop4_done:
+ ands MESSAGE_LEN, MESSAGE_LEN, #63
+ beq .Ldone
+ _nh_stride K0, K1, K2, K3
+
+ subs MESSAGE_LEN, MESSAGE_LEN, #16
+ beq .Ldone
+ _nh_stride K1, K2, K3, K0
+
+ subs MESSAGE_LEN, MESSAGE_LEN, #16
+ beq .Ldone
+ _nh_stride K2, K3, K0, K1
+
+.Ldone:
+ // Sum the accumulators for each pass, then store the sums to 'hash'
+ vadd.u64 T0_L, PASS0_SUM_A, PASS0_SUM_B
+ vadd.u64 T0_H, PASS1_SUM_A, PASS1_SUM_B
+ vadd.u64 T1_L, PASS2_SUM_A, PASS2_SUM_B
+ vadd.u64 T1_H, PASS3_SUM_A, PASS3_SUM_B
+ vst1.8 {T0-T1}, [HASH]
+ bx lr
+ENDPROC(nh_neon)
diff --git a/arch/arm/crypto/nhpoly1305-neon-glue.c b/arch/arm/crypto/nhpoly1305-neon-glue.c
new file mode 100644
index 000000000000..49aae87cb2bc
--- /dev/null
+++ b/arch/arm/crypto/nhpoly1305-neon-glue.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ * (NEON accelerated version)
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/hash.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/module.h>
+
+asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len,
+ u8 hash[NH_HASH_BYTES]);
+
+/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
+static void _nh_neon(const u32 *key, const u8 *message, size_t message_len,
+ __le64 hash[NH_NUM_PASSES])
+{
+ nh_neon(key, message, message_len, (u8 *)hash);
+}
+
+static int nhpoly1305_neon_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ if (srclen < 64 || !may_use_simd())
+ return crypto_nhpoly1305_update(desc, src, srclen);
+
+ do {
+ unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+
+ kernel_neon_begin();
+ crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon);
+ kernel_neon_end();
+ src += n;
+ srclen -= n;
+ } while (srclen);
+ return 0;
+}
+
+static struct shash_alg nhpoly1305_alg = {
+ .base.cra_name = "nhpoly1305",
+ .base.cra_driver_name = "nhpoly1305-neon",
+ .base.cra_priority = 200,
+ .base.cra_ctxsize = sizeof(struct nhpoly1305_key),
+ .base.cra_module = THIS_MODULE,
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .init = crypto_nhpoly1305_init,
+ .update = nhpoly1305_neon_update,
+ .final = crypto_nhpoly1305_final,
+ .setkey = crypto_nhpoly1305_setkey,
+ .descsize = sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+ if (!(elf_hwcap & HWCAP_NEON))
+ return -ENODEV;
+
+ return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+ crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (NEON-accelerated)");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-neon");
diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h
index f13ae153fb24..d2315ffd8f12 100644
--- a/arch/arm/include/asm/elf.h
+++ b/arch/arm/include/asm/elf.h
@@ -112,8 +112,12 @@ int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs);
#define CORE_DUMP_USE_REGSET
#define ELF_EXEC_PAGESIZE 4096
-/* This is the base location for PIE (ET_DYN with INTERP) loads. */
-#define ELF_ET_DYN_BASE 0x400000UL
+/* This is the location that an ET_DYN program is loaded if exec'ed. Typical
+ use of this is to invoke "./ld.so someprog" to test out a new version of
+ the loader. We need to make sure that it is out of the way of the program
+ that it will "exec", and that there is sufficient room for the brk. */
+
+#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2)
/* When the program starts, a1 contains a pointer to a function to be
registered with atexit, as per the SVR4 ABI. A value of 0 means we
diff --git a/arch/arm/include/asm/fiq_glue.h b/arch/arm/include/asm/fiq_glue.h
new file mode 100644
index 000000000000..a9e244f9f197
--- /dev/null
+++ b/arch/arm/include/asm/fiq_glue.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __ASM_FIQ_GLUE_H
+#define __ASM_FIQ_GLUE_H
+
+struct fiq_glue_handler {
+ void (*fiq)(struct fiq_glue_handler *h, void *regs, void *svc_sp);
+ void (*resume)(struct fiq_glue_handler *h);
+};
+typedef void (*fiq_return_handler_t)(void);
+
+int fiq_glue_register_handler(struct fiq_glue_handler *handler);
+int fiq_glue_set_return_handler(fiq_return_handler_t fiq_return);
+int fiq_glue_clear_return_handler(fiq_return_handler_t fiq_return);
+
+#ifdef CONFIG_FIQ_GLUE
+void fiq_glue_resume(void);
+#else
+static inline void fiq_glue_resume(void) {}
+#endif
+
+#endif
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 370f7a732900..4e9b95745217 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -3,6 +3,7 @@
#ifdef CONFIG_ARM_CPU_TOPOLOGY
+#include <linux/cpufreq.h>
#include <linux/cpumask.h>
struct cputopo_arm {
@@ -24,6 +25,14 @@ void init_cpu_topology(void);
void store_cpu_topology(unsigned int cpuid);
const struct cpumask *cpu_coregroup_mask(int cpu);
+#ifdef CONFIG_CPU_FREQ
+#define arch_scale_freq_capacity cpufreq_scale_freq_capacity
+#define arch_scale_max_freq_capacity cpufreq_scale_max_freq_capacity
+#define arch_scale_min_freq_capacity cpufreq_scale_min_freq_capacity
+#endif
+#define arch_scale_cpu_capacity scale_cpu_capacity
+extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu);
+
#else
static inline void init_cpu_topology(void) { }
diff --git a/arch/arm/include/asm/traps.h b/arch/arm/include/asm/traps.h
index f555bb3664dc..683d9230984a 100644
--- a/arch/arm/include/asm/traps.h
+++ b/arch/arm/include/asm/traps.h
@@ -18,7 +18,6 @@ struct undef_hook {
void register_undef_hook(struct undef_hook *hook);
void unregister_undef_hook(struct undef_hook *hook);
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static inline int __in_irqentry_text(unsigned long ptr)
{
extern char __irqentry_text_start[];
@@ -27,12 +26,6 @@ static inline int __in_irqentry_text(unsigned long ptr)
return ptr >= (unsigned long)&__irqentry_text_start &&
ptr < (unsigned long)&__irqentry_text_end;
}
-#else
-static inline int __in_irqentry_text(unsigned long ptr)
-{
- return 0;
-}
-#endif
static inline int in_exception_text(unsigned long ptr)
{
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index 56be67ecf0fa..60f148fbb36b 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -12,6 +12,7 @@
#include <asm/unistd.h>
#include <asm/ftrace.h>
#include <asm/unwind.h>
+#include <asm/memory.h>
#ifdef CONFIG_NEED_RET_TO_USER
#include <mach/entry-macro.S>
@@ -35,6 +36,9 @@ ret_fast_syscall:
UNWIND(.fnstart )
UNWIND(.cantunwind )
disable_irq_notrace @ disable interrupts
+ ldr r2, [tsk, #TI_ADDR_LIMIT]
+ cmp r2, #TASK_SIZE
+ blne addr_limit_check_failed
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
bne fast_work_pending
@@ -61,6 +65,9 @@ ret_fast_syscall:
UNWIND(.cantunwind )
str r0, [sp, #S_R0 + S_OFF]! @ save returned r0
disable_irq_notrace @ disable interrupts
+ ldr r2, [tsk, #TI_ADDR_LIMIT]
+ cmp r2, #TASK_SIZE
+ blne addr_limit_check_failed
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
beq no_work_pending
@@ -93,6 +100,9 @@ ENTRY(ret_to_user)
ret_slow_syscall:
disable_irq_notrace @ disable interrupts
ENTRY(ret_to_user_from_irq)
+ ldr r2, [tsk, #TI_ADDR_LIMIT]
+ cmp r2, #TASK_SIZE
+ blne addr_limit_check_failed
ldr r1, [tsk, #TI_FLAGS]
tst r1, #_TIF_WORK_MASK
bne slow_work_pending
diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c
index 9232caee7060..f3c662299531 100644
--- a/arch/arm/kernel/kgdb.c
+++ b/arch/arm/kernel/kgdb.c
@@ -140,6 +140,8 @@ int kgdb_arch_handle_exception(int exception_vector, int signo,
static int kgdb_brk_fn(struct pt_regs *regs, unsigned int instr)
{
+ if (user_mode(regs))
+ return -1;
kgdb_handle_exception(1, SIGTRAP, 0, regs);
return 0;
@@ -147,6 +149,8 @@ static int kgdb_brk_fn(struct pt_regs *regs, unsigned int instr)
static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int instr)
{
+ if (user_mode(regs))
+ return -1;
compiled_break = 1;
kgdb_handle_exception(1, SIGTRAP, 0, regs);
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 91d2d5b01414..38ad8b956c6b 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -80,6 +80,7 @@ void arch_cpu_idle_prepare(void)
void arch_cpu_idle_enter(void)
{
+ idle_notifier_call_chain(IDLE_START);
ledtrig_cpu(CPU_LED_IDLE_START);
#ifdef CONFIG_PL310_ERRATA_769419
wmb();
@@ -89,6 +90,78 @@ void arch_cpu_idle_enter(void)
void arch_cpu_idle_exit(void)
{
ledtrig_cpu(CPU_LED_IDLE_END);
+ idle_notifier_call_chain(IDLE_END);
+}
+
+/*
+ * dump a block of kernel memory from around the given address
+ */
+static void show_data(unsigned long addr, int nbytes, const char *name)
+{
+ int i, j;
+ int nlines;
+ u32 *p;
+
+ /*
+ * don't attempt to dump non-kernel addresses or
+ * values that are probably just small negative numbers
+ */
+ if (addr < PAGE_OFFSET || addr > -256UL)
+ return;
+
+ printk("\n%s: %#lx:\n", name, addr);
+
+ /*
+ * round address down to a 32 bit boundary
+ * and always dump a multiple of 32 bytes
+ */
+ p = (u32 *)(addr & ~(sizeof(u32) - 1));
+ nbytes += (addr & (sizeof(u32) - 1));
+ nlines = (nbytes + 31) / 32;
+
+
+ for (i = 0; i < nlines; i++) {
+ /*
+ * just display low 16 bits of address to keep
+ * each line of the dump < 80 characters
+ */
+ printk("%04lx ", (unsigned long)p & 0xffff);
+ for (j = 0; j < 8; j++) {
+ u32 data;
+ if (probe_kernel_address(p, data)) {
+ pr_cont(" ********");
+ } else {
+ pr_cont(" %08x", data);
+ }
+ ++p;
+ }
+ pr_cont("\n");
+ }
+}
+
+static void show_extra_register_data(struct pt_regs *regs, int nbytes)
+{
+ mm_segment_t fs;
+
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ show_data(regs->ARM_pc - nbytes, nbytes * 2, "PC");
+ show_data(regs->ARM_lr - nbytes, nbytes * 2, "LR");
+ show_data(regs->ARM_sp - nbytes, nbytes * 2, "SP");
+ show_data(regs->ARM_ip - nbytes, nbytes * 2, "IP");
+ show_data(regs->ARM_fp - nbytes, nbytes * 2, "FP");
+ show_data(regs->ARM_r0 - nbytes, nbytes * 2, "R0");
+ show_data(regs->ARM_r1 - nbytes, nbytes * 2, "R1");
+ show_data(regs->ARM_r2 - nbytes, nbytes * 2, "R2");
+ show_data(regs->ARM_r3 - nbytes, nbytes * 2, "R3");
+ show_data(regs->ARM_r4 - nbytes, nbytes * 2, "R4");
+ show_data(regs->ARM_r5 - nbytes, nbytes * 2, "R5");
+ show_data(regs->ARM_r6 - nbytes, nbytes * 2, "R6");
+ show_data(regs->ARM_r7 - nbytes, nbytes * 2, "R7");
+ show_data(regs->ARM_r8 - nbytes, nbytes * 2, "R8");
+ show_data(regs->ARM_r9 - nbytes, nbytes * 2, "R9");
+ show_data(regs->ARM_r10 - nbytes, nbytes * 2, "R10");
+ set_fs(fs);
}
void __show_regs(struct pt_regs *regs)
@@ -182,6 +255,8 @@ void __show_regs(struct pt_regs *regs)
printk("Control: %08x%s\n", ctrl, buf);
}
#endif
+
+ show_extra_register_data(regs, 128);
}
void show_regs(struct pt_regs * regs)
diff --git a/arch/arm/kernel/reboot.c b/arch/arm/kernel/reboot.c
index 3fa867a2aae6..d704df89a546 100644
--- a/arch/arm/kernel/reboot.c
+++ b/arch/arm/kernel/reboot.c
@@ -6,6 +6,7 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
+#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/reboot.h>
@@ -122,6 +123,31 @@ void machine_power_off(void)
pm_power_off();
}
+#ifdef CONFIG_ARM_FLUSH_CONSOLE_ON_RESTART
+void arm_machine_flush_console(void)
+{
+ printk("\n");
+ pr_emerg("Restarting %s\n", linux_banner);
+ if (console_trylock()) {
+ console_unlock();
+ return;
+ }
+
+ mdelay(50);
+
+ local_irq_disable();
+ if (!console_trylock())
+ pr_emerg("arm_restart: Console was locked! Busting\n");
+ else
+ pr_emerg("arm_restart: Console was locked!\n");
+ console_unlock();
+}
+#else
+void arm_machine_flush_console(void)
+{
+}
+#endif
+
/*
* Restart requires that the secondary CPUs stop performing any activity
* while the primary CPU resets the system. Systems with a single CPU can
@@ -138,6 +164,10 @@ void machine_restart(char *cmd)
local_irq_disable();
smp_send_stop();
+ /* Flush the console to make sure all the relevant messages make it
+ * out to the console drivers */
+ arm_machine_flush_console();
+
if (arm_pm_restart)
arm_pm_restart(reboot_mode, cmd);
else
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 6bee5c9b1133..a53e2b827b1c 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -14,6 +14,7 @@
#include <linux/uaccess.h>
#include <linux/tracehook.h>
#include <linux/uprobes.h>
+#include <linux/syscalls.h>
#include <asm/elf.h>
#include <asm/cacheflush.h>
@@ -634,3 +635,9 @@ struct page *get_signal_page(void)
return page;
}
+
+/* Defer to generic check */
+asmlinkage void addr_limit_check_failed(void)
+{
+ addr_limit_user_check();
+}
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index ec279d161b32..07b0812a0109 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -42,7 +42,7 @@
*/
static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
-unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
return per_cpu(cpu_scale, cpu);
}
@@ -153,6 +153,8 @@ static void __init parse_dt_topology(void)
}
+static const struct sched_group_energy * const cpu_core_energy(int cpu);
+
/*
* Look for a customed capacity of a CPU in the cpu_capacity table during the
* boot. The update of all CPUs is in O(n^2) for heteregeneous system but the
@@ -160,10 +162,14 @@ static void __init parse_dt_topology(void)
*/
static void update_cpu_capacity(unsigned int cpu)
{
- if (!cpu_capacity(cpu))
- return;
+ unsigned long capacity = SCHED_CAPACITY_SCALE;
- set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity);
+ if (cpu_core_energy(cpu)) {
+ int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1;
+ capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap;
+ }
+
+ set_capacity_scale(cpu, capacity);
pr_info("CPU%u: update cpu_capacity %lu\n",
cpu, arch_scale_cpu_capacity(NULL, cpu));
@@ -275,17 +281,138 @@ void store_cpu_topology(unsigned int cpuid)
cpu_topology[cpuid].socket_id, mpidr);
}
+/*
+ * ARM TC2 specific energy cost model data. There are no unit requirements for
+ * the data. Data can be normalized to any reference point, but the
+ * normalization must be consistent. That is, one bogo-joule/watt must be the
+ * same quantity for all data, but we don't care what it is.
+ */
+static struct idle_state idle_states_cluster_a7[] = {
+ { .power = 25 }, /* arch_cpu_idle() (active idle) = WFI */
+ { .power = 25 }, /* WFI */
+ { .power = 10 }, /* cluster-sleep-l */
+ };
+
+static struct idle_state idle_states_cluster_a15[] = {
+ { .power = 70 }, /* arch_cpu_idle() (active idle) = WFI */
+ { .power = 70 }, /* WFI */
+ { .power = 25 }, /* cluster-sleep-b */
+ };
+
+static struct capacity_state cap_states_cluster_a7[] = {
+ /* Cluster only power */
+ { .cap = 150, .power = 2967, }, /* 350 MHz */
+ { .cap = 172, .power = 2792, }, /* 400 MHz */
+ { .cap = 215, .power = 2810, }, /* 500 MHz */
+ { .cap = 258, .power = 2815, }, /* 600 MHz */
+ { .cap = 301, .power = 2919, }, /* 700 MHz */
+ { .cap = 344, .power = 2847, }, /* 800 MHz */
+ { .cap = 387, .power = 3917, }, /* 900 MHz */
+ { .cap = 430, .power = 4905, }, /* 1000 MHz */
+ };
+
+static struct capacity_state cap_states_cluster_a15[] = {
+ /* Cluster only power */
+ { .cap = 426, .power = 7920, }, /* 500 MHz */
+ { .cap = 512, .power = 8165, }, /* 600 MHz */
+ { .cap = 597, .power = 8172, }, /* 700 MHz */
+ { .cap = 682, .power = 8195, }, /* 800 MHz */
+ { .cap = 768, .power = 8265, }, /* 900 MHz */
+ { .cap = 853, .power = 8446, }, /* 1000 MHz */
+ { .cap = 938, .power = 11426, }, /* 1100 MHz */
+ { .cap = 1024, .power = 15200, }, /* 1200 MHz */
+ };
+
+static struct sched_group_energy energy_cluster_a7 = {
+ .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a7),
+ .idle_states = idle_states_cluster_a7,
+ .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a7),
+ .cap_states = cap_states_cluster_a7,
+};
+
+static struct sched_group_energy energy_cluster_a15 = {
+ .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a15),
+ .idle_states = idle_states_cluster_a15,
+ .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a15),
+ .cap_states = cap_states_cluster_a15,
+};
+
+static struct idle_state idle_states_core_a7[] = {
+ { .power = 0 }, /* arch_cpu_idle (active idle) = WFI */
+ { .power = 0 }, /* WFI */
+ { .power = 0 }, /* cluster-sleep-l */
+ };
+
+static struct idle_state idle_states_core_a15[] = {
+ { .power = 0 }, /* arch_cpu_idle (active idle) = WFI */
+ { .power = 0 }, /* WFI */
+ { .power = 0 }, /* cluster-sleep-b */
+ };
+
+static struct capacity_state cap_states_core_a7[] = {
+ /* Power per cpu */
+ { .cap = 150, .power = 187, }, /* 350 MHz */
+ { .cap = 172, .power = 275, }, /* 400 MHz */
+ { .cap = 215, .power = 334, }, /* 500 MHz */
+ { .cap = 258, .power = 407, }, /* 600 MHz */
+ { .cap = 301, .power = 447, }, /* 700 MHz */
+ { .cap = 344, .power = 549, }, /* 800 MHz */
+ { .cap = 387, .power = 761, }, /* 900 MHz */
+ { .cap = 430, .power = 1024, }, /* 1000 MHz */
+ };
+
+static struct capacity_state cap_states_core_a15[] = {
+ /* Power per cpu */
+ { .cap = 426, .power = 2021, }, /* 500 MHz */
+ { .cap = 512, .power = 2312, }, /* 600 MHz */
+ { .cap = 597, .power = 2756, }, /* 700 MHz */
+ { .cap = 682, .power = 3125, }, /* 800 MHz */
+ { .cap = 768, .power = 3524, }, /* 900 MHz */
+ { .cap = 853, .power = 3846, }, /* 1000 MHz */
+ { .cap = 938, .power = 5177, }, /* 1100 MHz */
+ { .cap = 1024, .power = 6997, }, /* 1200 MHz */
+ };
+
+static struct sched_group_energy energy_core_a7 = {
+ .nr_idle_states = ARRAY_SIZE(idle_states_core_a7),
+ .idle_states = idle_states_core_a7,
+ .nr_cap_states = ARRAY_SIZE(cap_states_core_a7),
+ .cap_states = cap_states_core_a7,
+};
+
+static struct sched_group_energy energy_core_a15 = {
+ .nr_idle_states = ARRAY_SIZE(idle_states_core_a15),
+ .idle_states = idle_states_core_a15,
+ .nr_cap_states = ARRAY_SIZE(cap_states_core_a15),
+ .cap_states = cap_states_core_a15,
+};
+
+/* sd energy functions */
+static inline
+const struct sched_group_energy * const cpu_cluster_energy(int cpu)
+{
+ return cpu_topology[cpu].socket_id ? &energy_cluster_a7 :
+ &energy_cluster_a15;
+}
+
+static inline
+const struct sched_group_energy * const cpu_core_energy(int cpu)
+{
+ return cpu_topology[cpu].socket_id ? &energy_core_a7 :
+ &energy_core_a15;
+}
+
static inline int cpu_corepower_flags(void)
{
- return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
+ return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \
+ SD_SHARE_CAP_STATES;
}
static struct sched_domain_topology_level arm_topology[] = {
#ifdef CONFIG_SCHED_MC
- { cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+ { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) },
#endif
- { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) },
{ NULL, },
};
diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index 24659952c278..11da0f50a1fe 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -270,6 +270,11 @@ v6_dma_clean_range:
* - end - virtual end address of region
*/
ENTRY(v6_dma_flush_range)
+#ifdef CONFIG_CACHE_FLUSH_RANGE_LIMIT
+ sub r2, r1, r0
+ cmp r2, #CONFIG_CACHE_FLUSH_RANGE_LIMIT
+ bhi v6_dma_flush_dcache_all
+#endif
#ifdef CONFIG_DMA_CACHE_RWFO
ldrb r2, [r0] @ read for ownership
strb r2, [r0] @ write for ownership
@@ -292,6 +297,18 @@ ENTRY(v6_dma_flush_range)
mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
ret lr
+#ifdef CONFIG_CACHE_FLUSH_RANGE_LIMIT
+v6_dma_flush_dcache_all:
+ mov r0, #0
+#ifdef HARVARD_CACHE
+ mcr p15, 0, r0, c7, c14, 0 @ D cache clean+invalidate
+#else
+ mcr p15, 0, r0, c7, c15, 0 @ Cache clean+invalidate
+#endif
+ mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
+ mov pc, lr
+#endif
+
/*
* dma_map_area(start, size, dir)
* - start - kernel virtual start address
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 5ca207ada852..d6c9dee1b252 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -276,10 +276,10 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
local_irq_enable();
/*
- * If we're in an interrupt or have no user
+ * If we're in an interrupt, or have no irqs, or have no user
* context, we must not take the fault..
*/
- if (faulthandler_disabled() || !mm)
+ if (faulthandler_disabled() || irqs_disabled() || !mm)
goto no_context;
if (user_mode(regs))
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3e43874568f9..514dd68f6b97 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -15,6 +15,7 @@ config ARM64
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_USE_CMPXCHG_LOCKREF
+ select ARCH_SUPPORTS_LTO_CLANG
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_NUMA_BALANCING
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION
@@ -109,6 +110,7 @@ config ARM64
select POWER_SUPPLY
select SPARSE_IRQ
select SYSCTL_EXCEPTION_TRACE
+ select THREAD_INFO_IN_TASK
help
ARM 64-bit (AArch64) Linux support.
@@ -417,7 +419,7 @@ config ARM64_ERRATUM_845719
config ARM64_ERRATUM_843419
bool "Cortex-A53: 843419: A load or store might access an incorrect address"
- default y
+ default y if !LTO_CLANG
select ARM64_MODULE_CMODEL_LARGE if MODULES
help
This option links the kernel with '--fix-cortex-a53-843419' and
@@ -852,6 +854,14 @@ config SETEND_EMULATION
If unsure, say Y
endif
+config ARM64_SW_TTBR0_PAN
+ bool "Emulate Privileged Access Never using TTBR0_EL1 switching"
+ help
+ Enabling this option prevents the kernel from accessing
+ user-space memory directly by pointing TTBR0_EL1 to a reserved
+ zeroed area and reserved ASID. The user access routines
+ restore the valid TTBR0_EL1 temporarily.
+
menu "ARMv8.1 architectural features"
config ARM64_HW_AFDBM
@@ -977,7 +987,7 @@ config RANDOMIZE_BASE
config RANDOMIZE_MODULE_REGION_FULL
bool "Randomize the module region independently from the core kernel"
- depends on RANDOMIZE_BASE && !DYNAMIC_FTRACE
+ depends on RANDOMIZE_BASE && !DYNAMIC_FTRACE && !LTO_CLANG
default y
help
Randomizes the location of the module region without considering the
@@ -1011,6 +1021,23 @@ config CMDLINE
entering them here. As a minimum, you should specify the the
root device (e.g. root=/dev/nfs).
+choice
+ prompt "Kernel command line type" if CMDLINE != ""
+ default CMDLINE_FROM_BOOTLOADER
+
+config CMDLINE_FROM_BOOTLOADER
+ bool "Use bootloader kernel arguments if available"
+ help
+ Uses the command-line options passed by the boot loader. If
+ the boot loader doesn't provide any, the default kernel command
+ string provided in CMDLINE will be used.
+
+config CMDLINE_EXTEND
+ bool "Extend bootloader kernel arguments"
+ help
+ The command-line arguments provided by the boot loader will be
+ appended to the default kernel command string.
+
config CMDLINE_FORCE
bool "Always use the default kernel command string"
help
@@ -1018,6 +1045,7 @@ config CMDLINE_FORCE
loader passes other arguments to the kernel.
This is useful if you cannot or don't want to change the
command-line options your boot loader passes to the kernel.
+endchoice
config EFI_STUB
bool
@@ -1050,6 +1078,41 @@ config DMI
However, even with this option, the resultant kernel should
continue to boot on existing non-UEFI platforms.
+config BUILD_ARM64_APPENDED_DTB_IMAGE
+ bool "Build a concatenated Image.gz/dtb by default"
+ depends on OF
+ help
+ Enabling this option will cause a concatenated Image.gz and list of
+ DTBs to be built by default (instead of a standalone Image.gz.)
+ The image will built in arch/arm64/boot/Image.gz-dtb
+
+choice
+ prompt "Appended DTB Kernel Image name"
+ depends on BUILD_ARM64_APPENDED_DTB_IMAGE
+ help
+ Enabling this option will cause a specific kernel image Image or
+ Image.gz to be used for final image creation.
+ The image will built in arch/arm64/boot/IMAGE-NAME-dtb
+
+ config IMG_GZ_DTB
+ bool "Image.gz-dtb"
+ config IMG_DTB
+ bool "Image-dtb"
+endchoice
+
+config BUILD_ARM64_APPENDED_KERNEL_IMAGE_NAME
+ string
+ depends on BUILD_ARM64_APPENDED_DTB_IMAGE
+ default "Image.gz-dtb" if IMG_GZ_DTB
+ default "Image-dtb" if IMG_DTB
+
+config BUILD_ARM64_APPENDED_DTB_IMAGE_NAMES
+ string "Default dtb names"
+ depends on BUILD_ARM64_APPENDED_DTB_IMAGE
+ help
+ Space separated list of names of dtbs to append when
+ building a concatenated Image.gz-dtb.
+
endmenu
menu "Userspace binary formats"
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index ee94597773fa..fde945314ca6 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -26,8 +26,17 @@ ifeq ($(CONFIG_ARM64_ERRATUM_843419),y)
ifeq ($(call ld-option, --fix-cortex-a53-843419),)
$(warning ld does not support --fix-cortex-a53-843419; kernel may be susceptible to erratum)
else
+ ifeq ($(call gold-ifversion, -lt, 114000000, y), y)
+$(warning This version of GNU gold may generate incorrect code with --fix-cortex-a53-843419;\
+ see https://sourceware.org/bugzilla/show_bug.cgi?id=21491)
+ endif
LDFLAGS_vmlinux += --fix-cortex-a53-843419
endif
+else
+ ifeq ($(ld-name),gold)
+# Pass --no-fix-cortex-a53-843419 to ensure the erratum fix is disabled
+LDFLAGS += --no-fix-cortex-a53-843419
+ endif
endif
KBUILD_DEFCONFIG := defconfig
@@ -41,9 +50,17 @@ $(warning LSE atomics not supported by binutils)
endif
endif
-KBUILD_CFLAGS += -mgeneral-regs-only $(lseinstr)
+ifeq ($(cc-name),clang)
+# This is a workaround for https://bugs.llvm.org/show_bug.cgi?id=30792.
+# TODO: revert when this is fixed in LLVM.
+KBUILD_CFLAGS += -mno-implicit-float
+else
+KBUILD_CFLAGS += -mgeneral-regs-only
+endif
+KBUILD_CFLAGS += $(lseinstr)
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
KBUILD_CFLAGS += $(call cc-option, -mpc-relative-literal-loads)
+KBUILD_CFLAGS += -fno-pic
KBUILD_AFLAGS += $(lseinstr)
ifeq ($(CONFIG_CPU_BIG_ENDIAN), y)
@@ -62,6 +79,10 @@ CHECKFLAGS += -D__aarch64__
ifeq ($(CONFIG_ARM64_MODULE_CMODEL_LARGE), y)
KBUILD_CFLAGS_MODULE += -mcmodel=large
+ifeq ($(CONFIG_LTO_CLANG), y)
+# Code model is not stored in LLVM IR, so we need to pass it also to LLVMgold
+KBUILD_LDFLAGS_MODULE += -plugin-opt=-code-model=large
+endif
endif
ifeq ($(CONFIG_ARM64_MODULE_PLTS),y)
@@ -80,6 +101,10 @@ else
TEXT_OFFSET := 0x00080000
endif
+ifeq ($(cc-name),clang)
+KBUILD_CFLAGS += $(call cc-disable-warning, asm-operand-widths)
+endif
+
# KASAN_SHADOW_OFFSET = VA_START + (1 << (VA_BITS - 3)) - (1 << 61)
# in 32-bit arithmetic
KASAN_SHADOW_OFFSET := $(shell printf "0x%08x00000000\n" $$(( \
@@ -98,7 +123,12 @@ libs-y := arch/arm64/lib/ $(libs-y)
core-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
# Default target when executing plain make
+ifeq ($(CONFIG_BUILD_ARM64_APPENDED_DTB_IMAGE),y)
+KBUILD_IMAGE := $(subst $\",,$(CONFIG_BUILD_ARM64_APPENDED_KERNEL_IMAGE_NAME))
+else
KBUILD_IMAGE := Image.gz
+endif
+
KBUILD_DTBS := dtbs
all: $(KBUILD_IMAGE) $(KBUILD_DTBS)
@@ -125,6 +155,9 @@ dtbs: prepare scripts
dtbs_install:
$(Q)$(MAKE) $(dtbinst)=$(boot)/dts
+Image-dtb Image.gz-dtb: vmlinux scripts dtbs
+ $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
+
PHONY += vdso_install
vdso_install:
$(Q)$(MAKE) $(build)=arch/arm64/kernel/vdso $@
diff --git a/arch/arm64/boot/.gitignore b/arch/arm64/boot/.gitignore
index 8dab0bb6ae66..34e35209fc2e 100644
--- a/arch/arm64/boot/.gitignore
+++ b/arch/arm64/boot/.gitignore
@@ -1,2 +1,4 @@
Image
+Image-dtb
Image.gz
+Image.gz-dtb
diff --git a/arch/arm64/boot/Makefile b/arch/arm64/boot/Makefile
index 1f012c506434..2c8cb864315e 100644
--- a/arch/arm64/boot/Makefile
+++ b/arch/arm64/boot/Makefile
@@ -14,16 +14,29 @@
# Based on the ia64 boot/Makefile.
#
+include $(srctree)/arch/arm64/boot/dts/Makefile
+
OBJCOPYFLAGS_Image :=-O binary -R .note -R .note.gnu.build-id -R .comment -S
targets := Image Image.gz
+DTB_NAMES := $(subst $\",,$(CONFIG_BUILD_ARM64_APPENDED_DTB_IMAGE_NAMES))
+ifneq ($(DTB_NAMES),)
+DTB_LIST := $(addsuffix .dtb,$(DTB_NAMES))
+else
+DTB_LIST := $(dtb-y)
+endif
+DTB_OBJS := $(addprefix $(obj)/dts/,$(DTB_LIST))
+
$(obj)/Image: vmlinux FORCE
$(call if_changed,objcopy)
$(obj)/Image.bz2: $(obj)/Image FORCE
$(call if_changed,bzip2)
+$(obj)/Image-dtb: $(obj)/Image $(DTB_OBJS) FORCE
+ $(call if_changed,cat)
+
$(obj)/Image.gz: $(obj)/Image FORCE
$(call if_changed,gzip)
@@ -36,6 +49,9 @@ $(obj)/Image.lzma: $(obj)/Image FORCE
$(obj)/Image.lzo: $(obj)/Image FORCE
$(call if_changed,lzo)
+$(obj)/Image.gz-dtb: $(obj)/Image.gz $(DTB_OBJS) FORCE
+ $(call if_changed,cat)
+
install:
$(CONFIG_SHELL) $(srctree)/$(src)/install.sh $(KERNELRELEASE) \
$(obj)/Image System.map "$(INSTALL_PATH)"
diff --git a/arch/arm64/boot/dts/Makefile b/arch/arm64/boot/dts/Makefile
index 6684f97c2722..7ad2cf0a607b 100644
--- a/arch/arm64/boot/dts/Makefile
+++ b/arch/arm64/boot/dts/Makefile
@@ -28,3 +28,17 @@ dtstree := $(srctree)/$(src)
dtb-$(CONFIG_OF_ALL_DTBS) := $(patsubst $(dtstree)/%.dts,%.dtb, $(foreach d,$(dts-dirs), $(wildcard $(dtstree)/$(d)/*.dts)))
always := $(dtb-y)
+
+targets += dtbs
+
+DTB_NAMES := $(subst $\",,$(CONFIG_BUILD_ARM64_APPENDED_DTB_IMAGE_NAMES))
+ifneq ($(DTB_NAMES),)
+DTB_LIST := $(addsuffix .dtb,$(DTB_NAMES))
+else
+DTB_LIST := $(dtb-y)
+endif
+targets += $(DTB_LIST)
+
+dtbs: $(addprefix $(obj)/, $(DTB_LIST))
+
+clean-files := dts/*.dtb *.dtb
diff --git a/arch/arm64/configs/cuttlefish_defconfig b/arch/arm64/configs/cuttlefish_defconfig
new file mode 100644
index 000000000000..1ff6114ac682
--- /dev/null
+++ b/arch/arm64/configs/cuttlefish_defconfig
@@ -0,0 +1,435 @@
+# CONFIG_FHANDLE is not set
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CPUSETS=y
+# CONFIG_PROC_PID_CPUSET is not set
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_SCHEDTUNE=y
+CONFIG_MEMCG=y
+CONFIG_MEMCG_SWAP=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_CGROUP_BPF=y
+CONFIG_SCHED_AUTOGROUP=y
+CONFIG_SCHED_TUNE=y
+CONFIG_DEFAULT_USE_ENERGY_AWARE=y
+CONFIG_BLK_DEV_INITRD=y
+# CONFIG_RD_BZIP2 is not set
+# CONFIG_RD_LZMA is not set
+# CONFIG_RD_XZ is not set
+# CONFIG_RD_LZO is not set
+# CONFIG_RD_LZ4 is not set
+CONFIG_SGETMASK_SYSCALL=y
+# CONFIG_SYSFS_SYSCALL is not set
+CONFIG_KALLSYMS_ALL=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_EMBEDDED=y
+# CONFIG_VM_EVENT_COUNTERS is not set
+# CONFIG_COMPAT_BRK is not set
+CONFIG_PROFILING=y
+CONFIG_KPROBES=y
+CONFIG_JUMP_LABEL=y
+CONFIG_CC_STACKPROTECTOR_STRONG=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODVERSIONS=y
+CONFIG_PCI=y
+CONFIG_PCI_HOST_GENERIC=y
+CONFIG_PREEMPT=y
+CONFIG_HZ_100=y
+# CONFIG_SPARSEMEM_VMEMMAP is not set
+CONFIG_KSM=y
+CONFIG_TRANSPARENT_HUGEPAGE=y
+CONFIG_ZSMALLOC=y
+CONFIG_SECCOMP=y
+CONFIG_PARAVIRT=y
+CONFIG_ARMV8_DEPRECATED=y
+CONFIG_SWP_EMULATION=y
+CONFIG_CP15_BARRIER_EMULATION=y
+CONFIG_SETEND_EMULATION=y
+CONFIG_ARM64_SW_TTBR0_PAN=y
+CONFIG_ARM64_LSE_ATOMICS=y
+CONFIG_RANDOMIZE_BASE=y
+CONFIG_CMDLINE="console=ttyAMA0"
+CONFIG_CMDLINE_EXTEND=y
+# CONFIG_EFI is not set
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+CONFIG_COMPAT=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+# CONFIG_PM_WAKELOCKS_GC is not set
+CONFIG_PM_DEBUG=y
+CONFIG_CPU_IDLE=y
+CONFIG_ARM_CPUIDLE=y
+CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y
+CONFIG_CPU_FREQ_GOV_POWERSAVE=y
+CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
+CONFIG_CPUFREQ_DT=y
+CONFIG_ARM_BIG_LITTLE_CPUFREQ=y
+CONFIG_ARM_DT_BL_CPUFREQ=y
+CONFIG_ARM_SCPI_CPUFREQ=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=y
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_NET_IPGRE_DEMUX=y
+CONFIG_NET_IPVTI=y
+CONFIG_INET_ESP=y
+# CONFIG_INET_XFRM_MODE_BEET is not set
+CONFIG_INET_UDP_DIAG=y
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_TCP_CONG_ADVANCED=y
+# CONFIG_TCP_CONG_BIC is not set
+# CONFIG_TCP_CONG_WESTWOOD is not set
+# CONFIG_TCP_CONG_HTCP is not set
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_VTI=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_NETFILTER=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_MATCH_BPF=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA2=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_IP_NF_NAT=y
+CONFIG_IP_NF_TARGET_MASQUERADE=y
+CONFIG_IP_NF_TARGET_NETMAP=y
+CONFIG_IP_NF_TARGET_REDIRECT=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_MATCH_RPFILTER=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_L2TP=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_CLS_BPF=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_CFG80211=y
+# CONFIG_CFG80211_DEFAULT_PS is not set
+CONFIG_MAC80211=y
+# CONFIG_MAC80211_RC_MINSTREL is not set
+CONFIG_RFKILL=y
+# CONFIG_UEVENT_HELPER is not set
+CONFIG_DEVTMPFS=y
+# CONFIG_ALLOW_DEV_COREDUMP is not set
+CONFIG_DEBUG_DEVRES=y
+CONFIG_OF_UNITTEST=y
+CONFIG_ZRAM=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_VIRTIO_BLK=y
+CONFIG_UID_SYS_STATS=y
+CONFIG_SCSI=y
+# CONFIG_SCSI_PROC_FS is not set
+CONFIG_BLK_DEV_SD=y
+CONFIG_SCSI_VIRTIO=y
+CONFIG_MD=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_DM_VERITY_AVB=y
+CONFIG_NETDEVICES=y
+CONFIG_NETCONSOLE=y
+CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_TUN=y
+CONFIG_VIRTIO_NET=y
+# CONFIG_ETHERNET is not set
+CONFIG_PHYLIB=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PPTP=y
+CONFIG_PPPOL2TP=y
+CONFIG_USB_USBNET=y
+# CONFIG_USB_NET_AX8817X is not set
+# CONFIG_USB_NET_AX88179_178A is not set
+# CONFIG_USB_NET_CDCETHER is not set
+# CONFIG_USB_NET_CDC_NCM is not set
+# CONFIG_USB_NET_NET1080 is not set
+# CONFIG_USB_NET_CDC_SUBSET is not set
+# CONFIG_USB_NET_ZAURUS is not set
+# CONFIG_WLAN_VENDOR_ADMTEK is not set
+# CONFIG_WLAN_VENDOR_ATH is not set
+# CONFIG_WLAN_VENDOR_ATMEL is not set
+# CONFIG_WLAN_VENDOR_BROADCOM is not set
+# CONFIG_WLAN_VENDOR_CISCO is not set
+# CONFIG_WLAN_VENDOR_INTEL is not set
+# CONFIG_WLAN_VENDOR_INTERSIL is not set
+# CONFIG_WLAN_VENDOR_MARVELL is not set
+# CONFIG_WLAN_VENDOR_MEDIATEK is not set
+# CONFIG_WLAN_VENDOR_RALINK is not set
+# CONFIG_WLAN_VENDOR_REALTEK is not set
+# CONFIG_WLAN_VENDOR_RSI is not set
+# CONFIG_WLAN_VENDOR_ST is not set
+# CONFIG_WLAN_VENDOR_TI is not set
+# CONFIG_WLAN_VENDOR_ZYDAS is not set
+CONFIG_VIRT_WIFI=y
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_KEYRESET=y
+# CONFIG_INPUT_KEYBOARD is not set
+# CONFIG_INPUT_MOUSE is not set
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_INPUT_TABLET=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_INPUT_GPIO=y
+# CONFIG_VT is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_DEVKMEM is not set
+CONFIG_SERIAL_8250=y
+# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_8250_NR_UARTS=48
+CONFIG_SERIAL_8250_EXTENDED=y
+CONFIG_SERIAL_8250_MANY_PORTS=y
+CONFIG_SERIAL_8250_SHARE_IRQ=y
+CONFIG_SERIAL_AMBA_PL011=y
+CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
+CONFIG_VIRTIO_CONSOLE=y
+CONFIG_HW_RANDOM=y
+CONFIG_HW_RANDOM_VIRTIO=y
+# CONFIG_HW_RANDOM_CAVIUM is not set
+# CONFIG_DEVPORT is not set
+# CONFIG_I2C_COMPAT is not set
+# CONFIG_I2C_HELPER_AUTO is not set
+# CONFIG_HWMON is not set
+CONFIG_THERMAL=y
+CONFIG_CPU_THERMAL=y
+CONFIG_MEDIA_SUPPORT=y
+# CONFIG_DVB_TUNER_DIB0070 is not set
+# CONFIG_DVB_TUNER_DIB0090 is not set
+# CONFIG_VGA_ARB is not set
+CONFIG_DRM=y
+# CONFIG_DRM_FBDEV_EMULATION is not set
+CONFIG_DRM_VIRTIO_GPU=y
+CONFIG_SOUND=y
+CONFIG_SND=y
+CONFIG_HIDRAW=y
+CONFIG_UHID=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_LOGITECH_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGIG940_FF=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_GREENASIA=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_GADGET=y
+CONFIG_USB_CONFIGFS=y
+CONFIG_USB_CONFIGFS_F_FS=y
+CONFIG_USB_CONFIGFS_F_ACC=y
+CONFIG_USB_CONFIGFS_F_AUDIO_SRC=y
+CONFIG_USB_CONFIGFS_UEVENT=y
+CONFIG_USB_CONFIGFS_F_MIDI=y
+CONFIG_MMC=y
+# CONFIG_PWRSEQ_EMMC is not set
+# CONFIG_PWRSEQ_SIMPLE is not set
+# CONFIG_MMC_BLOCK is not set
+CONFIG_RTC_CLASS=y
+# CONFIG_RTC_HCTOSYS is not set
+# CONFIG_RTC_SYSTOHC is not set
+CONFIG_RTC_DRV_PL031=y
+CONFIG_VIRTIO_PCI=y
+# CONFIG_VIRTIO_PCI_LEGACY is not set
+CONFIG_VIRTIO_BALLOON=y
+CONFIG_VIRTIO_INPUT=y
+CONFIG_VIRTIO_MMIO=y
+CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y
+CONFIG_STAGING=y
+CONFIG_ASHMEM=y
+CONFIG_ANDROID_VSOC=y
+CONFIG_ION=y
+CONFIG_COMMON_CLK_SCPI=y
+# CONFIG_COMMON_CLK_XGENE is not set
+CONFIG_MAILBOX=y
+# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ARM_SCPI_PROTOCOL=y
+# CONFIG_ARM_SCPI_POWER_DOMAIN is not set
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_EXT4_ENCRYPTION=y
+CONFIG_F2FS_FS=y
+CONFIG_F2FS_FS_SECURITY=y
+CONFIG_F2FS_FS_ENCRYPTION=y
+# CONFIG_DNOTIFY is not set
+CONFIG_QUOTA=y
+CONFIG_QFMT_V2=y
+CONFIG_FUSE_FS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_SDCARD_FS=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+CONFIG_PRINTK_TIME=y
+CONFIG_DEBUG_INFO=y
+# CONFIG_ENABLE_MUST_CHECK is not set
+CONFIG_FRAME_WARN=1024
+# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_STACK_USAGE=y
+CONFIG_DEBUG_MEMORY_INIT=y
+CONFIG_PANIC_TIMEOUT=5
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+CONFIG_RCU_CPU_STALL_TIMEOUT=60
+CONFIG_ENABLE_DEFAULT_TRACERS=y
+CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_LSM_MMAP_MIN_ADDR=65536
+CONFIG_HARDENED_USERCOPY=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_CRYPTO_ADIANTUM=y
+CONFIG_CRYPTO_SHA512=y
+CONFIG_CRYPTO_LZ4=y
+CONFIG_CRYPTO_ZSTD=y
+CONFIG_CRYPTO_ANSI_CPRNG=y
+CONFIG_XZ_DEC=y
diff --git a/arch/arm64/configs/ranchu64_defconfig b/arch/arm64/configs/ranchu64_defconfig
new file mode 100644
index 000000000000..fc55008d8c4c
--- /dev/null
+++ b/arch/arm64/configs/ranchu64_defconfig
@@ -0,0 +1,312 @@
+# CONFIG_LOCALVERSION_AUTO is not set
+# CONFIG_SWAP is not set
+CONFIG_POSIX_MQUEUE=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_BSD_PROCESS_ACCT_V3=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_LOG_BUF_SHIFT=14
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_SCHED_AUTOGROUP=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_EMBEDDED=y
+# CONFIG_COMPAT_BRK is not set
+CONFIG_PROFILING=y
+CONFIG_ARCH_MMAP_RND_BITS=24
+CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16
+# CONFIG_BLK_DEV_BSG is not set
+# CONFIG_IOSCHED_DEADLINE is not set
+CONFIG_ARCH_VEXPRESS=y
+CONFIG_NR_CPUS=4
+CONFIG_PREEMPT=y
+CONFIG_KSM=y
+CONFIG_SECCOMP=y
+CONFIG_ARMV8_DEPRECATED=y
+CONFIG_SWP_EMULATION=y
+CONFIG_CP15_BARRIER_EMULATION=y
+CONFIG_SETEND_EMULATION=y
+CONFIG_CMDLINE="console=ttyAMA0"
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+CONFIG_COMPAT=y
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+# CONFIG_PM_WAKELOCKS_GC is not set
+CONFIG_PM_DEBUG=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=y
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_INET_ESP=y
+# CONFIG_INET_LRO is not set
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_NETFILTER=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_QTAGUID=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA2=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_RPFILTER=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_TARGET_ECN=y
+CONFIG_IP_NF_TARGET_TTL=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_MATCH_AH=y
+CONFIG_IP6_NF_MATCH_EUI64=y
+CONFIG_IP6_NF_MATCH_FRAG=y
+CONFIG_IP6_NF_MATCH_OPTS=y
+CONFIG_IP6_NF_MATCH_HL=y
+CONFIG_IP6_NF_MATCH_IPV6HEADER=y
+CONFIG_IP6_NF_MATCH_MH=y
+CONFIG_IP6_NF_MATCH_RT=y
+CONFIG_IP6_NF_TARGET_HL=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_BRIDGE=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_CLS_ACT=y
+# CONFIG_WIRELESS is not set
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_VIRTIO_BLK=y
+CONFIG_SCSI=y
+# CONFIG_SCSI_PROC_FS is not set
+CONFIG_BLK_DEV_SD=y
+# CONFIG_SCSI_LOWLEVEL is not set
+CONFIG_MD=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_NETDEVICES=y
+CONFIG_TUN=y
+CONFIG_VIRTIO_NET=y
+CONFIG_SMC91X=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PPPOLAC=y
+CONFIG_PPPOPNS=y
+# CONFIG_WLAN is not set
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_KEYRESET=y
+CONFIG_KEYBOARD_GOLDFISH_EVENTS=y
+# CONFIG_INPUT_MOUSE is not set
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_INPUT_TABLET=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_KEYCHORD=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_INPUT_GPIO=y
+# CONFIG_SERIO_SERPORT is not set
+# CONFIG_VT is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_DEVKMEM is not set
+CONFIG_SERIAL_AMBA_PL011=y
+CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
+CONFIG_VIRTIO_CONSOLE=y
+# CONFIG_HW_RANDOM is not set
+CONFIG_BATTERY_GOLDFISH=y
+# CONFIG_HWMON is not set
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_FB=y
+CONFIG_FB_GOLDFISH=y
+CONFIG_FB_SIMPLE=y
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_LOGO=y
+# CONFIG_LOGO_LINUX_MONO is not set
+# CONFIG_LOGO_LINUX_VGA16 is not set
+CONFIG_SOUND=y
+CONFIG_SND=y
+CONFIG_HIDRAW=y
+CONFIG_UHID=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_LOGITECH_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGIG940_FF=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_GREENASIA=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+# CONFIG_USB_SUPPORT is not set
+CONFIG_RTC_CLASS=y
+CONFIG_VIRTIO_MMIO=y
+CONFIG_STAGING=y
+CONFIG_ASHMEM=y
+CONFIG_ANDROID_TIMED_GPIO=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_SYNC=y
+CONFIG_SW_SYNC=y
+CONFIG_SW_SYNC_USER=y
+CONFIG_ION=y
+CONFIG_GOLDFISH_AUDIO=y
+CONFIG_GOLDFISH=y
+CONFIG_GOLDFISH_PIPE=y
+# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_EXT2_FS=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_QUOTA=y
+CONFIG_FUSE_FS=y
+CONFIG_CUSE=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+# CONFIG_MISC_FILESYSTEMS is not set
+CONFIG_NFS_FS=y
+CONFIG_ROOT_NFS=y
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_FS=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_PANIC_TIMEOUT=5
+# CONFIG_SCHED_DEBUG is not set
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+# CONFIG_FTRACE is not set
+CONFIG_ATOMIC64_SELFTEST=y
+CONFIG_DEBUG_RODATA=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index abb79b3cfcfe..550e02a5aa32 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -18,7 +18,8 @@ obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
-CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
+aes-ce-cipher-y := aes-ce-cipher-glue.o aes-ce-cipher-core.o
+CFLAGS_aes-ce-cipher-core.o += -march=armv8-a+crypto -Wa,-march=armv8-a+crypto $(DISABLE_LTO)
obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-cipher-core.c
index 50d9fe11d0c8..9f4191774b35 100644
--- a/arch/arm64/crypto/aes-ce-cipher.c
+++ b/arch/arm64/crypto/aes-ce-cipher-core.c
@@ -1,5 +1,5 @@
/*
- * aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions
+ * aes-ce-cipher-core.c - core AES cipher using ARMv8 Crypto Extensions
*
* Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
*
@@ -10,16 +10,10 @@
#include <asm/neon.h>
#include <crypto/aes.h>
-#include <linux/cpufeature.h>
#include <linux/crypto.h>
-#include <linux/module.h>
#include "aes-ce-setkey.h"
-MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-
struct aes_block {
u8 b[AES_BLOCK_SIZE];
};
@@ -36,7 +30,7 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
return 6 + ctx->key_length / 4;
}
-static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
+void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
struct aes_block *out = (struct aes_block *)dst;
@@ -81,7 +75,7 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
kernel_neon_end();
}
-static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
+void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
struct aes_block *out = (struct aes_block *)dst;
@@ -223,48 +217,3 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
return 0;
}
EXPORT_SYMBOL(ce_aes_expandkey);
-
-int ce_aes_setkey(struct crypto_tfm *tfm, const u8 *in_key,
- unsigned int key_len)
-{
- struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
- int ret;
-
- ret = ce_aes_expandkey(ctx, in_key, key_len);
- if (!ret)
- return 0;
-
- tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
-}
-EXPORT_SYMBOL(ce_aes_setkey);
-
-static struct crypto_alg aes_alg = {
- .cra_name = "aes",
- .cra_driver_name = "aes-ce",
- .cra_priority = 250,
- .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct crypto_aes_ctx),
- .cra_module = THIS_MODULE,
- .cra_cipher = {
- .cia_min_keysize = AES_MIN_KEY_SIZE,
- .cia_max_keysize = AES_MAX_KEY_SIZE,
- .cia_setkey = ce_aes_setkey,
- .cia_encrypt = aes_cipher_encrypt,
- .cia_decrypt = aes_cipher_decrypt
- }
-};
-
-static int __init aes_mod_init(void)
-{
- return crypto_register_alg(&aes_alg);
-}
-
-static void __exit aes_mod_exit(void)
-{
- crypto_unregister_alg(&aes_alg);
-}
-
-module_cpu_feature_match(AES, aes_mod_init);
-module_exit(aes_mod_exit);
diff --git a/arch/arm64/crypto/aes-ce-cipher-glue.c b/arch/arm64/crypto/aes-ce-cipher-glue.c
new file mode 100644
index 000000000000..442949e7e849
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-cipher-glue.c
@@ -0,0 +1,83 @@
+/*
+ * aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <crypto/aes.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+#include "aes-ce-setkey.h"
+
+MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+extern void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]);
+extern void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]);
+
+#ifdef CONFIG_CFI_CLANG
+static inline void __cfi_aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
+{
+ aes_cipher_encrypt(tfm, dst, src);
+}
+
+static inline void __cfi_aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
+{
+ aes_cipher_decrypt(tfm, dst, src);
+}
+
+#define aes_cipher_encrypt __cfi_aes_cipher_encrypt
+#define aes_cipher_decrypt __cfi_aes_cipher_decrypt
+#endif
+
+int ce_aes_setkey(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+ int ret;
+
+ ret = ce_aes_expandkey(ctx, in_key, key_len);
+ if (!ret)
+ return 0;
+
+ tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+}
+EXPORT_SYMBOL(ce_aes_setkey);
+
+static struct crypto_alg aes_alg = {
+ .cra_name = "aes",
+ .cra_driver_name = "aes-ce",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct crypto_aes_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_cipher = {
+ .cia_min_keysize = AES_MIN_KEY_SIZE,
+ .cia_max_keysize = AES_MAX_KEY_SIZE,
+ .cia_setkey = ce_aes_setkey,
+ .cia_encrypt = aes_cipher_encrypt,
+ .cia_decrypt = aes_cipher_decrypt
+ }
+};
+
+static int __init aes_mod_init(void)
+{
+ return crypto_register_alg(&aes_alg);
+}
+
+static void __exit aes_mod_exit(void)
+{
+ crypto_unregister_alg(&aes_alg);
+}
+
+module_cpu_feature_match(AES, aes_mod_init);
+module_exit(aes_mod_exit);
diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
index ea319c055f5d..f08ecf431dcf 100644
--- a/arch/arm64/crypto/sha1-ce-glue.c
+++ b/arch/arm64/crypto/sha1-ce-glue.c
@@ -28,6 +28,14 @@ struct sha1_ce_state {
asmlinkage void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
int blocks);
+#ifdef CONFIG_CFI_CLANG
+static inline void __cfi_sha1_ce_transform(struct sha1_state *sst,
+ u8 const *src, int blocks)
+{
+ sha1_ce_transform((struct sha1_ce_state *)sst, src, blocks);
+}
+#define sha1_ce_transform __cfi_sha1_ce_transform
+#endif
const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count);
const u32 sha1_ce_offsetof_finalize = offsetof(struct sha1_ce_state, finalize);
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
index 0ed9486f75dd..c2432210f5e2 100644
--- a/arch/arm64/crypto/sha2-ce-glue.c
+++ b/arch/arm64/crypto/sha2-ce-glue.c
@@ -28,6 +28,14 @@ struct sha256_ce_state {
asmlinkage void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
int blocks);
+#ifdef CONFIG_CFI_CLANG
+static inline void __cfi_sha2_ce_transform(struct sha256_state *sst,
+ u8 const *src, int blocks)
+{
+ sha2_ce_transform((struct sha256_ce_state *)sst, src, blocks);
+}
+#define sha2_ce_transform __cfi_sha2_ce_transform
+#endif
const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state,
sst.count);
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index 44e1d7f10add..28196b18e394 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -1,7 +1,6 @@
generic-y += bugs.h
generic-y += clkdev.h
generic-y += cputime.h
-generic-y += current.h
generic-y += delay.h
generic-y += div64.h
generic-y += dma.h
diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h
index f8ae6d6e4767..fecf7bf863f0 100644
--- a/arch/arm64/include/asm/arch_gicv3.h
+++ b/arch/arm64/include/asm/arch_gicv3.h
@@ -83,14 +83,20 @@
#define read_gicreg(r) \
({ \
u64 reg; \
- asm volatile("mrs_s %0, " __stringify(r) : "=r" (reg)); \
+ asm volatile(DEFINE_MRS_S \
+ "mrs_s %0, " __stringify(r) "\n" \
+ UNDEFINE_MRS_S \
+ : "=r" (reg)); \
reg; \
})
#define write_gicreg(v,r) \
do { \
u64 __val = (v); \
- asm volatile("msr_s " __stringify(r) ", %0" : : "r" (__val));\
+ asm volatile(DEFINE_MSR_S \
+ "msr_s " __stringify(r) ", %0\n" \
+ UNDEFINE_MSR_S \
+ : : "r" (__val)); \
} while (0)
/*
@@ -102,13 +108,19 @@
static inline void gic_write_eoir(u32 irq)
{
- asm volatile("msr_s " __stringify(ICC_EOIR1_EL1) ", %0" : : "r" ((u64)irq));
+ asm volatile(DEFINE_MSR_S
+ "msr_s " __stringify(ICC_EOIR1_EL1) ", %0\n"
+ UNDEFINE_MSR_S
+ : : "r" ((u64)irq));
isb();
}
static inline void gic_write_dir(u32 irq)
{
- asm volatile("msr_s " __stringify(ICC_DIR_EL1) ", %0" : : "r" ((u64)irq));
+ asm volatile(DEFINE_MSR_S
+ "msr_s " __stringify(ICC_DIR_EL1) ", %0\n"
+ UNDEFINE_MSR_S
+ : : "r" ((u64)irq));
isb();
}
@@ -116,7 +128,10 @@ static inline u64 gic_read_iar_common(void)
{
u64 irqstat;
- asm volatile("mrs_s %0, " __stringify(ICC_IAR1_EL1) : "=r" (irqstat));
+ asm volatile(DEFINE_MRS_S
+ "mrs_s %0, " __stringify(ICC_IAR1_EL1) "\n"
+ UNDEFINE_MRS_S
+ : "=r" (irqstat));
dsb(sy);
return irqstat;
}
@@ -135,7 +150,9 @@ static inline u64 gic_read_iar_cavium_thunderx(void)
asm volatile(
"nop;nop;nop;nop\n\t"
"nop;nop;nop;nop\n\t"
+ DEFINE_MRS_S
"mrs_s %0, " __stringify(ICC_IAR1_EL1) "\n\t"
+ UNDEFINE_MRS_S
"nop;nop;nop;nop"
: "=r" (irqstat));
mb();
@@ -145,43 +162,68 @@ static inline u64 gic_read_iar_cavium_thunderx(void)
static inline void gic_write_pmr(u32 val)
{
- asm volatile("msr_s " __stringify(ICC_PMR_EL1) ", %0" : : "r" ((u64)val));
+ asm volatile(DEFINE_MSR_S
+ "msr_s " __stringify(ICC_PMR_EL1) ", %0\n"
+ UNDEFINE_MSR_S
+ : : "r" ((u64)val));
+ /* As per the architecture specification */
+ mb();
}
static inline void gic_write_ctlr(u32 val)
{
- asm volatile("msr_s " __stringify(ICC_CTLR_EL1) ", %0" : : "r" ((u64)val));
+ asm volatile(DEFINE_MSR_S
+ "msr_s " __stringify(ICC_CTLR_EL1) ", %0\n"
+ UNDEFINE_MSR_S
+ : : "r" ((u64)val));
isb();
}
static inline void gic_write_grpen1(u32 val)
{
- asm volatile("msr_s " __stringify(ICC_GRPEN1_EL1) ", %0" : : "r" ((u64)val));
+ asm volatile(DEFINE_MSR_S
+ "msr_s " __stringify(ICC_GRPEN1_EL1) ", %0\n"
+ UNDEFINE_MSR_S
+ : : "r" ((u64)val));
isb();
}
static inline void gic_write_sgi1r(u64 val)
{
- asm volatile("msr_s " __stringify(ICC_SGI1R_EL1) ", %0" : : "r" (val));
+ asm volatile(DEFINE_MSR_S
+ "msr_s " __stringify(ICC_SGI1R_EL1) ", %0\n"
+ UNDEFINE_MSR_S
+ : : "r" (val));
+ /* As per the architecture specification */
+ mb();
}
static inline u32 gic_read_sre(void)
{
u64 val;
- asm volatile("mrs_s %0, " __stringify(ICC_SRE_EL1) : "=r" (val));
+ asm volatile(DEFINE_MRS_S
+ "mrs_s %0, " __stringify(ICC_SRE_EL1) "\n"
+ UNDEFINE_MRS_S
+ : "=r" (val));
return val;
}
static inline void gic_write_sre(u32 val)
{
- asm volatile("msr_s " __stringify(ICC_SRE_EL1) ", %0" : : "r" ((u64)val));
+ asm volatile(DEFINE_MSR_S
+ "msr_s " __stringify(ICC_SRE_EL1) ", %0\n"
+ UNDEFINE_MSR_S
+ : : "r" ((u64)val));
isb();
}
static inline void gic_write_bpr1(u32 val)
{
- asm volatile("msr_s " __stringify(ICC_BPR1_EL1) ", %0" : : "r" (val));
+ asm volatile(DEFINE_MSR_S
+ "msr_s " __stringify(ICC_BPR1_EL1) ", %x0\n"
+ UNDEFINE_MSR_S
+ : : "rZ" (val));
}
#define gic_read_typer(c) readq_relaxed(c)
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 3f85bbcd7e40..148849c10604 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -42,6 +42,15 @@
msr daifclr, #2
.endm
+ .macro save_and_disable_irq, flags
+ mrs \flags, daif
+ msr daifset, #2
+ .endm
+
+ .macro restore_irq, flags
+ msr daif, \flags
+ .endm
+
/*
* Enable and disable debug exceptions.
*/
@@ -451,6 +460,13 @@ alternative_endif
movk \reg, :abs_g0_nc:\val
.endm
+/*
+ * Return the current thread_info.
+ */
+ .macro get_thread_info, rd
+ mrs \rd, sp_el0
+ .endm
+
.macro pte_to_phys, phys, pte
and \phys, \pte, #(((1 << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
.endm
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 15868eca58de..1dc16f5b54ec 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -221,6 +221,12 @@ static inline bool system_supports_mixed_endian_el0(void)
return id_aa64mmfr0_mixed_endian_el0(read_system_reg(SYS_ID_AA64MMFR0_EL1));
}
+static inline bool system_uses_ttbr0_pan(void)
+{
+ return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) &&
+ !cpus_have_cap(ARM64_HAS_PAN);
+}
+
#define ARM64_SSBD_UNKNOWN -1
#define ARM64_SSBD_FORCE_DISABLE 0
#define ARM64_SSBD_KERNEL 1
diff --git a/arch/arm64/include/asm/current.h b/arch/arm64/include/asm/current.h
new file mode 100644
index 000000000000..86c404171305
--- /dev/null
+++ b/arch/arm64/include/asm/current.h
@@ -0,0 +1,30 @@
+#ifndef __ASM_CURRENT_H
+#define __ASM_CURRENT_H
+
+#include <linux/compiler.h>
+
+#include <asm/sysreg.h>
+
+#ifndef __ASSEMBLY__
+
+struct task_struct;
+
+/*
+ * We don't use read_sysreg() as we want the compiler to cache the value where
+ * possible.
+ */
+static __always_inline struct task_struct *get_current(void)
+{
+ unsigned long sp_el0;
+
+ asm ("mrs %0, sp_el0" : "=r" (sp_el0));
+
+ return (struct task_struct *)sp_el0;
+}
+
+#define current get_current()
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __ASM_CURRENT_H */
+
diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
index 65615820155e..6c4f9e8febf3 100644
--- a/arch/arm64/include/asm/efi.h
+++ b/arch/arm64/include/asm/efi.h
@@ -1,6 +1,7 @@
#ifndef _ASM_EFI_H
#define _ASM_EFI_H
+#include <asm/cpufeature.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/neon.h>
@@ -78,7 +79,32 @@ static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt)
static inline void efi_set_pgd(struct mm_struct *mm)
{
- switch_mm(NULL, mm, NULL);
+ __switch_mm(mm);
+
+ if (system_uses_ttbr0_pan()) {
+ if (mm != current->active_mm) {
+ /*
+ * Update the current thread's saved ttbr0 since it is
+ * restored as part of a return from exception. Enable
+ * access to the valid TTBR0_EL1 and invoke the errata
+ * workaround directly since there is no return from
+ * exception when invoking the EFI run-time services.
+ */
+ update_saved_ttbr0(current, mm);
+ uaccess_ttbr0_enable();
+ post_ttbr_update_workaround();
+ } else {
+ /*
+ * Defer the switch to the current thread's TTBR0_EL1
+ * until uaccess_enable(). Restore the current
+ * thread's saved ttbr0 corresponding to its active_mm
+ * (if different from init_mm).
+ */
+ uaccess_ttbr0_disable();
+ if (current->active_mm != &init_mm)
+ update_saved_ttbr0(current, current->active_mm);
+ }
+ }
}
void efi_virtmap_load(void);
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index 1fb023076dfc..40a8a94db23b 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -169,7 +169,7 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
#ifdef CONFIG_COMPAT
/* PIE load location for compat arm. Must match ARM ELF_ET_DYN_BASE. */
-#define COMPAT_ELF_ET_DYN_BASE 0x000400000UL
+#define COMPAT_ELF_ET_DYN_BASE (2 * TASK_SIZE_32 / 3)
/* AArch32 registers. */
#define COMPAT_ELF_NGREG 18
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index d14c478976d0..85997c0e5443 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -175,6 +175,12 @@
#define ESR_ELx_SYS64_ISS_SYS_CTR_READ (ESR_ELx_SYS64_ISS_SYS_CTR | \
ESR_ELx_SYS64_ISS_DIR_READ)
+#define ESR_ELx_SYS64_ISS_SYS_CNTVCT (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 2, 14, 0) | \
+ ESR_ELx_SYS64_ISS_DIR_READ)
+
+#define ESR_ELx_SYS64_ISS_SYS_CNTFRQ (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 0, 14, 0) | \
+ ESR_ELx_SYS64_ISS_DIR_READ)
+
#ifndef __ASSEMBLY__
#include <asm/types.h>
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index 2a5090fb9113..a891bb6bb3d4 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -21,15 +21,12 @@
#include <linux/futex.h>
#include <linux/uaccess.h>
-#include <asm/alternative.h>
-#include <asm/cpufeature.h>
#include <asm/errno.h>
-#include <asm/sysreg.h>
#define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \
+do { \
+ uaccess_enable(); \
asm volatile( \
- ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN) \
" prfm pstl1strm, %2\n" \
"1: ldxr %w1, %2\n" \
insn "\n" \
@@ -44,11 +41,11 @@
" .popsection\n" \
_ASM_EXTABLE(1b, 4b) \
_ASM_EXTABLE(2b, 4b) \
- ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN) \
: "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \
: "r" (oparg), "Ir" (-EFAULT) \
- : "memory")
+ : "memory"); \
+ uaccess_disable(); \
+} while (0)
static inline int
arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
@@ -102,8 +99,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr,
return -EFAULT;
uaddr = __uaccess_mask_ptr(_uaddr);
+ uaccess_enable();
asm volatile("// futex_atomic_cmpxchg_inatomic\n"
-ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
" prfm pstl1strm, %2\n"
"1: ldxr %w1, %2\n"
" sub %w3, %w1, %w4\n"
@@ -118,10 +115,10 @@ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
" .popsection\n"
_ASM_EXTABLE(1b, 4b)
_ASM_EXTABLE(2b, 4b)
-ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
: "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp)
: "r" (oldval), "r" (newval), "Ir" (-EFAULT)
: "memory");
+ uaccess_disable();
*uval = val;
return ret;
diff --git a/arch/arm64/include/asm/hw_breakpoint.h b/arch/arm64/include/asm/hw_breakpoint.h
index 9510ace570e2..b6b167ac082b 100644
--- a/arch/arm64/include/asm/hw_breakpoint.h
+++ b/arch/arm64/include/asm/hw_breakpoint.h
@@ -77,7 +77,11 @@ static inline void decode_ctrl_reg(u32 reg,
/* Lengths */
#define ARM_BREAKPOINT_LEN_1 0x1
#define ARM_BREAKPOINT_LEN_2 0x3
+#define ARM_BREAKPOINT_LEN_3 0x7
#define ARM_BREAKPOINT_LEN_4 0xf
+#define ARM_BREAKPOINT_LEN_5 0x1f
+#define ARM_BREAKPOINT_LEN_6 0x3f
+#define ARM_BREAKPOINT_LEN_7 0x7f
#define ARM_BREAKPOINT_LEN_8 0xff
/* Kernel stepping */
@@ -119,7 +123,7 @@ struct perf_event;
struct pmu;
extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl,
- int *gen_len, int *gen_type);
+ int *gen_len, int *gen_type, int *offset);
extern int arch_check_bp_in_kernelspace(struct perf_event *bp);
extern int arch_validate_hwbkpt_settings(struct perf_event *bp);
extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
index 7e51d1b57c0c..7803343e5881 100644
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -19,6 +19,7 @@
#ifndef __ASM_KERNEL_PGTABLE_H
#define __ASM_KERNEL_PGTABLE_H
+#include <asm/pgtable.h>
#include <asm/sparsemem.h>
/*
@@ -54,6 +55,12 @@
#define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE)
#define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE)
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+#define RESERVED_TTBR0_SIZE (PAGE_SIZE)
+#else
+#define RESERVED_TTBR0_SIZE (0)
+#endif
+
/* Initial memory map size */
#if ARM64_SWAPPER_USES_SECTION_MAPS
#define SWAPPER_BLOCK_SHIFT SECTION_SHIFT
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index b18e852d27e8..36d6758d580b 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -29,7 +29,9 @@
({ \
u64 reg; \
asm volatile(ALTERNATIVE("mrs %0, " __stringify(r##nvh),\
- "mrs_s %0, " __stringify(r##vh),\
+ DEFINE_MRS_S \
+ "mrs_s %0, " __stringify(r##vh) "\n"\
+ UNDEFINE_MRS_S, \
ARM64_HAS_VIRT_HOST_EXTN) \
: "=r" (reg)); \
reg; \
@@ -39,7 +41,9 @@
do { \
u64 __val = (u64)(v); \
asm volatile(ALTERNATIVE("msr " __stringify(r##nvh) ", %x0",\
- "msr_s " __stringify(r##vh) ", %x0",\
+ DEFINE_MSR_S \
+ "msr_s " __stringify(r##vh) ", %x0\n"\
+ UNDEFINE_MSR_S, \
ARM64_HAS_VIRT_HOST_EXTN) \
: : "rZ" (__val)); \
} while (0)
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 547519abc751..4287acbe25c7 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -47,7 +47,7 @@
* If the page is in the bottom half, we have to use the top half. If
* the page is in the top half, we have to use the bottom half:
*
- * T = __virt_to_phys(__hyp_idmap_text_start)
+ * T = __pa_symbol(__hyp_idmap_text_start)
* if (T & BIT(VA_BITS - 1))
* HYP_VA_MIN = 0 //idmap in upper half
* else
@@ -290,7 +290,7 @@ static inline void __kvm_flush_dcache_pud(pud_t pud)
kvm_flush_dcache_to_poc(page_address(page), PUD_SIZE);
}
-#define kvm_virt_to_phys(x) __virt_to_phys((unsigned long)(x))
+#define kvm_virt_to_phys(x) __pa_symbol(x)
void kvm_set_way_flush(struct kvm_vcpu *vcpu);
void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index ba917be5565a..345020953cc4 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -152,6 +152,11 @@ extern u64 kimage_vaddr;
/* the offset between the kernel virtual and physical mappings */
extern u64 kimage_voffset;
+static inline unsigned long kaslr_offset(void)
+{
+ return kimage_vaddr - KIMAGE_VADDR;
+}
+
/*
* Allow all memory at the discovery stage. We will clip it later.
*/
@@ -192,6 +197,7 @@ static inline void *phys_to_virt(phys_addr_t x)
#define __va(x) ((void *)__phys_to_virt((phys_addr_t)(x)))
#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
#define virt_to_pfn(x) __phys_to_pfn(__virt_to_phys(x))
+#define sym_to_pfn(x) __phys_to_pfn(__pa_symbol(x))
/*
* virt_to_page(k) convert a _valid_ virtual address to struct page *
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 6ac34c75f4e1..18b9f159afcd 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -16,7 +16,9 @@
#ifndef __ASM_MMU_H
#define __ASM_MMU_H
+
#define USER_ASID_FLAG (UL(1) << 48)
+#define TTBR_ASID_MASK (UL(0xffff) << 48)
#ifndef __ASSEMBLY__
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index b96c4799f881..c0ae91c39292 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -23,6 +23,7 @@
#include <linux/sched.h>
#include <asm/cacheflush.h>
+#include <asm/cpufeature.h>
#include <asm/proc-fns.h>
#include <asm-generic/mm_hooks.h>
#include <asm/cputype.h>
@@ -44,7 +45,7 @@ static inline void contextidr_thread_switch(struct task_struct *next)
*/
static inline void cpu_set_reserved_ttbr0(void)
{
- unsigned long ttbr = virt_to_phys(empty_zero_page);
+ unsigned long ttbr = __pa_symbol(empty_zero_page);
write_sysreg(ttbr, ttbr0_el1);
isb();
@@ -110,7 +111,7 @@ static inline void cpu_uninstall_idmap(void)
local_flush_tlb_all();
cpu_set_default_tcr_t0sz();
- if (mm != &init_mm)
+ if (mm != &init_mm && !system_uses_ttbr0_pan())
cpu_switch_mm(mm->pgd, mm);
}
@@ -120,14 +121,14 @@ static inline void cpu_install_idmap(void)
local_flush_tlb_all();
cpu_set_idmap_tcr_t0sz();
- cpu_switch_mm(idmap_pg_dir, &init_mm);
+ cpu_switch_mm(lm_alias(idmap_pg_dir), &init_mm);
}
/*
* Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD,
* avoiding the possibility of conflicting TLB entries being allocated.
*/
-static inline void cpu_replace_ttbr1(pgd_t *pgd)
+static inline void __nocfi cpu_replace_ttbr1(pgd_t *pgd)
{
typedef void (ttbr_replace_func)(phys_addr_t);
extern ttbr_replace_func idmap_cpu_replace_ttbr1;
@@ -135,7 +136,7 @@ static inline void cpu_replace_ttbr1(pgd_t *pgd)
phys_addr_t pgd_phys = virt_to_phys(pgd);
- replace_phys = (void *)virt_to_phys(idmap_cpu_replace_ttbr1);
+ replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1);
cpu_install_idmap();
replace_phys(pgd_phys);
@@ -170,20 +171,27 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
}
-/*
- * This is the actual mm switch as far as the scheduler
- * is concerned. No registers are touched. We avoid
- * calling the CPU specific function when the mm hasn't
- * actually changed.
- */
-static inline void
-switch_mm(struct mm_struct *prev, struct mm_struct *next,
- struct task_struct *tsk)
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+static inline void update_saved_ttbr0(struct task_struct *tsk,
+ struct mm_struct *mm)
{
- unsigned int cpu = smp_processor_id();
+ if (system_uses_ttbr0_pan()) {
+ u64 ttbr;
+ BUG_ON(mm->pgd == swapper_pg_dir);
+ ttbr = virt_to_phys(mm->pgd) | ASID(mm) << 48;
+ WRITE_ONCE(task_thread_info(tsk)->ttbr0, ttbr);
+ }
+}
+#else
+static inline void update_saved_ttbr0(struct task_struct *tsk,
+ struct mm_struct *mm)
+{
+}
+#endif
- if (prev == next)
- return;
+static inline void __switch_mm(struct mm_struct *next)
+{
+ unsigned int cpu = smp_processor_id();
/*
* init_mm.pgd does not contain any user mappings and it is always
@@ -197,9 +205,28 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
check_and_switch_context(next, cpu);
}
+static inline void
+switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ if (prev != next)
+ __switch_mm(next);
+
+ /*
+ * Update the saved TTBR0_EL1 of the scheduled-in task as the previous
+ * value may have not been initialised yet (activate_mm caller) or the
+ * ASID has changed since the last run (following the context switch
+ * of another thread of the same process). Avoid setting the reserved
+ * TTBR0_EL1 to swapper_pg_dir (init_mm; e.g. via idle_task_exit).
+ */
+ if (next != &init_mm)
+ update_saved_ttbr0(tsk, next);
+}
+
#define deactivate_mm(tsk,mm) do { } while (0)
-#define activate_mm(prev,next) switch_mm(prev, next, NULL)
+#define activate_mm(prev,next) switch_mm(prev, next, current)
void verify_cpu_asid_bits(void);
+void post_ttbr_update_workaround(void);
#endif
diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h
index 4724b8f0b625..1abad2315fa9 100644
--- a/arch/arm64/include/asm/percpu.h
+++ b/arch/arm64/include/asm/percpu.h
@@ -16,6 +16,7 @@
#ifndef __ASM_PERCPU_H
#define __ASM_PERCPU_H
+#include <asm/stack_pointer.h>
#include <asm/alternative.h>
static inline void set_my_cpu_offset(unsigned long off)
diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
index 38b6a2b49d68..8d5cbec17d80 100644
--- a/arch/arm64/include/asm/perf_event.h
+++ b/arch/arm64/include/asm/perf_event.h
@@ -17,6 +17,8 @@
#ifndef __ASM_PERF_EVENT_H
#define __ASM_PERF_EVENT_H
+#include <asm/stack_pointer.h>
+
#define ARMV8_PMU_MAX_COUNTERS 32
#define ARMV8_PMU_COUNTER_MASK (ARMV8_PMU_MAX_COUNTERS - 1)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 3a30a3994e4a..7bcd8b96a4b3 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -52,7 +52,7 @@ extern void __pgd_error(const char *file, int line, unsigned long val);
* for zero-mapped memory areas etc..
*/
extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
-#define ZERO_PAGE(vaddr) pfn_to_page(PHYS_PFN(__pa(empty_zero_page)))
+#define ZERO_PAGE(vaddr) phys_to_page(__pa_symbol(empty_zero_page))
#define pte_ERROR(pte) __pte_error(__FILE__, __LINE__, pte_val(pte))
diff --git a/arch/arm64/include/asm/signal32.h b/arch/arm64/include/asm/signal32.h
index eeaa97559bab..81abea0b7650 100644
--- a/arch/arm64/include/asm/signal32.h
+++ b/arch/arm64/include/asm/signal32.h
@@ -22,8 +22,6 @@
#define AARCH32_KERN_SIGRET_CODE_OFFSET 0x500
-extern const compat_ulong_t aarch32_sigret_code[6];
-
int compat_setup_frame(int usig, struct ksignal *ksig, sigset_t *set,
struct pt_regs *regs);
int compat_setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set,
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 022644704a93..d050d720a1b4 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -29,11 +29,22 @@
#ifndef __ASSEMBLY__
+#include <asm/percpu.h>
+
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/thread_info.h>
-#define raw_smp_processor_id() (current_thread_info()->cpu)
+DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
+
+/*
+ * We don't use this_cpu_read(cpu_number) as that has implicit writes to
+ * preempt_count, and associated (compiler) barriers, that we'd like to avoid
+ * the expense of. If we're preemptible, the value can be stale at use anyway.
+ * And we can't use this_cpu_ptr() either, as that winds up recursing back
+ * here under CONFIG_DEBUG_PREEMPT=y.
+ */
+#define raw_smp_processor_id() (*raw_cpu_ptr(&cpu_number))
struct seq_file;
@@ -73,6 +84,7 @@ asmlinkage void secondary_start_kernel(void);
*/
struct secondary_data {
void *stack;
+ struct task_struct *task;
long status;
};
diff --git a/arch/arm64/include/asm/stack_pointer.h b/arch/arm64/include/asm/stack_pointer.h
new file mode 100644
index 000000000000..ffcdf742cddf
--- /dev/null
+++ b/arch/arm64/include/asm/stack_pointer.h
@@ -0,0 +1,9 @@
+#ifndef __ASM_STACK_POINTER_H
+#define __ASM_STACK_POINTER_H
+
+/*
+ * how to get the current stack pointer from C
+ */
+register unsigned long current_stack_pointer asm ("sp");
+
+#endif /* __ASM_STACK_POINTER_H */
diff --git a/arch/arm64/include/asm/suspend.h b/arch/arm64/include/asm/suspend.h
index b8a313fd7a09..de5600f40adf 100644
--- a/arch/arm64/include/asm/suspend.h
+++ b/arch/arm64/include/asm/suspend.h
@@ -1,7 +1,7 @@
#ifndef __ASM_SUSPEND_H
#define __ASM_SUSPEND_H
-#define NR_CTX_REGS 10
+#define NR_CTX_REGS 12
#define NR_CALLEE_SAVED_REGS 12
/*
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 88bbe364b6ae..3ae4418a6ba4 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -109,6 +109,10 @@
#define SCTLR_EL1_CP15BEN (1 << 5)
/* id_aa64isar0 */
+#define ID_AA64ISAR0_DP_SHIFT 44
+#define ID_AA64ISAR0_SM4_SHIFT 40
+#define ID_AA64ISAR0_SM3_SHIFT 36
+#define ID_AA64ISAR0_SHA3_SHIFT 32
#define ID_AA64ISAR0_RDM_SHIFT 28
#define ID_AA64ISAR0_ATOMICS_SHIFT 20
#define ID_AA64ISAR0_CRC32_SHIFT 16
@@ -246,20 +250,39 @@
#include <linux/types.h>
-asm(
-" .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n"
-" .equ .L__reg_num_x\\num, \\num\n"
-" .endr\n"
+#define __DEFINE_MRS_MSR_S_REGNUM \
+" .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n" \
+" .equ .L__reg_num_x\\num, \\num\n" \
+" .endr\n" \
" .equ .L__reg_num_xzr, 31\n"
-"\n"
-" .macro mrs_s, rt, sreg\n"
-" .inst 0xd5200000|(\\sreg)|(.L__reg_num_\\rt)\n"
+
+#define DEFINE_MRS_S \
+ __DEFINE_MRS_MSR_S_REGNUM \
+" .macro mrs_s, rt, sreg\n" \
+" .inst 0xd5200000|(\\sreg)|(.L__reg_num_\\rt)\n" \
" .endm\n"
-"\n"
-" .macro msr_s, sreg, rt\n"
-" .inst 0xd5000000|(\\sreg)|(.L__reg_num_\\rt)\n"
+
+#define DEFINE_MSR_S \
+ __DEFINE_MRS_MSR_S_REGNUM \
+" .macro msr_s, sreg, rt\n" \
+" .inst 0xd5000000|(\\sreg)|(.L__reg_num_\\rt)\n" \
" .endm\n"
-);
+
+#define UNDEFINE_MRS_S \
+" .purgem mrs_s\n"
+
+#define UNDEFINE_MSR_S \
+" .purgem msr_s\n"
+
+#define __mrs_s(r, v) \
+ DEFINE_MRS_S \
+" mrs_s %0, " __stringify(r) "\n" \
+ UNDEFINE_MRS_S : "=r" (v)
+
+#define __msr_s(r, v) \
+ DEFINE_MSR_S \
+" msr_s " __stringify(r) ", %x0\n" \
+ UNDEFINE_MSR_S : : "rZ" (v)
/*
* Unlike read_cpuid, calls to read_sysreg are never expected to be
@@ -285,15 +308,15 @@ asm(
* For registers without architectural names, or simply unsupported by
* GAS.
*/
-#define read_sysreg_s(r) ({ \
- u64 __val; \
- asm volatile("mrs_s %0, " __stringify(r) : "=r" (__val)); \
- __val; \
+#define read_sysreg_s(r) ({ \
+ u64 __val; \
+ asm volatile(__mrs_s(r, __val)); \
+ __val; \
})
-#define write_sysreg_s(v, r) do { \
- u64 __val = (u64)v; \
- asm volatile("msr_s " __stringify(r) ", %x0" : : "rZ" (__val)); \
+#define write_sysreg_s(v, r) do { \
+ u64 __val = (u64)(v); \
+ asm volatile(__msr_s(r, __val)); \
} while (0)
static inline void config_sctlr_el1(u32 clear, u32 set)
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 0dd1bc13f942..bbd5dc7fab78 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -36,58 +36,31 @@
struct task_struct;
+#include <asm/stack_pointer.h>
#include <asm/types.h>
typedef unsigned long mm_segment_t;
/*
* low level task data that entry.S needs immediate access to.
- * __switch_to() assumes cpu_context follows immediately after cpu_domain.
*/
struct thread_info {
unsigned long flags; /* low level flags */
mm_segment_t addr_limit; /* address limit */
- struct task_struct *task; /* main task structure */
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ u64 ttbr0; /* saved TTBR0_EL1 */
+#endif
int preempt_count; /* 0 => preemptable, <0 => bug */
- int cpu; /* cpu */
};
#define INIT_THREAD_INFO(tsk) \
{ \
- .task = &tsk, \
- .flags = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
}
-#define init_thread_info (init_thread_union.thread_info)
#define init_stack (init_thread_union.stack)
-/*
- * how to get the current stack pointer from C
- */
-register unsigned long current_stack_pointer asm ("sp");
-
-/*
- * how to get the thread information struct from C
- */
-static inline struct thread_info *current_thread_info(void) __attribute_const__;
-
-/*
- * struct thread_info can be accessed directly via sp_el0.
- *
- * We don't use read_sysreg() as we want the compiler to cache the value where
- * possible.
- */
-static inline struct thread_info *current_thread_info(void)
-{
- unsigned long sp_el0;
-
- asm ("mrs %0, sp_el0" : "=r" (sp_el0));
-
- return (struct thread_info *)sp_el0;
-}
-
#define thread_saved_pc(tsk) \
((unsigned long)(tsk->thread.cpu_context.pc))
#define thread_saved_sp(tsk) \
@@ -112,6 +85,7 @@ static inline struct thread_info *current_thread_info(void)
#define TIF_NEED_RESCHED 1
#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
#define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */
+#define TIF_FSCHECK 4 /* Check FS is USER_DS on return */
#define TIF_NOHZ 7
#define TIF_SYSCALL_TRACE 8
#define TIF_SYSCALL_AUDIT 9
@@ -133,10 +107,12 @@ static inline struct thread_info *current_thread_info(void)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
+#define _TIF_FSCHECK (1 << TIF_FSCHECK)
#define _TIF_32BIT (1 << TIF_32BIT)
#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
- _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
+ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
+ _TIF_FSCHECK)
#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
_TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 8b57339823e9..a4f86ded68c3 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -31,6 +31,17 @@ int pcibus_to_node(struct pci_bus *bus);
cpumask_of_node(pcibus_to_node(bus)))
#endif /* CONFIG_NUMA */
+struct sched_domain;
+#ifdef CONFIG_CPU_FREQ
+#define arch_scale_freq_capacity cpufreq_scale_freq_capacity
+extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu);
+#define arch_scale_max_freq_capacity cpufreq_scale_max_freq_capacity
+extern unsigned long cpufreq_scale_max_freq_capacity(struct sched_domain *sd, int cpu);
+#define arch_scale_min_freq_capacity cpufreq_scale_min_freq_capacity
+extern unsigned long cpufreq_scale_min_freq_capacity(struct sched_domain *sd, int cpu);
+#endif
+#define arch_scale_cpu_capacity scale_cpu_capacity
+extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu);
#include <asm-generic/topology.h>
diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h
index 02e9035b0685..47a9066f7c86 100644
--- a/arch/arm64/include/asm/traps.h
+++ b/arch/arm64/include/asm/traps.h
@@ -37,18 +37,11 @@ void unregister_undef_hook(struct undef_hook *hook);
void arm64_notify_segfault(struct pt_regs *regs, unsigned long addr);
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static inline int __in_irqentry_text(unsigned long ptr)
{
return ptr >= (unsigned long)&__irqentry_text_start &&
ptr < (unsigned long)&__irqentry_text_end;
}
-#else
-static inline int __in_irqentry_text(unsigned long ptr)
-{
- return 0;
-}
-#endif
static inline int in_exception_text(unsigned long ptr)
{
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index f5cd96c60eb9..216327905d04 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -18,6 +18,13 @@
#ifndef __ASM_UACCESS_H
#define __ASM_UACCESS_H
+#include <asm/alternative.h>
+#include <asm/kernel-pgtable.h>
+#include <asm/mmu.h>
+#include <asm/sysreg.h>
+
+#ifndef __ASSEMBLY__
+
/*
* User space memory access functions
*/
@@ -26,11 +33,9 @@
#include <linux/string.h>
#include <linux/thread_info.h>
-#include <asm/alternative.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
#include <asm/ptrace.h>
-#include <asm/sysreg.h>
#include <asm/errno.h>
#include <asm/memory.h>
#include <asm/compiler.h>
@@ -67,6 +72,9 @@ static inline void set_fs(mm_segment_t fs)
{
current_thread_info()->addr_limit = fs;
+ /* On user-mode return, check fs is correct */
+ set_thread_flag(TIF_FSCHECK);
+
/*
* Prevent a mispredicted conditional call to set_fs from forwarding
* the wrong address limit to access_ok under speculation.
@@ -136,6 +144,115 @@ static inline unsigned long __range_ok(unsigned long addr, unsigned long size)
" .popsection\n"
/*
+ * User access enabling/disabling.
+ */
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+static inline void __uaccess_ttbr0_disable(void)
+{
+ unsigned long flags, ttbr;
+
+ local_irq_save(flags);
+ ttbr = read_sysreg(ttbr1_el1);
+ ttbr &= ~TTBR_ASID_MASK;
+ /* reserved_ttbr0 placed at the end of swapper_pg_dir */
+ write_sysreg(ttbr + SWAPPER_DIR_SIZE, ttbr0_el1);
+ isb();
+ /* Set reserved ASID */
+ write_sysreg(ttbr, ttbr1_el1);
+ isb();
+ local_irq_restore(flags);
+}
+
+static inline void __uaccess_ttbr0_enable(void)
+{
+ unsigned long flags, ttbr0, ttbr1;
+
+ /*
+ * Disable interrupts to avoid preemption between reading the 'ttbr0'
+ * variable and the MSR. A context switch could trigger an ASID
+ * roll-over and an update of 'ttbr0'.
+ */
+ local_irq_save(flags);
+ ttbr0 = READ_ONCE(current_thread_info()->ttbr0);
+
+ /* Restore active ASID */
+ ttbr1 = read_sysreg(ttbr1_el1);
+ ttbr1 &= ~TTBR_ASID_MASK; /* safety measure */
+ ttbr1 |= ttbr0 & TTBR_ASID_MASK;
+ write_sysreg(ttbr1, ttbr1_el1);
+ isb();
+
+ /* Restore user page table */
+ write_sysreg(ttbr0, ttbr0_el1);
+ isb();
+ local_irq_restore(flags);
+}
+
+static inline bool uaccess_ttbr0_disable(void)
+{
+ if (!system_uses_ttbr0_pan())
+ return false;
+ __uaccess_ttbr0_disable();
+ return true;
+}
+
+static inline bool uaccess_ttbr0_enable(void)
+{
+ if (!system_uses_ttbr0_pan())
+ return false;
+ __uaccess_ttbr0_enable();
+ return true;
+}
+#else
+static inline bool uaccess_ttbr0_disable(void)
+{
+ return false;
+}
+
+static inline bool uaccess_ttbr0_enable(void)
+{
+ return false;
+}
+#endif
+
+#define __uaccess_disable(alt) \
+do { \
+ if (!uaccess_ttbr0_disable()) \
+ asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \
+ CONFIG_ARM64_PAN)); \
+} while (0)
+
+#define __uaccess_enable(alt) \
+do { \
+ if (!uaccess_ttbr0_enable()) \
+ asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \
+ CONFIG_ARM64_PAN)); \
+} while (0)
+
+static inline void uaccess_disable(void)
+{
+ __uaccess_disable(ARM64_HAS_PAN);
+}
+
+static inline void uaccess_enable(void)
+{
+ __uaccess_enable(ARM64_HAS_PAN);
+}
+
+/*
+ * These functions are no-ops when UAO is present.
+ */
+static inline void uaccess_disable_not_uao(void)
+{
+ __uaccess_disable(ARM64_ALT_PAN_NOT_UAO);
+}
+
+static inline void uaccess_enable_not_uao(void)
+{
+ __uaccess_enable(ARM64_ALT_PAN_NOT_UAO);
+}
+
+/*
* Sanitise a uaccess pointer such that it becomes NULL if above the
* current addr_limit.
*/
@@ -182,8 +299,7 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr)
do { \
unsigned long __gu_val; \
__chk_user_ptr(ptr); \
- asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\
- CONFIG_ARM64_PAN)); \
+ uaccess_enable_not_uao(); \
switch (sizeof(*(ptr))) { \
case 1: \
__get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \
@@ -204,9 +320,8 @@ do { \
default: \
BUILD_BUG(); \
} \
+ uaccess_disable_not_uao(); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
- asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\
- CONFIG_ARM64_PAN)); \
} while (0)
#define __get_user_check(x, ptr, err) \
@@ -256,8 +371,7 @@ do { \
do { \
__typeof__(*(ptr)) __pu_val = (x); \
__chk_user_ptr(ptr); \
- asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\
- CONFIG_ARM64_PAN)); \
+ uaccess_enable_not_uao(); \
switch (sizeof(*(ptr))) { \
case 1: \
__put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \
@@ -278,8 +392,7 @@ do { \
default: \
BUILD_BUG(); \
} \
- asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\
- CONFIG_ARM64_PAN)); \
+ uaccess_disable_not_uao(); \
} while (0)
#define __put_user_check(x, ptr, err) \
@@ -379,4 +492,77 @@ extern long strncpy_from_user(char *dest, const char __user *src, long count);
extern __must_check long strlen_user(const char __user *str);
extern __must_check long strnlen_user(const char __user *str, long n);
+#else /* __ASSEMBLY__ */
+
+#include <asm/assembler.h>
+
+/*
+ * User access enabling/disabling macros.
+ */
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ .macro __uaccess_ttbr0_disable, tmp1
+ mrs \tmp1, ttbr1_el1 // swapper_pg_dir
+ bic \tmp1, \tmp1, #TTBR_ASID_MASK
+ add \tmp1, \tmp1, #SWAPPER_DIR_SIZE // reserved_ttbr0 at the end of swapper_pg_dir
+ msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1
+ isb
+ sub \tmp1, \tmp1, #SWAPPER_DIR_SIZE
+ msr ttbr1_el1, \tmp1 // set reserved ASID
+ isb
+ .endm
+
+ .macro __uaccess_ttbr0_enable, tmp1, tmp2
+ get_thread_info \tmp1
+ ldr \tmp1, [\tmp1, #TSK_TI_TTBR0] // load saved TTBR0_EL1
+ mrs \tmp2, ttbr1_el1
+ extr \tmp2, \tmp2, \tmp1, #48
+ ror \tmp2, \tmp2, #16
+ msr ttbr1_el1, \tmp2 // set the active ASID
+ isb
+ msr ttbr0_el1, \tmp1 // set the non-PAN TTBR0_EL1
+ isb
+ .endm
+
+ .macro uaccess_ttbr0_disable, tmp1, tmp2
+alternative_if_not ARM64_HAS_PAN
+ save_and_disable_irq \tmp2 // avoid preemption
+ __uaccess_ttbr0_disable \tmp1
+ restore_irq \tmp2
+alternative_else_nop_endif
+ .endm
+
+ .macro uaccess_ttbr0_enable, tmp1, tmp2, tmp3
+alternative_if_not ARM64_HAS_PAN
+ save_and_disable_irq \tmp3 // avoid preemption
+ __uaccess_ttbr0_enable \tmp1, \tmp2
+ restore_irq \tmp3
+alternative_else_nop_endif
+ .endm
+#else
+ .macro uaccess_ttbr0_disable, tmp1, tmp2
+ .endm
+
+ .macro uaccess_ttbr0_enable, tmp1, tmp2, tmp3
+ .endm
+#endif
+
+/*
+ * These macros are no-ops when UAO is present.
+ */
+ .macro uaccess_disable_not_uao, tmp1, tmp2
+ uaccess_ttbr0_disable \tmp1, \tmp2
+alternative_if ARM64_ALT_PAN_NOT_UAO
+ SET_PSTATE_PAN(1)
+alternative_else_nop_endif
+ .endm
+
+ .macro uaccess_enable_not_uao, tmp1, tmp2, tmp3
+ uaccess_ttbr0_enable \tmp1, \tmp2, \tmp3
+alternative_if ARM64_ALT_PAN_NOT_UAO
+ SET_PSTATE_PAN(0)
+alternative_else_nop_endif
+ .endm
+
+#endif /* __ASSEMBLY__ */
+
#endif /* __ASM_UACCESS_H */
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index a739287ef6a3..1cc77c255b75 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -30,5 +30,10 @@
#define HWCAP_ATOMICS (1 << 8)
#define HWCAP_FPHP (1 << 9)
#define HWCAP_ASIMDHP (1 << 10)
+#define HWCAP_SHA3 (1 << 17)
+#define HWCAP_SM3 (1 << 18)
+#define HWCAP_SM4 (1 << 19)
+#define HWCAP_ASIMDDP (1 << 20)
+#define HWCAP_SHA512 (1 << 21)
#endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/kernel/acpi_parking_protocol.c b/arch/arm64/kernel/acpi_parking_protocol.c
index a32b4011d711..1f5655cd9cc9 100644
--- a/arch/arm64/kernel/acpi_parking_protocol.c
+++ b/arch/arm64/kernel/acpi_parking_protocol.c
@@ -17,6 +17,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/acpi.h>
+#include <linux/mm.h>
#include <linux/types.h>
#include <asm/cpu_ops.h>
@@ -109,7 +110,7 @@ static int acpi_parking_protocol_cpu_boot(unsigned int cpu)
* that read this address need to convert this address to the
* Boot-Loader's endianness before jumping.
*/
- writeq_relaxed(__pa(secondary_entry), &mailbox->entry_point);
+ writeq_relaxed(__pa_symbol(secondary_entry), &mailbox->entry_point);
writel_relaxed(cpu_entry->gic_cpu_id, &mailbox->cpu_id);
arch_send_wakeup_ipi_mask(cpumask_of(cpu));
diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c
index c0ede237c14b..29d2ad8844a5 100644
--- a/arch/arm64/kernel/armv8_deprecated.c
+++ b/arch/arm64/kernel/armv8_deprecated.c
@@ -14,7 +14,6 @@
#include <linux/slab.h>
#include <linux/sysctl.h>
-#include <asm/alternative.h>
#include <asm/cpufeature.h>
#include <asm/insn.h>
#include <asm/opcodes.h>
@@ -285,10 +284,10 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table)
#define __SWP_LL_SC_LOOPS 4
#define __user_swpX_asm(data, addr, res, temp, temp2, B) \
+do { \
+ uaccess_enable(); \
__asm__ __volatile__( \
" mov %w3, %w7\n" \
- ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN) \
"0: ldxr"B" %w2, [%4]\n" \
"1: stxr"B" %w0, %w1, [%4]\n" \
" cbz %w0, 2f\n" \
@@ -306,13 +305,13 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table)
" .popsection" \
_ASM_EXTABLE(0b, 4b) \
_ASM_EXTABLE(1b, 4b) \
- ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN) \
: "=&r" (res), "+r" (data), "=&r" (temp), "=&r" (temp2) \
: "r" ((unsigned long)addr), "i" (-EAGAIN), \
"i" (-EFAULT), \
"i" (__SWP_LL_SC_LOOPS) \
- : "memory")
+ : "memory"); \
+ uaccess_disable(); \
+} while (0)
#define __user_swp_asm(data, addr, res, temp, temp2) \
__user_swpX_asm(data, addr, res, temp, temp2, "")
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index bd239b1b7a68..61ae19ef3c1a 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -37,11 +37,13 @@ int main(void)
{
DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm));
BLANK();
- DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
- DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
- DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit));
- DEFINE(TI_TASK, offsetof(struct thread_info, task));
- DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
+ DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags));
+ DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count));
+ DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit));
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0));
+#endif
+ DEFINE(TSK_STACK, offsetof(struct task_struct, stack));
BLANK();
DEFINE(THREAD_CPU_CONTEXT, offsetof(struct task_struct, thread.cpu_context));
BLANK();
@@ -124,6 +126,7 @@ int main(void)
DEFINE(TZ_DSTTIME, offsetof(struct timezone, tz_dsttime));
BLANK();
DEFINE(CPU_BOOT_STACK, offsetof(struct secondary_data, stack));
+ DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task));
BLANK();
#ifdef CONFIG_KVM_ARM_HOST
DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt));
diff --git a/arch/arm64/kernel/cpu-reset.h b/arch/arm64/kernel/cpu-reset.h
index d4e9ecb264f0..6c2b1b4f57c9 100644
--- a/arch/arm64/kernel/cpu-reset.h
+++ b/arch/arm64/kernel/cpu-reset.h
@@ -24,7 +24,7 @@ static inline void __noreturn cpu_soft_restart(unsigned long el2_switch,
el2_switch = el2_switch && !is_kernel_in_hyp_mode() &&
is_hyp_mode_available();
- restart = (void *)virt_to_phys(__cpu_soft_restart);
+ restart = (void *)__pa_symbol(__cpu_soft_restart);
cpu_install_idmap();
restart(el2_switch, entry, arg0, arg1, arg2);
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index a3ab7dfad50a..ac5d22b8ef8c 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -23,6 +23,7 @@
#include <linux/sort.h>
#include <linux/stop_machine.h>
#include <linux/types.h>
+#include <linux/mm.h>
#include <asm/cpu.h>
#include <asm/cpufeature.h>
#include <asm/cpu_ops.h>
@@ -81,7 +82,10 @@ cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused)
static const struct arm64_ftr_bits ftr_id_aa64isar0[] = {
- ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 32, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64ISAR0_DP_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64ISAR0_SM4_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64ISAR0_SM3_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64ISAR0_SHA3_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64ISAR0_RDM_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 24, 4, 0),
ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_ATOMICS_SHIFT, 4, 0),
@@ -739,7 +743,7 @@ static bool runs_at_el2(const struct arm64_cpu_capabilities *entry, int __unused
static bool hyp_offset_low(const struct arm64_cpu_capabilities *entry,
int __unused)
{
- phys_addr_t idmap_addr = virt_to_phys(__hyp_idmap_text_start);
+ phys_addr_t idmap_addr = __pa_symbol(__hyp_idmap_text_start);
/*
* Activate the lower HYP offset only if:
@@ -791,7 +795,7 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry,
ID_AA64PFR0_CSV3_SHIFT);
}
-static int kpti_install_ng_mappings(void *__unused)
+static int __nocfi kpti_install_ng_mappings(void *__unused)
{
typedef void (kpti_remap_fn)(int, int, phys_addr_t);
extern kpti_remap_fn idmap_kpti_install_ng_mappings;
@@ -959,8 +963,13 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_AES),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA1_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA1),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA2),
+ HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_SHA512),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_CRC32_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_CRC32),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_ATOMICS_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_ATOMICS),
+ HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA3),
+ HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SM3),
+ HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM4_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SM4),
+ HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_DP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDDP),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_FP),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_FPHP),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_ASIMD),
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index b3d5b3e8fbcb..024185799cbd 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -63,6 +63,17 @@ static const char *const hwcap_str[] = {
"atomics",
"fphp",
"asimdhp",
+ "cpuid",
+ "asimdrdm",
+ "jscvt",
+ "fcma",
+ "lrcpc",
+ "dcpop",
+ "sha3",
+ "sm3",
+ "sm4",
+ "asimddp",
+ "sha512",
NULL
};
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index ca978d7d98eb..ecebb48c7cf4 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -32,7 +32,9 @@
#include <asm/memory.h>
#include <asm/mmu.h>
#include <asm/processor.h>
+#include <asm/ptrace.h>
#include <asm/thread_info.h>
+#include <asm/uaccess.h>
#include <asm/asm-uaccess.h>
#include <asm/unistd.h>
#include <asm/kernel-pgtable.h>
@@ -105,7 +107,7 @@ alternative_cb arm64_enable_wa2_handling
alternative_cb_end
ldr_this_cpu \tmp2, arm64_ssbd_callback_required, \tmp1
cbz \tmp2, \targ
- ldr \tmp2, [tsk, #TI_FLAGS]
+ ldr \tmp2, [tsk, #TSK_TI_FLAGS]
tbnz \tmp2, #TIF_SSBD, \targ
mov w0, #ARM_SMCCC_ARCH_WORKAROUND_2
mov w1, #\state
@@ -137,9 +139,8 @@ alternative_cb_end
.if \el == 0
mrs x21, sp_el0
- mov tsk, sp
- and tsk, tsk, #~(THREAD_SIZE - 1) // Ensure MDSCR_EL1.SS is clear,
- ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug
+ ldr_this_cpu tsk, __entry_task, x20 // Ensure MDSCR_EL1.SS is clear,
+ ldr x19, [tsk, #TSK_TI_FLAGS] // since we can unmask debug
disable_step_tsk x19, x20 // exceptions when scheduling.
apply_ssbd 1, 1f, x22, x23
@@ -155,15 +156,41 @@ alternative_cb_end
add x21, sp, #S_FRAME_SIZE
get_thread_info tsk
/* Save the task's original addr_limit and set USER_DS */
- ldr x20, [tsk, #TI_ADDR_LIMIT]
+ ldr x20, [tsk, #TSK_TI_ADDR_LIMIT]
str x20, [sp, #S_ORIG_ADDR_LIMIT]
mov x20, #USER_DS
- str x20, [tsk, #TI_ADDR_LIMIT]
+ str x20, [tsk, #TSK_TI_ADDR_LIMIT]
/* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */
.endif /* \el == 0 */
mrs x22, elr_el1
mrs x23, spsr_el1
stp lr, x21, [sp, #S_LR]
+
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ /*
+ * Set the TTBR0 PAN bit in SPSR. When the exception is taken from
+ * EL0, there is no need to check the state of TTBR0_EL1 since
+ * accesses are always enabled.
+ * Note that the meaning of this bit differs from the ARMv8.1 PAN
+ * feature as all TTBR0_EL1 accesses are disabled, not just those to
+ * user mappings.
+ */
+alternative_if ARM64_HAS_PAN
+ b 1f // skip TTBR0 PAN
+alternative_else_nop_endif
+
+ .if \el != 0
+ mrs x21, ttbr0_el1
+ tst x21, #TTBR_ASID_MASK // Check for the reserved ASID
+ orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR
+ b.eq 1f // TTBR0 access already disabled
+ and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR
+ .endif
+
+ __uaccess_ttbr0_disable x21
+1:
+#endif
+
stp x22, x23, [sp, #S_PC]
/*
@@ -194,7 +221,7 @@ alternative_cb_end
.if \el != 0
/* Restore the task's original addr_limit. */
ldr x20, [sp, #S_ORIG_ADDR_LIMIT]
- str x20, [tsk, #TI_ADDR_LIMIT]
+ str x20, [tsk, #TSK_TI_ADDR_LIMIT]
/* No need to restore UAO, it will be restored from SPSR_EL1 */
.endif
@@ -202,6 +229,40 @@ alternative_cb_end
ldp x21, x22, [sp, #S_PC] // load ELR, SPSR
.if \el == 0
ct_user_enter
+ .endif
+
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ /*
+ * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR
+ * PAN bit checking.
+ */
+alternative_if ARM64_HAS_PAN
+ b 2f // skip TTBR0 PAN
+alternative_else_nop_endif
+
+ .if \el != 0
+ tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set
+ .endif
+
+ __uaccess_ttbr0_enable x0, x1
+
+ .if \el == 0
+ /*
+ * Enable errata workarounds only if returning to user. The only
+ * workaround currently required for TTBR0_EL1 changes are for the
+ * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache
+ * corruption).
+ */
+ bl post_ttbr_update_workaround
+ .endif
+1:
+ .if \el != 0
+ and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit
+ .endif
+2:
+#endif
+
+ .if \el == 0
ldr x23, [sp, #S_SP] // load return stack pointer
msr sp_el0, x23
tst x22, #PSR_MODE32_BIT // native task?
@@ -221,6 +282,7 @@ alternative_else_nop_endif
apply_ssbd 0, 5f, x0, x1
5:
.endif
+
msr elr_el1, x21 // set up the return data
msr spsr_el1, x22
ldp x0, x1, [sp, #16 * 0]
@@ -257,21 +319,18 @@ alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0
.endif
.endm
- .macro get_thread_info, rd
- mrs \rd, sp_el0
- .endm
-
.macro irq_stack_entry
mov x19, sp // preserve the original sp
/*
- * Compare sp with the current thread_info, if the top
- * ~(THREAD_SIZE - 1) bits match, we are on a task stack, and
- * should switch to the irq stack.
+ * Compare sp with the base of the task stack.
+ * If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack,
+ * and should switch to the irq stack.
*/
- and x25, x19, #~(THREAD_SIZE - 1)
- cmp x25, tsk
- b.ne 9998f
+ ldr x25, [tsk, TSK_STACK]
+ eor x25, x25, x19
+ and x25, x25, #~(THREAD_SIZE - 1)
+ cbnz x25, 9998f
adr_this_cpu x25, irq_stack, x26
mov x26, #IRQ_STACK_START_SP
@@ -501,9 +560,9 @@ el1_irq:
irq_handler
#ifdef CONFIG_PREEMPT
- ldr w24, [tsk, #TI_PREEMPT] // get preempt count
+ ldr w24, [tsk, #TSK_TI_PREEMPT] // get preempt count
cbnz w24, 1f // preempt count != 0
- ldr x0, [tsk, #TI_FLAGS] // get flags
+ ldr x0, [tsk, #TSK_TI_FLAGS] // get flags
tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
bl el1_preempt
1:
@@ -518,7 +577,7 @@ ENDPROC(el1_irq)
el1_preempt:
mov x24, lr
1: bl preempt_schedule_irq // irq en/disable is done inside
- ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS
+ ldr x0, [tsk, #TSK_TI_FLAGS] // get new tasks TI_FLAGS
tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling?
ret x24
#endif
@@ -757,8 +816,7 @@ ENTRY(cpu_switch_to)
ldp x29, x9, [x8], #16
ldr lr, [x8]
mov sp, x9
- and x9, x9, #~(THREAD_SIZE - 1)
- msr sp_el0, x9
+ msr sp_el0, x1
ret
ENDPROC(cpu_switch_to)
@@ -769,7 +827,7 @@ ENDPROC(cpu_switch_to)
ret_fast_syscall:
disable_irq // disable interrupts
str x0, [sp, #S_X0] // returned x0
- ldr x1, [tsk, #TI_FLAGS] // re-check for syscall tracing
+ ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing
and x2, x1, #_TIF_SYSCALL_WORK
cbnz x2, ret_fast_syscall_trace
and x2, x1, #_TIF_WORK_MASK
@@ -789,14 +847,14 @@ work_pending:
#ifdef CONFIG_TRACE_IRQFLAGS
bl trace_hardirqs_on // enabled while in userspace
#endif
- ldr x1, [tsk, #TI_FLAGS] // re-check for single-step
+ ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step
b finish_ret_to_user
/*
* "slow" syscall return path.
*/
ret_to_user:
disable_irq // disable interrupts
- ldr x1, [tsk, #TI_FLAGS]
+ ldr x1, [tsk, #TSK_TI_FLAGS]
and x2, x1, #_TIF_WORK_MASK
cbnz x2, work_pending
finish_ret_to_user:
@@ -829,7 +887,7 @@ el0_svc_naked: // compat entry point
enable_dbg_and_irq
ct_user_exit 1
- ldr x16, [tsk, #TI_FLAGS] // check for syscall hooks
+ ldr x16, [tsk, #TSK_TI_FLAGS] // check for syscall hooks
tst x16, #_TIF_SYSCALL_WORK
b.ne __sys_trace
cmp scno, sc_nr // check upper syscall limit
@@ -891,14 +949,24 @@ __ni_sys_trace:
.macro tramp_map_kernel, tmp
mrs \tmp, ttbr1_el1
- sub \tmp, \tmp, #SWAPPER_DIR_SIZE
+ sub \tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE)
bic \tmp, \tmp, #USER_ASID_FLAG
msr ttbr1_el1, \tmp
+#ifdef CONFIG_ARCH_MSM8996
+ /* ASID already in \tmp[63:48] */
+ movk \tmp, #:abs_g2_nc:(TRAMP_VALIAS >> 12)
+ movk \tmp, #:abs_g1_nc:(TRAMP_VALIAS >> 12)
+ /* 2MB boundary containing the vectors, so we nobble the walk cache */
+ movk \tmp, #:abs_g0_nc:((TRAMP_VALIAS & ~(SZ_2M - 1)) >> 12)
+ isb
+ tlbi vae1, \tmp
+ dsb nsh
+#endif /* CONFIG_ARCH_MSM8996 */
.endm
.macro tramp_unmap_kernel, tmp
mrs \tmp, ttbr1_el1
- add \tmp, \tmp, #SWAPPER_DIR_SIZE
+ add \tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE)
orr \tmp, \tmp, #USER_ASID_FLAG
msr ttbr1_el1, \tmp
/*
@@ -925,7 +993,9 @@ __ni_sys_trace:
tramp_map_kernel x30
#ifdef CONFIG_RANDOMIZE_BASE
adr x30, tramp_vectors + PAGE_SIZE
+#ifndef CONFIG_ARCH_MSM8996
isb
+#endif
ldr x30, [x30]
#else
ldr x30, =vectors
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index fa52817d84c5..d479185d29c9 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -326,14 +326,14 @@ __create_page_tables:
* dirty cache lines being evicted.
*/
adrp x0, idmap_pg_dir
- adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE
+ adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE
bl __inval_cache_range
/*
* Clear the idmap and swapper page tables.
*/
adrp x0, idmap_pg_dir
- adrp x6, swapper_pg_dir + SWAPPER_DIR_SIZE
+ adrp x6, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE
1: stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
@@ -412,7 +412,7 @@ __create_page_tables:
* tables again to remove any speculatively loaded cache lines.
*/
adrp x0, idmap_pg_dir
- adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE
+ adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE
dmb sy
bl __inval_cache_range
@@ -428,7 +428,8 @@ ENDPROC(__create_page_tables)
__primary_switched:
adrp x4, init_thread_union
add sp, x4, #THREAD_SIZE
- msr sp_el0, x4 // Save thread_info
+ adr_l x5, init_task
+ msr sp_el0, x5 // Save thread_info
adr_l x8, vectors // load VBAR_EL1 with virtual
msr vbar_el1, x8 // vector table address
@@ -700,10 +701,10 @@ __secondary_switched:
isb
adr_l x0, secondary_data
- ldr x0, [x0, #CPU_BOOT_STACK] // get secondary_data.stack
- mov sp, x0
- and x0, x0, #~(THREAD_SIZE - 1)
- msr sp_el0, x0 // save thread_info
+ ldr x1, [x0, #CPU_BOOT_STACK] // get secondary_data.stack
+ mov sp, x1
+ ldr x2, [x0, #CPU_BOOT_TASK]
+ msr sp_el0, x2
mov x29, #0
b secondary_start_kernel
ENDPROC(__secondary_switched)
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index f6e71c73cceb..db603cdd44a4 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -50,9 +50,6 @@
*/
extern int in_suspend;
-/* Find a symbols alias in the linear map */
-#define LMADDR(x) phys_to_virt(virt_to_phys(x))
-
/* Do we need to reset el2? */
#define el2_reset_needed() (is_hyp_mode_available() && !is_kernel_in_hyp_mode())
@@ -102,8 +99,8 @@ static inline void arch_hdr_invariants(struct arch_hibernate_hdr_invariants *i)
int pfn_is_nosave(unsigned long pfn)
{
- unsigned long nosave_begin_pfn = virt_to_pfn(&__nosave_begin);
- unsigned long nosave_end_pfn = virt_to_pfn(&__nosave_end - 1);
+ unsigned long nosave_begin_pfn = sym_to_pfn(&__nosave_begin);
+ unsigned long nosave_end_pfn = sym_to_pfn(&__nosave_end - 1);
return (pfn >= nosave_begin_pfn) && (pfn <= nosave_end_pfn);
}
@@ -125,12 +122,12 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size)
return -EOVERFLOW;
arch_hdr_invariants(&hdr->invariants);
- hdr->ttbr1_el1 = virt_to_phys(swapper_pg_dir);
+ hdr->ttbr1_el1 = __pa_symbol(swapper_pg_dir);
hdr->reenter_kernel = _cpu_resume;
/* We can't use __hyp_get_vectors() because kvm may still be loaded */
if (el2_reset_needed())
- hdr->__hyp_stub_vectors = virt_to_phys(__hyp_stub_vectors);
+ hdr->__hyp_stub_vectors = __pa_symbol(__hyp_stub_vectors);
else
hdr->__hyp_stub_vectors = 0;
@@ -471,7 +468,6 @@ int swsusp_arch_resume(void)
void *zero_page;
size_t exit_size;
pgd_t *tmp_pg_dir;
- void *lm_restore_pblist;
phys_addr_t phys_hibernate_exit;
void __noreturn (*hibernate_exit)(phys_addr_t, phys_addr_t, void *,
void *, phys_addr_t, phys_addr_t);
@@ -492,12 +488,6 @@ int swsusp_arch_resume(void)
goto out;
/*
- * Since we only copied the linear map, we need to find restore_pblist's
- * linear map address.
- */
- lm_restore_pblist = LMADDR(restore_pblist);
-
- /*
* We need a zero page that is zero before & after resume in order to
* to break before make on the ttbr1 page tables.
*/
@@ -548,7 +538,7 @@ int swsusp_arch_resume(void)
}
hibernate_exit(virt_to_phys(tmp_pg_dir), resume_hdr.ttbr1_el1,
- resume_hdr.reenter_kernel, lm_restore_pblist,
+ resume_hdr.reenter_kernel, restore_pblist,
resume_hdr.__hyp_stub_vectors, virt_to_phys(zero_page));
out:
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index 0b9e5f6290f9..fb0082ab40a7 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -318,9 +318,21 @@ static int get_hbp_len(u8 hbp_len)
case ARM_BREAKPOINT_LEN_2:
len_in_bytes = 2;
break;
+ case ARM_BREAKPOINT_LEN_3:
+ len_in_bytes = 3;
+ break;
case ARM_BREAKPOINT_LEN_4:
len_in_bytes = 4;
break;
+ case ARM_BREAKPOINT_LEN_5:
+ len_in_bytes = 5;
+ break;
+ case ARM_BREAKPOINT_LEN_6:
+ len_in_bytes = 6;
+ break;
+ case ARM_BREAKPOINT_LEN_7:
+ len_in_bytes = 7;
+ break;
case ARM_BREAKPOINT_LEN_8:
len_in_bytes = 8;
break;
@@ -350,7 +362,7 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp)
* to generic breakpoint descriptions.
*/
int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl,
- int *gen_len, int *gen_type)
+ int *gen_len, int *gen_type, int *offset)
{
/* Type */
switch (ctrl.type) {
@@ -370,17 +382,33 @@ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl,
return -EINVAL;
}
+ if (!ctrl.len)
+ return -EINVAL;
+ *offset = __ffs(ctrl.len);
+
/* Len */
- switch (ctrl.len) {
+ switch (ctrl.len >> *offset) {
case ARM_BREAKPOINT_LEN_1:
*gen_len = HW_BREAKPOINT_LEN_1;
break;
case ARM_BREAKPOINT_LEN_2:
*gen_len = HW_BREAKPOINT_LEN_2;
break;
+ case ARM_BREAKPOINT_LEN_3:
+ *gen_len = HW_BREAKPOINT_LEN_3;
+ break;
case ARM_BREAKPOINT_LEN_4:
*gen_len = HW_BREAKPOINT_LEN_4;
break;
+ case ARM_BREAKPOINT_LEN_5:
+ *gen_len = HW_BREAKPOINT_LEN_5;
+ break;
+ case ARM_BREAKPOINT_LEN_6:
+ *gen_len = HW_BREAKPOINT_LEN_6;
+ break;
+ case ARM_BREAKPOINT_LEN_7:
+ *gen_len = HW_BREAKPOINT_LEN_7;
+ break;
case ARM_BREAKPOINT_LEN_8:
*gen_len = HW_BREAKPOINT_LEN_8;
break;
@@ -424,9 +452,21 @@ static int arch_build_bp_info(struct perf_event *bp)
case HW_BREAKPOINT_LEN_2:
info->ctrl.len = ARM_BREAKPOINT_LEN_2;
break;
+ case HW_BREAKPOINT_LEN_3:
+ info->ctrl.len = ARM_BREAKPOINT_LEN_3;
+ break;
case HW_BREAKPOINT_LEN_4:
info->ctrl.len = ARM_BREAKPOINT_LEN_4;
break;
+ case HW_BREAKPOINT_LEN_5:
+ info->ctrl.len = ARM_BREAKPOINT_LEN_5;
+ break;
+ case HW_BREAKPOINT_LEN_6:
+ info->ctrl.len = ARM_BREAKPOINT_LEN_6;
+ break;
+ case HW_BREAKPOINT_LEN_7:
+ info->ctrl.len = ARM_BREAKPOINT_LEN_7;
+ break;
case HW_BREAKPOINT_LEN_8:
info->ctrl.len = ARM_BREAKPOINT_LEN_8;
break;
@@ -518,18 +558,17 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
default:
return -EINVAL;
}
-
- info->address &= ~alignment_mask;
- info->ctrl.len <<= offset;
} else {
if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE)
alignment_mask = 0x3;
else
alignment_mask = 0x7;
- if (info->address & alignment_mask)
- return -EINVAL;
+ offset = info->address & alignment_mask;
}
+ info->address &= ~alignment_mask;
+ info->ctrl.len <<= offset;
+
/*
* Disallow per-task kernel breakpoints since these would
* complicate the stepping code.
@@ -662,12 +701,47 @@ unlock:
}
NOKPROBE_SYMBOL(breakpoint_handler);
+/*
+ * Arm64 hardware does not always report a watchpoint hit address that matches
+ * one of the watchpoints set. It can also report an address "near" the
+ * watchpoint if a single instruction access both watched and unwatched
+ * addresses. There is no straight-forward way, short of disassembling the
+ * offending instruction, to map that address back to the watchpoint. This
+ * function computes the distance of the memory access from the watchpoint as a
+ * heuristic for the likelyhood that a given access triggered the watchpoint.
+ *
+ * See Section D2.10.5 "Determining the memory location that caused a Watchpoint
+ * exception" of ARMv8 Architecture Reference Manual for details.
+ *
+ * The function returns the distance of the address from the bytes watched by
+ * the watchpoint. In case of an exact match, it returns 0.
+ */
+static u64 get_distance_from_watchpoint(unsigned long addr, u64 val,
+ struct arch_hw_breakpoint_ctrl *ctrl)
+{
+ u64 wp_low, wp_high;
+ u32 lens, lene;
+
+ lens = __ffs(ctrl->len);
+ lene = __fls(ctrl->len);
+
+ wp_low = val + lens;
+ wp_high = val + lene;
+ if (addr < wp_low)
+ return wp_low - addr;
+ else if (addr > wp_high)
+ return addr - wp_high;
+ else
+ return 0;
+}
+
static int watchpoint_handler(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
{
- int i, step = 0, *kernel_step, access;
+ int i, step = 0, *kernel_step, access, closest_match = 0;
+ u64 min_dist = -1, dist;
u32 ctrl_reg;
- u64 val, alignment_mask;
+ u64 val;
struct perf_event *wp, **slots;
struct debug_info *debug_info;
struct arch_hw_breakpoint *info;
@@ -676,35 +750,15 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
slots = this_cpu_ptr(wp_on_reg);
debug_info = &current->thread.debug;
+ /*
+ * Find all watchpoints that match the reported address. If no exact
+ * match is found. Attribute the hit to the closest watchpoint.
+ */
+ rcu_read_lock();
for (i = 0; i < core_num_wrps; ++i) {
- rcu_read_lock();
-
wp = slots[i];
-
if (wp == NULL)
- goto unlock;
-
- info = counter_arch_bp(wp);
- /* AArch32 watchpoints are either 4 or 8 bytes aligned. */
- if (is_compat_task()) {
- if (info->ctrl.len == ARM_BREAKPOINT_LEN_8)
- alignment_mask = 0x7;
- else
- alignment_mask = 0x3;
- } else {
- alignment_mask = 0x7;
- }
-
- /* Check if the watchpoint value matches. */
- val = read_wb_reg(AARCH64_DBG_REG_WVR, i);
- if (val != (untagged_addr(addr) & ~alignment_mask))
- goto unlock;
-
- /* Possible match, check the byte address select to confirm. */
- ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i);
- decode_ctrl_reg(ctrl_reg, &ctrl);
- if (!((1 << (addr & alignment_mask)) & ctrl.len))
- goto unlock;
+ continue;
/*
* Check that the access type matches.
@@ -713,18 +767,41 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
access = (esr & AARCH64_ESR_ACCESS_MASK) ? HW_BREAKPOINT_W :
HW_BREAKPOINT_R;
if (!(access & hw_breakpoint_type(wp)))
- goto unlock;
+ continue;
+ /* Check if the watchpoint value and byte select match. */
+ val = read_wb_reg(AARCH64_DBG_REG_WVR, i);
+ ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i);
+ decode_ctrl_reg(ctrl_reg, &ctrl);
+ dist = get_distance_from_watchpoint(addr, val, &ctrl);
+ if (dist < min_dist) {
+ min_dist = dist;
+ closest_match = i;
+ }
+ /* Is this an exact match? */
+ if (dist != 0)
+ continue;
+
+ info = counter_arch_bp(wp);
info->trigger = addr;
perf_bp_event(wp, regs);
/* Do we need to handle the stepping? */
if (is_default_overflow_handler(wp))
step = 1;
+ }
+ if (min_dist > 0 && min_dist != -1) {
+ /* No exact match found. */
+ wp = slots[closest_match];
+ info = counter_arch_bp(wp);
+ info->trigger = addr;
+ perf_bp_event(wp, regs);
-unlock:
- rcu_read_unlock();
+ /* Do we need to handle the stepping? */
+ if (is_default_overflow_handler(wp))
+ step = 1;
}
+ rcu_read_unlock();
if (!step)
return 0;
diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
index 6f2ac4fc66ca..f607b387fcdc 100644
--- a/arch/arm64/kernel/insn.c
+++ b/arch/arm64/kernel/insn.c
@@ -97,7 +97,7 @@ static void __kprobes *patch_map(void *addr, int fixmap)
if (module && IS_ENABLED(CONFIG_DEBUG_SET_MODULE_RONX))
page = vmalloc_to_page(addr);
else if (!module)
- page = pfn_to_page(PHYS_PFN(__pa(addr)));
+ page = phys_to_page(__pa_symbol(addr));
else
return addr;
diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c
index 354be2a872ae..79b17384effa 100644
--- a/arch/arm64/kernel/io.c
+++ b/arch/arm64/kernel/io.c
@@ -25,8 +25,7 @@
*/
void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
{
- while (count && (!IS_ALIGNED((unsigned long)from, 8) ||
- !IS_ALIGNED((unsigned long)to, 8))) {
+ while (count && !IS_ALIGNED((unsigned long)from, 8)) {
*(u8 *)to = __raw_readb(from);
from++;
to++;
@@ -54,23 +53,22 @@ EXPORT_SYMBOL(__memcpy_fromio);
*/
void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count)
{
- while (count && (!IS_ALIGNED((unsigned long)to, 8) ||
- !IS_ALIGNED((unsigned long)from, 8))) {
- __raw_writeb(*(volatile u8 *)from, to);
+ while (count && !IS_ALIGNED((unsigned long)to, 8)) {
+ __raw_writeb(*(u8 *)from, to);
from++;
to++;
count--;
}
while (count >= 8) {
- __raw_writeq(*(volatile u64 *)from, to);
+ __raw_writeq(*(u64 *)from, to);
from += 8;
to += 8;
count -= 8;
}
while (count) {
- __raw_writeb(*(volatile u8 *)from, to);
+ __raw_writeb(*(u8 *)from, to);
from++;
to++;
count--;
diff --git a/arch/arm64/kernel/module.lds b/arch/arm64/kernel/module.lds
index 8949f6c6f729..05881e2b414c 100644
--- a/arch/arm64/kernel/module.lds
+++ b/arch/arm64/kernel/module.lds
@@ -1,3 +1,3 @@
SECTIONS {
- .plt (NOLOAD) : { BYTE(0) }
+ .plt : { BYTE(0) }
}
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index e917d119490c..ce1c9336be14 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -45,6 +45,7 @@
#include <linux/personality.h>
#include <linux/notifier.h>
#include <trace/events/power.h>
+#include <linux/percpu.h>
#include <asm/alternative.h>
#include <asm/compat.h>
@@ -166,6 +167,70 @@ void machine_restart(char *cmd)
while (1);
}
+/*
+ * dump a block of kernel memory from around the given address
+ */
+static void show_data(unsigned long addr, int nbytes, const char *name)
+{
+ int i, j;
+ int nlines;
+ u32 *p;
+
+ /*
+ * don't attempt to dump non-kernel addresses or
+ * values that are probably just small negative numbers
+ */
+ if (addr < PAGE_OFFSET || addr > -256UL)
+ return;
+
+ printk("\n%s: %#lx:\n", name, addr);
+
+ /*
+ * round address down to a 32 bit boundary
+ * and always dump a multiple of 32 bytes
+ */
+ p = (u32 *)(addr & ~(sizeof(u32) - 1));
+ nbytes += (addr & (sizeof(u32) - 1));
+ nlines = (nbytes + 31) / 32;
+
+
+ for (i = 0; i < nlines; i++) {
+ /*
+ * just display low 16 bits of address to keep
+ * each line of the dump < 80 characters
+ */
+ printk("%04lx ", (unsigned long)p & 0xffff);
+ for (j = 0; j < 8; j++) {
+ u32 data;
+ if (probe_kernel_address(p, data)) {
+ pr_cont(" ********");
+ } else {
+ pr_cont(" %08x", data);
+ }
+ ++p;
+ }
+ pr_cont("\n");
+ }
+}
+
+static void show_extra_register_data(struct pt_regs *regs, int nbytes)
+{
+ mm_segment_t fs;
+ unsigned int i;
+
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ show_data(regs->pc - nbytes, nbytes * 2, "PC");
+ show_data(regs->regs[30] - nbytes, nbytes * 2, "LR");
+ show_data(regs->sp - nbytes, nbytes * 2, "SP");
+ for (i = 0; i < 30; i++) {
+ char name[4];
+ snprintf(name, sizeof(name), "X%u", i);
+ show_data(regs->regs[i] - nbytes, nbytes * 2, name);
+ }
+ set_fs(fs);
+}
+
void __show_regs(struct pt_regs *regs)
{
int i, top_reg;
@@ -201,6 +266,8 @@ void __show_regs(struct pt_regs *regs)
pr_cont("\n");
}
+ if (!user_mode(regs))
+ show_extra_register_data(regs, 128);
printk("\n");
}
@@ -331,6 +398,20 @@ void uao_thread_switch(struct task_struct *next)
}
/*
+ * We store our current task in sp_el0, which is clobbered by userspace. Keep a
+ * shadow copy so that we can restore this upon entry from userspace.
+ *
+ * This is *only* for exception entry from EL0, and is not valid until we
+ * __switch_to() a user task.
+ */
+DEFINE_PER_CPU(struct task_struct *, __entry_task);
+
+static void entry_task_switch(struct task_struct *next)
+{
+ __this_cpu_write(__entry_task, next);
+}
+
+/*
* Thread switching.
*/
struct task_struct *__switch_to(struct task_struct *prev,
@@ -342,6 +423,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
tls_thread_switch(next);
hw_breakpoint_thread_switch(next);
contextidr_thread_switch(next);
+ entry_task_switch(next);
uao_thread_switch(next);
/*
@@ -359,27 +441,35 @@ struct task_struct *__switch_to(struct task_struct *prev,
unsigned long get_wchan(struct task_struct *p)
{
struct stackframe frame;
- unsigned long stack_page;
+ unsigned long stack_page, ret = 0;
int count = 0;
if (!p || p == current || p->state == TASK_RUNNING)
return 0;
+ stack_page = (unsigned long)try_get_task_stack(p);
+ if (!stack_page)
+ return 0;
+
frame.fp = thread_saved_fp(p);
frame.sp = thread_saved_sp(p);
frame.pc = thread_saved_pc(p);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
frame.graph = p->curr_ret_stack;
#endif
- stack_page = (unsigned long)task_stack_page(p);
do {
if (frame.sp < stack_page ||
frame.sp >= stack_page + THREAD_SIZE ||
unwind_frame(p, &frame))
- return 0;
- if (!in_sched_functions(frame.pc))
- return frame.pc;
+ goto out;
+ if (!in_sched_functions(frame.pc)) {
+ ret = frame.pc;
+ goto out;
+ }
} while (count ++ < 16);
- return 0;
+
+out:
+ put_task_stack(p);
+ return ret;
}
unsigned long arch_align_stack(unsigned long sp)
diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c
index 42816bebb1e0..e8edbf13302a 100644
--- a/arch/arm64/kernel/psci.c
+++ b/arch/arm64/kernel/psci.c
@@ -20,6 +20,7 @@
#include <linux/smp.h>
#include <linux/delay.h>
#include <linux/psci.h>
+#include <linux/mm.h>
#include <uapi/linux/psci.h>
@@ -45,7 +46,7 @@ static int __init cpu_psci_cpu_prepare(unsigned int cpu)
static int cpu_psci_cpu_boot(unsigned int cpu)
{
- int err = psci_ops.cpu_on(cpu_logical_map(cpu), __pa(secondary_entry));
+ int err = psci_ops.cpu_on(cpu_logical_map(cpu), __pa_symbol(secondary_entry));
if (err)
pr_err("failed to boot CPU%d (%d)\n", cpu, err);
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 8eedeef375d6..a22161ccf447 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -327,13 +327,13 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type,
struct arch_hw_breakpoint_ctrl ctrl,
struct perf_event_attr *attr)
{
- int err, len, type, disabled = !ctrl.enabled;
+ int err, len, type, offset, disabled = !ctrl.enabled;
attr->disabled = disabled;
if (disabled)
return 0;
- err = arch_bp_generic_fields(ctrl, &len, &type);
+ err = arch_bp_generic_fields(ctrl, &len, &type, &offset);
if (err)
return err;
@@ -352,6 +352,7 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type,
attr->bp_len = len;
attr->bp_type = type;
+ attr->bp_addr += offset;
return 0;
}
@@ -404,7 +405,7 @@ static int ptrace_hbp_get_addr(unsigned int note_type,
if (IS_ERR(bp))
return PTR_ERR(bp);
- *addr = bp ? bp->attr.bp_addr : 0;
+ *addr = bp ? counter_arch_bp(bp)->address : 0;
return 0;
}
diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c
index 1718706fde83..12a87f2600f2 100644
--- a/arch/arm64/kernel/return_address.c
+++ b/arch/arm64/kernel/return_address.c
@@ -12,6 +12,7 @@
#include <linux/export.h>
#include <linux/ftrace.h>
+#include <asm/stack_pointer.h>
#include <asm/stacktrace.h>
struct return_address_data {
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index f534f492a268..b5222094ab52 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -42,6 +42,7 @@
#include <linux/of_fdt.h>
#include <linux/efi.h>
#include <linux/psci.h>
+#include <linux/mm.h>
#include <asm/acpi.h>
#include <asm/fixmap.h>
@@ -199,10 +200,10 @@ static void __init request_standard_resources(void)
struct memblock_region *region;
struct resource *res;
- kernel_code.start = virt_to_phys(_text);
- kernel_code.end = virt_to_phys(__init_begin - 1);
- kernel_data.start = virt_to_phys(_sdata);
- kernel_data.end = virt_to_phys(_end - 1);
+ kernel_code.start = __pa_symbol(_text);
+ kernel_code.end = __pa_symbol(__init_begin - 1);
+ kernel_data.start = __pa_symbol(_sdata);
+ kernel_data.end = __pa_symbol(_end - 1);
for_each_memblock(memory, region) {
res = alloc_bootmem_low(sizeof(*res));
@@ -291,6 +292,15 @@ void __init setup_arch(char **cmdline_p)
smp_init_cpus();
smp_build_mpidr_hash();
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ /*
+ * Make sure init_thread_info.ttbr0 always generates translation
+ * faults in case uaccess_enable() is inadvertently called by the init
+ * thread.
+ */
+ init_task.thread_info.ttbr0 = __pa_symbol(empty_zero_page);
+#endif
+
#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
conswitchp = &vga_con;
@@ -329,11 +339,11 @@ subsys_initcall(topology_init);
static int dump_kernel_offset(struct notifier_block *self, unsigned long v,
void *p)
{
- u64 const kaslr_offset = kimage_vaddr - KIMAGE_VADDR;
+ const unsigned long offset = kaslr_offset();
- if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_offset > 0) {
- pr_emerg("Kernel Offset: 0x%llx from 0x%lx\n",
- kaslr_offset, KIMAGE_VADDR);
+ if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && offset > 0) {
+ pr_emerg("Kernel Offset: 0x%lx from 0x%lx\n",
+ offset, KIMAGE_VADDR);
} else {
pr_emerg("Kernel Offset: disabled\n");
}
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 404dd67080b9..c59e675787e9 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -25,6 +25,7 @@
#include <linux/uaccess.h>
#include <linux/tracehook.h>
#include <linux/ratelimit.h>
+#include <linux/syscalls.h>
#include <asm/debug-monitors.h>
#include <asm/elf.h>
@@ -408,7 +409,11 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
* Update the trace code with the current status.
*/
trace_hardirqs_off();
+
do {
+ /* Check valid user FS if needed */
+ addr_limit_user_check();
+
if (thread_flags & _TIF_NEED_RESCHED) {
schedule();
} else {
diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S
index 0030d6964e65..5a4fbcc744d2 100644
--- a/arch/arm64/kernel/sleep.S
+++ b/arch/arm64/kernel/sleep.S
@@ -125,9 +125,6 @@ ENTRY(_cpu_resume)
/* load sp from context */
ldr x2, [x0, #CPU_CTX_SP]
mov sp, x2
- /* save thread_info */
- and x2, x2, #~(THREAD_SIZE - 1)
- msr sp_el0, x2
/*
* cpu_do_resume expects x0 to contain context address pointer
*/
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index cfd33f18f437..a0d9a8c5e694 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -58,6 +58,9 @@
#define CREATE_TRACE_POINTS
#include <trace/events/ipi.h>
+DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
+EXPORT_PER_CPU_SYMBOL(cpu_number);
+
/*
* as from 2.5, kernels no longer have an init_tasks structure
* so we need some other way of telling a new secondary core
@@ -146,6 +149,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
* We need to tell the secondary core where to find its stack and the
* page tables.
*/
+ secondary_data.task = idle;
secondary_data.stack = task_stack_page(idle) + THREAD_START_SP;
update_cpu_boot_status(CPU_MMU_OFF);
__flush_dcache_area(&secondary_data, sizeof(secondary_data));
@@ -170,6 +174,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
pr_err("CPU%u: failed to boot: %d\n", cpu, ret);
}
+ secondary_data.task = NULL;
secondary_data.stack = NULL;
status = READ_ONCE(secondary_data.status);
if (ret && status) {
@@ -208,7 +213,10 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
asmlinkage notrace void secondary_start_kernel(void)
{
struct mm_struct *mm = &init_mm;
- unsigned int cpu = smp_processor_id();
+ unsigned int cpu;
+
+ cpu = task_cpu(current);
+ set_my_cpu_offset(per_cpu_offset(cpu));
/*
* All kernel threads share the same mm context; grab a
@@ -217,8 +225,6 @@ asmlinkage notrace void secondary_start_kernel(void)
atomic_inc(&mm->mm_count);
current->active_mm = mm;
- set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
-
/*
* TTBR0 is only used for the identity mapping at this stage. Make it
* point to zero page to avoid speculatively fetching new entries.
@@ -718,6 +724,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
*/
for_each_possible_cpu(cpu) {
+ per_cpu(cpu_number, cpu) = cpu;
+
if (cpu == smp_processor_id())
continue;
diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c
index 9a00eee9acc8..93034651c87e 100644
--- a/arch/arm64/kernel/smp_spin_table.c
+++ b/arch/arm64/kernel/smp_spin_table.c
@@ -21,6 +21,7 @@
#include <linux/of.h>
#include <linux/smp.h>
#include <linux/types.h>
+#include <linux/mm.h>
#include <asm/cacheflush.h>
#include <asm/cpu_ops.h>
@@ -98,7 +99,7 @@ static int smp_spin_table_cpu_prepare(unsigned int cpu)
* boot-loader's endianess before jumping. This is mandated by
* the boot protocol.
*/
- writeq_relaxed(__pa(secondary_holding_pen), release_addr);
+ writeq_relaxed(__pa_symbol(secondary_holding_pen), release_addr);
__flush_dcache_area((__force void *)release_addr,
sizeof(*release_addr));
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 0cc01e0d38eb..5201bebcec07 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -22,6 +22,7 @@
#include <linux/stacktrace.h>
#include <asm/irq.h>
+#include <asm/stack_pointer.h>
#include <asm/stacktrace.h>
/*
@@ -133,7 +134,6 @@ void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
break;
}
}
-EXPORT_SYMBOL(walk_stackframe);
#ifdef CONFIG_STACKTRACE
struct stack_trace_data {
@@ -186,6 +186,9 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
struct stack_trace_data data;
struct stackframe frame;
+ if (!try_get_task_stack(tsk))
+ return;
+
data.trace = trace;
data.skip = trace->skip;
@@ -207,6 +210,8 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
walk_stackframe(tsk, &frame, save_trace, &data);
if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = ULONG_MAX;
+
+ put_task_stack(tsk);
}
void save_stack_trace(struct stack_trace *trace)
diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
index 1dbf6099e2a5..e12f2d0dfc5d 100644
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -47,12 +47,6 @@ void notrace __cpu_suspend_exit(void)
cpu_uninstall_idmap();
/*
- * Restore per-cpu offset before any kernel
- * subsystem relying on it has a chance to run.
- */
- set_my_cpu_offset(per_cpu_offset(cpu));
-
- /*
* PSTATE was not saved over suspend/resume, re-enable any detected
* features that might not have been set correctly.
*/
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 694f6deedbab..ed734264a516 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -19,10 +19,24 @@
#include <linux/nodemask.h>
#include <linux/of.h>
#include <linux/sched.h>
+#include <linux/sched.h>
+#include <linux/sched_energy.h>
#include <asm/cputype.h>
#include <asm/topology.h>
+static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
+
+unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+ return per_cpu(cpu_scale, cpu);
+}
+
+static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
+{
+ per_cpu(cpu_scale, cpu) = capacity;
+}
+
static int __init get_cpu_for_node(struct device_node *node)
{
struct device_node *cpu_node;
@@ -206,11 +220,72 @@ out:
struct cpu_topology cpu_topology[NR_CPUS];
EXPORT_SYMBOL_GPL(cpu_topology);
+/* sd energy functions */
+static inline
+const struct sched_group_energy * const cpu_cluster_energy(int cpu)
+{
+ struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL1];
+
+ if (!sge) {
+ pr_warn("Invalid sched_group_energy for Cluster%d\n", cpu);
+ return NULL;
+ }
+
+ return sge;
+}
+
+static inline
+const struct sched_group_energy * const cpu_core_energy(int cpu)
+{
+ struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL0];
+
+ if (!sge) {
+ pr_warn("Invalid sched_group_energy for CPU%d\n", cpu);
+ return NULL;
+ }
+
+ return sge;
+}
+
const struct cpumask *cpu_coregroup_mask(int cpu)
{
return &cpu_topology[cpu].core_sibling;
}
+static int cpu_cpu_flags(void)
+{
+ return SD_ASYM_CPUCAPACITY;
+}
+
+static inline int cpu_corepower_flags(void)
+{
+ return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \
+ SD_SHARE_CAP_STATES;
+}
+
+static struct sched_domain_topology_level arm64_topology[] = {
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, cpu_cpu_flags, cpu_cluster_energy, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
+static void update_cpu_capacity(unsigned int cpu)
+{
+ unsigned long capacity = SCHED_CAPACITY_SCALE;
+
+ if (cpu_core_energy(cpu)) {
+ int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1;
+ capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap;
+ }
+
+ set_capacity_scale(cpu, capacity);
+
+ pr_info("CPU%d: update cpu_capacity %lu\n",
+ cpu, arch_scale_cpu_capacity(NULL, cpu));
+}
+
static void update_siblings_masks(unsigned int cpuid)
{
struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
@@ -272,6 +347,7 @@ void store_cpu_topology(unsigned int cpuid)
topology_populated:
update_siblings_masks(cpuid);
+ update_cpu_capacity(cpuid);
}
static void __init reset_cpu_topology(void)
@@ -302,4 +378,8 @@ void __init init_cpu_topology(void)
*/
if (of_have_populated_dt() && parse_dt_topology())
reset_cpu_topology();
+ else
+ set_sched_topology(arm64_topology);
+
+ init_sched_energy_costs();
}
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 5963be2e05f0..86a87ae2b996 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -33,11 +33,13 @@
#include <linux/syscalls.h>
#include <asm/atomic.h>
+#include <asm/barrier.h>
#include <asm/bug.h>
#include <asm/debug-monitors.h>
#include <asm/esr.h>
#include <asm/insn.h>
#include <asm/traps.h>
+#include <asm/stack_pointer.h>
#include <asm/stacktrace.h>
#include <asm/exception.h>
#include <asm/system_misc.h>
@@ -147,6 +149,9 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
if (!tsk)
tsk = current;
+ if (!try_get_task_stack(tsk))
+ return;
+
/*
* Switching between stacks is valid when tracing current and in
* non-preemptible context.
@@ -212,6 +217,8 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
stack + sizeof(struct pt_regs));
}
}
+
+ put_task_stack(tsk);
}
void show_stack(struct task_struct *tsk, unsigned long *sp)
@@ -227,10 +234,9 @@ void show_stack(struct task_struct *tsk, unsigned long *sp)
#endif
#define S_SMP " SMP"
-static int __die(const char *str, int err, struct thread_info *thread,
- struct pt_regs *regs)
+static int __die(const char *str, int err, struct pt_regs *regs)
{
- struct task_struct *tsk = thread->task;
+ struct task_struct *tsk = current;
static int die_counter;
int ret;
@@ -245,7 +251,8 @@ static int __die(const char *str, int err, struct thread_info *thread,
print_modules();
__show_regs(regs);
pr_emerg("Process %.*s (pid: %d, stack limit = 0x%p)\n",
- TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), thread + 1);
+ TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk),
+ end_of_stack(tsk));
if (!user_mode(regs)) {
dump_mem(KERN_EMERG, "Stack: ", regs->sp,
@@ -264,7 +271,6 @@ static DEFINE_RAW_SPINLOCK(die_lock);
*/
void die(const char *str, struct pt_regs *regs, int err)
{
- struct thread_info *thread = current_thread_info();
int ret;
oops_enter();
@@ -272,9 +278,9 @@ void die(const char *str, struct pt_regs *regs, int err)
raw_spin_lock_irq(&die_lock);
console_verbose();
bust_spinlocks(1);
- ret = __die(str, err, thread, regs);
+ ret = __die(str, err, regs);
- if (regs && kexec_should_crash(thread->task))
+ if (regs && kexec_should_crash(current))
crash_kexec(regs);
bust_spinlocks(0);
@@ -435,9 +441,10 @@ int cpu_enable_cache_maint_trap(void *__unused)
}
#define __user_cache_maint(insn, address, res) \
- if (address >= user_addr_max()) \
+ if (address >= user_addr_max()) { \
res = -EFAULT; \
- else \
+ } else { \
+ uaccess_ttbr0_enable(); \
asm volatile ( \
"1: " insn ", %1\n" \
" mov %w0, #0\n" \
@@ -449,7 +456,9 @@ int cpu_enable_cache_maint_trap(void *__unused)
" .popsection\n" \
_ASM_EXTABLE(1b, 3b) \
: "=r" (res) \
- : "r" (address), "i" (-EFAULT) )
+ : "r" (address), "i" (-EFAULT)); \
+ uaccess_ttbr0_disable(); \
+ }
static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs)
{
@@ -492,6 +501,25 @@ static void ctr_read_handler(unsigned int esr, struct pt_regs *regs)
regs->pc += 4;
}
+static void cntvct_read_handler(unsigned int esr, struct pt_regs *regs)
+{
+ int rt = (esr & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT;
+
+ isb();
+ if (rt != 31)
+ regs->regs[rt] = arch_counter_get_cntvct();
+ regs->pc += 4;
+}
+
+static void cntfrq_read_handler(unsigned int esr, struct pt_regs *regs)
+{
+ int rt = (esr & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT;
+
+ if (rt != 31)
+ regs->regs[rt] = read_sysreg(cntfrq_el0);
+ regs->pc += 4;
+}
+
struct sys64_hook {
unsigned int esr_mask;
unsigned int esr_val;
@@ -510,6 +538,18 @@ static struct sys64_hook sys64_hooks[] = {
.esr_val = ESR_ELx_SYS64_ISS_SYS_CTR_READ,
.handler = ctr_read_handler,
},
+ {
+ /* Trap read access to CNTVCT_EL0 */
+ .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK,
+ .esr_val = ESR_ELx_SYS64_ISS_SYS_CNTVCT,
+ .handler = cntvct_read_handler,
+ },
+ {
+ /* Trap read access to CNTFRQ_EL0 */
+ .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK,
+ .esr_val = ESR_ELx_SYS64_ISS_SYS_CNTFRQ,
+ .handler = cntfrq_read_handler,
+ },
{},
};
diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index 4bcfe01b5aad..7492d9009610 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -123,6 +123,7 @@ static int __init vdso_init(void)
{
int i;
struct page **vdso_pagelist;
+ unsigned long pfn;
if (memcmp(&vdso_start, "\177ELF", 4)) {
pr_err("vDSO is not a valid ELF object!\n");
@@ -140,11 +141,14 @@ static int __init vdso_init(void)
return -ENOMEM;
/* Grab the vDSO data page. */
- vdso_pagelist[0] = pfn_to_page(PHYS_PFN(__pa(vdso_data)));
+ vdso_pagelist[0] = phys_to_page(__pa_symbol(vdso_data));
+
/* Grab the vDSO code pages. */
+ pfn = sym_to_pfn(&vdso_start);
+
for (i = 0; i < vdso_pages; i++)
- vdso_pagelist[i + 1] = pfn_to_page(PHYS_PFN(__pa(&vdso_start)) + i);
+ vdso_pagelist[i + 1] = pfn_to_page(pfn + i);
vdso_spec[0].pages = &vdso_pagelist[0];
vdso_spec[1].pages = &vdso_pagelist[1];
@@ -216,10 +220,8 @@ void update_vsyscall(struct timekeeper *tk)
if (!use_syscall) {
/* tkr_mono.cycle_last == tkr_raw.cycle_last */
vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last;
- vdso_data->raw_time_sec = tk->raw_time.tv_sec;
- vdso_data->raw_time_nsec = (tk->raw_time.tv_nsec <<
- tk->tkr_raw.shift) +
- tk->tkr_raw.xtime_nsec;
+ vdso_data->raw_time_sec = tk->raw_sec;
+ vdso_data->raw_time_nsec = tk->tkr_raw.xtime_nsec;
vdso_data->xtime_clock_sec = tk->xtime_sec;
vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec;
vdso_data->cs_mono_mult = tk->tkr_mono.mult;
diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index 62c84f7cb01b..88fef3867886 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -14,6 +14,7 @@ obj-vdso := $(addprefix $(obj)/, $(obj-vdso))
ccflags-y := -shared -fno-common -fno-builtin
ccflags-y += -nostdlib -Wl,-soname=linux-vdso.so.1 \
$(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
+ccflags-y += $(DISABLE_LTO)
# Disable gcov profiling for VDSO code
GCOV_PROFILE := n
diff --git a/arch/arm64/kernel/vdso/gettimeofday.S b/arch/arm64/kernel/vdso/gettimeofday.S
index 76320e920965..c39872a7b03c 100644
--- a/arch/arm64/kernel/vdso/gettimeofday.S
+++ b/arch/arm64/kernel/vdso/gettimeofday.S
@@ -309,7 +309,7 @@ ENTRY(__kernel_clock_getres)
b.ne 4f
ldr x2, 6f
2:
- cbz w1, 3f
+ cbz x1, 3f
stp xzr, x2, [x1]
3: /* res == NULL. */
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 6a584558b29d..402a62165d40 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -60,7 +60,7 @@ jiffies = jiffies_64;
#define TRAMP_TEXT \
. = ALIGN(PAGE_SIZE); \
VMLINUX_SYMBOL(__entry_tramp_text_start) = .; \
- *(.entry.tramp.text) \
+ KEEP(*(.entry.tramp.text)) \
. = ALIGN(PAGE_SIZE); \
VMLINUX_SYMBOL(__entry_tramp_text_end) = .;
#else
@@ -179,11 +179,11 @@ SECTIONS
. = ALIGN(4);
.altinstructions : {
__alt_instructions = .;
- *(.altinstructions)
+ KEEP(*(.altinstructions))
__alt_instructions_end = .;
}
.altinstr_replacement : {
- *(.altinstr_replacement)
+ KEEP(*(.altinstr_replacement))
}
.rela : ALIGN(8) {
*(.rela .rela*)
@@ -228,6 +228,11 @@ SECTIONS
swapper_pg_dir = .;
. += SWAPPER_DIR_SIZE;
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ reserved_ttbr0 = .;
+ . += RESERVED_TTBR0_SIZE;
+#endif
+
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
tramp_pg_dir = .;
. += PAGE_SIZE;
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index 48b03547a969..c470a7c5f05b 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -4,6 +4,10 @@
ccflags-y += -fno-stack-protector -DDISABLE_BRANCH_PROFILING
+ifeq ($(cc-name),clang)
+ccflags-y += -fno-jump-tables
+endif
+
KVM=../../../../virt/kvm
obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o
diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
index efbf610eaf4e..b581e16320dd 100644
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -17,10 +17,7 @@
*/
#include <linux/linkage.h>
-#include <asm/alternative.h>
-#include <asm/assembler.h>
-#include <asm/cpufeature.h>
-#include <asm/sysreg.h>
+#include <asm/uaccess.h>
.text
@@ -33,8 +30,7 @@
* Alignment fixed up by hardware.
*/
ENTRY(__arch_clear_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
- CONFIG_ARM64_PAN)
+ uaccess_enable_not_uao x2, x3, x4
mov x2, x1 // save the size for fixup return
subs x1, x1, #8
b.mi 2f
@@ -54,8 +50,7 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2
b.mi 5f
uao_user_alternative 9f, strb, sttrb, wzr, x0, 0
5: mov x0, #0
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
- CONFIG_ARM64_PAN)
+ uaccess_disable_not_uao x2, x3
ret
ENDPROC(__arch_clear_user)
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 4fd67ea03bb0..c7a7d9689e8f 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -16,11 +16,8 @@
#include <linux/linkage.h>
-#include <asm/alternative.h>
-#include <asm/assembler.h>
#include <asm/cache.h>
-#include <asm/cpufeature.h>
-#include <asm/sysreg.h>
+#include <asm/uaccess.h>
/*
* Copy from user space to a kernel buffer (alignment handled by the hardware)
@@ -67,12 +64,10 @@
end .req x5
ENTRY(__arch_copy_from_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
- CONFIG_ARM64_PAN)
+ uaccess_enable_not_uao x3, x4, x5
add end, x0, x2
#include "copy_template.S"
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
- CONFIG_ARM64_PAN)
+ uaccess_disable_not_uao x3, x4
mov x0, #0 // Nothing to copy
ret
ENDPROC(__arch_copy_from_user)
diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S
index 841bf8f7fab7..800779eb3079 100644
--- a/arch/arm64/lib/copy_in_user.S
+++ b/arch/arm64/lib/copy_in_user.S
@@ -18,11 +18,8 @@
#include <linux/linkage.h>
-#include <asm/alternative.h>
-#include <asm/assembler.h>
#include <asm/cache.h>
-#include <asm/cpufeature.h>
-#include <asm/sysreg.h>
+#include <asm/uaccess.h>
/*
* Copy from user space to user space (alignment handled by the hardware)
@@ -68,12 +65,10 @@
end .req x5
ENTRY(__arch_copy_in_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
- CONFIG_ARM64_PAN)
+ uaccess_enable_not_uao x3, x4, x5
add end, x0, x2
#include "copy_template.S"
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
- CONFIG_ARM64_PAN)
+ uaccess_disable_not_uao x3, x4
mov x0, #0
ret
ENDPROC(__arch_copy_in_user)
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 7a7efe255034..f6cfcc0441de 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -16,11 +16,8 @@
#include <linux/linkage.h>
-#include <asm/alternative.h>
-#include <asm/assembler.h>
#include <asm/cache.h>
-#include <asm/cpufeature.h>
-#include <asm/sysreg.h>
+#include <asm/uaccess.h>
/*
* Copy to user space from a kernel buffer (alignment handled by the hardware)
@@ -66,12 +63,10 @@
end .req x5
ENTRY(__arch_copy_to_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
- CONFIG_ARM64_PAN)
+ uaccess_enable_not_uao x3, x4, x5
add end, x0, x2
#include "copy_template.S"
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
- CONFIG_ARM64_PAN)
+ uaccess_disable_not_uao x3, x4
mov x0, #0
ret
ENDPROC(__arch_copy_to_user)
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 58b5a906ff78..82ecb6c5f015 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -23,6 +23,7 @@
#include <asm/assembler.h>
#include <asm/cpufeature.h>
#include <asm/alternative.h>
+#include <asm/uaccess.h>
/*
* flush_icache_range(start,end)
@@ -48,6 +49,7 @@ ENTRY(flush_icache_range)
* - end - virtual end address of region
*/
ENTRY(__flush_cache_user_range)
+ uaccess_ttbr0_enable x2, x3, x4
dcache_line_size x2, x3
sub x3, x2, #1
bic x4, x0, x3
@@ -69,10 +71,12 @@ USER(9f, ic ivau, x4 ) // invalidate I line PoU
dsb ish
isb
mov x0, #0
+1:
+ uaccess_ttbr0_disable x1, x2
ret
9:
mov x0, #-EFAULT
- ret
+ b 1b
ENDPROC(flush_icache_range)
ENDPROC(__flush_cache_user_range)
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 62d976e843fc..c841ccecbc4d 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -233,7 +233,12 @@ switch_mm_fastpath:
arm64_apply_bp_hardening();
- cpu_switch_mm(mm->pgd, mm);
+ /*
+ * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when
+ * emulating PAN.
+ */
+ if (!system_uses_ttbr0_pan())
+ cpu_switch_mm(mm->pgd, mm);
}
/* Errata workaround post TTBRx_EL1 update. */
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index cab3574ab7d9..6dda5ff63930 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -174,7 +174,7 @@ static void *__dma_alloc(struct device *dev, size_t size,
/* create a coherent mapping */
page = virt_to_page(ptr);
coherent_ptr = dma_common_contiguous_remap(page, size, VM_USERMAP,
- prot, NULL);
+ prot, __builtin_return_address(0));
if (!coherent_ptr)
goto no_map;
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index ad49ae8f3967..b9bff2258e59 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -286,13 +286,19 @@ out:
return fault;
}
-static inline bool is_permission_fault(unsigned int esr)
+static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs)
{
unsigned int ec = ESR_ELx_EC(esr);
unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE;
- return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM) ||
- (ec == ESR_ELx_EC_IABT_CUR && fsc_type == ESR_ELx_FSC_PERM);
+ if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR)
+ return false;
+
+ if (system_uses_ttbr0_pan())
+ return fsc_type == ESR_ELx_FSC_FAULT &&
+ (regs->pstate & PSR_PAN_BIT);
+ else
+ return fsc_type == ESR_ELx_FSC_PERM;
}
static bool is_el0_instruction_abort(unsigned int esr)
@@ -332,7 +338,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
mm_flags |= FAULT_FLAG_WRITE;
}
- if (is_permission_fault(esr) && (addr < TASK_SIZE)) {
+ if (addr < TASK_SIZE && is_permission_fault(esr, regs)) {
/* regs->orig_addr_limit may be 0 if we entered from EL0 */
if (regs->orig_addr_limit == KERNEL_DS)
die("Accessing user space memory with fs=KERNEL_DS", regs, esr);
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index fa6b2fad7a3d..c410c9ecb0de 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -36,6 +36,7 @@
#include <linux/efi.h>
#include <linux/swiotlb.h>
#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include <asm/boot.h>
#include <asm/fixmap.h>
@@ -213,8 +214,8 @@ void __init arm64_memblock_init(void)
* linear mapping. Take care not to clip the kernel which may be
* high in memory.
*/
- memblock_remove(max_t(u64, memstart_addr + linear_region_size, __pa(_end)),
- ULLONG_MAX);
+ memblock_remove(max_t(u64, memstart_addr + linear_region_size,
+ __pa_symbol(_end)), ULLONG_MAX);
if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) {
/* ensure that memstart_addr remains sufficiently aligned */
memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size,
@@ -229,7 +230,7 @@ void __init arm64_memblock_init(void)
*/
if (memory_limit != (phys_addr_t)ULLONG_MAX) {
memblock_mem_limit_remove_map(memory_limit);
- memblock_add(__pa(_text), (u64)(_end - _text));
+ memblock_add(__pa_symbol(_text), (u64)(_end - _text));
}
if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && initrd_start) {
@@ -282,7 +283,7 @@ void __init arm64_memblock_init(void)
* Register the kernel text, kernel data, initrd, and initial
* pagetables with memblock.
*/
- memblock_reserve(__pa(_text), _end - _text);
+ memblock_reserve(__pa_symbol(_text), _end - _text);
#ifdef CONFIG_BLK_DEV_INITRD
if (initrd_start) {
memblock_reserve(initrd_start, initrd_end - initrd_start);
@@ -492,7 +493,8 @@ void __init mem_init(void)
void free_initmem(void)
{
- free_reserved_area(__va(__pa(__init_begin)), __va(__pa(__init_end)),
+ free_reserved_area(lm_alias(__init_begin),
+ lm_alias(__init_end),
0, "unused kernel");
/*
* Unmap the __init region but leave the VM area in place. This
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index 757009daa9ed..201d918e7575 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -15,6 +15,7 @@
#include <linux/kernel.h>
#include <linux/memblock.h>
#include <linux/start_kernel.h>
+#include <linux/mm.h>
#include <asm/mmu_context.h>
#include <asm/kernel-pgtable.h>
@@ -26,6 +27,13 @@
static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE);
+/*
+ * The p*d_populate functions call virt_to_phys implicitly so they can't be used
+ * directly on kernel symbols (bm_p*d). All the early functions are called too
+ * early to use lm_alias so __p*d_populate functions must be used to populate
+ * with the physical address from __pa_symbol.
+ */
+
static void __init kasan_early_pte_populate(pmd_t *pmd, unsigned long addr,
unsigned long end)
{
@@ -33,12 +41,12 @@ static void __init kasan_early_pte_populate(pmd_t *pmd, unsigned long addr,
unsigned long next;
if (pmd_none(*pmd))
- pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte);
+ __pmd_populate(pmd, __pa_symbol(kasan_zero_pte), PMD_TYPE_TABLE);
pte = pte_offset_kimg(pmd, addr);
do {
next = addr + PAGE_SIZE;
- set_pte(pte, pfn_pte(virt_to_pfn(kasan_zero_page),
+ set_pte(pte, pfn_pte(sym_to_pfn(kasan_zero_page),
PAGE_KERNEL));
} while (pte++, addr = next, addr != end && pte_none(*pte));
}
@@ -51,7 +59,7 @@ static void __init kasan_early_pmd_populate(pud_t *pud,
unsigned long next;
if (pud_none(*pud))
- pud_populate(&init_mm, pud, kasan_zero_pmd);
+ __pud_populate(pud, __pa_symbol(kasan_zero_pmd), PMD_TYPE_TABLE);
pmd = pmd_offset_kimg(pud, addr);
do {
@@ -68,7 +76,7 @@ static void __init kasan_early_pud_populate(pgd_t *pgd,
unsigned long next;
if (pgd_none(*pgd))
- pgd_populate(&init_mm, pgd, kasan_zero_pud);
+ __pgd_populate(pgd, __pa_symbol(kasan_zero_pud), PUD_TYPE_TABLE);
pud = pud_offset_kimg(pgd, addr);
do {
@@ -148,7 +156,7 @@ void __init kasan_init(void)
*/
memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(tmp_pg_dir));
dsb(ishst);
- cpu_replace_ttbr1(tmp_pg_dir);
+ cpu_replace_ttbr1(lm_alias(tmp_pg_dir));
clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
@@ -199,10 +207,10 @@ void __init kasan_init(void)
*/
for (i = 0; i < PTRS_PER_PTE; i++)
set_pte(&kasan_zero_pte[i],
- pfn_pte(virt_to_pfn(kasan_zero_page), PAGE_KERNEL_RO));
+ pfn_pte(sym_to_pfn(kasan_zero_page), PAGE_KERNEL_RO));
memset(kasan_zero_page, 0, PAGE_SIZE);
- cpu_replace_ttbr1(swapper_pg_dir);
+ cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
/* At this point kasan is fully initialized. Enable error messages */
init_task.kasan_depth = 0;
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 0a56898f8410..7ae943eab7be 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -30,6 +30,7 @@
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/stop_machine.h>
+#include <linux/mm.h>
#include <asm/barrier.h>
#include <asm/cputype.h>
@@ -319,8 +320,8 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt,
static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end)
{
- unsigned long kernel_start = __pa(_text);
- unsigned long kernel_end = __pa(__init_begin);
+ unsigned long kernel_start = __pa_symbol(_text);
+ unsigned long kernel_end = __pa_symbol(__init_begin);
/*
* Take care not to create a writable alias for the
@@ -387,21 +388,21 @@ void mark_rodata_ro(void)
unsigned long section_size;
section_size = (unsigned long)_etext - (unsigned long)_text;
- create_mapping_late(__pa(_text), (unsigned long)_text,
+ create_mapping_late(__pa_symbol(_text), (unsigned long)_text,
section_size, PAGE_KERNEL_ROX);
/*
* mark .rodata as read only. Use __init_begin rather than __end_rodata
* to cover NOTES and EXCEPTION_TABLE.
*/
section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata;
- create_mapping_late(__pa(__start_rodata), (unsigned long)__start_rodata,
+ create_mapping_late(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
section_size, PAGE_KERNEL_RO);
}
static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end,
pgprot_t prot, struct vm_struct *vma)
{
- phys_addr_t pa_start = __pa(va_start);
+ phys_addr_t pa_start = __pa_symbol(va_start);
unsigned long size = va_end - va_start;
BUG_ON(!PAGE_ALIGNED(pa_start));
@@ -480,7 +481,7 @@ static void __init map_kernel(pgd_t *pgd)
*/
BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
set_pud(pud_set_fixmap_offset(pgd, FIXADDR_START),
- __pud(__pa(bm_pmd) | PUD_TYPE_TABLE));
+ __pud(__pa_symbol(bm_pmd) | PUD_TYPE_TABLE));
pud_clear_fixmap();
} else {
BUG();
@@ -511,7 +512,7 @@ void __init paging_init(void)
*/
cpu_replace_ttbr1(__va(pgd_phys));
memcpy(swapper_pg_dir, pgd, PGD_SIZE);
- cpu_replace_ttbr1(swapper_pg_dir);
+ cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
pgd_clear_fixmap();
memblock_free(pgd_phys, PAGE_SIZE);
@@ -520,7 +521,7 @@ void __init paging_init(void)
* We only reuse the PGD from the swapper_pg_dir, not the pud + pmd
* allocated with it.
*/
- memblock_free(__pa(swapper_pg_dir) + PAGE_SIZE,
+ memblock_free(__pa_symbol(swapper_pg_dir) + PAGE_SIZE,
SWAPPER_DIR_SIZE - PAGE_SIZE);
}
@@ -631,6 +632,12 @@ static inline pte_t * fixmap_pte(unsigned long addr)
return &bm_pte[pte_index(addr)];
}
+/*
+ * The p*d_populate functions call virt_to_phys implicitly so they can't be used
+ * directly on kernel symbols (bm_p*d). This function is called too early to use
+ * lm_alias so __p*d_populate functions must be used to populate with the
+ * physical address from __pa_symbol.
+ */
void __init early_fixmap_init(void)
{
pgd_t *pgd;
@@ -640,7 +647,7 @@ void __init early_fixmap_init(void)
pgd = pgd_offset_k(addr);
if (CONFIG_PGTABLE_LEVELS > 3 &&
- !(pgd_none(*pgd) || pgd_page_paddr(*pgd) == __pa(bm_pud))) {
+ !(pgd_none(*pgd) || pgd_page_paddr(*pgd) == __pa_symbol(bm_pud))) {
/*
* We only end up here if the kernel mapping and the fixmap
* share the top level pgd entry, which should only happen on
@@ -649,12 +656,14 @@ void __init early_fixmap_init(void)
BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
pud = pud_offset_kimg(pgd, addr);
} else {
- pgd_populate(&init_mm, pgd, bm_pud);
+ if (pgd_none(*pgd))
+ __pgd_populate(pgd, __pa_symbol(bm_pud), PUD_TYPE_TABLE);
pud = fixmap_pud(addr);
}
- pud_populate(&init_mm, pud, bm_pmd);
+ if (pud_none(*pud))
+ __pud_populate(pud, __pa_symbol(bm_pmd), PMD_TYPE_TABLE);
pmd = fixmap_pmd(addr);
- pmd_populate_kernel(&init_mm, pmd, bm_pte);
+ __pmd_populate(pmd, __pa_symbol(bm_pte), PMD_TYPE_TABLE);
/*
* The boot-ioremap range spans multiple pmds, for which
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 18d96d349a8b..b5f5e98f2c41 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -70,11 +70,14 @@ ENTRY(cpu_do_suspend)
mrs x8, mdscr_el1
mrs x9, oslsr_el1
mrs x10, sctlr_el1
+ mrs x11, tpidr_el1
+ mrs x12, sp_el0
stp x2, x3, [x0]
stp x4, xzr, [x0, #16]
stp x5, x6, [x0, #32]
stp x7, x8, [x0, #48]
stp x9, x10, [x0, #64]
+ stp x11, x12, [x0, #80]
ret
ENDPROC(cpu_do_suspend)
@@ -90,6 +93,7 @@ ENTRY(cpu_do_resume)
ldp x6, x8, [x0, #32]
ldp x9, x10, [x0, #48]
ldp x11, x12, [x0, #64]
+ ldp x13, x14, [x0, #80]
msr tpidr_el0, x2
msr tpidrro_el0, x3
msr contextidr_el1, x4
@@ -112,6 +116,8 @@ ENTRY(cpu_do_resume)
msr mdscr_el1, x10
msr sctlr_el1, x12
+ msr tpidr_el1, x13
+ msr sp_el0, x14
/*
* Restore oslsr_el1 by writing oslar_el1
*/
@@ -134,6 +140,9 @@ ENDPROC(cpu_do_resume)
ENTRY(cpu_do_switch_mm)
mrs x2, ttbr1_el1
mmid x1, x1 // get mm->context.id
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ bfi x0, x1, #48, #16 // set the ASID field in TTBR0
+#endif
bfi x2, x1, #48, #16 // set the ASID
msr ttbr1_el1, x2 // in TTBR1 (since TCR.A1 is set)
isb
diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S
index 329c8027b0a9..69711f24b743 100644
--- a/arch/arm64/xen/hypercall.S
+++ b/arch/arm64/xen/hypercall.S
@@ -49,6 +49,7 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
+#include <asm/uaccess.h>
#include <xen/interface/xen.h>
@@ -91,6 +92,20 @@ ENTRY(privcmd_call)
mov x2, x3
mov x3, x4
mov x4, x5
+ /*
+ * Privcmd calls are issued by the userspace. The kernel needs to
+ * enable access to TTBR0_EL1 as the hypervisor would issue stage 1
+ * translations to user memory via AT instructions. Since AT
+ * instructions are not affected by the PAN bit (ARMv8.1), we only
+ * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation
+ * is enabled (it implies that hardware UAO and PAN disabled).
+ */
+ uaccess_ttbr0_enable x6, x7, x8
hvc XEN_IMM
+
+ /*
+ * Disable userspace access from kernel once the hyp call completed.
+ */
+ uaccess_ttbr0_disable x6, x7
ret
ENDPROC(privcmd_call);
diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
index 1fd147f09a38..5f10f9bcd417 100644
--- a/arch/avr32/include/uapi/asm/socket.h
+++ b/arch/avr32/include/uapi/asm/socket.h
@@ -90,4 +90,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _UAPI__ASM_AVR32_SOCKET_H */
diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
index afbc98f02d27..ed960d3af35d 100644
--- a/arch/frv/include/uapi/asm/socket.h
+++ b/arch/frv/include/uapi/asm/socket.h
@@ -90,5 +90,7 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _ASM_SOCKET_H */
diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
index 0018fad9039f..9790d139f1c9 100644
--- a/arch/ia64/include/uapi/asm/socket.h
+++ b/arch/ia64/include/uapi/asm/socket.h
@@ -99,4 +99,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
index 5fe42fc7b6c5..ad2567655e65 100644
--- a/arch/m32r/include/uapi/asm/socket.h
+++ b/arch/m32r/include/uapi/asm/socket.h
@@ -90,4 +90,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 2027240aafbb..2f106d0357f4 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -108,4 +108,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
index 5129f23a9ee1..69f96180a3f4 100644
--- a/arch/mn10300/include/uapi/asm/socket.h
+++ b/arch/mn10300/include/uapi/asm/socket.h
@@ -90,4 +90,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 9c935d717df9..b96a193a2a4d 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -89,4 +89,6 @@
#define SO_CNX_ADVICE 0x402E
+#define SO_COOKIE 0x4032
+
#endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
index 1672e3398270..e78550f71833 100644
--- a/arch/powerpc/include/uapi/asm/socket.h
+++ b/arch/powerpc/include/uapi/asm/socket.h
@@ -97,4 +97,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
index 66e7227469b8..85018a14217a 100644
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -708,7 +708,7 @@ static void cmm_exit(void)
* Return value:
* 0 on success / other on failure
**/
-static int cmm_set_disable(const char *val, struct kernel_param *kp)
+static int cmm_set_disable(const char *val, const struct kernel_param *kp)
{
int disable = simple_strtoul(val, NULL, 10);
diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
index 41b51c2f4f1b..04fe908755b5 100644
--- a/arch/s390/include/uapi/asm/socket.h
+++ b/arch/s390/include/uapi/asm/socket.h
@@ -96,4 +96,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _ASM_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 31aede3af088..de15f0a09b32 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -86,6 +86,8 @@
#define SO_CNX_ADVICE 0x0037
+#define SO_COOKIE 0x003b
+
/* Security levels - as per NRL IPv6 - don't actually do anything */
#define SO_SECURITY_AUTHENTICATION 0x5001
#define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b5226a009973..93b170ea2480 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -110,6 +110,8 @@ else
KBUILD_CFLAGS += $(call cc-option,-mno-80387)
KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387)
+ KBUILD_CFLAGS += -fno-pic
+
# By default gcc and clang use a stack alignment of 16 bytes for x86.
# However the standard kernel entry on x86-64 leaves the stack on an
# 8-byte boundary. If the compiler isn't informed about the actual
diff --git a/arch/x86/configs/i386_ranchu_defconfig b/arch/x86/configs/i386_ranchu_defconfig
new file mode 100644
index 000000000000..a1c83c4e78ae
--- /dev/null
+++ b/arch/x86/configs/i386_ranchu_defconfig
@@ -0,0 +1,424 @@
+# CONFIG_64BIT is not set
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_POSIX_MQUEUE=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_SYSCTL_SYSCALL=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_EMBEDDED=y
+# CONFIG_COMPAT_BRK is not set
+CONFIG_ARCH_MMAP_RND_BITS=16
+CONFIG_PARTITION_ADVANCED=y
+CONFIG_OSF_PARTITION=y
+CONFIG_AMIGA_PARTITION=y
+CONFIG_MAC_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+CONFIG_SGI_PARTITION=y
+CONFIG_SUN_PARTITION=y
+CONFIG_KARMA_PARTITION=y
+CONFIG_SMP=y
+CONFIG_X86_BIGSMP=y
+CONFIG_MCORE2=y
+CONFIG_X86_GENERIC=y
+CONFIG_HPET_TIMER=y
+CONFIG_NR_CPUS=512
+CONFIG_PREEMPT=y
+# CONFIG_X86_MCE is not set
+CONFIG_X86_REBOOTFIXUPS=y
+CONFIG_X86_MSR=y
+CONFIG_X86_CPUID=y
+CONFIG_KSM=y
+CONFIG_CMA=y
+# CONFIG_MTRR_SANITIZER is not set
+CONFIG_EFI=y
+CONFIG_EFI_STUB=y
+CONFIG_HZ_100=y
+CONFIG_PHYSICAL_START=0x100000
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+# CONFIG_PM_WAKELOCKS_GC is not set
+CONFIG_PM_DEBUG=y
+CONFIG_CPU_FREQ=y
+# CONFIG_CPU_FREQ_STAT is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_PCIEPORTBUS=y
+# CONFIG_PCIEASPM is not set
+CONFIG_PCCARD=y
+CONFIG_YENTA=y
+CONFIG_HOTPLUG_PCI=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+CONFIG_BINFMT_MISC=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=y
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_VERBOSE=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_IP_PNP_RARP=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+CONFIG_SYN_COOKIES=y
+CONFIG_INET_ESP=y
+# CONFIG_INET_XFRM_MODE_BEET is not set
+# CONFIG_INET_LRO is not set
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_NETLABEL=y
+CONFIG_NETFILTER=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_QTAGUID=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA2=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_CFG80211=y
+CONFIG_MAC80211=y
+CONFIG_MAC80211_LEDS=y
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_DMA_CMA=y
+CONFIG_CMA_SIZE_MBYTES=16
+CONFIG_CONNECTOR=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_VIRTIO_BLK=y
+CONFIG_BLK_DEV_SD=y
+CONFIG_BLK_DEV_SR=y
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_CHR_DEV_SG=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_SPI_ATTRS=y
+CONFIG_SCSI_ISCSI_ATTRS=y
+# CONFIG_SCSI_LOWLEVEL is not set
+CONFIG_ATA=y
+CONFIG_SATA_AHCI=y
+CONFIG_ATA_PIIX=y
+CONFIG_PATA_AMD=y
+CONFIG_PATA_OLDPIIX=y
+CONFIG_PATA_SCH=y
+CONFIG_PATA_MPIIX=y
+CONFIG_ATA_GENERIC=y
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_DEBUG=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_MIRROR=y
+CONFIG_DM_ZERO=y
+CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_NETDEVICES=y
+CONFIG_NETCONSOLE=y
+CONFIG_TUN=y
+CONFIG_VIRTIO_NET=y
+CONFIG_BNX2=y
+CONFIG_TIGON3=y
+CONFIG_NET_TULIP=y
+CONFIG_E100=y
+CONFIG_E1000=y
+CONFIG_E1000E=y
+CONFIG_SKY2=y
+CONFIG_NE2K_PCI=y
+CONFIG_FORCEDETH=y
+CONFIG_8139TOO=y
+# CONFIG_8139TOO_PIO is not set
+CONFIG_R8169=y
+CONFIG_FDDI=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PPPOLAC=y
+CONFIG_PPPOPNS=y
+CONFIG_USB_USBNET=y
+CONFIG_INPUT_POLLDEV=y
+# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_KEYRESET=y
+# CONFIG_KEYBOARD_ATKBD is not set
+CONFIG_KEYBOARD_GOLDFISH_EVENTS=y
+# CONFIG_INPUT_MOUSE is not set
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_INPUT_TABLET=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_INPUT_TOUCHSCREEN=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_KEYCHORD=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_INPUT_GPIO=y
+# CONFIG_SERIO is not set
+# CONFIG_VT is not set
+# CONFIG_LEGACY_PTYS is not set
+CONFIG_SERIAL_NONSTANDARD=y
+# CONFIG_DEVMEM is not set
+# CONFIG_DEVKMEM is not set
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_VIRTIO_CONSOLE=y
+CONFIG_NVRAM=y
+CONFIG_I2C_I801=y
+CONFIG_BATTERY_GOLDFISH=y
+CONFIG_WATCHDOG=y
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_AGP=y
+CONFIG_AGP_AMD64=y
+CONFIG_AGP_INTEL=y
+CONFIG_DRM=y
+CONFIG_FB_MODE_HELPERS=y
+CONFIG_FB_TILEBLITTING=y
+CONFIG_FB_EFI=y
+CONFIG_FB_GOLDFISH=y
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+# CONFIG_LCD_CLASS_DEVICE is not set
+CONFIG_SOUND=y
+CONFIG_SND=y
+CONFIG_HIDRAW=y
+CONFIG_UHID=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_LOGITECH_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGIG940_FF=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_GREENASIA=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_HID_PID=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_MON=y
+CONFIG_USB_EHCI_HCD=y
+# CONFIG_USB_EHCI_TT_NEWSCHED is not set
+CONFIG_USB_OHCI_HCD=y
+CONFIG_USB_UHCI_HCD=y
+CONFIG_USB_PRINTER=y
+CONFIG_USB_STORAGE=y
+CONFIG_USB_OTG_WAKELOCK=y
+CONFIG_EDAC=y
+CONFIG_RTC_CLASS=y
+# CONFIG_RTC_HCTOSYS is not set
+CONFIG_DMADEVICES=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_STAGING=y
+CONFIG_ASHMEM=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_SYNC=y
+CONFIG_SW_SYNC=y
+CONFIG_SYNC_FILE=y
+CONFIG_ION=y
+CONFIG_GOLDFISH_AUDIO=y
+CONFIG_SND_HDA_INTEL=y
+CONFIG_GOLDFISH=y
+CONFIG_GOLDFISH_PIPE=y
+CONFIG_GOLDFISH_SYNC=y
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ISCSI_IBFT_FIND=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_QUOTA=y
+CONFIG_QUOTA_NETLINK_INTERFACE=y
+# CONFIG_PRINT_QUOTA_WARNING is not set
+CONFIG_FUSE_FS=y
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_HUGETLBFS=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+# CONFIG_NETWORK_FILESYSTEMS is not set
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_NLS_UTF8=y
+CONFIG_PRINTK_TIME=y
+CONFIG_DEBUG_INFO=y
+# CONFIG_ENABLE_WARN_DEPRECATED is not set
+# CONFIG_ENABLE_MUST_CHECK is not set
+CONFIG_FRAME_WARN=2048
+# CONFIG_UNUSED_SYMBOLS is not set
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_MEMORY_INIT=y
+CONFIG_PANIC_TIMEOUT=5
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+CONFIG_SCHED_TRACER=y
+CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
+CONFIG_KEYS=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_CRYPTO_AES_586=y
+CONFIG_CRYPTO_TWOFISH=y
+CONFIG_ASYMMETRIC_KEY_TYPE=y
+CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
+CONFIG_X509_CERTIFICATE_PARSER=y
+CONFIG_PKCS7_MESSAGE_PARSER=y
+CONFIG_PKCS7_TEST_KEY=y
+# CONFIG_VIRTUALIZATION is not set
+CONFIG_CRC_T10DIF=y
diff --git a/arch/x86/configs/x86_64_cuttlefish_defconfig b/arch/x86/configs/x86_64_cuttlefish_defconfig
new file mode 100644
index 000000000000..bc1fd7f32992
--- /dev/null
+++ b/arch/x86/configs/x86_64_cuttlefish_defconfig
@@ -0,0 +1,478 @@
+CONFIG_POSIX_MQUEUE=y
+# CONFIG_FHANDLE is not set
+# CONFIG_USELIB is not set
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_MEMCG=y
+CONFIG_MEMCG_SWAP=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_CGROUP_BPF=y
+CONFIG_NAMESPACES=y
+CONFIG_BLK_DEV_INITRD=y
+# CONFIG_RD_LZ4 is not set
+CONFIG_KALLSYMS_ALL=y
+# CONFIG_PCSPKR_PLATFORM is not set
+CONFIG_BPF_SYSCALL=y
+CONFIG_EMBEDDED=y
+# CONFIG_COMPAT_BRK is not set
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=y
+CONFIG_KPROBES=y
+CONFIG_JUMP_LABEL=y
+CONFIG_CC_STACKPROTECTOR_STRONG=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODVERSIONS=y
+CONFIG_PARTITION_ADVANCED=y
+CONFIG_SMP=y
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_PARAVIRT=y
+CONFIG_PARAVIRT_SPINLOCKS=y
+CONFIG_MCORE2=y
+CONFIG_PROCESSOR_SELECT=y
+# CONFIG_CPU_SUP_CENTAUR is not set
+CONFIG_NR_CPUS=8
+CONFIG_PREEMPT=y
+# CONFIG_MICROCODE is not set
+CONFIG_X86_MSR=y
+CONFIG_X86_CPUID=y
+CONFIG_KSM=y
+CONFIG_DEFAULT_MMAP_MIN_ADDR=65536
+CONFIG_TRANSPARENT_HUGEPAGE=y
+CONFIG_ZSMALLOC=y
+# CONFIG_MTRR is not set
+CONFIG_HZ_100=y
+CONFIG_KEXEC=y
+CONFIG_CRASH_DUMP=y
+CONFIG_PHYSICAL_START=0x200000
+CONFIG_RANDOMIZE_BASE=y
+CONFIG_PHYSICAL_ALIGN=0x1000000
+CONFIG_CMDLINE_BOOL=y
+CONFIG_CMDLINE="console=ttyS0 reboot=p nopti"
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+# CONFIG_PM_WAKELOCKS_GC is not set
+CONFIG_PM_DEBUG=y
+CONFIG_ACPI_PROCFS_POWER=y
+# CONFIG_ACPI_FAN is not set
+# CONFIG_ACPI_THERMAL is not set
+# CONFIG_X86_PM_TIMER is not set
+CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_GOV_ONDEMAND=y
+CONFIG_X86_ACPI_CPUFREQ=y
+# CONFIG_X86_ACPI_CPUFREQ_CPB is not set
+CONFIG_PCI_MMCONFIG=y
+CONFIG_PCI_MSI=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+CONFIG_BINFMT_MISC=y
+CONFIG_IA32_EMULATION=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=y
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_VERBOSE=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+CONFIG_SYN_COOKIES=y
+CONFIG_NET_IPVTI=y
+CONFIG_INET_ESP=y
+# CONFIG_INET_XFRM_MODE_BEET is not set
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_TCP_CONG_ADVANCED=y
+# CONFIG_TCP_CONG_BIC is not set
+# CONFIG_TCP_CONG_WESTWOOD is not set
+# CONFIG_TCP_CONG_HTCP is not set
+CONFIG_TCP_MD5SIG=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_VTI=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_NETLABEL=y
+CONFIG_NETFILTER=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_MATCH_BPF=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_QTAGUID=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA2=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_IP_NF_NAT=y
+CONFIG_IP_NF_TARGET_MASQUERADE=y
+CONFIG_IP_NF_TARGET_NETMAP=y
+CONFIG_IP_NF_TARGET_REDIRECT=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_MATCH_IPV6HEADER=y
+CONFIG_IP6_NF_MATCH_RPFILTER=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_CLS_BPF=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_CFG80211=y
+CONFIG_MAC80211=y
+CONFIG_RFKILL=y
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_DEVTMPFS=y
+CONFIG_DEBUG_DEVRES=y
+CONFIG_OF=y
+CONFIG_OF_UNITTEST=y
+# CONFIG_PNP_DEBUG_MESSAGES is not set
+CONFIG_ZRAM=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_VIRTIO_BLK=y
+CONFIG_UID_SYS_STATS=y
+CONFIG_SCSI=y
+CONFIG_BLK_DEV_SD=y
+CONFIG_BLK_DEV_SR=y
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_CHR_DEV_SG=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_SPI_ATTRS=y
+CONFIG_SCSI_VIRTIO=y
+CONFIG_MD=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_MIRROR=y
+CONFIG_DM_ZERO=y
+CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_DM_ANDROID_VERITY=y
+CONFIG_NETDEVICES=y
+CONFIG_NETCONSOLE=y
+CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_TUN=y
+CONFIG_VIRTIO_NET=y
+# CONFIG_ETHERNET is not set
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PPPOLAC=y
+CONFIG_PPPOPNS=y
+CONFIG_USB_USBNET=y
+# CONFIG_USB_NET_AX8817X is not set
+# CONFIG_USB_NET_AX88179_178A is not set
+# CONFIG_USB_NET_CDCETHER is not set
+# CONFIG_USB_NET_CDC_NCM is not set
+# CONFIG_USB_NET_NET1080 is not set
+# CONFIG_USB_NET_CDC_SUBSET is not set
+# CONFIG_USB_NET_ZAURUS is not set
+# CONFIG_WLAN_VENDOR_ADMTEK is not set
+# CONFIG_WLAN_VENDOR_ATH is not set
+# CONFIG_WLAN_VENDOR_ATMEL is not set
+# CONFIG_WLAN_VENDOR_BROADCOM is not set
+# CONFIG_WLAN_VENDOR_CISCO is not set
+# CONFIG_WLAN_VENDOR_INTEL is not set
+# CONFIG_WLAN_VENDOR_INTERSIL is not set
+# CONFIG_WLAN_VENDOR_MARVELL is not set
+# CONFIG_WLAN_VENDOR_MEDIATEK is not set
+# CONFIG_WLAN_VENDOR_RALINK is not set
+# CONFIG_WLAN_VENDOR_REALTEK is not set
+# CONFIG_WLAN_VENDOR_RSI is not set
+# CONFIG_WLAN_VENDOR_ST is not set
+# CONFIG_WLAN_VENDOR_TI is not set
+# CONFIG_WLAN_VENDOR_ZYDAS is not set
+CONFIG_MAC80211_HWSIM=y
+CONFIG_VIRT_WIFI=y
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_KEYRESET=y
+# CONFIG_INPUT_KEYBOARD is not set
+# CONFIG_INPUT_MOUSE is not set
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_INPUT_TABLET=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_KEYCHORD=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_INPUT_GPIO=y
+# CONFIG_SERIO_I8042 is not set
+# CONFIG_VT is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_DEVKMEM is not set
+CONFIG_SERIAL_8250=y
+# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_8250_NR_UARTS=48
+CONFIG_SERIAL_8250_EXTENDED=y
+CONFIG_SERIAL_8250_MANY_PORTS=y
+CONFIG_SERIAL_8250_SHARE_IRQ=y
+CONFIG_VIRTIO_CONSOLE=y
+CONFIG_HW_RANDOM=y
+# CONFIG_HW_RANDOM_INTEL is not set
+# CONFIG_HW_RANDOM_AMD is not set
+# CONFIG_HW_RANDOM_VIA is not set
+CONFIG_HW_RANDOM_VIRTIO=y
+CONFIG_HPET=y
+# CONFIG_HPET_MMAP_DEFAULT is not set
+# CONFIG_DEVPORT is not set
+# CONFIG_ACPI_I2C_OPREGION is not set
+# CONFIG_I2C_COMPAT is not set
+# CONFIG_I2C_HELPER_AUTO is not set
+CONFIG_PTP_1588_CLOCK=y
+# CONFIG_HWMON is not set
+# CONFIG_X86_PKG_TEMP_THERMAL is not set
+CONFIG_WATCHDOG=y
+CONFIG_SOFT_WATCHDOG=y
+CONFIG_MEDIA_SUPPORT=y
+# CONFIG_DVB_TUNER_DIB0070 is not set
+# CONFIG_DVB_TUNER_DIB0090 is not set
+# CONFIG_VGA_ARB is not set
+CONFIG_DRM=y
+# CONFIG_DRM_FBDEV_EMULATION is not set
+CONFIG_DRM_VIRTIO_GPU=y
+CONFIG_FB=y
+CONFIG_SOUND=y
+CONFIG_SND=y
+CONFIG_HIDRAW=y
+CONFIG_UHID=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_LOGITECH_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGIG940_FF=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_GREENASIA=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_GADGET=y
+CONFIG_USB_DUMMY_HCD=y
+CONFIG_USB_CONFIGFS=y
+CONFIG_USB_CONFIGFS_F_FS=y
+CONFIG_USB_CONFIGFS_F_MTP=y
+CONFIG_USB_CONFIGFS_F_PTP=y
+CONFIG_USB_CONFIGFS_F_ACC=y
+CONFIG_USB_CONFIGFS_F_AUDIO_SRC=y
+CONFIG_USB_CONFIGFS_UEVENT=y
+CONFIG_USB_CONFIGFS_F_MIDI=y
+CONFIG_RTC_CLASS=y
+# CONFIG_RTC_HCTOSYS is not set
+CONFIG_SW_SYNC=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_VIRTIO_BALLOON=y
+CONFIG_VIRTIO_INPUT=y
+CONFIG_VIRTIO_MMIO=y
+CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y
+CONFIG_STAGING=y
+CONFIG_ASHMEM=y
+CONFIG_ANDROID_VSOC=y
+CONFIG_ION=y
+# CONFIG_X86_PLATFORM_DEVICES is not set
+# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+# CONFIG_FIRMWARE_MEMMAP is not set
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_POSIX_ACL=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_EXT4_ENCRYPTION=y
+CONFIG_F2FS_FS=y
+CONFIG_F2FS_FS_SECURITY=y
+CONFIG_F2FS_FS_ENCRYPTION=y
+CONFIG_QUOTA=y
+CONFIG_QUOTA_NETLINK_INTERFACE=y
+# CONFIG_PRINT_QUOTA_WARNING is not set
+CONFIG_QFMT_V2=y
+CONFIG_AUTOFS4_FS=y
+CONFIG_FUSE_FS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_HUGETLBFS=y
+CONFIG_SDCARD_FS=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_NLS_UTF8=y
+CONFIG_PRINTK_TIME=y
+CONFIG_DEBUG_INFO=y
+# CONFIG_ENABLE_WARN_DEPRECATED is not set
+# CONFIG_ENABLE_MUST_CHECK is not set
+CONFIG_FRAME_WARN=1024
+# CONFIG_UNUSED_SYMBOLS is not set
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_STACK_USAGE=y
+CONFIG_DEBUG_MEMORY_INIT=y
+CONFIG_DEBUG_STACKOVERFLOW=y
+CONFIG_LOCKUP_DETECTOR=y
+CONFIG_PANIC_TIMEOUT=5
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+CONFIG_RCU_CPU_STALL_TIMEOUT=60
+CONFIG_ENABLE_DEFAULT_TRACERS=y
+CONFIG_UPROBE_EVENT=y
+CONFIG_IO_DELAY_NONE=y
+CONFIG_DEBUG_BOOT_PARAMS=y
+CONFIG_OPTIMIZE_INLINING=y
+CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_PATH=y
+CONFIG_HARDENED_USERCOPY=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
+CONFIG_CRYPTO_RSA=y
+# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set
+CONFIG_CRYPTO_ADIANTUM=y
+CONFIG_CRYPTO_SHA512=y
+CONFIG_CRYPTO_LZ4=y
+CONFIG_CRYPTO_ZSTD=y
+CONFIG_ASYMMETRIC_KEY_TYPE=y
+CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
+CONFIG_X509_CERTIFICATE_PARSER=y
+CONFIG_SYSTEM_TRUSTED_KEYRING=y
+CONFIG_SYSTEM_TRUSTED_KEYS="verity_dev_keys.x509"
diff --git a/arch/x86/configs/x86_64_ranchu_defconfig b/arch/x86/configs/x86_64_ranchu_defconfig
new file mode 100644
index 000000000000..d50434f501fb
--- /dev/null
+++ b/arch/x86/configs/x86_64_ranchu_defconfig
@@ -0,0 +1,419 @@
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_POSIX_MQUEUE=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_SYSCTL_SYSCALL=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_EMBEDDED=y
+# CONFIG_COMPAT_BRK is not set
+CONFIG_ARCH_MMAP_RND_BITS=32
+CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16
+CONFIG_PARTITION_ADVANCED=y
+CONFIG_OSF_PARTITION=y
+CONFIG_AMIGA_PARTITION=y
+CONFIG_MAC_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+CONFIG_SGI_PARTITION=y
+CONFIG_SUN_PARTITION=y
+CONFIG_KARMA_PARTITION=y
+CONFIG_SMP=y
+CONFIG_MCORE2=y
+CONFIG_MAXSMP=y
+CONFIG_PREEMPT=y
+# CONFIG_X86_MCE is not set
+CONFIG_X86_MSR=y
+CONFIG_X86_CPUID=y
+CONFIG_KSM=y
+CONFIG_CMA=y
+# CONFIG_MTRR_SANITIZER is not set
+CONFIG_EFI=y
+CONFIG_EFI_STUB=y
+CONFIG_HZ_100=y
+CONFIG_PHYSICAL_START=0x100000
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+# CONFIG_PM_WAKELOCKS_GC is not set
+CONFIG_PM_DEBUG=y
+CONFIG_CPU_FREQ=y
+# CONFIG_CPU_FREQ_STAT is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_PCI_MMCONFIG=y
+CONFIG_PCIEPORTBUS=y
+# CONFIG_PCIEASPM is not set
+CONFIG_PCCARD=y
+CONFIG_YENTA=y
+CONFIG_HOTPLUG_PCI=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+CONFIG_BINFMT_MISC=y
+CONFIG_IA32_EMULATION=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=y
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_VERBOSE=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_IP_PNP_RARP=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+CONFIG_SYN_COOKIES=y
+CONFIG_INET_ESP=y
+# CONFIG_INET_XFRM_MODE_BEET is not set
+# CONFIG_INET_LRO is not set
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_NETLABEL=y
+CONFIG_NETFILTER=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_QTAGUID=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA2=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_CFG80211=y
+CONFIG_MAC80211=y
+CONFIG_MAC80211_LEDS=y
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_DMA_CMA=y
+CONFIG_CONNECTOR=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_VIRTIO_BLK=y
+CONFIG_BLK_DEV_SD=y
+CONFIG_BLK_DEV_SR=y
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_CHR_DEV_SG=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_SPI_ATTRS=y
+CONFIG_SCSI_ISCSI_ATTRS=y
+# CONFIG_SCSI_LOWLEVEL is not set
+CONFIG_ATA=y
+CONFIG_SATA_AHCI=y
+CONFIG_ATA_PIIX=y
+CONFIG_PATA_AMD=y
+CONFIG_PATA_OLDPIIX=y
+CONFIG_PATA_SCH=y
+CONFIG_PATA_MPIIX=y
+CONFIG_ATA_GENERIC=y
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_DEBUG=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_MIRROR=y
+CONFIG_DM_ZERO=y
+CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_NETDEVICES=y
+CONFIG_NETCONSOLE=y
+CONFIG_TUN=y
+CONFIG_VIRTIO_NET=y
+CONFIG_BNX2=y
+CONFIG_TIGON3=y
+CONFIG_NET_TULIP=y
+CONFIG_E100=y
+CONFIG_E1000=y
+CONFIG_E1000E=y
+CONFIG_SKY2=y
+CONFIG_NE2K_PCI=y
+CONFIG_FORCEDETH=y
+CONFIG_8139TOO=y
+# CONFIG_8139TOO_PIO is not set
+CONFIG_R8169=y
+CONFIG_FDDI=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PPPOLAC=y
+CONFIG_PPPOPNS=y
+CONFIG_USB_USBNET=y
+CONFIG_INPUT_POLLDEV=y
+# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_KEYRESET=y
+# CONFIG_KEYBOARD_ATKBD is not set
+CONFIG_KEYBOARD_GOLDFISH_EVENTS=y
+# CONFIG_INPUT_MOUSE is not set
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_INPUT_TABLET=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_INPUT_TOUCHSCREEN=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_KEYCHORD=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_INPUT_GPIO=y
+# CONFIG_SERIO is not set
+# CONFIG_VT is not set
+# CONFIG_LEGACY_PTYS is not set
+CONFIG_SERIAL_NONSTANDARD=y
+# CONFIG_DEVMEM is not set
+# CONFIG_DEVKMEM is not set
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_VIRTIO_CONSOLE=y
+CONFIG_NVRAM=y
+CONFIG_I2C_I801=y
+CONFIG_BATTERY_GOLDFISH=y
+CONFIG_WATCHDOG=y
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_AGP=y
+CONFIG_AGP_AMD64=y
+CONFIG_AGP_INTEL=y
+CONFIG_DRM=y
+CONFIG_FB_MODE_HELPERS=y
+CONFIG_FB_TILEBLITTING=y
+CONFIG_FB_EFI=y
+CONFIG_FB_GOLDFISH=y
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+# CONFIG_LCD_CLASS_DEVICE is not set
+CONFIG_SOUND=y
+CONFIG_SND=y
+CONFIG_HIDRAW=y
+CONFIG_UHID=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_LOGITECH_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGIG940_FF=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_GREENASIA=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_HID_PID=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_MON=y
+CONFIG_USB_EHCI_HCD=y
+# CONFIG_USB_EHCI_TT_NEWSCHED is not set
+CONFIG_USB_OHCI_HCD=y
+CONFIG_USB_UHCI_HCD=y
+CONFIG_USB_PRINTER=y
+CONFIG_USB_STORAGE=y
+CONFIG_USB_OTG_WAKELOCK=y
+CONFIG_EDAC=y
+CONFIG_RTC_CLASS=y
+# CONFIG_RTC_HCTOSYS is not set
+CONFIG_DMADEVICES=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_STAGING=y
+CONFIG_ASHMEM=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_SYNC=y
+CONFIG_SW_SYNC=y
+CONFIG_SYNC_FILE=y
+CONFIG_ION=y
+CONFIG_GOLDFISH_AUDIO=y
+CONFIG_SND_HDA_INTEL=y
+CONFIG_GOLDFISH=y
+CONFIG_GOLDFISH_PIPE=y
+CONFIG_GOLDFISH_SYNC=y
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ISCSI_IBFT_FIND=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_QUOTA=y
+CONFIG_QUOTA_NETLINK_INTERFACE=y
+# CONFIG_PRINT_QUOTA_WARNING is not set
+CONFIG_FUSE_FS=y
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_HUGETLBFS=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+# CONFIG_NETWORK_FILESYSTEMS is not set
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_NLS_UTF8=y
+CONFIG_PRINTK_TIME=y
+CONFIG_DEBUG_INFO=y
+# CONFIG_ENABLE_WARN_DEPRECATED is not set
+# CONFIG_ENABLE_MUST_CHECK is not set
+# CONFIG_UNUSED_SYMBOLS is not set
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_MEMORY_INIT=y
+CONFIG_PANIC_TIMEOUT=5
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+CONFIG_SCHED_TRACER=y
+CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
+CONFIG_KEYS=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_CRYPTO_TWOFISH=y
+CONFIG_ASYMMETRIC_KEY_TYPE=y
+CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
+CONFIG_X509_CERTIFICATE_PARSER=y
+CONFIG_PKCS7_MESSAGE_PARSER=y
+CONFIG_PKCS7_TEST_KEY=y
+# CONFIG_VIRTUALIZATION is not set
+CONFIG_CRC_T10DIF=y
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 0a5fedf43bdc..44466881cdc9 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -10,7 +10,7 @@
*/
#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
#include <linux/crypto.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -29,31 +29,31 @@ static bool chacha20_use_avx2;
static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes)
{
- u8 buf[CHACHA20_BLOCK_SIZE];
+ u8 buf[CHACHA_BLOCK_SIZE];
#ifdef CONFIG_AS_AVX2
if (chacha20_use_avx2) {
- while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
+ while (bytes >= CHACHA_BLOCK_SIZE * 8) {
chacha20_8block_xor_avx2(state, dst, src);
- bytes -= CHACHA20_BLOCK_SIZE * 8;
- src += CHACHA20_BLOCK_SIZE * 8;
- dst += CHACHA20_BLOCK_SIZE * 8;
+ bytes -= CHACHA_BLOCK_SIZE * 8;
+ src += CHACHA_BLOCK_SIZE * 8;
+ dst += CHACHA_BLOCK_SIZE * 8;
state[12] += 8;
}
}
#endif
- while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
+ while (bytes >= CHACHA_BLOCK_SIZE * 4) {
chacha20_4block_xor_ssse3(state, dst, src);
- bytes -= CHACHA20_BLOCK_SIZE * 4;
- src += CHACHA20_BLOCK_SIZE * 4;
- dst += CHACHA20_BLOCK_SIZE * 4;
+ bytes -= CHACHA_BLOCK_SIZE * 4;
+ src += CHACHA_BLOCK_SIZE * 4;
+ dst += CHACHA_BLOCK_SIZE * 4;
state[12] += 4;
}
- while (bytes >= CHACHA20_BLOCK_SIZE) {
+ while (bytes >= CHACHA_BLOCK_SIZE) {
chacha20_block_xor_ssse3(state, dst, src);
- bytes -= CHACHA20_BLOCK_SIZE;
- src += CHACHA20_BLOCK_SIZE;
- dst += CHACHA20_BLOCK_SIZE;
+ bytes -= CHACHA_BLOCK_SIZE;
+ src += CHACHA_BLOCK_SIZE;
+ dst += CHACHA_BLOCK_SIZE;
state[12]++;
}
if (bytes) {
@@ -70,24 +70,24 @@ static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst,
struct blkcipher_walk walk;
int err;
- if (nbytes <= CHACHA20_BLOCK_SIZE || !may_use_simd())
- return crypto_chacha20_crypt(desc, dst, src, nbytes);
+ if (nbytes <= CHACHA_BLOCK_SIZE || !may_use_simd())
+ return crypto_chacha_crypt(desc, dst, src, nbytes);
state = (u32 *)roundup((uintptr_t)state_buf, CHACHA20_STATE_ALIGN);
blkcipher_walk_init(&walk, dst, src, nbytes);
- err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE);
+ err = blkcipher_walk_virt_block(desc, &walk, CHACHA_BLOCK_SIZE);
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
- crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv);
+ crypto_chacha_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv);
kernel_fpu_begin();
- while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
+ while (walk.nbytes >= CHACHA_BLOCK_SIZE) {
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
- rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
+ rounddown(walk.nbytes, CHACHA_BLOCK_SIZE));
err = blkcipher_walk_done(desc, &walk,
- walk.nbytes % CHACHA20_BLOCK_SIZE);
+ walk.nbytes % CHACHA_BLOCK_SIZE);
}
if (walk.nbytes) {
@@ -108,14 +108,14 @@ static struct crypto_alg alg = {
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = 1,
.cra_type = &crypto_blkcipher_type,
- .cra_ctxsize = sizeof(struct chacha20_ctx),
+ .cra_ctxsize = sizeof(struct chacha_ctx),
.cra_alignmask = sizeof(u32) - 1,
.cra_module = THIS_MODULE,
.cra_u = {
.blkcipher = {
- .min_keysize = CHACHA20_KEY_SIZE,
- .max_keysize = CHACHA20_KEY_SIZE,
- .ivsize = CHACHA20_IV_SIZE,
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = CHACHA_IV_SIZE,
.geniv = "seqiv",
.setkey = crypto_chacha20_setkey,
.encrypt = chacha20_simd,
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index 28c372003e44..a69670b5b849 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -83,35 +83,37 @@ static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) {
if (unlikely(!sctx->wset)) {
if (!sctx->uset) {
- memcpy(sctx->u, dctx->r, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u, dctx->r);
+ memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
+ poly1305_simd_mult(sctx->u, dctx->r.r);
sctx->uset = true;
}
memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u + 5, dctx->r);
+ poly1305_simd_mult(sctx->u + 5, dctx->r.r);
memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u + 10, dctx->r);
+ poly1305_simd_mult(sctx->u + 10, dctx->r.r);
sctx->wset = true;
}
blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
- poly1305_4block_avx2(dctx->h, src, dctx->r, blocks, sctx->u);
+ poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks,
+ sctx->u);
src += POLY1305_BLOCK_SIZE * 4 * blocks;
srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
}
#endif
if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
if (unlikely(!sctx->uset)) {
- memcpy(sctx->u, dctx->r, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u, dctx->r);
+ memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
+ poly1305_simd_mult(sctx->u, dctx->r.r);
sctx->uset = true;
}
blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
- poly1305_2block_sse2(dctx->h, src, dctx->r, blocks, sctx->u);
+ poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks,
+ sctx->u);
src += POLY1305_BLOCK_SIZE * 2 * blocks;
srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
}
if (srclen >= POLY1305_BLOCK_SIZE) {
- poly1305_block_sse2(dctx->h, src, dctx->r, 1);
+ poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1);
srclen -= POLY1305_BLOCK_SIZE;
}
return srclen;
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index b0cd306dc527..b83eafa3766a 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -22,6 +22,7 @@
#include <linux/user-return-notifier.h>
#include <linux/nospec.h>
#include <linux/uprobes.h>
+#include <linux/syscalls.h>
#include <asm/desc.h>
#include <asm/traps.h>
@@ -180,6 +181,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
struct thread_info *ti = current_thread_info();
u32 cached_flags;
+ addr_limit_user_check();
+
if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
local_irq_disable();
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 870e941c1947..d7649924e5aa 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -626,13 +626,8 @@ apicinterrupt3 \num trace(\sym) smp_trace(\sym)
#endif
/* Make sure APIC interrupt handlers end up in the irqentry section: */
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
-# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
-# define POP_SECTION_IRQENTRY .popsection
-#else
-# define PUSH_SECTION_IRQENTRY
-# define POP_SECTION_IRQENTRY
-#endif
+#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
+#define POP_SECTION_IRQENTRY .popsection
.macro apicinterrupt num sym do_sym
PUSH_SECTION_IRQENTRY
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index d5409660f5de..51a858ea9158 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -171,7 +171,8 @@ quiet_cmd_vdso = VDSO $@
sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=both) \
- $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS)
+ $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS) \
+ $(filter --target=% --gcc-toolchain=%,$(KBUILD_CFLAGS))
GCOV_PROFILE := n
#
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index deca9b9c7923..6665000a03df 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -218,10 +218,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \
output, input...) \
{ \
- register void *__sp asm(_ASM_SP); \
asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\
"call %P[new2]", feature2) \
- : output, "+r" (__sp) \
+ : output, ASM_CALL_CONSTRAINT \
: [old] "i" (oldfunc), [new1] "i" (newfunc1), \
[new2] "i" (newfunc2), ## input); \
}
diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
index c5d1785373ed..02bab09707f2 100644
--- a/arch/x86/include/asm/idle.h
+++ b/arch/x86/include/asm/idle.h
@@ -1,13 +1,6 @@
#ifndef _ASM_X86_IDLE_H
#define _ASM_X86_IDLE_H
-#define IDLE_START 1
-#define IDLE_END 2
-
-struct notifier_block;
-void idle_notifier_register(struct notifier_block *n);
-void idle_notifier_unregister(struct notifier_block *n);
-
#ifdef CONFIG_X86_64
void enter_idle(void);
void exit_idle(void);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 04b79712b09c..98dbad6bd2f4 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -462,8 +462,8 @@ int paravirt_disable_iospace(void);
*/
#ifdef CONFIG_X86_32
#define PVOP_VCALL_ARGS \
- unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx; \
- register void *__sp asm("esp")
+ unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx;
+
#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x))
@@ -483,8 +483,8 @@ int paravirt_disable_iospace(void);
/* [re]ax isn't an arg, but the return val */
#define PVOP_VCALL_ARGS \
unsigned long __edi = __edi, __esi = __esi, \
- __edx = __edx, __ecx = __ecx, __eax = __eax; \
- register void *__sp asm("rsp")
+ __edx = __edx, __ecx = __ecx, __eax = __eax;
+
#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
@@ -523,7 +523,7 @@ int paravirt_disable_iospace(void);
asm volatile(pre \
paravirt_alt(PARAVIRT_CALL) \
post \
- : call_clbr, "+r" (__sp) \
+ : call_clbr, ASM_CALL_CONSTRAINT \
: paravirt_type(op), \
paravirt_clobber(clbr), \
##__VA_ARGS__ \
@@ -533,7 +533,7 @@ int paravirt_disable_iospace(void);
asm volatile(pre \
paravirt_alt(PARAVIRT_CALL) \
post \
- : call_clbr, "+r" (__sp) \
+ : call_clbr, ASM_CALL_CONSTRAINT \
: paravirt_type(op), \
paravirt_clobber(clbr), \
##__VA_ARGS__ \
@@ -560,7 +560,7 @@ int paravirt_disable_iospace(void);
asm volatile(pre \
paravirt_alt(PARAVIRT_CALL) \
post \
- : call_clbr, "+r" (__sp) \
+ : call_clbr, ASM_CALL_CONSTRAINT \
: paravirt_type(op), \
paravirt_clobber(clbr), \
##__VA_ARGS__ \
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 17f218645701..4939f6ed1a49 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -94,19 +94,14 @@ static __always_inline bool should_resched(int preempt_offset)
#ifdef CONFIG_PREEMPT
extern asmlinkage void ___preempt_schedule(void);
-# define __preempt_schedule() \
-({ \
- register void *__sp asm(_ASM_SP); \
- asm volatile ("call ___preempt_schedule" : "+r"(__sp)); \
-})
+# define __preempt_schedule() \
+ asm volatile ("call ___preempt_schedule" : ASM_CALL_CONSTRAINT)
extern asmlinkage void preempt_schedule(void);
extern asmlinkage void ___preempt_schedule_notrace(void);
-# define __preempt_schedule_notrace() \
-({ \
- register void *__sp asm(_ASM_SP); \
- asm volatile ("call ___preempt_schedule_notrace" : "+r"(__sp)); \
-})
+# define __preempt_schedule_notrace() \
+ asm volatile ("call ___preempt_schedule_notrace" : ASM_CALL_CONSTRAINT)
+
extern asmlinkage void preempt_schedule_notrace(void);
#endif
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index a34e0d4b957d..7116b7931c7b 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -103,7 +103,6 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem)
({ \
long tmp; \
struct rw_semaphore* ret; \
- register void *__sp asm(_ASM_SP); \
\
asm volatile("# beginning down_write\n\t" \
LOCK_PREFIX " xadd %1,(%4)\n\t" \
@@ -114,7 +113,8 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem)
" call " slow_path "\n" \
"1:\n" \
"# ending down_write" \
- : "+m" (sem->count), "=d" (tmp), "=a" (ret), "+r" (__sp) \
+ : "+m" (sem->count), "=d" (tmp), \
+ "=a" (ret), ASM_CALL_CONSTRAINT \
: "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \
: "memory", "cc"); \
ret; \
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 2d8788a59b4d..83252c2e67c6 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -101,6 +101,7 @@ struct thread_info {
#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */
#define TIF_X32 30 /* 32-bit native x86-64 binary */
+#define TIF_FSCHECK 31 /* Check FS is USER_DS on return */
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -124,6 +125,7 @@ struct thread_info {
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_ADDR32 (1 << TIF_ADDR32)
#define _TIF_X32 (1 << TIF_X32)
+#define _TIF_FSCHECK (1 << TIF_FSCHECK)
/*
* work to do in syscall_trace_enter(). Also includes TIF_NOHZ for
@@ -137,7 +139,7 @@ struct thread_info {
/* work to do on any return to user space */
#define _TIF_ALLWORK_MASK \
((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT | \
- _TIF_NOHZ)
+ _TIF_NOHZ | _TIF_FSCHECK)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index a8d85a687cf4..76a12ab47e11 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -8,6 +8,7 @@
#include <linux/kasan-checks.h>
#include <linux/thread_info.h>
#include <linux/string.h>
+#include <linux/sched.h>
#include <asm/asm.h>
#include <asm/page.h>
#include <asm/smap.h>
@@ -31,7 +32,12 @@
#define get_ds() (KERNEL_DS)
#define get_fs() (current->thread.addr_limit)
-#define set_fs(x) (current->thread.addr_limit = (x))
+static inline void set_fs(mm_segment_t fs)
+{
+ current->thread.addr_limit = fs;
+ /* On user-mode return, check fs is correct */
+ set_thread_flag(TIF_FSCHECK);
+}
#define segment_eq(a, b) ((a).seg == (b).seg)
@@ -171,11 +177,11 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
({ \
int __ret_gu; \
register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \
- register void *__sp asm(_ASM_SP); \
__chk_user_ptr(ptr); \
might_fault(); \
asm volatile("call __get_user_%P4" \
- : "=a" (__ret_gu), "=r" (__val_gu), "+r" (__sp) \
+ : "=a" (__ret_gu), "=r" (__val_gu), \
+ ASM_CALL_CONSTRAINT \
: "0" (ptr), "i" (sizeof(*(ptr)))); \
(x) = (__force __typeof__(*(ptr))) __val_gu; \
__builtin_expect(__ret_gu, 0); \
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index ccdc23d89b60..1fdcb8cda74a 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -112,10 +112,9 @@ extern struct { char _entry[32]; } hypercall_page[];
register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \
register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \
register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \
- register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; \
- register void *__sp asm(_ASM_SP);
+ register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5;
-#define __HYPERCALL_0PARAM "=r" (__res), "+r" (__sp)
+#define __HYPERCALL_0PARAM "=r" (__res), ASM_CALL_CONSTRAINT
#define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1)
#define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2)
#define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 00a9047539d7..e9195a139d4e 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -68,19 +68,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_tss);
#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
-static ATOMIC_NOTIFIER_HEAD(idle_notifier);
-
-void idle_notifier_register(struct notifier_block *n)
-{
- atomic_notifier_chain_register(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_register);
-
-void idle_notifier_unregister(struct notifier_block *n)
-{
- atomic_notifier_chain_unregister(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_unregister);
#endif
/*
@@ -397,14 +384,14 @@ static inline void play_dead(void)
void enter_idle(void)
{
this_cpu_write(is_idle, 1);
- atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
+ idle_notifier_call_chain(IDLE_START);
}
static void __exit_idle(void)
{
if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
return;
- atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
+ idle_notifier_call_chain(IDLE_END);
}
/* Called from interrupts to signify idle end */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9446a3a2fc69..726cc44ccf0f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8922,7 +8922,6 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
{
u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
- register void *__sp asm(_ASM_SP);
/*
* If external interrupt exists, IF bit is set in rflags/eflags on the
@@ -8956,7 +8955,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
#ifdef CONFIG_X86_64
[sp]"=&r"(tmp),
#endif
- "+r"(__sp)
+ ASM_CALL_CONSTRAINT
:
THUNK_TARGET(entry),
[ss]"i"(__KERNEL_DS),
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 5c419b8f99a0..0da3945afb48 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -758,7 +758,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
if (is_vmalloc_addr((void *)address) &&
(((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
- register void *__sp asm("rsp");
unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
/*
* We're likely to be running with very little stack space
@@ -773,7 +772,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
asm volatile ("movq %[stack], %%rsp\n\t"
"call handle_stack_overflow\n\t"
"1: jmp 1b"
- : "+r" (__sp)
+ : ASM_CALL_CONSTRAINT
: "D" ("kernel stack overflow (page fault)"),
"S" (regs), "d" (address),
[stack] "rm" (stack));
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 842ca3cab6b3..18f9dad8dc79 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -615,7 +615,7 @@ enum __force_cpu_type {
static int force_cpu_type;
-static int set_cpu_type(const char *str, struct kernel_param *kp)
+static int set_cpu_type(const char *str, const struct kernel_param *kp)
{
if (!strcmp(str, "timer")) {
force_cpu_type = timer;
diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
index 81435d995e11..fc7ca2841206 100644
--- a/arch/xtensa/include/uapi/asm/socket.h
+++ b/arch/xtensa/include/uapi/asm/socket.h
@@ -101,4 +101,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _XTENSA_SOCKET_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index 77b99bf16c83..c13af6192a35 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -40,6 +40,8 @@
#include "blk.h"
#include "blk-mq.h"
+#include <linux/math64.h>
+
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
@@ -3569,3 +3571,52 @@ int __init blk_dev_init(void)
return 0;
}
+
+/*
+ * Blk IO latency support. We want this to be as cheap as possible, so doing
+ * this lockless (and avoiding atomics), a few off by a few errors in this
+ * code is not harmful, and we don't want to do anything that is
+ * perf-impactful.
+ * TODO : If necessary, we can make the histograms per-cpu and aggregate
+ * them when printing them out.
+ */
+ssize_t
+blk_latency_hist_show(char* name, struct io_latency_state *s, char *buf,
+ int buf_size)
+{
+ int i;
+ int bytes_written = 0;
+ u_int64_t num_elem, elem;
+ int pct;
+ u_int64_t average;
+
+ num_elem = s->latency_elems;
+ if (num_elem > 0) {
+ average = div64_u64(s->latency_sum, s->latency_elems);
+ bytes_written += scnprintf(buf + bytes_written,
+ buf_size - bytes_written,
+ "IO svc_time %s Latency Histogram (n = %llu,"
+ " average = %llu):\n", name, num_elem, average);
+ for (i = 0;
+ i < ARRAY_SIZE(latency_x_axis_us);
+ i++) {
+ elem = s->latency_y_axis[i];
+ pct = div64_u64(elem * 100, num_elem);
+ bytes_written += scnprintf(buf + bytes_written,
+ PAGE_SIZE - bytes_written,
+ "\t< %6lluus%15llu%15d%%\n",
+ latency_x_axis_us[i],
+ elem, pct);
+ }
+ /* Last element in y-axis table is overflow */
+ elem = s->latency_y_axis[i];
+ pct = div64_u64(elem * 100, num_elem);
+ bytes_written += scnprintf(buf + bytes_written,
+ PAGE_SIZE - bytes_written,
+ "\t>=%6lluus%15llu%15d%%\n",
+ latency_x_axis_us[i - 1], elem, pct);
+ }
+
+ return bytes_written;
+}
+EXPORT_SYMBOL(blk_latency_hist_show);
diff --git a/build.config.cuttlefish.aarch64 b/build.config.cuttlefish.aarch64
new file mode 100644
index 000000000000..e1f750dc47e4
--- /dev/null
+++ b/build.config.cuttlefish.aarch64
@@ -0,0 +1,16 @@
+ARCH=arm64
+BRANCH=android-4.9
+CLANG_TRIPLE=aarch64-linux-gnu-
+CROSS_COMPILE=aarch64-linux-androidkernel-
+DEFCONFIG=cuttlefish_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+POST_DEFCONFIG_CMDS="check_defconfig"
+CLANG_PREBUILT_BIN=prebuilts-master/clang/host/linux-x86/clang-r346389b/bin
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/aarch64/aarch64-linux-android-4.9/bin
+FILES="
+arch/arm64/boot/Image.gz
+vmlinux
+System.map
+"
+STOP_SHIP_TRACEPRINTK=1
diff --git a/build.config.cuttlefish.x86_64 b/build.config.cuttlefish.x86_64
new file mode 100644
index 000000000000..f8e85a1ce5c7
--- /dev/null
+++ b/build.config.cuttlefish.x86_64
@@ -0,0 +1,16 @@
+ARCH=x86_64
+BRANCH=android-4.9
+CLANG_TRIPLE=x86_64-linux-gnu-
+CROSS_COMPILE=x86_64-linux-androidkernel-
+DEFCONFIG=x86_64_cuttlefish_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+POST_DEFCONFIG_CMDS="check_defconfig"
+CLANG_PREBUILT_BIN=prebuilts-master/clang/host/linux-x86/clang-r346389b/bin
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/x86/x86_64-linux-android-4.9/bin
+FILES="
+arch/x86/boot/bzImage
+vmlinux
+System.map
+"
+STOP_SHIP_TRACEPRINTK=1
diff --git a/build.config.goldfish.arm b/build.config.goldfish.arm
new file mode 100644
index 000000000000..ff5646ab4f40
--- /dev/null
+++ b/build.config.goldfish.arm
@@ -0,0 +1,13 @@
+ARCH=arm
+BRANCH=android-4.4
+CROSS_COMPILE=arm-linux-androidkernel-
+DEFCONFIG=ranchu_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/arm/arm-linux-androideabi-4.9/bin
+FILES="
+arch/arm/boot/zImage
+vmlinux
+System.map
+"
+STOP_SHIP_TRACEPRINTK=1
diff --git a/build.config.goldfish.arm64 b/build.config.goldfish.arm64
new file mode 100644
index 000000000000..4c896a679ab9
--- /dev/null
+++ b/build.config.goldfish.arm64
@@ -0,0 +1,13 @@
+ARCH=arm64
+BRANCH=android-4.4
+CROSS_COMPILE=aarch64-linux-android-
+DEFCONFIG=ranchu64_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/aarch64/aarch64-linux-android-4.9/bin
+FILES="
+arch/arm64/boot/Image
+vmlinux
+System.map
+"
+STOP_SHIP_TRACEPRINTK=1
diff --git a/build.config.goldfish.mips b/build.config.goldfish.mips
new file mode 100644
index 000000000000..9a14a444ac14
--- /dev/null
+++ b/build.config.goldfish.mips
@@ -0,0 +1,12 @@
+ARCH=mips
+BRANCH=android-4.4
+CROSS_COMPILE=mips64el-linux-android-
+DEFCONFIG=ranchu_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/mips/mips64el-linux-android-4.9/bin
+FILES="
+vmlinux
+System.map
+"
+STOP_SHIP_TRACEPRINTK=1
diff --git a/build.config.goldfish.mips64 b/build.config.goldfish.mips64
new file mode 100644
index 000000000000..6ad9759f5f4a
--- /dev/null
+++ b/build.config.goldfish.mips64
@@ -0,0 +1,12 @@
+ARCH=mips
+BRANCH=android-4.4
+CROSS_COMPILE=mips64el-linux-android-
+DEFCONFIG=ranchu64_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/mips/mips64el-linux-android-4.9/bin
+FILES="
+vmlinux
+System.map
+"
+STOP_SHIP_TRACEPRINTK=1
diff --git a/build.config.goldfish.x86 b/build.config.goldfish.x86
new file mode 100644
index 000000000000..2266c621835e
--- /dev/null
+++ b/build.config.goldfish.x86
@@ -0,0 +1,13 @@
+ARCH=x86
+BRANCH=android-4.4
+CROSS_COMPILE=x86_64-linux-android-
+DEFCONFIG=i386_ranchu_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/x86/x86_64-linux-android-4.9/bin
+FILES="
+arch/x86/boot/bzImage
+vmlinux
+System.map
+"
+STOP_SHIP_TRACEPRINTK=1
diff --git a/build.config.goldfish.x86_64 b/build.config.goldfish.x86_64
new file mode 100644
index 000000000000..08c42c2eba03
--- /dev/null
+++ b/build.config.goldfish.x86_64
@@ -0,0 +1,13 @@
+ARCH=x86_64
+BRANCH=android-4.4
+CROSS_COMPILE=x86_64-linux-android-
+DEFCONFIG=x86_64_ranchu_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/x86/x86_64-linux-android-4.9/bin
+FILES="
+arch/x86/boot/bzImage
+vmlinux
+System.map
+"
+STOP_SHIP_TRACEPRINTK=1
diff --git a/certs/system_keyring.c b/certs/system_keyring.c
index 247665054691..8cde8ea88393 100644
--- a/certs/system_keyring.c
+++ b/certs/system_keyring.c
@@ -241,5 +241,46 @@ error:
return ret;
}
EXPORT_SYMBOL_GPL(verify_pkcs7_signature);
-
#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
+
+/**
+ * verify_signature_one - Verify a signature with keys from given keyring
+ * @sig: The signature to be verified
+ * @trusted_keys: Trusted keys to use (NULL for builtin trusted keys only,
+ * (void *)1UL for all trusted keys).
+ * @keyid: key description (not partial)
+ */
+int verify_signature_one(const struct public_key_signature *sig,
+ struct key *trusted_keys, const char *keyid)
+{
+ key_ref_t ref;
+ struct key *key;
+ int ret;
+
+ if (!sig)
+ return -EBADMSG;
+ if (!trusted_keys) {
+ trusted_keys = builtin_trusted_keys;
+ } else if (trusted_keys == (void *)1UL) {
+#ifdef CONFIG_SECONDARY_TRUSTED_KEYRING
+ trusted_keys = secondary_trusted_keys;
+#else
+ trusted_keys = builtin_trusted_keys;
+#endif
+ }
+
+ ref = keyring_search(make_key_ref(trusted_keys, 1),
+ &key_type_asymmetric, keyid);
+ if (IS_ERR(ref)) {
+ pr_err("Asymmetric key (%s) not found in keyring(%s)\n",
+ keyid, trusted_keys->description);
+ return -ENOKEY;
+ }
+
+ key = key_ref_to_ptr(ref);
+ ret = verify_signature(key, sig);
+ key_put(key);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(verify_signature_one);
+
diff --git a/crypto/Kconfig b/crypto/Kconfig
index ab0d93ab5695..60ece1b8bfd3 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -372,6 +372,34 @@ config CRYPTO_KEYWRAP
Support for key wrapping (NIST SP800-38F / RFC3394) without
padding.
+config CRYPTO_NHPOLY1305
+ tristate
+ select CRYPTO_HASH
+ select CRYPTO_POLY1305
+
+config CRYPTO_ADIANTUM
+ tristate "Adiantum support"
+ select CRYPTO_CHACHA20
+ select CRYPTO_POLY1305
+ select CRYPTO_NHPOLY1305
+ help
+ Adiantum is a tweakable, length-preserving encryption mode
+ designed for fast and secure disk encryption, especially on
+ CPUs without dedicated crypto instructions. It encrypts
+ each sector using the XChaCha12 stream cipher, two passes of
+ an ε-almost-∆-universal hash function, and an invocation of
+ the AES-256 block cipher on a single 16-byte block. On CPUs
+ without AES instructions, Adiantum is much faster than
+ AES-XTS.
+
+ Adiantum's security is provably reducible to that of its
+ underlying stream and block ciphers, subject to a security
+ bound. Unlike XTS, Adiantum is a true wide-block encryption
+ mode, so it actually provides an even stronger notion of
+ security than XTS, subject to the security bound.
+
+ If unsure, say N.
+
comment "Hash modes"
config CRYPTO_CMAC
@@ -1301,18 +1329,26 @@ config CRYPTO_SALSA20_X86_64
Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
config CRYPTO_CHACHA20
- tristate "ChaCha20 cipher algorithm"
+ tristate "ChaCha stream cipher algorithms"
select CRYPTO_BLKCIPHER
help
- ChaCha20 cipher algorithm, RFC7539.
+ The ChaCha20, XChaCha20, and XChaCha12 stream cipher algorithms.
ChaCha20 is a 256-bit high-speed stream cipher designed by Daniel J.
Bernstein and further specified in RFC7539 for use in IETF protocols.
- This is the portable C implementation of ChaCha20.
-
- See also:
+ This is the portable C implementation of ChaCha20. See also:
<http://cr.yp.to/chacha/chacha-20080128.pdf>
+ XChaCha20 is the application of the XSalsa20 construction to ChaCha20
+ rather than to Salsa20. XChaCha20 extends ChaCha20's nonce length
+ from 64 bits (or 96 bits using the RFC7539 convention) to 192 bits,
+ while provably retaining ChaCha20's security. See also:
+ <https://cr.yp.to/snuffle/xsalsa-20081128.pdf>
+
+ XChaCha12 is XChaCha20 reduced to 12 rounds, with correspondingly
+ reduced security margin but increased performance. It can be needed
+ in some performance-sensitive scenarios.
+
config CRYPTO_CHACHA20_X86_64
tristate "ChaCha20 cipher algorithm (x86_64/SSSE3/AVX2)"
depends on X86 && 64BIT
@@ -1608,6 +1644,15 @@ config CRYPTO_LZ4HC
help
This is the LZ4 high compression mode algorithm.
+config CRYPTO_ZSTD
+ tristate "Zstd compression algorithm"
+ select CRYPTO_ALGAPI
+ select CRYPTO_ACOMP2
+ select ZSTD_COMPRESS
+ select ZSTD_DECOMPRESS
+ help
+ This is the zstd algorithm.
+
comment "Random Number Generation"
config CRYPTO_ANSI_CPRNG
diff --git a/crypto/Makefile b/crypto/Makefile
index 9e52b3c528df..bb20d35a26ee 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -82,6 +82,8 @@ obj-$(CONFIG_CRYPTO_LRW) += lrw.o
obj-$(CONFIG_CRYPTO_XTS) += xts.o
obj-$(CONFIG_CRYPTO_CTR) += ctr.o
obj-$(CONFIG_CRYPTO_KEYWRAP) += keywrap.o
+obj-$(CONFIG_CRYPTO_ADIANTUM) += adiantum.o
+obj-$(CONFIG_CRYPTO_NHPOLY1305) += nhpoly1305.o
obj-$(CONFIG_CRYPTO_GCM) += gcm.o
obj-$(CONFIG_CRYPTO_CCM) += ccm.o
obj-$(CONFIG_CRYPTO_CHACHA20POLY1305) += chacha20poly1305.o
@@ -107,7 +109,7 @@ obj-$(CONFIG_CRYPTO_KHAZAD) += khazad.o
obj-$(CONFIG_CRYPTO_ANUBIS) += anubis.o
obj-$(CONFIG_CRYPTO_SEED) += seed.o
obj-$(CONFIG_CRYPTO_SALSA20) += salsa20_generic.o
-obj-$(CONFIG_CRYPTO_CHACHA20) += chacha20_generic.o
+obj-$(CONFIG_CRYPTO_CHACHA20) += chacha_generic.o
obj-$(CONFIG_CRYPTO_POLY1305) += poly1305_generic.o
obj-$(CONFIG_CRYPTO_DEFLATE) += deflate.o
obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o
@@ -132,6 +134,7 @@ obj-$(CONFIG_CRYPTO_USER_API_HASH) += algif_hash.o
obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o
obj-$(CONFIG_CRYPTO_USER_API_RNG) += algif_rng.o
obj-$(CONFIG_CRYPTO_USER_API_AEAD) += algif_aead.o
+obj-$(CONFIG_CRYPTO_ZSTD) += zstd.o
#
# generic algorithms and the async_tx api
diff --git a/crypto/adiantum.c b/crypto/adiantum.c
new file mode 100644
index 000000000000..2dfcf12fd452
--- /dev/null
+++ b/crypto/adiantum.c
@@ -0,0 +1,658 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Adiantum length-preserving encryption mode
+ *
+ * Copyright 2018 Google LLC
+ */
+
+/*
+ * Adiantum is a tweakable, length-preserving encryption mode designed for fast
+ * and secure disk encryption, especially on CPUs without dedicated crypto
+ * instructions. Adiantum encrypts each sector using the XChaCha12 stream
+ * cipher, two passes of an ε-almost-∆-universal (εA∆U) hash function based on
+ * NH and Poly1305, and an invocation of the AES-256 block cipher on a single
+ * 16-byte block. See the paper for details:
+ *
+ * Adiantum: length-preserving encryption for entry-level processors
+ * (https://eprint.iacr.org/2018/720.pdf)
+ *
+ * For flexibility, this implementation also allows other ciphers:
+ *
+ * - Stream cipher: XChaCha12 or XChaCha20
+ * - Block cipher: any with a 128-bit block size and 256-bit key
+ *
+ * This implementation doesn't currently allow other εA∆U hash functions, i.e.
+ * HPolyC is not supported. This is because Adiantum is ~20% faster than HPolyC
+ * but still provably as secure, and also the εA∆U hash function of HBSH is
+ * formally defined to take two inputs (tweak, message) which makes it difficult
+ * to wrap with the crypto_shash API. Rather, some details need to be handled
+ * here. Nevertheless, if needed in the future, support for other εA∆U hash
+ * functions could be added here.
+ */
+
+#include <crypto/b128ops.h>
+#include <crypto/chacha.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/nhpoly1305.h>
+#include <crypto/scatterwalk.h>
+#include <linux/module.h>
+
+#include "internal.h"
+
+/*
+ * Size of right-hand block of input data, in bytes; also the size of the block
+ * cipher's block size and the hash function's output.
+ */
+#define BLOCKCIPHER_BLOCK_SIZE 16
+
+/* Size of the block cipher key (K_E) in bytes */
+#define BLOCKCIPHER_KEY_SIZE 32
+
+/* Size of the hash key (K_H) in bytes */
+#define HASH_KEY_SIZE (POLY1305_BLOCK_SIZE + NHPOLY1305_KEY_SIZE)
+
+/*
+ * The specification allows variable-length tweaks, but Linux's crypto API
+ * currently only allows algorithms to support a single length. The "natural"
+ * tweak length for Adiantum is 16, since that fits into one Poly1305 block for
+ * the best performance. But longer tweaks are useful for fscrypt, to avoid
+ * needing to derive per-file keys. So instead we use two blocks, or 32 bytes.
+ */
+#define TWEAK_SIZE 32
+
+struct adiantum_instance_ctx {
+ struct crypto_skcipher_spawn streamcipher_spawn;
+ struct crypto_spawn blockcipher_spawn;
+ struct crypto_shash_spawn hash_spawn;
+};
+
+struct adiantum_tfm_ctx {
+ struct crypto_skcipher *streamcipher;
+ struct crypto_cipher *blockcipher;
+ struct crypto_shash *hash;
+ struct poly1305_key header_hash_key;
+};
+
+struct adiantum_request_ctx {
+
+ /*
+ * Buffer for right-hand block of data, i.e.
+ *
+ * P_L => P_M => C_M => C_R when encrypting, or
+ * C_R => C_M => P_M => P_L when decrypting.
+ *
+ * Also used to build the IV for the stream cipher.
+ */
+ union {
+ u8 bytes[XCHACHA_IV_SIZE];
+ __le32 words[XCHACHA_IV_SIZE / sizeof(__le32)];
+ le128 bignum; /* interpret as element of Z/(2^{128}Z) */
+ } rbuf;
+
+ bool enc; /* true if encrypting, false if decrypting */
+
+ /*
+ * The result of the Poly1305 εA∆U hash function applied to
+ * (message length, tweak).
+ */
+ le128 header_hash;
+
+ /* Sub-requests, must be last */
+ union {
+ struct shash_desc hash_desc;
+ struct skcipher_request streamcipher_req;
+ } u;
+};
+
+/*
+ * Given the XChaCha stream key K_S, derive the block cipher key K_E and the
+ * hash key K_H as follows:
+ *
+ * K_E || K_H || ... = XChaCha(key=K_S, nonce=1||0^191)
+ *
+ * Note that this denotes using bits from the XChaCha keystream, which here we
+ * get indirectly by encrypting a buffer containing all 0's.
+ */
+static int adiantum_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+ struct {
+ u8 iv[XCHACHA_IV_SIZE];
+ u8 derived_keys[BLOCKCIPHER_KEY_SIZE + HASH_KEY_SIZE];
+ struct scatterlist sg;
+ struct crypto_wait wait;
+ struct skcipher_request req; /* must be last */
+ } *data;
+ u8 *keyp;
+ int err;
+
+ /* Set the stream cipher key (K_S) */
+ crypto_skcipher_clear_flags(tctx->streamcipher, CRYPTO_TFM_REQ_MASK);
+ crypto_skcipher_set_flags(tctx->streamcipher,
+ crypto_skcipher_get_flags(tfm) &
+ CRYPTO_TFM_REQ_MASK);
+ err = crypto_skcipher_setkey(tctx->streamcipher, key, keylen);
+ crypto_skcipher_set_flags(tfm,
+ crypto_skcipher_get_flags(tctx->streamcipher) &
+ CRYPTO_TFM_RES_MASK);
+ if (err)
+ return err;
+
+ /* Derive the subkeys */
+ data = kzalloc(sizeof(*data) +
+ crypto_skcipher_reqsize(tctx->streamcipher), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ data->iv[0] = 1;
+ sg_init_one(&data->sg, data->derived_keys, sizeof(data->derived_keys));
+ crypto_init_wait(&data->wait);
+ skcipher_request_set_tfm(&data->req, tctx->streamcipher);
+ skcipher_request_set_callback(&data->req, CRYPTO_TFM_REQ_MAY_SLEEP |
+ CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &data->wait);
+ skcipher_request_set_crypt(&data->req, &data->sg, &data->sg,
+ sizeof(data->derived_keys), data->iv);
+ err = crypto_wait_req(crypto_skcipher_encrypt(&data->req), &data->wait);
+ if (err)
+ goto out;
+ keyp = data->derived_keys;
+
+ /* Set the block cipher key (K_E) */
+ crypto_cipher_clear_flags(tctx->blockcipher, CRYPTO_TFM_REQ_MASK);
+ crypto_cipher_set_flags(tctx->blockcipher,
+ crypto_skcipher_get_flags(tfm) &
+ CRYPTO_TFM_REQ_MASK);
+ err = crypto_cipher_setkey(tctx->blockcipher, keyp,
+ BLOCKCIPHER_KEY_SIZE);
+ crypto_skcipher_set_flags(tfm,
+ crypto_cipher_get_flags(tctx->blockcipher) &
+ CRYPTO_TFM_RES_MASK);
+ if (err)
+ goto out;
+ keyp += BLOCKCIPHER_KEY_SIZE;
+
+ /* Set the hash key (K_H) */
+ poly1305_core_setkey(&tctx->header_hash_key, keyp);
+ keyp += POLY1305_BLOCK_SIZE;
+
+ crypto_shash_clear_flags(tctx->hash, CRYPTO_TFM_REQ_MASK);
+ crypto_shash_set_flags(tctx->hash, crypto_skcipher_get_flags(tfm) &
+ CRYPTO_TFM_REQ_MASK);
+ err = crypto_shash_setkey(tctx->hash, keyp, NHPOLY1305_KEY_SIZE);
+ crypto_skcipher_set_flags(tfm, crypto_shash_get_flags(tctx->hash) &
+ CRYPTO_TFM_RES_MASK);
+ keyp += NHPOLY1305_KEY_SIZE;
+ WARN_ON(keyp != &data->derived_keys[ARRAY_SIZE(data->derived_keys)]);
+out:
+ kzfree(data);
+ return err;
+}
+
+/* Addition in Z/(2^{128}Z) */
+static inline void le128_add(le128 *r, const le128 *v1, const le128 *v2)
+{
+ u64 x = le64_to_cpu(v1->b);
+ u64 y = le64_to_cpu(v2->b);
+
+ r->b = cpu_to_le64(x + y);
+ r->a = cpu_to_le64(le64_to_cpu(v1->a) + le64_to_cpu(v2->a) +
+ (x + y < x));
+}
+
+/* Subtraction in Z/(2^{128}Z) */
+static inline void le128_sub(le128 *r, const le128 *v1, const le128 *v2)
+{
+ u64 x = le64_to_cpu(v1->b);
+ u64 y = le64_to_cpu(v2->b);
+
+ r->b = cpu_to_le64(x - y);
+ r->a = cpu_to_le64(le64_to_cpu(v1->a) - le64_to_cpu(v2->a) -
+ (x - y > x));
+}
+
+/*
+ * Apply the Poly1305 εA∆U hash function to (message length, tweak) and save the
+ * result to rctx->header_hash.
+ *
+ * This value is reused in both the first and second hash steps. Specifically,
+ * it's added to the result of an independently keyed εA∆U hash function (for
+ * equal length inputs only) taken over the message. This gives the overall
+ * Adiantum hash of the (tweak, message) pair.
+ */
+static void adiantum_hash_header(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ const struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+ struct adiantum_request_ctx *rctx = skcipher_request_ctx(req);
+ const unsigned int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE;
+ struct {
+ __le64 message_bits;
+ __le64 padding;
+ } header = {
+ .message_bits = cpu_to_le64((u64)bulk_len * 8)
+ };
+ struct poly1305_state state;
+
+ poly1305_core_init(&state);
+
+ BUILD_BUG_ON(sizeof(header) % POLY1305_BLOCK_SIZE != 0);
+ poly1305_core_blocks(&state, &tctx->header_hash_key,
+ &header, sizeof(header) / POLY1305_BLOCK_SIZE);
+
+ BUILD_BUG_ON(TWEAK_SIZE % POLY1305_BLOCK_SIZE != 0);
+ poly1305_core_blocks(&state, &tctx->header_hash_key, req->iv,
+ TWEAK_SIZE / POLY1305_BLOCK_SIZE);
+
+ poly1305_core_emit(&state, &rctx->header_hash);
+}
+
+/* Hash the left-hand block (the "bulk") of the message using NHPoly1305 */
+static int adiantum_hash_message(struct skcipher_request *req,
+ struct scatterlist *sgl, le128 *digest)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ const struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+ struct adiantum_request_ctx *rctx = skcipher_request_ctx(req);
+ const unsigned int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE;
+ struct shash_desc *hash_desc = &rctx->u.hash_desc;
+ struct sg_mapping_iter miter;
+ unsigned int i, n;
+ int err;
+
+ hash_desc->tfm = tctx->hash;
+ hash_desc->flags = 0;
+
+ err = crypto_shash_init(hash_desc);
+ if (err)
+ return err;
+
+ sg_miter_start(&miter, sgl, sg_nents(sgl),
+ SG_MITER_FROM_SG | SG_MITER_ATOMIC);
+ for (i = 0; i < bulk_len; i += n) {
+ sg_miter_next(&miter);
+ n = min_t(unsigned int, miter.length, bulk_len - i);
+ err = crypto_shash_update(hash_desc, miter.addr, n);
+ if (err)
+ break;
+ }
+ sg_miter_stop(&miter);
+ if (err)
+ return err;
+
+ return crypto_shash_final(hash_desc, (u8 *)digest);
+}
+
+/* Continue Adiantum encryption/decryption after the stream cipher step */
+static int adiantum_finish(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ const struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+ struct adiantum_request_ctx *rctx = skcipher_request_ctx(req);
+ const unsigned int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE;
+ le128 digest;
+ int err;
+
+ /* If decrypting, decrypt C_M with the block cipher to get P_M */
+ if (!rctx->enc)
+ crypto_cipher_decrypt_one(tctx->blockcipher, rctx->rbuf.bytes,
+ rctx->rbuf.bytes);
+
+ /*
+ * Second hash step
+ * enc: C_R = C_M - H_{K_H}(T, C_L)
+ * dec: P_R = P_M - H_{K_H}(T, P_L)
+ */
+ err = adiantum_hash_message(req, req->dst, &digest);
+ if (err)
+ return err;
+ le128_add(&digest, &digest, &rctx->header_hash);
+ le128_sub(&rctx->rbuf.bignum, &rctx->rbuf.bignum, &digest);
+ scatterwalk_map_and_copy(&rctx->rbuf.bignum, req->dst,
+ bulk_len, BLOCKCIPHER_BLOCK_SIZE, 1);
+ return 0;
+}
+
+static void adiantum_streamcipher_done(struct crypto_async_request *areq,
+ int err)
+{
+ struct skcipher_request *req = areq->data;
+
+ if (!err)
+ err = adiantum_finish(req);
+
+ skcipher_request_complete(req, err);
+}
+
+static int adiantum_crypt(struct skcipher_request *req, bool enc)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ const struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+ struct adiantum_request_ctx *rctx = skcipher_request_ctx(req);
+ const unsigned int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE;
+ unsigned int stream_len;
+ le128 digest;
+ int err;
+
+ if (req->cryptlen < BLOCKCIPHER_BLOCK_SIZE)
+ return -EINVAL;
+
+ rctx->enc = enc;
+
+ /*
+ * First hash step
+ * enc: P_M = P_R + H_{K_H}(T, P_L)
+ * dec: C_M = C_R + H_{K_H}(T, C_L)
+ */
+ adiantum_hash_header(req);
+ err = adiantum_hash_message(req, req->src, &digest);
+ if (err)
+ return err;
+ le128_add(&digest, &digest, &rctx->header_hash);
+ scatterwalk_map_and_copy(&rctx->rbuf.bignum, req->src,
+ bulk_len, BLOCKCIPHER_BLOCK_SIZE, 0);
+ le128_add(&rctx->rbuf.bignum, &rctx->rbuf.bignum, &digest);
+
+ /* If encrypting, encrypt P_M with the block cipher to get C_M */
+ if (enc)
+ crypto_cipher_encrypt_one(tctx->blockcipher, rctx->rbuf.bytes,
+ rctx->rbuf.bytes);
+
+ /* Initialize the rest of the XChaCha IV (first part is C_M) */
+ BUILD_BUG_ON(BLOCKCIPHER_BLOCK_SIZE != 16);
+ BUILD_BUG_ON(XCHACHA_IV_SIZE != 32); /* nonce || stream position */
+ rctx->rbuf.words[4] = cpu_to_le32(1);
+ rctx->rbuf.words[5] = 0;
+ rctx->rbuf.words[6] = 0;
+ rctx->rbuf.words[7] = 0;
+
+ /*
+ * XChaCha needs to be done on all the data except the last 16 bytes;
+ * for disk encryption that usually means 4080 or 496 bytes. But ChaCha
+ * implementations tend to be most efficient when passed a whole number
+ * of 64-byte ChaCha blocks, or sometimes even a multiple of 256 bytes.
+ * And here it doesn't matter whether the last 16 bytes are written to,
+ * as the second hash step will overwrite them. Thus, round the XChaCha
+ * length up to the next 64-byte boundary if possible.
+ */
+ stream_len = bulk_len;
+ if (round_up(stream_len, CHACHA_BLOCK_SIZE) <= req->cryptlen)
+ stream_len = round_up(stream_len, CHACHA_BLOCK_SIZE);
+
+ skcipher_request_set_tfm(&rctx->u.streamcipher_req, tctx->streamcipher);
+ skcipher_request_set_crypt(&rctx->u.streamcipher_req, req->src,
+ req->dst, stream_len, &rctx->rbuf);
+ skcipher_request_set_callback(&rctx->u.streamcipher_req,
+ req->base.flags,
+ adiantum_streamcipher_done, req);
+ return crypto_skcipher_encrypt(&rctx->u.streamcipher_req) ?:
+ adiantum_finish(req);
+}
+
+static int adiantum_encrypt(struct skcipher_request *req)
+{
+ return adiantum_crypt(req, true);
+}
+
+static int adiantum_decrypt(struct skcipher_request *req)
+{
+ return adiantum_crypt(req, false);
+}
+
+static int adiantum_init_tfm(struct crypto_skcipher *tfm)
+{
+ struct skcipher_instance *inst = skcipher_alg_instance(tfm);
+ struct adiantum_instance_ctx *ictx = skcipher_instance_ctx(inst);
+ struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+ struct crypto_skcipher *streamcipher;
+ struct crypto_cipher *blockcipher;
+ struct crypto_shash *hash;
+ unsigned int subreq_size;
+ int err;
+
+ streamcipher = crypto_spawn_skcipher(&ictx->streamcipher_spawn);
+ if (IS_ERR(streamcipher))
+ return PTR_ERR(streamcipher);
+
+ blockcipher = crypto_spawn_cipher(&ictx->blockcipher_spawn);
+ if (IS_ERR(blockcipher)) {
+ err = PTR_ERR(blockcipher);
+ goto err_free_streamcipher;
+ }
+
+ hash = crypto_spawn_shash(&ictx->hash_spawn);
+ if (IS_ERR(hash)) {
+ err = PTR_ERR(hash);
+ goto err_free_blockcipher;
+ }
+
+ tctx->streamcipher = streamcipher;
+ tctx->blockcipher = blockcipher;
+ tctx->hash = hash;
+
+ BUILD_BUG_ON(offsetofend(struct adiantum_request_ctx, u) !=
+ sizeof(struct adiantum_request_ctx));
+ subreq_size = max(FIELD_SIZEOF(struct adiantum_request_ctx,
+ u.hash_desc) +
+ crypto_shash_descsize(hash),
+ FIELD_SIZEOF(struct adiantum_request_ctx,
+ u.streamcipher_req) +
+ crypto_skcipher_reqsize(streamcipher));
+
+ crypto_skcipher_set_reqsize(tfm,
+ offsetof(struct adiantum_request_ctx, u) +
+ subreq_size);
+ return 0;
+
+err_free_blockcipher:
+ crypto_free_cipher(blockcipher);
+err_free_streamcipher:
+ crypto_free_skcipher(streamcipher);
+ return err;
+}
+
+static void adiantum_exit_tfm(struct crypto_skcipher *tfm)
+{
+ struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+
+ crypto_free_skcipher(tctx->streamcipher);
+ crypto_free_cipher(tctx->blockcipher);
+ crypto_free_shash(tctx->hash);
+}
+
+static void adiantum_free_instance(struct skcipher_instance *inst)
+{
+ struct adiantum_instance_ctx *ictx = skcipher_instance_ctx(inst);
+
+ crypto_drop_skcipher(&ictx->streamcipher_spawn);
+ crypto_drop_spawn(&ictx->blockcipher_spawn);
+ crypto_drop_shash(&ictx->hash_spawn);
+ kfree(inst);
+}
+
+/*
+ * Check for a supported set of inner algorithms.
+ * See the comment at the beginning of this file.
+ */
+static bool adiantum_supported_algorithms(struct skcipher_alg *streamcipher_alg,
+ struct crypto_alg *blockcipher_alg,
+ struct shash_alg *hash_alg)
+{
+ if (strcmp(streamcipher_alg->base.cra_name, "xchacha12") != 0 &&
+ strcmp(streamcipher_alg->base.cra_name, "xchacha20") != 0)
+ return false;
+
+ if (blockcipher_alg->cra_cipher.cia_min_keysize > BLOCKCIPHER_KEY_SIZE ||
+ blockcipher_alg->cra_cipher.cia_max_keysize < BLOCKCIPHER_KEY_SIZE)
+ return false;
+ if (blockcipher_alg->cra_blocksize != BLOCKCIPHER_BLOCK_SIZE)
+ return false;
+
+ if (strcmp(hash_alg->base.cra_name, "nhpoly1305") != 0)
+ return false;
+
+ return true;
+}
+
+static int adiantum_create(struct crypto_template *tmpl, struct rtattr **tb)
+{
+ struct crypto_attr_type *algt;
+ const char *streamcipher_name;
+ const char *blockcipher_name;
+ const char *nhpoly1305_name;
+ struct skcipher_instance *inst;
+ struct adiantum_instance_ctx *ictx;
+ struct skcipher_alg *streamcipher_alg;
+ struct crypto_alg *blockcipher_alg;
+ struct crypto_alg *_hash_alg;
+ struct shash_alg *hash_alg;
+ int err;
+
+ algt = crypto_get_attr_type(tb);
+ if (IS_ERR(algt))
+ return PTR_ERR(algt);
+
+ if ((algt->type ^ CRYPTO_ALG_TYPE_SKCIPHER) & algt->mask)
+ return -EINVAL;
+
+ streamcipher_name = crypto_attr_alg_name(tb[1]);
+ if (IS_ERR(streamcipher_name))
+ return PTR_ERR(streamcipher_name);
+
+ blockcipher_name = crypto_attr_alg_name(tb[2]);
+ if (IS_ERR(blockcipher_name))
+ return PTR_ERR(blockcipher_name);
+
+ nhpoly1305_name = crypto_attr_alg_name(tb[3]);
+ if (nhpoly1305_name == ERR_PTR(-ENOENT))
+ nhpoly1305_name = "nhpoly1305";
+ if (IS_ERR(nhpoly1305_name))
+ return PTR_ERR(nhpoly1305_name);
+
+ inst = kzalloc(sizeof(*inst) + sizeof(*ictx), GFP_KERNEL);
+ if (!inst)
+ return -ENOMEM;
+ ictx = skcipher_instance_ctx(inst);
+
+ /* Stream cipher, e.g. "xchacha12" */
+ err = crypto_grab_skcipher(&ictx->streamcipher_spawn, streamcipher_name,
+ 0, crypto_requires_sync(algt->type,
+ algt->mask));
+ if (err)
+ goto out_free_inst;
+ streamcipher_alg = crypto_spawn_skcipher_alg(&ictx->streamcipher_spawn);
+
+ /* Block cipher, e.g. "aes" */
+ err = crypto_grab_spawn(&ictx->blockcipher_spawn, blockcipher_name,
+ CRYPTO_ALG_TYPE_CIPHER, CRYPTO_ALG_TYPE_MASK);
+ if (err)
+ goto out_drop_streamcipher;
+ blockcipher_alg = ictx->blockcipher_spawn.alg;
+
+ /* NHPoly1305 εA∆U hash function */
+ _hash_alg = crypto_alg_mod_lookup(nhpoly1305_name,
+ CRYPTO_ALG_TYPE_SHASH,
+ CRYPTO_ALG_TYPE_MASK);
+ if (IS_ERR(_hash_alg)) {
+ err = PTR_ERR(_hash_alg);
+ goto out_drop_blockcipher;
+ }
+ hash_alg = __crypto_shash_alg(_hash_alg);
+ err = crypto_init_shash_spawn(&ictx->hash_spawn, hash_alg,
+ skcipher_crypto_instance(inst));
+ if (err) {
+ crypto_mod_put(_hash_alg);
+ goto out_drop_blockcipher;
+ }
+
+ /* Check the set of algorithms */
+ if (!adiantum_supported_algorithms(streamcipher_alg, blockcipher_alg,
+ hash_alg)) {
+ pr_warn("Unsupported Adiantum instantiation: (%s,%s,%s)\n",
+ streamcipher_alg->base.cra_name,
+ blockcipher_alg->cra_name, hash_alg->base.cra_name);
+ err = -EINVAL;
+ goto out_drop_hash;
+ }
+
+ /* Instance fields */
+
+ err = -ENAMETOOLONG;
+ if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
+ "adiantum(%s,%s)", streamcipher_alg->base.cra_name,
+ blockcipher_alg->cra_name) >= CRYPTO_MAX_ALG_NAME)
+ goto out_drop_hash;
+ if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME,
+ "adiantum(%s,%s,%s)",
+ streamcipher_alg->base.cra_driver_name,
+ blockcipher_alg->cra_driver_name,
+ hash_alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
+ goto out_drop_hash;
+
+ inst->alg.base.cra_blocksize = BLOCKCIPHER_BLOCK_SIZE;
+ inst->alg.base.cra_ctxsize = sizeof(struct adiantum_tfm_ctx);
+ inst->alg.base.cra_alignmask = streamcipher_alg->base.cra_alignmask |
+ hash_alg->base.cra_alignmask;
+ /*
+ * The block cipher is only invoked once per message, so for long
+ * messages (e.g. sectors for disk encryption) its performance doesn't
+ * matter as much as that of the stream cipher and hash function. Thus,
+ * weigh the block cipher's ->cra_priority less.
+ */
+ inst->alg.base.cra_priority = (4 * streamcipher_alg->base.cra_priority +
+ 2 * hash_alg->base.cra_priority +
+ blockcipher_alg->cra_priority) / 7;
+
+ inst->alg.setkey = adiantum_setkey;
+ inst->alg.encrypt = adiantum_encrypt;
+ inst->alg.decrypt = adiantum_decrypt;
+ inst->alg.init = adiantum_init_tfm;
+ inst->alg.exit = adiantum_exit_tfm;
+ inst->alg.min_keysize = crypto_skcipher_alg_min_keysize(streamcipher_alg);
+ inst->alg.max_keysize = crypto_skcipher_alg_max_keysize(streamcipher_alg);
+ inst->alg.ivsize = TWEAK_SIZE;
+
+ inst->free = adiantum_free_instance;
+
+ err = skcipher_register_instance(tmpl, inst);
+ if (err)
+ goto out_drop_hash;
+
+ return 0;
+
+out_drop_hash:
+ crypto_drop_shash(&ictx->hash_spawn);
+out_drop_blockcipher:
+ crypto_drop_spawn(&ictx->blockcipher_spawn);
+out_drop_streamcipher:
+ crypto_drop_skcipher(&ictx->streamcipher_spawn);
+out_free_inst:
+ kfree(inst);
+ return err;
+}
+
+/* adiantum(streamcipher_name, blockcipher_name [, nhpoly1305_name]) */
+static struct crypto_template adiantum_tmpl = {
+ .name = "adiantum",
+ .create = adiantum_create,
+ .module = THIS_MODULE,
+};
+
+static int __init adiantum_module_init(void)
+{
+ return crypto_register_template(&adiantum_tmpl);
+}
+
+static void __exit adiantum_module_exit(void)
+{
+ crypto_unregister_template(&adiantum_tmpl);
+}
+
+module_init(adiantum_module_init);
+module_exit(adiantum_module_exit);
+
+MODULE_DESCRIPTION("Adiantum length-preserving encryption mode");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("adiantum");
diff --git a/crypto/aes_generic.c b/crypto/aes_generic.c
index 3dd101144a58..956f8b859d4b 100644
--- a/crypto/aes_generic.c
+++ b/crypto/aes_generic.c
@@ -62,7 +62,8 @@ static inline u8 byte(const u32 x, const unsigned n)
static const u32 rco_tab[10] = { 1, 2, 4, 8, 16, 32, 64, 128, 27, 54 };
-__visible const u32 crypto_ft_tab[4][256] = {
+/* cacheline-aligned to facilitate prefetching into cache */
+__visible const u32 crypto_ft_tab[4][256] __cacheline_aligned = {
{
0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6,
0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591,
@@ -326,7 +327,7 @@ __visible const u32 crypto_ft_tab[4][256] = {
}
};
-__visible const u32 crypto_fl_tab[4][256] = {
+__visible const u32 crypto_fl_tab[4][256] __cacheline_aligned = {
{
0x00000063, 0x0000007c, 0x00000077, 0x0000007b,
0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5,
@@ -590,7 +591,7 @@ __visible const u32 crypto_fl_tab[4][256] = {
}
};
-__visible const u32 crypto_it_tab[4][256] = {
+__visible const u32 crypto_it_tab[4][256] __cacheline_aligned = {
{
0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a,
0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b,
@@ -854,7 +855,7 @@ __visible const u32 crypto_it_tab[4][256] = {
}
};
-__visible const u32 crypto_il_tab[4][256] = {
+__visible const u32 crypto_il_tab[4][256] __cacheline_aligned = {
{
0x00000052, 0x00000009, 0x0000006a, 0x000000d5,
0x00000030, 0x00000036, 0x000000a5, 0x00000038,
diff --git a/crypto/api.c b/crypto/api.c
index abf53e67e3d8..f12d6b966608 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -24,6 +24,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/completion.h>
#include "internal.h"
LIST_HEAD(crypto_alg_list);
@@ -611,5 +612,17 @@ int crypto_has_alg(const char *name, u32 type, u32 mask)
}
EXPORT_SYMBOL_GPL(crypto_has_alg);
+void crypto_req_done(struct crypto_async_request *req, int err)
+{
+ struct crypto_wait *wait = req->data;
+
+ if (err == -EINPROGRESS)
+ return;
+
+ wait->err = err;
+ complete(&wait->completion);
+}
+EXPORT_SYMBOL_GPL(crypto_req_done);
+
MODULE_DESCRIPTION("Cryptographic core API");
MODULE_LICENSE("GPL");
diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c
deleted file mode 100644
index 1cab83146e33..000000000000
--- a/crypto/chacha20_generic.c
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/algapi.h>
-#include <linux/crypto.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <crypto/chacha20.h>
-
-static inline u32 le32_to_cpuvp(const void *p)
-{
- return le32_to_cpup(p);
-}
-
-static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src,
- unsigned int bytes)
-{
- u8 stream[CHACHA20_BLOCK_SIZE];
-
- if (dst != src)
- memcpy(dst, src, bytes);
-
- while (bytes >= CHACHA20_BLOCK_SIZE) {
- chacha20_block(state, stream);
- crypto_xor(dst, stream, CHACHA20_BLOCK_SIZE);
- bytes -= CHACHA20_BLOCK_SIZE;
- dst += CHACHA20_BLOCK_SIZE;
- }
- if (bytes) {
- chacha20_block(state, stream);
- crypto_xor(dst, stream, bytes);
- }
-}
-
-void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv)
-{
- static const char constant[16] = "expand 32-byte k";
-
- state[0] = le32_to_cpuvp(constant + 0);
- state[1] = le32_to_cpuvp(constant + 4);
- state[2] = le32_to_cpuvp(constant + 8);
- state[3] = le32_to_cpuvp(constant + 12);
- state[4] = ctx->key[0];
- state[5] = ctx->key[1];
- state[6] = ctx->key[2];
- state[7] = ctx->key[3];
- state[8] = ctx->key[4];
- state[9] = ctx->key[5];
- state[10] = ctx->key[6];
- state[11] = ctx->key[7];
- state[12] = le32_to_cpuvp(iv + 0);
- state[13] = le32_to_cpuvp(iv + 4);
- state[14] = le32_to_cpuvp(iv + 8);
- state[15] = le32_to_cpuvp(iv + 12);
-}
-EXPORT_SYMBOL_GPL(crypto_chacha20_init);
-
-int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
- unsigned int keysize)
-{
- struct chacha20_ctx *ctx = crypto_tfm_ctx(tfm);
- int i;
-
- if (keysize != CHACHA20_KEY_SIZE)
- return -EINVAL;
-
- for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
- ctx->key[i] = le32_to_cpuvp(key + i * sizeof(u32));
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(crypto_chacha20_setkey);
-
-int crypto_chacha20_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
- struct scatterlist *src, unsigned int nbytes)
-{
- struct blkcipher_walk walk;
- u32 state[16];
- int err;
-
- blkcipher_walk_init(&walk, dst, src, nbytes);
- err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE);
-
- crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv);
-
- while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
- chacha20_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
- rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
- err = blkcipher_walk_done(desc, &walk,
- walk.nbytes % CHACHA20_BLOCK_SIZE);
- }
-
- if (walk.nbytes) {
- chacha20_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
- walk.nbytes);
- err = blkcipher_walk_done(desc, &walk, 0);
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(crypto_chacha20_crypt);
-
-static struct crypto_alg alg = {
- .cra_name = "chacha20",
- .cra_driver_name = "chacha20-generic",
- .cra_priority = 100,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
- .cra_blocksize = 1,
- .cra_type = &crypto_blkcipher_type,
- .cra_ctxsize = sizeof(struct chacha20_ctx),
- .cra_alignmask = sizeof(u32) - 1,
- .cra_module = THIS_MODULE,
- .cra_u = {
- .blkcipher = {
- .min_keysize = CHACHA20_KEY_SIZE,
- .max_keysize = CHACHA20_KEY_SIZE,
- .ivsize = CHACHA20_IV_SIZE,
- .geniv = "seqiv",
- .setkey = crypto_chacha20_setkey,
- .encrypt = crypto_chacha20_crypt,
- .decrypt = crypto_chacha20_crypt,
- },
- },
-};
-
-static int __init chacha20_generic_mod_init(void)
-{
- return crypto_register_alg(&alg);
-}
-
-static void __exit chacha20_generic_mod_fini(void)
-{
- crypto_unregister_alg(&alg);
-}
-
-module_init(chacha20_generic_mod_init);
-module_exit(chacha20_generic_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
-MODULE_DESCRIPTION("chacha20 cipher algorithm");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-generic");
diff --git a/crypto/chacha20poly1305.c b/crypto/chacha20poly1305.c
index cb1c3a3287b0..c0f699c31625 100644
--- a/crypto/chacha20poly1305.c
+++ b/crypto/chacha20poly1305.c
@@ -13,7 +13,7 @@
#include <crypto/internal/hash.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
#include <crypto/poly1305.h>
#include <linux/err.h>
#include <linux/init.h>
@@ -51,7 +51,7 @@ struct poly_req {
};
struct chacha_req {
- u8 iv[CHACHA20_IV_SIZE];
+ u8 iv[CHACHA_IV_SIZE];
struct scatterlist src[1];
struct skcipher_request req; /* must be last member */
};
@@ -91,7 +91,7 @@ static void chacha_iv(u8 *iv, struct aead_request *req, u32 icb)
memcpy(iv, &leicb, sizeof(leicb));
memcpy(iv + sizeof(leicb), ctx->salt, ctx->saltlen);
memcpy(iv + sizeof(leicb) + ctx->saltlen, req->iv,
- CHACHA20_IV_SIZE - sizeof(leicb) - ctx->saltlen);
+ CHACHA_IV_SIZE - sizeof(leicb) - ctx->saltlen);
}
static int poly_verify_tag(struct aead_request *req)
@@ -494,7 +494,7 @@ static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key,
struct chachapoly_ctx *ctx = crypto_aead_ctx(aead);
int err;
- if (keylen != ctx->saltlen + CHACHA20_KEY_SIZE)
+ if (keylen != ctx->saltlen + CHACHA_KEY_SIZE)
return -EINVAL;
keylen -= ctx->saltlen;
@@ -639,7 +639,7 @@ static int chachapoly_create(struct crypto_template *tmpl, struct rtattr **tb,
err = -EINVAL;
/* Need 16-byte IV size, including Initial Block Counter value */
- if (crypto_skcipher_alg_ivsize(chacha) != CHACHA20_IV_SIZE)
+ if (crypto_skcipher_alg_ivsize(chacha) != CHACHA_IV_SIZE)
goto out_drop_chacha;
/* Not a stream cipher? */
if (chacha->base.cra_blocksize != 1)
diff --git a/crypto/chacha_generic.c b/crypto/chacha_generic.c
new file mode 100644
index 000000000000..dda06a8feb52
--- /dev/null
+++ b/crypto/chacha_generic.c
@@ -0,0 +1,243 @@
+/*
+ * ChaCha and XChaCha stream ciphers, including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2018 Google LLC
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <asm/unaligned.h>
+#include <crypto/algapi.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <crypto/chacha.h>
+
+static void chacha_docrypt(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ /* aligned to potentially speed up crypto_xor() */
+ u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));
+
+ if (dst != src)
+ memcpy(dst, src, bytes);
+
+ while (bytes >= CHACHA_BLOCK_SIZE) {
+ chacha_block(state, stream, nrounds);
+ crypto_xor(dst, stream, CHACHA_BLOCK_SIZE);
+ bytes -= CHACHA_BLOCK_SIZE;
+ dst += CHACHA_BLOCK_SIZE;
+ }
+ if (bytes) {
+ chacha_block(state, stream, nrounds);
+ crypto_xor(dst, stream, bytes);
+ }
+}
+
+static int chacha_stream_xor(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes,
+ struct chacha_ctx *ctx, u8 *iv)
+{
+ struct blkcipher_walk walk;
+ u32 state[16];
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, CHACHA_BLOCK_SIZE);
+
+ crypto_chacha_init(state, ctx, iv);
+
+ while (walk.nbytes >= CHACHA_BLOCK_SIZE) {
+ chacha_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
+ rounddown(walk.nbytes, CHACHA_BLOCK_SIZE),
+ ctx->nrounds);
+ err = blkcipher_walk_done(desc, &walk,
+ walk.nbytes % CHACHA_BLOCK_SIZE);
+ }
+
+ if (walk.nbytes) {
+ chacha_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes, ctx->nrounds);
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+
+ return err;
+}
+
+void crypto_chacha_init(u32 *state, struct chacha_ctx *ctx, u8 *iv)
+{
+ state[0] = 0x61707865; /* "expa" */
+ state[1] = 0x3320646e; /* "nd 3" */
+ state[2] = 0x79622d32; /* "2-by" */
+ state[3] = 0x6b206574; /* "te k" */
+ state[4] = ctx->key[0];
+ state[5] = ctx->key[1];
+ state[6] = ctx->key[2];
+ state[7] = ctx->key[3];
+ state[8] = ctx->key[4];
+ state[9] = ctx->key[5];
+ state[10] = ctx->key[6];
+ state[11] = ctx->key[7];
+ state[12] = get_unaligned_le32(iv + 0);
+ state[13] = get_unaligned_le32(iv + 4);
+ state[14] = get_unaligned_le32(iv + 8);
+ state[15] = get_unaligned_le32(iv + 12);
+}
+EXPORT_SYMBOL_GPL(crypto_chacha_init);
+
+static int chacha_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keysize, int nrounds)
+{
+ struct chacha_ctx *ctx = crypto_tfm_ctx(tfm);
+ int i;
+
+ if (keysize != CHACHA_KEY_SIZE)
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
+ ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32));
+
+ ctx->nrounds = nrounds;
+ return 0;
+}
+
+int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keysize)
+{
+ return chacha_setkey(tfm, key, keysize, 20);
+}
+EXPORT_SYMBOL_GPL(crypto_chacha20_setkey);
+
+int crypto_chacha12_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keysize)
+{
+ return chacha_setkey(tfm, key, keysize, 12);
+}
+EXPORT_SYMBOL_GPL(crypto_chacha12_setkey);
+
+int crypto_chacha_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct chacha_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ u8 *iv = desc->info;
+
+ return chacha_stream_xor(desc, dst, src, nbytes, ctx, iv);
+}
+EXPORT_SYMBOL_GPL(crypto_chacha_crypt);
+
+int crypto_xchacha_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct chacha_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ u8 *iv = desc->info;
+ struct chacha_ctx subctx;
+ u32 state[16];
+ u8 real_iv[16];
+
+ /* Compute the subkey given the original key and first 128 nonce bits */
+ crypto_chacha_init(state, ctx, iv);
+ hchacha_block(state, subctx.key, ctx->nrounds);
+ subctx.nrounds = ctx->nrounds;
+
+ /* Build the real IV */
+ memcpy(&real_iv[0], iv + 24, 8); /* stream position */
+ memcpy(&real_iv[8], iv + 16, 8); /* remaining 64 nonce bits */
+
+ /* Generate the stream and XOR it with the data */
+ return chacha_stream_xor(desc, dst, src, nbytes, &subctx, real_iv);
+}
+EXPORT_SYMBOL_GPL(crypto_xchacha_crypt);
+
+static struct crypto_alg algs[] = {
+ {
+ .cra_name = "chacha20",
+ .cra_driver_name = "chacha20-generic",
+ .cra_priority = 100,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = 1,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_ctxsize = sizeof(struct chacha_ctx),
+ .cra_alignmask = sizeof(u32) - 1,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = CHACHA_IV_SIZE,
+ .geniv = "seqiv",
+ .setkey = crypto_chacha20_setkey,
+ .encrypt = crypto_chacha_crypt,
+ .decrypt = crypto_chacha_crypt,
+ },
+ },
+ }, {
+ .cra_name = "xchacha20",
+ .cra_driver_name = "xchacha20-generic",
+ .cra_priority = 100,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = 1,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_ctxsize = sizeof(struct chacha_ctx),
+ .cra_alignmask = sizeof(u32) - 1,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .geniv = "seqiv",
+ .setkey = crypto_chacha20_setkey,
+ .encrypt = crypto_xchacha_crypt,
+ .decrypt = crypto_xchacha_crypt,
+ },
+ },
+ }, {
+ .cra_name = "xchacha12",
+ .cra_driver_name = "xchacha12-generic",
+ .cra_priority = 100,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = 1,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_ctxsize = sizeof(struct chacha_ctx),
+ .cra_alignmask = sizeof(u32) - 1,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .geniv = "seqiv",
+ .setkey = crypto_chacha12_setkey,
+ .encrypt = crypto_xchacha_crypt,
+ .decrypt = crypto_xchacha_crypt,
+ },
+ },
+ },
+};
+
+static int __init chacha_generic_mod_init(void)
+{
+ return crypto_register_algs(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit chacha_generic_mod_fini(void)
+{
+ crypto_unregister_algs(algs, ARRAY_SIZE(algs));
+}
+
+module_init(chacha_generic_mod_init);
+module_exit(chacha_generic_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (generic)");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-generic");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-generic");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-generic");
diff --git a/crypto/nhpoly1305.c b/crypto/nhpoly1305.c
new file mode 100644
index 000000000000..c8385853f699
--- /dev/null
+++ b/crypto/nhpoly1305.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ *
+ * Copyright 2018 Google LLC
+ */
+
+/*
+ * "NHPoly1305" is the main component of Adiantum hashing.
+ * Specifically, it is the calculation
+ *
+ * H_M ← Poly1305_{K_M}(NH_{K_N}(pad_{128}(M)))
+ *
+ * from the procedure in section A.5 of the Adiantum paper [1]. It is an
+ * ε-almost-∆-universal (εA∆U) hash function for equal-length inputs over
+ * Z/(2^{128}Z), where the "∆" operation is addition. It hashes 1024-byte
+ * chunks of the input with the NH hash function [2], reducing the input length
+ * by 32x. The resulting NH digests are evaluated as a polynomial in
+ * GF(2^{130}-5), like in the Poly1305 MAC [3]. Note that the polynomial
+ * evaluation by itself would suffice to achieve the εA∆U property; NH is used
+ * for performance since it's over twice as fast as Poly1305.
+ *
+ * This is *not* a cryptographic hash function; do not use it as such!
+ *
+ * [1] Adiantum: length-preserving encryption for entry-level processors
+ * (https://eprint.iacr.org/2018/720.pdf)
+ * [2] UMAC: Fast and Secure Message Authentication
+ * (https://fastcrypto.org/umac/umac_proc.pdf)
+ * [3] The Poly1305-AES message-authentication code
+ * (https://cr.yp.to/mac/poly1305-20050329.pdf)
+ */
+
+#include <asm/unaligned.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+static void nh_generic(const u32 *key, const u8 *message, size_t message_len,
+ __le64 hash[NH_NUM_PASSES])
+{
+ u64 sums[4] = { 0, 0, 0, 0 };
+
+ BUILD_BUG_ON(NH_PAIR_STRIDE != 2);
+ BUILD_BUG_ON(NH_NUM_PASSES != 4);
+
+ while (message_len) {
+ u32 m0 = get_unaligned_le32(message + 0);
+ u32 m1 = get_unaligned_le32(message + 4);
+ u32 m2 = get_unaligned_le32(message + 8);
+ u32 m3 = get_unaligned_le32(message + 12);
+
+ sums[0] += (u64)(u32)(m0 + key[ 0]) * (u32)(m2 + key[ 2]);
+ sums[1] += (u64)(u32)(m0 + key[ 4]) * (u32)(m2 + key[ 6]);
+ sums[2] += (u64)(u32)(m0 + key[ 8]) * (u32)(m2 + key[10]);
+ sums[3] += (u64)(u32)(m0 + key[12]) * (u32)(m2 + key[14]);
+ sums[0] += (u64)(u32)(m1 + key[ 1]) * (u32)(m3 + key[ 3]);
+ sums[1] += (u64)(u32)(m1 + key[ 5]) * (u32)(m3 + key[ 7]);
+ sums[2] += (u64)(u32)(m1 + key[ 9]) * (u32)(m3 + key[11]);
+ sums[3] += (u64)(u32)(m1 + key[13]) * (u32)(m3 + key[15]);
+ key += NH_MESSAGE_UNIT / sizeof(key[0]);
+ message += NH_MESSAGE_UNIT;
+ message_len -= NH_MESSAGE_UNIT;
+ }
+
+ hash[0] = cpu_to_le64(sums[0]);
+ hash[1] = cpu_to_le64(sums[1]);
+ hash[2] = cpu_to_le64(sums[2]);
+ hash[3] = cpu_to_le64(sums[3]);
+}
+
+/* Pass the next NH hash value through Poly1305 */
+static void process_nh_hash_value(struct nhpoly1305_state *state,
+ const struct nhpoly1305_key *key)
+{
+ BUILD_BUG_ON(NH_HASH_BYTES % POLY1305_BLOCK_SIZE != 0);
+
+ poly1305_core_blocks(&state->poly_state, &key->poly_key, state->nh_hash,
+ NH_HASH_BYTES / POLY1305_BLOCK_SIZE);
+}
+
+/*
+ * Feed the next portion of the source data, as a whole number of 16-byte
+ * "NH message units", through NH and Poly1305. Each NH hash is taken over
+ * 1024 bytes, except possibly the final one which is taken over a multiple of
+ * 16 bytes up to 1024. Also, in the case where data is passed in misaligned
+ * chunks, we combine partial hashes; the end result is the same either way.
+ */
+static void nhpoly1305_units(struct nhpoly1305_state *state,
+ const struct nhpoly1305_key *key,
+ const u8 *src, unsigned int srclen, nh_t nh_fn)
+{
+ do {
+ unsigned int bytes;
+
+ if (state->nh_remaining == 0) {
+ /* Starting a new NH message */
+ bytes = min_t(unsigned int, srclen, NH_MESSAGE_BYTES);
+ nh_fn(key->nh_key, src, bytes, state->nh_hash);
+ state->nh_remaining = NH_MESSAGE_BYTES - bytes;
+ } else {
+ /* Continuing a previous NH message */
+ __le64 tmp_hash[NH_NUM_PASSES];
+ unsigned int pos;
+ int i;
+
+ pos = NH_MESSAGE_BYTES - state->nh_remaining;
+ bytes = min(srclen, state->nh_remaining);
+ nh_fn(&key->nh_key[pos / 4], src, bytes, tmp_hash);
+ for (i = 0; i < NH_NUM_PASSES; i++)
+ le64_add_cpu(&state->nh_hash[i],
+ le64_to_cpu(tmp_hash[i]));
+ state->nh_remaining -= bytes;
+ }
+ if (state->nh_remaining == 0)
+ process_nh_hash_value(state, key);
+ src += bytes;
+ srclen -= bytes;
+ } while (srclen);
+}
+
+int crypto_nhpoly1305_setkey(struct crypto_shash *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ struct nhpoly1305_key *ctx = crypto_shash_ctx(tfm);
+ int i;
+
+ if (keylen != NHPOLY1305_KEY_SIZE)
+ return -EINVAL;
+
+ poly1305_core_setkey(&ctx->poly_key, key);
+ key += POLY1305_BLOCK_SIZE;
+
+ for (i = 0; i < NH_KEY_WORDS; i++)
+ ctx->nh_key[i] = get_unaligned_le32(key + i * sizeof(u32));
+
+ return 0;
+}
+EXPORT_SYMBOL(crypto_nhpoly1305_setkey);
+
+int crypto_nhpoly1305_init(struct shash_desc *desc)
+{
+ struct nhpoly1305_state *state = shash_desc_ctx(desc);
+
+ poly1305_core_init(&state->poly_state);
+ state->buflen = 0;
+ state->nh_remaining = 0;
+ return 0;
+}
+EXPORT_SYMBOL(crypto_nhpoly1305_init);
+
+int crypto_nhpoly1305_update_helper(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen,
+ nh_t nh_fn)
+{
+ struct nhpoly1305_state *state = shash_desc_ctx(desc);
+ const struct nhpoly1305_key *key = crypto_shash_ctx(desc->tfm);
+ unsigned int bytes;
+
+ if (state->buflen) {
+ bytes = min(srclen, (int)NH_MESSAGE_UNIT - state->buflen);
+ memcpy(&state->buffer[state->buflen], src, bytes);
+ state->buflen += bytes;
+ if (state->buflen < NH_MESSAGE_UNIT)
+ return 0;
+ nhpoly1305_units(state, key, state->buffer, NH_MESSAGE_UNIT,
+ nh_fn);
+ state->buflen = 0;
+ src += bytes;
+ srclen -= bytes;
+ }
+
+ if (srclen >= NH_MESSAGE_UNIT) {
+ bytes = round_down(srclen, NH_MESSAGE_UNIT);
+ nhpoly1305_units(state, key, src, bytes, nh_fn);
+ src += bytes;
+ srclen -= bytes;
+ }
+
+ if (srclen) {
+ memcpy(state->buffer, src, srclen);
+ state->buflen = srclen;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(crypto_nhpoly1305_update_helper);
+
+int crypto_nhpoly1305_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ return crypto_nhpoly1305_update_helper(desc, src, srclen, nh_generic);
+}
+EXPORT_SYMBOL(crypto_nhpoly1305_update);
+
+int crypto_nhpoly1305_final_helper(struct shash_desc *desc, u8 *dst, nh_t nh_fn)
+{
+ struct nhpoly1305_state *state = shash_desc_ctx(desc);
+ const struct nhpoly1305_key *key = crypto_shash_ctx(desc->tfm);
+
+ if (state->buflen) {
+ memset(&state->buffer[state->buflen], 0,
+ NH_MESSAGE_UNIT - state->buflen);
+ nhpoly1305_units(state, key, state->buffer, NH_MESSAGE_UNIT,
+ nh_fn);
+ }
+
+ if (state->nh_remaining)
+ process_nh_hash_value(state, key);
+
+ poly1305_core_emit(&state->poly_state, dst);
+ return 0;
+}
+EXPORT_SYMBOL(crypto_nhpoly1305_final_helper);
+
+int crypto_nhpoly1305_final(struct shash_desc *desc, u8 *dst)
+{
+ return crypto_nhpoly1305_final_helper(desc, dst, nh_generic);
+}
+EXPORT_SYMBOL(crypto_nhpoly1305_final);
+
+static struct shash_alg nhpoly1305_alg = {
+ .base.cra_name = "nhpoly1305",
+ .base.cra_driver_name = "nhpoly1305-generic",
+ .base.cra_priority = 100,
+ .base.cra_ctxsize = sizeof(struct nhpoly1305_key),
+ .base.cra_module = THIS_MODULE,
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .init = crypto_nhpoly1305_init,
+ .update = crypto_nhpoly1305_update,
+ .final = crypto_nhpoly1305_final,
+ .setkey = crypto_nhpoly1305_setkey,
+ .descsize = sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+ return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+ crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-generic");
diff --git a/crypto/poly1305_generic.c b/crypto/poly1305_generic.c
index bca99238948f..b60c1eee9839 100644
--- a/crypto/poly1305_generic.c
+++ b/crypto/poly1305_generic.c
@@ -17,6 +17,7 @@
#include <linux/crypto.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include <asm/unaligned.h>
static inline u64 mlt(u64 a, u64 b)
{
@@ -33,16 +34,11 @@ static inline u32 and(u32 v, u32 mask)
return v & mask;
}
-static inline u32 le32_to_cpuvp(const void *p)
-{
- return le32_to_cpup(p);
-}
-
int crypto_poly1305_init(struct shash_desc *desc)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
- memset(dctx->h, 0, sizeof(dctx->h));
+ poly1305_core_init(&dctx->h);
dctx->buflen = 0;
dctx->rset = false;
dctx->sset = false;
@@ -51,23 +47,16 @@ int crypto_poly1305_init(struct shash_desc *desc)
}
EXPORT_SYMBOL_GPL(crypto_poly1305_init);
-static void poly1305_setrkey(struct poly1305_desc_ctx *dctx, const u8 *key)
+void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key)
{
/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
- dctx->r[0] = (le32_to_cpuvp(key + 0) >> 0) & 0x3ffffff;
- dctx->r[1] = (le32_to_cpuvp(key + 3) >> 2) & 0x3ffff03;
- dctx->r[2] = (le32_to_cpuvp(key + 6) >> 4) & 0x3ffc0ff;
- dctx->r[3] = (le32_to_cpuvp(key + 9) >> 6) & 0x3f03fff;
- dctx->r[4] = (le32_to_cpuvp(key + 12) >> 8) & 0x00fffff;
-}
-
-static void poly1305_setskey(struct poly1305_desc_ctx *dctx, const u8 *key)
-{
- dctx->s[0] = le32_to_cpuvp(key + 0);
- dctx->s[1] = le32_to_cpuvp(key + 4);
- dctx->s[2] = le32_to_cpuvp(key + 8);
- dctx->s[3] = le32_to_cpuvp(key + 12);
+ key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff;
+ key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03;
+ key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff;
+ key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff;
+ key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
}
+EXPORT_SYMBOL_GPL(poly1305_core_setkey);
/*
* Poly1305 requires a unique key for each tag, which implies that we can't set
@@ -79,13 +68,16 @@ unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
{
if (!dctx->sset) {
if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
- poly1305_setrkey(dctx, src);
+ poly1305_core_setkey(&dctx->r, src);
src += POLY1305_BLOCK_SIZE;
srclen -= POLY1305_BLOCK_SIZE;
dctx->rset = true;
}
if (srclen >= POLY1305_BLOCK_SIZE) {
- poly1305_setskey(dctx, src);
+ dctx->s[0] = get_unaligned_le32(src + 0);
+ dctx->s[1] = get_unaligned_le32(src + 4);
+ dctx->s[2] = get_unaligned_le32(src + 8);
+ dctx->s[3] = get_unaligned_le32(src + 12);
src += POLY1305_BLOCK_SIZE;
srclen -= POLY1305_BLOCK_SIZE;
dctx->sset = true;
@@ -95,47 +87,43 @@ unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
}
EXPORT_SYMBOL_GPL(crypto_poly1305_setdesckey);
-static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
- const u8 *src, unsigned int srclen,
- u32 hibit)
+static void poly1305_blocks_internal(struct poly1305_state *state,
+ const struct poly1305_key *key,
+ const void *src, unsigned int nblocks,
+ u32 hibit)
{
u32 r0, r1, r2, r3, r4;
u32 s1, s2, s3, s4;
u32 h0, h1, h2, h3, h4;
u64 d0, d1, d2, d3, d4;
- unsigned int datalen;
- if (unlikely(!dctx->sset)) {
- datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
- src += srclen - datalen;
- srclen = datalen;
- }
+ if (!nblocks)
+ return;
- r0 = dctx->r[0];
- r1 = dctx->r[1];
- r2 = dctx->r[2];
- r3 = dctx->r[3];
- r4 = dctx->r[4];
+ r0 = key->r[0];
+ r1 = key->r[1];
+ r2 = key->r[2];
+ r3 = key->r[3];
+ r4 = key->r[4];
s1 = r1 * 5;
s2 = r2 * 5;
s3 = r3 * 5;
s4 = r4 * 5;
- h0 = dctx->h[0];
- h1 = dctx->h[1];
- h2 = dctx->h[2];
- h3 = dctx->h[3];
- h4 = dctx->h[4];
-
- while (likely(srclen >= POLY1305_BLOCK_SIZE)) {
+ h0 = state->h[0];
+ h1 = state->h[1];
+ h2 = state->h[2];
+ h3 = state->h[3];
+ h4 = state->h[4];
+ do {
/* h += m[i] */
- h0 += (le32_to_cpuvp(src + 0) >> 0) & 0x3ffffff;
- h1 += (le32_to_cpuvp(src + 3) >> 2) & 0x3ffffff;
- h2 += (le32_to_cpuvp(src + 6) >> 4) & 0x3ffffff;
- h3 += (le32_to_cpuvp(src + 9) >> 6) & 0x3ffffff;
- h4 += (le32_to_cpuvp(src + 12) >> 8) | hibit;
+ h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff;
+ h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff;
+ h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff;
+ h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff;
+ h4 += (get_unaligned_le32(src + 12) >> 8) | hibit;
/* h *= r */
d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
@@ -158,16 +146,36 @@ static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
h1 += h0 >> 26; h0 = h0 & 0x3ffffff;
src += POLY1305_BLOCK_SIZE;
- srclen -= POLY1305_BLOCK_SIZE;
- }
+ } while (--nblocks);
- dctx->h[0] = h0;
- dctx->h[1] = h1;
- dctx->h[2] = h2;
- dctx->h[3] = h3;
- dctx->h[4] = h4;
+ state->h[0] = h0;
+ state->h[1] = h1;
+ state->h[2] = h2;
+ state->h[3] = h3;
+ state->h[4] = h4;
+}
- return srclen;
+void poly1305_core_blocks(struct poly1305_state *state,
+ const struct poly1305_key *key,
+ const void *src, unsigned int nblocks)
+{
+ poly1305_blocks_internal(state, key, src, nblocks, 1 << 24);
+}
+EXPORT_SYMBOL_GPL(poly1305_core_blocks);
+
+static void poly1305_blocks(struct poly1305_desc_ctx *dctx,
+ const u8 *src, unsigned int srclen, u32 hibit)
+{
+ unsigned int datalen;
+
+ if (unlikely(!dctx->sset)) {
+ datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
+ src += srclen - datalen;
+ srclen = datalen;
+ }
+
+ poly1305_blocks_internal(&dctx->h, &dctx->r,
+ src, srclen / POLY1305_BLOCK_SIZE, hibit);
}
int crypto_poly1305_update(struct shash_desc *desc,
@@ -191,9 +199,9 @@ int crypto_poly1305_update(struct shash_desc *desc,
}
if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
- bytes = poly1305_blocks(dctx, src, srclen, 1 << 24);
- src += srclen - bytes;
- srclen = bytes;
+ poly1305_blocks(dctx, src, srclen, 1 << 24);
+ src += srclen - (srclen % POLY1305_BLOCK_SIZE);
+ srclen %= POLY1305_BLOCK_SIZE;
}
if (unlikely(srclen)) {
@@ -205,31 +213,18 @@ int crypto_poly1305_update(struct shash_desc *desc,
}
EXPORT_SYMBOL_GPL(crypto_poly1305_update);
-int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+void poly1305_core_emit(const struct poly1305_state *state, void *dst)
{
- struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
- __le32 *mac = (__le32 *)dst;
u32 h0, h1, h2, h3, h4;
u32 g0, g1, g2, g3, g4;
u32 mask;
- u64 f = 0;
-
- if (unlikely(!dctx->sset))
- return -ENOKEY;
-
- if (unlikely(dctx->buflen)) {
- dctx->buf[dctx->buflen++] = 1;
- memset(dctx->buf + dctx->buflen, 0,
- POLY1305_BLOCK_SIZE - dctx->buflen);
- poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 0);
- }
/* fully carry h */
- h0 = dctx->h[0];
- h1 = dctx->h[1];
- h2 = dctx->h[2];
- h3 = dctx->h[3];
- h4 = dctx->h[4];
+ h0 = state->h[0];
+ h1 = state->h[1];
+ h2 = state->h[2];
+ h3 = state->h[3];
+ h4 = state->h[4];
h2 += (h1 >> 26); h1 = h1 & 0x3ffffff;
h3 += (h2 >> 26); h2 = h2 & 0x3ffffff;
@@ -259,16 +254,40 @@ int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
h4 = (h4 & mask) | g4;
/* h = h % (2^128) */
- h0 = (h0 >> 0) | (h1 << 26);
- h1 = (h1 >> 6) | (h2 << 20);
- h2 = (h2 >> 12) | (h3 << 14);
- h3 = (h3 >> 18) | (h4 << 8);
+ put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0);
+ put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4);
+ put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8);
+ put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12);
+}
+EXPORT_SYMBOL_GPL(poly1305_core_emit);
+
+int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+ __le32 digest[4];
+ u64 f = 0;
+
+ if (unlikely(!dctx->sset))
+ return -ENOKEY;
+
+ if (unlikely(dctx->buflen)) {
+ dctx->buf[dctx->buflen++] = 1;
+ memset(dctx->buf + dctx->buflen, 0,
+ POLY1305_BLOCK_SIZE - dctx->buflen);
+ poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+ }
+
+ poly1305_core_emit(&dctx->h, digest);
/* mac = (h + s) % (2^128) */
- f = (f >> 32) + h0 + dctx->s[0]; mac[0] = cpu_to_le32(f);
- f = (f >> 32) + h1 + dctx->s[1]; mac[1] = cpu_to_le32(f);
- f = (f >> 32) + h2 + dctx->s[2]; mac[2] = cpu_to_le32(f);
- f = (f >> 32) + h3 + dctx->s[3]; mac[3] = cpu_to_le32(f);
+ f = (f >> 32) + le32_to_cpu(digest[0]) + dctx->s[0];
+ put_unaligned_le32(f, dst + 0);
+ f = (f >> 32) + le32_to_cpu(digest[1]) + dctx->s[1];
+ put_unaligned_le32(f, dst + 4);
+ f = (f >> 32) + le32_to_cpu(digest[2]) + dctx->s[2];
+ put_unaligned_le32(f, dst + 8);
+ f = (f >> 32) + le32_to_cpu(digest[3]) + dctx->s[3];
+ put_unaligned_le32(f, dst + 12);
return 0;
}
diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index babbda230c07..fb7956150be4 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -1610,6 +1610,16 @@ static int do_test(const char *alg, u32 type, u32 mask, int m)
speed_template_32);
break;
+ case 219:
+ test_cipher_speed("adiantum(xchacha12,aes)", ENCRYPT, sec, NULL,
+ 0, speed_template_32);
+ test_cipher_speed("adiantum(xchacha12,aes)", DECRYPT, sec, NULL,
+ 0, speed_template_32);
+ test_cipher_speed("adiantum(xchacha20,aes)", ENCRYPT, sec, NULL,
+ 0, speed_template_32);
+ test_cipher_speed("adiantum(xchacha20,aes)", DECRYPT, sec, NULL,
+ 0, speed_template_32);
+ break;
case 300:
if (alg) {
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 62dffa0028ac..898ef4d736be 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -62,7 +62,7 @@ int alg_test(const char *driver, const char *alg, u32 type, u32 mask)
*/
#define IDX1 32
#define IDX2 32400
-#define IDX3 1
+#define IDX3 1511
#define IDX4 8193
#define IDX5 22222
#define IDX6 17101
@@ -2173,6 +2173,36 @@ static const struct alg_test_desc alg_test_descs[] = {
.test = alg_test_null,
.fips_allowed = 1,
}, {
+ .alg = "adiantum(xchacha12,aes)",
+ .test = alg_test_skcipher,
+ .suite = {
+ .cipher = {
+ .enc = {
+ .vecs = adiantum_xchacha12_aes_enc_tv_template,
+ .count = ARRAY_SIZE(adiantum_xchacha12_aes_enc_tv_template),
+ },
+ .dec = {
+ .vecs = adiantum_xchacha12_aes_dec_tv_template,
+ .count = ARRAY_SIZE(adiantum_xchacha12_aes_dec_tv_template),
+ },
+ }
+ },
+ }, {
+ .alg = "adiantum(xchacha20,aes)",
+ .test = alg_test_skcipher,
+ .suite = {
+ .cipher = {
+ .enc = {
+ .vecs = adiantum_xchacha20_aes_enc_tv_template,
+ .count = ARRAY_SIZE(adiantum_xchacha20_aes_enc_tv_template),
+ },
+ .dec = {
+ .vecs = adiantum_xchacha20_aes_dec_tv_template,
+ .count = ARRAY_SIZE(adiantum_xchacha20_aes_dec_tv_template),
+ },
+ }
+ },
+ }, {
.alg = "ansi_cprng",
.test = alg_test_cprng,
.suite = {
@@ -3645,6 +3675,15 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}
}, {
+ .alg = "nhpoly1305",
+ .test = alg_test_hash,
+ .suite = {
+ .hash = {
+ .vecs = nhpoly1305_tv_template,
+ .count = ARRAY_SIZE(nhpoly1305_tv_template),
+ }
+ }
+ }, {
.alg = "ofb(aes)",
.test = alg_test_skcipher,
.fips_allowed = 1,
@@ -3997,6 +4036,36 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}
}, {
+ .alg = "xchacha12",
+ .test = alg_test_skcipher,
+ .suite = {
+ .cipher = {
+ .enc = {
+ .vecs = xchacha12_tv_template,
+ .count = ARRAY_SIZE(xchacha12_tv_template),
+ },
+ .dec = {
+ .vecs = xchacha12_tv_template,
+ .count = ARRAY_SIZE(xchacha12_tv_template),
+ },
+ }
+ },
+ }, {
+ .alg = "xchacha20",
+ .test = alg_test_skcipher,
+ .suite = {
+ .cipher = {
+ .enc = {
+ .vecs = xchacha20_tv_template,
+ .count = ARRAY_SIZE(xchacha20_tv_template),
+ },
+ .dec = {
+ .vecs = xchacha20_tv_template,
+ .count = ARRAY_SIZE(xchacha20_tv_template),
+ },
+ }
+ },
+ }, {
.alg = "xts(aes)",
.test = alg_test_skcipher,
.fips_allowed = 1,
@@ -4072,6 +4141,22 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}
}
+ }, {
+ .alg = "zstd",
+ .test = alg_test_comp,
+ .fips_allowed = 1,
+ .suite = {
+ .comp = {
+ .comp = {
+ .vecs = zstd_comp_tv_template,
+ .count = ZSTD_COMP_TEST_VECTORS
+ },
+ .decomp = {
+ .vecs = zstd_decomp_tv_template,
+ .count = ZSTD_DECOMP_TEST_VECTORS
+ }
+ }
+ }
}
};
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 9033088ca231..4a349b74e0b5 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -29,7 +29,7 @@
#define MAX_DIGEST_SIZE 64
#define MAX_TAP 8
-#define MAX_KEYLEN 160
+#define MAX_KEYLEN 1088
#define MAX_IVLEN 32
struct hash_testvec {
@@ -37,10 +37,10 @@ struct hash_testvec {
char *key;
char *plaintext;
char *digest;
- unsigned char tap[MAX_TAP];
+ unsigned short tap[MAX_TAP];
unsigned short psize;
- unsigned char np;
- unsigned char ksize;
+ unsigned short np;
+ unsigned short ksize;
};
/*
@@ -4530,6 +4530,1237 @@ static struct hash_testvec poly1305_tv_template[] = {
},
};
+/* NHPoly1305 test vectors from https://github.com/google/adiantum */
+static struct hash_testvec nhpoly1305_tv_template[] = {
+ {
+ .key = "\xd2\x5d\x4c\xdd\x8d\x2b\x7f\x7a"
+ "\xd9\xbe\x71\xec\xd1\x83\x52\xe3"
+ "\xe1\xad\xd7\x5c\x0a\x75\x9d\xec"
+ "\x1d\x13\x7e\x5d\x71\x07\xc9\xe4"
+ "\x57\x2d\x44\x68\xcf\xd8\xd6\xc5"
+ "\x39\x69\x7d\x32\x75\x51\x4f\x7e"
+ "\xb2\x4c\xc6\x90\x51\x6e\xd9\xd6"
+ "\xa5\x8b\x2d\xf1\x94\xf9\xf7\x5e"
+ "\x2c\x84\x7b\x41\x0f\x88\x50\x89"
+ "\x30\xd9\xa1\x38\x46\x6c\xc0\x4f"
+ "\xe8\xdf\xdc\x66\xab\x24\x43\x41"
+ "\x91\x55\x29\x65\x86\x28\x5e\x45"
+ "\xd5\x2d\xb7\x80\x08\x9a\xc3\xd4"
+ "\x9a\x77\x0a\xd4\xef\x3e\xe6\x3f"
+ "\x6f\x2f\x9b\x3a\x7d\x12\x1e\x80"
+ "\x6c\x44\xa2\x25\xe1\xf6\x60\xe9"
+ "\x0d\xaf\xc5\x3c\xa5\x79\xae\x64"
+ "\xbc\xa0\x39\xa3\x4d\x10\xe5\x4d"
+ "\xd5\xe7\x89\x7a\x13\xee\x06\x78"
+ "\xdc\xa4\xdc\x14\x27\xe6\x49\x38"
+ "\xd0\xe0\x45\x25\x36\xc5\xf4\x79"
+ "\x2e\x9a\x98\x04\xe4\x2b\x46\x52"
+ "\x7c\x33\xca\xe2\x56\x51\x50\xe2"
+ "\xa5\x9a\xae\x18\x6a\x13\xf8\xd2"
+ "\x21\x31\x66\x02\xe2\xda\x8d\x7e"
+ "\x41\x19\xb2\x61\xee\x48\x8f\xf1"
+ "\x65\x24\x2e\x1e\x68\xce\x05\xd9"
+ "\x2a\xcf\xa5\x3a\x57\xdd\x35\x91"
+ "\x93\x01\xca\x95\xfc\x2b\x36\x04"
+ "\xe6\x96\x97\x28\xf6\x31\xfe\xa3"
+ "\x9d\xf6\x6a\x1e\x80\x8d\xdc\xec"
+ "\xaf\x66\x11\x13\x02\x88\xd5\x27"
+ "\x33\xb4\x1a\xcd\xa3\xf6\xde\x31"
+ "\x8e\xc0\x0e\x6c\xd8\x5a\x97\x5e"
+ "\xdd\xfd\x60\x69\x38\x46\x3f\x90"
+ "\x5e\x97\xd3\x32\x76\xc7\x82\x49"
+ "\xfe\xba\x06\x5f\x2f\xa2\xfd\xff"
+ "\x80\x05\x40\xe4\x33\x03\xfb\x10"
+ "\xc0\xde\x65\x8c\xc9\x8d\x3a\x9d"
+ "\xb5\x7b\x36\x4b\xb5\x0c\xcf\x00"
+ "\x9c\x87\xe4\x49\xad\x90\xda\x4a"
+ "\xdd\xbd\xff\xe2\x32\x57\xd6\x78"
+ "\x36\x39\x6c\xd3\x5b\x9b\x88\x59"
+ "\x2d\xf0\x46\xe4\x13\x0e\x2b\x35"
+ "\x0d\x0f\x73\x8a\x4f\x26\x84\x75"
+ "\x88\x3c\xc5\x58\x66\x18\x1a\xb4"
+ "\x64\x51\x34\x27\x1b\xa4\x11\xc9"
+ "\x6d\x91\x8a\xfa\x32\x60\x9d\xd7"
+ "\x87\xe5\xaa\x43\x72\xf8\xda\xd1"
+ "\x48\x44\x13\x61\xdc\x8c\x76\x17"
+ "\x0c\x85\x4e\xf3\xdd\xa2\x42\xd2"
+ "\x74\xc1\x30\x1b\xeb\x35\x31\x29"
+ "\x5b\xd7\x4c\x94\x46\x35\xa1\x23"
+ "\x50\xf2\xa2\x8e\x7e\x4f\x23\x4f"
+ "\x51\xff\xe2\xc9\xa3\x7d\x56\x8b"
+ "\x41\xf2\xd0\xc5\x57\x7e\x59\xac"
+ "\xbb\x65\xf3\xfe\xf7\x17\xef\x63"
+ "\x7c\x6f\x23\xdd\x22\x8e\xed\x84"
+ "\x0e\x3b\x09\xb3\xf3\xf4\x8f\xcd"
+ "\x37\xa8\xe1\xa7\x30\xdb\xb1\xa2"
+ "\x9c\xa2\xdf\x34\x17\x3e\x68\x44"
+ "\xd0\xde\x03\x50\xd1\x48\x6b\x20"
+ "\xe2\x63\x45\xa5\xea\x87\xc2\x42"
+ "\x95\x03\x49\x05\xed\xe0\x90\x29"
+ "\x1a\xb8\xcf\x9b\x43\xcf\x29\x7a"
+ "\x63\x17\x41\x9f\xe0\xc9\x10\xfd"
+ "\x2c\x56\x8c\x08\x55\xb4\xa9\x27"
+ "\x0f\x23\xb1\x05\x6a\x12\x46\xc7"
+ "\xe1\xfe\x28\x93\x93\xd7\x2f\xdc"
+ "\x98\x30\xdb\x75\x8a\xbe\x97\x7a"
+ "\x02\xfb\x8c\xba\xbe\x25\x09\xbe"
+ "\xce\xcb\xa2\xef\x79\x4d\x0e\x9d"
+ "\x1b\x9d\xb6\x39\x34\x38\xfa\x07"
+ "\xec\xe8\xfc\x32\x85\x1d\xf7\x85"
+ "\x63\xc3\x3c\xc0\x02\x75\xd7\x3f"
+ "\xb2\x68\x60\x66\x65\x81\xc6\xb1"
+ "\x42\x65\x4b\x4b\x28\xd7\xc7\xaa"
+ "\x9b\xd2\xdc\x1b\x01\xe0\x26\x39"
+ "\x01\xc1\x52\x14\xd1\x3f\xb7\xe6"
+ "\x61\x41\xc7\x93\xd2\xa2\x67\xc6"
+ "\xf7\x11\xb5\xf5\xea\xdd\x19\xfb"
+ "\x4d\x21\x12\xd6\x7d\xf1\x10\xb0"
+ "\x89\x07\xc7\x5a\x52\x73\x70\x2f"
+ "\x32\xef\x65\x2b\x12\xb2\xf0\xf5"
+ "\x20\xe0\x90\x59\x7e\x64\xf1\x4c"
+ "\x41\xb3\xa5\x91\x08\xe6\x5e\x5f"
+ "\x05\x56\x76\xb4\xb0\xcd\x70\x53"
+ "\x10\x48\x9c\xff\xc2\x69\x55\x24"
+ "\x87\xef\x84\xea\xfb\xa7\xbf\xa0"
+ "\x91\x04\xad\x4f\x8b\x57\x54\x4b"
+ "\xb6\xe9\xd1\xac\x37\x2f\x1d\x2e"
+ "\xab\xa5\xa4\xe8\xff\xfb\xd9\x39"
+ "\x2f\xb7\xac\xd1\xfe\x0b\x9a\x80"
+ "\x0f\xb6\xf4\x36\x39\x90\x51\xe3"
+ "\x0a\x2f\xb6\x45\x76\x89\xcd\x61"
+ "\xfe\x48\x5f\x75\x1d\x13\x00\x62"
+ "\x80\x24\x47\xe7\xbc\x37\xd7\xe3"
+ "\x15\xe8\x68\x22\xaf\x80\x6f\x4b"
+ "\xa8\x9f\x01\x10\x48\x14\xc3\x02"
+ "\x52\xd2\xc7\x75\x9b\x52\x6d\x30"
+ "\xac\x13\x85\xc8\xf7\xa3\x58\x4b"
+ "\x49\xf7\x1c\x45\x55\x8c\x39\x9a"
+ "\x99\x6d\x97\x27\x27\xe6\xab\xdd"
+ "\x2c\x42\x1b\x35\xdd\x9d\x73\xbb"
+ "\x6c\xf3\x64\xf1\xfb\xb9\xf7\xe6"
+ "\x4a\x3c\xc0\x92\xc0\x2e\xb7\x1a"
+ "\xbe\xab\xb3\x5a\xe5\xea\xb1\x48"
+ "\x58\x13\x53\x90\xfd\xc3\x8e\x54"
+ "\xf9\x18\x16\x73\xe8\xcb\x6d\x39"
+ "\x0e\xd7\xe0\xfe\xb6\x9f\x43\x97"
+ "\xe8\xd0\x85\x56\x83\x3e\x98\x68"
+ "\x7f\xbd\x95\xa8\x9a\x61\x21\x8f"
+ "\x06\x98\x34\xa6\xc8\xd6\x1d\xf3"
+ "\x3d\x43\xa4\x9a\x8c\xe5\xd3\x5a"
+ "\x32\xa2\x04\x22\xa4\x19\x1a\x46"
+ "\x42\x7e\x4d\xe5\xe0\xe6\x0e\xca"
+ "\xd5\x58\x9d\x2c\xaf\xda\x33\x5c"
+ "\xb0\x79\x9e\xc9\xfc\xca\xf0\x2f"
+ "\xa8\xb2\x77\xeb\x7a\xa2\xdd\x37"
+ "\x35\x83\x07\xd6\x02\x1a\xb6\x6c"
+ "\x24\xe2\x59\x08\x0e\xfd\x3e\x46"
+ "\xec\x40\x93\xf4\x00\x26\x4f\x2a"
+ "\xff\x47\x2f\xeb\x02\x92\x26\x5b"
+ "\x53\x17\xc2\x8d\x2a\xc7\xa3\x1b"
+ "\xcd\xbc\xa7\xe8\xd1\x76\xe3\x80"
+ "\x21\xca\x5d\x3b\xe4\x9c\x8f\xa9"
+ "\x5b\x7f\x29\x7f\x7c\xd8\xed\x6d"
+ "\x8c\xb2\x86\x85\xe7\x77\xf2\x85"
+ "\xab\x38\xa9\x9d\xc1\x4e\xc5\x64"
+ "\x33\x73\x8b\x59\x03\xad\x05\xdf"
+ "\x25\x98\x31\xde\xef\x13\xf1\x9b"
+ "\x3c\x91\x9d\x7b\xb1\xfa\xe6\xbf"
+ "\x5b\xed\xa5\x55\xe6\xea\x6c\x74"
+ "\xf4\xb9\xe4\x45\x64\x72\x81\xc2"
+ "\x4c\x28\xd4\xcd\xac\xe2\xde\xf9"
+ "\xeb\x5c\xeb\x61\x60\x5a\xe5\x28",
+ .ksize = 1088,
+ .plaintext = "",
+ .psize = 0,
+ .digest = "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00",
+ }, {
+ .key = "\x29\x21\x43\xcb\xcb\x13\x07\xde"
+ "\xbf\x48\xdf\x8a\x7f\xa2\x84\xde"
+ "\x72\x23\x9d\xf5\xf0\x07\xf2\x4c"
+ "\x20\x3a\x93\xb9\xcd\x5d\xfe\xcb"
+ "\x99\x2c\x2b\x58\xc6\x50\x5f\x94"
+ "\x56\xc3\x7c\x0d\x02\x3f\xb8\x5e"
+ "\x7b\xc0\x6c\x51\x34\x76\xc0\x0e"
+ "\xc6\x22\xc8\x9e\x92\xa0\x21\xc9"
+ "\x85\x5c\x7c\xf8\xe2\x64\x47\xc9"
+ "\xe4\xa2\x57\x93\xf8\xa2\x69\xcd"
+ "\x62\x98\x99\xf4\xd7\x7b\x14\xb1"
+ "\xd8\x05\xff\x04\x15\xc9\xe1\x6e"
+ "\x9b\xe6\x50\x6b\x0b\x3f\x22\x1f"
+ "\x08\xde\x0c\x5b\x08\x7e\xc6\x2f"
+ "\x6c\xed\xd6\xb2\x15\xa4\xb3\xf9"
+ "\xa7\x46\x38\x2a\xea\x69\xa5\xde"
+ "\x02\xc3\x96\x89\x4d\x55\x3b\xed"
+ "\x3d\x3a\x85\x77\xbf\x97\x45\x5c"
+ "\x9e\x02\x69\xe2\x1b\x68\xbe\x96"
+ "\xfb\x64\x6f\x0f\xf6\x06\x40\x67"
+ "\xfa\x04\xe3\x55\xfa\xbe\xa4\x60"
+ "\xef\x21\x66\x97\xe6\x9d\x5c\x1f"
+ "\x62\x37\xaa\x31\xde\xe4\x9c\x28"
+ "\x95\xe0\x22\x86\xf4\x4d\xf3\x07"
+ "\xfd\x5f\x3a\x54\x2c\x51\x80\x71"
+ "\xba\x78\x69\x5b\x65\xab\x1f\x81"
+ "\xed\x3b\xff\x34\xa3\xfb\xbc\x73"
+ "\x66\x7d\x13\x7f\xdf\x6e\xe2\xe2"
+ "\xeb\x4f\x6c\xda\x7d\x33\x57\xd0"
+ "\xd3\x7c\x95\x4f\x33\x58\x21\xc7"
+ "\xc0\xe5\x6f\x42\x26\xc6\x1f\x5e"
+ "\x85\x1b\x98\x9a\xa2\x1e\x55\x77"
+ "\x23\xdf\x81\x5e\x79\x55\x05\xfc"
+ "\xfb\xda\xee\xba\x5a\xba\xf7\x77"
+ "\x7f\x0e\xd3\xe1\x37\xfe\x8d\x2b"
+ "\xd5\x3f\xfb\xd0\xc0\x3c\x0b\x3f"
+ "\xcf\x3c\x14\xcf\xfb\x46\x72\x4c"
+ "\x1f\x39\xe2\xda\x03\x71\x6d\x23"
+ "\xef\x93\xcd\x39\xd9\x37\x80\x4d"
+ "\x65\x61\xd1\x2c\x03\xa9\x47\x72"
+ "\x4d\x1e\x0e\x16\x33\x0f\x21\x17"
+ "\xec\x92\xea\x6f\x37\x22\xa4\xd8"
+ "\x03\x33\x9e\xd8\x03\x69\x9a\xe8"
+ "\xb2\x57\xaf\x78\x99\x05\x12\xab"
+ "\x48\x90\x80\xf0\x12\x9b\x20\x64"
+ "\x7a\x1d\x47\x5f\xba\x3c\xf9\xc3"
+ "\x0a\x0d\x8d\xa1\xf9\x1b\x82\x13"
+ "\x3e\x0d\xec\x0a\x83\xc0\x65\xe1"
+ "\xe9\x95\xff\x97\xd6\xf2\xe4\xd5"
+ "\x86\xc0\x1f\x29\x27\x63\xd7\xde"
+ "\xb7\x0a\x07\x99\x04\x2d\xa3\x89"
+ "\xa2\x43\xcf\xf3\xe1\x43\xac\x4a"
+ "\x06\x97\xd0\x05\x4f\x87\xfa\xf9"
+ "\x9b\xbf\x52\x70\xbd\xbc\x6c\xf3"
+ "\x03\x13\x60\x41\x28\x09\xec\xcc"
+ "\xb1\x1a\xec\xd6\xfb\x6f\x2a\x89"
+ "\x5d\x0b\x53\x9c\x59\xc1\x84\x21"
+ "\x33\x51\x47\x19\x31\x9c\xd4\x0a"
+ "\x4d\x04\xec\x50\x90\x61\xbd\xbc"
+ "\x7e\xc8\xd9\x6c\x98\x1d\x45\x41"
+ "\x17\x5e\x97\x1c\xc5\xa8\xe8\xea"
+ "\x46\x58\x53\xf7\x17\xd5\xad\x11"
+ "\xc8\x54\xf5\x7a\x33\x90\xf5\x19"
+ "\xba\x36\xb4\xfc\x52\xa5\x72\x3d"
+ "\x14\xbb\x55\xa7\xe9\xe3\x12\xf7"
+ "\x1c\x30\xa2\x82\x03\xbf\x53\x91"
+ "\x2e\x60\x41\x9f\x5b\x69\x39\xf6"
+ "\x4d\xc8\xf8\x46\x7a\x7f\xa4\x98"
+ "\x36\xff\x06\xcb\xca\xe7\x33\xf2"
+ "\xc0\x4a\xf4\x3c\x14\x44\x5f\x6b"
+ "\x75\xef\x02\x36\x75\x08\x14\xfd"
+ "\x10\x8e\xa5\x58\xd0\x30\x46\x49"
+ "\xaf\x3a\xf8\x40\x3d\x35\xdb\x84"
+ "\x11\x2e\x97\x6a\xb7\x87\x7f\xad"
+ "\xf1\xfa\xa5\x63\x60\xd8\x5e\xbf"
+ "\x41\x78\x49\xcf\x77\xbb\x56\xbb"
+ "\x7d\x01\x67\x05\x22\xc8\x8f\x41"
+ "\xba\x81\xd2\xca\x2c\x38\xac\x76"
+ "\x06\xc1\x1a\xc2\xce\xac\x90\x67"
+ "\x57\x3e\x20\x12\x5b\xd9\x97\x58"
+ "\x65\x05\xb7\x04\x61\x7e\xd8\x3a"
+ "\xbf\x55\x3b\x13\xe9\x34\x5a\x37"
+ "\x36\xcb\x94\x45\xc5\x32\xb3\xa0"
+ "\x0c\x3e\x49\xc5\xd3\xed\xa7\xf0"
+ "\x1c\x69\xcc\xea\xcc\x83\xc9\x16"
+ "\x95\x72\x4b\xf4\x89\xd5\xb9\x10"
+ "\xf6\x2d\x60\x15\xea\x3c\x06\x66"
+ "\x9f\x82\xad\x17\xce\xd2\xa4\x48"
+ "\x7c\x65\xd9\xf8\x02\x4d\x9b\x4c"
+ "\x89\x06\x3a\x34\x85\x48\x89\x86"
+ "\xf9\x24\xa9\x54\x72\xdb\x44\x95"
+ "\xc7\x44\x1c\x19\x11\x4c\x04\xdc"
+ "\x13\xb9\x67\xc8\xc3\x3a\x6a\x50"
+ "\xfa\xd1\xfb\xe1\x88\xb6\xf1\xa3"
+ "\xc5\x3b\xdc\x38\x45\x16\x26\x02"
+ "\x3b\xb8\x8f\x8b\x58\x7d\x23\x04"
+ "\x50\x6b\x81\x9f\xae\x66\xac\x6f"
+ "\xcf\x2a\x9d\xf1\xfd\x1d\x57\x07"
+ "\xbe\x58\xeb\x77\x0c\xe3\xc2\x19"
+ "\x14\x74\x1b\x51\x1c\x4f\x41\xf3"
+ "\x32\x89\xb3\xe7\xde\x62\xf6\x5f"
+ "\xc7\x6a\x4a\x2a\x5b\x0f\x5f\x87"
+ "\x9c\x08\xb9\x02\x88\xc8\x29\xb7"
+ "\x94\x52\xfa\x52\xfe\xaa\x50\x10"
+ "\xba\x48\x75\x5e\x11\x1b\xe6\x39"
+ "\xd7\x82\x2c\x87\xf1\x1e\xa4\x38"
+ "\x72\x3e\x51\xe7\xd8\x3e\x5b\x7b"
+ "\x31\x16\x89\xba\xd6\xad\x18\x5e"
+ "\xba\xf8\x12\xb3\xf4\x6c\x47\x30"
+ "\xc0\x38\x58\xb3\x10\x8d\x58\x5d"
+ "\xb4\xfb\x19\x7e\x41\xc3\x66\xb8"
+ "\xd6\x72\x84\xe1\x1a\xc2\x71\x4c"
+ "\x0d\x4a\x21\x7a\xab\xa2\xc0\x36"
+ "\x15\xc5\xe9\x46\xd7\x29\x17\x76"
+ "\x5e\x47\x36\x7f\x72\x05\xa7\xcc"
+ "\x36\x63\xf9\x47\x7d\xe6\x07\x3c"
+ "\x8b\x79\x1d\x96\x61\x8d\x90\x65"
+ "\x7c\xf5\xeb\x4e\x6e\x09\x59\x6d"
+ "\x62\x50\x1b\x0f\xe0\xdc\x78\xf2"
+ "\x5b\x83\x1a\xa1\x11\x75\xfd\x18"
+ "\xd7\xe2\x8d\x65\x14\x21\xce\xbe"
+ "\xb5\x87\xe3\x0a\xda\x24\x0a\x64"
+ "\xa9\x9f\x03\x8d\x46\x5d\x24\x1a"
+ "\x8a\x0c\x42\x01\xca\xb1\x5f\x7c"
+ "\xa5\xac\x32\x4a\xb8\x07\x91\x18"
+ "\x6f\xb0\x71\x3c\xc9\xb1\xa8\xf8"
+ "\x5f\x69\xa5\xa1\xca\x9e\x7a\xaa"
+ "\xac\xe9\xc7\x47\x41\x75\x25\xc3"
+ "\x73\xe2\x0b\xdd\x6d\x52\x71\xbe"
+ "\xc5\xdc\xb4\xe7\x01\x26\x53\x77"
+ "\x86\x90\x85\x68\x6b\x7b\x03\x53"
+ "\xda\x52\x52\x51\x68\xc8\xf3\xec"
+ "\x6c\xd5\x03\x7a\xa3\x0e\xb4\x02"
+ "\x5f\x1a\xab\xee\xca\x67\x29\x7b"
+ "\xbd\x96\x59\xb3\x8b\x32\x7a\x92"
+ "\x9f\xd8\x25\x2b\xdf\xc0\x4c\xda",
+ .ksize = 1088,
+ .plaintext = "\xbc\xda\x81\xa8\x78\x79\x1c\xbf"
+ "\x77\x53\xba\x4c\x30\x5b\xb8\x33",
+ .psize = 16,
+ .digest = "\x04\xbf\x7f\x6a\xce\x72\xea\x6a"
+ "\x79\xdb\xb0\xc9\x60\xf6\x12\xcc",
+ .np = 6,
+ .tap = { 4, 4, 1, 1, 1, 5 },
+ }, {
+ .key = "\x65\x4d\xe3\xf8\xd2\x4c\xac\x28"
+ "\x68\xf5\xb3\x81\x71\x4b\xa1\xfa"
+ "\x04\x0e\xd3\x81\x36\xbe\x0c\x81"
+ "\x5e\xaf\xbc\x3a\xa4\xc0\x8e\x8b"
+ "\x55\x63\xd3\x52\x97\x88\xd6\x19"
+ "\xbc\x96\xdf\x49\xff\x04\x63\xf5"
+ "\x0c\x11\x13\xaa\x9e\x1f\x5a\xf7"
+ "\xdd\xbd\x37\x80\xc3\xd0\xbe\xa7"
+ "\x05\xc8\x3c\x98\x1e\x05\x3c\x84"
+ "\x39\x61\xc4\xed\xed\x71\x1b\xc4"
+ "\x74\x45\x2c\xa1\x56\x70\x97\xfd"
+ "\x44\x18\x07\x7d\xca\x60\x1f\x73"
+ "\x3b\x6d\x21\xcb\x61\x87\x70\x25"
+ "\x46\x21\xf1\x1f\x21\x91\x31\x2d"
+ "\x5d\xcc\xb7\xd1\x84\x3e\x3d\xdb"
+ "\x03\x53\x2a\x82\xa6\x9a\x95\xbc"
+ "\x1a\x1e\x0a\x5e\x07\x43\xab\x43"
+ "\xaf\x92\x82\x06\x91\x04\x09\xf4"
+ "\x17\x0a\x9a\x2c\x54\xdb\xb8\xf4"
+ "\xd0\xf0\x10\x66\x24\x8d\xcd\xda"
+ "\xfe\x0e\x45\x9d\x6f\xc4\x4e\xf4"
+ "\x96\xaf\x13\xdc\xa9\xd4\x8c\xc4"
+ "\xc8\x57\x39\x3c\xc2\xd3\x0a\x76"
+ "\x4a\x1f\x75\x83\x44\xc7\xd1\x39"
+ "\xd8\xb5\x41\xba\x73\x87\xfa\x96"
+ "\xc7\x18\x53\xfb\x9b\xda\xa0\x97"
+ "\x1d\xee\x60\x85\x9e\x14\xc3\xce"
+ "\xc4\x05\x29\x3b\x95\x30\xa3\xd1"
+ "\x9f\x82\x6a\x04\xf5\xa7\x75\x57"
+ "\x82\x04\xfe\x71\x51\x71\xb1\x49"
+ "\x50\xf8\xe0\x96\xf1\xfa\xa8\x88"
+ "\x3f\xa0\x86\x20\xd4\x60\x79\x59"
+ "\x17\x2d\xd1\x09\xf4\xec\x05\x57"
+ "\xcf\x62\x7e\x0e\x7e\x60\x78\xe6"
+ "\x08\x60\x29\xd8\xd5\x08\x1a\x24"
+ "\xc4\x6c\x24\xe7\x92\x08\x3d\x8a"
+ "\x98\x7a\xcf\x99\x0a\x65\x0e\xdc"
+ "\x8c\x8a\xbe\x92\x82\x91\xcc\x62"
+ "\x30\xb6\xf4\x3f\xc6\x8a\x7f\x12"
+ "\x4a\x8a\x49\xfa\x3f\x5c\xd4\x5a"
+ "\xa6\x82\xa3\xe6\xaa\x34\x76\xb2"
+ "\xab\x0a\x30\xef\x6c\x77\x58\x3f"
+ "\x05\x6b\xcc\x5c\xae\xdc\xd7\xb9"
+ "\x51\x7e\x8d\x32\x5b\x24\x25\xbe"
+ "\x2b\x24\x01\xcf\x80\xda\x16\xd8"
+ "\x90\x72\x2c\xad\x34\x8d\x0c\x74"
+ "\x02\xcb\xfd\xcf\x6e\xef\x97\xb5"
+ "\x4c\xf2\x68\xca\xde\x43\x9e\x8a"
+ "\xc5\x5f\x31\x7f\x14\x71\x38\xec"
+ "\xbd\x98\xe5\x71\xc4\xb5\xdb\xef"
+ "\x59\xd2\xca\xc0\xc1\x86\x75\x01"
+ "\xd4\x15\x0d\x6f\xa4\xf7\x7b\x37"
+ "\x47\xda\x18\x93\x63\xda\xbe\x9e"
+ "\x07\xfb\xb2\x83\xd5\xc4\x34\x55"
+ "\xee\x73\xa1\x42\x96\xf9\x66\x41"
+ "\xa4\xcc\xd2\x93\x6e\xe1\x0a\xbb"
+ "\xd2\xdd\x18\x23\xe6\x6b\x98\x0b"
+ "\x8a\x83\x59\x2c\xc3\xa6\x59\x5b"
+ "\x01\x22\x59\xf7\xdc\xb0\x87\x7e"
+ "\xdb\x7d\xf4\x71\x41\xab\xbd\xee"
+ "\x79\xbe\x3c\x01\x76\x0b\x2d\x0a"
+ "\x42\xc9\x77\x8c\xbb\x54\x95\x60"
+ "\x43\x2e\xe0\x17\x52\xbd\x90\xc9"
+ "\xc2\x2c\xdd\x90\x24\x22\x76\x40"
+ "\x5c\xb9\x41\xc9\xa1\xd5\xbd\xe3"
+ "\x44\xe0\xa4\xab\xcc\xb8\xe2\x32"
+ "\x02\x15\x04\x1f\x8c\xec\x5d\x14"
+ "\xac\x18\xaa\xef\x6e\x33\x19\x6e"
+ "\xde\xfe\x19\xdb\xeb\x61\xca\x18"
+ "\xad\xd8\x3d\xbf\x09\x11\xc7\xa5"
+ "\x86\x0b\x0f\xe5\x3e\xde\xe8\xd9"
+ "\x0a\x69\x9e\x4c\x20\xff\xf9\xc5"
+ "\xfa\xf8\xf3\x7f\xa5\x01\x4b\x5e"
+ "\x0f\xf0\x3b\x68\xf0\x46\x8c\x2a"
+ "\x7a\xc1\x8f\xa0\xfe\x6a\x5b\x44"
+ "\x70\x5c\xcc\x92\x2c\x6f\x0f\xbd"
+ "\x25\x3e\xb7\x8e\x73\x58\xda\xc9"
+ "\xa5\xaa\x9e\xf3\x9b\xfd\x37\x3e"
+ "\xe2\x88\xa4\x7b\xc8\x5c\xa8\x93"
+ "\x0e\xe7\x9a\x9c\x2e\x95\x18\x9f"
+ "\xc8\x45\x0c\x88\x9e\x53\x4f\x3a"
+ "\x76\xc1\x35\xfa\x17\xd8\xac\xa0"
+ "\x0c\x2d\x47\x2e\x4f\x69\x9b\xf7"
+ "\xd0\xb6\x96\x0c\x19\xb3\x08\x01"
+ "\x65\x7a\x1f\xc7\x31\x86\xdb\xc8"
+ "\xc1\x99\x8f\xf8\x08\x4a\x9d\x23"
+ "\x22\xa8\xcf\x27\x01\x01\x88\x93"
+ "\x9c\x86\x45\xbd\xe0\x51\xca\x52"
+ "\x84\xba\xfe\x03\xf7\xda\xc5\xce"
+ "\x3e\x77\x75\x86\xaf\x84\xc8\x05"
+ "\x44\x01\x0f\x02\xf3\x58\xb0\x06"
+ "\x5a\xd7\x12\x30\x8d\xdf\x1f\x1f"
+ "\x0a\xe6\xd2\xea\xf6\x3a\x7a\x99"
+ "\x63\xe8\xd2\xc1\x4a\x45\x8b\x40"
+ "\x4d\x0a\xa9\x76\x92\xb3\xda\x87"
+ "\x36\x33\xf0\x78\xc3\x2f\x5f\x02"
+ "\x1a\x6a\x2c\x32\xcd\x76\xbf\xbd"
+ "\x5a\x26\x20\x28\x8c\x8c\xbc\x52"
+ "\x3d\x0a\xc9\xcb\xab\xa4\x21\xb0"
+ "\x54\x40\x81\x44\xc7\xd6\x1c\x11"
+ "\x44\xc6\x02\x92\x14\x5a\xbf\x1a"
+ "\x09\x8a\x18\xad\xcd\x64\x3d\x53"
+ "\x4a\xb6\xa5\x1b\x57\x0e\xef\xe0"
+ "\x8c\x44\x5f\x7d\xbd\x6c\xfd\x60"
+ "\xae\x02\x24\xb6\x99\xdd\x8c\xaf"
+ "\x59\x39\x75\x3c\xd1\x54\x7b\x86"
+ "\xcc\x99\xd9\x28\x0c\xb0\x94\x62"
+ "\xf9\x51\xd1\x19\x96\x2d\x66\xf5"
+ "\x55\xcf\x9e\x59\xe2\x6b\x2c\x08"
+ "\xc0\x54\x48\x24\x45\xc3\x8c\x73"
+ "\xea\x27\x6e\x66\x7d\x1d\x0e\x6e"
+ "\x13\xe8\x56\x65\x3a\xb0\x81\x5c"
+ "\xf0\xe8\xd8\x00\x6b\xcd\x8f\xad"
+ "\xdd\x53\xf3\xa4\x6c\x43\xd6\x31"
+ "\xaf\xd2\x76\x1e\x91\x12\xdb\x3c"
+ "\x8c\xc2\x81\xf0\x49\xdb\xe2\x6b"
+ "\x76\x62\x0a\x04\xe4\xaa\x8a\x7c"
+ "\x08\x0b\x5d\xd0\xee\x1d\xfb\xc4"
+ "\x02\x75\x42\xd6\xba\xa7\x22\xa8"
+ "\x47\x29\xb7\x85\x6d\x93\x3a\xdb"
+ "\x00\x53\x0b\xa2\xeb\xf8\xfe\x01"
+ "\x6f\x8a\x31\xd6\x17\x05\x6f\x67"
+ "\x88\x95\x32\xfe\x4f\xa6\x4b\xf8"
+ "\x03\xe4\xcd\x9a\x18\xe8\x4e\x2d"
+ "\xf7\x97\x9a\x0c\x7d\x9f\x7e\x44"
+ "\x69\x51\xe0\x32\x6b\x62\x86\x8f"
+ "\xa6\x8e\x0b\x21\x96\xe5\xaf\x77"
+ "\xc0\x83\xdf\xa5\x0e\xd0\xa1\x04"
+ "\xaf\xc1\x10\xcb\x5a\x40\xe4\xe3"
+ "\x38\x7e\x07\xe8\x4d\xfa\xed\xc5"
+ "\xf0\x37\xdf\xbb\x8a\xcf\x3d\xdc"
+ "\x61\xd2\xc6\x2b\xff\x07\xc9\x2f"
+ "\x0c\x2d\x5c\x07\xa8\x35\x6a\xfc"
+ "\xae\x09\x03\x45\x74\x51\x4d\xc4"
+ "\xb8\x23\x87\x4a\x99\x27\x20\x87"
+ "\x62\x44\x0a\x4a\xce\x78\x47\x22",
+ .ksize = 1088,
+ .plaintext = "\x8e\xb0\x4c\xde\x9c\x4a\x04\x5a"
+ "\xf6\xa9\x7f\x45\x25\xa5\x7b\x3a"
+ "\xbc\x4d\x73\x39\x81\xb5\xbd\x3d"
+ "\x21\x6f\xd7\x37\x50\x3c\x7b\x28"
+ "\xd1\x03\x3a\x17\xed\x7b\x7c\x2a"
+ "\x16\xbc\xdf\x19\x89\x52\x71\x31"
+ "\xb6\xc0\xfd\xb5\xd3\xba\x96\x99"
+ "\xb6\x34\x0b\xd0\x99\x93\xfc\x1a"
+ "\x01\x3c\x85\xc6\x9b\x78\x5c\x8b"
+ "\xfe\xae\xd2\xbf\xb2\x6f\xf9\xed"
+ "\xc8\x25\x17\xfe\x10\x3b\x7d\xda"
+ "\xf4\x8d\x35\x4b\x7c\x7b\x82\xe7"
+ "\xc2\xb3\xee\x60\x4a\x03\x86\xc9"
+ "\x4e\xb5\xc4\xbe\xd2\xbd\x66\xf1"
+ "\x13\xf1\x09\xab\x5d\xca\x63\x1f"
+ "\xfc\xfb\x57\x2a\xfc\xca\x66\xd8"
+ "\x77\x84\x38\x23\x1d\xac\xd3\xb3"
+ "\x7a\xad\x4c\x70\xfa\x9c\xc9\x61"
+ "\xa6\x1b\xba\x33\x4b\x4e\x33\xec"
+ "\xa0\xa1\x64\x39\x40\x05\x1c\xc2"
+ "\x3f\x49\x9d\xae\xf2\xc5\xf2\xc5"
+ "\xfe\xe8\xf4\xc2\xf9\x96\x2d\x28"
+ "\x92\x30\x44\xbc\xd2\x7f\xe1\x6e"
+ "\x62\x02\x8f\x3d\x1c\x80\xda\x0e"
+ "\x6a\x90\x7e\x75\xff\xec\x3e\xc4"
+ "\xcd\x16\x34\x3b\x05\x6d\x4d\x20"
+ "\x1c\x7b\xf5\x57\x4f\xfa\x3d\xac"
+ "\xd0\x13\x55\xe8\xb3\xe1\x1b\x78"
+ "\x30\xe6\x9f\x84\xd4\x69\xd1\x08"
+ "\x12\x77\xa7\x4a\xbd\xc0\xf2\xd2"
+ "\x78\xdd\xa3\x81\x12\xcb\x6c\x14"
+ "\x90\x61\xe2\x84\xc6\x2b\x16\xcc"
+ "\x40\x99\x50\x88\x01\x09\x64\x4f"
+ "\x0a\x80\xbe\x61\xae\x46\xc9\x0a"
+ "\x5d\xe0\xfb\x72\x7a\x1a\xdd\x61"
+ "\x63\x20\x05\xa0\x4a\xf0\x60\x69"
+ "\x7f\x92\xbc\xbf\x4e\x39\x4d\xdd"
+ "\x74\xd1\xb7\xc0\x5a\x34\xb7\xae"
+ "\x76\x65\x2e\xbc\x36\xb9\x04\x95"
+ "\x42\xe9\x6f\xca\x78\xb3\x72\x07"
+ "\xa3\xba\x02\x94\x67\x4c\xb1\xd7"
+ "\xe9\x30\x0d\xf0\x3b\xb8\x10\x6d"
+ "\xea\x2b\x21\xbf\x74\x59\x82\x97"
+ "\x85\xaa\xf1\xd7\x54\x39\xeb\x05"
+ "\xbd\xf3\x40\xa0\x97\xe6\x74\xfe"
+ "\xb4\x82\x5b\xb1\x36\xcb\xe8\x0d"
+ "\xce\x14\xd9\xdf\xf1\x94\x22\xcd"
+ "\xd6\x00\xba\x04\x4c\x05\x0c\xc0"
+ "\xd1\x5a\xeb\x52\xd5\xa8\x8e\xc8"
+ "\x97\xa1\xaa\xc1\xea\xc1\xbe\x7c"
+ "\x36\xb3\x36\xa0\xc6\x76\x66\xc5"
+ "\xe2\xaf\xd6\x5c\xe2\xdb\x2c\xb3"
+ "\x6c\xb9\x99\x7f\xff\x9f\x03\x24"
+ "\xe1\x51\x44\x66\xd8\x0c\x5d\x7f"
+ "\x5c\x85\x22\x2a\xcf\x6d\x79\x28"
+ "\xab\x98\x01\x72\xfe\x80\x87\x5f"
+ "\x46\xba\xef\x81\x24\xee\xbf\xb0"
+ "\x24\x74\xa3\x65\x97\x12\xc4\xaf"
+ "\x8b\xa0\x39\xda\x8a\x7e\x74\x6e"
+ "\x1b\x42\xb4\x44\x37\xfc\x59\xfd"
+ "\x86\xed\xfb\x8c\x66\x33\xda\x63"
+ "\x75\xeb\xe1\xa4\x85\x4f\x50\x8f"
+ "\x83\x66\x0d\xd3\x37\xfa\xe6\x9c"
+ "\x4f\x30\x87\x35\x18\xe3\x0b\xb7"
+ "\x6e\x64\x54\xcd\x70\xb3\xde\x54"
+ "\xb7\x1d\xe6\x4c\x4d\x55\x12\x12"
+ "\xaf\x5f\x7f\x5e\xee\x9d\xe8\x8e"
+ "\x32\x9d\x4e\x75\xeb\xc6\xdd\xaa"
+ "\x48\x82\xa4\x3f\x3c\xd7\xd3\xa8"
+ "\x63\x9e\x64\xfe\xe3\x97\x00\x62"
+ "\xe5\x40\x5d\xc3\xad\x72\xe1\x28"
+ "\x18\x50\xb7\x75\xef\xcd\x23\xbf"
+ "\x3f\xc0\x51\x36\xf8\x41\xc3\x08"
+ "\xcb\xf1\x8d\x38\x34\xbd\x48\x45"
+ "\x75\xed\xbc\x65\x7b\xb5\x0c\x9b"
+ "\xd7\x67\x7d\x27\xb4\xc4\x80\xd7"
+ "\xa9\xb9\xc7\x4a\x97\xaa\xda\xc8"
+ "\x3c\x74\xcf\x36\x8f\xe4\x41\xe3"
+ "\xd4\xd3\x26\xa7\xf3\x23\x9d\x8f"
+ "\x6c\x20\x05\x32\x3e\xe0\xc3\xc8"
+ "\x56\x3f\xa7\x09\xb7\xfb\xc7\xf7"
+ "\xbe\x2a\xdd\x0f\x06\x7b\x0d\xdd"
+ "\xb0\xb4\x86\x17\xfd\xb9\x04\xe5"
+ "\xc0\x64\x5d\xad\x2a\x36\x38\xdb"
+ "\x24\xaf\x5b\xff\xca\xf9\x41\xe8"
+ "\xf9\x2f\x1e\x5e\xf9\xf5\xd5\xf2"
+ "\xb2\x88\xca\xc9\xa1\x31\xe2\xe8"
+ "\x10\x95\x65\xbf\xf1\x11\x61\x7a"
+ "\x30\x1a\x54\x90\xea\xd2\x30\xf6"
+ "\xa5\xad\x60\xf9\x4d\x84\x21\x1b"
+ "\xe4\x42\x22\xc8\x12\x4b\xb0\x58"
+ "\x3e\x9c\x2d\x32\x95\x0a\x8e\xb0"
+ "\x0a\x7e\x77\x2f\xe8\x97\x31\x6a"
+ "\xf5\x59\xb4\x26\xe6\x37\x12\xc9"
+ "\xcb\xa0\x58\x33\x6f\xd5\x55\x55"
+ "\x3c\xa1\x33\xb1\x0b\x7e\x2e\xb4"
+ "\x43\x2a\x84\x39\xf0\x9c\xf4\x69"
+ "\x4f\x1e\x79\xa6\x15\x1b\x87\xbb"
+ "\xdb\x9b\xe0\xf1\x0b\xba\xe3\x6e"
+ "\xcc\x2f\x49\x19\x22\x29\xfc\x71"
+ "\xbb\x77\x38\x18\x61\xaf\x85\x76"
+ "\xeb\xd1\x09\xcc\x86\x04\x20\x9a"
+ "\x66\x53\x2f\x44\x8b\xc6\xa3\xd2"
+ "\x5f\xc7\x79\x82\x66\xa8\x6e\x75"
+ "\x7d\x94\xd1\x86\x75\x0f\xa5\x4f"
+ "\x3c\x7a\x33\xce\xd1\x6e\x9d\x7b"
+ "\x1f\x91\x37\xb8\x37\x80\xfb\xe0"
+ "\x52\x26\xd0\x9a\xd4\x48\x02\x41"
+ "\x05\xe3\x5a\x94\xf1\x65\x61\x19"
+ "\xb8\x88\x4e\x2b\xea\xba\x8b\x58"
+ "\x8b\x42\x01\x00\xa8\xfe\x00\x5c"
+ "\xfe\x1c\xee\x31\x15\x69\xfa\xb3"
+ "\x9b\x5f\x22\x8e\x0d\x2c\xe3\xa5"
+ "\x21\xb9\x99\x8a\x8e\x94\x5a\xef"
+ "\x13\x3e\x99\x96\x79\x6e\xd5\x42"
+ "\x36\x03\xa9\xe2\xca\x65\x4e\x8a"
+ "\x8a\x30\xd2\x7d\x74\xe7\xf0\xaa"
+ "\x23\x26\xdd\xcb\x82\x39\xfc\x9d"
+ "\x51\x76\x21\x80\xa2\xbe\x93\x03"
+ "\x47\xb0\xc1\xb6\xdc\x63\xfd\x9f"
+ "\xca\x9d\xa5\xca\x27\x85\xe2\xd8"
+ "\x15\x5b\x7e\x14\x7a\xc4\x89\xcc"
+ "\x74\x14\x4b\x46\xd2\xce\xac\x39"
+ "\x6b\x6a\x5a\xa4\x0e\xe3\x7b\x15"
+ "\x94\x4b\x0f\x74\xcb\x0c\x7f\xa9"
+ "\xbe\x09\x39\xa3\xdd\x56\x5c\xc7"
+ "\x99\x56\x65\x39\xf4\x0b\x7d\x87"
+ "\xec\xaa\xe3\x4d\x22\x65\x39\x4e",
+ .psize = 1024,
+ .digest = "\x64\x3a\xbc\xc3\x3f\x74\x40\x51"
+ "\x6e\x56\x01\x1a\x51\xec\x36\xde",
+ .np = 8,
+ .tap = { 64, 203, 267, 28, 263, 62, 54, 83 },
+ }, {
+ .key = "\x1b\x82\x2e\x1b\x17\x23\xb9\x6d"
+ "\xdc\x9c\xda\x99\x07\xe3\x5f\xd8"
+ "\xd2\xf8\x43\x80\x8d\x86\x7d\x80"
+ "\x1a\xd0\xcc\x13\xb9\x11\x05\x3f"
+ "\x7e\xcf\x7e\x80\x0e\xd8\x25\x48"
+ "\x8b\xaa\x63\x83\x92\xd0\x72\xf5"
+ "\x4f\x67\x7e\x50\x18\x25\xa4\xd1"
+ "\xe0\x7e\x1e\xba\xd8\xa7\x6e\xdb"
+ "\x1a\xcc\x0d\xfe\x9f\x6d\x22\x35"
+ "\xe1\xe6\xe0\xa8\x7b\x9c\xb1\x66"
+ "\xa3\xf8\xff\x4d\x90\x84\x28\xbc"
+ "\xdc\x19\xc7\x91\x49\xfc\xf6\x33"
+ "\xc9\x6e\x65\x7f\x28\x6f\x68\x2e"
+ "\xdf\x1a\x75\xe9\xc2\x0c\x96\xb9"
+ "\x31\x22\xc4\x07\xc6\x0a\x2f\xfd"
+ "\x36\x06\x5f\x5c\xc5\xb1\x3a\xf4"
+ "\x5e\x48\xa4\x45\x2b\x88\xa7\xee"
+ "\xa9\x8b\x52\xcc\x99\xd9\x2f\xb8"
+ "\xa4\x58\x0a\x13\xeb\x71\x5a\xfa"
+ "\xe5\x5e\xbe\xf2\x64\xad\x75\xbc"
+ "\x0b\x5b\x34\x13\x3b\x23\x13\x9a"
+ "\x69\x30\x1e\x9a\xb8\x03\xb8\x8b"
+ "\x3e\x46\x18\x6d\x38\xd9\xb3\xd8"
+ "\xbf\xf1\xd0\x28\xe6\x51\x57\x80"
+ "\x5e\x99\xfb\xd0\xce\x1e\x83\xf7"
+ "\xe9\x07\x5a\x63\xa9\xef\xce\xa5"
+ "\xfb\x3f\x37\x17\xfc\x0b\x37\x0e"
+ "\xbb\x4b\x21\x62\xb7\x83\x0e\xa9"
+ "\x9e\xb0\xc4\xad\x47\xbe\x35\xe7"
+ "\x51\xb2\xf2\xac\x2b\x65\x7b\x48"
+ "\xe3\x3f\x5f\xb6\x09\x04\x0c\x58"
+ "\xce\x99\xa9\x15\x2f\x4e\xc1\xf2"
+ "\x24\x48\xc0\xd8\x6c\xd3\x76\x17"
+ "\x83\x5d\xe6\xe3\xfd\x01\x8e\xf7"
+ "\x42\xa5\x04\x29\x30\xdf\xf9\x00"
+ "\x4a\xdc\x71\x22\x1a\x33\x15\xb6"
+ "\xd7\x72\xfb\x9a\xb8\xeb\x2b\x38"
+ "\xea\xa8\x61\xa8\x90\x11\x9d\x73"
+ "\x2e\x6c\xce\x81\x54\x5a\x9f\xcd"
+ "\xcf\xd5\xbd\x26\x5d\x66\xdb\xfb"
+ "\xdc\x1e\x7c\x10\xfe\x58\x82\x10"
+ "\x16\x24\x01\xce\x67\x55\x51\xd1"
+ "\xdd\x6b\x44\xa3\x20\x8e\xa9\xa6"
+ "\x06\xa8\x29\x77\x6e\x00\x38\x5b"
+ "\xde\x4d\x58\xd8\x1f\x34\xdf\xf9"
+ "\x2c\xac\x3e\xad\xfb\x92\x0d\x72"
+ "\x39\xa4\xac\x44\x10\xc0\x43\xc4"
+ "\xa4\x77\x3b\xfc\xc4\x0d\x37\xd3"
+ "\x05\x84\xda\x53\x71\xf8\x80\xd3"
+ "\x34\x44\xdb\x09\xb4\x2b\x8e\xe3"
+ "\x00\x75\x50\x9e\x43\x22\x00\x0b"
+ "\x7c\x70\xab\xd4\x41\xf1\x93\xcd"
+ "\x25\x2d\x84\x74\xb5\xf2\x92\xcd"
+ "\x0a\x28\xea\x9a\x49\x02\x96\xcb"
+ "\x85\x9e\x2f\x33\x03\x86\x1d\xdc"
+ "\x1d\x31\xd5\xfc\x9d\xaa\xc5\xe9"
+ "\x9a\xc4\x57\xf5\x35\xed\xf4\x4b"
+ "\x3d\x34\xc2\x29\x13\x86\x36\x42"
+ "\x5d\xbf\x90\x86\x13\x77\xe5\xc3"
+ "\x62\xb4\xfe\x0b\x70\x39\x35\x65"
+ "\x02\xea\xf6\xce\x57\x0c\xbb\x74"
+ "\x29\xe3\xfd\x60\x90\xfd\x10\x38"
+ "\xd5\x4e\x86\xbd\x37\x70\xf0\x97"
+ "\xa6\xab\x3b\x83\x64\x52\xca\x66"
+ "\x2f\xf9\xa4\xca\x3a\x55\x6b\xb0"
+ "\xe8\x3a\x34\xdb\x9e\x48\x50\x2f"
+ "\x3b\xef\xfd\x08\x2d\x5f\xc1\x37"
+ "\x5d\xbe\x73\xe4\xd8\xe9\xac\xca"
+ "\x8a\xaa\x48\x7c\x5c\xf4\xa6\x96"
+ "\x5f\xfa\x70\xa6\xb7\x8b\x50\xcb"
+ "\xa6\xf5\xa9\xbd\x7b\x75\x4c\x22"
+ "\x0b\x19\x40\x2e\xc9\x39\x39\x32"
+ "\x83\x03\xa8\xa4\x98\xe6\x8e\x16"
+ "\xb9\xde\x08\xc5\xfc\xbf\xad\x39"
+ "\xa8\xc7\x93\x6c\x6f\x23\xaf\xc1"
+ "\xab\xe1\xdf\xbb\x39\xae\x93\x29"
+ "\x0e\x7d\x80\x8d\x3e\x65\xf3\xfd"
+ "\x96\x06\x65\x90\xa1\x28\x64\x4b"
+ "\x69\xf9\xa8\x84\x27\x50\xfc\x87"
+ "\xf7\xbf\x55\x8e\x56\x13\x58\x7b"
+ "\x85\xb4\x6a\x72\x0f\x40\xf1\x4f"
+ "\x83\x81\x1f\x76\xde\x15\x64\x7a"
+ "\x7a\x80\xe4\xc7\x5e\x63\x01\x91"
+ "\xd7\x6b\xea\x0b\x9b\xa2\x99\x3b"
+ "\x6c\x88\xd8\xfd\x59\x3c\x8d\x22"
+ "\x86\x56\xbe\xab\xa1\x37\x08\x01"
+ "\x50\x85\x69\x29\xee\x9f\xdf\x21"
+ "\x3e\x20\x20\xf5\xb0\xbb\x6b\xd0"
+ "\x9c\x41\x38\xec\x54\x6f\x2d\xbd"
+ "\x0f\xe1\xbd\xf1\x2b\x6e\x60\x56"
+ "\x29\xe5\x7a\x70\x1c\xe2\xfc\x97"
+ "\x82\x68\x67\xd9\x3d\x1f\xfb\xd8"
+ "\x07\x9f\xbf\x96\x74\xba\x6a\x0e"
+ "\x10\x48\x20\xd8\x13\x1e\xb5\x44"
+ "\xf2\xcc\xb1\x8b\xfb\xbb\xec\xd7"
+ "\x37\x70\x1f\x7c\x55\xd2\x4b\xb9"
+ "\xfd\x70\x5e\xa3\x91\x73\x63\x52"
+ "\x13\x47\x5a\x06\xfb\x01\x67\xa5"
+ "\xc0\xd0\x49\x19\x56\x66\x9a\x77"
+ "\x64\xaf\x8c\x25\x91\x52\x87\x0e"
+ "\x18\xf3\x5f\x97\xfd\x71\x13\xf8"
+ "\x05\xa5\x39\xcc\x65\xd3\xcc\x63"
+ "\x5b\xdb\x5f\x7e\x5f\x6e\xad\xc4"
+ "\xf4\xa0\xc5\xc2\x2b\x4d\x97\x38"
+ "\x4f\xbc\xfa\x33\x17\xb4\x47\xb9"
+ "\x43\x24\x15\x8d\xd2\xed\x80\x68"
+ "\x84\xdb\x04\x80\xca\x5e\x6a\x35"
+ "\x2c\x2c\xe7\xc5\x03\x5f\x54\xb0"
+ "\x5e\x4f\x1d\x40\x54\x3d\x78\x9a"
+ "\xac\xda\x80\x27\x4d\x15\x4c\x1a"
+ "\x6e\x80\xc9\xc4\x3b\x84\x0e\xd9"
+ "\x2e\x93\x01\x8c\xc3\xc8\x91\x4b"
+ "\xb3\xaa\x07\x04\x68\x5b\x93\xa5"
+ "\xe7\xc4\x9d\xe7\x07\xee\xf5\x3b"
+ "\x40\x89\xcc\x60\x34\x9d\xb4\x06"
+ "\x1b\xef\x92\xe6\xc1\x2a\x7d\x0f"
+ "\x81\xaa\x56\xe3\xd7\xed\xa7\xd4"
+ "\xa7\x3a\x49\xc4\xad\x81\x5c\x83"
+ "\x55\x8e\x91\x54\xb7\x7d\x65\xa5"
+ "\x06\x16\xd5\x9a\x16\xc1\xb0\xa2"
+ "\x06\xd8\x98\x47\x73\x7e\x73\xa0"
+ "\xb8\x23\xb1\x52\xbf\x68\x74\x5d"
+ "\x0b\xcb\xfa\x8c\x46\xe3\x24\xe6"
+ "\xab\xd4\x69\x8d\x8c\xf2\x8a\x59"
+ "\xbe\x48\x46\x50\x8c\x9a\xe8\xe3"
+ "\x31\x55\x0a\x06\xed\x4f\xf8\xb7"
+ "\x4f\xe3\x85\x17\x30\xbd\xd5\x20"
+ "\xe7\x5b\xb2\x32\xcf\x6b\x16\x44"
+ "\xd2\xf5\x7e\xd7\xd1\x2f\xee\x64"
+ "\x3e\x9d\x10\xef\x27\x35\x43\x64"
+ "\x67\xfb\x7a\x7b\xe0\x62\x31\x9a"
+ "\x4d\xdf\xa5\xab\xc0\x20\xbb\x01"
+ "\xe9\x7b\x54\xf1\xde\xb2\x79\x50"
+ "\x6c\x4b\x91\xdb\x7f\xbb\x50\xc1"
+ "\x55\x44\x38\x9a\xe0\x9f\xe8\x29"
+ "\x6f\x15\xf8\x4e\xa6\xec\xa0\x60",
+ .ksize = 1088,
+ .plaintext = "\x15\x68\x9e\x2f\xad\x15\x52\xdf"
+ "\xf0\x42\x62\x24\x2a\x2d\xea\xbf"
+ "\xc7\xf3\xb4\x1a\xf5\xed\xb2\x08"
+ "\x15\x60\x1c\x00\x77\xbf\x0b\x0e"
+ "\xb7\x2c\xcf\x32\x3a\xc7\x01\x77"
+ "\xef\xa6\x75\xd0\x29\xc7\x68\x20"
+ "\xb2\x92\x25\xbf\x12\x34\xe9\xa4"
+ "\xfd\x32\x7b\x3f\x7c\xbd\xa5\x02"
+ "\x38\x41\xde\xc9\xc1\x09\xd9\xfc"
+ "\x6e\x78\x22\x83\x18\xf7\x50\x8d"
+ "\x8f\x9c\x2d\x02\xa5\x30\xac\xff"
+ "\xea\x63\x2e\x80\x37\x83\xb0\x58"
+ "\xda\x2f\xef\x21\x55\xba\x7b\xb1"
+ "\xb6\xed\xf5\xd2\x4d\xaa\x8c\xa9"
+ "\xdd\xdb\x0f\xb4\xce\xc1\x9a\xb1"
+ "\xc1\xdc\xbd\xab\x86\xc2\xdf\x0b"
+ "\xe1\x2c\xf9\xbe\xf6\xd8\xda\x62"
+ "\x72\xdd\x98\x09\x52\xc0\xc4\xb6"
+ "\x7b\x17\x5c\xf5\xd8\x4b\x88\xd6"
+ "\x6b\xbf\x84\x4a\x3f\xf5\x4d\xd2"
+ "\x94\xe2\x9c\xff\xc7\x3c\xd9\xc8"
+ "\x37\x38\xbc\x8c\xf3\xe7\xb7\xd0"
+ "\x1d\x78\xc4\x39\x07\xc8\x5e\x79"
+ "\xb6\x5a\x90\x5b\x6e\x97\xc9\xd4"
+ "\x82\x9c\xf3\x83\x7a\xe7\x97\xfc"
+ "\x1d\xbb\xef\xdb\xce\xe0\x82\xad"
+ "\xca\x07\x6c\x54\x62\x6f\x81\xe6"
+ "\x7a\x5a\x96\x6e\x80\x3a\xa2\x37"
+ "\x6f\xc6\xa4\x29\xc3\x9e\x19\x94"
+ "\x9f\xb0\x3e\x38\xfb\x3c\x2b\x7d"
+ "\xaa\xb8\x74\xda\x54\x23\x51\x12"
+ "\x4b\x96\x36\x8f\x91\x4f\x19\x37"
+ "\x83\xc9\xdd\xc7\x1a\x32\x2d\xab"
+ "\xc7\x89\xe2\x07\x47\x6c\xe8\xa6"
+ "\x70\x6b\x8e\x0c\xda\x5c\x6a\x59"
+ "\x27\x33\x0e\xe1\xe1\x20\xe8\xc8"
+ "\xae\xdc\xd0\xe3\x6d\xa8\xa6\x06"
+ "\x41\xb4\xd4\xd4\xcf\x91\x3e\x06"
+ "\xb0\x9a\xf7\xf1\xaa\xa6\x23\x92"
+ "\x10\x86\xf0\x94\xd1\x7c\x2e\x07"
+ "\x30\xfb\xc5\xd8\xf3\x12\xa9\xe8"
+ "\x22\x1c\x97\x1a\xad\x96\xb0\xa1"
+ "\x72\x6a\x6b\xb4\xfd\xf7\xe8\xfa"
+ "\xe2\x74\xd8\x65\x8d\x35\x17\x4b"
+ "\x00\x23\x5c\x8c\x70\xad\x71\xa2"
+ "\xca\xc5\x6c\x59\xbf\xb4\xc0\x6d"
+ "\x86\x98\x3e\x19\x5a\x90\x92\xb1"
+ "\x66\x57\x6a\x91\x68\x7c\xbc\xf3"
+ "\xf1\xdb\x94\xf8\x48\xf1\x36\xd8"
+ "\x78\xac\x1c\xa9\xcc\xd6\x27\xba"
+ "\x91\x54\x22\xf5\xe6\x05\x3f\xcc"
+ "\xc2\x8f\x2c\x3b\x2b\xc3\x2b\x2b"
+ "\x3b\xb8\xb6\x29\xb7\x2f\x94\xb6"
+ "\x7b\xfc\x94\x3e\xd0\x7a\x41\x59"
+ "\x7b\x1f\x9a\x09\xa6\xed\x4a\x82"
+ "\x9d\x34\x1c\xbd\x4e\x1c\x3a\x66"
+ "\x80\x74\x0e\x9a\x4f\x55\x54\x47"
+ "\x16\xba\x2a\x0a\x03\x35\x99\xa3"
+ "\x5c\x63\x8d\xa2\x72\x8b\x17\x15"
+ "\x68\x39\x73\xeb\xec\xf2\xe8\xf5"
+ "\x95\x32\x27\xd6\xc4\xfe\xb0\x51"
+ "\xd5\x0c\x50\xc5\xcd\x6d\x16\xb3"
+ "\xa3\x1e\x95\x69\xad\x78\x95\x06"
+ "\xb9\x46\xf2\x6d\x24\x5a\x99\x76"
+ "\x73\x6a\x91\xa6\xac\x12\xe1\x28"
+ "\x79\xbc\x08\x4e\x97\x00\x98\x63"
+ "\x07\x1c\x4e\xd1\x68\xf3\xb3\x81"
+ "\xa8\xa6\x5f\xf1\x01\xc9\xc1\xaf"
+ "\x3a\x96\xf9\x9d\xb5\x5a\x5f\x8f"
+ "\x7e\xc1\x7e\x77\x0a\x40\xc8\x8e"
+ "\xfc\x0e\xed\xe1\x0d\xb0\xe5\x5e"
+ "\x5e\x6f\xf5\x7f\xab\x33\x7d\xcd"
+ "\xf0\x09\x4b\xb2\x11\x37\xdc\x65"
+ "\x97\x32\x62\x71\x3a\x29\x54\xb9"
+ "\xc7\xa4\xbf\x75\x0f\xf9\x40\xa9"
+ "\x8d\xd7\x8b\xa7\xe0\x9a\xbe\x15"
+ "\xc6\xda\xd8\x00\x14\x69\x1a\xaf"
+ "\x5f\x79\xc3\xf5\xbb\x6c\x2a\x9d"
+ "\xdd\x3c\x5f\x97\x21\xe1\x3a\x03"
+ "\x84\x6a\xe9\x76\x11\x1f\xd3\xd5"
+ "\xf0\x54\x20\x4d\xc2\x91\xc3\xa4"
+ "\x36\x25\xbe\x1b\x2a\x06\xb7\xf3"
+ "\xd1\xd0\x55\x29\x81\x4c\x83\xa3"
+ "\xa6\x84\x1e\x5c\xd1\xd0\x6c\x90"
+ "\xa4\x11\xf0\xd7\x63\x6a\x48\x05"
+ "\xbc\x48\x18\x53\xcd\xb0\x8d\xdb"
+ "\xdc\xfe\x55\x11\x5c\x51\xb3\xab"
+ "\xab\x63\x3e\x31\x5a\x8b\x93\x63"
+ "\x34\xa9\xba\x2b\x69\x1a\xc0\xe3"
+ "\xcb\x41\xbc\xd7\xf5\x7f\x82\x3e"
+ "\x01\xa3\x3c\x72\xf4\xfe\xdf\xbe"
+ "\xb1\x67\x17\x2b\x37\x60\x0d\xca"
+ "\x6f\xc3\x94\x2c\xd2\x92\x6d\x9d"
+ "\x75\x18\x77\xaa\x29\x38\x96\xed"
+ "\x0e\x20\x70\x92\xd5\xd0\xb4\x00"
+ "\xc0\x31\xf2\xc9\x43\x0e\x75\x1d"
+ "\x4b\x64\xf2\x1f\xf2\x29\x6c\x7b"
+ "\x7f\xec\x59\x7d\x8c\x0d\xd4\xd3"
+ "\xac\x53\x4c\xa3\xde\x42\x92\x95"
+ "\x6d\xa3\x4f\xd0\xe6\x3d\xe7\xec"
+ "\x7a\x4d\x68\xf1\xfe\x67\x66\x09"
+ "\x83\x22\xb1\x98\x43\x8c\xab\xb8"
+ "\x45\xe6\x6d\xdf\x5e\x50\x71\xce"
+ "\xf5\x4e\x40\x93\x2b\xfa\x86\x0e"
+ "\xe8\x30\xbd\x82\xcc\x1c\x9c\x5f"
+ "\xad\xfd\x08\x31\xbe\x52\xe7\xe6"
+ "\xf2\x06\x01\x62\x25\x15\x99\x74"
+ "\x33\x51\x52\x57\x3f\x57\x87\x61"
+ "\xb9\x7f\x29\x3d\xcd\x92\x5e\xa6"
+ "\x5c\x3b\xf1\xed\x5f\xeb\x82\xed"
+ "\x56\x7b\x61\xe7\xfd\x02\x47\x0e"
+ "\x2a\x15\xa4\xce\x43\x86\x9b\xe1"
+ "\x2b\x4c\x2a\xd9\x42\x97\xf7\x9a"
+ "\xe5\x47\x46\x48\xd3\x55\x6f\x4d"
+ "\xd9\xeb\x4b\xdd\x7b\x21\x2f\xb3"
+ "\xa8\x36\x28\xdf\xca\xf1\xf6\xd9"
+ "\x10\xf6\x1c\xfd\x2e\x0c\x27\xe0"
+ "\x01\xb3\xff\x6d\x47\x08\x4d\xd4"
+ "\x00\x25\xee\x55\x4a\xe9\xe8\x5b"
+ "\xd8\xf7\x56\x12\xd4\x50\xb2\xe5"
+ "\x51\x6f\x34\x63\x69\xd2\x4e\x96"
+ "\x4e\xbc\x79\xbf\x18\xae\xc6\x13"
+ "\x80\x92\x77\xb0\xb4\x0f\x29\x94"
+ "\x6f\x4c\xbb\x53\x11\x36\xc3\x9f"
+ "\x42\x8e\x96\x8a\x91\xc8\xe9\xfc"
+ "\xfe\xbf\x7c\x2d\x6f\xf9\xb8\x44"
+ "\x89\x1b\x09\x53\x0a\x2a\x92\xc3"
+ "\x54\x7a\x3a\xf9\xe2\xe4\x75\x87"
+ "\xa0\x5e\x4b\x03\x7a\x0d\x8a\xf4"
+ "\x55\x59\x94\x2b\x63\x96\x0e\xf5",
+ .psize = 1040,
+ .digest = "\xb5\xb9\x08\xb3\x24\x3e\x03\xf0"
+ "\xd6\x0b\x57\xbc\x0a\x6d\x89\x59",
+ }, {
+ .key = "\xf6\x34\x42\x71\x35\x52\x8b\x58"
+ "\x02\x3a\x8e\x4a\x8d\x41\x13\xe9"
+ "\x7f\xba\xb9\x55\x9d\x73\x4d\xf8"
+ "\x3f\x5d\x73\x15\xff\xd3\x9e\x7f"
+ "\x20\x2a\x6a\xa8\xd1\xf0\x8f\x12"
+ "\x6b\x02\xd8\x6c\xde\xba\x80\x22"
+ "\x19\x37\xc8\xd0\x4e\x89\x17\x7c"
+ "\x7c\xdd\x88\xfd\x41\xc0\x04\xb7"
+ "\x1d\xac\x19\xe3\x20\xc7\x16\xcf"
+ "\x58\xee\x1d\x7a\x61\x69\xa9\x12"
+ "\x4b\xef\x4f\xb6\x38\xdd\x78\xf8"
+ "\x28\xee\x70\x08\xc7\x7c\xcc\xc8"
+ "\x1e\x41\xf5\x80\x86\x70\xd0\xf0"
+ "\xa3\x87\x6b\x0a\x00\xd2\x41\x28"
+ "\x74\x26\xf1\x24\xf3\xd0\x28\x77"
+ "\xd7\xcd\xf6\x2d\x61\xf4\xa2\x13"
+ "\x77\xb4\x6f\xa0\xf4\xfb\xd6\xb5"
+ "\x38\x9d\x5a\x0c\x51\xaf\xad\x63"
+ "\x27\x67\x8c\x01\xea\x42\x1a\x66"
+ "\xda\x16\x7c\x3c\x30\x0c\x66\x53"
+ "\x1c\x88\xa4\x5c\xb2\xe3\x78\x0a"
+ "\x13\x05\x6d\xe2\xaf\xb3\xe4\x75"
+ "\x00\x99\x58\xee\x76\x09\x64\xaa"
+ "\xbb\x2e\xb1\x81\xec\xd8\x0e\xd3"
+ "\x0c\x33\x5d\xb7\x98\xef\x36\xb6"
+ "\xd2\x65\x69\x41\x70\x12\xdc\x25"
+ "\x41\x03\x99\x81\x41\x19\x62\x13"
+ "\xd1\x0a\x29\xc5\x8c\xe0\x4c\xf3"
+ "\xd6\xef\x4c\xf4\x1d\x83\x2e\x6d"
+ "\x8e\x14\x87\xed\x80\xe0\xaa\xd3"
+ "\x08\x04\x73\x1a\x84\x40\xf5\x64"
+ "\xbd\x61\x32\x65\x40\x42\xfb\xb0"
+ "\x40\xf6\x40\x8d\xc7\x7f\x14\xd0"
+ "\x83\x99\xaa\x36\x7e\x60\xc6\xbf"
+ "\x13\x8a\xf9\x21\xe4\x7e\x68\x87"
+ "\xf3\x33\x86\xb4\xe0\x23\x7e\x0a"
+ "\x21\xb1\xf5\xad\x67\x3c\x9c\x9d"
+ "\x09\xab\xaf\x5f\xba\xe0\xd0\x82"
+ "\x48\x22\x70\xb5\x6d\x53\xd6\x0e"
+ "\xde\x64\x92\x41\xb0\xd3\xfb\xda"
+ "\x21\xfe\xab\xea\x20\xc4\x03\x58"
+ "\x18\x2e\x7d\x2f\x03\xa9\x47\x66"
+ "\xdf\x7b\xa4\x6b\x34\x6b\x55\x9c"
+ "\x4f\xd7\x9c\x47\xfb\xa9\x42\xec"
+ "\x5a\x12\xfd\xfe\x76\xa0\x92\x9d"
+ "\xfe\x1e\x16\xdd\x24\x2a\xe4\x27"
+ "\xd5\xa9\xf2\x05\x4f\x83\xa2\xaf"
+ "\xfe\xee\x83\x7a\xad\xde\xdf\x9a"
+ "\x80\xd5\x81\x14\x93\x16\x7e\x46"
+ "\x47\xc2\x14\xef\x49\x6e\xb9\xdb"
+ "\x40\xe8\x06\x6f\x9c\x2a\xfd\x62"
+ "\x06\x46\xfd\x15\x1d\x36\x61\x6f"
+ "\x77\x77\x5e\x64\xce\x78\x1b\x85"
+ "\xbf\x50\x9a\xfd\x67\xa6\x1a\x65"
+ "\xad\x5b\x33\x30\xf1\x71\xaa\xd9"
+ "\x23\x0d\x92\x24\x5f\xae\x57\xb0"
+ "\x24\x37\x0a\x94\x12\xfb\xb5\xb1"
+ "\xd3\xb8\x1d\x12\x29\xb0\x80\x24"
+ "\x2d\x47\x9f\x96\x1f\x95\xf1\xb1"
+ "\xda\x35\xf6\x29\xe0\xe1\x23\x96"
+ "\xc7\xe8\x22\x9b\x7c\xac\xf9\x41"
+ "\x39\x01\xe5\x73\x15\x5e\x99\xec"
+ "\xb4\xc1\xf4\xe7\xa7\x97\x6a\xd5"
+ "\x90\x9a\xa0\x1d\xf3\x5a\x8b\x5f"
+ "\xdf\x01\x52\xa4\x93\x31\x97\xb0"
+ "\x93\x24\xb5\xbc\xb2\x14\x24\x98"
+ "\x4a\x8f\x19\x85\xc3\x2d\x0f\x74"
+ "\x9d\x16\x13\x80\x5e\x59\x62\x62"
+ "\x25\xe0\xd1\x2f\x64\xef\xba\xac"
+ "\xcd\x09\x07\x15\x8a\xcf\x73\xb5"
+ "\x8b\xc9\xd8\x24\xb0\x53\xd5\x6f"
+ "\xe1\x2b\x77\xb1\xc5\xe4\xa7\x0e"
+ "\x18\x45\xab\x36\x03\x59\xa8\xbd"
+ "\x43\xf0\xd8\x2c\x1a\x69\x96\xbb"
+ "\x13\xdf\x6c\x33\x77\xdf\x25\x34"
+ "\x5b\xa5\x5b\x8c\xf9\x51\x05\xd4"
+ "\x8b\x8b\x44\x87\x49\xfc\xa0\x8f"
+ "\x45\x15\x5b\x40\x42\xc4\x09\x92"
+ "\x98\x0c\x4d\xf4\x26\x37\x1b\x13"
+ "\x76\x01\x93\x8d\x4f\xe6\xed\x18"
+ "\xd0\x79\x7b\x3f\x44\x50\xcb\xee"
+ "\xf7\x4a\xc9\x9e\xe0\x96\x74\xa7"
+ "\xe6\x93\xb2\x53\xca\x55\xa8\xdc"
+ "\x1e\x68\x07\x87\xb7\x2e\xc1\x08"
+ "\xb2\xa4\x5b\xaf\xc6\xdb\x5c\x66"
+ "\x41\x1c\x51\xd9\xb0\x07\x00\x0d"
+ "\xf0\x4c\xdc\x93\xde\xa9\x1e\x8e"
+ "\xd3\x22\x62\xd8\x8b\x88\x2c\xea"
+ "\x5e\xf1\x6e\x14\x40\xc7\xbe\xaa"
+ "\x42\x28\xd0\x26\x30\x78\x01\x9b"
+ "\x83\x07\xbc\x94\xc7\x57\xa2\x9f"
+ "\x03\x07\xff\x16\xff\x3c\x6e\x48"
+ "\x0a\xd0\xdd\x4c\xf6\x64\x9a\xf1"
+ "\xcd\x30\x12\x82\x2c\x38\xd3\x26"
+ "\x83\xdb\xab\x3e\xc6\xf8\xe6\xfa"
+ "\x77\x0a\x78\x82\x75\xf8\x63\x51"
+ "\x59\xd0\x8d\x24\x9f\x25\xe6\xa3"
+ "\x4c\xbc\x34\xfc\xe3\x10\xc7\x62"
+ "\xd4\x23\xc8\x3d\xa7\xc6\xa6\x0a"
+ "\x4f\x7e\x29\x9d\x6d\xbe\xb5\xf1"
+ "\xdf\xa4\x53\xfa\xc0\x23\x0f\x37"
+ "\x84\x68\xd0\xb5\xc8\xc6\xae\xf8"
+ "\xb7\x8d\xb3\x16\xfe\x8f\x87\xad"
+ "\xd0\xc1\x08\xee\x12\x1c\x9b\x1d"
+ "\x90\xf8\xd1\x63\xa4\x92\x3c\xf0"
+ "\xc7\x34\xd8\xf1\x14\xed\xa3\xbc"
+ "\x17\x7e\xd4\x62\x42\x54\x57\x2c"
+ "\x3e\x7a\x35\x35\x17\x0f\x0b\x7f"
+ "\x81\xa1\x3f\xd0\xcd\xc8\x3b\x96"
+ "\xe9\xe0\x4a\x04\xe1\xb6\x3c\xa1"
+ "\xd6\xca\xc4\xbd\xb6\xb5\x95\x34"
+ "\x12\x9d\xc5\x96\xf2\xdf\xba\x54"
+ "\x76\xd1\xb2\x6b\x3b\x39\xe0\xb9"
+ "\x18\x62\xfb\xf7\xfc\x12\xf1\x5f"
+ "\x7e\xc7\xe3\x59\x4c\xa6\xc2\x3d"
+ "\x40\x15\xf9\xa3\x95\x64\x4c\x74"
+ "\x8b\x73\x77\x33\x07\xa7\x04\x1d"
+ "\x33\x5a\x7e\x8f\xbd\x86\x01\x4f"
+ "\x3e\xb9\x27\x6f\xe2\x41\xf7\x09"
+ "\x67\xfd\x29\x28\xc5\xe4\xf6\x18"
+ "\x4c\x1b\x49\xb2\x9c\x5b\xf6\x81"
+ "\x4f\xbb\x5c\xcc\x0b\xdf\x84\x23"
+ "\x58\xd6\x28\x34\x93\x3a\x25\x97"
+ "\xdf\xb2\xc3\x9e\x97\x38\x0b\x7d"
+ "\x10\xb3\x54\x35\x23\x8c\x64\xee"
+ "\xf0\xd8\x66\xff\x8b\x22\xd2\x5b"
+ "\x05\x16\x3c\x89\xf7\xb1\x75\xaf"
+ "\xc0\xae\x6a\x4f\x3f\xaf\x9a\xf4"
+ "\xf4\x9a\x24\xd9\x80\x82\xc0\x12"
+ "\xde\x96\xd1\xbe\x15\x0b\x8d\x6a"
+ "\xd7\x12\xe4\x85\x9f\x83\xc9\xc3"
+ "\xff\x0b\xb5\xaf\x3b\xd8\x6d\x67"
+ "\x81\x45\xe6\xac\xec\xc1\x7b\x16"
+ "\x18\x0a\xce\x4b\xc0\x2e\x76\xbc"
+ "\x1b\xfa\xb4\x34\xb8\xfc\x3e\xc8"
+ "\x5d\x90\x71\x6d\x7a\x79\xef\x06",
+ .ksize = 1088,
+ .plaintext = "\xaa\x5d\x54\xcb\xea\x1e\x46\x0f"
+ "\x45\x87\x70\x51\x8a\x66\x7a\x33"
+ "\xb4\x18\xff\xa9\x82\xf9\x45\x4b"
+ "\x93\xae\x2e\x7f\xab\x98\xfe\xbf"
+ "\x01\xee\xe5\xa0\x37\x8f\x57\xa6"
+ "\xb0\x76\x0d\xa4\xd6\x28\x2b\x5d"
+ "\xe1\x03\xd6\x1c\x6f\x34\x0d\xe7"
+ "\x61\x2d\x2e\xe5\xae\x5d\x47\xc7"
+ "\x80\x4b\x18\x8f\xa8\x99\xbc\x28"
+ "\xed\x1d\x9d\x86\x7d\xd7\x41\xd1"
+ "\xe0\x2b\xe1\x8c\x93\x2a\xa7\x80"
+ "\xe1\x07\xa0\xa9\x9f\x8c\x8d\x1a"
+ "\x55\xfc\x6b\x24\x7a\xbd\x3e\x51"
+ "\x68\x4b\x26\x59\xc8\xa7\x16\xd9"
+ "\xb9\x61\x13\xde\x8b\x63\x1c\xf6"
+ "\x60\x01\xfb\x08\xb3\x5b\x0a\xbf"
+ "\x34\x73\xda\x87\x87\x3d\x6f\x97"
+ "\x4a\x0c\xa3\x58\x20\xa2\xc0\x81"
+ "\x5b\x8c\xef\xa9\xc2\x01\x1e\x64"
+ "\x83\x8c\xbc\x03\xb6\xd0\x29\x9f"
+ "\x54\xe2\xce\x8b\xc2\x07\x85\x78"
+ "\x25\x38\x96\x4c\xb4\xbe\x17\x4a"
+ "\x65\xa6\xfa\x52\x9d\x66\x9d\x65"
+ "\x4a\xd1\x01\x01\xf0\xcb\x13\xcc"
+ "\xa5\x82\xf3\xf2\x66\xcd\x3f\x9d"
+ "\xd1\xaa\xe4\x67\xea\xf2\xad\x88"
+ "\x56\x76\xa7\x9b\x59\x3c\xb1\x5d"
+ "\x78\xfd\x69\x79\x74\x78\x43\x26"
+ "\x7b\xde\x3f\xf1\xf5\x4e\x14\xd9"
+ "\x15\xf5\x75\xb5\x2e\x19\xf3\x0c"
+ "\x48\x72\xd6\x71\x6d\x03\x6e\xaa"
+ "\xa7\x08\xf9\xaa\x70\xa3\x0f\x4d"
+ "\x12\x8a\xdd\xe3\x39\x73\x7e\xa7"
+ "\xea\x1f\x6d\x06\x26\x2a\xf2\xc5"
+ "\x52\xb4\xbf\xfd\x52\x0c\x06\x60"
+ "\x90\xd1\xb2\x7b\x56\xae\xac\x58"
+ "\x5a\x6b\x50\x2a\xf5\xe0\x30\x3c"
+ "\x2a\x98\x0f\x1b\x5b\x0a\x84\x6c"
+ "\x31\xae\x92\xe2\xd4\xbb\x7f\x59"
+ "\x26\x10\xb9\x89\x37\x68\x26\xbf"
+ "\x41\xc8\x49\xc4\x70\x35\x7d\xff"
+ "\x2d\x7f\xf6\x8a\x93\x68\x8c\x78"
+ "\x0d\x53\xce\x7d\xff\x7d\xfb\xae"
+ "\x13\x1b\x75\xc4\x78\xd7\x71\xd8"
+ "\xea\xd3\xf4\x9d\x95\x64\x8e\xb4"
+ "\xde\xb8\xe4\xa6\x68\xc8\xae\x73"
+ "\x58\xaf\xa8\xb0\x5a\x20\xde\x87"
+ "\x43\xb9\x0f\xe3\xad\x41\x4b\xd5"
+ "\xb7\xad\x16\x00\xa6\xff\xf6\x74"
+ "\xbf\x8c\x9f\xb3\x58\x1b\xb6\x55"
+ "\xa9\x90\x56\x28\xf0\xb5\x13\x4e"
+ "\x9e\xf7\x25\x86\xe0\x07\x7b\x98"
+ "\xd8\x60\x5d\x38\x95\x3c\xe4\x22"
+ "\x16\x2f\xb2\xa2\xaf\xe8\x90\x17"
+ "\xec\x11\x83\x1a\xf4\xa9\x26\xda"
+ "\x39\x72\xf5\x94\x61\x05\x51\xec"
+ "\xa8\x30\x8b\x2c\x13\xd0\x72\xac"
+ "\xb9\xd2\xa0\x4c\x4b\x78\xe8\x6e"
+ "\x04\x85\xe9\x04\x49\x82\x91\xff"
+ "\x89\xe5\xab\x4c\xaa\x37\x03\x12"
+ "\xca\x8b\x74\x10\xfd\x9e\xd9\x7b"
+ "\xcb\xdb\x82\x6e\xce\x2e\x33\x39"
+ "\xce\xd2\x84\x6e\x34\x71\x51\x6e"
+ "\x0d\xd6\x01\x87\xc7\xfa\x0a\xd3"
+ "\xad\x36\xf3\x4c\x9f\x96\x5e\x62"
+ "\x62\x54\xc3\x03\x78\xd6\xab\xdd"
+ "\x89\x73\x55\x25\x30\xf8\xa7\xe6"
+ "\x4f\x11\x0c\x7c\x0a\xa1\x2b\x7b"
+ "\x3d\x0d\xde\x81\xd4\x9d\x0b\xae"
+ "\xdf\x00\xf9\x4c\xb6\x90\x8e\x16"
+ "\xcb\x11\xc8\xd1\x2e\x73\x13\x75"
+ "\x75\x3e\xaa\xf5\xee\x02\xb3\x18"
+ "\xa6\x2d\xf5\x3b\x51\xd1\x1f\x47"
+ "\x6b\x2c\xdb\xc4\x10\xe0\xc8\xba"
+ "\x9d\xac\xb1\x9d\x75\xd5\x41\x0e"
+ "\x7e\xbe\x18\x5b\xa4\x1f\xf8\x22"
+ "\x4c\xc1\x68\xda\x6d\x51\x34\x6c"
+ "\x19\x59\xec\xb5\xb1\xec\xa7\x03"
+ "\xca\x54\x99\x63\x05\x6c\xb1\xac"
+ "\x9c\x31\xd6\xdb\xba\x7b\x14\x12"
+ "\x7a\xc3\x2f\xbf\x8d\xdc\x37\x46"
+ "\xdb\xd2\xbc\xd4\x2f\xab\x30\xd5"
+ "\xed\x34\x99\x8e\x83\x3e\xbe\x4c"
+ "\x86\x79\x58\xe0\x33\x8d\x9a\xb8"
+ "\xa9\xa6\x90\x46\xa2\x02\xb8\xdd"
+ "\xf5\xf9\x1a\x5c\x8c\x01\xaa\x6e"
+ "\xb4\x22\x12\xf5\x0c\x1b\x9b\x7a"
+ "\xc3\x80\xf3\x06\x00\x5f\x30\xd5"
+ "\x06\xdb\x7d\x82\xc2\xd4\x0b\x4c"
+ "\x5f\xe9\xc5\xf5\xdf\x97\x12\xbf"
+ "\x56\xaf\x9b\x69\xcd\xee\x30\xb4"
+ "\xa8\x71\xff\x3e\x7d\x73\x7a\xb4"
+ "\x0d\xa5\x46\x7a\xf3\xf4\x15\x87"
+ "\x5d\x93\x2b\x8c\x37\x64\xb5\xdd"
+ "\x48\xd1\xe5\x8c\xae\xd4\xf1\x76"
+ "\xda\xf4\xba\x9e\x25\x0e\xad\xa3"
+ "\x0d\x08\x7c\xa8\x82\x16\x8d\x90"
+ "\x56\x40\x16\x84\xe7\x22\x53\x3a"
+ "\x58\xbc\xb9\x8f\x33\xc8\xc2\x84"
+ "\x22\xe6\x0d\xe7\xb3\xdc\x5d\xdf"
+ "\xd7\x2a\x36\xe4\x16\x06\x07\xd2"
+ "\x97\x60\xb2\xf5\x5e\x14\xc9\xfd"
+ "\x8b\x05\xd1\xce\xee\x9a\x65\x99"
+ "\xb7\xae\x19\xb7\xc8\xbc\xd5\xa2"
+ "\x7b\x95\xe1\xcc\xba\x0d\xdc\x8a"
+ "\x1d\x59\x52\x50\xaa\x16\x02\x82"
+ "\xdf\x61\x33\x2e\x44\xce\x49\xc7"
+ "\xe5\xc6\x2e\x76\xcf\x80\x52\xf0"
+ "\x3d\x17\x34\x47\x3f\xd3\x80\x48"
+ "\xa2\xba\xd5\xc7\x7b\x02\x28\xdb"
+ "\xac\x44\xc7\x6e\x05\x5c\xc2\x79"
+ "\xb3\x7d\x6a\x47\x77\x66\xf1\x38"
+ "\xf0\xf5\x4f\x27\x1a\x31\xca\x6c"
+ "\x72\x95\x92\x8e\x3f\xb0\xec\x1d"
+ "\xc7\x2a\xff\x73\xee\xdf\x55\x80"
+ "\x93\xd2\xbd\x34\xd3\x9f\x00\x51"
+ "\xfb\x2e\x41\xba\x6c\x5a\x7c\x17"
+ "\x7f\xe6\x70\xac\x8d\x39\x3f\x77"
+ "\xe2\x23\xac\x8f\x72\x4e\xe4\x53"
+ "\xcc\xf1\x1b\xf1\x35\xfe\x52\xa4"
+ "\xd6\xb8\x40\x6b\xc1\xfd\xa0\xa1"
+ "\xf5\x46\x65\xc2\x50\xbb\x43\xe2"
+ "\xd1\x43\x28\x34\x74\xf5\x87\xa0"
+ "\xf2\x5e\x27\x3b\x59\x2b\x3e\x49"
+ "\xdf\x46\xee\xaf\x71\xd7\x32\x36"
+ "\xc7\x14\x0b\x58\x6e\x3e\x2d\x41"
+ "\xfa\x75\x66\x3a\x54\xe0\xb2\xb9"
+ "\xaf\xdd\x04\x80\x15\x19\x3f\x6f"
+ "\xce\x12\xb4\xd8\xe8\x89\x3c\x05"
+ "\x30\xeb\xf3\x3d\xcd\x27\xec\xdc"
+ "\x56\x70\x12\xcf\x78\x2b\x77\xbf"
+ "\x22\xf0\x1b\x17\x9c\xcc\xd6\x1b"
+ "\x2d\x3d\xa0\x3b\xd8\xc9\x70\xa4"
+ "\x7a\x3e\x07\xb9\x06\xc3\xfa\xb0"
+ "\x33\xee\xc1\xd8\xf6\xe0\xf0\xb2"
+ "\x61\x12\x69\xb0\x5f\x28\x99\xda"
+ "\xc3\x61\x48\xfa\x07\x16\x03\xc4"
+ "\xa8\xe1\x3c\xe8\x0e\x64\x15\x30"
+ "\xc1\x9d\x84\x2f\x73\x98\x0e\x3a"
+ "\xf2\x86\x21\xa4\x9e\x1d\xb5\x86"
+ "\x16\xdb\x2b\x9a\x06\x64\x8e\x79"
+ "\x8d\x76\x3e\xc3\xc2\x64\x44\xe3"
+ "\xda\xbc\x1a\x52\xd7\x61\x03\x65"
+ "\x54\x32\x77\x01\xed\x9d\x8a\x43"
+ "\x25\x24\xe3\xc1\xbe\xb8\x2f\xcb"
+ "\x89\x14\x64\xab\xf6\xa0\x6e\x02"
+ "\x57\xe4\x7d\xa9\x4e\x9a\x03\x36"
+ "\xad\xf1\xb1\xfc\x0b\xe6\x79\x51"
+ "\x9f\x81\x77\xc4\x14\x78\x9d\xbf"
+ "\xb6\xd6\xa3\x8c\xba\x0b\x26\xe7"
+ "\xc8\xb9\x5c\xcc\xe1\x5f\xd5\xc6"
+ "\xc4\xca\xc2\xa3\x45\xba\x94\x13"
+ "\xb2\x8f\xc3\x54\x01\x09\xe7\x8b"
+ "\xda\x2a\x0a\x11\x02\x43\xcb\x57"
+ "\xc9\xcc\xb5\x5c\xab\xc4\xec\x54"
+ "\x00\x06\x34\xe1\x6e\x03\x89\x7c"
+ "\xc6\xfb\x6a\xc7\x60\x43\xd6\xc5"
+ "\xb5\x68\x72\x89\x8f\x42\xc3\x74"
+ "\xbd\x25\xaa\x9f\x67\xb5\xdf\x26"
+ "\x20\xe8\xb7\x01\x3c\xe4\x77\xce"
+ "\xc4\x65\xa7\x23\x79\xea\x33\xc7"
+ "\x82\x14\x5c\x82\xf2\x4e\x3d\xf6"
+ "\xc6\x4a\x0e\x29\xbb\xec\x44\xcd"
+ "\x2f\xd1\x4f\x21\x71\xa9\xce\x0f"
+ "\x5c\xf2\x72\x5c\x08\x2e\x21\xd2"
+ "\xc3\x29\x13\xd8\xac\xc3\xda\x13"
+ "\x1a\x9d\xa7\x71\x1d\x27\x1d\x27"
+ "\x1d\xea\xab\x44\x79\xad\xe5\xeb"
+ "\xef\x1f\x22\x0a\x44\x4f\xcb\x87"
+ "\xa7\x58\x71\x0e\x66\xf8\x60\xbf"
+ "\x60\x74\x4a\xb4\xec\x2e\xfe\xd3"
+ "\xf5\xb8\xfe\x46\x08\x50\x99\x6c"
+ "\x66\xa5\xa8\x34\x44\xb5\xe5\xf0"
+ "\xdd\x2c\x67\x4e\x35\x96\x8e\x67"
+ "\x48\x3f\x5f\x37\x44\x60\x51\x2e"
+ "\x14\x91\x5e\x57\xc3\x0e\x79\x77"
+ "\x2f\x03\xf4\xe2\x1c\x72\xbf\x85"
+ "\x5d\xd3\x17\xdf\x6c\xc5\x70\x24"
+ "\x42\xdf\x51\x4e\x2a\xb2\xd2\x5b"
+ "\x9e\x69\x83\x41\x11\xfe\x73\x22"
+ "\xde\x8a\x9e\xd8\x8a\xfb\x20\x38"
+ "\xd8\x47\x6f\xd5\xed\x8f\x41\xfd"
+ "\x13\x7a\x18\x03\x7d\x0f\xcd\x7d"
+ "\xa6\x7d\x31\x9e\xf1\x8f\x30\xa3"
+ "\x8b\x4c\x24\xb7\xf5\x48\xd7\xd9"
+ "\x12\xe7\x84\x97\x5c\x31\x6d\xfb"
+ "\xdf\xf3\xd3\xd1\xd5\x0c\x30\x06"
+ "\x01\x6a\xbc\x6c\x78\x7b\xa6\x50"
+ "\xfa\x0f\x3c\x42\x2d\xa5\xa3\x3b"
+ "\xcf\x62\x50\xff\x71\x6d\xe7\xda"
+ "\x27\xab\xc6\x67\x16\x65\x68\x64"
+ "\xc7\xd5\x5f\x81\xa9\xf6\x65\xb3"
+ "\x5e\x43\x91\x16\xcd\x3d\x55\x37"
+ "\x55\xb3\xf0\x28\xc5\x54\x19\xc0"
+ "\xe0\xd6\x2a\x61\xd4\xc8\x72\x51"
+ "\xe9\xa1\x7b\x48\x21\xad\x44\x09"
+ "\xe4\x01\x61\x3c\x8a\x5b\xf9\xa1"
+ "\x6e\x1b\xdf\xc0\x04\xa8\x8b\xf2"
+ "\x21\xbe\x34\x7b\xfc\xa1\xcd\xc9"
+ "\xa9\x96\xf4\xa4\x4c\xf7\x4e\x8f"
+ "\x84\xcc\xd3\xa8\x92\x77\x8f\x36"
+ "\xe2\x2e\x8c\x33\xe8\x84\xa6\x0c"
+ "\x6c\x8a\xda\x14\x32\xc2\x96\xff"
+ "\xc6\x4a\xc2\x9b\x30\x7f\xd1\x29"
+ "\xc0\xd5\x78\x41\x00\x80\x80\x03"
+ "\x2a\xb1\xde\x26\x03\x48\x49\xee"
+ "\x57\x14\x76\x51\x3c\x36\x5d\x0a"
+ "\x5c\x9f\xe8\xd8\x53\xdb\x4f\xd4"
+ "\x38\xbf\x66\xc9\x75\x12\x18\x75"
+ "\x34\x2d\x93\x22\x96\x51\x24\x6e"
+ "\x4e\xd9\x30\xea\x67\xff\x92\x1c"
+ "\x16\x26\xe9\xb5\x33\xab\x8c\x22"
+ "\x47\xdb\xa0\x2c\x08\xf0\x12\x69"
+ "\x7e\x93\x52\xda\xa5\xe5\xca\xc1"
+ "\x0f\x55\x2a\xbd\x09\x30\x88\x1b"
+ "\x9c\xc6\x9f\xe6\xdb\xa6\x92\xeb"
+ "\xf4\xbd\x5c\xc4\xdb\xc6\x71\x09"
+ "\xab\x5e\x48\x0c\xed\x6f\xda\x8e"
+ "\x8d\x0c\x98\x71\x7d\x10\xd0\x9c"
+ "\x20\x9b\x79\x53\x26\x5d\xb9\x85"
+ "\x8a\x31\xb8\xc5\x1c\x97\xde\x88"
+ "\x61\x55\x7f\x7c\x21\x06\xea\xc4"
+ "\x5f\xaf\xf2\xf0\xd5\x5e\x7d\xb4"
+ "\x6e\xcf\xe9\xae\x1b\x0e\x11\x80"
+ "\xc1\x9a\x74\x7e\x52\x6f\xa0\xb7"
+ "\x24\xcd\x8d\x0a\x11\x40\x63\x72"
+ "\xfa\xe2\xc5\xb3\x94\xef\x29\xa2"
+ "\x1a\x23\x43\x04\x37\x55\x0d\xe9"
+ "\x83\xb2\x29\x51\x49\x64\xa0\xbd"
+ "\xde\x73\xfd\xa5\x7c\x95\x70\x62"
+ "\x58\xdc\xe2\xd0\xbf\x98\xf5\x8a"
+ "\x6a\xfd\xce\xa8\x0e\x42\x2a\xeb"
+ "\xd2\xff\x83\x27\x53\x5c\xa0\x6e"
+ "\x93\xef\xe2\xb9\x5d\x35\xd6\x98"
+ "\xf6\x71\x19\x7a\x54\xa1\xa7\xe8"
+ "\x09\xfe\xf6\x9e\xc7\xbd\x3e\x29"
+ "\xbd\x6b\x17\xf4\xe7\x3e\x10\x5c"
+ "\xc1\xd2\x59\x4f\x4b\x12\x1a\x5b"
+ "\x50\x80\x59\xb9\xec\x13\x66\xa8"
+ "\xd2\x31\x7b\x6a\x61\x22\xdd\x7d"
+ "\x61\xee\x87\x16\x46\x9f\xf9\xc7"
+ "\x41\xee\x74\xf8\xd0\x96\x2c\x76"
+ "\x2a\xac\x7d\x6e\x9f\x0e\x7f\x95"
+ "\xfe\x50\x16\xb2\x23\xca\x62\xd5"
+ "\x68\xcf\x07\x3f\x3f\x97\x85\x2a"
+ "\x0c\x25\x45\xba\xdb\x32\xcb\x83"
+ "\x8c\x4f\xe0\x6d\x9a\x99\xf9\xc9"
+ "\xda\xd4\x19\x31\xc1\x7c\x6d\xd9"
+ "\x9c\x56\xd3\xec\xc1\x81\x4c\xed"
+ "\x28\x9d\x87\xeb\x19\xd7\x1a\x4f"
+ "\x04\x6a\xcb\x1f\xcf\x1f\xa2\x16"
+ "\xfc\x2a\x0d\xa1\x14\x2d\xfa\xc5"
+ "\x5a\xd2\xc5\xf9\x19\x7c\x20\x1f"
+ "\x2d\x10\xc0\x66\x7c\xd9\x2d\xe5"
+ "\x88\x70\x59\xa7\x85\xd5\x2e\x7c"
+ "\x5c\xe3\xb7\x12\xd6\x97\x3f\x29",
+ .psize = 2048,
+ .digest = "\x37\x90\x92\xc2\xeb\x01\x87\xd9"
+ "\x95\xc7\x91\xc3\x17\x8b\x38\x52",
+ }
+};
+
/*
* DES test vectors.
*/
@@ -33083,6 +34314,2108 @@ static struct cipher_testvec chacha20_enc_tv_template[] = {
},
};
+static struct cipher_testvec xchacha20_tv_template[] = {
+ { /* from libsodium test/default/xchacha20.c */
+ .key = "\x79\xc9\x97\x98\xac\x67\x30\x0b"
+ "\xbb\x27\x04\xc9\x5c\x34\x1e\x32"
+ "\x45\xf3\xdc\xb2\x17\x61\xb9\x8e"
+ "\x52\xff\x45\xb2\x4f\x30\x4f\xc4",
+ .klen = 32,
+ .iv = "\xb3\x3f\xfd\x30\x96\x47\x9b\xcf"
+ "\xbc\x9a\xee\x49\x41\x76\x88\xa0"
+ "\xa2\x55\x4f\x8d\x95\x38\x94\x19"
+ "\x00\x00\x00\x00\x00\x00\x00\x00",
+ .input = "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00",
+ .result = "\xc6\xe9\x75\x81\x60\x08\x3a\xc6"
+ "\x04\xef\x90\xe7\x12\xce\x6e\x75"
+ "\xd7\x79\x75\x90\x74\x4e\x0c\xf0"
+ "\x60\xf0\x13\x73\x9c",
+ .ilen = 29,
+ .rlen = 29,
+ }, { /* from libsodium test/default/xchacha20.c */
+ .key = "\x9d\x23\xbd\x41\x49\xcb\x97\x9c"
+ "\xcf\x3c\x5c\x94\xdd\x21\x7e\x98"
+ "\x08\xcb\x0e\x50\xcd\x0f\x67\x81"
+ "\x22\x35\xea\xaf\x60\x1d\x62\x32",
+ .klen = 32,
+ .iv = "\xc0\x47\x54\x82\x66\xb7\xc3\x70"
+ "\xd3\x35\x66\xa2\x42\x5c\xbf\x30"
+ "\xd8\x2d\x1e\xaf\x52\x94\x10\x9e"
+ "\x00\x00\x00\x00\x00\x00\x00\x00",
+ .input = "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00",
+ .result = "\xa2\x12\x09\x09\x65\x94\xde\x8c"
+ "\x56\x67\xb1\xd1\x3a\xd9\x3f\x74"
+ "\x41\x06\xd0\x54\xdf\x21\x0e\x47"
+ "\x82\xcd\x39\x6f\xec\x69\x2d\x35"
+ "\x15\xa2\x0b\xf3\x51\xee\xc0\x11"
+ "\xa9\x2c\x36\x78\x88\xbc\x46\x4c"
+ "\x32\xf0\x80\x7a\xcd\x6c\x20\x3a"
+ "\x24\x7e\x0d\xb8\x54\x14\x84\x68"
+ "\xe9\xf9\x6b\xee\x4c\xf7\x18\xd6"
+ "\x8d\x5f\x63\x7c\xbd\x5a\x37\x64"
+ "\x57\x78\x8e\x6f\xae\x90\xfc\x31"
+ "\x09\x7c\xfc",
+ .ilen = 91,
+ .rlen = 91,
+ }, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes
+ to nonce, and recomputed the ciphertext with libsodium */
+ .key = "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00",
+ .klen = 32,
+ .iv = "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x67\xc6\x69\x73"
+ "\x51\xff\x4a\xec\x29\xcd\xba\xab"
+ "\x00\x00\x00\x00\x00\x00\x00\x00",
+ .input = "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00",
+ .result = "\x9c\x49\x2a\xe7\x8a\x2f\x93\xc7"
+ "\xb3\x33\x6f\x82\x17\xd8\xc4\x1e"
+ "\xad\x80\x11\x11\x1d\x4c\x16\x18"
+ "\x07\x73\x9b\x4f\xdb\x7c\xcb\x47"
+ "\xfd\xef\x59\x74\xfa\x3f\xe5\x4c"
+ "\x9b\xd0\xea\xbc\xba\x56\xad\x32"
+ "\x03\xdc\xf8\x2b\xc1\xe1\x75\x67"
+ "\x23\x7b\xe6\xfc\xd4\x03\x86\x54",
+ .ilen = 64,
+ .rlen = 64,
+ }, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes
+ to nonce, and recomputed the ciphertext with libsodium */
+ .key = "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x01",
+ .klen = 32,
+ .iv = "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x02\xf2\xfb\xe3\x46"
+ "\x7c\xc2\x54\xf8\x1b\xe8\xe7\x8d"
+ "\x01\x00\x00\x00\x00\x00\x00\x00",
+ .input = "\x41\x6e\x79\x20\x73\x75\x62\x6d"
+ "\x69\x73\x73\x69\x6f\x6e\x20\x74"
+ "\x6f\x20\x74\x68\x65\x20\x49\x45"
+ "\x54\x46\x20\x69\x6e\x74\x65\x6e"
+ "\x64\x65\x64\x20\x62\x79\x20\x74"
+ "\x68\x65\x20\x43\x6f\x6e\x74\x72"
+ "\x69\x62\x75\x74\x6f\x72\x20\x66"
+ "\x6f\x72\x20\x70\x75\x62\x6c\x69"
+ "\x63\x61\x74\x69\x6f\x6e\x20\x61"
+ "\x73\x20\x61\x6c\x6c\x20\x6f\x72"
+ "\x20\x70\x61\x72\x74\x20\x6f\x66"
+ "\x20\x61\x6e\x20\x49\x45\x54\x46"
+ "\x20\x49\x6e\x74\x65\x72\x6e\x65"
+ "\x74\x2d\x44\x72\x61\x66\x74\x20"
+ "\x6f\x72\x20\x52\x46\x43\x20\x61"
+ "\x6e\x64\x20\x61\x6e\x79\x20\x73"
+ "\x74\x61\x74\x65\x6d\x65\x6e\x74"
+ "\x20\x6d\x61\x64\x65\x20\x77\x69"
+ "\x74\x68\x69\x6e\x20\x74\x68\x65"
+ "\x20\x63\x6f\x6e\x74\x65\x78\x74"
+ "\x20\x6f\x66\x20\x61\x6e\x20\x49"
+ "\x45\x54\x46\x20\x61\x63\x74\x69"
+ "\x76\x69\x74\x79\x20\x69\x73\x20"
+ "\x63\x6f\x6e\x73\x69\x64\x65\x72"
+ "\x65\x64\x20\x61\x6e\x20\x22\x49"
+ "\x45\x54\x46\x20\x43\x6f\x6e\x74"
+ "\x72\x69\x62\x75\x74\x69\x6f\x6e"
+ "\x22\x2e\x20\x53\x75\x63\x68\x20"
+ "\x73\x74\x61\x74\x65\x6d\x65\x6e"
+ "\x74\x73\x20\x69\x6e\x63\x6c\x75"
+ "\x64\x65\x20\x6f\x72\x61\x6c\x20"
+ "\x73\x74\x61\x74\x65\x6d\x65\x6e"
+ "\x74\x73\x20\x69\x6e\x20\x49\x45"
+ "\x54\x46\x20\x73\x65\x73\x73\x69"
+ "\x6f\x6e\x73\x2c\x20\x61\x73\x20"
+ "\x77\x65\x6c\x6c\x20\x61\x73\x20"
+ "\x77\x72\x69\x74\x74\x65\x6e\x20"
+ "\x61\x6e\x64\x20\x65\x6c\x65\x63"
+ "\x74\x72\x6f\x6e\x69\x63\x20\x63"
+ "\x6f\x6d\x6d\x75\x6e\x69\x63\x61"
+ "\x74\x69\x6f\x6e\x73\x20\x6d\x61"
+ "\x64\x65\x20\x61\x74\x20\x61\x6e"
+ "\x79\x20\x74\x69\x6d\x65\x20\x6f"
+ "\x72\x20\x70\x6c\x61\x63\x65\x2c"
+ "\x20\x77\x68\x69\x63\x68\x20\x61"
+ "\x72\x65\x20\x61\x64\x64\x72\x65"
+ "\x73\x73\x65\x64\x20\x74\x6f",
+ .result = "\xf9\xab\x7a\x4a\x60\xb8\x5f\xa0"
+ "\x50\xbb\x57\xce\xef\x8c\xc1\xd9"
+ "\x24\x15\xb3\x67\x5e\x7f\x01\xf6"
+ "\x1c\x22\xf6\xe5\x71\xb1\x43\x64"
+ "\x63\x05\xd5\xfc\x5c\x3d\xc0\x0e"
+ "\x23\xef\xd3\x3b\xd9\xdc\x7f\xa8"
+ "\x58\x26\xb3\xd0\xc2\xd5\x04\x3f"
+ "\x0a\x0e\x8f\x17\xe4\xcd\xf7\x2a"
+ "\xb4\x2c\x09\xe4\x47\xec\x8b\xfb"
+ "\x59\x37\x7a\xa1\xd0\x04\x7e\xaa"
+ "\xf1\x98\x5f\x24\x3d\x72\x9a\x43"
+ "\xa4\x36\x51\x92\x22\x87\xff\x26"
+ "\xce\x9d\xeb\x59\x78\x84\x5e\x74"
+ "\x97\x2e\x63\xc0\xef\x29\xf7\x8a"
+ "\xb9\xee\x35\x08\x77\x6a\x35\x9a"
+ "\x3e\xe6\x4f\x06\x03\x74\x1b\xc1"
+ "\x5b\xb3\x0b\x89\x11\x07\xd3\xb7"
+ "\x53\xd6\x25\x04\xd9\x35\xb4\x5d"
+ "\x4c\x33\x5a\xc2\x42\x4c\xe6\xa4"
+ "\x97\x6e\x0e\xd2\xb2\x8b\x2f\x7f"
+ "\x28\xe5\x9f\xac\x4b\x2e\x02\xab"
+ "\x85\xfa\xa9\x0d\x7c\x2d\x10\xe6"
+ "\x91\xab\x55\x63\xf0\xde\x3a\x94"
+ "\x25\x08\x10\x03\xc2\x68\xd1\xf4"
+ "\xaf\x7d\x9c\x99\xf7\x86\x96\x30"
+ "\x60\xfc\x0b\xe6\xa8\x80\x15\xb0"
+ "\x81\xb1\x0c\xbe\xb9\x12\x18\x25"
+ "\xe9\x0e\xb1\xe7\x23\xb2\xef\x4a"
+ "\x22\x8f\xc5\x61\x89\xd4\xe7\x0c"
+ "\x64\x36\x35\x61\xb6\x34\x60\xf7"
+ "\x7b\x61\x37\x37\x12\x10\xa2\xf6"
+ "\x7e\xdb\x7f\x39\x3f\xb6\x8e\x89"
+ "\x9e\xf3\xfe\x13\x98\xbb\x66\x5a"
+ "\xec\xea\xab\x3f\x9c\x87\xc4\x8c"
+ "\x8a\x04\x18\x49\xfc\x77\x11\x50"
+ "\x16\xe6\x71\x2b\xee\xc0\x9c\xb6"
+ "\x87\xfd\x80\xff\x0b\x1d\x73\x38"
+ "\xa4\x1d\x6f\xae\xe4\x12\xd7\x93"
+ "\x9d\xcd\x38\x26\x09\x40\x52\xcd"
+ "\x67\x01\x67\x26\xe0\x3e\x98\xa8"
+ "\xe8\x1a\x13\x41\xbb\x90\x4d\x87"
+ "\xbb\x42\x82\x39\xce\x3a\xd0\x18"
+ "\x6d\x7b\x71\x8f\xbb\x2c\x6a\xd1"
+ "\xbd\xf5\xc7\x8a\x7e\xe1\x1e\x0f"
+ "\x0d\x0d\x13\x7c\xd9\xd8\x3c\x91"
+ "\xab\xff\x1f\x12\xc3\xee\xe5\x65"
+ "\x12\x8d\x7b\x61\xe5\x1f\x98",
+ .ilen = 375,
+ .rlen = 375,
+ .also_non_np = 1,
+ .np = 3,
+ .tap = { 375 - 20, 4, 16 },
+
+ }, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes
+ to nonce, and recomputed the ciphertext with libsodium */
+ .key = "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a"
+ "\xf3\x33\x88\x86\x04\xf6\xb5\xf0"
+ "\x47\x39\x17\xc1\x40\x2b\x80\x09"
+ "\x9d\xca\x5c\xbc\x20\x70\x75\xc0",
+ .klen = 32,
+ .iv = "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x02\x76\x5a\x2e\x63"
+ "\x33\x9f\xc9\x9a\x66\x32\x0d\xb7"
+ "\x2a\x00\x00\x00\x00\x00\x00\x00",
+ .input = "\x27\x54\x77\x61\x73\x20\x62\x72"
+ "\x69\x6c\x6c\x69\x67\x2c\x20\x61"
+ "\x6e\x64\x20\x74\x68\x65\x20\x73"
+ "\x6c\x69\x74\x68\x79\x20\x74\x6f"
+ "\x76\x65\x73\x0a\x44\x69\x64\x20"
+ "\x67\x79\x72\x65\x20\x61\x6e\x64"
+ "\x20\x67\x69\x6d\x62\x6c\x65\x20"
+ "\x69\x6e\x20\x74\x68\x65\x20\x77"
+ "\x61\x62\x65\x3a\x0a\x41\x6c\x6c"
+ "\x20\x6d\x69\x6d\x73\x79\x20\x77"
+ "\x65\x72\x65\x20\x74\x68\x65\x20"
+ "\x62\x6f\x72\x6f\x67\x6f\x76\x65"
+ "\x73\x2c\x0a\x41\x6e\x64\x20\x74"
+ "\x68\x65\x20\x6d\x6f\x6d\x65\x20"
+ "\x72\x61\x74\x68\x73\x20\x6f\x75"
+ "\x74\x67\x72\x61\x62\x65\x2e",
+ .result = "\x95\xb9\x51\xe7\x8f\xb4\xa4\x03"
+ "\xca\x37\xcc\xde\x60\x1d\x8c\xe2"
+ "\xf1\xbb\x8a\x13\x7f\x61\x85\xcc"
+ "\xad\xf4\xf0\xdc\x86\xa6\x1e\x10"
+ "\xbc\x8e\xcb\x38\x2b\xa5\xc8\x8f"
+ "\xaa\x03\x3d\x53\x4a\x42\xb1\x33"
+ "\xfc\xd3\xef\xf0\x8e\x7e\x10\x9c"
+ "\x6f\x12\x5e\xd4\x96\xfe\x5b\x08"
+ "\xb6\x48\xf0\x14\x74\x51\x18\x7c"
+ "\x07\x92\xfc\xac\x9d\xf1\x94\xc0"
+ "\xc1\x9d\xc5\x19\x43\x1f\x1d\xbb"
+ "\x07\xf0\x1b\x14\x25\x45\xbb\xcb"
+ "\x5c\xe2\x8b\x28\xf3\xcf\x47\x29"
+ "\x27\x79\x67\x24\xa6\x87\xc2\x11"
+ "\x65\x03\xfa\x45\xf7\x9e\x53\x7a"
+ "\x99\xf1\x82\x25\x4f\x8d\x07",
+ .ilen = 127,
+ .rlen = 127,
+ }, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes
+ to nonce, and recomputed the ciphertext with libsodium */
+ .key = "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a"
+ "\xf3\x33\x88\x86\x04\xf6\xb5\xf0"
+ "\x47\x39\x17\xc1\x40\x2b\x80\x09"
+ "\x9d\xca\x5c\xbc\x20\x70\x75\xc0",
+ .klen = 32,
+ .iv = "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x01\x31\x58\xa3\x5a"
+ "\x25\x5d\x05\x17\x58\xe9\x5e\xd4"
+ "\x1c\x00\x00\x00\x00\x00\x00\x00",
+ .input = "\x49\xee\xe0\xdc\x24\x90\x40\xcd"
+ "\xc5\x40\x8f\x47\x05\xbc\xdd\x81"
+ "\x47\xc6\x8d\xe6\xb1\x8f\xd7\xcb"
+ "\x09\x0e\x6e\x22\x48\x1f\xbf\xb8"
+ "\x5c\xf7\x1e\x8a\xc1\x23\xf2\xd4"
+ "\x19\x4b\x01\x0f\x4e\xa4\x43\xce"
+ "\x01\xc6\x67\xda\x03\x91\x18\x90"
+ "\xa5\xa4\x8e\x45\x03\xb3\x2d\xac"
+ "\x74\x92\xd3\x53\x47\xc8\xdd\x25"
+ "\x53\x6c\x02\x03\x87\x0d\x11\x0c"
+ "\x58\xe3\x12\x18\xfd\x2a\x5b\x40"
+ "\x0c\x30\xf0\xb8\x3f\x43\xce\xae"
+ "\x65\x3a\x7d\x7c\xf4\x54\xaa\xcc"
+ "\x33\x97\xc3\x77\xba\xc5\x70\xde"
+ "\xd7\xd5\x13\xa5\x65\xc4\x5f\x0f"
+ "\x46\x1a\x0d\x97\xb5\xf3\xbb\x3c"
+ "\x84\x0f\x2b\xc5\xaa\xea\xf2\x6c"
+ "\xc9\xb5\x0c\xee\x15\xf3\x7d\xbe"
+ "\x9f\x7b\x5a\xa6\xae\x4f\x83\xb6"
+ "\x79\x49\x41\xf4\x58\x18\xcb\x86"
+ "\x7f\x30\x0e\xf8\x7d\x44\x36\xea"
+ "\x75\xeb\x88\x84\x40\x3c\xad\x4f"
+ "\x6f\x31\x6b\xaa\x5d\xe5\xa5\xc5"
+ "\x21\x66\xe9\xa7\xe3\xb2\x15\x88"
+ "\x78\xf6\x79\xa1\x59\x47\x12\x4e"
+ "\x9f\x9f\x64\x1a\xa0\x22\x5b\x08"
+ "\xbe\x7c\x36\xc2\x2b\x66\x33\x1b"
+ "\xdd\x60\x71\xf7\x47\x8c\x61\xc3"
+ "\xda\x8a\x78\x1e\x16\xfa\x1e\x86"
+ "\x81\xa6\x17\x2a\xa7\xb5\xc2\xe7"
+ "\xa4\xc7\x42\xf1\xcf\x6a\xca\xb4"
+ "\x45\xcf\xf3\x93\xf0\xe7\xea\xf6"
+ "\xf4\xe6\x33\x43\x84\x93\xa5\x67"
+ "\x9b\x16\x58\x58\x80\x0f\x2b\x5c"
+ "\x24\x74\x75\x7f\x95\x81\xb7\x30"
+ "\x7a\x33\xa7\xf7\x94\x87\x32\x27"
+ "\x10\x5d\x14\x4c\x43\x29\xdd\x26"
+ "\xbd\x3e\x3c\x0e\xfe\x0e\xa5\x10"
+ "\xea\x6b\x64\xfd\x73\xc6\xed\xec"
+ "\xa8\xc9\xbf\xb3\xba\x0b\x4d\x07"
+ "\x70\xfc\x16\xfd\x79\x1e\xd7\xc5"
+ "\x49\x4e\x1c\x8b\x8d\x79\x1b\xb1"
+ "\xec\xca\x60\x09\x4c\x6a\xd5\x09"
+ "\x49\x46\x00\x88\x22\x8d\xce\xea"
+ "\xb1\x17\x11\xde\x42\xd2\x23\xc1"
+ "\x72\x11\xf5\x50\x73\x04\x40\x47"
+ "\xf9\x5d\xe7\xa7\x26\xb1\x7e\xb0"
+ "\x3f\x58\xc1\x52\xab\x12\x67\x9d"
+ "\x3f\x43\x4b\x68\xd4\x9c\x68\x38"
+ "\x07\x8a\x2d\x3e\xf3\xaf\x6a\x4b"
+ "\xf9\xe5\x31\x69\x22\xf9\xa6\x69"
+ "\xc6\x9c\x96\x9a\x12\x35\x95\x1d"
+ "\x95\xd5\xdd\xbe\xbf\x93\x53\x24"
+ "\xfd\xeb\xc2\x0a\x64\xb0\x77\x00"
+ "\x6f\x88\xc4\x37\x18\x69\x7c\xd7"
+ "\x41\x92\x55\x4c\x03\xa1\x9a\x4b"
+ "\x15\xe5\xdf\x7f\x37\x33\x72\xc1"
+ "\x8b\x10\x67\xa3\x01\x57\x94\x25"
+ "\x7b\x38\x71\x7e\xdd\x1e\xcc\x73"
+ "\x55\xd2\x8e\xeb\x07\xdd\xf1\xda"
+ "\x58\xb1\x47\x90\xfe\x42\x21\x72"
+ "\xa3\x54\x7a\xa0\x40\xec\x9f\xdd"
+ "\xc6\x84\x6e\xca\xae\xe3\x68\xb4"
+ "\x9d\xe4\x78\xff\x57\xf2\xf8\x1b"
+ "\x03\xa1\x31\xd9\xde\x8d\xf5\x22"
+ "\x9c\xdd\x20\xa4\x1e\x27\xb1\x76"
+ "\x4f\x44\x55\xe2\x9b\xa1\x9c\xfe"
+ "\x54\xf7\x27\x1b\xf4\xde\x02\xf5"
+ "\x1b\x55\x48\x5c\xdc\x21\x4b\x9e"
+ "\x4b\x6e\xed\x46\x23\xdc\x65\xb2"
+ "\xcf\x79\x5f\x28\xe0\x9e\x8b\xe7"
+ "\x4c\x9d\x8a\xff\xc1\xa6\x28\xb8"
+ "\x65\x69\x8a\x45\x29\xef\x74\x85"
+ "\xde\x79\xc7\x08\xae\x30\xb0\xf4"
+ "\xa3\x1d\x51\x41\xab\xce\xcb\xf6"
+ "\xb5\xd8\x6d\xe0\x85\xe1\x98\xb3"
+ "\x43\xbb\x86\x83\x0a\xa0\xf5\xb7"
+ "\x04\x0b\xfa\x71\x1f\xb0\xf6\xd9"
+ "\x13\x00\x15\xf0\xc7\xeb\x0d\x5a"
+ "\x9f\xd7\xb9\x6c\x65\x14\x22\x45"
+ "\x6e\x45\x32\x3e\x7e\x60\x1a\x12"
+ "\x97\x82\x14\xfb\xaa\x04\x22\xfa"
+ "\xa0\xe5\x7e\x8c\x78\x02\x48\x5d"
+ "\x78\x33\x5a\x7c\xad\xdb\x29\xce"
+ "\xbb\x8b\x61\xa4\xb7\x42\xe2\xac"
+ "\x8b\x1a\xd9\x2f\x0b\x8b\x62\x21"
+ "\x83\x35\x7e\xad\x73\xc2\xb5\x6c"
+ "\x10\x26\x38\x07\xe5\xc7\x36\x80"
+ "\xe2\x23\x12\x61\xf5\x48\x4b\x2b"
+ "\xc5\xdf\x15\xd9\x87\x01\xaa\xac"
+ "\x1e\x7c\xad\x73\x78\x18\x63\xe0"
+ "\x8b\x9f\x81\xd8\x12\x6a\x28\x10"
+ "\xbe\x04\x68\x8a\x09\x7c\x1b\x1c"
+ "\x83\x66\x80\x47\x80\xe8\xfd\x35"
+ "\x1c\x97\x6f\xae\x49\x10\x66\xcc"
+ "\xc6\xd8\xcc\x3a\x84\x91\x20\x77"
+ "\x72\xe4\x24\xd2\x37\x9f\xc5\xc9"
+ "\x25\x94\x10\x5f\x40\x00\x64\x99"
+ "\xdc\xae\xd7\x21\x09\x78\x50\x15"
+ "\xac\x5f\xc6\x2c\xa2\x0b\xa9\x39"
+ "\x87\x6e\x6d\xab\xde\x08\x51\x16"
+ "\xc7\x13\xe9\xea\xed\x06\x8e\x2c"
+ "\xf8\x37\x8c\xf0\xa6\x96\x8d\x43"
+ "\xb6\x98\x37\xb2\x43\xed\xde\xdf"
+ "\x89\x1a\xe7\xeb\x9d\xa1\x7b\x0b"
+ "\x77\xb0\xe2\x75\xc0\xf1\x98\xd9"
+ "\x80\x55\xc9\x34\x91\xd1\x59\xe8"
+ "\x4b\x0f\xc1\xa9\x4b\x7a\x84\x06"
+ "\x20\xa8\x5d\xfa\xd1\xde\x70\x56"
+ "\x2f\x9e\x91\x9c\x20\xb3\x24\xd8"
+ "\x84\x3d\xe1\x8c\x7e\x62\x52\xe5"
+ "\x44\x4b\x9f\xc2\x93\x03\xea\x2b"
+ "\x59\xc5\xfa\x3f\x91\x2b\xbb\x23"
+ "\xf5\xb2\x7b\xf5\x38\xaf\xb3\xee"
+ "\x63\xdc\x7b\xd1\xff\xaa\x8b\xab"
+ "\x82\x6b\x37\x04\xeb\x74\xbe\x79"
+ "\xb9\x83\x90\xef\x20\x59\x46\xff"
+ "\xe9\x97\x3e\x2f\xee\xb6\x64\x18"
+ "\x38\x4c\x7a\x4a\xf9\x61\xe8\x9a"
+ "\xa1\xb5\x01\xa6\x47\xd3\x11\xd4"
+ "\xce\xd3\x91\x49\x88\xc7\xb8\x4d"
+ "\xb1\xb9\x07\x6d\x16\x72\xae\x46"
+ "\x5e\x03\xa1\x4b\xb6\x02\x30\xa8"
+ "\x3d\xa9\x07\x2a\x7c\x19\xe7\x62"
+ "\x87\xe3\x82\x2f\x6f\xe1\x09\xd9"
+ "\x94\x97\xea\xdd\x58\x9e\xae\x76"
+ "\x7e\x35\xe5\xb4\xda\x7e\xf4\xde"
+ "\xf7\x32\x87\xcd\x93\xbf\x11\x56"
+ "\x11\xbe\x08\x74\xe1\x69\xad\xe2"
+ "\xd7\xf8\x86\x75\x8a\x3c\xa4\xbe"
+ "\x70\xa7\x1b\xfc\x0b\x44\x2a\x76"
+ "\x35\xea\x5d\x85\x81\xaf\x85\xeb"
+ "\xa0\x1c\x61\xc2\xf7\x4f\xa5\xdc"
+ "\x02\x7f\xf6\x95\x40\x6e\x8a\x9a"
+ "\xf3\x5d\x25\x6e\x14\x3a\x22\xc9"
+ "\x37\x1c\xeb\x46\x54\x3f\xa5\x91"
+ "\xc2\xb5\x8c\xfe\x53\x08\x97\x32"
+ "\x1b\xb2\x30\x27\xfe\x25\x5d\xdc"
+ "\x08\x87\xd0\xe5\x94\x1a\xd4\xf1"
+ "\xfe\xd6\xb4\xa3\xe6\x74\x81\x3c"
+ "\x1b\xb7\x31\xa7\x22\xfd\xd4\xdd"
+ "\x20\x4e\x7c\x51\xb0\x60\x73\xb8"
+ "\x9c\xac\x91\x90\x7e\x01\xb0\xe1"
+ "\x8a\x2f\x75\x1c\x53\x2a\x98\x2a"
+ "\x06\x52\x95\x52\xb2\xe9\x25\x2e"
+ "\x4c\xe2\x5a\x00\xb2\x13\x81\x03"
+ "\x77\x66\x0d\xa5\x99\xda\x4e\x8c"
+ "\xac\xf3\x13\x53\x27\x45\xaf\x64"
+ "\x46\xdc\xea\x23\xda\x97\xd1\xab"
+ "\x7d\x6c\x30\x96\x1f\xbc\x06\x34"
+ "\x18\x0b\x5e\x21\x35\x11\x8d\x4c"
+ "\xe0\x2d\xe9\x50\x16\x74\x81\xa8"
+ "\xb4\x34\xb9\x72\x42\xa6\xcc\xbc"
+ "\xca\x34\x83\x27\x10\x5b\x68\x45"
+ "\x8f\x52\x22\x0c\x55\x3d\x29\x7c"
+ "\xe3\xc0\x66\x05\x42\x91\x5f\x58"
+ "\xfe\x4a\x62\xd9\x8c\xa9\x04\x19"
+ "\x04\xa9\x08\x4b\x57\xfc\x67\x53"
+ "\x08\x7c\xbc\x66\x8a\xb0\xb6\x9f"
+ "\x92\xd6\x41\x7c\x5b\x2a\x00\x79"
+ "\x72",
+ .result = "\x3a\x92\xee\x53\x31\xaf\x2b\x60"
+ "\x5f\x55\x8d\x00\x5d\xfc\x74\x97"
+ "\x28\x54\xf4\xa5\x75\xf1\x9b\x25"
+ "\x62\x1c\xc0\xe0\x13\xc8\x87\x53"
+ "\xd0\xf3\xa7\x97\x1f\x3b\x1e\xea"
+ "\xe0\xe5\x2a\xd1\xdd\xa4\x3b\x50"
+ "\x45\xa3\x0d\x7e\x1b\xc9\xa0\xad"
+ "\xb9\x2c\x54\xa6\xc7\x55\x16\xd0"
+ "\xc5\x2e\x02\x44\x35\xd0\x7e\x67"
+ "\xf2\xc4\x9b\xcd\x95\x10\xcc\x29"
+ "\x4b\xfa\x86\x87\xbe\x40\x36\xbe"
+ "\xe1\xa3\x52\x89\x55\x20\x9b\xc2"
+ "\xab\xf2\x31\x34\x16\xad\xc8\x17"
+ "\x65\x24\xc0\xff\x12\x37\xfe\x5a"
+ "\x62\x3b\x59\x47\x6c\x5f\x3a\x8e"
+ "\x3b\xd9\x30\xc8\x7f\x2f\x88\xda"
+ "\x80\xfd\x02\xda\x7f\x9a\x7a\x73"
+ "\x59\xc5\x34\x09\x9a\x11\xcb\xa7"
+ "\xfc\xf6\xa1\xa0\x60\xfb\x43\xbb"
+ "\xf1\xe9\xd7\xc6\x79\x27\x4e\xff"
+ "\x22\xb4\x24\xbf\x76\xee\x47\xb9"
+ "\x6d\x3f\x8b\xb0\x9c\x3c\x43\xdd"
+ "\xff\x25\x2e\x6d\xa4\x2b\xfb\x5d"
+ "\x1b\x97\x6c\x55\x0a\x82\x7a\x7b"
+ "\x94\x34\xc2\xdb\x2f\x1f\xc1\xea"
+ "\xd4\x4d\x17\x46\x3b\x51\x69\x09"
+ "\xe4\x99\x32\x25\xfd\x94\xaf\xfb"
+ "\x10\xf7\x4f\xdd\x0b\x3c\x8b\x41"
+ "\xb3\x6a\xb7\xd1\x33\xa8\x0c\x2f"
+ "\x62\x4c\x72\x11\xd7\x74\xe1\x3b"
+ "\x38\x43\x66\x7b\x6c\x36\x48\xe7"
+ "\xe3\xe7\x9d\xb9\x42\x73\x7a\x2a"
+ "\x89\x20\x1a\x41\x80\x03\xf7\x8f"
+ "\x61\x78\x13\xbf\xfe\x50\xf5\x04"
+ "\x52\xf9\xac\x47\xf8\x62\x4b\xb2"
+ "\x24\xa9\xbf\x64\xb0\x18\x69\xd2"
+ "\xf5\xe4\xce\xc8\xb1\x87\x75\xd6"
+ "\x2c\x24\x79\x00\x7d\x26\xfb\x44"
+ "\xe7\x45\x7a\xee\x58\xa5\x83\xc1"
+ "\xb4\x24\xab\x23\x2f\x4d\xd7\x4f"
+ "\x1c\xc7\xaa\xa9\x50\xf4\xa3\x07"
+ "\x12\x13\x89\x74\xdc\x31\x6a\xb2"
+ "\xf5\x0f\x13\x8b\xb9\xdb\x85\x1f"
+ "\xf5\xbc\x88\xd9\x95\xea\x31\x6c"
+ "\x36\x60\xb6\x49\xdc\xc4\xf7\x55"
+ "\x3f\x21\xc1\xb5\x92\x18\x5e\xbc"
+ "\x9f\x87\x7f\xe7\x79\x25\x40\x33"
+ "\xd6\xb9\x33\xd5\x50\xb3\xc7\x89"
+ "\x1b\x12\xa0\x46\xdd\xa7\xd8\x3e"
+ "\x71\xeb\x6f\x66\xa1\x26\x0c\x67"
+ "\xab\xb2\x38\x58\x17\xd8\x44\x3b"
+ "\x16\xf0\x8e\x62\x8d\x16\x10\x00"
+ "\x32\x8b\xef\xb9\x28\xd3\xc5\xad"
+ "\x0a\x19\xa2\xe4\x03\x27\x7d\x94"
+ "\x06\x18\xcd\xd6\x27\x00\xf9\x1f"
+ "\xb6\xb3\xfe\x96\x35\x5f\xc4\x1c"
+ "\x07\x62\x10\x79\x68\x50\xf1\x7e"
+ "\x29\xe7\xc4\xc4\xe7\xee\x54\xd6"
+ "\x58\x76\x84\x6d\x8d\xe4\x59\x31"
+ "\xe9\xf4\xdc\xa1\x1f\xe5\x1a\xd6"
+ "\xe6\x64\x46\xf5\x77\x9c\x60\x7a"
+ "\x5e\x62\xe3\x0a\xd4\x9f\x7a\x2d"
+ "\x7a\xa5\x0a\x7b\x29\x86\x7a\x74"
+ "\x74\x71\x6b\xca\x7d\x1d\xaa\xba"
+ "\x39\x84\x43\x76\x35\xfe\x4f\x9b"
+ "\xbb\xbb\xb5\x6a\x32\xb5\x5d\x41"
+ "\x51\xf0\x5b\x68\x03\x47\x4b\x8a"
+ "\xca\x88\xf6\x37\xbd\x73\x51\x70"
+ "\x66\xfe\x9e\x5f\x21\x9c\xf3\xdd"
+ "\xc3\xea\x27\xf9\x64\x94\xe1\x19"
+ "\xa0\xa9\xab\x60\xe0\x0e\xf7\x78"
+ "\x70\x86\xeb\xe0\xd1\x5c\x05\xd3"
+ "\xd7\xca\xe0\xc0\x47\x47\x34\xee"
+ "\x11\xa3\xa3\x54\x98\xb7\x49\x8e"
+ "\x84\x28\x70\x2c\x9e\xfb\x55\x54"
+ "\x4d\xf8\x86\xf7\x85\x7c\xbd\xf3"
+ "\x17\xd8\x47\xcb\xac\xf4\x20\x85"
+ "\x34\x66\xad\x37\x2d\x5e\x52\xda"
+ "\x8a\xfe\x98\x55\x30\xe7\x2d\x2b"
+ "\x19\x10\x8e\x7b\x66\x5e\xdc\xe0"
+ "\x45\x1f\x7b\xb4\x08\xfb\x8f\xf6"
+ "\x8c\x89\x21\x34\x55\x27\xb2\x76"
+ "\xb2\x07\xd9\xd6\x68\x9b\xea\x6b"
+ "\x2d\xb4\xc4\x35\xdd\xd2\x79\xae"
+ "\xc7\xd6\x26\x7f\x12\x01\x8c\xa7"
+ "\xe3\xdb\xa8\xf4\xf7\x2b\xec\x99"
+ "\x11\x00\xf1\x35\x8c\xcf\xd5\xc9"
+ "\xbd\x91\x36\x39\x70\xcf\x7d\x70"
+ "\x47\x1a\xfc\x6b\x56\xe0\x3f\x9c"
+ "\x60\x49\x01\x72\xa9\xaf\x2c\x9c"
+ "\xe8\xab\xda\x8c\x14\x19\xf3\x75"
+ "\x07\x17\x9d\x44\x67\x7a\x2e\xef"
+ "\xb7\x83\x35\x4a\xd1\x3d\x1c\x84"
+ "\x32\xdd\xaa\xea\xca\x1d\xdc\x72"
+ "\x2c\xcc\x43\xcd\x5d\xe3\x21\xa4"
+ "\xd0\x8a\x4b\x20\x12\xa3\xd5\x86"
+ "\x76\x96\xff\x5f\x04\x57\x0f\xe6"
+ "\xba\xe8\x76\x50\x0c\x64\x1d\x83"
+ "\x9c\x9b\x9a\x9a\x58\x97\x9c\x5c"
+ "\xb4\xa4\xa6\x3e\x19\xeb\x8f\x5a"
+ "\x61\xb2\x03\x7b\x35\x19\xbe\xa7"
+ "\x63\x0c\xfd\xdd\xf9\x90\x6c\x08"
+ "\x19\x11\xd3\x65\x4a\xf5\x96\x92"
+ "\x59\xaa\x9c\x61\x0c\x29\xa7\xf8"
+ "\x14\x39\x37\xbf\x3c\xf2\x16\x72"
+ "\x02\xfa\xa2\xf3\x18\x67\x5d\xcb"
+ "\xdc\x4d\xbb\x96\xff\x70\x08\x2d"
+ "\xc2\xa8\x52\xe1\x34\x5f\x72\xfe"
+ "\x64\xbf\xca\xa7\x74\x38\xfb\x74"
+ "\x55\x9c\xfa\x8a\xed\xfb\x98\xeb"
+ "\x58\x2e\x6c\xe1\x52\x76\x86\xd7"
+ "\xcf\xa1\xa4\xfc\xb2\x47\x41\x28"
+ "\xa3\xc1\xe5\xfd\x53\x19\x28\x2b"
+ "\x37\x04\x65\x96\x99\x7a\x28\x0f"
+ "\x07\x68\x4b\xc7\x52\x0a\x55\x35"
+ "\x40\x19\x95\x61\xe8\x59\x40\x1f"
+ "\x9d\xbf\x78\x7d\x8f\x84\xff\x6f"
+ "\xd0\xd5\x63\xd2\x22\xbd\xc8\x4e"
+ "\xfb\xe7\x9f\x06\xe6\xe7\x39\x6d"
+ "\x6a\x96\x9f\xf0\x74\x7e\xc9\x35"
+ "\xb7\x26\xb8\x1c\x0a\xa6\x27\x2c"
+ "\xa2\x2b\xfe\xbe\x0f\x07\x73\xae"
+ "\x7f\x7f\x54\xf5\x7c\x6a\x0a\x56"
+ "\x49\xd4\x81\xe5\x85\x53\x99\x1f"
+ "\x95\x05\x13\x58\x8d\x0e\x1b\x90"
+ "\xc3\x75\x48\x64\x58\x98\x67\x84"
+ "\xae\xe2\x21\xa2\x8a\x04\x0a\x0b"
+ "\x61\xaa\xb0\xd4\x28\x60\x7a\xf8"
+ "\xbc\x52\xfb\x24\x7f\xed\x0d\x2a"
+ "\x0a\xb2\xf9\xc6\x95\xb5\x11\xc9"
+ "\xf4\x0f\x26\x11\xcf\x2a\x57\x87"
+ "\x7a\xf3\xe7\x94\x65\xc2\xb5\xb3"
+ "\xab\x98\xe3\xc1\x2b\x59\x19\x7c"
+ "\xd6\xf3\xf9\xbf\xff\x6d\xc6\x82"
+ "\x13\x2f\x4a\x2e\xcd\x26\xfe\x2d"
+ "\x01\x70\xf4\xc2\x7f\x1f\x4c\xcb"
+ "\x47\x77\x0c\xa0\xa3\x03\xec\xda"
+ "\xa9\xbf\x0d\x2d\xae\xe4\xb8\x7b"
+ "\xa9\xbc\x08\xb4\x68\x2e\xc5\x60"
+ "\x8d\x87\x41\x2b\x0f\x69\xf0\xaf"
+ "\x5f\xba\x72\x20\x0f\x33\xcd\x6d"
+ "\x36\x7d\x7b\xd5\x05\xf1\x4b\x05"
+ "\xc4\xfc\x7f\x80\xb9\x4d\xbd\xf7"
+ "\x7c\x84\x07\x01\xc2\x40\x66\x5b"
+ "\x98\xc7\x2c\xe3\x97\xfa\xdf\x87"
+ "\xa0\x1f\xe9\x21\x42\x0f\x3b\xeb"
+ "\x89\x1c\x3b\xca\x83\x61\x77\x68"
+ "\x84\xbb\x60\x87\x38\x2e\x25\xd5"
+ "\x9e\x04\x41\x70\xac\xda\xc0\x9c"
+ "\x9c\x69\xea\x8d\x4e\x55\x2a\x29"
+ "\xed\x05\x4b\x7b\x73\x71\x90\x59"
+ "\x4d\xc8\xd8\x44\xf0\x4c\xe1\x5e"
+ "\x84\x47\x55\xcc\x32\x3f\xe7\x97"
+ "\x42\xc6\x32\xac\x40\xe5\xa5\xc7"
+ "\x8b\xed\xdb\xf7\x83\xd6\xb1\xc2"
+ "\x52\x5e\x34\xb7\xeb\x6e\xd9\xfc"
+ "\xe5\x93\x9a\x97\x3e\xb0\xdc\xd9"
+ "\xd7\x06\x10\xb6\x1d\x80\x59\xdd"
+ "\x0d\xfe\x64\x35\xcd\x5d\xec\xf0"
+ "\xba\xd0\x34\xc9\x2d\x91\xc5\x17"
+ "\x11",
+ .ilen = 1281,
+ .rlen = 1281,
+ .also_non_np = 1,
+ .np = 3,
+ .tap = { 1200, 1, 80 },
+ },
+};
+
+/*
+ * Same as XChaCha20 test vectors above, but recomputed the ciphertext with
+ * XChaCha12, using a modified libsodium.
+ */
+static struct cipher_testvec xchacha12_tv_template[] = {
+ {
+ .key = "\x79\xc9\x97\x98\xac\x67\x30\x0b"
+ "\xbb\x27\x04\xc9\x5c\x34\x1e\x32"
+ "\x45\xf3\xdc\xb2\x17\x61\xb9\x8e"
+ "\x52\xff\x45\xb2\x4f\x30\x4f\xc4",
+ .klen = 32,
+ .iv = "\xb3\x3f\xfd\x30\x96\x47\x9b\xcf"
+ "\xbc\x9a\xee\x49\x41\x76\x88\xa0"
+ "\xa2\x55\x4f\x8d\x95\x38\x94\x19"
+ "\x00\x00\x00\x00\x00\x00\x00\x00",
+ .input = "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00",
+ .result = "\x1b\x78\x7f\xd7\xa1\x41\x68\xab"
+ "\x3d\x3f\xd1\x7b\x69\x56\xb2\xd5"
+ "\x43\xce\xeb\xaf\x36\xf0\x29\x9d"
+ "\x3a\xfb\x18\xae\x1b",
+ .ilen = 29,
+ .rlen = 29,
+ }, {
+ .key = "\x9d\x23\xbd\x41\x49\xcb\x97\x9c"
+ "\xcf\x3c\x5c\x94\xdd\x21\x7e\x98"
+ "\x08\xcb\x0e\x50\xcd\x0f\x67\x81"
+ "\x22\x35\xea\xaf\x60\x1d\x62\x32",
+ .klen = 32,
+ .iv = "\xc0\x47\x54\x82\x66\xb7\xc3\x70"
+ "\xd3\x35\x66\xa2\x42\x5c\xbf\x30"
+ "\xd8\x2d\x1e\xaf\x52\x94\x10\x9e"
+ "\x00\x00\x00\x00\x00\x00\x00\x00",
+ .input = "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00",
+ .result = "\xfb\x32\x09\x1d\x83\x05\xae\x4c"
+ "\x13\x1f\x12\x71\xf2\xca\xb2\xeb"
+ "\x5b\x83\x14\x7d\x83\xf6\x57\x77"
+ "\x2e\x40\x1f\x92\x2c\xf9\xec\x35"
+ "\x34\x1f\x93\xdf\xfb\x30\xd7\x35"
+ "\x03\x05\x78\xc1\x20\x3b\x7a\xe3"
+ "\x62\xa3\x89\xdc\x11\x11\x45\xa8"
+ "\x82\x89\xa0\xf1\x4e\xc7\x0f\x11"
+ "\x69\xdd\x0c\x84\x2b\x89\x5c\xdc"
+ "\xf0\xde\x01\xef\xc5\x65\x79\x23"
+ "\x87\x67\xd6\x50\xd9\x8d\xd9\x92"
+ "\x54\x5b\x0e",
+ .ilen = 91,
+ .rlen = 91,
+ }, {
+ .key = "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00",
+ .klen = 32,
+ .iv = "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x67\xc6\x69\x73"
+ "\x51\xff\x4a\xec\x29\xcd\xba\xab"
+ "\x00\x00\x00\x00\x00\x00\x00\x00",
+ .input = "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00",
+ .result = "\xdf\x2d\xc6\x21\x2a\x9d\xa1\xbb"
+ "\xc2\x77\x66\x0c\x5c\x46\xef\xa7"
+ "\x79\x1b\xb9\xdf\x55\xe2\xf9\x61"
+ "\x4c\x7b\xa4\x52\x24\xaf\xa2\xda"
+ "\xd1\x8f\x8f\xa2\x9e\x53\x4d\xc4"
+ "\xb8\x55\x98\x08\x7c\x08\xd4\x18"
+ "\x67\x8f\xef\x50\xb1\x5f\xa5\x77"
+ "\x4c\x25\xe7\x86\x26\x42\xca\x44",
+ .ilen = 64,
+ .rlen = 64,
+ }, {
+ .key = "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x01",
+ .klen = 32,
+ .iv = "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x02\xf2\xfb\xe3\x46"
+ "\x7c\xc2\x54\xf8\x1b\xe8\xe7\x8d"
+ "\x01\x00\x00\x00\x00\x00\x00\x00",
+ .input = "\x41\x6e\x79\x20\x73\x75\x62\x6d"
+ "\x69\x73\x73\x69\x6f\x6e\x20\x74"
+ "\x6f\x20\x74\x68\x65\x20\x49\x45"
+ "\x54\x46\x20\x69\x6e\x74\x65\x6e"
+ "\x64\x65\x64\x20\x62\x79\x20\x74"
+ "\x68\x65\x20\x43\x6f\x6e\x74\x72"
+ "\x69\x62\x75\x74\x6f\x72\x20\x66"
+ "\x6f\x72\x20\x70\x75\x62\x6c\x69"
+ "\x63\x61\x74\x69\x6f\x6e\x20\x61"
+ "\x73\x20\x61\x6c\x6c\x20\x6f\x72"
+ "\x20\x70\x61\x72\x74\x20\x6f\x66"
+ "\x20\x61\x6e\x20\x49\x45\x54\x46"
+ "\x20\x49\x6e\x74\x65\x72\x6e\x65"
+ "\x74\x2d\x44\x72\x61\x66\x74\x20"
+ "\x6f\x72\x20\x52\x46\x43\x20\x61"
+ "\x6e\x64\x20\x61\x6e\x79\x20\x73"
+ "\x74\x61\x74\x65\x6d\x65\x6e\x74"
+ "\x20\x6d\x61\x64\x65\x20\x77\x69"
+ "\x74\x68\x69\x6e\x20\x74\x68\x65"
+ "\x20\x63\x6f\x6e\x74\x65\x78\x74"
+ "\x20\x6f\x66\x20\x61\x6e\x20\x49"
+ "\x45\x54\x46\x20\x61\x63\x74\x69"
+ "\x76\x69\x74\x79\x20\x69\x73\x20"
+ "\x63\x6f\x6e\x73\x69\x64\x65\x72"
+ "\x65\x64\x20\x61\x6e\x20\x22\x49"
+ "\x45\x54\x46\x20\x43\x6f\x6e\x74"
+ "\x72\x69\x62\x75\x74\x69\x6f\x6e"
+ "\x22\x2e\x20\x53\x75\x63\x68\x20"
+ "\x73\x74\x61\x74\x65\x6d\x65\x6e"
+ "\x74\x73\x20\x69\x6e\x63\x6c\x75"
+ "\x64\x65\x20\x6f\x72\x61\x6c\x20"
+ "\x73\x74\x61\x74\x65\x6d\x65\x6e"
+ "\x74\x73\x20\x69\x6e\x20\x49\x45"
+ "\x54\x46\x20\x73\x65\x73\x73\x69"
+ "\x6f\x6e\x73\x2c\x20\x61\x73\x20"
+ "\x77\x65\x6c\x6c\x20\x61\x73\x20"
+ "\x77\x72\x69\x74\x74\x65\x6e\x20"
+ "\x61\x6e\x64\x20\x65\x6c\x65\x63"
+ "\x74\x72\x6f\x6e\x69\x63\x20\x63"
+ "\x6f\x6d\x6d\x75\x6e\x69\x63\x61"
+ "\x74\x69\x6f\x6e\x73\x20\x6d\x61"
+ "\x64\x65\x20\x61\x74\x20\x61\x6e"
+ "\x79\x20\x74\x69\x6d\x65\x20\x6f"
+ "\x72\x20\x70\x6c\x61\x63\x65\x2c"
+ "\x20\x77\x68\x69\x63\x68\x20\x61"
+ "\x72\x65\x20\x61\x64\x64\x72\x65"
+ "\x73\x73\x65\x64\x20\x74\x6f",
+ .result = "\xe4\xa6\xc8\x30\xc4\x23\x13\xd6"
+ "\x08\x4d\xc9\xb7\xa5\x64\x7c\xb9"
+ "\x71\xe2\xab\x3e\xa8\x30\x8a\x1c"
+ "\x4a\x94\x6d\x9b\xe0\xb3\x6f\xf1"
+ "\xdc\xe3\x1b\xb3\xa9\x6d\x0d\xd6"
+ "\xd0\xca\x12\xef\xe7\x5f\xd8\x61"
+ "\x3c\x82\xd3\x99\x86\x3c\x6f\x66"
+ "\x02\x06\xdc\x55\xf9\xed\xdf\x38"
+ "\xb4\xa6\x17\x00\x7f\xef\xbf\x4f"
+ "\xf8\x36\xf1\x60\x7e\x47\xaf\xdb"
+ "\x55\x9b\x12\xcb\x56\x44\xa7\x1f"
+ "\xd3\x1a\x07\x3b\x00\xec\xe6\x4c"
+ "\xa2\x43\x27\xdf\x86\x19\x4f\x16"
+ "\xed\xf9\x4a\xf3\x63\x6f\xfa\x7f"
+ "\x78\x11\xf6\x7d\x97\x6f\xec\x6f"
+ "\x85\x0f\x5c\x36\x13\x8d\x87\xe0"
+ "\x80\xb1\x69\x0b\x98\x89\x9c\x4e"
+ "\xf8\xdd\xee\x5c\x0a\x85\xce\xd4"
+ "\xea\x1b\x48\xbe\x08\xf8\xe2\xa8"
+ "\xa5\xb0\x3c\x79\xb1\x15\xb4\xb9"
+ "\x75\x10\x95\x35\x81\x7e\x26\xe6"
+ "\x78\xa4\x88\xcf\xdb\x91\x34\x18"
+ "\xad\xd7\x8e\x07\x7d\xab\x39\xf9"
+ "\xa3\x9e\xa5\x1d\xbb\xed\x61\xfd"
+ "\xdc\xb7\x5a\x27\xfc\xb5\xc9\x10"
+ "\xa8\xcc\x52\x7f\x14\x76\x90\xe7"
+ "\x1b\x29\x60\x74\xc0\x98\x77\xbb"
+ "\xe0\x54\xbb\x27\x49\x59\x1e\x62"
+ "\x3d\xaf\x74\x06\xa4\x42\x6f\xc6"
+ "\x52\x97\xc4\x1d\xc4\x9f\xe2\xe5"
+ "\x38\x57\x91\xd1\xa2\x28\xcc\x40"
+ "\xcc\x70\x59\x37\xfc\x9f\x4b\xda"
+ "\xa0\xeb\x97\x9a\x7d\xed\x14\x5c"
+ "\x9c\xb7\x93\x26\x41\xa8\x66\xdd"
+ "\x87\x6a\xc0\xd3\xc2\xa9\x3e\xae"
+ "\xe9\x72\xfe\xd1\xb3\xac\x38\xea"
+ "\x4d\x15\xa9\xd5\x36\x61\xe9\x96"
+ "\x6c\x23\xf8\x43\xe4\x92\x29\xd9"
+ "\x8b\x78\xf7\x0a\x52\xe0\x19\x5b"
+ "\x59\x69\x5b\x5d\xa1\x53\xc4\x68"
+ "\xe1\xbb\xac\x89\x14\xe2\xe2\x85"
+ "\x41\x18\xf5\xb3\xd1\xfa\x68\x19"
+ "\x44\x78\xdc\xcf\xe7\x88\x2d\x52"
+ "\x5f\x40\xb5\x7e\xf8\x88\xa2\xae"
+ "\x4a\xb2\x07\x35\x9d\x9b\x07\x88"
+ "\xb7\x00\xd0\x0c\xb6\xa0\x47\x59"
+ "\xda\x4e\xc9\xab\x9b\x8a\x7b",
+
+ .ilen = 375,
+ .rlen = 375,
+ .also_non_np = 1,
+ .np = 3,
+ .tap = { 375 - 20, 4, 16 },
+
+ }, {
+ .key = "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a"
+ "\xf3\x33\x88\x86\x04\xf6\xb5\xf0"
+ "\x47\x39\x17\xc1\x40\x2b\x80\x09"
+ "\x9d\xca\x5c\xbc\x20\x70\x75\xc0",
+ .klen = 32,
+ .iv = "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x02\x76\x5a\x2e\x63"
+ "\x33\x9f\xc9\x9a\x66\x32\x0d\xb7"
+ "\x2a\x00\x00\x00\x00\x00\x00\x00",
+ .input = "\x27\x54\x77\x61\x73\x20\x62\x72"
+ "\x69\x6c\x6c\x69\x67\x2c\x20\x61"
+ "\x6e\x64\x20\x74\x68\x65\x20\x73"
+ "\x6c\x69\x74\x68\x79\x20\x74\x6f"
+ "\x76\x65\x73\x0a\x44\x69\x64\x20"
+ "\x67\x79\x72\x65\x20\x61\x6e\x64"
+ "\x20\x67\x69\x6d\x62\x6c\x65\x20"
+ "\x69\x6e\x20\x74\x68\x65\x20\x77"
+ "\x61\x62\x65\x3a\x0a\x41\x6c\x6c"
+ "\x20\x6d\x69\x6d\x73\x79\x20\x77"
+ "\x65\x72\x65\x20\x74\x68\x65\x20"
+ "\x62\x6f\x72\x6f\x67\x6f\x76\x65"
+ "\x73\x2c\x0a\x41\x6e\x64\x20\x74"
+ "\x68\x65\x20\x6d\x6f\x6d\x65\x20"
+ "\x72\x61\x74\x68\x73\x20\x6f\x75"
+ "\x74\x67\x72\x61\x62\x65\x2e",
+ .result = "\xb9\x68\xbc\x6a\x24\xbc\xcc\xd8"
+ "\x9b\x2a\x8d\x5b\x96\xaf\x56\xe3"
+ "\x11\x61\xe7\xa7\x9b\xce\x4e\x7d"
+ "\x60\x02\x48\xac\xeb\xd5\x3a\x26"
+ "\x9d\x77\x3b\xb5\x32\x13\x86\x8e"
+ "\x20\x82\x26\x72\xae\x64\x1b\x7e"
+ "\x2e\x01\x68\xb4\x87\x45\xa1\x24"
+ "\xe4\x48\x40\xf0\xaa\xac\xee\xa9"
+ "\xfc\x31\xad\x9d\x89\xa3\xbb\xd2"
+ "\xe4\x25\x13\xad\x0f\x5e\xdf\x3c"
+ "\x27\xab\xb8\x62\x46\x22\x30\x48"
+ "\x55\x2c\x4e\x84\x78\x1d\x0d\x34"
+ "\x8d\x3c\x91\x0a\x7f\x5b\x19\x9f"
+ "\x97\x05\x4c\xa7\x62\x47\x8b\xc5"
+ "\x44\x2e\x20\x33\xdd\xa0\x82\xa9"
+ "\x25\x76\x37\xe6\x3c\x67\x5b",
+ .ilen = 127,
+ .rlen = 127,
+ }, {
+ .key = "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a"
+ "\xf3\x33\x88\x86\x04\xf6\xb5\xf0"
+ "\x47\x39\x17\xc1\x40\x2b\x80\x09"
+ "\x9d\xca\x5c\xbc\x20\x70\x75\xc0",
+ .klen = 32,
+ .iv = "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x01\x31\x58\xa3\x5a"
+ "\x25\x5d\x05\x17\x58\xe9\x5e\xd4"
+ "\x1c\x00\x00\x00\x00\x00\x00\x00",
+ .input = "\x49\xee\xe0\xdc\x24\x90\x40\xcd"
+ "\xc5\x40\x8f\x47\x05\xbc\xdd\x81"
+ "\x47\xc6\x8d\xe6\xb1\x8f\xd7\xcb"
+ "\x09\x0e\x6e\x22\x48\x1f\xbf\xb8"
+ "\x5c\xf7\x1e\x8a\xc1\x23\xf2\xd4"
+ "\x19\x4b\x01\x0f\x4e\xa4\x43\xce"
+ "\x01\xc6\x67\xda\x03\x91\x18\x90"
+ "\xa5\xa4\x8e\x45\x03\xb3\x2d\xac"
+ "\x74\x92\xd3\x53\x47\xc8\xdd\x25"
+ "\x53\x6c\x02\x03\x87\x0d\x11\x0c"
+ "\x58\xe3\x12\x18\xfd\x2a\x5b\x40"
+ "\x0c\x30\xf0\xb8\x3f\x43\xce\xae"
+ "\x65\x3a\x7d\x7c\xf4\x54\xaa\xcc"
+ "\x33\x97\xc3\x77\xba\xc5\x70\xde"
+ "\xd7\xd5\x13\xa5\x65\xc4\x5f\x0f"
+ "\x46\x1a\x0d\x97\xb5\xf3\xbb\x3c"
+ "\x84\x0f\x2b\xc5\xaa\xea\xf2\x6c"
+ "\xc9\xb5\x0c\xee\x15\xf3\x7d\xbe"
+ "\x9f\x7b\x5a\xa6\xae\x4f\x83\xb6"
+ "\x79\x49\x41\xf4\x58\x18\xcb\x86"
+ "\x7f\x30\x0e\xf8\x7d\x44\x36\xea"
+ "\x75\xeb\x88\x84\x40\x3c\xad\x4f"
+ "\x6f\x31\x6b\xaa\x5d\xe5\xa5\xc5"
+ "\x21\x66\xe9\xa7\xe3\xb2\x15\x88"
+ "\x78\xf6\x79\xa1\x59\x47\x12\x4e"
+ "\x9f\x9f\x64\x1a\xa0\x22\x5b\x08"
+ "\xbe\x7c\x36\xc2\x2b\x66\x33\x1b"
+ "\xdd\x60\x71\xf7\x47\x8c\x61\xc3"
+ "\xda\x8a\x78\x1e\x16\xfa\x1e\x86"
+ "\x81\xa6\x17\x2a\xa7\xb5\xc2\xe7"
+ "\xa4\xc7\x42\xf1\xcf\x6a\xca\xb4"
+ "\x45\xcf\xf3\x93\xf0\xe7\xea\xf6"
+ "\xf4\xe6\x33\x43\x84\x93\xa5\x67"
+ "\x9b\x16\x58\x58\x80\x0f\x2b\x5c"
+ "\x24\x74\x75\x7f\x95\x81\xb7\x30"
+ "\x7a\x33\xa7\xf7\x94\x87\x32\x27"
+ "\x10\x5d\x14\x4c\x43\x29\xdd\x26"
+ "\xbd\x3e\x3c\x0e\xfe\x0e\xa5\x10"
+ "\xea\x6b\x64\xfd\x73\xc6\xed\xec"
+ "\xa8\xc9\xbf\xb3\xba\x0b\x4d\x07"
+ "\x70\xfc\x16\xfd\x79\x1e\xd7\xc5"
+ "\x49\x4e\x1c\x8b\x8d\x79\x1b\xb1"
+ "\xec\xca\x60\x09\x4c\x6a\xd5\x09"
+ "\x49\x46\x00\x88\x22\x8d\xce\xea"
+ "\xb1\x17\x11\xde\x42\xd2\x23\xc1"
+ "\x72\x11\xf5\x50\x73\x04\x40\x47"
+ "\xf9\x5d\xe7\xa7\x26\xb1\x7e\xb0"
+ "\x3f\x58\xc1\x52\xab\x12\x67\x9d"
+ "\x3f\x43\x4b\x68\xd4\x9c\x68\x38"
+ "\x07\x8a\x2d\x3e\xf3\xaf\x6a\x4b"
+ "\xf9\xe5\x31\x69\x22\xf9\xa6\x69"
+ "\xc6\x9c\x96\x9a\x12\x35\x95\x1d"
+ "\x95\xd5\xdd\xbe\xbf\x93\x53\x24"
+ "\xfd\xeb\xc2\x0a\x64\xb0\x77\x00"
+ "\x6f\x88\xc4\x37\x18\x69\x7c\xd7"
+ "\x41\x92\x55\x4c\x03\xa1\x9a\x4b"
+ "\x15\xe5\xdf\x7f\x37\x33\x72\xc1"
+ "\x8b\x10\x67\xa3\x01\x57\x94\x25"
+ "\x7b\x38\x71\x7e\xdd\x1e\xcc\x73"
+ "\x55\xd2\x8e\xeb\x07\xdd\xf1\xda"
+ "\x58\xb1\x47\x90\xfe\x42\x21\x72"
+ "\xa3\x54\x7a\xa0\x40\xec\x9f\xdd"
+ "\xc6\x84\x6e\xca\xae\xe3\x68\xb4"
+ "\x9d\xe4\x78\xff\x57\xf2\xf8\x1b"
+ "\x03\xa1\x31\xd9\xde\x8d\xf5\x22"
+ "\x9c\xdd\x20\xa4\x1e\x27\xb1\x76"
+ "\x4f\x44\x55\xe2\x9b\xa1\x9c\xfe"
+ "\x54\xf7\x27\x1b\xf4\xde\x02\xf5"
+ "\x1b\x55\x48\x5c\xdc\x21\x4b\x9e"
+ "\x4b\x6e\xed\x46\x23\xdc\x65\xb2"
+ "\xcf\x79\x5f\x28\xe0\x9e\x8b\xe7"
+ "\x4c\x9d\x8a\xff\xc1\xa6\x28\xb8"
+ "\x65\x69\x8a\x45\x29\xef\x74\x85"
+ "\xde\x79\xc7\x08\xae\x30\xb0\xf4"
+ "\xa3\x1d\x51\x41\xab\xce\xcb\xf6"
+ "\xb5\xd8\x6d\xe0\x85\xe1\x98\xb3"
+ "\x43\xbb\x86\x83\x0a\xa0\xf5\xb7"
+ "\x04\x0b\xfa\x71\x1f\xb0\xf6\xd9"
+ "\x13\x00\x15\xf0\xc7\xeb\x0d\x5a"
+ "\x9f\xd7\xb9\x6c\x65\x14\x22\x45"
+ "\x6e\x45\x32\x3e\x7e\x60\x1a\x12"
+ "\x97\x82\x14\xfb\xaa\x04\x22\xfa"
+ "\xa0\xe5\x7e\x8c\x78\x02\x48\x5d"
+ "\x78\x33\x5a\x7c\xad\xdb\x29\xce"
+ "\xbb\x8b\x61\xa4\xb7\x42\xe2\xac"
+ "\x8b\x1a\xd9\x2f\x0b\x8b\x62\x21"
+ "\x83\x35\x7e\xad\x73\xc2\xb5\x6c"
+ "\x10\x26\x38\x07\xe5\xc7\x36\x80"
+ "\xe2\x23\x12\x61\xf5\x48\x4b\x2b"
+ "\xc5\xdf\x15\xd9\x87\x01\xaa\xac"
+ "\x1e\x7c\xad\x73\x78\x18\x63\xe0"
+ "\x8b\x9f\x81\xd8\x12\x6a\x28\x10"
+ "\xbe\x04\x68\x8a\x09\x7c\x1b\x1c"
+ "\x83\x66\x80\x47\x80\xe8\xfd\x35"
+ "\x1c\x97\x6f\xae\x49\x10\x66\xcc"
+ "\xc6\xd8\xcc\x3a\x84\x91\x20\x77"
+ "\x72\xe4\x24\xd2\x37\x9f\xc5\xc9"
+ "\x25\x94\x10\x5f\x40\x00\x64\x99"
+ "\xdc\xae\xd7\x21\x09\x78\x50\x15"
+ "\xac\x5f\xc6\x2c\xa2\x0b\xa9\x39"
+ "\x87\x6e\x6d\xab\xde\x08\x51\x16"
+ "\xc7\x13\xe9\xea\xed\x06\x8e\x2c"
+ "\xf8\x37\x8c\xf0\xa6\x96\x8d\x43"
+ "\xb6\x98\x37\xb2\x43\xed\xde\xdf"
+ "\x89\x1a\xe7\xeb\x9d\xa1\x7b\x0b"
+ "\x77\xb0\xe2\x75\xc0\xf1\x98\xd9"
+ "\x80\x55\xc9\x34\x91\xd1\x59\xe8"
+ "\x4b\x0f\xc1\xa9\x4b\x7a\x84\x06"
+ "\x20\xa8\x5d\xfa\xd1\xde\x70\x56"
+ "\x2f\x9e\x91\x9c\x20\xb3\x24\xd8"
+ "\x84\x3d\xe1\x8c\x7e\x62\x52\xe5"
+ "\x44\x4b\x9f\xc2\x93\x03\xea\x2b"
+ "\x59\xc5\xfa\x3f\x91\x2b\xbb\x23"
+ "\xf5\xb2\x7b\xf5\x38\xaf\xb3\xee"
+ "\x63\xdc\x7b\xd1\xff\xaa\x8b\xab"
+ "\x82\x6b\x37\x04\xeb\x74\xbe\x79"
+ "\xb9\x83\x90\xef\x20\x59\x46\xff"
+ "\xe9\x97\x3e\x2f\xee\xb6\x64\x18"
+ "\x38\x4c\x7a\x4a\xf9\x61\xe8\x9a"
+ "\xa1\xb5\x01\xa6\x47\xd3\x11\xd4"
+ "\xce\xd3\x91\x49\x88\xc7\xb8\x4d"
+ "\xb1\xb9\x07\x6d\x16\x72\xae\x46"
+ "\x5e\x03\xa1\x4b\xb6\x02\x30\xa8"
+ "\x3d\xa9\x07\x2a\x7c\x19\xe7\x62"
+ "\x87\xe3\x82\x2f\x6f\xe1\x09\xd9"
+ "\x94\x97\xea\xdd\x58\x9e\xae\x76"
+ "\x7e\x35\xe5\xb4\xda\x7e\xf4\xde"
+ "\xf7\x32\x87\xcd\x93\xbf\x11\x56"
+ "\x11\xbe\x08\x74\xe1\x69\xad\xe2"
+ "\xd7\xf8\x86\x75\x8a\x3c\xa4\xbe"
+ "\x70\xa7\x1b\xfc\x0b\x44\x2a\x76"
+ "\x35\xea\x5d\x85\x81\xaf\x85\xeb"
+ "\xa0\x1c\x61\xc2\xf7\x4f\xa5\xdc"
+ "\x02\x7f\xf6\x95\x40\x6e\x8a\x9a"
+ "\xf3\x5d\x25\x6e\x14\x3a\x22\xc9"
+ "\x37\x1c\xeb\x46\x54\x3f\xa5\x91"
+ "\xc2\xb5\x8c\xfe\x53\x08\x97\x32"
+ "\x1b\xb2\x30\x27\xfe\x25\x5d\xdc"
+ "\x08\x87\xd0\xe5\x94\x1a\xd4\xf1"
+ "\xfe\xd6\xb4\xa3\xe6\x74\x81\x3c"
+ "\x1b\xb7\x31\xa7\x22\xfd\xd4\xdd"
+ "\x20\x4e\x7c\x51\xb0\x60\x73\xb8"
+ "\x9c\xac\x91\x90\x7e\x01\xb0\xe1"
+ "\x8a\x2f\x75\x1c\x53\x2a\x98\x2a"
+ "\x06\x52\x95\x52\xb2\xe9\x25\x2e"
+ "\x4c\xe2\x5a\x00\xb2\x13\x81\x03"
+ "\x77\x66\x0d\xa5\x99\xda\x4e\x8c"
+ "\xac\xf3\x13\x53\x27\x45\xaf\x64"
+ "\x46\xdc\xea\x23\xda\x97\xd1\xab"
+ "\x7d\x6c\x30\x96\x1f\xbc\x06\x34"
+ "\x18\x0b\x5e\x21\x35\x11\x8d\x4c"
+ "\xe0\x2d\xe9\x50\x16\x74\x81\xa8"
+ "\xb4\x34\xb9\x72\x42\xa6\xcc\xbc"
+ "\xca\x34\x83\x27\x10\x5b\x68\x45"
+ "\x8f\x52\x22\x0c\x55\x3d\x29\x7c"
+ "\xe3\xc0\x66\x05\x42\x91\x5f\x58"
+ "\xfe\x4a\x62\xd9\x8c\xa9\x04\x19"
+ "\x04\xa9\x08\x4b\x57\xfc\x67\x53"
+ "\x08\x7c\xbc\x66\x8a\xb0\xb6\x9f"
+ "\x92\xd6\x41\x7c\x5b\x2a\x00\x79"
+ "\x72",
+ .result = "\xe1\xb6\x8b\x5c\x80\xb8\xcc\x08"
+ "\x1b\x84\xb2\xd1\xad\xa4\x70\xac"
+ "\x67\xa9\x39\x27\xac\xb4\x5b\xb7"
+ "\x4c\x26\x77\x23\x1d\xce\x0a\xbe"
+ "\x18\x9e\x42\x8b\xbd\x7f\xd6\xf1"
+ "\xf1\x6b\xe2\x6d\x7f\x92\x0e\xcb"
+ "\xb8\x79\xba\xb4\xac\x7e\x2d\xc0"
+ "\x9e\x83\x81\x91\xd5\xea\xc3\x12"
+ "\x8d\xa4\x26\x70\xa4\xf9\x71\x0b"
+ "\xbd\x2e\xe1\xb3\x80\x42\x25\xb3"
+ "\x0b\x31\x99\xe1\x0d\xde\xa6\x90"
+ "\xf2\xa3\x10\xf7\xe5\xf3\x83\x1e"
+ "\x2c\xfb\x4d\xf0\x45\x3d\x28\x3c"
+ "\xb8\xf1\xcb\xbf\x67\xd8\x43\x5a"
+ "\x9d\x7b\x73\x29\x88\x0f\x13\x06"
+ "\x37\x50\x0d\x7c\xe6\x9b\x07\xdd"
+ "\x7e\x01\x1f\x81\x90\x10\x69\xdb"
+ "\xa4\xad\x8a\x5e\xac\x30\x72\xf2"
+ "\x36\xcd\xe3\x23\x49\x02\x93\xfa"
+ "\x3d\xbb\xe2\x98\x83\xeb\xe9\x8d"
+ "\xb3\x8f\x11\xaa\x53\xdb\xaf\x2e"
+ "\x95\x13\x99\x3d\x71\xbd\x32\x92"
+ "\xdd\xfc\x9d\x5e\x6f\x63\x2c\xee"
+ "\x91\x1f\x4c\x64\x3d\x87\x55\x0f"
+ "\xcc\x3d\x89\x61\x53\x02\x57\x8f"
+ "\xe4\x77\x29\x32\xaf\xa6\x2f\x0a"
+ "\xae\x3c\x3f\x3f\xf4\xfb\x65\x52"
+ "\xc5\xc1\x78\x78\x53\x28\xad\xed"
+ "\xd1\x67\x37\xc7\x59\x70\xcd\x0a"
+ "\xb8\x0f\x80\x51\x9f\xc0\x12\x5e"
+ "\x06\x0a\x7e\xec\x24\x5f\x73\x00"
+ "\xb1\x0b\x31\x47\x4f\x73\x8d\xb4"
+ "\xce\xf3\x55\x45\x6c\x84\x27\xba"
+ "\xb9\x6f\x03\x4a\xeb\x98\x88\x6e"
+ "\x53\xed\x25\x19\x0d\x8f\xfe\xca"
+ "\x60\xe5\x00\x93\x6e\x3c\xff\x19"
+ "\xae\x08\x3b\x8a\xa6\x84\x05\xfe"
+ "\x9b\x59\xa0\x8c\xc8\x05\x45\xf5"
+ "\x05\x37\xdc\x45\x6f\x8b\x95\x8c"
+ "\x4e\x11\x45\x7a\xce\x21\xa5\xf7"
+ "\x71\x67\xb9\xce\xd7\xf9\xe9\x5e"
+ "\x60\xf5\x53\x7a\xa8\x85\x14\x03"
+ "\xa0\x92\xec\xf3\x51\x80\x84\xc4"
+ "\xdc\x11\x9e\x57\xce\x4b\x45\xcf"
+ "\x90\x95\x85\x0b\x96\xe9\xee\x35"
+ "\x10\xb8\x9b\xf2\x59\x4a\xc6\x7e"
+ "\x85\xe5\x6f\x38\x51\x93\x40\x0c"
+ "\x99\xd7\x7f\x32\xa8\x06\x27\xd1"
+ "\x2b\xd5\xb5\x3a\x1a\xe1\x5e\xda"
+ "\xcd\x5a\x50\x30\x3c\xc7\xe7\x65"
+ "\xa6\x07\x0b\x98\x91\xc6\x20\x27"
+ "\x2a\x03\x63\x1b\x1e\x3d\xaf\xc8"
+ "\x71\x48\x46\x6a\x64\x28\xf9\x3d"
+ "\xd1\x1d\xab\xc8\x40\x76\xc2\x39"
+ "\x4e\x00\x75\xd2\x0e\x82\x58\x8c"
+ "\xd3\x73\x5a\xea\x46\x89\xbe\xfd"
+ "\x4e\x2c\x0d\x94\xaa\x9b\x68\xac"
+ "\x86\x87\x30\x7e\xa9\x16\xcd\x59"
+ "\xd2\xa6\xbe\x0a\xd8\xf5\xfd\x2d"
+ "\x49\x69\xd2\x1a\x90\xd2\x1b\xed"
+ "\xff\x71\x04\x87\x87\x21\xc4\xb8"
+ "\x1f\x5b\x51\x33\xd0\xd6\x59\x9a"
+ "\x03\x0e\xd3\x8b\xfb\x57\x73\xfd"
+ "\x5a\x52\x63\x82\xc8\x85\x2f\xcb"
+ "\x74\x6d\x4e\xd9\x68\x37\x85\x6a"
+ "\xd4\xfb\x94\xed\x8d\xd1\x1a\xaf"
+ "\x76\xa7\xb7\x88\xd0\x2b\x4e\xda"
+ "\xec\x99\x94\x27\x6f\x87\x8c\xdf"
+ "\x4b\x5e\xa6\x66\xdd\xcb\x33\x7b"
+ "\x64\x94\x31\xa8\x37\xa6\x1d\xdb"
+ "\x0d\x5c\x93\xa4\x40\xf9\x30\x53"
+ "\x4b\x74\x8d\xdd\xf6\xde\x3c\xac"
+ "\x5c\x80\x01\x3a\xef\xb1\x9a\x02"
+ "\x0c\x22\x8e\xe7\x44\x09\x74\x4c"
+ "\xf2\x9a\x27\x69\x7f\x12\x32\x36"
+ "\xde\x92\xdf\xde\x8f\x5b\x31\xab"
+ "\x4a\x01\x26\xe0\xb1\xda\xe8\x37"
+ "\x21\x64\xe8\xff\x69\xfc\x9e\x41"
+ "\xd2\x96\x2d\x18\x64\x98\x33\x78"
+ "\x24\x61\x73\x9b\x47\x29\xf1\xa7"
+ "\xcb\x27\x0f\xf0\x85\x6d\x8c\x9d"
+ "\x2c\x95\x9e\xe5\xb2\x8e\x30\x29"
+ "\x78\x8a\x9d\x65\xb4\x8e\xde\x7b"
+ "\xd9\x00\x50\xf5\x7f\x81\xc3\x1b"
+ "\x25\x85\xeb\xc2\x8c\x33\x22\x1e"
+ "\x68\x38\x22\x30\xd8\x2e\x00\x98"
+ "\x85\x16\x06\x56\xb4\x81\x74\x20"
+ "\x95\xdb\x1c\x05\x19\xe8\x23\x4d"
+ "\x65\x5d\xcc\xd8\x7f\xc4\x2d\x0f"
+ "\x57\x26\x71\x07\xad\xaa\x71\x9f"
+ "\x19\x76\x2f\x25\x51\x88\xe4\xc0"
+ "\x82\x6e\x08\x05\x37\x04\xee\x25"
+ "\x23\x90\xe9\x4e\xce\x9b\x16\xc1"
+ "\x31\xe7\x6e\x2c\x1b\xe1\x85\x9a"
+ "\x0c\x8c\xbb\x12\x1e\x68\x7b\x93"
+ "\xa9\x3c\x39\x56\x23\x3e\x6e\xc7"
+ "\x77\x84\xd3\xe0\x86\x59\xaa\xb9"
+ "\xd5\x53\x58\xc9\x0a\x83\x5f\x85"
+ "\xd8\x47\x14\x67\x8a\x3c\x17\xe0"
+ "\xab\x02\x51\xea\xf1\xf0\x4f\x30"
+ "\x7d\xe0\x92\xc2\x5f\xfb\x19\x5a"
+ "\x3f\xbd\xf4\x39\xa4\x31\x0c\x39"
+ "\xd1\xae\x4e\xf7\x65\x7f\x1f\xce"
+ "\xc2\x39\xd1\x84\xd4\xe5\x02\xe0"
+ "\x58\xaa\xf1\x5e\x81\xaf\x7f\x72"
+ "\x0f\x08\x99\x43\xb9\xd8\xac\x41"
+ "\x35\x55\xf2\xb2\xd4\x98\xb8\x3b"
+ "\x2b\x3c\x3e\x16\x06\x31\xfc\x79"
+ "\x47\x38\x63\x51\xc5\xd0\x26\xd7"
+ "\x43\xb4\x2b\xd9\xc5\x05\xf2\x9d"
+ "\x18\xc9\x26\x82\x56\xd2\x11\x05"
+ "\xb6\x89\xb4\x43\x9c\xb5\x9d\x11"
+ "\x6c\x83\x37\x71\x27\x1c\xae\xbf"
+ "\xcd\x57\xd2\xee\x0d\x5a\x15\x26"
+ "\x67\x88\x80\x80\x1b\xdc\xc1\x62"
+ "\xdd\x4c\xff\x92\x5c\x6c\xe1\xa0"
+ "\xe3\x79\xa9\x65\x8c\x8c\x14\x42"
+ "\xe5\x11\xd2\x1a\xad\xa9\x56\x6f"
+ "\x98\xfc\x8a\x7b\x56\x1f\xc6\xc1"
+ "\x52\x12\x92\x9b\x41\x0f\x4b\xae"
+ "\x1b\x4a\xbc\xfe\x23\xb6\x94\x70"
+ "\x04\x30\x9e\x69\x47\xbe\xb8\x8f"
+ "\xca\x45\xd7\x8a\xf4\x78\x3e\xaa"
+ "\x71\x17\xd8\x1e\xb8\x11\x8f\xbc"
+ "\xc8\x1a\x65\x7b\x41\x89\x72\xc7"
+ "\x5f\xbe\xc5\x2a\xdb\x5c\x54\xf9"
+ "\x25\xa3\x7a\x80\x56\x9c\x8c\xab"
+ "\x26\x19\x10\x36\xa6\xf3\x14\x79"
+ "\x40\x98\x70\x68\xb7\x35\xd9\xb9"
+ "\x27\xd4\xe7\x74\x5b\x3d\x97\xb4"
+ "\xd9\xaa\xd9\xf2\xb5\x14\x84\x1f"
+ "\xa9\xde\x12\x44\x5b\x00\xc0\xbc"
+ "\xc8\x11\x25\x1b\x67\x7a\x15\x72"
+ "\xa6\x31\x6f\xf4\x68\x7a\x86\x9d"
+ "\x43\x1c\x5f\x16\xd3\xad\x2e\x52"
+ "\xf3\xb4\xc3\xfa\x27\x2e\x68\x6c"
+ "\x06\xe7\x4c\x4f\xa2\xe0\xe4\x21"
+ "\x5d\x9e\x33\x58\x8d\xbf\xd5\x70"
+ "\xf8\x80\xa5\xdd\xe7\x18\x79\xfa"
+ "\x7b\xfd\x09\x69\x2c\x37\x32\xa8"
+ "\x65\xfa\x8d\x8b\x5c\xcc\xe8\xf3"
+ "\x37\xf6\xa6\xc6\x5c\xa2\x66\x79"
+ "\xfa\x8a\xa7\xd1\x0b\x2e\x1b\x5e"
+ "\x95\x35\x00\x76\xae\x42\xf7\x50"
+ "\x51\x78\xfb\xb4\x28\x24\xde\x1a"
+ "\x70\x8b\xed\xca\x3c\x5e\xe4\xbd"
+ "\x28\xb5\xf3\x76\x4f\x67\x5d\x81"
+ "\xb2\x60\x87\xd9\x7b\x19\x1a\xa7"
+ "\x79\xa2\xfa\x3f\x9e\xa9\xd7\x25"
+ "\x61\xe1\x74\x31\xa2\x77\xa0\x1b"
+ "\xf6\xf7\xcb\xc5\xaa\x9e\xce\xf9"
+ "\x9b\x96\xef\x51\xc3\x1a\x44\x96"
+ "\xae\x17\x50\xab\x29\x08\xda\xcc"
+ "\x1a\xb3\x12\xd0\x24\xe4\xe2\xe0"
+ "\xc6\xe3\xcc\x82\xd0\xba\x47\x4c"
+ "\x3f\x49\xd7\xe8\xb6\x61\xaa\x65"
+ "\x25\x18\x40\x2d\x62\x25\x02\x71"
+ "\x61\xa2\xc1\xb2\x13\xd2\x71\x3f"
+ "\x43\x1a\xc9\x09\x92\xff\xd5\x57"
+ "\xf0\xfc\x5e\x1c\xf1\xf5\xf9\xf3"
+ "\x5b",
+ .ilen = 1281,
+ .rlen = 1281,
+ .also_non_np = 1,
+ .np = 3,
+ .tap = { 1200, 1, 80 },
+ },
+};
+
+/* Adiantum test vectors from https://github.com/google/adiantum */
+static struct cipher_testvec adiantum_xchacha12_aes_enc_tv_template[] = {
+ {
+ .key = "\x9e\xeb\xb2\x49\x3c\x1c\xf5\xf4"
+ "\x6a\x99\xc2\xc4\xdf\xb1\xf4\xdd"
+ "\x75\x20\x57\xea\x2c\x4f\xcd\xb2"
+ "\xa5\x3d\x7b\x49\x1e\xab\xfd\x0f",
+ .klen = 32,
+ .iv = "\xdf\x63\xd4\xab\xd2\x49\xf3\xd8"
+ "\x33\x81\x37\x60\x7d\xfa\x73\x08"
+ "\xd8\x49\x6d\x80\xe8\x2f\x62\x54"
+ "\xeb\x0e\xa9\x39\x5b\x45\x7f\x8a",
+ .input = "\x67\xc9\xf2\x30\x84\x41\x8e\x43"
+ "\xfb\xf3\xb3\x3e\x79\x36\x7f\xe8",
+ .result = "\x6d\x32\x86\x18\x67\x86\x0f\x3f"
+ "\x96\x7c\x9d\x28\x0d\x53\xec\x9f",
+ .ilen = 16,
+ .rlen = 16,
+ .also_non_np = 1,
+ .np = 2,
+ .tap = { 14, 2 },
+ }, {
+ .key = "\x36\x2b\x57\x97\xf8\x5d\xcd\x99"
+ "\x5f\x1a\x5a\x44\x1d\x92\x0f\x27"
+ "\xcc\x16\xd7\x2b\x85\x63\x99\xd3"
+ "\xba\x96\xa1\xdb\xd2\x60\x68\xda",
+ .klen = 32,
+ .iv = "\xef\x58\x69\xb1\x2c\x5e\x9a\x47"
+ "\x24\xc1\xb1\x69\xe1\x12\x93\x8f"
+ "\x43\x3d\x6d\x00\xdb\x5e\xd8\xd9"
+ "\x12\x9a\xfe\xd9\xff\x2d\xaa\xc4",
+ .input = "\x5e\xa8\x68\x19\x85\x98\x12\x23"
+ "\x26\x0a\xcc\xdb\x0a\x04\xb9\xdf"
+ "\x4d\xb3\x48\x7b\xb0\xe3\xc8\x19"
+ "\x43\x5a\x46\x06\x94\x2d\xf2",
+ .result = "\xc7\xc6\xf1\x73\x8f\xc4\xff\x4a"
+ "\x39\xbe\x78\xbe\x8d\x28\xc8\x89"
+ "\x46\x63\xe7\x0c\x7d\x87\xe8\x4e"
+ "\xc9\x18\x7b\xbe\x18\x60\x50",
+ .ilen = 31,
+ .rlen = 31,
+ }, {
+ .key = "\xa5\x28\x24\x34\x1a\x3c\xd8\xf7"
+ "\x05\x91\x8f\xee\x85\x1f\x35\x7f"
+ "\x80\x3d\xfc\x9b\x94\xf6\xfc\x9e"
+ "\x19\x09\x00\xa9\x04\x31\x4f\x11",
+ .klen = 32,
+ .iv = "\xa1\xba\x49\x95\xff\x34\x6d\xb8"
+ "\xcd\x87\x5d\x5e\xfd\xea\x85\xdb"
+ "\x8a\x7b\x5e\xb2\x5d\x57\xdd\x62"
+ "\xac\xa9\x8c\x41\x42\x94\x75\xb7",
+ .input = "\x69\xb4\xe8\x8c\x37\xe8\x67\x82"
+ "\xf1\xec\x5d\x04\xe5\x14\x91\x13"
+ "\xdf\xf2\x87\x1b\x69\x81\x1d\x71"
+ "\x70\x9e\x9c\x3b\xde\x49\x70\x11"
+ "\xa0\xa3\xdb\x0d\x54\x4f\x66\x69"
+ "\xd7\xdb\x80\xa7\x70\x92\x68\xce"
+ "\x81\x04\x2c\xc6\xab\xae\xe5\x60"
+ "\x15\xe9\x6f\xef\xaa\x8f\xa7\xa7"
+ "\x63\x8f\xf2\xf0\x77\xf1\xa8\xea"
+ "\xe1\xb7\x1f\x9e\xab\x9e\x4b\x3f"
+ "\x07\x87\x5b\x6f\xcd\xa8\xaf\xb9"
+ "\xfa\x70\x0b\x52\xb8\xa8\xa7\x9e"
+ "\x07\x5f\xa6\x0e\xb3\x9b\x79\x13"
+ "\x79\xc3\x3e\x8d\x1c\x2c\x68\xc8"
+ "\x51\x1d\x3c\x7b\x7d\x79\x77\x2a"
+ "\x56\x65\xc5\x54\x23\x28\xb0\x03",
+ .result = "\x9e\x16\xab\xed\x4b\xa7\x42\x5a"
+ "\xc6\xfb\x4e\x76\xff\xbe\x03\xa0"
+ "\x0f\xe3\xad\xba\xe4\x98\x2b\x0e"
+ "\x21\x48\xa0\xb8\x65\x48\x27\x48"
+ "\x84\x54\x54\xb2\x9a\x94\x7b\xe6"
+ "\x4b\x29\xe9\xcf\x05\x91\x80\x1a"
+ "\x3a\xf3\x41\x96\x85\x1d\x9f\x74"
+ "\x51\x56\x63\xfa\x7c\x28\x85\x49"
+ "\xf7\x2f\xf9\xf2\x18\x46\xf5\x33"
+ "\x80\xa3\x3c\xce\xb2\x57\x93\xf5"
+ "\xae\xbd\xa9\xf5\x7b\x30\xc4\x93"
+ "\x66\xe0\x30\x77\x16\xe4\xa0\x31"
+ "\xba\x70\xbc\x68\x13\xf5\xb0\x9a"
+ "\xc1\xfc\x7e\xfe\x55\x80\x5c\x48"
+ "\x74\xa6\xaa\xa3\xac\xdc\xc2\xf5"
+ "\x8d\xde\x34\x86\x78\x60\x75\x8d",
+ .ilen = 128,
+ .rlen = 128,
+ .also_non_np = 1,
+ .np = 4,
+ .tap = { 104, 16, 4, 4 },
+ }, {
+ .key = "\xd3\x81\x72\x18\x23\xff\x6f\x4a"
+ "\x25\x74\x29\x0d\x51\x8a\x0e\x13"
+ "\xc1\x53\x5d\x30\x8d\xee\x75\x0d"
+ "\x14\xd6\x69\xc9\x15\xa9\x0c\x60",
+ .klen = 32,
+ .iv = "\x65\x9b\xd4\xa8\x7d\x29\x1d\xf4"
+ "\xc4\xd6\x9b\x6a\x28\xab\x64\xe2"
+ "\x62\x81\x97\xc5\x81\xaa\xf9\x44"
+ "\xc1\x72\x59\x82\xaf\x16\xc8\x2c",
+ .input = "\xc7\x6b\x52\x6a\x10\xf0\xcc\x09"
+ "\xc1\x12\x1d\x6d\x21\xa6\x78\xf5"
+ "\x05\xa3\x69\x60\x91\x36\x98\x57"
+ "\xba\x0c\x14\xcc\xf3\x2d\x73\x03"
+ "\xc6\xb2\x5f\xc8\x16\x27\x37\x5d"
+ "\xd0\x0b\x87\xb2\x50\x94\x7b\x58"
+ "\x04\xf4\xe0\x7f\x6e\x57\x8e\xc9"
+ "\x41\x84\xc1\xb1\x7e\x4b\x91\x12"
+ "\x3a\x8b\x5d\x50\x82\x7b\xcb\xd9"
+ "\x9a\xd9\x4e\x18\x06\x23\x9e\xd4"
+ "\xa5\x20\x98\xef\xb5\xda\xe5\xc0"
+ "\x8a\x6a\x83\x77\x15\x84\x1e\xae"
+ "\x78\x94\x9d\xdf\xb7\xd1\xea\x67"
+ "\xaa\xb0\x14\x15\xfa\x67\x21\x84"
+ "\xd3\x41\x2a\xce\xba\x4b\x4a\xe8"
+ "\x95\x62\xa9\x55\xf0\x80\xad\xbd"
+ "\xab\xaf\xdd\x4f\xa5\x7c\x13\x36"
+ "\xed\x5e\x4f\x72\xad\x4b\xf1\xd0"
+ "\x88\x4e\xec\x2c\x88\x10\x5e\xea"
+ "\x12\xc0\x16\x01\x29\xa3\xa0\x55"
+ "\xaa\x68\xf3\xe9\x9d\x3b\x0d\x3b"
+ "\x6d\xec\xf8\xa0\x2d\xf0\x90\x8d"
+ "\x1c\xe2\x88\xd4\x24\x71\xf9\xb3"
+ "\xc1\x9f\xc5\xd6\x76\x70\xc5\x2e"
+ "\x9c\xac\xdb\x90\xbd\x83\x72\xba"
+ "\x6e\xb5\xa5\x53\x83\xa9\xa5\xbf"
+ "\x7d\x06\x0e\x3c\x2a\xd2\x04\xb5"
+ "\x1e\x19\x38\x09\x16\xd2\x82\x1f"
+ "\x75\x18\x56\xb8\x96\x0b\xa6\xf9"
+ "\xcf\x62\xd9\x32\x5d\xa9\xd7\x1d"
+ "\xec\xe4\xdf\x1b\xbe\xf1\x36\xee"
+ "\xe3\x7b\xb5\x2f\xee\xf8\x53\x3d"
+ "\x6a\xb7\x70\xa9\xfc\x9c\x57\x25"
+ "\xf2\x89\x10\xd3\xb8\xa8\x8c\x30"
+ "\xae\x23\x4f\x0e\x13\x66\x4f\xe1"
+ "\xb6\xc0\xe4\xf8\xef\x93\xbd\x6e"
+ "\x15\x85\x6b\xe3\x60\x81\x1d\x68"
+ "\xd7\x31\x87\x89\x09\xab\xd5\x96"
+ "\x1d\xf3\x6d\x67\x80\xca\x07\x31"
+ "\x5d\xa7\xe4\xfb\x3e\xf2\x9b\x33"
+ "\x52\x18\xc8\x30\xfe\x2d\xca\x1e"
+ "\x79\x92\x7a\x60\x5c\xb6\x58\x87"
+ "\xa4\x36\xa2\x67\x92\x8b\xa4\xb7"
+ "\xf1\x86\xdf\xdc\xc0\x7e\x8f\x63"
+ "\xd2\xa2\xdc\x78\xeb\x4f\xd8\x96"
+ "\x47\xca\xb8\x91\xf9\xf7\x94\x21"
+ "\x5f\x9a\x9f\x5b\xb8\x40\x41\x4b"
+ "\x66\x69\x6a\x72\xd0\xcb\x70\xb7"
+ "\x93\xb5\x37\x96\x05\x37\x4f\xe5"
+ "\x8c\xa7\x5a\x4e\x8b\xb7\x84\xea"
+ "\xc7\xfc\x19\x6e\x1f\x5a\xa1\xac"
+ "\x18\x7d\x52\x3b\xb3\x34\x62\x99"
+ "\xe4\x9e\x31\x04\x3f\xc0\x8d\x84"
+ "\x17\x7c\x25\x48\x52\x67\x11\x27"
+ "\x67\xbb\x5a\x85\xca\x56\xb2\x5c"
+ "\xe6\xec\xd5\x96\x3d\x15\xfc\xfb"
+ "\x22\x25\xf4\x13\xe5\x93\x4b\x9a"
+ "\x77\xf1\x52\x18\xfa\x16\x5e\x49"
+ "\x03\x45\xa8\x08\xfa\xb3\x41\x92"
+ "\x79\x50\x33\xca\xd0\xd7\x42\x55"
+ "\xc3\x9a\x0c\x4e\xd9\xa4\x3c\x86"
+ "\x80\x9f\x53\xd1\xa4\x2e\xd1\xbc"
+ "\xf1\x54\x6e\x93\xa4\x65\x99\x8e"
+ "\xdf\x29\xc0\x64\x63\x07\xbb\xea",
+ .result = "\x15\x97\xd0\x86\x18\x03\x9c\x51"
+ "\xc5\x11\x36\x62\x13\x92\xe6\x73"
+ "\x29\x79\xde\xa1\x00\x3e\x08\x64"
+ "\x17\x1a\xbc\xd5\xfe\x33\x0e\x0c"
+ "\x7c\x94\xa7\xc6\x3c\xbe\xac\xa2"
+ "\x89\xe6\xbc\xdf\x0c\x33\x27\x42"
+ "\x46\x73\x2f\xba\x4e\xa6\x46\x8f"
+ "\xe4\xee\x39\x63\x42\x65\xa3\x88"
+ "\x7a\xad\x33\x23\xa9\xa7\x20\x7f"
+ "\x0b\xe6\x6a\xc3\x60\xda\x9e\xb4"
+ "\xd6\x07\x8a\x77\x26\xd1\xab\x44"
+ "\x99\x55\x03\x5e\xed\x8d\x7b\xbd"
+ "\xc8\x21\xb7\x21\x30\x3f\xc0\xb5"
+ "\xc8\xec\x6c\x23\xa6\xa3\x6d\xf1"
+ "\x30\x0a\xd0\xa6\xa9\x28\x69\xae"
+ "\x2a\xe6\x54\xac\x82\x9d\x6a\x95"
+ "\x6f\x06\x44\xc5\x5a\x77\x6e\xec"
+ "\xf8\xf8\x63\xb2\xe6\xaa\xbd\x8e"
+ "\x0e\x8a\x62\x00\x03\xc8\x84\xdd"
+ "\x47\x4a\xc3\x55\xba\xb7\xe7\xdf"
+ "\x08\xbf\x62\xf5\xe8\xbc\xb6\x11"
+ "\xe4\xcb\xd0\x66\x74\x32\xcf\xd4"
+ "\xf8\x51\x80\x39\x14\x05\x12\xdb"
+ "\x87\x93\xe2\x26\x30\x9c\x3a\x21"
+ "\xe5\xd0\x38\x57\x80\x15\xe4\x08"
+ "\x58\x05\x49\x7d\xe6\x92\x77\x70"
+ "\xfb\x1e\x2d\x6a\x84\x00\xc8\x68"
+ "\xf7\x1a\xdd\xf0\x7b\x38\x1e\xd8"
+ "\x2c\x78\x78\x61\xcf\xe3\xde\x69"
+ "\x1f\xd5\x03\xd5\x1a\xb4\xcf\x03"
+ "\xc8\x7a\x70\x68\x35\xb4\xf6\xbe"
+ "\x90\x62\xb2\x28\x99\x86\xf5\x44"
+ "\x99\xeb\x31\xcf\xca\xdf\xd0\x21"
+ "\xd6\x60\xf7\x0f\x40\xb4\x80\xb7"
+ "\xab\xe1\x9b\x45\xba\x66\xda\xee"
+ "\xdd\x04\x12\x40\x98\xe1\x69\xe5"
+ "\x2b\x9c\x59\x80\xe7\x7b\xcc\x63"
+ "\xa6\xc0\x3a\xa9\xfe\x8a\xf9\x62"
+ "\x11\x34\x61\x94\x35\xfe\xf2\x99"
+ "\xfd\xee\x19\xea\x95\xb6\x12\xbf"
+ "\x1b\xdf\x02\x1a\xcc\x3e\x7e\x65"
+ "\x78\x74\x10\x50\x29\x63\x28\xea"
+ "\x6b\xab\xd4\x06\x4d\x15\x24\x31"
+ "\xc7\x0a\xc9\x16\xb6\x48\xf0\xbf"
+ "\x49\xdb\x68\x71\x31\x8f\x87\xe2"
+ "\x13\x05\x64\xd6\x22\x0c\xf8\x36"
+ "\x84\x24\x3e\x69\x5e\xb8\x9e\x16"
+ "\x73\x6c\x83\x1e\xe0\x9f\x9e\xba"
+ "\xe5\x59\x21\x33\x1b\xa9\x26\xc2"
+ "\xc7\xd9\x30\x73\xb6\xa6\x73\x82"
+ "\x19\xfa\x44\x4d\x40\x8b\x69\x04"
+ "\x94\x74\xea\x6e\xb3\x09\x47\x01"
+ "\x2a\xb9\x78\x34\x43\x11\xed\xd6"
+ "\x8c\x95\x65\x1b\x85\x67\xa5\x40"
+ "\xac\x9c\x05\x4b\x57\x4a\xa9\x96"
+ "\x0f\xdd\x4f\xa1\xe0\xcf\x6e\xc7"
+ "\x1b\xed\xa2\xb4\x56\x8c\x09\x6e"
+ "\xa6\x65\xd7\x55\x81\xb7\xed\x11"
+ "\x9b\x40\x75\xa8\x6b\x56\xaf\x16"
+ "\x8b\x3d\xf4\xcb\xfe\xd5\x1d\x3d"
+ "\x85\xc2\xc0\xde\x43\x39\x4a\x96"
+ "\xba\x88\x97\xc0\xd6\x00\x0e\x27"
+ "\x21\xb0\x21\x52\xba\xa7\x37\xaa"
+ "\xcc\xbf\x95\xa8\xf4\xd0\x91\xf6",
+ .ilen = 512,
+ .rlen = 512,
+ .also_non_np = 1,
+ .np = 2,
+ .tap = { 144, 368 },
+ }
+};
+
+static struct cipher_testvec adiantum_xchacha12_aes_dec_tv_template[] = {
+ {
+ .key = "\x9e\xeb\xb2\x49\x3c\x1c\xf5\xf4"
+ "\x6a\x99\xc2\xc4\xdf\xb1\xf4\xdd"
+ "\x75\x20\x57\xea\x2c\x4f\xcd\xb2"
+ "\xa5\x3d\x7b\x49\x1e\xab\xfd\x0f",
+ .klen = 32,
+ .iv = "\xdf\x63\xd4\xab\xd2\x49\xf3\xd8"
+ "\x33\x81\x37\x60\x7d\xfa\x73\x08"
+ "\xd8\x49\x6d\x80\xe8\x2f\x62\x54"
+ "\xeb\x0e\xa9\x39\x5b\x45\x7f\x8a",
+ .result = "\x67\xc9\xf2\x30\x84\x41\x8e\x43"
+ "\xfb\xf3\xb3\x3e\x79\x36\x7f\xe8",
+ .input = "\x6d\x32\x86\x18\x67\x86\x0f\x3f"
+ "\x96\x7c\x9d\x28\x0d\x53\xec\x9f",
+ .ilen = 16,
+ .rlen = 16,
+ .also_non_np = 1,
+ .np = 2,
+ .tap = { 14, 2 },
+ }, {
+ .key = "\x36\x2b\x57\x97\xf8\x5d\xcd\x99"
+ "\x5f\x1a\x5a\x44\x1d\x92\x0f\x27"
+ "\xcc\x16\xd7\x2b\x85\x63\x99\xd3"
+ "\xba\x96\xa1\xdb\xd2\x60\x68\xda",
+ .klen = 32,
+ .iv = "\xef\x58\x69\xb1\x2c\x5e\x9a\x47"
+ "\x24\xc1\xb1\x69\xe1\x12\x93\x8f"
+ "\x43\x3d\x6d\x00\xdb\x5e\xd8\xd9"
+ "\x12\x9a\xfe\xd9\xff\x2d\xaa\xc4",
+ .result = "\x5e\xa8\x68\x19\x85\x98\x12\x23"
+ "\x26\x0a\xcc\xdb\x0a\x04\xb9\xdf"
+ "\x4d\xb3\x48\x7b\xb0\xe3\xc8\x19"
+ "\x43\x5a\x46\x06\x94\x2d\xf2",
+ .input = "\xc7\xc6\xf1\x73\x8f\xc4\xff\x4a"
+ "\x39\xbe\x78\xbe\x8d\x28\xc8\x89"
+ "\x46\x63\xe7\x0c\x7d\x87\xe8\x4e"
+ "\xc9\x18\x7b\xbe\x18\x60\x50",
+ .ilen = 31,
+ .rlen = 31,
+ }, {
+ .key = "\xa5\x28\x24\x34\x1a\x3c\xd8\xf7"
+ "\x05\x91\x8f\xee\x85\x1f\x35\x7f"
+ "\x80\x3d\xfc\x9b\x94\xf6\xfc\x9e"
+ "\x19\x09\x00\xa9\x04\x31\x4f\x11",
+ .klen = 32,
+ .iv = "\xa1\xba\x49\x95\xff\x34\x6d\xb8"
+ "\xcd\x87\x5d\x5e\xfd\xea\x85\xdb"
+ "\x8a\x7b\x5e\xb2\x5d\x57\xdd\x62"
+ "\xac\xa9\x8c\x41\x42\x94\x75\xb7",
+ .result = "\x69\xb4\xe8\x8c\x37\xe8\x67\x82"
+ "\xf1\xec\x5d\x04\xe5\x14\x91\x13"
+ "\xdf\xf2\x87\x1b\x69\x81\x1d\x71"
+ "\x70\x9e\x9c\x3b\xde\x49\x70\x11"
+ "\xa0\xa3\xdb\x0d\x54\x4f\x66\x69"
+ "\xd7\xdb\x80\xa7\x70\x92\x68\xce"
+ "\x81\x04\x2c\xc6\xab\xae\xe5\x60"
+ "\x15\xe9\x6f\xef\xaa\x8f\xa7\xa7"
+ "\x63\x8f\xf2\xf0\x77\xf1\xa8\xea"
+ "\xe1\xb7\x1f\x9e\xab\x9e\x4b\x3f"
+ "\x07\x87\x5b\x6f\xcd\xa8\xaf\xb9"
+ "\xfa\x70\x0b\x52\xb8\xa8\xa7\x9e"
+ "\x07\x5f\xa6\x0e\xb3\x9b\x79\x13"
+ "\x79\xc3\x3e\x8d\x1c\x2c\x68\xc8"
+ "\x51\x1d\x3c\x7b\x7d\x79\x77\x2a"
+ "\x56\x65\xc5\x54\x23\x28\xb0\x03",
+ .input = "\x9e\x16\xab\xed\x4b\xa7\x42\x5a"
+ "\xc6\xfb\x4e\x76\xff\xbe\x03\xa0"
+ "\x0f\xe3\xad\xba\xe4\x98\x2b\x0e"
+ "\x21\x48\xa0\xb8\x65\x48\x27\x48"
+ "\x84\x54\x54\xb2\x9a\x94\x7b\xe6"
+ "\x4b\x29\xe9\xcf\x05\x91\x80\x1a"
+ "\x3a\xf3\x41\x96\x85\x1d\x9f\x74"
+ "\x51\x56\x63\xfa\x7c\x28\x85\x49"
+ "\xf7\x2f\xf9\xf2\x18\x46\xf5\x33"
+ "\x80\xa3\x3c\xce\xb2\x57\x93\xf5"
+ "\xae\xbd\xa9\xf5\x7b\x30\xc4\x93"
+ "\x66\xe0\x30\x77\x16\xe4\xa0\x31"
+ "\xba\x70\xbc\x68\x13\xf5\xb0\x9a"
+ "\xc1\xfc\x7e\xfe\x55\x80\x5c\x48"
+ "\x74\xa6\xaa\xa3\xac\xdc\xc2\xf5"
+ "\x8d\xde\x34\x86\x78\x60\x75\x8d",
+ .ilen = 128,
+ .rlen = 128,
+ .also_non_np = 1,
+ .np = 4,
+ .tap = { 104, 16, 4, 4 },
+ }, {
+ .key = "\xd3\x81\x72\x18\x23\xff\x6f\x4a"
+ "\x25\x74\x29\x0d\x51\x8a\x0e\x13"
+ "\xc1\x53\x5d\x30\x8d\xee\x75\x0d"
+ "\x14\xd6\x69\xc9\x15\xa9\x0c\x60",
+ .klen = 32,
+ .iv = "\x65\x9b\xd4\xa8\x7d\x29\x1d\xf4"
+ "\xc4\xd6\x9b\x6a\x28\xab\x64\xe2"
+ "\x62\x81\x97\xc5\x81\xaa\xf9\x44"
+ "\xc1\x72\x59\x82\xaf\x16\xc8\x2c",
+ .result = "\xc7\x6b\x52\x6a\x10\xf0\xcc\x09"
+ "\xc1\x12\x1d\x6d\x21\xa6\x78\xf5"
+ "\x05\xa3\x69\x60\x91\x36\x98\x57"
+ "\xba\x0c\x14\xcc\xf3\x2d\x73\x03"
+ "\xc6\xb2\x5f\xc8\x16\x27\x37\x5d"
+ "\xd0\x0b\x87\xb2\x50\x94\x7b\x58"
+ "\x04\xf4\xe0\x7f\x6e\x57\x8e\xc9"
+ "\x41\x84\xc1\xb1\x7e\x4b\x91\x12"
+ "\x3a\x8b\x5d\x50\x82\x7b\xcb\xd9"
+ "\x9a\xd9\x4e\x18\x06\x23\x9e\xd4"
+ "\xa5\x20\x98\xef\xb5\xda\xe5\xc0"
+ "\x8a\x6a\x83\x77\x15\x84\x1e\xae"
+ "\x78\x94\x9d\xdf\xb7\xd1\xea\x67"
+ "\xaa\xb0\x14\x15\xfa\x67\x21\x84"
+ "\xd3\x41\x2a\xce\xba\x4b\x4a\xe8"
+ "\x95\x62\xa9\x55\xf0\x80\xad\xbd"
+ "\xab\xaf\xdd\x4f\xa5\x7c\x13\x36"
+ "\xed\x5e\x4f\x72\xad\x4b\xf1\xd0"
+ "\x88\x4e\xec\x2c\x88\x10\x5e\xea"
+ "\x12\xc0\x16\x01\x29\xa3\xa0\x55"
+ "\xaa\x68\xf3\xe9\x9d\x3b\x0d\x3b"
+ "\x6d\xec\xf8\xa0\x2d\xf0\x90\x8d"
+ "\x1c\xe2\x88\xd4\x24\x71\xf9\xb3"
+ "\xc1\x9f\xc5\xd6\x76\x70\xc5\x2e"
+ "\x9c\xac\xdb\x90\xbd\x83\x72\xba"
+ "\x6e\xb5\xa5\x53\x83\xa9\xa5\xbf"
+ "\x7d\x06\x0e\x3c\x2a\xd2\x04\xb5"
+ "\x1e\x19\x38\x09\x16\xd2\x82\x1f"
+ "\x75\x18\x56\xb8\x96\x0b\xa6\xf9"
+ "\xcf\x62\xd9\x32\x5d\xa9\xd7\x1d"
+ "\xec\xe4\xdf\x1b\xbe\xf1\x36\xee"
+ "\xe3\x7b\xb5\x2f\xee\xf8\x53\x3d"
+ "\x6a\xb7\x70\xa9\xfc\x9c\x57\x25"
+ "\xf2\x89\x10\xd3\xb8\xa8\x8c\x30"
+ "\xae\x23\x4f\x0e\x13\x66\x4f\xe1"
+ "\xb6\xc0\xe4\xf8\xef\x93\xbd\x6e"
+ "\x15\x85\x6b\xe3\x60\x81\x1d\x68"
+ "\xd7\x31\x87\x89\x09\xab\xd5\x96"
+ "\x1d\xf3\x6d\x67\x80\xca\x07\x31"
+ "\x5d\xa7\xe4\xfb\x3e\xf2\x9b\x33"
+ "\x52\x18\xc8\x30\xfe\x2d\xca\x1e"
+ "\x79\x92\x7a\x60\x5c\xb6\x58\x87"
+ "\xa4\x36\xa2\x67\x92\x8b\xa4\xb7"
+ "\xf1\x86\xdf\xdc\xc0\x7e\x8f\x63"
+ "\xd2\xa2\xdc\x78\xeb\x4f\xd8\x96"
+ "\x47\xca\xb8\x91\xf9\xf7\x94\x21"
+ "\x5f\x9a\x9f\x5b\xb8\x40\x41\x4b"
+ "\x66\x69\x6a\x72\xd0\xcb\x70\xb7"
+ "\x93\xb5\x37\x96\x05\x37\x4f\xe5"
+ "\x8c\xa7\x5a\x4e\x8b\xb7\x84\xea"
+ "\xc7\xfc\x19\x6e\x1f\x5a\xa1\xac"
+ "\x18\x7d\x52\x3b\xb3\x34\x62\x99"
+ "\xe4\x9e\x31\x04\x3f\xc0\x8d\x84"
+ "\x17\x7c\x25\x48\x52\x67\x11\x27"
+ "\x67\xbb\x5a\x85\xca\x56\xb2\x5c"
+ "\xe6\xec\xd5\x96\x3d\x15\xfc\xfb"
+ "\x22\x25\xf4\x13\xe5\x93\x4b\x9a"
+ "\x77\xf1\x52\x18\xfa\x16\x5e\x49"
+ "\x03\x45\xa8\x08\xfa\xb3\x41\x92"
+ "\x79\x50\x33\xca\xd0\xd7\x42\x55"
+ "\xc3\x9a\x0c\x4e\xd9\xa4\x3c\x86"
+ "\x80\x9f\x53\xd1\xa4\x2e\xd1\xbc"
+ "\xf1\x54\x6e\x93\xa4\x65\x99\x8e"
+ "\xdf\x29\xc0\x64\x63\x07\xbb\xea",
+ .input = "\x15\x97\xd0\x86\x18\x03\x9c\x51"
+ "\xc5\x11\x36\x62\x13\x92\xe6\x73"
+ "\x29\x79\xde\xa1\x00\x3e\x08\x64"
+ "\x17\x1a\xbc\xd5\xfe\x33\x0e\x0c"
+ "\x7c\x94\xa7\xc6\x3c\xbe\xac\xa2"
+ "\x89\xe6\xbc\xdf\x0c\x33\x27\x42"
+ "\x46\x73\x2f\xba\x4e\xa6\x46\x8f"
+ "\xe4\xee\x39\x63\x42\x65\xa3\x88"
+ "\x7a\xad\x33\x23\xa9\xa7\x20\x7f"
+ "\x0b\xe6\x6a\xc3\x60\xda\x9e\xb4"
+ "\xd6\x07\x8a\x77\x26\xd1\xab\x44"
+ "\x99\x55\x03\x5e\xed\x8d\x7b\xbd"
+ "\xc8\x21\xb7\x21\x30\x3f\xc0\xb5"
+ "\xc8\xec\x6c\x23\xa6\xa3\x6d\xf1"
+ "\x30\x0a\xd0\xa6\xa9\x28\x69\xae"
+ "\x2a\xe6\x54\xac\x82\x9d\x6a\x95"
+ "\x6f\x06\x44\xc5\x5a\x77\x6e\xec"
+ "\xf8\xf8\x63\xb2\xe6\xaa\xbd\x8e"
+ "\x0e\x8a\x62\x00\x03\xc8\x84\xdd"
+ "\x47\x4a\xc3\x55\xba\xb7\xe7\xdf"
+ "\x08\xbf\x62\xf5\xe8\xbc\xb6\x11"
+ "\xe4\xcb\xd0\x66\x74\x32\xcf\xd4"
+ "\xf8\x51\x80\x39\x14\x05\x12\xdb"
+ "\x87\x93\xe2\x26\x30\x9c\x3a\x21"
+ "\xe5\xd0\x38\x57\x80\x15\xe4\x08"
+ "\x58\x05\x49\x7d\xe6\x92\x77\x70"
+ "\xfb\x1e\x2d\x6a\x84\x00\xc8\x68"
+ "\xf7\x1a\xdd\xf0\x7b\x38\x1e\xd8"
+ "\x2c\x78\x78\x61\xcf\xe3\xde\x69"
+ "\x1f\xd5\x03\xd5\x1a\xb4\xcf\x03"
+ "\xc8\x7a\x70\x68\x35\xb4\xf6\xbe"
+ "\x90\x62\xb2\x28\x99\x86\xf5\x44"
+ "\x99\xeb\x31\xcf\xca\xdf\xd0\x21"
+ "\xd6\x60\xf7\x0f\x40\xb4\x80\xb7"
+ "\xab\xe1\x9b\x45\xba\x66\xda\xee"
+ "\xdd\x04\x12\x40\x98\xe1\x69\xe5"
+ "\x2b\x9c\x59\x80\xe7\x7b\xcc\x63"
+ "\xa6\xc0\x3a\xa9\xfe\x8a\xf9\x62"
+ "\x11\x34\x61\x94\x35\xfe\xf2\x99"
+ "\xfd\xee\x19\xea\x95\xb6\x12\xbf"
+ "\x1b\xdf\x02\x1a\xcc\x3e\x7e\x65"
+ "\x78\x74\x10\x50\x29\x63\x28\xea"
+ "\x6b\xab\xd4\x06\x4d\x15\x24\x31"
+ "\xc7\x0a\xc9\x16\xb6\x48\xf0\xbf"
+ "\x49\xdb\x68\x71\x31\x8f\x87\xe2"
+ "\x13\x05\x64\xd6\x22\x0c\xf8\x36"
+ "\x84\x24\x3e\x69\x5e\xb8\x9e\x16"
+ "\x73\x6c\x83\x1e\xe0\x9f\x9e\xba"
+ "\xe5\x59\x21\x33\x1b\xa9\x26\xc2"
+ "\xc7\xd9\x30\x73\xb6\xa6\x73\x82"
+ "\x19\xfa\x44\x4d\x40\x8b\x69\x04"
+ "\x94\x74\xea\x6e\xb3\x09\x47\x01"
+ "\x2a\xb9\x78\x34\x43\x11\xed\xd6"
+ "\x8c\x95\x65\x1b\x85\x67\xa5\x40"
+ "\xac\x9c\x05\x4b\x57\x4a\xa9\x96"
+ "\x0f\xdd\x4f\xa1\xe0\xcf\x6e\xc7"
+ "\x1b\xed\xa2\xb4\x56\x8c\x09\x6e"
+ "\xa6\x65\xd7\x55\x81\xb7\xed\x11"
+ "\x9b\x40\x75\xa8\x6b\x56\xaf\x16"
+ "\x8b\x3d\xf4\xcb\xfe\xd5\x1d\x3d"
+ "\x85\xc2\xc0\xde\x43\x39\x4a\x96"
+ "\xba\x88\x97\xc0\xd6\x00\x0e\x27"
+ "\x21\xb0\x21\x52\xba\xa7\x37\xaa"
+ "\xcc\xbf\x95\xa8\xf4\xd0\x91\xf6",
+ .ilen = 512,
+ .rlen = 512,
+ .also_non_np = 1,
+ .np = 2,
+ .tap = { 144, 368 },
+ }
+};
+
+/* Adiantum with XChaCha20 instead of XChaCha12 */
+/* Test vectors from https://github.com/google/adiantum */
+static struct cipher_testvec adiantum_xchacha20_aes_enc_tv_template[] = {
+ {
+ .key = "\x9e\xeb\xb2\x49\x3c\x1c\xf5\xf4"
+ "\x6a\x99\xc2\xc4\xdf\xb1\xf4\xdd"
+ "\x75\x20\x57\xea\x2c\x4f\xcd\xb2"
+ "\xa5\x3d\x7b\x49\x1e\xab\xfd\x0f",
+ .klen = 32,
+ .iv = "\xdf\x63\xd4\xab\xd2\x49\xf3\xd8"
+ "\x33\x81\x37\x60\x7d\xfa\x73\x08"
+ "\xd8\x49\x6d\x80\xe8\x2f\x62\x54"
+ "\xeb\x0e\xa9\x39\x5b\x45\x7f\x8a",
+ .input = "\x67\xc9\xf2\x30\x84\x41\x8e\x43"
+ "\xfb\xf3\xb3\x3e\x79\x36\x7f\xe8",
+ .result = "\xf6\x78\x97\xd6\xaa\x94\x01\x27"
+ "\x2e\x4d\x83\xe0\x6e\x64\x9a\xdf",
+ .ilen = 16,
+ .rlen = 16,
+ .also_non_np = 1,
+ .np = 3,
+ .tap = { 5, 2, 9 },
+ }, {
+ .key = "\x36\x2b\x57\x97\xf8\x5d\xcd\x99"
+ "\x5f\x1a\x5a\x44\x1d\x92\x0f\x27"
+ "\xcc\x16\xd7\x2b\x85\x63\x99\xd3"
+ "\xba\x96\xa1\xdb\xd2\x60\x68\xda",
+ .klen = 32,
+ .iv = "\xef\x58\x69\xb1\x2c\x5e\x9a\x47"
+ "\x24\xc1\xb1\x69\xe1\x12\x93\x8f"
+ "\x43\x3d\x6d\x00\xdb\x5e\xd8\xd9"
+ "\x12\x9a\xfe\xd9\xff\x2d\xaa\xc4",
+ .input = "\x5e\xa8\x68\x19\x85\x98\x12\x23"
+ "\x26\x0a\xcc\xdb\x0a\x04\xb9\xdf"
+ "\x4d\xb3\x48\x7b\xb0\xe3\xc8\x19"
+ "\x43\x5a\x46\x06\x94\x2d\xf2",
+ .result = "\x4b\xb8\x90\x10\xdf\x7f\x64\x08"
+ "\x0e\x14\x42\x5f\x00\x74\x09\x36"
+ "\x57\x72\xb5\xfd\xb5\x5d\xb8\x28"
+ "\x0c\x04\x91\x14\x91\xe9\x37",
+ .ilen = 31,
+ .rlen = 31,
+ .also_non_np = 1,
+ .np = 2,
+ .tap = { 16, 15 },
+ }, {
+ .key = "\xa5\x28\x24\x34\x1a\x3c\xd8\xf7"
+ "\x05\x91\x8f\xee\x85\x1f\x35\x7f"
+ "\x80\x3d\xfc\x9b\x94\xf6\xfc\x9e"
+ "\x19\x09\x00\xa9\x04\x31\x4f\x11",
+ .klen = 32,
+ .iv = "\xa1\xba\x49\x95\xff\x34\x6d\xb8"
+ "\xcd\x87\x5d\x5e\xfd\xea\x85\xdb"
+ "\x8a\x7b\x5e\xb2\x5d\x57\xdd\x62"
+ "\xac\xa9\x8c\x41\x42\x94\x75\xb7",
+ .input = "\x69\xb4\xe8\x8c\x37\xe8\x67\x82"
+ "\xf1\xec\x5d\x04\xe5\x14\x91\x13"
+ "\xdf\xf2\x87\x1b\x69\x81\x1d\x71"
+ "\x70\x9e\x9c\x3b\xde\x49\x70\x11"
+ "\xa0\xa3\xdb\x0d\x54\x4f\x66\x69"
+ "\xd7\xdb\x80\xa7\x70\x92\x68\xce"
+ "\x81\x04\x2c\xc6\xab\xae\xe5\x60"
+ "\x15\xe9\x6f\xef\xaa\x8f\xa7\xa7"
+ "\x63\x8f\xf2\xf0\x77\xf1\xa8\xea"
+ "\xe1\xb7\x1f\x9e\xab\x9e\x4b\x3f"
+ "\x07\x87\x5b\x6f\xcd\xa8\xaf\xb9"
+ "\xfa\x70\x0b\x52\xb8\xa8\xa7\x9e"
+ "\x07\x5f\xa6\x0e\xb3\x9b\x79\x13"
+ "\x79\xc3\x3e\x8d\x1c\x2c\x68\xc8"
+ "\x51\x1d\x3c\x7b\x7d\x79\x77\x2a"
+ "\x56\x65\xc5\x54\x23\x28\xb0\x03",
+ .result = "\xb1\x8b\xa0\x05\x77\xa8\x4d\x59"
+ "\x1b\x8e\x21\xfc\x3a\x49\xfa\xd4"
+ "\xeb\x36\xf3\xc4\xdf\xdc\xae\x67"
+ "\x07\x3f\x70\x0e\xe9\x66\xf5\x0c"
+ "\x30\x4d\x66\xc9\xa4\x2f\x73\x9c"
+ "\x13\xc8\x49\x44\xcc\x0a\x90\x9d"
+ "\x7c\xdd\x19\x3f\xea\x72\x8d\x58"
+ "\xab\xe7\x09\x2c\xec\xb5\x44\xd2"
+ "\xca\xa6\x2d\x7a\x5c\x9c\x2b\x15"
+ "\xec\x2a\xa6\x69\x91\xf9\xf3\x13"
+ "\xf7\x72\xc1\xc1\x40\xd5\xe1\x94"
+ "\xf4\x29\xa1\x3e\x25\x02\xa8\x3e"
+ "\x94\xc1\x91\x14\xa1\x14\xcb\xbe"
+ "\x67\x4c\xb9\x38\xfe\xa7\xaa\x32"
+ "\x29\x62\x0d\xb2\xf6\x3c\x58\x57"
+ "\xc1\xd5\x5a\xbb\xd6\xa6\x2a\xe5",
+ .ilen = 128,
+ .rlen = 128,
+ .also_non_np = 1,
+ .np = 4,
+ .tap = { 112, 7, 8, 1 },
+ }, {
+ .key = "\xd3\x81\x72\x18\x23\xff\x6f\x4a"
+ "\x25\x74\x29\x0d\x51\x8a\x0e\x13"
+ "\xc1\x53\x5d\x30\x8d\xee\x75\x0d"
+ "\x14\xd6\x69\xc9\x15\xa9\x0c\x60",
+ .klen = 32,
+ .iv = "\x65\x9b\xd4\xa8\x7d\x29\x1d\xf4"
+ "\xc4\xd6\x9b\x6a\x28\xab\x64\xe2"
+ "\x62\x81\x97\xc5\x81\xaa\xf9\x44"
+ "\xc1\x72\x59\x82\xaf\x16\xc8\x2c",
+ .input = "\xc7\x6b\x52\x6a\x10\xf0\xcc\x09"
+ "\xc1\x12\x1d\x6d\x21\xa6\x78\xf5"
+ "\x05\xa3\x69\x60\x91\x36\x98\x57"
+ "\xba\x0c\x14\xcc\xf3\x2d\x73\x03"
+ "\xc6\xb2\x5f\xc8\x16\x27\x37\x5d"
+ "\xd0\x0b\x87\xb2\x50\x94\x7b\x58"
+ "\x04\xf4\xe0\x7f\x6e\x57\x8e\xc9"
+ "\x41\x84\xc1\xb1\x7e\x4b\x91\x12"
+ "\x3a\x8b\x5d\x50\x82\x7b\xcb\xd9"
+ "\x9a\xd9\x4e\x18\x06\x23\x9e\xd4"
+ "\xa5\x20\x98\xef\xb5\xda\xe5\xc0"
+ "\x8a\x6a\x83\x77\x15\x84\x1e\xae"
+ "\x78\x94\x9d\xdf\xb7\xd1\xea\x67"
+ "\xaa\xb0\x14\x15\xfa\x67\x21\x84"
+ "\xd3\x41\x2a\xce\xba\x4b\x4a\xe8"
+ "\x95\x62\xa9\x55\xf0\x80\xad\xbd"
+ "\xab\xaf\xdd\x4f\xa5\x7c\x13\x36"
+ "\xed\x5e\x4f\x72\xad\x4b\xf1\xd0"
+ "\x88\x4e\xec\x2c\x88\x10\x5e\xea"
+ "\x12\xc0\x16\x01\x29\xa3\xa0\x55"
+ "\xaa\x68\xf3\xe9\x9d\x3b\x0d\x3b"
+ "\x6d\xec\xf8\xa0\x2d\xf0\x90\x8d"
+ "\x1c\xe2\x88\xd4\x24\x71\xf9\xb3"
+ "\xc1\x9f\xc5\xd6\x76\x70\xc5\x2e"
+ "\x9c\xac\xdb\x90\xbd\x83\x72\xba"
+ "\x6e\xb5\xa5\x53\x83\xa9\xa5\xbf"
+ "\x7d\x06\x0e\x3c\x2a\xd2\x04\xb5"
+ "\x1e\x19\x38\x09\x16\xd2\x82\x1f"
+ "\x75\x18\x56\xb8\x96\x0b\xa6\xf9"
+ "\xcf\x62\xd9\x32\x5d\xa9\xd7\x1d"
+ "\xec\xe4\xdf\x1b\xbe\xf1\x36\xee"
+ "\xe3\x7b\xb5\x2f\xee\xf8\x53\x3d"
+ "\x6a\xb7\x70\xa9\xfc\x9c\x57\x25"
+ "\xf2\x89\x10\xd3\xb8\xa8\x8c\x30"
+ "\xae\x23\x4f\x0e\x13\x66\x4f\xe1"
+ "\xb6\xc0\xe4\xf8\xef\x93\xbd\x6e"
+ "\x15\x85\x6b\xe3\x60\x81\x1d\x68"
+ "\xd7\x31\x87\x89\x09\xab\xd5\x96"
+ "\x1d\xf3\x6d\x67\x80\xca\x07\x31"
+ "\x5d\xa7\xe4\xfb\x3e\xf2\x9b\x33"
+ "\x52\x18\xc8\x30\xfe\x2d\xca\x1e"
+ "\x79\x92\x7a\x60\x5c\xb6\x58\x87"
+ "\xa4\x36\xa2\x67\x92\x8b\xa4\xb7"
+ "\xf1\x86\xdf\xdc\xc0\x7e\x8f\x63"
+ "\xd2\xa2\xdc\x78\xeb\x4f\xd8\x96"
+ "\x47\xca\xb8\x91\xf9\xf7\x94\x21"
+ "\x5f\x9a\x9f\x5b\xb8\x40\x41\x4b"
+ "\x66\x69\x6a\x72\xd0\xcb\x70\xb7"
+ "\x93\xb5\x37\x96\x05\x37\x4f\xe5"
+ "\x8c\xa7\x5a\x4e\x8b\xb7\x84\xea"
+ "\xc7\xfc\x19\x6e\x1f\x5a\xa1\xac"
+ "\x18\x7d\x52\x3b\xb3\x34\x62\x99"
+ "\xe4\x9e\x31\x04\x3f\xc0\x8d\x84"
+ "\x17\x7c\x25\x48\x52\x67\x11\x27"
+ "\x67\xbb\x5a\x85\xca\x56\xb2\x5c"
+ "\xe6\xec\xd5\x96\x3d\x15\xfc\xfb"
+ "\x22\x25\xf4\x13\xe5\x93\x4b\x9a"
+ "\x77\xf1\x52\x18\xfa\x16\x5e\x49"
+ "\x03\x45\xa8\x08\xfa\xb3\x41\x92"
+ "\x79\x50\x33\xca\xd0\xd7\x42\x55"
+ "\xc3\x9a\x0c\x4e\xd9\xa4\x3c\x86"
+ "\x80\x9f\x53\xd1\xa4\x2e\xd1\xbc"
+ "\xf1\x54\x6e\x93\xa4\x65\x99\x8e"
+ "\xdf\x29\xc0\x64\x63\x07\xbb\xea",
+ .result = "\xe0\x33\xf6\xe0\xb4\xa5\xdd\x2b"
+ "\xdd\xce\xfc\x12\x1e\xfc\x2d\xf2"
+ "\x8b\xc7\xeb\xc1\xc4\x2a\xe8\x44"
+ "\x0f\x3d\x97\x19\x2e\x6d\xa2\x38"
+ "\x9d\xa6\xaa\xe1\x96\xb9\x08\xe8"
+ "\x0b\x70\x48\x5c\xed\xb5\x9b\xcb"
+ "\x8b\x40\x88\x7e\x69\x73\xf7\x16"
+ "\x71\xbb\x5b\xfc\xa3\x47\x5d\xa6"
+ "\xae\x3a\x64\xc4\xe7\xb8\xa8\xe7"
+ "\xb1\x32\x19\xdb\xe3\x01\xb8\xf0"
+ "\xa4\x86\xb4\x4c\xc2\xde\x5c\xd2"
+ "\x6c\x77\xd2\xe8\x18\xb7\x0a\xc9"
+ "\x3d\x53\xb5\xc4\x5c\xf0\x8c\x06"
+ "\xdc\x90\xe0\x74\x47\x1b\x0b\xf6"
+ "\xd2\x71\x6b\xc4\xf1\x97\x00\x2d"
+ "\x63\x57\x44\x1f\x8c\xf4\xe6\x9b"
+ "\xe0\x7a\xdd\xec\x32\x73\x42\x32"
+ "\x7f\x35\x67\x60\x0d\xcf\x10\x52"
+ "\x61\x22\x53\x8d\x8e\xbb\x33\x76"
+ "\x59\xd9\x10\xce\xdf\xef\xc0\x41"
+ "\xd5\x33\x29\x6a\xda\x46\xa4\x51"
+ "\xf0\x99\x3d\x96\x31\xdd\xb5\xcb"
+ "\x3e\x2a\x1f\xc7\x5c\x79\xd3\xc5"
+ "\x20\xa1\xb1\x39\x1b\xc6\x0a\x70"
+ "\x26\x39\x95\x07\xad\x7a\xc9\x69"
+ "\xfe\x81\xc7\x88\x08\x38\xaf\xad"
+ "\x9e\x8d\xfb\xe8\x24\x0d\x22\xb8"
+ "\x0e\xed\xbe\x37\x53\x7c\xa6\xc6"
+ "\x78\x62\xec\xa3\x59\xd9\xc6\x9d"
+ "\xb8\x0e\x69\x77\x84\x2d\x6a\x4c"
+ "\xc5\xd9\xb2\xa0\x2b\xa8\x80\xcc"
+ "\xe9\x1e\x9c\x5a\xc4\xa1\xb2\x37"
+ "\x06\x9b\x30\x32\x67\xf7\xe7\xd2"
+ "\x42\xc7\xdf\x4e\xd4\xcb\xa0\x12"
+ "\x94\xa1\x34\x85\x93\x50\x4b\x0a"
+ "\x3c\x7d\x49\x25\x01\x41\x6b\x96"
+ "\xa9\x12\xbb\x0b\xc0\xd7\xd0\x93"
+ "\x1f\x70\x38\xb8\x21\xee\xf6\xa7"
+ "\xee\xeb\xe7\x81\xa4\x13\xb4\x87"
+ "\xfa\xc1\xb0\xb5\x37\x8b\x74\xa2"
+ "\x4e\xc7\xc2\xad\x3d\x62\x3f\xf8"
+ "\x34\x42\xe5\xae\x45\x13\x63\xfe"
+ "\xfc\x2a\x17\x46\x61\xa9\xd3\x1c"
+ "\x4c\xaf\xf0\x09\x62\x26\x66\x1e"
+ "\x74\xcf\xd6\x68\x3d\x7d\xd8\xb7"
+ "\xe7\xe6\xf8\xf0\x08\x20\xf7\x47"
+ "\x1c\x52\xaa\x0f\x3e\x21\xa3\xf2"
+ "\xbf\x2f\x95\x16\xa8\xc8\xc8\x8c"
+ "\x99\x0f\x5d\xfb\xfa\x2b\x58\x8a"
+ "\x7e\xd6\x74\x02\x60\xf0\xd0\x5b"
+ "\x65\xa8\xac\xea\x8d\x68\x46\x34"
+ "\x26\x9d\x4f\xb1\x9a\x8e\xc0\x1a"
+ "\xf1\xed\xc6\x7a\x83\xfd\x8a\x57"
+ "\xf2\xe6\xe4\xba\xfc\xc6\x3c\xad"
+ "\x5b\x19\x50\x2f\x3a\xcc\x06\x46"
+ "\x04\x51\x3f\x91\x97\xf0\xd2\x07"
+ "\xe7\x93\x89\x7e\xb5\x32\x0f\x03"
+ "\xe5\x58\x9e\x74\x72\xeb\xc2\x38"
+ "\x00\x0c\x91\x72\x69\xed\x7d\x6d"
+ "\xc8\x71\xf0\xec\xff\x80\xd9\x1c"
+ "\x9e\xd2\xfa\x15\xfc\x6c\x4e\xbc"
+ "\xb1\xa6\xbd\xbd\x70\x40\xca\x20"
+ "\xb8\x78\xd2\xa3\xc6\xf3\x79\x9c"
+ "\xc7\x27\xe1\x6a\x29\xad\xa4\x03",
+ .ilen = 512,
+ .rlen = 512,
+ }
+};
+
+static struct cipher_testvec adiantum_xchacha20_aes_dec_tv_template[] = {
+ {
+ .key = "\x9e\xeb\xb2\x49\x3c\x1c\xf5\xf4"
+ "\x6a\x99\xc2\xc4\xdf\xb1\xf4\xdd"
+ "\x75\x20\x57\xea\x2c\x4f\xcd\xb2"
+ "\xa5\x3d\x7b\x49\x1e\xab\xfd\x0f",
+ .klen = 32,
+ .iv = "\xdf\x63\xd4\xab\xd2\x49\xf3\xd8"
+ "\x33\x81\x37\x60\x7d\xfa\x73\x08"
+ "\xd8\x49\x6d\x80\xe8\x2f\x62\x54"
+ "\xeb\x0e\xa9\x39\x5b\x45\x7f\x8a",
+ .result = "\x67\xc9\xf2\x30\x84\x41\x8e\x43"
+ "\xfb\xf3\xb3\x3e\x79\x36\x7f\xe8",
+ .input = "\xf6\x78\x97\xd6\xaa\x94\x01\x27"
+ "\x2e\x4d\x83\xe0\x6e\x64\x9a\xdf",
+ .ilen = 16,
+ .rlen = 16,
+ .also_non_np = 1,
+ .np = 3,
+ .tap = { 5, 2, 9 },
+ }, {
+ .key = "\x36\x2b\x57\x97\xf8\x5d\xcd\x99"
+ "\x5f\x1a\x5a\x44\x1d\x92\x0f\x27"
+ "\xcc\x16\xd7\x2b\x85\x63\x99\xd3"
+ "\xba\x96\xa1\xdb\xd2\x60\x68\xda",
+ .klen = 32,
+ .iv = "\xef\x58\x69\xb1\x2c\x5e\x9a\x47"
+ "\x24\xc1\xb1\x69\xe1\x12\x93\x8f"
+ "\x43\x3d\x6d\x00\xdb\x5e\xd8\xd9"
+ "\x12\x9a\xfe\xd9\xff\x2d\xaa\xc4",
+ .result = "\x5e\xa8\x68\x19\x85\x98\x12\x23"
+ "\x26\x0a\xcc\xdb\x0a\x04\xb9\xdf"
+ "\x4d\xb3\x48\x7b\xb0\xe3\xc8\x19"
+ "\x43\x5a\x46\x06\x94\x2d\xf2",
+ .input = "\x4b\xb8\x90\x10\xdf\x7f\x64\x08"
+ "\x0e\x14\x42\x5f\x00\x74\x09\x36"
+ "\x57\x72\xb5\xfd\xb5\x5d\xb8\x28"
+ "\x0c\x04\x91\x14\x91\xe9\x37",
+ .ilen = 31,
+ .rlen = 31,
+ .also_non_np = 1,
+ .np = 2,
+ .tap = { 16, 15 },
+ }, {
+ .key = "\xa5\x28\x24\x34\x1a\x3c\xd8\xf7"
+ "\x05\x91\x8f\xee\x85\x1f\x35\x7f"
+ "\x80\x3d\xfc\x9b\x94\xf6\xfc\x9e"
+ "\x19\x09\x00\xa9\x04\x31\x4f\x11",
+ .klen = 32,
+ .iv = "\xa1\xba\x49\x95\xff\x34\x6d\xb8"
+ "\xcd\x87\x5d\x5e\xfd\xea\x85\xdb"
+ "\x8a\x7b\x5e\xb2\x5d\x57\xdd\x62"
+ "\xac\xa9\x8c\x41\x42\x94\x75\xb7",
+ .result = "\x69\xb4\xe8\x8c\x37\xe8\x67\x82"
+ "\xf1\xec\x5d\x04\xe5\x14\x91\x13"
+ "\xdf\xf2\x87\x1b\x69\x81\x1d\x71"
+ "\x70\x9e\x9c\x3b\xde\x49\x70\x11"
+ "\xa0\xa3\xdb\x0d\x54\x4f\x66\x69"
+ "\xd7\xdb\x80\xa7\x70\x92\x68\xce"
+ "\x81\x04\x2c\xc6\xab\xae\xe5\x60"
+ "\x15\xe9\x6f\xef\xaa\x8f\xa7\xa7"
+ "\x63\x8f\xf2\xf0\x77\xf1\xa8\xea"
+ "\xe1\xb7\x1f\x9e\xab\x9e\x4b\x3f"
+ "\x07\x87\x5b\x6f\xcd\xa8\xaf\xb9"
+ "\xfa\x70\x0b\x52\xb8\xa8\xa7\x9e"
+ "\x07\x5f\xa6\x0e\xb3\x9b\x79\x13"
+ "\x79\xc3\x3e\x8d\x1c\x2c\x68\xc8"
+ "\x51\x1d\x3c\x7b\x7d\x79\x77\x2a"
+ "\x56\x65\xc5\x54\x23\x28\xb0\x03",
+ .input = "\xb1\x8b\xa0\x05\x77\xa8\x4d\x59"
+ "\x1b\x8e\x21\xfc\x3a\x49\xfa\xd4"
+ "\xeb\x36\xf3\xc4\xdf\xdc\xae\x67"
+ "\x07\x3f\x70\x0e\xe9\x66\xf5\x0c"
+ "\x30\x4d\x66\xc9\xa4\x2f\x73\x9c"
+ "\x13\xc8\x49\x44\xcc\x0a\x90\x9d"
+ "\x7c\xdd\x19\x3f\xea\x72\x8d\x58"
+ "\xab\xe7\x09\x2c\xec\xb5\x44\xd2"
+ "\xca\xa6\x2d\x7a\x5c\x9c\x2b\x15"
+ "\xec\x2a\xa6\x69\x91\xf9\xf3\x13"
+ "\xf7\x72\xc1\xc1\x40\xd5\xe1\x94"
+ "\xf4\x29\xa1\x3e\x25\x02\xa8\x3e"
+ "\x94\xc1\x91\x14\xa1\x14\xcb\xbe"
+ "\x67\x4c\xb9\x38\xfe\xa7\xaa\x32"
+ "\x29\x62\x0d\xb2\xf6\x3c\x58\x57"
+ "\xc1\xd5\x5a\xbb\xd6\xa6\x2a\xe5",
+ .ilen = 128,
+ .rlen = 128,
+ .also_non_np = 1,
+ .np = 4,
+ .tap = { 112, 7, 8, 1 },
+ }, {
+ .key = "\xd3\x81\x72\x18\x23\xff\x6f\x4a"
+ "\x25\x74\x29\x0d\x51\x8a\x0e\x13"
+ "\xc1\x53\x5d\x30\x8d\xee\x75\x0d"
+ "\x14\xd6\x69\xc9\x15\xa9\x0c\x60",
+ .klen = 32,
+ .iv = "\x65\x9b\xd4\xa8\x7d\x29\x1d\xf4"
+ "\xc4\xd6\x9b\x6a\x28\xab\x64\xe2"
+ "\x62\x81\x97\xc5\x81\xaa\xf9\x44"
+ "\xc1\x72\x59\x82\xaf\x16\xc8\x2c",
+ .result = "\xc7\x6b\x52\x6a\x10\xf0\xcc\x09"
+ "\xc1\x12\x1d\x6d\x21\xa6\x78\xf5"
+ "\x05\xa3\x69\x60\x91\x36\x98\x57"
+ "\xba\x0c\x14\xcc\xf3\x2d\x73\x03"
+ "\xc6\xb2\x5f\xc8\x16\x27\x37\x5d"
+ "\xd0\x0b\x87\xb2\x50\x94\x7b\x58"
+ "\x04\xf4\xe0\x7f\x6e\x57\x8e\xc9"
+ "\x41\x84\xc1\xb1\x7e\x4b\x91\x12"
+ "\x3a\x8b\x5d\x50\x82\x7b\xcb\xd9"
+ "\x9a\xd9\x4e\x18\x06\x23\x9e\xd4"
+ "\xa5\x20\x98\xef\xb5\xda\xe5\xc0"
+ "\x8a\x6a\x83\x77\x15\x84\x1e\xae"
+ "\x78\x94\x9d\xdf\xb7\xd1\xea\x67"
+ "\xaa\xb0\x14\x15\xfa\x67\x21\x84"
+ "\xd3\x41\x2a\xce\xba\x4b\x4a\xe8"
+ "\x95\x62\xa9\x55\xf0\x80\xad\xbd"
+ "\xab\xaf\xdd\x4f\xa5\x7c\x13\x36"
+ "\xed\x5e\x4f\x72\xad\x4b\xf1\xd0"
+ "\x88\x4e\xec\x2c\x88\x10\x5e\xea"
+ "\x12\xc0\x16\x01\x29\xa3\xa0\x55"
+ "\xaa\x68\xf3\xe9\x9d\x3b\x0d\x3b"
+ "\x6d\xec\xf8\xa0\x2d\xf0\x90\x8d"
+ "\x1c\xe2\x88\xd4\x24\x71\xf9\xb3"
+ "\xc1\x9f\xc5\xd6\x76\x70\xc5\x2e"
+ "\x9c\xac\xdb\x90\xbd\x83\x72\xba"
+ "\x6e\xb5\xa5\x53\x83\xa9\xa5\xbf"
+ "\x7d\x06\x0e\x3c\x2a\xd2\x04\xb5"
+ "\x1e\x19\x38\x09\x16\xd2\x82\x1f"
+ "\x75\x18\x56\xb8\x96\x0b\xa6\xf9"
+ "\xcf\x62\xd9\x32\x5d\xa9\xd7\x1d"
+ "\xec\xe4\xdf\x1b\xbe\xf1\x36\xee"
+ "\xe3\x7b\xb5\x2f\xee\xf8\x53\x3d"
+ "\x6a\xb7\x70\xa9\xfc\x9c\x57\x25"
+ "\xf2\x89\x10\xd3\xb8\xa8\x8c\x30"
+ "\xae\x23\x4f\x0e\x13\x66\x4f\xe1"
+ "\xb6\xc0\xe4\xf8\xef\x93\xbd\x6e"
+ "\x15\x85\x6b\xe3\x60\x81\x1d\x68"
+ "\xd7\x31\x87\x89\x09\xab\xd5\x96"
+ "\x1d\xf3\x6d\x67\x80\xca\x07\x31"
+ "\x5d\xa7\xe4\xfb\x3e\xf2\x9b\x33"
+ "\x52\x18\xc8\x30\xfe\x2d\xca\x1e"
+ "\x79\x92\x7a\x60\x5c\xb6\x58\x87"
+ "\xa4\x36\xa2\x67\x92\x8b\xa4\xb7"
+ "\xf1\x86\xdf\xdc\xc0\x7e\x8f\x63"
+ "\xd2\xa2\xdc\x78\xeb\x4f\xd8\x96"
+ "\x47\xca\xb8\x91\xf9\xf7\x94\x21"
+ "\x5f\x9a\x9f\x5b\xb8\x40\x41\x4b"
+ "\x66\x69\x6a\x72\xd0\xcb\x70\xb7"
+ "\x93\xb5\x37\x96\x05\x37\x4f\xe5"
+ "\x8c\xa7\x5a\x4e\x8b\xb7\x84\xea"
+ "\xc7\xfc\x19\x6e\x1f\x5a\xa1\xac"
+ "\x18\x7d\x52\x3b\xb3\x34\x62\x99"
+ "\xe4\x9e\x31\x04\x3f\xc0\x8d\x84"
+ "\x17\x7c\x25\x48\x52\x67\x11\x27"
+ "\x67\xbb\x5a\x85\xca\x56\xb2\x5c"
+ "\xe6\xec\xd5\x96\x3d\x15\xfc\xfb"
+ "\x22\x25\xf4\x13\xe5\x93\x4b\x9a"
+ "\x77\xf1\x52\x18\xfa\x16\x5e\x49"
+ "\x03\x45\xa8\x08\xfa\xb3\x41\x92"
+ "\x79\x50\x33\xca\xd0\xd7\x42\x55"
+ "\xc3\x9a\x0c\x4e\xd9\xa4\x3c\x86"
+ "\x80\x9f\x53\xd1\xa4\x2e\xd1\xbc"
+ "\xf1\x54\x6e\x93\xa4\x65\x99\x8e"
+ "\xdf\x29\xc0\x64\x63\x07\xbb\xea",
+ .input = "\xe0\x33\xf6\xe0\xb4\xa5\xdd\x2b"
+ "\xdd\xce\xfc\x12\x1e\xfc\x2d\xf2"
+ "\x8b\xc7\xeb\xc1\xc4\x2a\xe8\x44"
+ "\x0f\x3d\x97\x19\x2e\x6d\xa2\x38"
+ "\x9d\xa6\xaa\xe1\x96\xb9\x08\xe8"
+ "\x0b\x70\x48\x5c\xed\xb5\x9b\xcb"
+ "\x8b\x40\x88\x7e\x69\x73\xf7\x16"
+ "\x71\xbb\x5b\xfc\xa3\x47\x5d\xa6"
+ "\xae\x3a\x64\xc4\xe7\xb8\xa8\xe7"
+ "\xb1\x32\x19\xdb\xe3\x01\xb8\xf0"
+ "\xa4\x86\xb4\x4c\xc2\xde\x5c\xd2"
+ "\x6c\x77\xd2\xe8\x18\xb7\x0a\xc9"
+ "\x3d\x53\xb5\xc4\x5c\xf0\x8c\x06"
+ "\xdc\x90\xe0\x74\x47\x1b\x0b\xf6"
+ "\xd2\x71\x6b\xc4\xf1\x97\x00\x2d"
+ "\x63\x57\x44\x1f\x8c\xf4\xe6\x9b"
+ "\xe0\x7a\xdd\xec\x32\x73\x42\x32"
+ "\x7f\x35\x67\x60\x0d\xcf\x10\x52"
+ "\x61\x22\x53\x8d\x8e\xbb\x33\x76"
+ "\x59\xd9\x10\xce\xdf\xef\xc0\x41"
+ "\xd5\x33\x29\x6a\xda\x46\xa4\x51"
+ "\xf0\x99\x3d\x96\x31\xdd\xb5\xcb"
+ "\x3e\x2a\x1f\xc7\x5c\x79\xd3\xc5"
+ "\x20\xa1\xb1\x39\x1b\xc6\x0a\x70"
+ "\x26\x39\x95\x07\xad\x7a\xc9\x69"
+ "\xfe\x81\xc7\x88\x08\x38\xaf\xad"
+ "\x9e\x8d\xfb\xe8\x24\x0d\x22\xb8"
+ "\x0e\xed\xbe\x37\x53\x7c\xa6\xc6"
+ "\x78\x62\xec\xa3\x59\xd9\xc6\x9d"
+ "\xb8\x0e\x69\x77\x84\x2d\x6a\x4c"
+ "\xc5\xd9\xb2\xa0\x2b\xa8\x80\xcc"
+ "\xe9\x1e\x9c\x5a\xc4\xa1\xb2\x37"
+ "\x06\x9b\x30\x32\x67\xf7\xe7\xd2"
+ "\x42\xc7\xdf\x4e\xd4\xcb\xa0\x12"
+ "\x94\xa1\x34\x85\x93\x50\x4b\x0a"
+ "\x3c\x7d\x49\x25\x01\x41\x6b\x96"
+ "\xa9\x12\xbb\x0b\xc0\xd7\xd0\x93"
+ "\x1f\x70\x38\xb8\x21\xee\xf6\xa7"
+ "\xee\xeb\xe7\x81\xa4\x13\xb4\x87"
+ "\xfa\xc1\xb0\xb5\x37\x8b\x74\xa2"
+ "\x4e\xc7\xc2\xad\x3d\x62\x3f\xf8"
+ "\x34\x42\xe5\xae\x45\x13\x63\xfe"
+ "\xfc\x2a\x17\x46\x61\xa9\xd3\x1c"
+ "\x4c\xaf\xf0\x09\x62\x26\x66\x1e"
+ "\x74\xcf\xd6\x68\x3d\x7d\xd8\xb7"
+ "\xe7\xe6\xf8\xf0\x08\x20\xf7\x47"
+ "\x1c\x52\xaa\x0f\x3e\x21\xa3\xf2"
+ "\xbf\x2f\x95\x16\xa8\xc8\xc8\x8c"
+ "\x99\x0f\x5d\xfb\xfa\x2b\x58\x8a"
+ "\x7e\xd6\x74\x02\x60\xf0\xd0\x5b"
+ "\x65\xa8\xac\xea\x8d\x68\x46\x34"
+ "\x26\x9d\x4f\xb1\x9a\x8e\xc0\x1a"
+ "\xf1\xed\xc6\x7a\x83\xfd\x8a\x57"
+ "\xf2\xe6\xe4\xba\xfc\xc6\x3c\xad"
+ "\x5b\x19\x50\x2f\x3a\xcc\x06\x46"
+ "\x04\x51\x3f\x91\x97\xf0\xd2\x07"
+ "\xe7\x93\x89\x7e\xb5\x32\x0f\x03"
+ "\xe5\x58\x9e\x74\x72\xeb\xc2\x38"
+ "\x00\x0c\x91\x72\x69\xed\x7d\x6d"
+ "\xc8\x71\xf0\xec\xff\x80\xd9\x1c"
+ "\x9e\xd2\xfa\x15\xfc\x6c\x4e\xbc"
+ "\xb1\xa6\xbd\xbd\x70\x40\xca\x20"
+ "\xb8\x78\xd2\xa3\xc6\xf3\x79\x9c"
+ "\xc7\x27\xe1\x6a\x29\xad\xa4\x03",
+ .ilen = 512,
+ .rlen = 512,
+ }
+};
+
/*
* CTS (Cipher Text Stealing) mode tests
*/
@@ -34545,4 +37878,78 @@ static struct comp_testvec lz4hc_decomp_tv_template[] = {
},
};
+#define ZSTD_COMP_TEST_VECTORS 2
+#define ZSTD_DECOMP_TEST_VECTORS 2
+
+static struct comp_testvec zstd_comp_tv_template[] = {
+ {
+ .inlen = 68,
+ .outlen = 39,
+ .input = "The algorithm is zstd. "
+ "The algorithm is zstd. "
+ "The algorithm is zstd.",
+ .output = "\x28\xb5\x2f\xfd\x00\x50\xf5\x00\x00\xb8\x54\x68\x65"
+ "\x20\x61\x6c\x67\x6f\x72\x69\x74\x68\x6d\x20\x69\x73"
+ "\x20\x7a\x73\x74\x64\x2e\x20\x01\x00\x55\x73\x36\x01"
+ ,
+ },
+ {
+ .inlen = 244,
+ .outlen = 151,
+ .input = "zstd, short for Zstandard, is a fast lossless "
+ "compression algorithm, targeting real-time "
+ "compression scenarios at zlib-level and better "
+ "compression ratios. The zstd compression library "
+ "provides in-memory compression and decompression "
+ "functions.",
+ .output = "\x28\xb5\x2f\xfd\x00\x50\x75\x04\x00\x42\x4b\x1e\x17"
+ "\x90\x81\x31\x00\xf2\x2f\xe4\x36\xc9\xef\x92\x88\x32"
+ "\xc9\xf2\x24\x94\xd8\x68\x9a\x0f\x00\x0c\xc4\x31\x6f"
+ "\x0d\x0c\x38\xac\x5c\x48\x03\xcd\x63\x67\xc0\xf3\xad"
+ "\x4e\x90\xaa\x78\xa0\xa4\xc5\x99\xda\x2f\xb6\x24\x60"
+ "\xe2\x79\x4b\xaa\xb6\x6b\x85\x0b\xc9\xc6\x04\x66\x86"
+ "\xe2\xcc\xe2\x25\x3f\x4f\x09\xcd\xb8\x9d\xdb\xc1\x90"
+ "\xa9\x11\xbc\x35\x44\x69\x2d\x9c\x64\x4f\x13\x31\x64"
+ "\xcc\xfb\x4d\x95\x93\x86\x7f\x33\x7f\x1a\xef\xe9\x30"
+ "\xf9\x67\xa1\x94\x0a\x69\x0f\x60\xcd\xc3\xab\x99\xdc"
+ "\x42\xed\x97\x05\x00\x33\xc3\x15\x95\x3a\x06\xa0\x0e"
+ "\x20\xa9\x0e\x82\xb9\x43\x45\x01",
+ },
+};
+
+static struct comp_testvec zstd_decomp_tv_template[] = {
+ {
+ .inlen = 43,
+ .outlen = 68,
+ .input = "\x28\xb5\x2f\xfd\x04\x50\xf5\x00\x00\xb8\x54\x68\x65"
+ "\x20\x61\x6c\x67\x6f\x72\x69\x74\x68\x6d\x20\x69\x73"
+ "\x20\x7a\x73\x74\x64\x2e\x20\x01\x00\x55\x73\x36\x01"
+ "\x6b\xf4\x13\x35",
+ .output = "The algorithm is zstd. "
+ "The algorithm is zstd. "
+ "The algorithm is zstd.",
+ },
+ {
+ .inlen = 155,
+ .outlen = 244,
+ .input = "\x28\xb5\x2f\xfd\x04\x50\x75\x04\x00\x42\x4b\x1e\x17"
+ "\x90\x81\x31\x00\xf2\x2f\xe4\x36\xc9\xef\x92\x88\x32"
+ "\xc9\xf2\x24\x94\xd8\x68\x9a\x0f\x00\x0c\xc4\x31\x6f"
+ "\x0d\x0c\x38\xac\x5c\x48\x03\xcd\x63\x67\xc0\xf3\xad"
+ "\x4e\x90\xaa\x78\xa0\xa4\xc5\x99\xda\x2f\xb6\x24\x60"
+ "\xe2\x79\x4b\xaa\xb6\x6b\x85\x0b\xc9\xc6\x04\x66\x86"
+ "\xe2\xcc\xe2\x25\x3f\x4f\x09\xcd\xb8\x9d\xdb\xc1\x90"
+ "\xa9\x11\xbc\x35\x44\x69\x2d\x9c\x64\x4f\x13\x31\x64"
+ "\xcc\xfb\x4d\x95\x93\x86\x7f\x33\x7f\x1a\xef\xe9\x30"
+ "\xf9\x67\xa1\x94\x0a\x69\x0f\x60\xcd\xc3\xab\x99\xdc"
+ "\x42\xed\x97\x05\x00\x33\xc3\x15\x95\x3a\x06\xa0\x0e"
+ "\x20\xa9\x0e\x82\xb9\x43\x45\x01\xaa\x6d\xda\x0d",
+ .output = "zstd, short for Zstandard, is a fast lossless "
+ "compression algorithm, targeting real-time "
+ "compression scenarios at zlib-level and better "
+ "compression ratios. The zstd compression library "
+ "provides in-memory compression and decompression "
+ "functions.",
+ },
+};
#endif /* _CRYPTO_TESTMGR_H */
diff --git a/crypto/zstd.c b/crypto/zstd.c
new file mode 100644
index 000000000000..9bfd28f8cc77
--- /dev/null
+++ b/crypto/zstd.c
@@ -0,0 +1,209 @@
+/*
+ * Cryptographic API.
+ *
+ * Copyright (c) 2017-present, Facebook, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/vmalloc.h>
+#include <linux/zstd.h>
+
+
+#define ZSTD_DEF_LEVEL 3
+
+struct zstd_ctx {
+ ZSTD_CCtx *cctx;
+ ZSTD_DCtx *dctx;
+ void *cwksp;
+ void *dwksp;
+};
+
+static ZSTD_parameters zstd_params(void)
+{
+ return ZSTD_getParams(ZSTD_DEF_LEVEL, 0, 0);
+}
+
+static int zstd_comp_init(struct zstd_ctx *ctx)
+{
+ int ret = 0;
+ const ZSTD_parameters params = zstd_params();
+ const size_t wksp_size = ZSTD_CCtxWorkspaceBound(params.cParams);
+
+ ctx->cwksp = vzalloc(wksp_size);
+ if (!ctx->cwksp) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ctx->cctx = ZSTD_initCCtx(ctx->cwksp, wksp_size);
+ if (!ctx->cctx) {
+ ret = -EINVAL;
+ goto out_free;
+ }
+out:
+ return ret;
+out_free:
+ vfree(ctx->cwksp);
+ goto out;
+}
+
+static int zstd_decomp_init(struct zstd_ctx *ctx)
+{
+ int ret = 0;
+ const size_t wksp_size = ZSTD_DCtxWorkspaceBound();
+
+ ctx->dwksp = vzalloc(wksp_size);
+ if (!ctx->dwksp) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ctx->dctx = ZSTD_initDCtx(ctx->dwksp, wksp_size);
+ if (!ctx->dctx) {
+ ret = -EINVAL;
+ goto out_free;
+ }
+out:
+ return ret;
+out_free:
+ vfree(ctx->dwksp);
+ goto out;
+}
+
+static void zstd_comp_exit(struct zstd_ctx *ctx)
+{
+ vfree(ctx->cwksp);
+ ctx->cwksp = NULL;
+ ctx->cctx = NULL;
+}
+
+static void zstd_decomp_exit(struct zstd_ctx *ctx)
+{
+ vfree(ctx->dwksp);
+ ctx->dwksp = NULL;
+ ctx->dctx = NULL;
+}
+
+static int __zstd_init(void *ctx)
+{
+ int ret;
+
+ ret = zstd_comp_init(ctx);
+ if (ret)
+ return ret;
+ ret = zstd_decomp_init(ctx);
+ if (ret)
+ zstd_comp_exit(ctx);
+ return ret;
+}
+
+static int zstd_init(struct crypto_tfm *tfm)
+{
+ struct zstd_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ return __zstd_init(ctx);
+}
+
+static void __zstd_exit(void *ctx)
+{
+ zstd_comp_exit(ctx);
+ zstd_decomp_exit(ctx);
+}
+
+static void zstd_exit(struct crypto_tfm *tfm)
+{
+ struct zstd_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ __zstd_exit(ctx);
+}
+
+static int __zstd_compress(const u8 *src, unsigned int slen,
+ u8 *dst, unsigned int *dlen, void *ctx)
+{
+ size_t out_len;
+ struct zstd_ctx *zctx = ctx;
+ const ZSTD_parameters params = zstd_params();
+
+ out_len = ZSTD_compressCCtx(zctx->cctx, dst, *dlen, src, slen, params);
+ if (ZSTD_isError(out_len))
+ return -EINVAL;
+ *dlen = out_len;
+ return 0;
+}
+
+static int zstd_compress(struct crypto_tfm *tfm, const u8 *src,
+ unsigned int slen, u8 *dst, unsigned int *dlen)
+{
+ struct zstd_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ return __zstd_compress(src, slen, dst, dlen, ctx);
+}
+
+static int __zstd_decompress(const u8 *src, unsigned int slen,
+ u8 *dst, unsigned int *dlen, void *ctx)
+{
+ size_t out_len;
+ struct zstd_ctx *zctx = ctx;
+
+ out_len = ZSTD_decompressDCtx(zctx->dctx, dst, *dlen, src, slen);
+ if (ZSTD_isError(out_len))
+ return -EINVAL;
+ *dlen = out_len;
+ return 0;
+}
+
+static int zstd_decompress(struct crypto_tfm *tfm, const u8 *src,
+ unsigned int slen, u8 *dst, unsigned int *dlen)
+{
+ struct zstd_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ return __zstd_decompress(src, slen, dst, dlen, ctx);
+}
+
+static struct crypto_alg alg = {
+ .cra_name = "zstd",
+ .cra_flags = CRYPTO_ALG_TYPE_COMPRESS,
+ .cra_ctxsize = sizeof(struct zstd_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_init = zstd_init,
+ .cra_exit = zstd_exit,
+ .cra_u = { .compress = {
+ .coa_compress = zstd_compress,
+ .coa_decompress = zstd_decompress } }
+};
+
+static int __init zstd_mod_init(void)
+{
+ int ret;
+
+ ret = crypto_register_alg(&alg);
+ if (ret)
+ return ret;
+
+ return ret;
+}
+
+static void __exit zstd_mod_fini(void)
+{
+ crypto_unregister_alg(&alg);
+}
+
+module_init(zstd_mod_init);
+module_exit(zstd_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Zstd Compression Algorithm");
+MODULE_ALIAS_CRYPTO("zstd");
diff --git a/drivers/Kconfig b/drivers/Kconfig
index e1e2066cecdb..de581c13ec9a 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -202,4 +202,6 @@ source "drivers/hwtracing/intel_th/Kconfig"
source "drivers/fpga/Kconfig"
+source "drivers/tee/Kconfig"
+
endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 7c3d58dcf6b3..dc932be710e8 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -175,3 +175,4 @@ obj-$(CONFIG_STM) += hwtracing/stm/
obj-$(CONFIG_ANDROID) += android/
obj-$(CONFIG_NVMEM) += nvmem/
obj-$(CONFIG_FPGA) += fpga/
+obj-$(CONFIG_TEE) += tee/
diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c
index e19f530f1083..f7c4301f7fa3 100644
--- a/drivers/acpi/button.c
+++ b/drivers/acpi/button.c
@@ -556,7 +556,8 @@ static int acpi_button_remove(struct acpi_device *device)
return 0;
}
-static int param_set_lid_init_state(const char *val, struct kernel_param *kp)
+static int param_set_lid_init_state(const char *val,
+ const struct kernel_param *kp)
{
int result = 0;
@@ -574,7 +575,8 @@ static int param_set_lid_init_state(const char *val, struct kernel_param *kp)
return result;
}
-static int param_get_lid_init_state(char *buffer, struct kernel_param *kp)
+static int param_get_lid_init_state(char *buffer,
+ const struct kernel_param *kp)
{
switch (lid_init_state) {
case ACPI_BUTTON_LID_INIT_OPEN:
diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index 307b3e28f34c..7b665aaa6a57 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -1891,7 +1891,8 @@ static const struct dev_pm_ops acpi_ec_pm = {
SET_SYSTEM_SLEEP_PM_OPS(acpi_ec_suspend, acpi_ec_resume)
};
-static int param_set_event_clearing(const char *val, struct kernel_param *kp)
+static int param_set_event_clearing(const char *val,
+ const struct kernel_param *kp)
{
int result = 0;
@@ -1909,7 +1910,8 @@ static int param_set_event_clearing(const char *val, struct kernel_param *kp)
return result;
}
-static int param_get_event_clearing(char *buffer, struct kernel_param *kp)
+static int param_get_event_clearing(char *buffer,
+ const struct kernel_param *kp)
{
switch (ec_event_clearing) {
case ACPI_EC_EVT_TIMING_STATUS:
diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c
index a36d0739dbfe..cf2a17bdccc1 100644
--- a/drivers/acpi/sysfs.c
+++ b/drivers/acpi/sysfs.c
@@ -227,7 +227,8 @@ module_param_cb(trace_method_name, &param_ops_trace_method, &trace_method_name,
module_param_cb(trace_debug_layer, &param_ops_trace_attrib, &acpi_gbl_trace_dbg_layer, 0644);
module_param_cb(trace_debug_level, &param_ops_trace_attrib, &acpi_gbl_trace_dbg_level, 0644);
-static int param_set_trace_state(const char *val, struct kernel_param *kp)
+static int param_set_trace_state(const char *val,
+ const struct kernel_param *kp)
{
acpi_status status;
const char *method = trace_method_name;
@@ -263,7 +264,7 @@ static int param_set_trace_state(const char *val, struct kernel_param *kp)
return 0;
}
-static int param_get_trace_state(char *buffer, struct kernel_param *kp)
+static int param_get_trace_state(char *buffer, const struct kernel_param *kp)
{
if (!(acpi_gbl_trace_flags & ACPI_TRACE_ENABLED))
return sprintf(buffer, "disable");
@@ -292,7 +293,8 @@ MODULE_PARM_DESC(aml_debug_output,
"To enable/disable the ACPI Debug Object output.");
/* /sys/module/acpi/parameters/acpica_version */
-static int param_get_acpica_version(char *buffer, struct kernel_param *kp)
+static int param_get_acpica_version(char *buffer,
+ const struct kernel_param *kp)
{
int result;
diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig
index bdfc6c6f4f5a..63ed9ceebf7b 100644
--- a/drivers/android/Kconfig
+++ b/drivers/android/Kconfig
@@ -9,7 +9,7 @@ if ANDROID
config ANDROID_BINDER_IPC
bool "Android Binder IPC Driver"
- depends on MMU
+ depends on MMU && !M68K
default n
---help---
Binder is used in Android for both communication between processes,
@@ -19,18 +19,27 @@ config ANDROID_BINDER_IPC
Android process, using Binder to identify, invoke and pass arguments
between said processes.
-config ANDROID_BINDER_IPC_32BIT
- bool
- depends on !64BIT && ANDROID_BINDER_IPC
- default y
+config ANDROID_BINDER_DEVICES
+ string "Android Binder devices"
+ depends on ANDROID_BINDER_IPC
+ default "binder,hwbinder,vndbinder"
---help---
- The Binder API has been changed to support both 32 and 64bit
- applications in a mixed environment.
+ Default value for the binder.devices parameter.
- Enable this to support an old 32-bit Android user-space (v4.4 and
- earlier).
+ The binder.devices parameter is a comma-separated list of strings
+ that specifies the names of the binder device nodes that will be
+ created. Each binder device has its own context manager, and is
+ therefore logically separated from the other devices.
- Note that enabling this will break newer Android user-space.
+config ANDROID_BINDER_IPC_SELFTEST
+ bool "Android Binder IPC Driver Selftest"
+ depends on ANDROID_BINDER_IPC
+ ---help---
+ This feature allows binder selftest to run.
+
+ Binder selftest checks the allocation and free of binder buffers
+ exhaustively with combinations of various buffer sizes and
+ alignments.
endif # if ANDROID
diff --git a/drivers/android/Makefile b/drivers/android/Makefile
index 3b7e4b072c58..a01254c43ee3 100644
--- a/drivers/android/Makefile
+++ b/drivers/android/Makefile
@@ -1,3 +1,4 @@
ccflags-y += -I$(src) # needed for trace events
-obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o
+obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o
+obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 80499f421a29..d578ea8bd388 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -15,6 +15,40 @@
*
*/
+/*
+ * Locking overview
+ *
+ * There are 3 main spinlocks which must be acquired in the
+ * order shown:
+ *
+ * 1) proc->outer_lock : protects binder_ref
+ * binder_proc_lock() and binder_proc_unlock() are
+ * used to acq/rel.
+ * 2) node->lock : protects most fields of binder_node.
+ * binder_node_lock() and binder_node_unlock() are
+ * used to acq/rel
+ * 3) proc->inner_lock : protects the thread and node lists
+ * (proc->threads, proc->waiting_threads, proc->nodes)
+ * and all todo lists associated with the binder_proc
+ * (proc->todo, thread->todo, proc->delivered_death and
+ * node->async_todo), as well as thread->transaction_stack
+ * binder_inner_proc_lock() and binder_inner_proc_unlock()
+ * are used to acq/rel
+ *
+ * Any lock under procA must never be nested under any lock at the same
+ * level or below on procB.
+ *
+ * Functions that require a lock held on entry indicate which lock
+ * in the suffix of the function name:
+ *
+ * foo_olocked() : requires node->outer_lock
+ * foo_nlocked() : requires node->lock
+ * foo_ilocked() : requires proc->inner_lock
+ * foo_oilocked(): requires proc->outer_lock and proc->inner_lock
+ * foo_nilocked(): requires node->lock and proc->inner_lock
+ * ...
+ */
+
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <asm/cacheflush.h>
@@ -24,7 +58,6 @@
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
-#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/nsproxy.h>
@@ -34,31 +67,27 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
#include <linux/pid_namespace.h>
#include <linux/security.h>
-
-#ifdef CONFIG_ANDROID_BINDER_IPC_32BIT
-#define BINDER_IPC_32BIT 1
-#endif
+#include <linux/spinlock.h>
#include <uapi/linux/android/binder.h>
+#include "binder_alloc.h"
#include "binder_trace.h"
-static DEFINE_MUTEX(binder_main_lock);
+static HLIST_HEAD(binder_deferred_list);
static DEFINE_MUTEX(binder_deferred_lock);
-static DEFINE_MUTEX(binder_mmap_lock);
+static HLIST_HEAD(binder_devices);
static HLIST_HEAD(binder_procs);
-static HLIST_HEAD(binder_deferred_list);
+static DEFINE_MUTEX(binder_procs_lock);
+
static HLIST_HEAD(binder_dead_nodes);
+static DEFINE_SPINLOCK(binder_dead_nodes_lock);
static struct dentry *binder_debugfs_dir_entry_root;
static struct dentry *binder_debugfs_dir_entry_proc;
-static struct binder_node *binder_context_mgr_node;
-static kuid_t binder_context_mgr_uid = INVALID_UID;
-static int binder_last_id;
+static atomic_t binder_last_id;
#define BINDER_DEBUG_ENTRY(name) \
static int binder_##name##_open(struct inode *inode, struct file *file) \
@@ -104,22 +133,21 @@ enum {
BINDER_DEBUG_TRANSACTION_COMPLETE = 1U << 10,
BINDER_DEBUG_FREE_BUFFER = 1U << 11,
BINDER_DEBUG_INTERNAL_REFS = 1U << 12,
- BINDER_DEBUG_BUFFER_ALLOC = 1U << 13,
- BINDER_DEBUG_PRIORITY_CAP = 1U << 14,
- BINDER_DEBUG_BUFFER_ALLOC_ASYNC = 1U << 15,
+ BINDER_DEBUG_PRIORITY_CAP = 1U << 13,
+ BINDER_DEBUG_SPINLOCKS = 1U << 14,
};
static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR |
BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION;
-module_param_named(debug_mask, binder_debug_mask, uint, S_IWUSR | S_IRUGO);
+module_param_named(debug_mask, binder_debug_mask, uint, 0644);
-static bool binder_debug_no_lock;
-module_param_named(proc_no_lock, binder_debug_no_lock, bool, S_IWUSR | S_IRUGO);
+static char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES;
+module_param_named(devices, binder_devices_param, charp, S_IRUGO);
static DECLARE_WAIT_QUEUE_HEAD(binder_user_error_wait);
static int binder_stop_on_user_error;
static int binder_set_stop_on_user_error(const char *val,
- struct kernel_param *kp)
+ const struct kernel_param *kp)
{
int ret;
@@ -129,7 +157,7 @@ static int binder_set_stop_on_user_error(const char *val,
return ret;
}
module_param_call(stop_on_user_error, binder_set_stop_on_user_error,
- param_get_int, &binder_stop_on_user_error, S_IWUSR | S_IRUGO);
+ param_get_int, &binder_stop_on_user_error, 0644);
#define binder_debug(mask, x...) \
do { \
@@ -145,6 +173,17 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error,
binder_stop_on_user_error = 2; \
} while (0)
+#define to_flat_binder_object(hdr) \
+ container_of(hdr, struct flat_binder_object, hdr)
+
+#define to_binder_fd_object(hdr) container_of(hdr, struct binder_fd_object, hdr)
+
+#define to_binder_buffer_object(hdr) \
+ container_of(hdr, struct binder_buffer_object, hdr)
+
+#define to_binder_fd_array_object(hdr) \
+ container_of(hdr, struct binder_fd_array_object, hdr)
+
enum binder_stat_types {
BINDER_STAT_PROC,
BINDER_STAT_THREAD,
@@ -157,26 +196,27 @@ enum binder_stat_types {
};
struct binder_stats {
- int br[_IOC_NR(BR_FAILED_REPLY) + 1];
- int bc[_IOC_NR(BC_DEAD_BINDER_DONE) + 1];
- int obj_created[BINDER_STAT_COUNT];
- int obj_deleted[BINDER_STAT_COUNT];
+ atomic_t br[_IOC_NR(BR_FAILED_REPLY) + 1];
+ atomic_t bc[_IOC_NR(BC_REPLY_SG) + 1];
+ atomic_t obj_created[BINDER_STAT_COUNT];
+ atomic_t obj_deleted[BINDER_STAT_COUNT];
};
static struct binder_stats binder_stats;
static inline void binder_stats_deleted(enum binder_stat_types type)
{
- binder_stats.obj_deleted[type]++;
+ atomic_inc(&binder_stats.obj_deleted[type]);
}
static inline void binder_stats_created(enum binder_stat_types type)
{
- binder_stats.obj_created[type]++;
+ atomic_inc(&binder_stats.obj_created[type]);
}
struct binder_transaction_log_entry {
int debug_id;
+ int debug_id_done;
int call_type;
int from_proc;
int from_thread;
@@ -186,10 +226,14 @@ struct binder_transaction_log_entry {
int to_node;
int data_size;
int offsets_size;
+ int return_error_line;
+ uint32_t return_error;
+ uint32_t return_error_param;
+ const char *context_name;
};
struct binder_transaction_log {
- int next;
- int full;
+ atomic_t cur;
+ bool full;
struct binder_transaction_log_entry entry[32];
};
static struct binder_transaction_log binder_transaction_log;
@@ -199,22 +243,50 @@ static struct binder_transaction_log_entry *binder_transaction_log_add(
struct binder_transaction_log *log)
{
struct binder_transaction_log_entry *e;
-
- e = &log->entry[log->next];
+ unsigned int cur = atomic_inc_return(&log->cur);
+
+ if (cur >= ARRAY_SIZE(log->entry))
+ log->full = true;
+ e = &log->entry[cur % ARRAY_SIZE(log->entry)];
+ WRITE_ONCE(e->debug_id_done, 0);
+ /*
+ * write-barrier to synchronize access to e->debug_id_done.
+ * We make sure the initialized 0 value is seen before
+ * memset() other fields are zeroed by memset.
+ */
+ smp_wmb();
memset(e, 0, sizeof(*e));
- log->next++;
- if (log->next == ARRAY_SIZE(log->entry)) {
- log->next = 0;
- log->full = 1;
- }
return e;
}
+struct binder_context {
+ struct binder_node *binder_context_mgr_node;
+ struct mutex context_mgr_node_lock;
+
+ kuid_t binder_context_mgr_uid;
+ const char *name;
+};
+
+struct binder_device {
+ struct hlist_node hlist;
+ struct miscdevice miscdev;
+ struct binder_context context;
+};
+
+/**
+ * struct binder_work - work enqueued on a worklist
+ * @entry: node enqueued on list
+ * @type: type of work to be performed
+ *
+ * There are separate work lists for proc, thread, and node (async).
+ */
struct binder_work {
struct list_head entry;
+
enum {
BINDER_WORK_TRANSACTION = 1,
BINDER_WORK_TRANSACTION_COMPLETE,
+ BINDER_WORK_RETURN_ERROR,
BINDER_WORK_NODE,
BINDER_WORK_DEAD_BINDER,
BINDER_WORK_DEAD_BINDER_AND_CLEAR,
@@ -222,8 +294,76 @@ struct binder_work {
} type;
};
+struct binder_error {
+ struct binder_work work;
+ uint32_t cmd;
+};
+
+/**
+ * struct binder_node - binder node bookkeeping
+ * @debug_id: unique ID for debugging
+ * (invariant after initialized)
+ * @lock: lock for node fields
+ * @work: worklist element for node work
+ * (protected by @proc->inner_lock)
+ * @rb_node: element for proc->nodes tree
+ * (protected by @proc->inner_lock)
+ * @dead_node: element for binder_dead_nodes list
+ * (protected by binder_dead_nodes_lock)
+ * @proc: binder_proc that owns this node
+ * (invariant after initialized)
+ * @refs: list of references on this node
+ * (protected by @lock)
+ * @internal_strong_refs: used to take strong references when
+ * initiating a transaction
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @local_weak_refs: weak user refs from local process
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @local_strong_refs: strong user refs from local process
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @tmp_refs: temporary kernel refs
+ * (protected by @proc->inner_lock while @proc
+ * is valid, and by binder_dead_nodes_lock
+ * if @proc is NULL. During inc/dec and node release
+ * it is also protected by @lock to provide safety
+ * as the node dies and @proc becomes NULL)
+ * @ptr: userspace pointer for node
+ * (invariant, no lock needed)
+ * @cookie: userspace cookie for node
+ * (invariant, no lock needed)
+ * @has_strong_ref: userspace notified of strong ref
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @pending_strong_ref: userspace has acked notification of strong ref
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @has_weak_ref: userspace notified of weak ref
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @pending_weak_ref: userspace has acked notification of weak ref
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @has_async_transaction: async transaction to node in progress
+ * (protected by @lock)
+ * @sched_policy: minimum scheduling policy for node
+ * (invariant after initialized)
+ * @accept_fds: file descriptor operations supported for node
+ * (invariant after initialized)
+ * @min_priority: minimum scheduling priority
+ * (invariant after initialized)
+ * @inherit_rt: inherit RT scheduling policy from caller
+ * (invariant after initialized)
+ * @async_todo: list of async work items
+ * (protected by @proc->inner_lock)
+ *
+ * Bookkeeping structure for binder nodes.
+ */
struct binder_node {
int debug_id;
+ spinlock_t lock;
struct binder_work work;
union {
struct rb_node rb_node;
@@ -234,98 +374,198 @@ struct binder_node {
int internal_strong_refs;
int local_weak_refs;
int local_strong_refs;
+ int tmp_refs;
binder_uintptr_t ptr;
binder_uintptr_t cookie;
- unsigned has_strong_ref:1;
- unsigned pending_strong_ref:1;
- unsigned has_weak_ref:1;
- unsigned pending_weak_ref:1;
- unsigned has_async_transaction:1;
- unsigned accept_fds:1;
- unsigned min_priority:8;
+ struct {
+ /*
+ * bitfield elements protected by
+ * proc inner_lock
+ */
+ u8 has_strong_ref:1;
+ u8 pending_strong_ref:1;
+ u8 has_weak_ref:1;
+ u8 pending_weak_ref:1;
+ };
+ struct {
+ /*
+ * invariant after initialization
+ */
+ u8 sched_policy:2;
+ u8 inherit_rt:1;
+ u8 accept_fds:1;
+ u8 min_priority;
+ };
+ bool has_async_transaction;
struct list_head async_todo;
};
struct binder_ref_death {
+ /**
+ * @work: worklist element for death notifications
+ * (protected by inner_lock of the proc that
+ * this ref belongs to)
+ */
struct binder_work work;
binder_uintptr_t cookie;
};
+/**
+ * struct binder_ref_data - binder_ref counts and id
+ * @debug_id: unique ID for the ref
+ * @desc: unique userspace handle for ref
+ * @strong: strong ref count (debugging only if not locked)
+ * @weak: weak ref count (debugging only if not locked)
+ *
+ * Structure to hold ref count and ref id information. Since
+ * the actual ref can only be accessed with a lock, this structure
+ * is used to return information about the ref to callers of
+ * ref inc/dec functions.
+ */
+struct binder_ref_data {
+ int debug_id;
+ uint32_t desc;
+ int strong;
+ int weak;
+};
+
+/**
+ * struct binder_ref - struct to track references on nodes
+ * @data: binder_ref_data containing id, handle, and current refcounts
+ * @rb_node_desc: node for lookup by @data.desc in proc's rb_tree
+ * @rb_node_node: node for lookup by @node in proc's rb_tree
+ * @node_entry: list entry for node->refs list in target node
+ * (protected by @node->lock)
+ * @proc: binder_proc containing ref
+ * @node: binder_node of target node. When cleaning up a
+ * ref for deletion in binder_cleanup_ref, a non-NULL
+ * @node indicates the node must be freed
+ * @death: pointer to death notification (ref_death) if requested
+ * (protected by @node->lock)
+ *
+ * Structure to track references from procA to target node (on procB). This
+ * structure is unsafe to access without holding @proc->outer_lock.
+ */
struct binder_ref {
/* Lookups needed: */
/* node + proc => ref (transaction) */
/* desc + proc => ref (transaction, inc/dec ref) */
/* node => refs + procs (proc exit) */
- int debug_id;
+ struct binder_ref_data data;
struct rb_node rb_node_desc;
struct rb_node rb_node_node;
struct hlist_node node_entry;
struct binder_proc *proc;
struct binder_node *node;
- uint32_t desc;
- int strong;
- int weak;
struct binder_ref_death *death;
};
-struct binder_buffer {
- struct list_head entry; /* free and allocated entries by address */
- struct rb_node rb_node; /* free entry by size or allocated entry */
- /* by address */
- unsigned free:1;
- unsigned allow_user_free:1;
- unsigned async_transaction:1;
- unsigned debug_id:29;
-
- struct binder_transaction *transaction;
-
- struct binder_node *target_node;
- size_t data_size;
- size_t offsets_size;
- uint8_t data[0];
-};
-
enum binder_deferred_state {
BINDER_DEFERRED_PUT_FILES = 0x01,
BINDER_DEFERRED_FLUSH = 0x02,
BINDER_DEFERRED_RELEASE = 0x04,
};
+/**
+ * struct binder_priority - scheduler policy and priority
+ * @sched_policy scheduler policy
+ * @prio [100..139] for SCHED_NORMAL, [0..99] for FIFO/RT
+ *
+ * The binder driver supports inheriting the following scheduler policies:
+ * SCHED_NORMAL
+ * SCHED_BATCH
+ * SCHED_FIFO
+ * SCHED_RR
+ */
+struct binder_priority {
+ unsigned int sched_policy;
+ int prio;
+};
+
+/**
+ * struct binder_proc - binder process bookkeeping
+ * @proc_node: element for binder_procs list
+ * @threads: rbtree of binder_threads in this proc
+ * (protected by @inner_lock)
+ * @nodes: rbtree of binder nodes associated with
+ * this proc ordered by node->ptr
+ * (protected by @inner_lock)
+ * @refs_by_desc: rbtree of refs ordered by ref->desc
+ * (protected by @outer_lock)
+ * @refs_by_node: rbtree of refs ordered by ref->node
+ * (protected by @outer_lock)
+ * @waiting_threads: threads currently waiting for proc work
+ * (protected by @inner_lock)
+ * @pid PID of group_leader of process
+ * (invariant after initialized)
+ * @tsk task_struct for group_leader of process
+ * (invariant after initialized)
+ * @files files_struct for process
+ * (protected by @files_lock)
+ * @files_lock mutex to protect @files
+ * @deferred_work_node: element for binder_deferred_list
+ * (protected by binder_deferred_lock)
+ * @deferred_work: bitmap of deferred work to perform
+ * (protected by binder_deferred_lock)
+ * @is_dead: process is dead and awaiting free
+ * when outstanding transactions are cleaned up
+ * (protected by @inner_lock)
+ * @todo: list of work for this process
+ * (protected by @inner_lock)
+ * @stats: per-process binder statistics
+ * (atomics, no lock needed)
+ * @delivered_death: list of delivered death notification
+ * (protected by @inner_lock)
+ * @max_threads: cap on number of binder threads
+ * (protected by @inner_lock)
+ * @requested_threads: number of binder threads requested but not
+ * yet started. In current implementation, can
+ * only be 0 or 1.
+ * (protected by @inner_lock)
+ * @requested_threads_started: number binder threads started
+ * (protected by @inner_lock)
+ * @tmp_ref: temporary reference to indicate proc is in use
+ * (protected by @inner_lock)
+ * @default_priority: default scheduler priority
+ * (invariant after initialized)
+ * @debugfs_entry: debugfs node
+ * @alloc: binder allocator bookkeeping
+ * @context: binder_context for this proc
+ * (invariant after initialized)
+ * @inner_lock: can nest under outer_lock and/or node lock
+ * @outer_lock: no nesting under innor or node lock
+ * Lock order: 1) outer, 2) node, 3) inner
+ *
+ * Bookkeeping structure for binder processes
+ */
struct binder_proc {
struct hlist_node proc_node;
struct rb_root threads;
struct rb_root nodes;
struct rb_root refs_by_desc;
struct rb_root refs_by_node;
+ struct list_head waiting_threads;
int pid;
- struct vm_area_struct *vma;
- struct mm_struct *vma_vm_mm;
struct task_struct *tsk;
struct files_struct *files;
struct mutex files_lock;
struct hlist_node deferred_work_node;
int deferred_work;
- void *buffer;
- ptrdiff_t user_buffer_offset;
-
- struct list_head buffers;
- struct rb_root free_buffers;
- struct rb_root allocated_buffers;
- size_t free_async_space;
+ bool is_dead;
- struct page **pages;
- size_t buffer_size;
- uint32_t buffer_free;
struct list_head todo;
- wait_queue_head_t wait;
struct binder_stats stats;
struct list_head delivered_death;
int max_threads;
int requested_threads;
int requested_threads_started;
- int ready_threads;
- long default_priority;
+ int tmp_ref;
+ struct binder_priority default_priority;
struct dentry *debugfs_entry;
+ struct binder_alloc alloc;
+ struct binder_context *context;
+ spinlock_t inner_lock;
+ spinlock_t outer_lock;
};
enum {
@@ -334,22 +574,63 @@ enum {
BINDER_LOOPER_STATE_EXITED = 0x04,
BINDER_LOOPER_STATE_INVALID = 0x08,
BINDER_LOOPER_STATE_WAITING = 0x10,
- BINDER_LOOPER_STATE_NEED_RETURN = 0x20
+ BINDER_LOOPER_STATE_POLL = 0x20,
};
+/**
+ * struct binder_thread - binder thread bookkeeping
+ * @proc: binder process for this thread
+ * (invariant after initialization)
+ * @rb_node: element for proc->threads rbtree
+ * (protected by @proc->inner_lock)
+ * @waiting_thread_node: element for @proc->waiting_threads list
+ * (protected by @proc->inner_lock)
+ * @pid: PID for this thread
+ * (invariant after initialization)
+ * @looper: bitmap of looping state
+ * (only accessed by this thread)
+ * @looper_needs_return: looping thread needs to exit driver
+ * (no lock needed)
+ * @transaction_stack: stack of in-progress transactions for this thread
+ * (protected by @proc->inner_lock)
+ * @todo: list of work to do for this thread
+ * (protected by @proc->inner_lock)
+ * @process_todo: whether work in @todo should be processed
+ * (protected by @proc->inner_lock)
+ * @return_error: transaction errors reported by this thread
+ * (only accessed by this thread)
+ * @reply_error: transaction errors reported by target thread
+ * (protected by @proc->inner_lock)
+ * @wait: wait queue for thread work
+ * @stats: per-thread statistics
+ * (atomics, no lock needed)
+ * @tmp_ref: temporary reference to indicate thread is in use
+ * (atomic since @proc->inner_lock cannot
+ * always be acquired)
+ * @is_dead: thread is dead and awaiting free
+ * when outstanding transactions are cleaned up
+ * (protected by @proc->inner_lock)
+ * @task: struct task_struct for this thread
+ *
+ * Bookkeeping structure for binder threads.
+ */
struct binder_thread {
struct binder_proc *proc;
struct rb_node rb_node;
+ struct list_head waiting_thread_node;
int pid;
- int looper;
+ int looper; /* only modified by this thread */
+ bool looper_need_return; /* can be written by other thread */
struct binder_transaction *transaction_stack;
struct list_head todo;
- uint32_t return_error; /* Write failed, return error code in read buf */
- uint32_t return_error2; /* Write failed, return error code in read */
- /* buffer. Used when sending a reply to a dead process that */
- /* we are also waiting on */
+ bool process_todo;
+ struct binder_error return_error;
+ struct binder_error reply_error;
wait_queue_head_t wait;
struct binder_stats stats;
+ atomic_t tmp_ref;
+ bool is_dead;
+ struct task_struct *task;
};
struct binder_transaction {
@@ -366,13 +647,301 @@ struct binder_transaction {
struct binder_buffer *buffer;
unsigned int code;
unsigned int flags;
- long priority;
- long saved_priority;
+ struct binder_priority priority;
+ struct binder_priority saved_priority;
+ bool set_priority_called;
kuid_t sender_euid;
+ /**
+ * @lock: protects @from, @to_proc, and @to_thread
+ *
+ * @from, @to_proc, and @to_thread can be set to NULL
+ * during thread teardown
+ */
+ spinlock_t lock;
};
+/**
+ * binder_proc_lock() - Acquire outer lock for given binder_proc
+ * @proc: struct binder_proc to acquire
+ *
+ * Acquires proc->outer_lock. Used to protect binder_ref
+ * structures associated with the given proc.
+ */
+#define binder_proc_lock(proc) _binder_proc_lock(proc, __LINE__)
+static void
+_binder_proc_lock(struct binder_proc *proc, int line)
+{
+ binder_debug(BINDER_DEBUG_SPINLOCKS,
+ "%s: line=%d\n", __func__, line);
+ spin_lock(&proc->outer_lock);
+}
+
+/**
+ * binder_proc_unlock() - Release spinlock for given binder_proc
+ * @proc: struct binder_proc to acquire
+ *
+ * Release lock acquired via binder_proc_lock()
+ */
+#define binder_proc_unlock(_proc) _binder_proc_unlock(_proc, __LINE__)
+static void
+_binder_proc_unlock(struct binder_proc *proc, int line)
+{
+ binder_debug(BINDER_DEBUG_SPINLOCKS,
+ "%s: line=%d\n", __func__, line);
+ spin_unlock(&proc->outer_lock);
+}
+
+/**
+ * binder_inner_proc_lock() - Acquire inner lock for given binder_proc
+ * @proc: struct binder_proc to acquire
+ *
+ * Acquires proc->inner_lock. Used to protect todo lists
+ */
+#define binder_inner_proc_lock(proc) _binder_inner_proc_lock(proc, __LINE__)
+static void
+_binder_inner_proc_lock(struct binder_proc *proc, int line)
+{
+ binder_debug(BINDER_DEBUG_SPINLOCKS,
+ "%s: line=%d\n", __func__, line);
+ spin_lock(&proc->inner_lock);
+}
+
+/**
+ * binder_inner_proc_unlock() - Release inner lock for given binder_proc
+ * @proc: struct binder_proc to acquire
+ *
+ * Release lock acquired via binder_inner_proc_lock()
+ */
+#define binder_inner_proc_unlock(proc) _binder_inner_proc_unlock(proc, __LINE__)
+static void
+_binder_inner_proc_unlock(struct binder_proc *proc, int line)
+{
+ binder_debug(BINDER_DEBUG_SPINLOCKS,
+ "%s: line=%d\n", __func__, line);
+ spin_unlock(&proc->inner_lock);
+}
+
+/**
+ * binder_node_lock() - Acquire spinlock for given binder_node
+ * @node: struct binder_node to acquire
+ *
+ * Acquires node->lock. Used to protect binder_node fields
+ */
+#define binder_node_lock(node) _binder_node_lock(node, __LINE__)
+static void
+_binder_node_lock(struct binder_node *node, int line)
+{
+ binder_debug(BINDER_DEBUG_SPINLOCKS,
+ "%s: line=%d\n", __func__, line);
+ spin_lock(&node->lock);
+}
+
+/**
+ * binder_node_unlock() - Release spinlock for given binder_proc
+ * @node: struct binder_node to acquire
+ *
+ * Release lock acquired via binder_node_lock()
+ */
+#define binder_node_unlock(node) _binder_node_unlock(node, __LINE__)
+static void
+_binder_node_unlock(struct binder_node *node, int line)
+{
+ binder_debug(BINDER_DEBUG_SPINLOCKS,
+ "%s: line=%d\n", __func__, line);
+ spin_unlock(&node->lock);
+}
+
+/**
+ * binder_node_inner_lock() - Acquire node and inner locks
+ * @node: struct binder_node to acquire
+ *
+ * Acquires node->lock. If node->proc also acquires
+ * proc->inner_lock. Used to protect binder_node fields
+ */
+#define binder_node_inner_lock(node) _binder_node_inner_lock(node, __LINE__)
+static void
+_binder_node_inner_lock(struct binder_node *node, int line)
+{
+ binder_debug(BINDER_DEBUG_SPINLOCKS,
+ "%s: line=%d\n", __func__, line);
+ spin_lock(&node->lock);
+ if (node->proc)
+ binder_inner_proc_lock(node->proc);
+}
+
+/**
+ * binder_node_unlock() - Release node and inner locks
+ * @node: struct binder_node to acquire
+ *
+ * Release lock acquired via binder_node_lock()
+ */
+#define binder_node_inner_unlock(node) _binder_node_inner_unlock(node, __LINE__)
+static void
+_binder_node_inner_unlock(struct binder_node *node, int line)
+{
+ struct binder_proc *proc = node->proc;
+
+ binder_debug(BINDER_DEBUG_SPINLOCKS,
+ "%s: line=%d\n", __func__, line);
+ if (proc)
+ binder_inner_proc_unlock(proc);
+ spin_unlock(&node->lock);
+}
+
+static bool binder_worklist_empty_ilocked(struct list_head *list)
+{
+ return list_empty(list);
+}
+
+/**
+ * binder_worklist_empty() - Check if no items on the work list
+ * @proc: binder_proc associated with list
+ * @list: list to check
+ *
+ * Return: true if there are no items on list, else false
+ */
+static bool binder_worklist_empty(struct binder_proc *proc,
+ struct list_head *list)
+{
+ bool ret;
+
+ binder_inner_proc_lock(proc);
+ ret = binder_worklist_empty_ilocked(list);
+ binder_inner_proc_unlock(proc);
+ return ret;
+}
+
+/**
+ * binder_enqueue_work_ilocked() - Add an item to the work list
+ * @work: struct binder_work to add to list
+ * @target_list: list to add work to
+ *
+ * Adds the work to the specified list. Asserts that work
+ * is not already on a list.
+ *
+ * Requires the proc->inner_lock to be held.
+ */
+static void
+binder_enqueue_work_ilocked(struct binder_work *work,
+ struct list_head *target_list)
+{
+ BUG_ON(target_list == NULL);
+ BUG_ON(work->entry.next && !list_empty(&work->entry));
+ list_add_tail(&work->entry, target_list);
+}
+
+/**
+ * binder_enqueue_deferred_thread_work_ilocked() - Add deferred thread work
+ * @thread: thread to queue work to
+ * @work: struct binder_work to add to list
+ *
+ * Adds the work to the todo list of the thread. Doesn't set the process_todo
+ * flag, which means that (if it wasn't already set) the thread will go to
+ * sleep without handling this work when it calls read.
+ *
+ * Requires the proc->inner_lock to be held.
+ */
+static void
+binder_enqueue_deferred_thread_work_ilocked(struct binder_thread *thread,
+ struct binder_work *work)
+{
+ binder_enqueue_work_ilocked(work, &thread->todo);
+}
+
+/**
+ * binder_enqueue_thread_work_ilocked() - Add an item to the thread work list
+ * @thread: thread to queue work to
+ * @work: struct binder_work to add to list
+ *
+ * Adds the work to the todo list of the thread, and enables processing
+ * of the todo queue.
+ *
+ * Requires the proc->inner_lock to be held.
+ */
+static void
+binder_enqueue_thread_work_ilocked(struct binder_thread *thread,
+ struct binder_work *work)
+{
+ binder_enqueue_work_ilocked(work, &thread->todo);
+ thread->process_todo = true;
+}
+
+/**
+ * binder_enqueue_thread_work() - Add an item to the thread work list
+ * @thread: thread to queue work to
+ * @work: struct binder_work to add to list
+ *
+ * Adds the work to the todo list of the thread, and enables processing
+ * of the todo queue.
+ */
+static void
+binder_enqueue_thread_work(struct binder_thread *thread,
+ struct binder_work *work)
+{
+ binder_inner_proc_lock(thread->proc);
+ binder_enqueue_thread_work_ilocked(thread, work);
+ binder_inner_proc_unlock(thread->proc);
+}
+
+static void
+binder_dequeue_work_ilocked(struct binder_work *work)
+{
+ list_del_init(&work->entry);
+}
+
+/**
+ * binder_dequeue_work() - Removes an item from the work list
+ * @proc: binder_proc associated with list
+ * @work: struct binder_work to remove from list
+ *
+ * Removes the specified work item from whatever list it is on.
+ * Can safely be called if work is not on any list.
+ */
+static void
+binder_dequeue_work(struct binder_proc *proc, struct binder_work *work)
+{
+ binder_inner_proc_lock(proc);
+ binder_dequeue_work_ilocked(work);
+ binder_inner_proc_unlock(proc);
+}
+
+static struct binder_work *binder_dequeue_work_head_ilocked(
+ struct list_head *list)
+{
+ struct binder_work *w;
+
+ w = list_first_entry_or_null(list, struct binder_work, entry);
+ if (w)
+ list_del_init(&w->entry);
+ return w;
+}
+
+/**
+ * binder_dequeue_work_head() - Dequeues the item at head of list
+ * @proc: binder_proc associated with list
+ * @list: list to dequeue head
+ *
+ * Removes the head of the list if there are items on the list
+ *
+ * Return: pointer dequeued binder_work, NULL if list was empty
+ */
+static struct binder_work *binder_dequeue_work_head(
+ struct binder_proc *proc,
+ struct list_head *list)
+{
+ struct binder_work *w;
+
+ binder_inner_proc_lock(proc);
+ w = binder_dequeue_work_head_ilocked(list);
+ binder_inner_proc_unlock(proc);
+ return w;
+}
+
static void
binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer);
+static void binder_free_thread(struct binder_thread *thread);
+static void binder_free_proc(struct binder_proc *proc);
+static void binder_inc_node_tmpref_ilocked(struct binder_node *node);
static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
{
@@ -434,453 +1003,282 @@ err:
return retval;
}
-static inline void binder_lock(const char *tag)
+static bool binder_has_work_ilocked(struct binder_thread *thread,
+ bool do_proc_work)
{
- trace_binder_lock(tag);
- mutex_lock(&binder_main_lock);
- trace_binder_locked(tag);
+ return thread->process_todo ||
+ thread->looper_need_return ||
+ (do_proc_work &&
+ !binder_worklist_empty_ilocked(&thread->proc->todo));
}
-static inline void binder_unlock(const char *tag)
+static bool binder_has_work(struct binder_thread *thread, bool do_proc_work)
{
- trace_binder_unlock(tag);
- mutex_unlock(&binder_main_lock);
-}
+ bool has_work;
-static void binder_set_nice(long nice)
-{
- long min_nice;
+ binder_inner_proc_lock(thread->proc);
+ has_work = binder_has_work_ilocked(thread, do_proc_work);
+ binder_inner_proc_unlock(thread->proc);
- if (can_nice(current, nice)) {
- set_user_nice(current, nice);
- return;
- }
- min_nice = rlimit_to_nice(current->signal->rlim[RLIMIT_NICE].rlim_cur);
- binder_debug(BINDER_DEBUG_PRIORITY_CAP,
- "%d: nice value %ld not allowed use %ld instead\n",
- current->pid, nice, min_nice);
- set_user_nice(current, min_nice);
- if (min_nice <= MAX_NICE)
- return;
- binder_user_error("%d RLIMIT_NICE not set\n", current->pid);
+ return has_work;
}
-static size_t binder_buffer_size(struct binder_proc *proc,
- struct binder_buffer *buffer)
+static bool binder_available_for_proc_work_ilocked(struct binder_thread *thread)
{
- if (list_is_last(&buffer->entry, &proc->buffers))
- return proc->buffer + proc->buffer_size - (void *)buffer->data;
- return (size_t)list_entry(buffer->entry.next,
- struct binder_buffer, entry) - (size_t)buffer->data;
+ return !thread->transaction_stack &&
+ binder_worklist_empty_ilocked(&thread->todo) &&
+ (thread->looper & (BINDER_LOOPER_STATE_ENTERED |
+ BINDER_LOOPER_STATE_REGISTERED));
}
-static void binder_insert_free_buffer(struct binder_proc *proc,
- struct binder_buffer *new_buffer)
+static void binder_wakeup_poll_threads_ilocked(struct binder_proc *proc,
+ bool sync)
{
- struct rb_node **p = &proc->free_buffers.rb_node;
- struct rb_node *parent = NULL;
- struct binder_buffer *buffer;
- size_t buffer_size;
- size_t new_buffer_size;
-
- BUG_ON(!new_buffer->free);
-
- new_buffer_size = binder_buffer_size(proc, new_buffer);
-
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%d: add free buffer, size %zd, at %p\n",
- proc->pid, new_buffer_size, new_buffer);
-
- while (*p) {
- parent = *p;
- buffer = rb_entry(parent, struct binder_buffer, rb_node);
- BUG_ON(!buffer->free);
-
- buffer_size = binder_buffer_size(proc, buffer);
+ struct rb_node *n;
+ struct binder_thread *thread;
- if (new_buffer_size < buffer_size)
- p = &parent->rb_left;
- else
- p = &parent->rb_right;
+ for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) {
+ thread = rb_entry(n, struct binder_thread, rb_node);
+ if (thread->looper & BINDER_LOOPER_STATE_POLL &&
+ binder_available_for_proc_work_ilocked(thread)) {
+ if (sync)
+ wake_up_interruptible_sync(&thread->wait);
+ else
+ wake_up_interruptible(&thread->wait);
+ }
}
- rb_link_node(&new_buffer->rb_node, parent, p);
- rb_insert_color(&new_buffer->rb_node, &proc->free_buffers);
}
-static void binder_insert_allocated_buffer(struct binder_proc *proc,
- struct binder_buffer *new_buffer)
+/**
+ * binder_select_thread_ilocked() - selects a thread for doing proc work.
+ * @proc: process to select a thread from
+ *
+ * Note that calling this function moves the thread off the waiting_threads
+ * list, so it can only be woken up by the caller of this function, or a
+ * signal. Therefore, callers *should* always wake up the thread this function
+ * returns.
+ *
+ * Return: If there's a thread currently waiting for process work,
+ * returns that thread. Otherwise returns NULL.
+ */
+static struct binder_thread *
+binder_select_thread_ilocked(struct binder_proc *proc)
{
- struct rb_node **p = &proc->allocated_buffers.rb_node;
- struct rb_node *parent = NULL;
- struct binder_buffer *buffer;
+ struct binder_thread *thread;
- BUG_ON(new_buffer->free);
+ assert_spin_locked(&proc->inner_lock);
+ thread = list_first_entry_or_null(&proc->waiting_threads,
+ struct binder_thread,
+ waiting_thread_node);
- while (*p) {
- parent = *p;
- buffer = rb_entry(parent, struct binder_buffer, rb_node);
- BUG_ON(buffer->free);
+ if (thread)
+ list_del_init(&thread->waiting_thread_node);
- if (new_buffer < buffer)
- p = &parent->rb_left;
- else if (new_buffer > buffer)
- p = &parent->rb_right;
- else
- BUG();
- }
- rb_link_node(&new_buffer->rb_node, parent, p);
- rb_insert_color(&new_buffer->rb_node, &proc->allocated_buffers);
+ return thread;
}
-static struct binder_buffer *binder_buffer_lookup(struct binder_proc *proc,
- uintptr_t user_ptr)
+/**
+ * binder_wakeup_thread_ilocked() - wakes up a thread for doing proc work.
+ * @proc: process to wake up a thread in
+ * @thread: specific thread to wake-up (may be NULL)
+ * @sync: whether to do a synchronous wake-up
+ *
+ * This function wakes up a thread in the @proc process.
+ * The caller may provide a specific thread to wake-up in
+ * the @thread parameter. If @thread is NULL, this function
+ * will wake up threads that have called poll().
+ *
+ * Note that for this function to work as expected, callers
+ * should first call binder_select_thread() to find a thread
+ * to handle the work (if they don't have a thread already),
+ * and pass the result into the @thread parameter.
+ */
+static void binder_wakeup_thread_ilocked(struct binder_proc *proc,
+ struct binder_thread *thread,
+ bool sync)
{
- struct rb_node *n = proc->allocated_buffers.rb_node;
- struct binder_buffer *buffer;
- struct binder_buffer *kern_ptr;
+ assert_spin_locked(&proc->inner_lock);
- kern_ptr = (struct binder_buffer *)(user_ptr - proc->user_buffer_offset
- - offsetof(struct binder_buffer, data));
-
- while (n) {
- buffer = rb_entry(n, struct binder_buffer, rb_node);
- BUG_ON(buffer->free);
-
- if (kern_ptr < buffer)
- n = n->rb_left;
- else if (kern_ptr > buffer)
- n = n->rb_right;
+ if (thread) {
+ if (sync)
+ wake_up_interruptible_sync(&thread->wait);
else
- return buffer;
+ wake_up_interruptible(&thread->wait);
+ return;
}
- return NULL;
+
+ /* Didn't find a thread waiting for proc work; this can happen
+ * in two scenarios:
+ * 1. All threads are busy handling transactions
+ * In that case, one of those threads should call back into
+ * the kernel driver soon and pick up this work.
+ * 2. Threads are using the (e)poll interface, in which case
+ * they may be blocked on the waitqueue without having been
+ * added to waiting_threads. For this case, we just iterate
+ * over all threads not handling transaction work, and
+ * wake them all up. We wake all because we don't know whether
+ * a thread that called into (e)poll is handling non-binder
+ * work currently.
+ */
+ binder_wakeup_poll_threads_ilocked(proc, sync);
}
-static int binder_update_page_range(struct binder_proc *proc, int allocate,
- void *start, void *end,
- struct vm_area_struct *vma)
+static void binder_wakeup_proc_ilocked(struct binder_proc *proc)
{
- void *page_addr;
- unsigned long user_page_addr;
- struct page **page;
- struct mm_struct *mm;
-
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%d: %s pages %p-%p\n", proc->pid,
- allocate ? "allocate" : "free", start, end);
+ struct binder_thread *thread = binder_select_thread_ilocked(proc);
- if (end <= start)
- return 0;
+ binder_wakeup_thread_ilocked(proc, thread, /* sync = */false);
+}
- trace_binder_update_page_range(proc, allocate, start, end);
+static bool is_rt_policy(int policy)
+{
+ return policy == SCHED_FIFO || policy == SCHED_RR;
+}
- if (vma)
- mm = NULL;
- else
- mm = get_task_mm(proc->tsk);
+static bool is_fair_policy(int policy)
+{
+ return policy == SCHED_NORMAL || policy == SCHED_BATCH;
+}
- if (mm) {
- down_write(&mm->mmap_sem);
- vma = proc->vma;
- if (vma && mm != proc->vma_vm_mm) {
- pr_err("%d: vma mm and task mm mismatch\n",
- proc->pid);
- vma = NULL;
- }
- }
+static bool binder_supported_policy(int policy)
+{
+ return is_fair_policy(policy) || is_rt_policy(policy);
+}
- if (allocate == 0)
- goto free_range;
+static int to_userspace_prio(int policy, int kernel_priority)
+{
+ if (is_fair_policy(policy))
+ return PRIO_TO_NICE(kernel_priority);
+ else
+ return MAX_USER_RT_PRIO - 1 - kernel_priority;
+}
- if (vma == NULL) {
- pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n",
- proc->pid);
- goto err_no_vma;
- }
+static int to_kernel_prio(int policy, int user_priority)
+{
+ if (is_fair_policy(policy))
+ return NICE_TO_PRIO(user_priority);
+ else
+ return MAX_USER_RT_PRIO - 1 - user_priority;
+}
- for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) {
- int ret;
+static void binder_do_set_priority(struct task_struct *task,
+ struct binder_priority desired,
+ bool verify)
+{
+ int priority; /* user-space prio value */
+ bool has_cap_nice;
+ unsigned int policy = desired.sched_policy;
- page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE];
-
- BUG_ON(*page);
- *page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
- if (*page == NULL) {
- pr_err("%d: binder_alloc_buf failed for page at %p\n",
- proc->pid, page_addr);
- goto err_alloc_page_failed;
- }
- ret = map_kernel_range_noflush((unsigned long)page_addr,
- PAGE_SIZE, PAGE_KERNEL, page);
- flush_cache_vmap((unsigned long)page_addr,
- (unsigned long)page_addr + PAGE_SIZE);
- if (ret != 1) {
- pr_err("%d: binder_alloc_buf failed to map page at %p in kernel\n",
- proc->pid, page_addr);
- goto err_map_kernel_failed;
- }
- user_page_addr =
- (uintptr_t)page_addr + proc->user_buffer_offset;
- ret = vm_insert_page(vma, user_page_addr, page[0]);
- if (ret) {
- pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n",
- proc->pid, user_page_addr);
- goto err_vm_insert_page_failed;
- }
- /* vm_insert_page does not seem to increment the refcount */
- }
- if (mm) {
- up_write(&mm->mmap_sem);
- mmput(mm);
- }
- return 0;
+ if (task->policy == policy && task->normal_prio == desired.prio)
+ return;
-free_range:
- for (page_addr = end - PAGE_SIZE; page_addr >= start;
- page_addr -= PAGE_SIZE) {
- page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE];
- if (vma)
- zap_page_range(vma, (uintptr_t)page_addr +
- proc->user_buffer_offset, PAGE_SIZE, NULL);
-err_vm_insert_page_failed:
- unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE);
-err_map_kernel_failed:
- __free_page(*page);
- *page = NULL;
-err_alloc_page_failed:
- ;
- }
-err_no_vma:
- if (mm) {
- up_write(&mm->mmap_sem);
- mmput(mm);
- }
- return -ENOMEM;
-}
-
-static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc,
- size_t data_size,
- size_t offsets_size, int is_async)
-{
- struct rb_node *n = proc->free_buffers.rb_node;
- struct binder_buffer *buffer;
- size_t buffer_size;
- struct rb_node *best_fit = NULL;
- void *has_page_addr;
- void *end_page_addr;
- size_t size;
-
- if (proc->vma == NULL) {
- pr_err("%d: binder_alloc_buf, no vma\n",
- proc->pid);
- return NULL;
- }
+ has_cap_nice = has_capability_noaudit(task, CAP_SYS_NICE);
- size = ALIGN(data_size, sizeof(void *)) +
- ALIGN(offsets_size, sizeof(void *));
+ priority = to_userspace_prio(policy, desired.prio);
- if (size < data_size || size < offsets_size) {
- binder_user_error("%d: got transaction with invalid size %zd-%zd\n",
- proc->pid, data_size, offsets_size);
- return NULL;
- }
+ if (verify && is_rt_policy(policy) && !has_cap_nice) {
+ long max_rtprio = task_rlimit(task, RLIMIT_RTPRIO);
- if (is_async &&
- proc->free_async_space < size + sizeof(struct binder_buffer)) {
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%d: binder_alloc_buf size %zd failed, no async space left\n",
- proc->pid, size);
- return NULL;
+ if (max_rtprio == 0) {
+ policy = SCHED_NORMAL;
+ priority = MIN_NICE;
+ } else if (priority > max_rtprio) {
+ priority = max_rtprio;
+ }
}
- while (n) {
- buffer = rb_entry(n, struct binder_buffer, rb_node);
- BUG_ON(!buffer->free);
- buffer_size = binder_buffer_size(proc, buffer);
+ if (verify && is_fair_policy(policy) && !has_cap_nice) {
+ long min_nice = rlimit_to_nice(task_rlimit(task, RLIMIT_NICE));
- if (size < buffer_size) {
- best_fit = n;
- n = n->rb_left;
- } else if (size > buffer_size)
- n = n->rb_right;
- else {
- best_fit = n;
- break;
+ if (min_nice > MAX_NICE) {
+ binder_user_error("%d RLIMIT_NICE not set\n",
+ task->pid);
+ return;
+ } else if (priority < min_nice) {
+ priority = min_nice;
}
}
- if (best_fit == NULL) {
- pr_err("%d: binder_alloc_buf size %zd failed, no address space\n",
- proc->pid, size);
- return NULL;
- }
- if (n == NULL) {
- buffer = rb_entry(best_fit, struct binder_buffer, rb_node);
- buffer_size = binder_buffer_size(proc, buffer);
- }
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%d: binder_alloc_buf size %zd got buffer %p size %zd\n",
- proc->pid, size, buffer, buffer_size);
+ if (policy != desired.sched_policy ||
+ to_kernel_prio(policy, priority) != desired.prio)
+ binder_debug(BINDER_DEBUG_PRIORITY_CAP,
+ "%d: priority %d not allowed, using %d instead\n",
+ task->pid, desired.prio,
+ to_kernel_prio(policy, priority));
- has_page_addr =
- (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK);
- if (n == NULL) {
- if (size + sizeof(struct binder_buffer) + 4 >= buffer_size)
- buffer_size = size; /* no room for other buffers */
- else
- buffer_size = size + sizeof(struct binder_buffer);
- }
- end_page_addr =
- (void *)PAGE_ALIGN((uintptr_t)buffer->data + buffer_size);
- if (end_page_addr > has_page_addr)
- end_page_addr = has_page_addr;
- if (binder_update_page_range(proc, 1,
- (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL))
- return NULL;
+ trace_binder_set_priority(task->tgid, task->pid, task->normal_prio,
+ to_kernel_prio(policy, priority),
+ desired.prio);
- rb_erase(best_fit, &proc->free_buffers);
- buffer->free = 0;
- binder_insert_allocated_buffer(proc, buffer);
- if (buffer_size != size) {
- struct binder_buffer *new_buffer = (void *)buffer->data + size;
+ /* Set the actual priority */
+ if (task->policy != policy || is_rt_policy(policy)) {
+ struct sched_param params;
- list_add(&new_buffer->entry, &buffer->entry);
- new_buffer->free = 1;
- binder_insert_free_buffer(proc, new_buffer);
- }
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%d: binder_alloc_buf size %zd got %p\n",
- proc->pid, size, buffer);
- buffer->data_size = data_size;
- buffer->offsets_size = offsets_size;
- buffer->async_transaction = is_async;
- if (is_async) {
- proc->free_async_space -= size + sizeof(struct binder_buffer);
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC,
- "%d: binder_alloc_buf size %zd async free %zd\n",
- proc->pid, size, proc->free_async_space);
- }
-
- return buffer;
-}
+ params.sched_priority = is_rt_policy(policy) ? priority : 0;
-static void *buffer_start_page(struct binder_buffer *buffer)
-{
- return (void *)((uintptr_t)buffer & PAGE_MASK);
+ sched_setscheduler_nocheck(task,
+ policy | SCHED_RESET_ON_FORK,
+ &params);
+ }
+ if (is_fair_policy(policy))
+ set_user_nice(task, priority);
}
-static void *buffer_end_page(struct binder_buffer *buffer)
+static void binder_set_priority(struct task_struct *task,
+ struct binder_priority desired)
{
- return (void *)(((uintptr_t)(buffer + 1) - 1) & PAGE_MASK);
+ binder_do_set_priority(task, desired, /* verify = */ true);
}
-static void binder_delete_free_buffer(struct binder_proc *proc,
- struct binder_buffer *buffer)
+static void binder_restore_priority(struct task_struct *task,
+ struct binder_priority desired)
{
- struct binder_buffer *prev, *next = NULL;
- int free_page_end = 1;
- int free_page_start = 1;
-
- BUG_ON(proc->buffers.next == &buffer->entry);
- prev = list_entry(buffer->entry.prev, struct binder_buffer, entry);
- BUG_ON(!prev->free);
- if (buffer_end_page(prev) == buffer_start_page(buffer)) {
- free_page_start = 0;
- if (buffer_end_page(prev) == buffer_end_page(buffer))
- free_page_end = 0;
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%d: merge free, buffer %p share page with %p\n",
- proc->pid, buffer, prev);
- }
-
- if (!list_is_last(&buffer->entry, &proc->buffers)) {
- next = list_entry(buffer->entry.next,
- struct binder_buffer, entry);
- if (buffer_start_page(next) == buffer_end_page(buffer)) {
- free_page_end = 0;
- if (buffer_start_page(next) ==
- buffer_start_page(buffer))
- free_page_start = 0;
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%d: merge free, buffer %p share page with %p\n",
- proc->pid, buffer, prev);
- }
- }
- list_del(&buffer->entry);
- if (free_page_start || free_page_end) {
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%d: merge free, buffer %p do not share page%s%s with %p or %p\n",
- proc->pid, buffer, free_page_start ? "" : " end",
- free_page_end ? "" : " start", prev, next);
- binder_update_page_range(proc, 0, free_page_start ?
- buffer_start_page(buffer) : buffer_end_page(buffer),
- (free_page_end ? buffer_end_page(buffer) :
- buffer_start_page(buffer)) + PAGE_SIZE, NULL);
- }
+ binder_do_set_priority(task, desired, /* verify = */ false);
}
-static void binder_free_buf(struct binder_proc *proc,
- struct binder_buffer *buffer)
+static void binder_transaction_priority(struct task_struct *task,
+ struct binder_transaction *t,
+ struct binder_priority node_prio,
+ bool inherit_rt)
{
- size_t size, buffer_size;
+ struct binder_priority desired_prio = t->priority;
- buffer_size = binder_buffer_size(proc, buffer);
-
- size = ALIGN(buffer->data_size, sizeof(void *)) +
- ALIGN(buffer->offsets_size, sizeof(void *));
-
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%d: binder_free_buf %p size %zd buffer_size %zd\n",
- proc->pid, buffer, size, buffer_size);
-
- BUG_ON(buffer->free);
- BUG_ON(size > buffer_size);
- BUG_ON(buffer->transaction != NULL);
- BUG_ON((void *)buffer < proc->buffer);
- BUG_ON((void *)buffer > proc->buffer + proc->buffer_size);
+ if (t->set_priority_called)
+ return;
- if (buffer->async_transaction) {
- proc->free_async_space += size + sizeof(struct binder_buffer);
+ t->set_priority_called = true;
+ t->saved_priority.sched_policy = task->policy;
+ t->saved_priority.prio = task->normal_prio;
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC,
- "%d: binder_free_buf size %zd async free %zd\n",
- proc->pid, size, proc->free_async_space);
+ if (!inherit_rt && is_rt_policy(desired_prio.sched_policy)) {
+ desired_prio.prio = NICE_TO_PRIO(0);
+ desired_prio.sched_policy = SCHED_NORMAL;
}
- binder_update_page_range(proc, 0,
- (void *)PAGE_ALIGN((uintptr_t)buffer->data),
- (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK),
- NULL);
- rb_erase(&buffer->rb_node, &proc->allocated_buffers);
- buffer->free = 1;
- if (!list_is_last(&buffer->entry, &proc->buffers)) {
- struct binder_buffer *next = list_entry(buffer->entry.next,
- struct binder_buffer, entry);
-
- if (next->free) {
- rb_erase(&next->rb_node, &proc->free_buffers);
- binder_delete_free_buffer(proc, next);
- }
+ if (node_prio.prio < t->priority.prio ||
+ (node_prio.prio == t->priority.prio &&
+ node_prio.sched_policy == SCHED_FIFO)) {
+ /*
+ * In case the minimum priority on the node is
+ * higher (lower value), use that priority. If
+ * the priority is the same, but the node uses
+ * SCHED_FIFO, prefer SCHED_FIFO, since it can
+ * run unbounded, unlike SCHED_RR.
+ */
+ desired_prio = node_prio;
}
- if (proc->buffers.next != &buffer->entry) {
- struct binder_buffer *prev = list_entry(buffer->entry.prev,
- struct binder_buffer, entry);
- if (prev->free) {
- binder_delete_free_buffer(proc, buffer);
- rb_erase(&prev->rb_node, &proc->free_buffers);
- buffer = prev;
- }
- }
- binder_insert_free_buffer(proc, buffer);
+ binder_set_priority(task, desired_prio);
}
-static struct binder_node *binder_get_node(struct binder_proc *proc,
- binder_uintptr_t ptr)
+static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc,
+ binder_uintptr_t ptr)
{
struct rb_node *n = proc->nodes.rb_node;
struct binder_node *node;
+ assert_spin_locked(&proc->inner_lock);
+
while (n) {
node = rb_entry(n, struct binder_node, rb_node);
@@ -888,21 +1286,47 @@ static struct binder_node *binder_get_node(struct binder_proc *proc,
n = n->rb_left;
else if (ptr > node->ptr)
n = n->rb_right;
- else
+ else {
+ /*
+ * take an implicit weak reference
+ * to ensure node stays alive until
+ * call to binder_put_node()
+ */
+ binder_inc_node_tmpref_ilocked(node);
return node;
+ }
}
return NULL;
}
-static struct binder_node *binder_new_node(struct binder_proc *proc,
- binder_uintptr_t ptr,
- binder_uintptr_t cookie)
+static struct binder_node *binder_get_node(struct binder_proc *proc,
+ binder_uintptr_t ptr)
+{
+ struct binder_node *node;
+
+ binder_inner_proc_lock(proc);
+ node = binder_get_node_ilocked(proc, ptr);
+ binder_inner_proc_unlock(proc);
+ return node;
+}
+
+static struct binder_node *binder_init_node_ilocked(
+ struct binder_proc *proc,
+ struct binder_node *new_node,
+ struct flat_binder_object *fp)
{
struct rb_node **p = &proc->nodes.rb_node;
struct rb_node *parent = NULL;
struct binder_node *node;
+ binder_uintptr_t ptr = fp ? fp->binder : 0;
+ binder_uintptr_t cookie = fp ? fp->cookie : 0;
+ __u32 flags = fp ? fp->flags : 0;
+ s8 priority;
+
+ assert_spin_locked(&proc->inner_lock);
while (*p) {
+
parent = *p;
node = rb_entry(parent, struct binder_node, rb_node);
@@ -910,39 +1334,86 @@ static struct binder_node *binder_new_node(struct binder_proc *proc,
p = &(*p)->rb_left;
else if (ptr > node->ptr)
p = &(*p)->rb_right;
- else
- return NULL;
+ else {
+ /*
+ * A matching node is already in
+ * the rb tree. Abandon the init
+ * and return it.
+ */
+ binder_inc_node_tmpref_ilocked(node);
+ return node;
+ }
}
-
- node = kzalloc(sizeof(*node), GFP_KERNEL);
- if (node == NULL)
- return NULL;
+ node = new_node;
binder_stats_created(BINDER_STAT_NODE);
+ node->tmp_refs++;
rb_link_node(&node->rb_node, parent, p);
rb_insert_color(&node->rb_node, &proc->nodes);
- node->debug_id = ++binder_last_id;
+ node->debug_id = atomic_inc_return(&binder_last_id);
node->proc = proc;
node->ptr = ptr;
node->cookie = cookie;
node->work.type = BINDER_WORK_NODE;
+ priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK;
+ node->sched_policy = (flags & FLAT_BINDER_FLAG_SCHED_POLICY_MASK) >>
+ FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT;
+ node->min_priority = to_kernel_prio(node->sched_policy, priority);
+ node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS);
+ node->inherit_rt = !!(flags & FLAT_BINDER_FLAG_INHERIT_RT);
+ spin_lock_init(&node->lock);
INIT_LIST_HEAD(&node->work.entry);
INIT_LIST_HEAD(&node->async_todo);
binder_debug(BINDER_DEBUG_INTERNAL_REFS,
"%d:%d node %d u%016llx c%016llx created\n",
proc->pid, current->pid, node->debug_id,
(u64)node->ptr, (u64)node->cookie);
+
return node;
}
-static int binder_inc_node(struct binder_node *node, int strong, int internal,
- struct list_head *target_list)
+static struct binder_node *binder_new_node(struct binder_proc *proc,
+ struct flat_binder_object *fp)
+{
+ struct binder_node *node;
+ struct binder_node *new_node = kzalloc(sizeof(*node), GFP_KERNEL);
+
+ if (!new_node)
+ return NULL;
+ binder_inner_proc_lock(proc);
+ node = binder_init_node_ilocked(proc, new_node, fp);
+ binder_inner_proc_unlock(proc);
+ if (node != new_node)
+ /*
+ * The node was already added by another thread
+ */
+ kfree(new_node);
+
+ return node;
+}
+
+static void binder_free_node(struct binder_node *node)
{
+ kfree(node);
+ binder_stats_deleted(BINDER_STAT_NODE);
+}
+
+static int binder_inc_node_nilocked(struct binder_node *node, int strong,
+ int internal,
+ struct list_head *target_list)
+{
+ struct binder_proc *proc = node->proc;
+
+ assert_spin_locked(&node->lock);
+ if (proc)
+ assert_spin_locked(&proc->inner_lock);
if (strong) {
if (internal) {
if (target_list == NULL &&
node->internal_strong_refs == 0 &&
- !(node == binder_context_mgr_node &&
- node->has_strong_ref)) {
+ !(node->proc &&
+ node == node->proc->context->
+ binder_context_mgr_node &&
+ node->has_strong_ref)) {
pr_err("invalid inc strong node for %d\n",
node->debug_id);
return -EINVAL;
@@ -951,8 +1422,19 @@ static int binder_inc_node(struct binder_node *node, int strong, int internal,
} else
node->local_strong_refs++;
if (!node->has_strong_ref && target_list) {
- list_del_init(&node->work.entry);
- list_add_tail(&node->work.entry, target_list);
+ binder_dequeue_work_ilocked(&node->work);
+ /*
+ * Note: this function is the only place where we queue
+ * directly to a thread->todo without using the
+ * corresponding binder_enqueue_thread_work() helper
+ * functions; in this case it's ok to not set the
+ * process_todo flag, since we know this node work will
+ * always be followed by other work that starts queue
+ * processing: in case of synchronous transactions, a
+ * BR_REPLY or BR_ERROR; in case of oneway
+ * transactions, a BR_TRANSACTION_COMPLETE.
+ */
+ binder_enqueue_work_ilocked(&node->work, target_list);
}
} else {
if (!internal)
@@ -963,58 +1445,172 @@ static int binder_inc_node(struct binder_node *node, int strong, int internal,
node->debug_id);
return -EINVAL;
}
- list_add_tail(&node->work.entry, target_list);
+ /*
+ * See comment above
+ */
+ binder_enqueue_work_ilocked(&node->work, target_list);
}
}
return 0;
}
-static int binder_dec_node(struct binder_node *node, int strong, int internal)
+static int binder_inc_node(struct binder_node *node, int strong, int internal,
+ struct list_head *target_list)
+{
+ int ret;
+
+ binder_node_inner_lock(node);
+ ret = binder_inc_node_nilocked(node, strong, internal, target_list);
+ binder_node_inner_unlock(node);
+
+ return ret;
+}
+
+static bool binder_dec_node_nilocked(struct binder_node *node,
+ int strong, int internal)
{
+ struct binder_proc *proc = node->proc;
+
+ assert_spin_locked(&node->lock);
+ if (proc)
+ assert_spin_locked(&proc->inner_lock);
if (strong) {
if (internal)
node->internal_strong_refs--;
else
node->local_strong_refs--;
if (node->local_strong_refs || node->internal_strong_refs)
- return 0;
+ return false;
} else {
if (!internal)
node->local_weak_refs--;
- if (node->local_weak_refs || !hlist_empty(&node->refs))
- return 0;
+ if (node->local_weak_refs || node->tmp_refs ||
+ !hlist_empty(&node->refs))
+ return false;
}
- if (node->proc && (node->has_strong_ref || node->has_weak_ref)) {
+
+ if (proc && (node->has_strong_ref || node->has_weak_ref)) {
if (list_empty(&node->work.entry)) {
- list_add_tail(&node->work.entry, &node->proc->todo);
- wake_up_interruptible(&node->proc->wait);
+ binder_enqueue_work_ilocked(&node->work, &proc->todo);
+ binder_wakeup_proc_ilocked(proc);
}
} else {
if (hlist_empty(&node->refs) && !node->local_strong_refs &&
- !node->local_weak_refs) {
- list_del_init(&node->work.entry);
- if (node->proc) {
- rb_erase(&node->rb_node, &node->proc->nodes);
+ !node->local_weak_refs && !node->tmp_refs) {
+ if (proc) {
+ binder_dequeue_work_ilocked(&node->work);
+ rb_erase(&node->rb_node, &proc->nodes);
binder_debug(BINDER_DEBUG_INTERNAL_REFS,
"refless node %d deleted\n",
node->debug_id);
} else {
+ BUG_ON(!list_empty(&node->work.entry));
+ spin_lock(&binder_dead_nodes_lock);
+ /*
+ * tmp_refs could have changed so
+ * check it again
+ */
+ if (node->tmp_refs) {
+ spin_unlock(&binder_dead_nodes_lock);
+ return false;
+ }
hlist_del(&node->dead_node);
+ spin_unlock(&binder_dead_nodes_lock);
binder_debug(BINDER_DEBUG_INTERNAL_REFS,
"dead node %d deleted\n",
node->debug_id);
}
- kfree(node);
- binder_stats_deleted(BINDER_STAT_NODE);
+ return true;
}
}
+ return false;
+}
- return 0;
+static void binder_dec_node(struct binder_node *node, int strong, int internal)
+{
+ bool free_node;
+
+ binder_node_inner_lock(node);
+ free_node = binder_dec_node_nilocked(node, strong, internal);
+ binder_node_inner_unlock(node);
+ if (free_node)
+ binder_free_node(node);
}
+static void binder_inc_node_tmpref_ilocked(struct binder_node *node)
+{
+ /*
+ * No call to binder_inc_node() is needed since we
+ * don't need to inform userspace of any changes to
+ * tmp_refs
+ */
+ node->tmp_refs++;
+}
-static struct binder_ref *binder_get_ref(struct binder_proc *proc,
- u32 desc, bool need_strong_ref)
+/**
+ * binder_inc_node_tmpref() - take a temporary reference on node
+ * @node: node to reference
+ *
+ * Take reference on node to prevent the node from being freed
+ * while referenced only by a local variable. The inner lock is
+ * needed to serialize with the node work on the queue (which
+ * isn't needed after the node is dead). If the node is dead
+ * (node->proc is NULL), use binder_dead_nodes_lock to protect
+ * node->tmp_refs against dead-node-only cases where the node
+ * lock cannot be acquired (eg traversing the dead node list to
+ * print nodes)
+ */
+static void binder_inc_node_tmpref(struct binder_node *node)
+{
+ binder_node_lock(node);
+ if (node->proc)
+ binder_inner_proc_lock(node->proc);
+ else
+ spin_lock(&binder_dead_nodes_lock);
+ binder_inc_node_tmpref_ilocked(node);
+ if (node->proc)
+ binder_inner_proc_unlock(node->proc);
+ else
+ spin_unlock(&binder_dead_nodes_lock);
+ binder_node_unlock(node);
+}
+
+/**
+ * binder_dec_node_tmpref() - remove a temporary reference on node
+ * @node: node to reference
+ *
+ * Release temporary reference on node taken via binder_inc_node_tmpref()
+ */
+static void binder_dec_node_tmpref(struct binder_node *node)
+{
+ bool free_node;
+
+ binder_node_inner_lock(node);
+ if (!node->proc)
+ spin_lock(&binder_dead_nodes_lock);
+ node->tmp_refs--;
+ BUG_ON(node->tmp_refs < 0);
+ if (!node->proc)
+ spin_unlock(&binder_dead_nodes_lock);
+ /*
+ * Call binder_dec_node() to check if all refcounts are 0
+ * and cleanup is needed. Calling with strong=0 and internal=1
+ * causes no actual reference to be released in binder_dec_node().
+ * If that changes, a change is needed here too.
+ */
+ free_node = binder_dec_node_nilocked(node, 0, 1);
+ binder_node_inner_unlock(node);
+ if (free_node)
+ binder_free_node(node);
+}
+
+static void binder_put_node(struct binder_node *node)
+{
+ binder_dec_node_tmpref(node);
+}
+
+static struct binder_ref *binder_get_ref_olocked(struct binder_proc *proc,
+ u32 desc, bool need_strong_ref)
{
struct rb_node *n = proc->refs_by_desc.rb_node;
struct binder_ref *ref;
@@ -1022,11 +1618,11 @@ static struct binder_ref *binder_get_ref(struct binder_proc *proc,
while (n) {
ref = rb_entry(n, struct binder_ref, rb_node_desc);
- if (desc < ref->desc) {
+ if (desc < ref->data.desc) {
n = n->rb_left;
- } else if (desc > ref->desc) {
+ } else if (desc > ref->data.desc) {
n = n->rb_right;
- } else if (need_strong_ref && !ref->strong) {
+ } else if (need_strong_ref && !ref->data.strong) {
binder_user_error("tried to use weak ref as strong ref\n");
return NULL;
} else {
@@ -1036,13 +1632,34 @@ static struct binder_ref *binder_get_ref(struct binder_proc *proc,
return NULL;
}
-static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc,
- struct binder_node *node)
+/**
+ * binder_get_ref_for_node_olocked() - get the ref associated with given node
+ * @proc: binder_proc that owns the ref
+ * @node: binder_node of target
+ * @new_ref: newly allocated binder_ref to be initialized or %NULL
+ *
+ * Look up the ref for the given node and return it if it exists
+ *
+ * If it doesn't exist and the caller provides a newly allocated
+ * ref, initialize the fields of the newly allocated ref and insert
+ * into the given proc rb_trees and node refs list.
+ *
+ * Return: the ref for node. It is possible that another thread
+ * allocated/initialized the ref first in which case the
+ * returned ref would be different than the passed-in
+ * new_ref. new_ref must be kfree'd by the caller in
+ * this case.
+ */
+static struct binder_ref *binder_get_ref_for_node_olocked(
+ struct binder_proc *proc,
+ struct binder_node *node,
+ struct binder_ref *new_ref)
{
- struct rb_node *n;
+ struct binder_context *context = proc->context;
struct rb_node **p = &proc->refs_by_node.rb_node;
struct rb_node *parent = NULL;
- struct binder_ref *ref, *new_ref;
+ struct binder_ref *ref;
+ struct rb_node *n;
while (*p) {
parent = *p;
@@ -1055,22 +1672,22 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc,
else
return ref;
}
- new_ref = kzalloc(sizeof(*ref), GFP_KERNEL);
- if (new_ref == NULL)
+ if (!new_ref)
return NULL;
+
binder_stats_created(BINDER_STAT_REF);
- new_ref->debug_id = ++binder_last_id;
+ new_ref->data.debug_id = atomic_inc_return(&binder_last_id);
new_ref->proc = proc;
new_ref->node = node;
rb_link_node(&new_ref->rb_node_node, parent, p);
rb_insert_color(&new_ref->rb_node_node, &proc->refs_by_node);
- new_ref->desc = (node == binder_context_mgr_node) ? 0 : 1;
+ new_ref->data.desc = (node == context->binder_context_mgr_node) ? 0 : 1;
for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) {
ref = rb_entry(n, struct binder_ref, rb_node_desc);
- if (ref->desc > new_ref->desc)
+ if (ref->data.desc > new_ref->data.desc)
break;
- new_ref->desc = ref->desc + 1;
+ new_ref->data.desc = ref->data.desc + 1;
}
p = &proc->refs_by_desc.rb_node;
@@ -1078,121 +1695,423 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc,
parent = *p;
ref = rb_entry(parent, struct binder_ref, rb_node_desc);
- if (new_ref->desc < ref->desc)
+ if (new_ref->data.desc < ref->data.desc)
p = &(*p)->rb_left;
- else if (new_ref->desc > ref->desc)
+ else if (new_ref->data.desc > ref->data.desc)
p = &(*p)->rb_right;
else
BUG();
}
rb_link_node(&new_ref->rb_node_desc, parent, p);
rb_insert_color(&new_ref->rb_node_desc, &proc->refs_by_desc);
- if (node) {
- hlist_add_head(&new_ref->node_entry, &node->refs);
- binder_debug(BINDER_DEBUG_INTERNAL_REFS,
- "%d new ref %d desc %d for node %d\n",
- proc->pid, new_ref->debug_id, new_ref->desc,
- node->debug_id);
- } else {
- binder_debug(BINDER_DEBUG_INTERNAL_REFS,
- "%d new ref %d desc %d for dead node\n",
- proc->pid, new_ref->debug_id, new_ref->desc);
- }
+ binder_node_lock(node);
+ hlist_add_head(&new_ref->node_entry, &node->refs);
+
+ binder_debug(BINDER_DEBUG_INTERNAL_REFS,
+ "%d new ref %d desc %d for node %d\n",
+ proc->pid, new_ref->data.debug_id, new_ref->data.desc,
+ node->debug_id);
+ binder_node_unlock(node);
return new_ref;
}
-static void binder_delete_ref(struct binder_ref *ref)
+static void binder_cleanup_ref_olocked(struct binder_ref *ref)
{
+ bool delete_node = false;
+
binder_debug(BINDER_DEBUG_INTERNAL_REFS,
"%d delete ref %d desc %d for node %d\n",
- ref->proc->pid, ref->debug_id, ref->desc,
+ ref->proc->pid, ref->data.debug_id, ref->data.desc,
ref->node->debug_id);
rb_erase(&ref->rb_node_desc, &ref->proc->refs_by_desc);
rb_erase(&ref->rb_node_node, &ref->proc->refs_by_node);
- if (ref->strong)
- binder_dec_node(ref->node, 1, 1);
+
+ binder_node_inner_lock(ref->node);
+ if (ref->data.strong)
+ binder_dec_node_nilocked(ref->node, 1, 1);
+
hlist_del(&ref->node_entry);
- binder_dec_node(ref->node, 0, 1);
+ delete_node = binder_dec_node_nilocked(ref->node, 0, 1);
+ binder_node_inner_unlock(ref->node);
+ /*
+ * Clear ref->node unless we want the caller to free the node
+ */
+ if (!delete_node) {
+ /*
+ * The caller uses ref->node to determine
+ * whether the node needs to be freed. Clear
+ * it since the node is still alive.
+ */
+ ref->node = NULL;
+ }
+
if (ref->death) {
binder_debug(BINDER_DEBUG_DEAD_BINDER,
"%d delete ref %d desc %d has death notification\n",
- ref->proc->pid, ref->debug_id, ref->desc);
- list_del(&ref->death->work.entry);
- kfree(ref->death);
+ ref->proc->pid, ref->data.debug_id,
+ ref->data.desc);
+ binder_dequeue_work(ref->proc, &ref->death->work);
binder_stats_deleted(BINDER_STAT_DEATH);
}
- kfree(ref);
binder_stats_deleted(BINDER_STAT_REF);
}
-static int binder_inc_ref(struct binder_ref *ref, int strong,
- struct list_head *target_list)
+/**
+ * binder_inc_ref_olocked() - increment the ref for given handle
+ * @ref: ref to be incremented
+ * @strong: if true, strong increment, else weak
+ * @target_list: list to queue node work on
+ *
+ * Increment the ref. @ref->proc->outer_lock must be held on entry
+ *
+ * Return: 0, if successful, else errno
+ */
+static int binder_inc_ref_olocked(struct binder_ref *ref, int strong,
+ struct list_head *target_list)
{
int ret;
if (strong) {
- if (ref->strong == 0) {
+ if (ref->data.strong == 0) {
ret = binder_inc_node(ref->node, 1, 1, target_list);
if (ret)
return ret;
}
- ref->strong++;
+ ref->data.strong++;
} else {
- if (ref->weak == 0) {
+ if (ref->data.weak == 0) {
ret = binder_inc_node(ref->node, 0, 1, target_list);
if (ret)
return ret;
}
- ref->weak++;
+ ref->data.weak++;
}
return 0;
}
-
-static int binder_dec_ref(struct binder_ref *ref, int strong)
+/**
+ * binder_dec_ref() - dec the ref for given handle
+ * @ref: ref to be decremented
+ * @strong: if true, strong decrement, else weak
+ *
+ * Decrement the ref.
+ *
+ * Return: true if ref is cleaned up and ready to be freed
+ */
+static bool binder_dec_ref_olocked(struct binder_ref *ref, int strong)
{
if (strong) {
- if (ref->strong == 0) {
+ if (ref->data.strong == 0) {
binder_user_error("%d invalid dec strong, ref %d desc %d s %d w %d\n",
- ref->proc->pid, ref->debug_id,
- ref->desc, ref->strong, ref->weak);
- return -EINVAL;
- }
- ref->strong--;
- if (ref->strong == 0) {
- int ret;
-
- ret = binder_dec_node(ref->node, strong, 1);
- if (ret)
- return ret;
+ ref->proc->pid, ref->data.debug_id,
+ ref->data.desc, ref->data.strong,
+ ref->data.weak);
+ return false;
}
+ ref->data.strong--;
+ if (ref->data.strong == 0)
+ binder_dec_node(ref->node, strong, 1);
} else {
- if (ref->weak == 0) {
+ if (ref->data.weak == 0) {
binder_user_error("%d invalid dec weak, ref %d desc %d s %d w %d\n",
- ref->proc->pid, ref->debug_id,
- ref->desc, ref->strong, ref->weak);
- return -EINVAL;
+ ref->proc->pid, ref->data.debug_id,
+ ref->data.desc, ref->data.strong,
+ ref->data.weak);
+ return false;
}
- ref->weak--;
+ ref->data.weak--;
}
- if (ref->strong == 0 && ref->weak == 0)
- binder_delete_ref(ref);
- return 0;
+ if (ref->data.strong == 0 && ref->data.weak == 0) {
+ binder_cleanup_ref_olocked(ref);
+ return true;
+ }
+ return false;
}
-static void binder_pop_transaction(struct binder_thread *target_thread,
- struct binder_transaction *t)
+/**
+ * binder_get_node_from_ref() - get the node from the given proc/desc
+ * @proc: proc containing the ref
+ * @desc: the handle associated with the ref
+ * @need_strong_ref: if true, only return node if ref is strong
+ * @rdata: the id/refcount data for the ref
+ *
+ * Given a proc and ref handle, return the associated binder_node
+ *
+ * Return: a binder_node or NULL if not found or not strong when strong required
+ */
+static struct binder_node *binder_get_node_from_ref(
+ struct binder_proc *proc,
+ u32 desc, bool need_strong_ref,
+ struct binder_ref_data *rdata)
{
- if (target_thread) {
- BUG_ON(target_thread->transaction_stack != t);
- BUG_ON(target_thread->transaction_stack->from != target_thread);
- target_thread->transaction_stack =
- target_thread->transaction_stack->from_parent;
- t->from = NULL;
+ struct binder_node *node;
+ struct binder_ref *ref;
+
+ binder_proc_lock(proc);
+ ref = binder_get_ref_olocked(proc, desc, need_strong_ref);
+ if (!ref)
+ goto err_no_ref;
+ node = ref->node;
+ /*
+ * Take an implicit reference on the node to ensure
+ * it stays alive until the call to binder_put_node()
+ */
+ binder_inc_node_tmpref(node);
+ if (rdata)
+ *rdata = ref->data;
+ binder_proc_unlock(proc);
+
+ return node;
+
+err_no_ref:
+ binder_proc_unlock(proc);
+ return NULL;
+}
+
+/**
+ * binder_free_ref() - free the binder_ref
+ * @ref: ref to free
+ *
+ * Free the binder_ref. Free the binder_node indicated by ref->node
+ * (if non-NULL) and the binder_ref_death indicated by ref->death.
+ */
+static void binder_free_ref(struct binder_ref *ref)
+{
+ if (ref->node)
+ binder_free_node(ref->node);
+ kfree(ref->death);
+ kfree(ref);
+}
+
+/**
+ * binder_update_ref_for_handle() - inc/dec the ref for given handle
+ * @proc: proc containing the ref
+ * @desc: the handle associated with the ref
+ * @increment: true=inc reference, false=dec reference
+ * @strong: true=strong reference, false=weak reference
+ * @rdata: the id/refcount data for the ref
+ *
+ * Given a proc and ref handle, increment or decrement the ref
+ * according to "increment" arg.
+ *
+ * Return: 0 if successful, else errno
+ */
+static int binder_update_ref_for_handle(struct binder_proc *proc,
+ uint32_t desc, bool increment, bool strong,
+ struct binder_ref_data *rdata)
+{
+ int ret = 0;
+ struct binder_ref *ref;
+ bool delete_ref = false;
+
+ binder_proc_lock(proc);
+ ref = binder_get_ref_olocked(proc, desc, strong);
+ if (!ref) {
+ ret = -EINVAL;
+ goto err_no_ref;
}
- t->need_reply = 0;
+ if (increment)
+ ret = binder_inc_ref_olocked(ref, strong, NULL);
+ else
+ delete_ref = binder_dec_ref_olocked(ref, strong);
+
+ if (rdata)
+ *rdata = ref->data;
+ binder_proc_unlock(proc);
+
+ if (delete_ref)
+ binder_free_ref(ref);
+ return ret;
+
+err_no_ref:
+ binder_proc_unlock(proc);
+ return ret;
+}
+
+/**
+ * binder_dec_ref_for_handle() - dec the ref for given handle
+ * @proc: proc containing the ref
+ * @desc: the handle associated with the ref
+ * @strong: true=strong reference, false=weak reference
+ * @rdata: the id/refcount data for the ref
+ *
+ * Just calls binder_update_ref_for_handle() to decrement the ref.
+ *
+ * Return: 0 if successful, else errno
+ */
+static int binder_dec_ref_for_handle(struct binder_proc *proc,
+ uint32_t desc, bool strong, struct binder_ref_data *rdata)
+{
+ return binder_update_ref_for_handle(proc, desc, false, strong, rdata);
+}
+
+
+/**
+ * binder_inc_ref_for_node() - increment the ref for given proc/node
+ * @proc: proc containing the ref
+ * @node: target node
+ * @strong: true=strong reference, false=weak reference
+ * @target_list: worklist to use if node is incremented
+ * @rdata: the id/refcount data for the ref
+ *
+ * Given a proc and node, increment the ref. Create the ref if it
+ * doesn't already exist
+ *
+ * Return: 0 if successful, else errno
+ */
+static int binder_inc_ref_for_node(struct binder_proc *proc,
+ struct binder_node *node,
+ bool strong,
+ struct list_head *target_list,
+ struct binder_ref_data *rdata)
+{
+ struct binder_ref *ref;
+ struct binder_ref *new_ref = NULL;
+ int ret = 0;
+
+ binder_proc_lock(proc);
+ ref = binder_get_ref_for_node_olocked(proc, node, NULL);
+ if (!ref) {
+ binder_proc_unlock(proc);
+ new_ref = kzalloc(sizeof(*ref), GFP_KERNEL);
+ if (!new_ref)
+ return -ENOMEM;
+ binder_proc_lock(proc);
+ ref = binder_get_ref_for_node_olocked(proc, node, new_ref);
+ }
+ ret = binder_inc_ref_olocked(ref, strong, target_list);
+ *rdata = ref->data;
+ binder_proc_unlock(proc);
+ if (new_ref && ref != new_ref)
+ /*
+ * Another thread created the ref first so
+ * free the one we allocated
+ */
+ kfree(new_ref);
+ return ret;
+}
+
+static void binder_pop_transaction_ilocked(struct binder_thread *target_thread,
+ struct binder_transaction *t)
+{
+ BUG_ON(!target_thread);
+ assert_spin_locked(&target_thread->proc->inner_lock);
+ BUG_ON(target_thread->transaction_stack != t);
+ BUG_ON(target_thread->transaction_stack->from != target_thread);
+ target_thread->transaction_stack =
+ target_thread->transaction_stack->from_parent;
+ t->from = NULL;
+}
+
+/**
+ * binder_thread_dec_tmpref() - decrement thread->tmp_ref
+ * @thread: thread to decrement
+ *
+ * A thread needs to be kept alive while being used to create or
+ * handle a transaction. binder_get_txn_from() is used to safely
+ * extract t->from from a binder_transaction and keep the thread
+ * indicated by t->from from being freed. When done with that
+ * binder_thread, this function is called to decrement the
+ * tmp_ref and free if appropriate (thread has been released
+ * and no transaction being processed by the driver)
+ */
+static void binder_thread_dec_tmpref(struct binder_thread *thread)
+{
+ /*
+ * atomic is used to protect the counter value while
+ * it cannot reach zero or thread->is_dead is false
+ */
+ binder_inner_proc_lock(thread->proc);
+ atomic_dec(&thread->tmp_ref);
+ if (thread->is_dead && !atomic_read(&thread->tmp_ref)) {
+ binder_inner_proc_unlock(thread->proc);
+ binder_free_thread(thread);
+ return;
+ }
+ binder_inner_proc_unlock(thread->proc);
+}
+
+/**
+ * binder_proc_dec_tmpref() - decrement proc->tmp_ref
+ * @proc: proc to decrement
+ *
+ * A binder_proc needs to be kept alive while being used to create or
+ * handle a transaction. proc->tmp_ref is incremented when
+ * creating a new transaction or the binder_proc is currently in-use
+ * by threads that are being released. When done with the binder_proc,
+ * this function is called to decrement the counter and free the
+ * proc if appropriate (proc has been released, all threads have
+ * been released and not currenly in-use to process a transaction).
+ */
+static void binder_proc_dec_tmpref(struct binder_proc *proc)
+{
+ binder_inner_proc_lock(proc);
+ proc->tmp_ref--;
+ if (proc->is_dead && RB_EMPTY_ROOT(&proc->threads) &&
+ !proc->tmp_ref) {
+ binder_inner_proc_unlock(proc);
+ binder_free_proc(proc);
+ return;
+ }
+ binder_inner_proc_unlock(proc);
+}
+
+/**
+ * binder_get_txn_from() - safely extract the "from" thread in transaction
+ * @t: binder transaction for t->from
+ *
+ * Atomically return the "from" thread and increment the tmp_ref
+ * count for the thread to ensure it stays alive until
+ * binder_thread_dec_tmpref() is called.
+ *
+ * Return: the value of t->from
+ */
+static struct binder_thread *binder_get_txn_from(
+ struct binder_transaction *t)
+{
+ struct binder_thread *from;
+
+ spin_lock(&t->lock);
+ from = t->from;
+ if (from)
+ atomic_inc(&from->tmp_ref);
+ spin_unlock(&t->lock);
+ return from;
+}
+
+/**
+ * binder_get_txn_from_and_acq_inner() - get t->from and acquire inner lock
+ * @t: binder transaction for t->from
+ *
+ * Same as binder_get_txn_from() except it also acquires the proc->inner_lock
+ * to guarantee that the thread cannot be released while operating on it.
+ * The caller must call binder_inner_proc_unlock() to release the inner lock
+ * as well as call binder_dec_thread_txn() to release the reference.
+ *
+ * Return: the value of t->from
+ */
+static struct binder_thread *binder_get_txn_from_and_acq_inner(
+ struct binder_transaction *t)
+{
+ struct binder_thread *from;
+
+ from = binder_get_txn_from(t);
+ if (!from)
+ return NULL;
+ binder_inner_proc_lock(from->proc);
+ if (t->from) {
+ BUG_ON(from != t->from);
+ return from;
+ }
+ binder_inner_proc_unlock(from->proc);
+ binder_thread_dec_tmpref(from);
+ return NULL;
+}
+
+static void binder_free_transaction(struct binder_transaction *t)
+{
if (t->buffer)
t->buffer->transaction = NULL;
kfree(t);
@@ -1207,30 +2126,34 @@ static void binder_send_failed_reply(struct binder_transaction *t,
BUG_ON(t->flags & TF_ONE_WAY);
while (1) {
- target_thread = t->from;
+ target_thread = binder_get_txn_from_and_acq_inner(t);
if (target_thread) {
- if (target_thread->return_error != BR_OK &&
- target_thread->return_error2 == BR_OK) {
- target_thread->return_error2 =
- target_thread->return_error;
- target_thread->return_error = BR_OK;
- }
- if (target_thread->return_error == BR_OK) {
- binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
- "send failed reply for transaction %d to %d:%d\n",
- t->debug_id,
- target_thread->proc->pid,
- target_thread->pid);
-
- binder_pop_transaction(target_thread, t);
- target_thread->return_error = error_code;
+ binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
+ "send failed reply for transaction %d to %d:%d\n",
+ t->debug_id,
+ target_thread->proc->pid,
+ target_thread->pid);
+
+ binder_pop_transaction_ilocked(target_thread, t);
+ if (target_thread->reply_error.cmd == BR_OK) {
+ target_thread->reply_error.cmd = error_code;
+ binder_enqueue_thread_work_ilocked(
+ target_thread,
+ &target_thread->reply_error.work);
wake_up_interruptible(&target_thread->wait);
} else {
- pr_err("reply failed, target thread, %d:%d, has error code %d already\n",
- target_thread->proc->pid,
- target_thread->pid,
- target_thread->return_error);
+ /*
+ * Cannot get here for normal operation, but
+ * we can if multiple synchronous transactions
+ * are sent without blocking for responses.
+ * Just ignore the 2nd error in this case.
+ */
+ pr_warn("Unexpected reply error: %u\n",
+ target_thread->reply_error.cmd);
}
+ binder_inner_proc_unlock(target_thread->proc);
+ binder_thread_dec_tmpref(target_thread);
+ binder_free_transaction(t);
return;
}
next = t->from_parent;
@@ -1239,7 +2162,7 @@ static void binder_send_failed_reply(struct binder_transaction *t,
"send failed reply for transaction %d, target dead\n",
t->debug_id);
- binder_pop_transaction(target_thread, t);
+ binder_free_transaction(t);
if (next == NULL) {
binder_debug(BINDER_DEBUG_DEAD_BINDER,
"reply failed, no target thread at root\n");
@@ -1252,43 +2175,212 @@ static void binder_send_failed_reply(struct binder_transaction *t,
}
}
+/**
+ * binder_cleanup_transaction() - cleans up undelivered transaction
+ * @t: transaction that needs to be cleaned up
+ * @reason: reason the transaction wasn't delivered
+ * @error_code: error to return to caller (if synchronous call)
+ */
+static void binder_cleanup_transaction(struct binder_transaction *t,
+ const char *reason,
+ uint32_t error_code)
+{
+ if (t->buffer->target_node && !(t->flags & TF_ONE_WAY)) {
+ binder_send_failed_reply(t, error_code);
+ } else {
+ binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
+ "undelivered transaction %d, %s\n",
+ t->debug_id, reason);
+ binder_free_transaction(t);
+ }
+}
+
+/**
+ * binder_validate_object() - checks for a valid metadata object in a buffer.
+ * @buffer: binder_buffer that we're parsing.
+ * @offset: offset in the buffer at which to validate an object.
+ *
+ * Return: If there's a valid metadata object at @offset in @buffer, the
+ * size of that object. Otherwise, it returns zero.
+ */
+static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset)
+{
+ /* Check if we can read a header first */
+ struct binder_object_header *hdr;
+ size_t object_size = 0;
+
+ if (buffer->data_size < sizeof(*hdr) ||
+ offset > buffer->data_size - sizeof(*hdr) ||
+ !IS_ALIGNED(offset, sizeof(u32)))
+ return 0;
+
+ /* Ok, now see if we can read a complete object. */
+ hdr = (struct binder_object_header *)(buffer->data + offset);
+ switch (hdr->type) {
+ case BINDER_TYPE_BINDER:
+ case BINDER_TYPE_WEAK_BINDER:
+ case BINDER_TYPE_HANDLE:
+ case BINDER_TYPE_WEAK_HANDLE:
+ object_size = sizeof(struct flat_binder_object);
+ break;
+ case BINDER_TYPE_FD:
+ object_size = sizeof(struct binder_fd_object);
+ break;
+ case BINDER_TYPE_PTR:
+ object_size = sizeof(struct binder_buffer_object);
+ break;
+ case BINDER_TYPE_FDA:
+ object_size = sizeof(struct binder_fd_array_object);
+ break;
+ default:
+ return 0;
+ }
+ if (offset <= buffer->data_size - object_size &&
+ buffer->data_size >= object_size)
+ return object_size;
+ else
+ return 0;
+}
+
+/**
+ * binder_validate_ptr() - validates binder_buffer_object in a binder_buffer.
+ * @b: binder_buffer containing the object
+ * @index: index in offset array at which the binder_buffer_object is
+ * located
+ * @start: points to the start of the offset array
+ * @num_valid: the number of valid offsets in the offset array
+ *
+ * Return: If @index is within the valid range of the offset array
+ * described by @start and @num_valid, and if there's a valid
+ * binder_buffer_object at the offset found in index @index
+ * of the offset array, that object is returned. Otherwise,
+ * %NULL is returned.
+ * Note that the offset found in index @index itself is not
+ * verified; this function assumes that @num_valid elements
+ * from @start were previously verified to have valid offsets.
+ */
+static struct binder_buffer_object *binder_validate_ptr(struct binder_buffer *b,
+ binder_size_t index,
+ binder_size_t *start,
+ binder_size_t num_valid)
+{
+ struct binder_buffer_object *buffer_obj;
+ binder_size_t *offp;
+
+ if (index >= num_valid)
+ return NULL;
+
+ offp = start + index;
+ buffer_obj = (struct binder_buffer_object *)(b->data + *offp);
+ if (buffer_obj->hdr.type != BINDER_TYPE_PTR)
+ return NULL;
+
+ return buffer_obj;
+}
+
+/**
+ * binder_validate_fixup() - validates pointer/fd fixups happen in order.
+ * @b: transaction buffer
+ * @objects_start start of objects buffer
+ * @buffer: binder_buffer_object in which to fix up
+ * @offset: start offset in @buffer to fix up
+ * @last_obj: last binder_buffer_object that we fixed up in
+ * @last_min_offset: minimum fixup offset in @last_obj
+ *
+ * Return: %true if a fixup in buffer @buffer at offset @offset is
+ * allowed.
+ *
+ * For safety reasons, we only allow fixups inside a buffer to happen
+ * at increasing offsets; additionally, we only allow fixup on the last
+ * buffer object that was verified, or one of its parents.
+ *
+ * Example of what is allowed:
+ *
+ * A
+ * B (parent = A, offset = 0)
+ * C (parent = A, offset = 16)
+ * D (parent = C, offset = 0)
+ * E (parent = A, offset = 32) // min_offset is 16 (C.parent_offset)
+ *
+ * Examples of what is not allowed:
+ *
+ * Decreasing offsets within the same parent:
+ * A
+ * C (parent = A, offset = 16)
+ * B (parent = A, offset = 0) // decreasing offset within A
+ *
+ * Referring to a parent that wasn't the last object or any of its parents:
+ * A
+ * B (parent = A, offset = 0)
+ * C (parent = A, offset = 0)
+ * C (parent = A, offset = 16)
+ * D (parent = B, offset = 0) // B is not A or any of A's parents
+ */
+static bool binder_validate_fixup(struct binder_buffer *b,
+ binder_size_t *objects_start,
+ struct binder_buffer_object *buffer,
+ binder_size_t fixup_offset,
+ struct binder_buffer_object *last_obj,
+ binder_size_t last_min_offset)
+{
+ if (!last_obj) {
+ /* Nothing to fix up in */
+ return false;
+ }
+
+ while (last_obj != buffer) {
+ /*
+ * Safe to retrieve the parent of last_obj, since it
+ * was already previously verified by the driver.
+ */
+ if ((last_obj->flags & BINDER_BUFFER_FLAG_HAS_PARENT) == 0)
+ return false;
+ last_min_offset = last_obj->parent_offset + sizeof(uintptr_t);
+ last_obj = (struct binder_buffer_object *)
+ (b->data + *(objects_start + last_obj->parent));
+ }
+ return (fixup_offset >= last_min_offset);
+}
+
static void binder_transaction_buffer_release(struct binder_proc *proc,
struct binder_buffer *buffer,
binder_size_t *failed_at)
{
- binder_size_t *offp, *off_end;
+ binder_size_t *offp, *off_start, *off_end;
int debug_id = buffer->debug_id;
binder_debug(BINDER_DEBUG_TRANSACTION,
- "%d buffer release %d, size %zd-%zd, failed at %p\n",
+ "%d buffer release %d, size %zd-%zd, failed at %pK\n",
proc->pid, buffer->debug_id,
buffer->data_size, buffer->offsets_size, failed_at);
if (buffer->target_node)
binder_dec_node(buffer->target_node, 1, 0);
- offp = (binder_size_t *)(buffer->data +
- ALIGN(buffer->data_size, sizeof(void *)));
+ off_start = (binder_size_t *)(buffer->data +
+ ALIGN(buffer->data_size, sizeof(void *)));
if (failed_at)
off_end = failed_at;
else
- off_end = (void *)offp + buffer->offsets_size;
- for (; offp < off_end; offp++) {
- struct flat_binder_object *fp;
+ off_end = (void *)off_start + buffer->offsets_size;
+ for (offp = off_start; offp < off_end; offp++) {
+ struct binder_object_header *hdr;
+ size_t object_size = binder_validate_object(buffer, *offp);
- if (*offp > buffer->data_size - sizeof(*fp) ||
- buffer->data_size < sizeof(*fp) ||
- !IS_ALIGNED(*offp, sizeof(u32))) {
- pr_err("transaction release %d bad offset %lld, size %zd\n",
+ if (object_size == 0) {
+ pr_err("transaction release %d bad object at offset %lld, size %zd\n",
debug_id, (u64)*offp, buffer->data_size);
continue;
}
- fp = (struct flat_binder_object *)(buffer->data + *offp);
- switch (fp->type) {
+ hdr = (struct binder_object_header *)(buffer->data + *offp);
+ switch (hdr->type) {
case BINDER_TYPE_BINDER:
case BINDER_TYPE_WEAK_BINDER: {
- struct binder_node *node = binder_get_node(proc, fp->binder);
+ struct flat_binder_object *fp;
+ struct binder_node *node;
+ fp = to_flat_binder_object(hdr);
+ node = binder_get_node(proc, fp->binder);
if (node == NULL) {
pr_err("transaction release %d bad node %016llx\n",
debug_id, (u64)fp->binder);
@@ -1297,90 +2389,560 @@ static void binder_transaction_buffer_release(struct binder_proc *proc,
binder_debug(BINDER_DEBUG_TRANSACTION,
" node %d u%016llx\n",
node->debug_id, (u64)node->ptr);
- binder_dec_node(node, fp->type == BINDER_TYPE_BINDER, 0);
+ binder_dec_node(node, hdr->type == BINDER_TYPE_BINDER,
+ 0);
+ binder_put_node(node);
} break;
case BINDER_TYPE_HANDLE:
case BINDER_TYPE_WEAK_HANDLE: {
- struct binder_ref *ref;
+ struct flat_binder_object *fp;
+ struct binder_ref_data rdata;
+ int ret;
- ref = binder_get_ref(proc, fp->handle,
- fp->type == BINDER_TYPE_HANDLE);
+ fp = to_flat_binder_object(hdr);
+ ret = binder_dec_ref_for_handle(proc, fp->handle,
+ hdr->type == BINDER_TYPE_HANDLE, &rdata);
- if (ref == NULL) {
- pr_err("transaction release %d bad handle %d\n",
- debug_id, fp->handle);
+ if (ret) {
+ pr_err("transaction release %d bad handle %d, ret = %d\n",
+ debug_id, fp->handle, ret);
break;
}
binder_debug(BINDER_DEBUG_TRANSACTION,
- " ref %d desc %d (node %d)\n",
- ref->debug_id, ref->desc, ref->node->debug_id);
- binder_dec_ref(ref, fp->type == BINDER_TYPE_HANDLE);
+ " ref %d desc %d\n",
+ rdata.debug_id, rdata.desc);
} break;
- case BINDER_TYPE_FD:
+ case BINDER_TYPE_FD: {
+ struct binder_fd_object *fp = to_binder_fd_object(hdr);
+
binder_debug(BINDER_DEBUG_TRANSACTION,
- " fd %d\n", fp->handle);
+ " fd %d\n", fp->fd);
if (failed_at)
- task_close_fd(proc, fp->handle);
+ task_close_fd(proc, fp->fd);
+ } break;
+ case BINDER_TYPE_PTR:
+ /*
+ * Nothing to do here, this will get cleaned up when the
+ * transaction buffer gets freed
+ */
break;
-
+ case BINDER_TYPE_FDA: {
+ struct binder_fd_array_object *fda;
+ struct binder_buffer_object *parent;
+ uintptr_t parent_buffer;
+ u32 *fd_array;
+ size_t fd_index;
+ binder_size_t fd_buf_size;
+
+ fda = to_binder_fd_array_object(hdr);
+ parent = binder_validate_ptr(buffer, fda->parent,
+ off_start,
+ offp - off_start);
+ if (!parent) {
+ pr_err("transaction release %d bad parent offset",
+ debug_id);
+ continue;
+ }
+ /*
+ * Since the parent was already fixed up, convert it
+ * back to kernel address space to access it
+ */
+ parent_buffer = parent->buffer -
+ binder_alloc_get_user_buffer_offset(
+ &proc->alloc);
+
+ fd_buf_size = sizeof(u32) * fda->num_fds;
+ if (fda->num_fds >= SIZE_MAX / sizeof(u32)) {
+ pr_err("transaction release %d invalid number of fds (%lld)\n",
+ debug_id, (u64)fda->num_fds);
+ continue;
+ }
+ if (fd_buf_size > parent->length ||
+ fda->parent_offset > parent->length - fd_buf_size) {
+ /* No space for all file descriptors here. */
+ pr_err("transaction release %d not enough space for %lld fds in buffer\n",
+ debug_id, (u64)fda->num_fds);
+ continue;
+ }
+ fd_array = (u32 *)(parent_buffer + (uintptr_t)fda->parent_offset);
+ for (fd_index = 0; fd_index < fda->num_fds; fd_index++)
+ task_close_fd(proc, fd_array[fd_index]);
+ } break;
default:
pr_err("transaction release %d bad object type %x\n",
- debug_id, fp->type);
+ debug_id, hdr->type);
break;
}
}
}
+static int binder_translate_binder(struct flat_binder_object *fp,
+ struct binder_transaction *t,
+ struct binder_thread *thread)
+{
+ struct binder_node *node;
+ struct binder_proc *proc = thread->proc;
+ struct binder_proc *target_proc = t->to_proc;
+ struct binder_ref_data rdata;
+ int ret = 0;
+
+ node = binder_get_node(proc, fp->binder);
+ if (!node) {
+ node = binder_new_node(proc, fp);
+ if (!node)
+ return -ENOMEM;
+ }
+ if (fp->cookie != node->cookie) {
+ binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n",
+ proc->pid, thread->pid, (u64)fp->binder,
+ node->debug_id, (u64)fp->cookie,
+ (u64)node->cookie);
+ ret = -EINVAL;
+ goto done;
+ }
+ if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) {
+ ret = -EPERM;
+ goto done;
+ }
+
+ ret = binder_inc_ref_for_node(target_proc, node,
+ fp->hdr.type == BINDER_TYPE_BINDER,
+ &thread->todo, &rdata);
+ if (ret)
+ goto done;
+
+ if (fp->hdr.type == BINDER_TYPE_BINDER)
+ fp->hdr.type = BINDER_TYPE_HANDLE;
+ else
+ fp->hdr.type = BINDER_TYPE_WEAK_HANDLE;
+ fp->binder = 0;
+ fp->handle = rdata.desc;
+ fp->cookie = 0;
+
+ trace_binder_transaction_node_to_ref(t, node, &rdata);
+ binder_debug(BINDER_DEBUG_TRANSACTION,
+ " node %d u%016llx -> ref %d desc %d\n",
+ node->debug_id, (u64)node->ptr,
+ rdata.debug_id, rdata.desc);
+done:
+ binder_put_node(node);
+ return ret;
+}
+
+static int binder_translate_handle(struct flat_binder_object *fp,
+ struct binder_transaction *t,
+ struct binder_thread *thread)
+{
+ struct binder_proc *proc = thread->proc;
+ struct binder_proc *target_proc = t->to_proc;
+ struct binder_node *node;
+ struct binder_ref_data src_rdata;
+ int ret = 0;
+
+ node = binder_get_node_from_ref(proc, fp->handle,
+ fp->hdr.type == BINDER_TYPE_HANDLE, &src_rdata);
+ if (!node) {
+ binder_user_error("%d:%d got transaction with invalid handle, %d\n",
+ proc->pid, thread->pid, fp->handle);
+ return -EINVAL;
+ }
+ if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) {
+ ret = -EPERM;
+ goto done;
+ }
+
+ binder_node_lock(node);
+ if (node->proc == target_proc) {
+ if (fp->hdr.type == BINDER_TYPE_HANDLE)
+ fp->hdr.type = BINDER_TYPE_BINDER;
+ else
+ fp->hdr.type = BINDER_TYPE_WEAK_BINDER;
+ fp->binder = node->ptr;
+ fp->cookie = node->cookie;
+ if (node->proc)
+ binder_inner_proc_lock(node->proc);
+ binder_inc_node_nilocked(node,
+ fp->hdr.type == BINDER_TYPE_BINDER,
+ 0, NULL);
+ if (node->proc)
+ binder_inner_proc_unlock(node->proc);
+ trace_binder_transaction_ref_to_node(t, node, &src_rdata);
+ binder_debug(BINDER_DEBUG_TRANSACTION,
+ " ref %d desc %d -> node %d u%016llx\n",
+ src_rdata.debug_id, src_rdata.desc, node->debug_id,
+ (u64)node->ptr);
+ binder_node_unlock(node);
+ } else {
+ struct binder_ref_data dest_rdata;
+
+ binder_node_unlock(node);
+ ret = binder_inc_ref_for_node(target_proc, node,
+ fp->hdr.type == BINDER_TYPE_HANDLE,
+ NULL, &dest_rdata);
+ if (ret)
+ goto done;
+
+ fp->binder = 0;
+ fp->handle = dest_rdata.desc;
+ fp->cookie = 0;
+ trace_binder_transaction_ref_to_ref(t, node, &src_rdata,
+ &dest_rdata);
+ binder_debug(BINDER_DEBUG_TRANSACTION,
+ " ref %d desc %d -> ref %d desc %d (node %d)\n",
+ src_rdata.debug_id, src_rdata.desc,
+ dest_rdata.debug_id, dest_rdata.desc,
+ node->debug_id);
+ }
+done:
+ binder_put_node(node);
+ return ret;
+}
+
+static int binder_translate_fd(int fd,
+ struct binder_transaction *t,
+ struct binder_thread *thread,
+ struct binder_transaction *in_reply_to)
+{
+ struct binder_proc *proc = thread->proc;
+ struct binder_proc *target_proc = t->to_proc;
+ int target_fd;
+ struct file *file;
+ int ret;
+ bool target_allows_fd;
+
+ if (in_reply_to)
+ target_allows_fd = !!(in_reply_to->flags & TF_ACCEPT_FDS);
+ else
+ target_allows_fd = t->buffer->target_node->accept_fds;
+ if (!target_allows_fd) {
+ binder_user_error("%d:%d got %s with fd, %d, but target does not allow fds\n",
+ proc->pid, thread->pid,
+ in_reply_to ? "reply" : "transaction",
+ fd);
+ ret = -EPERM;
+ goto err_fd_not_accepted;
+ }
+
+ file = fget(fd);
+ if (!file) {
+ binder_user_error("%d:%d got transaction with invalid fd, %d\n",
+ proc->pid, thread->pid, fd);
+ ret = -EBADF;
+ goto err_fget;
+ }
+ ret = security_binder_transfer_file(proc->tsk, target_proc->tsk, file);
+ if (ret < 0) {
+ ret = -EPERM;
+ goto err_security;
+ }
+
+ target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC);
+ if (target_fd < 0) {
+ ret = -ENOMEM;
+ goto err_get_unused_fd;
+ }
+ task_fd_install(target_proc, target_fd, file);
+ trace_binder_transaction_fd(t, fd, target_fd);
+ binder_debug(BINDER_DEBUG_TRANSACTION, " fd %d -> %d\n",
+ fd, target_fd);
+
+ return target_fd;
+
+err_get_unused_fd:
+err_security:
+ fput(file);
+err_fget:
+err_fd_not_accepted:
+ return ret;
+}
+
+static int binder_translate_fd_array(struct binder_fd_array_object *fda,
+ struct binder_buffer_object *parent,
+ struct binder_transaction *t,
+ struct binder_thread *thread,
+ struct binder_transaction *in_reply_to)
+{
+ binder_size_t fdi, fd_buf_size, num_installed_fds;
+ int target_fd;
+ uintptr_t parent_buffer;
+ u32 *fd_array;
+ struct binder_proc *proc = thread->proc;
+ struct binder_proc *target_proc = t->to_proc;
+
+ fd_buf_size = sizeof(u32) * fda->num_fds;
+ if (fda->num_fds >= SIZE_MAX / sizeof(u32)) {
+ binder_user_error("%d:%d got transaction with invalid number of fds (%lld)\n",
+ proc->pid, thread->pid, (u64)fda->num_fds);
+ return -EINVAL;
+ }
+ if (fd_buf_size > parent->length ||
+ fda->parent_offset > parent->length - fd_buf_size) {
+ /* No space for all file descriptors here. */
+ binder_user_error("%d:%d not enough space to store %lld fds in buffer\n",
+ proc->pid, thread->pid, (u64)fda->num_fds);
+ return -EINVAL;
+ }
+ /*
+ * Since the parent was already fixed up, convert it
+ * back to the kernel address space to access it
+ */
+ parent_buffer = parent->buffer -
+ binder_alloc_get_user_buffer_offset(&target_proc->alloc);
+ fd_array = (u32 *)(parent_buffer + (uintptr_t)fda->parent_offset);
+ if (!IS_ALIGNED((unsigned long)fd_array, sizeof(u32))) {
+ binder_user_error("%d:%d parent offset not aligned correctly.\n",
+ proc->pid, thread->pid);
+ return -EINVAL;
+ }
+ for (fdi = 0; fdi < fda->num_fds; fdi++) {
+ target_fd = binder_translate_fd(fd_array[fdi], t, thread,
+ in_reply_to);
+ if (target_fd < 0)
+ goto err_translate_fd_failed;
+ fd_array[fdi] = target_fd;
+ }
+ return 0;
+
+err_translate_fd_failed:
+ /*
+ * Failed to allocate fd or security error, free fds
+ * installed so far.
+ */
+ num_installed_fds = fdi;
+ for (fdi = 0; fdi < num_installed_fds; fdi++)
+ task_close_fd(target_proc, fd_array[fdi]);
+ return target_fd;
+}
+
+static int binder_fixup_parent(struct binder_transaction *t,
+ struct binder_thread *thread,
+ struct binder_buffer_object *bp,
+ binder_size_t *off_start,
+ binder_size_t num_valid,
+ struct binder_buffer_object *last_fixup_obj,
+ binder_size_t last_fixup_min_off)
+{
+ struct binder_buffer_object *parent;
+ u8 *parent_buffer;
+ struct binder_buffer *b = t->buffer;
+ struct binder_proc *proc = thread->proc;
+ struct binder_proc *target_proc = t->to_proc;
+
+ if (!(bp->flags & BINDER_BUFFER_FLAG_HAS_PARENT))
+ return 0;
+
+ parent = binder_validate_ptr(b, bp->parent, off_start, num_valid);
+ if (!parent) {
+ binder_user_error("%d:%d got transaction with invalid parent offset or type\n",
+ proc->pid, thread->pid);
+ return -EINVAL;
+ }
+
+ if (!binder_validate_fixup(b, off_start,
+ parent, bp->parent_offset,
+ last_fixup_obj,
+ last_fixup_min_off)) {
+ binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n",
+ proc->pid, thread->pid);
+ return -EINVAL;
+ }
+
+ if (parent->length < sizeof(binder_uintptr_t) ||
+ bp->parent_offset > parent->length - sizeof(binder_uintptr_t)) {
+ /* No space for a pointer here! */
+ binder_user_error("%d:%d got transaction with invalid parent offset\n",
+ proc->pid, thread->pid);
+ return -EINVAL;
+ }
+ parent_buffer = (u8 *)((uintptr_t)parent->buffer -
+ binder_alloc_get_user_buffer_offset(
+ &target_proc->alloc));
+ *(binder_uintptr_t *)(parent_buffer + bp->parent_offset) = bp->buffer;
+
+ return 0;
+}
+
+/**
+ * binder_proc_transaction() - sends a transaction to a process and wakes it up
+ * @t: transaction to send
+ * @proc: process to send the transaction to
+ * @thread: thread in @proc to send the transaction to (may be NULL)
+ *
+ * This function queues a transaction to the specified process. It will try
+ * to find a thread in the target process to handle the transaction and
+ * wake it up. If no thread is found, the work is queued to the proc
+ * waitqueue.
+ *
+ * If the @thread parameter is not NULL, the transaction is always queued
+ * to the waitlist of that specific thread.
+ *
+ * Return: true if the transactions was successfully queued
+ * false if the target process or thread is dead
+ */
+static bool binder_proc_transaction(struct binder_transaction *t,
+ struct binder_proc *proc,
+ struct binder_thread *thread)
+{
+ struct binder_node *node = t->buffer->target_node;
+ struct binder_priority node_prio;
+ bool oneway = !!(t->flags & TF_ONE_WAY);
+ bool pending_async = false;
+
+ BUG_ON(!node);
+ binder_node_lock(node);
+ node_prio.prio = node->min_priority;
+ node_prio.sched_policy = node->sched_policy;
+
+ if (oneway) {
+ BUG_ON(thread);
+ if (node->has_async_transaction) {
+ pending_async = true;
+ } else {
+ node->has_async_transaction = true;
+ }
+ }
+
+ binder_inner_proc_lock(proc);
+
+ if (proc->is_dead || (thread && thread->is_dead)) {
+ binder_inner_proc_unlock(proc);
+ binder_node_unlock(node);
+ return false;
+ }
+
+ if (!thread && !pending_async)
+ thread = binder_select_thread_ilocked(proc);
+
+ if (thread) {
+ binder_transaction_priority(thread->task, t, node_prio,
+ node->inherit_rt);
+ binder_enqueue_thread_work_ilocked(thread, &t->work);
+ } else if (!pending_async) {
+ binder_enqueue_work_ilocked(&t->work, &proc->todo);
+ } else {
+ binder_enqueue_work_ilocked(&t->work, &node->async_todo);
+ }
+
+ if (!pending_async)
+ binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */);
+
+ binder_inner_proc_unlock(proc);
+ binder_node_unlock(node);
+
+ return true;
+}
+
+/**
+ * binder_get_node_refs_for_txn() - Get required refs on node for txn
+ * @node: struct binder_node for which to get refs
+ * @proc: returns @node->proc if valid
+ * @error: if no @proc then returns BR_DEAD_REPLY
+ *
+ * User-space normally keeps the node alive when creating a transaction
+ * since it has a reference to the target. The local strong ref keeps it
+ * alive if the sending process dies before the target process processes
+ * the transaction. If the source process is malicious or has a reference
+ * counting bug, relying on the local strong ref can fail.
+ *
+ * Since user-space can cause the local strong ref to go away, we also take
+ * a tmpref on the node to ensure it survives while we are constructing
+ * the transaction. We also need a tmpref on the proc while we are
+ * constructing the transaction, so we take that here as well.
+ *
+ * Return: The target_node with refs taken or NULL if no @node->proc is NULL.
+ * Also sets @proc if valid. If the @node->proc is NULL indicating that the
+ * target proc has died, @error is set to BR_DEAD_REPLY
+ */
+static struct binder_node *binder_get_node_refs_for_txn(
+ struct binder_node *node,
+ struct binder_proc **procp,
+ uint32_t *error)
+{
+ struct binder_node *target_node = NULL;
+
+ binder_node_inner_lock(node);
+ if (node->proc) {
+ target_node = node;
+ binder_inc_node_nilocked(node, 1, 0, NULL);
+ binder_inc_node_tmpref_ilocked(node);
+ node->proc->tmp_ref++;
+ *procp = node->proc;
+ } else
+ *error = BR_DEAD_REPLY;
+ binder_node_inner_unlock(node);
+
+ return target_node;
+}
+
static void binder_transaction(struct binder_proc *proc,
struct binder_thread *thread,
- struct binder_transaction_data *tr, int reply)
+ struct binder_transaction_data *tr, int reply,
+ binder_size_t extra_buffers_size)
{
+ int ret;
struct binder_transaction *t;
struct binder_work *tcomplete;
- binder_size_t *offp, *off_end;
+ binder_size_t *offp, *off_end, *off_start;
binder_size_t off_min;
- struct binder_proc *target_proc;
+ u8 *sg_bufp, *sg_buf_end;
+ struct binder_proc *target_proc = NULL;
struct binder_thread *target_thread = NULL;
struct binder_node *target_node = NULL;
- struct list_head *target_list;
- wait_queue_head_t *target_wait;
struct binder_transaction *in_reply_to = NULL;
struct binder_transaction_log_entry *e;
- uint32_t return_error;
+ uint32_t return_error = 0;
+ uint32_t return_error_param = 0;
+ uint32_t return_error_line = 0;
+ struct binder_buffer_object *last_fixup_obj = NULL;
+ binder_size_t last_fixup_min_off = 0;
+ struct binder_context *context = proc->context;
+ int t_debug_id = atomic_inc_return(&binder_last_id);
e = binder_transaction_log_add(&binder_transaction_log);
+ e->debug_id = t_debug_id;
e->call_type = reply ? 2 : !!(tr->flags & TF_ONE_WAY);
e->from_proc = proc->pid;
e->from_thread = thread->pid;
e->target_handle = tr->target.handle;
e->data_size = tr->data_size;
e->offsets_size = tr->offsets_size;
+ e->context_name = proc->context->name;
if (reply) {
+ binder_inner_proc_lock(proc);
in_reply_to = thread->transaction_stack;
if (in_reply_to == NULL) {
+ binder_inner_proc_unlock(proc);
binder_user_error("%d:%d got reply transaction with no transaction stack\n",
proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
+ return_error_param = -EPROTO;
+ return_error_line = __LINE__;
goto err_empty_call_stack;
}
- binder_set_nice(in_reply_to->saved_priority);
if (in_reply_to->to_thread != thread) {
+ spin_lock(&in_reply_to->lock);
binder_user_error("%d:%d got reply transaction with bad transaction stack, transaction %d has target %d:%d\n",
proc->pid, thread->pid, in_reply_to->debug_id,
in_reply_to->to_proc ?
in_reply_to->to_proc->pid : 0,
in_reply_to->to_thread ?
in_reply_to->to_thread->pid : 0);
+ spin_unlock(&in_reply_to->lock);
+ binder_inner_proc_unlock(proc);
return_error = BR_FAILED_REPLY;
+ return_error_param = -EPROTO;
+ return_error_line = __LINE__;
in_reply_to = NULL;
goto err_bad_call_stack;
}
thread->transaction_stack = in_reply_to->to_parent;
- target_thread = in_reply_to->from;
+ binder_inner_proc_unlock(proc);
+ target_thread = binder_get_txn_from_and_acq_inner(in_reply_to);
if (target_thread == NULL) {
return_error = BR_DEAD_REPLY;
+ return_error_line = __LINE__;
goto err_dead_binder;
}
if (target_thread->transaction_stack != in_reply_to) {
@@ -1389,106 +2951,156 @@ static void binder_transaction(struct binder_proc *proc,
target_thread->transaction_stack ?
target_thread->transaction_stack->debug_id : 0,
in_reply_to->debug_id);
+ binder_inner_proc_unlock(target_thread->proc);
return_error = BR_FAILED_REPLY;
+ return_error_param = -EPROTO;
+ return_error_line = __LINE__;
in_reply_to = NULL;
target_thread = NULL;
goto err_dead_binder;
}
target_proc = target_thread->proc;
+ target_proc->tmp_ref++;
+ binder_inner_proc_unlock(target_thread->proc);
} else {
if (tr->target.handle) {
struct binder_ref *ref;
- ref = binder_get_ref(proc, tr->target.handle, true);
- if (ref == NULL) {
+ /*
+ * There must already be a strong ref
+ * on this node. If so, do a strong
+ * increment on the node to ensure it
+ * stays alive until the transaction is
+ * done.
+ */
+ binder_proc_lock(proc);
+ ref = binder_get_ref_olocked(proc, tr->target.handle,
+ true);
+ if (ref) {
+ target_node = binder_get_node_refs_for_txn(
+ ref->node, &target_proc,
+ &return_error);
+ } else {
binder_user_error("%d:%d got transaction to invalid handle\n",
- proc->pid, thread->pid);
+ proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
- goto err_invalid_target_handle;
}
- target_node = ref->node;
+ binder_proc_unlock(proc);
} else {
- target_node = binder_context_mgr_node;
- if (target_node == NULL) {
+ mutex_lock(&context->context_mgr_node_lock);
+ target_node = context->binder_context_mgr_node;
+ if (target_node)
+ target_node = binder_get_node_refs_for_txn(
+ target_node, &target_proc,
+ &return_error);
+ else
return_error = BR_DEAD_REPLY;
- goto err_no_context_mgr_node;
+ mutex_unlock(&context->context_mgr_node_lock);
+ if (target_node && target_proc == proc) {
+ binder_user_error("%d:%d got transaction to context manager from process owning it\n",
+ proc->pid, thread->pid);
+ return_error = BR_FAILED_REPLY;
+ return_error_param = -EINVAL;
+ return_error_line = __LINE__;
+ goto err_invalid_target_handle;
}
}
- e->to_node = target_node->debug_id;
- target_proc = target_node->proc;
- if (target_proc == NULL) {
- return_error = BR_DEAD_REPLY;
+ if (!target_node) {
+ /*
+ * return_error is set above
+ */
+ return_error_param = -EINVAL;
+ return_error_line = __LINE__;
goto err_dead_binder;
}
+ e->to_node = target_node->debug_id;
if (security_binder_transaction(proc->tsk,
target_proc->tsk) < 0) {
return_error = BR_FAILED_REPLY;
+ return_error_param = -EPERM;
+ return_error_line = __LINE__;
goto err_invalid_target_handle;
}
+ binder_inner_proc_lock(proc);
if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) {
struct binder_transaction *tmp;
tmp = thread->transaction_stack;
if (tmp->to_thread != thread) {
+ spin_lock(&tmp->lock);
binder_user_error("%d:%d got new transaction with bad transaction stack, transaction %d has target %d:%d\n",
proc->pid, thread->pid, tmp->debug_id,
tmp->to_proc ? tmp->to_proc->pid : 0,
tmp->to_thread ?
tmp->to_thread->pid : 0);
+ spin_unlock(&tmp->lock);
+ binder_inner_proc_unlock(proc);
return_error = BR_FAILED_REPLY;
+ return_error_param = -EPROTO;
+ return_error_line = __LINE__;
goto err_bad_call_stack;
}
while (tmp) {
- if (tmp->from && tmp->from->proc == target_proc)
- target_thread = tmp->from;
+ struct binder_thread *from;
+
+ spin_lock(&tmp->lock);
+ from = tmp->from;
+ if (from && from->proc == target_proc) {
+ atomic_inc(&from->tmp_ref);
+ target_thread = from;
+ spin_unlock(&tmp->lock);
+ break;
+ }
+ spin_unlock(&tmp->lock);
tmp = tmp->from_parent;
}
}
+ binder_inner_proc_unlock(proc);
}
- if (target_thread) {
+ if (target_thread)
e->to_thread = target_thread->pid;
- target_list = &target_thread->todo;
- target_wait = &target_thread->wait;
- } else {
- target_list = &target_proc->todo;
- target_wait = &target_proc->wait;
- }
e->to_proc = target_proc->pid;
/* TODO: reuse incoming transaction for reply */
t = kzalloc(sizeof(*t), GFP_KERNEL);
if (t == NULL) {
return_error = BR_FAILED_REPLY;
+ return_error_param = -ENOMEM;
+ return_error_line = __LINE__;
goto err_alloc_t_failed;
}
binder_stats_created(BINDER_STAT_TRANSACTION);
+ spin_lock_init(&t->lock);
tcomplete = kzalloc(sizeof(*tcomplete), GFP_KERNEL);
if (tcomplete == NULL) {
return_error = BR_FAILED_REPLY;
+ return_error_param = -ENOMEM;
+ return_error_line = __LINE__;
goto err_alloc_tcomplete_failed;
}
binder_stats_created(BINDER_STAT_TRANSACTION_COMPLETE);
- t->debug_id = ++binder_last_id;
- e->debug_id = t->debug_id;
+ t->debug_id = t_debug_id;
if (reply)
binder_debug(BINDER_DEBUG_TRANSACTION,
- "%d:%d BC_REPLY %d -> %d:%d, data %016llx-%016llx size %lld-%lld\n",
+ "%d:%d BC_REPLY %d -> %d:%d, data %016llx-%016llx size %lld-%lld-%lld\n",
proc->pid, thread->pid, t->debug_id,
target_proc->pid, target_thread->pid,
(u64)tr->data.ptr.buffer,
(u64)tr->data.ptr.offsets,
- (u64)tr->data_size, (u64)tr->offsets_size);
+ (u64)tr->data_size, (u64)tr->offsets_size,
+ (u64)extra_buffers_size);
else
binder_debug(BINDER_DEBUG_TRANSACTION,
- "%d:%d BC_TRANSACTION %d -> %d - node %d, data %016llx-%016llx size %lld-%lld\n",
+ "%d:%d BC_TRANSACTION %d -> %d - node %d, data %016llx-%016llx size %lld-%lld-%lld\n",
proc->pid, thread->pid, t->debug_id,
target_proc->pid, target_node->debug_id,
(u64)tr->data.ptr.buffer,
(u64)tr->data.ptr.offsets,
- (u64)tr->data_size, (u64)tr->offsets_size);
+ (u64)tr->data_size, (u64)tr->offsets_size,
+ (u64)extra_buffers_size);
if (!reply && !(tr->flags & TF_ONE_WAY))
t->from = thread;
@@ -1499,32 +3111,47 @@ static void binder_transaction(struct binder_proc *proc,
t->to_thread = target_thread;
t->code = tr->code;
t->flags = tr->flags;
- t->priority = task_nice(current);
+ if (!(t->flags & TF_ONE_WAY) &&
+ binder_supported_policy(current->policy)) {
+ /* Inherit supported policies for synchronous transactions */
+ t->priority.sched_policy = current->policy;
+ t->priority.prio = current->normal_prio;
+ } else {
+ /* Otherwise, fall back to the default priority */
+ t->priority = target_proc->default_priority;
+ }
trace_binder_transaction(reply, t, target_node);
- t->buffer = binder_alloc_buf(target_proc, tr->data_size,
- tr->offsets_size, !reply && (t->flags & TF_ONE_WAY));
- if (t->buffer == NULL) {
- return_error = BR_FAILED_REPLY;
+ t->buffer = binder_alloc_new_buf(&target_proc->alloc, tr->data_size,
+ tr->offsets_size, extra_buffers_size,
+ !reply && (t->flags & TF_ONE_WAY));
+ if (IS_ERR(t->buffer)) {
+ /*
+ * -ESRCH indicates VMA cleared. The target is dying.
+ */
+ return_error_param = PTR_ERR(t->buffer);
+ return_error = return_error_param == -ESRCH ?
+ BR_DEAD_REPLY : BR_FAILED_REPLY;
+ return_error_line = __LINE__;
+ t->buffer = NULL;
goto err_binder_alloc_buf_failed;
}
- t->buffer->allow_user_free = 0;
t->buffer->debug_id = t->debug_id;
t->buffer->transaction = t;
t->buffer->target_node = target_node;
trace_binder_transaction_alloc_buf(t->buffer);
- if (target_node)
- binder_inc_node(target_node, 1, 0, NULL);
-
- offp = (binder_size_t *)(t->buffer->data +
- ALIGN(tr->data_size, sizeof(void *)));
+ off_start = (binder_size_t *)(t->buffer->data +
+ ALIGN(tr->data_size, sizeof(void *)));
+ offp = off_start;
if (copy_from_user(t->buffer->data, (const void __user *)(uintptr_t)
tr->data.ptr.buffer, tr->data_size)) {
binder_user_error("%d:%d got transaction with invalid data ptr\n",
proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
+ return_error_param = -EFAULT;
+ return_error_line = __LINE__;
goto err_copy_data_failed;
}
if (copy_from_user(offp, (const void __user *)(uintptr_t)
@@ -1532,231 +3159,253 @@ static void binder_transaction(struct binder_proc *proc,
binder_user_error("%d:%d got transaction with invalid offsets ptr\n",
proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
+ return_error_param = -EFAULT;
+ return_error_line = __LINE__;
goto err_copy_data_failed;
}
if (!IS_ALIGNED(tr->offsets_size, sizeof(binder_size_t))) {
binder_user_error("%d:%d got transaction with invalid offsets size, %lld\n",
proc->pid, thread->pid, (u64)tr->offsets_size);
return_error = BR_FAILED_REPLY;
+ return_error_param = -EINVAL;
+ return_error_line = __LINE__;
goto err_bad_offset;
}
- off_end = (void *)offp + tr->offsets_size;
+ if (!IS_ALIGNED(extra_buffers_size, sizeof(u64))) {
+ binder_user_error("%d:%d got transaction with unaligned buffers size, %lld\n",
+ proc->pid, thread->pid,
+ (u64)extra_buffers_size);
+ return_error = BR_FAILED_REPLY;
+ return_error_param = -EINVAL;
+ return_error_line = __LINE__;
+ goto err_bad_offset;
+ }
+ off_end = (void *)off_start + tr->offsets_size;
+ sg_bufp = (u8 *)(PTR_ALIGN(off_end, sizeof(void *)));
+ sg_buf_end = sg_bufp + extra_buffers_size;
off_min = 0;
for (; offp < off_end; offp++) {
- struct flat_binder_object *fp;
+ struct binder_object_header *hdr;
+ size_t object_size = binder_validate_object(t->buffer, *offp);
- if (*offp > t->buffer->data_size - sizeof(*fp) ||
- *offp < off_min ||
- t->buffer->data_size < sizeof(*fp) ||
- !IS_ALIGNED(*offp, sizeof(u32))) {
- binder_user_error("%d:%d got transaction with invalid offset, %lld (min %lld, max %lld)\n",
+ if (object_size == 0 || *offp < off_min) {
+ binder_user_error("%d:%d got transaction with invalid offset (%lld, min %lld max %lld) or object.\n",
proc->pid, thread->pid, (u64)*offp,
(u64)off_min,
- (u64)(t->buffer->data_size -
- sizeof(*fp)));
+ (u64)t->buffer->data_size);
return_error = BR_FAILED_REPLY;
+ return_error_param = -EINVAL;
+ return_error_line = __LINE__;
goto err_bad_offset;
}
- fp = (struct flat_binder_object *)(t->buffer->data + *offp);
- off_min = *offp + sizeof(struct flat_binder_object);
- switch (fp->type) {
+
+ hdr = (struct binder_object_header *)(t->buffer->data + *offp);
+ off_min = *offp + object_size;
+ switch (hdr->type) {
case BINDER_TYPE_BINDER:
case BINDER_TYPE_WEAK_BINDER: {
- struct binder_ref *ref;
- struct binder_node *node = binder_get_node(proc, fp->binder);
+ struct flat_binder_object *fp;
- if (node == NULL) {
- node = binder_new_node(proc, fp->binder, fp->cookie);
- if (node == NULL) {
- return_error = BR_FAILED_REPLY;
- goto err_binder_new_node_failed;
- }
- node->min_priority = fp->flags & FLAT_BINDER_FLAG_PRIORITY_MASK;
- node->accept_fds = !!(fp->flags & FLAT_BINDER_FLAG_ACCEPTS_FDS);
- }
- if (fp->cookie != node->cookie) {
- binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n",
- proc->pid, thread->pid,
- (u64)fp->binder, node->debug_id,
- (u64)fp->cookie, (u64)node->cookie);
- return_error = BR_FAILED_REPLY;
- goto err_binder_get_ref_for_node_failed;
- }
- if (security_binder_transfer_binder(proc->tsk,
- target_proc->tsk)) {
+ fp = to_flat_binder_object(hdr);
+ ret = binder_translate_binder(fp, t, thread);
+ if (ret < 0) {
return_error = BR_FAILED_REPLY;
- goto err_binder_get_ref_for_node_failed;
+ return_error_param = ret;
+ return_error_line = __LINE__;
+ goto err_translate_failed;
}
- ref = binder_get_ref_for_node(target_proc, node);
- if (ref == NULL) {
- return_error = BR_FAILED_REPLY;
- goto err_binder_get_ref_for_node_failed;
- }
- if (fp->type == BINDER_TYPE_BINDER)
- fp->type = BINDER_TYPE_HANDLE;
- else
- fp->type = BINDER_TYPE_WEAK_HANDLE;
- fp->binder = 0;
- fp->handle = ref->desc;
- fp->cookie = 0;
- binder_inc_ref(ref, fp->type == BINDER_TYPE_HANDLE,
- &thread->todo);
-
- trace_binder_transaction_node_to_ref(t, node, ref);
- binder_debug(BINDER_DEBUG_TRANSACTION,
- " node %d u%016llx -> ref %d desc %d\n",
- node->debug_id, (u64)node->ptr,
- ref->debug_id, ref->desc);
} break;
case BINDER_TYPE_HANDLE:
case BINDER_TYPE_WEAK_HANDLE: {
- struct binder_ref *ref;
+ struct flat_binder_object *fp;
- ref = binder_get_ref(proc, fp->handle,
- fp->type == BINDER_TYPE_HANDLE);
+ fp = to_flat_binder_object(hdr);
+ ret = binder_translate_handle(fp, t, thread);
+ if (ret < 0) {
+ return_error = BR_FAILED_REPLY;
+ return_error_param = ret;
+ return_error_line = __LINE__;
+ goto err_translate_failed;
+ }
+ } break;
- if (ref == NULL) {
- binder_user_error("%d:%d got transaction with invalid handle, %d\n",
- proc->pid,
- thread->pid, fp->handle);
+ case BINDER_TYPE_FD: {
+ struct binder_fd_object *fp = to_binder_fd_object(hdr);
+ int target_fd = binder_translate_fd(fp->fd, t, thread,
+ in_reply_to);
+
+ if (target_fd < 0) {
return_error = BR_FAILED_REPLY;
- goto err_binder_get_ref_failed;
+ return_error_param = target_fd;
+ return_error_line = __LINE__;
+ goto err_translate_failed;
}
- if (security_binder_transfer_binder(proc->tsk,
- target_proc->tsk)) {
+ fp->pad_binder = 0;
+ fp->fd = target_fd;
+ } break;
+ case BINDER_TYPE_FDA: {
+ struct binder_fd_array_object *fda =
+ to_binder_fd_array_object(hdr);
+ struct binder_buffer_object *parent =
+ binder_validate_ptr(t->buffer, fda->parent,
+ off_start,
+ offp - off_start);
+ if (!parent) {
+ binder_user_error("%d:%d got transaction with invalid parent offset or type\n",
+ proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
- goto err_binder_get_ref_failed;
+ return_error_param = -EINVAL;
+ return_error_line = __LINE__;
+ goto err_bad_parent;
}
- if (ref->node->proc == target_proc) {
- if (fp->type == BINDER_TYPE_HANDLE)
- fp->type = BINDER_TYPE_BINDER;
- else
- fp->type = BINDER_TYPE_WEAK_BINDER;
- fp->binder = ref->node->ptr;
- fp->cookie = ref->node->cookie;
- binder_inc_node(ref->node, fp->type == BINDER_TYPE_BINDER, 0, NULL);
- trace_binder_transaction_ref_to_node(t, ref);
- binder_debug(BINDER_DEBUG_TRANSACTION,
- " ref %d desc %d -> node %d u%016llx\n",
- ref->debug_id, ref->desc, ref->node->debug_id,
- (u64)ref->node->ptr);
- } else {
- struct binder_ref *new_ref;
-
- new_ref = binder_get_ref_for_node(target_proc, ref->node);
- if (new_ref == NULL) {
- return_error = BR_FAILED_REPLY;
- goto err_binder_get_ref_for_node_failed;
- }
- fp->binder = 0;
- fp->handle = new_ref->desc;
- fp->cookie = 0;
- binder_inc_ref(new_ref, fp->type == BINDER_TYPE_HANDLE, NULL);
- trace_binder_transaction_ref_to_ref(t, ref,
- new_ref);
- binder_debug(BINDER_DEBUG_TRANSACTION,
- " ref %d desc %d -> ref %d desc %d (node %d)\n",
- ref->debug_id, ref->desc, new_ref->debug_id,
- new_ref->desc, ref->node->debug_id);
+ if (!binder_validate_fixup(t->buffer, off_start,
+ parent, fda->parent_offset,
+ last_fixup_obj,
+ last_fixup_min_off)) {
+ binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n",
+ proc->pid, thread->pid);
+ return_error = BR_FAILED_REPLY;
+ return_error_param = -EINVAL;
+ return_error_line = __LINE__;
+ goto err_bad_parent;
}
- } break;
-
- case BINDER_TYPE_FD: {
- int target_fd;
- struct file *file;
-
- if (reply) {
- if (!(in_reply_to->flags & TF_ACCEPT_FDS)) {
- binder_user_error("%d:%d got reply with fd, %d, but target does not allow fds\n",
- proc->pid, thread->pid, fp->handle);
- return_error = BR_FAILED_REPLY;
- goto err_fd_not_allowed;
- }
- } else if (!target_node->accept_fds) {
- binder_user_error("%d:%d got transaction with fd, %d, but target does not allow fds\n",
- proc->pid, thread->pid, fp->handle);
+ ret = binder_translate_fd_array(fda, parent, t, thread,
+ in_reply_to);
+ if (ret < 0) {
return_error = BR_FAILED_REPLY;
- goto err_fd_not_allowed;
+ return_error_param = ret;
+ return_error_line = __LINE__;
+ goto err_translate_failed;
}
-
- file = fget(fp->handle);
- if (file == NULL) {
- binder_user_error("%d:%d got transaction with invalid fd, %d\n",
- proc->pid, thread->pid, fp->handle);
+ last_fixup_obj = parent;
+ last_fixup_min_off =
+ fda->parent_offset + sizeof(u32) * fda->num_fds;
+ } break;
+ case BINDER_TYPE_PTR: {
+ struct binder_buffer_object *bp =
+ to_binder_buffer_object(hdr);
+ size_t buf_left = sg_buf_end - sg_bufp;
+
+ if (bp->length > buf_left) {
+ binder_user_error("%d:%d got transaction with too large buffer\n",
+ proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
- goto err_fget_failed;
+ return_error_param = -EINVAL;
+ return_error_line = __LINE__;
+ goto err_bad_offset;
}
- if (security_binder_transfer_file(proc->tsk,
- target_proc->tsk,
- file) < 0) {
- fput(file);
+ if (copy_from_user(sg_bufp,
+ (const void __user *)(uintptr_t)
+ bp->buffer, bp->length)) {
+ binder_user_error("%d:%d got transaction with invalid offsets ptr\n",
+ proc->pid, thread->pid);
+ return_error_param = -EFAULT;
return_error = BR_FAILED_REPLY;
- goto err_get_unused_fd_failed;
+ return_error_line = __LINE__;
+ goto err_copy_data_failed;
}
- target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC);
- if (target_fd < 0) {
- fput(file);
+ /* Fixup buffer pointer to target proc address space */
+ bp->buffer = (uintptr_t)sg_bufp +
+ binder_alloc_get_user_buffer_offset(
+ &target_proc->alloc);
+ sg_bufp += ALIGN(bp->length, sizeof(u64));
+
+ ret = binder_fixup_parent(t, thread, bp, off_start,
+ offp - off_start,
+ last_fixup_obj,
+ last_fixup_min_off);
+ if (ret < 0) {
return_error = BR_FAILED_REPLY;
- goto err_get_unused_fd_failed;
+ return_error_param = ret;
+ return_error_line = __LINE__;
+ goto err_translate_failed;
}
- task_fd_install(target_proc, target_fd, file);
- trace_binder_transaction_fd(t, fp->handle, target_fd);
- binder_debug(BINDER_DEBUG_TRANSACTION,
- " fd %d -> %d\n", fp->handle, target_fd);
- /* TODO: fput? */
- fp->binder = 0;
- fp->handle = target_fd;
+ last_fixup_obj = bp;
+ last_fixup_min_off = 0;
} break;
-
default:
binder_user_error("%d:%d got transaction with invalid object type, %x\n",
- proc->pid, thread->pid, fp->type);
+ proc->pid, thread->pid, hdr->type);
return_error = BR_FAILED_REPLY;
+ return_error_param = -EINVAL;
+ return_error_line = __LINE__;
goto err_bad_object_type;
}
}
+ tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE;
+ t->work.type = BINDER_WORK_TRANSACTION;
+
if (reply) {
+ binder_enqueue_thread_work(thread, tcomplete);
+ binder_inner_proc_lock(target_proc);
+ if (target_thread->is_dead) {
+ binder_inner_proc_unlock(target_proc);
+ goto err_dead_proc_or_thread;
+ }
BUG_ON(t->buffer->async_transaction != 0);
- binder_pop_transaction(target_thread, in_reply_to);
+ binder_pop_transaction_ilocked(target_thread, in_reply_to);
+ binder_enqueue_thread_work_ilocked(target_thread, &t->work);
+ binder_inner_proc_unlock(target_proc);
+ wake_up_interruptible_sync(&target_thread->wait);
+ binder_restore_priority(current, in_reply_to->saved_priority);
+ binder_free_transaction(in_reply_to);
} else if (!(t->flags & TF_ONE_WAY)) {
BUG_ON(t->buffer->async_transaction != 0);
+ binder_inner_proc_lock(proc);
+ /*
+ * Defer the TRANSACTION_COMPLETE, so we don't return to
+ * userspace immediately; this allows the target process to
+ * immediately start processing this transaction, reducing
+ * latency. We will then return the TRANSACTION_COMPLETE when
+ * the target replies (or there is an error).
+ */
+ binder_enqueue_deferred_thread_work_ilocked(thread, tcomplete);
t->need_reply = 1;
t->from_parent = thread->transaction_stack;
thread->transaction_stack = t;
+ binder_inner_proc_unlock(proc);
+ if (!binder_proc_transaction(t, target_proc, target_thread)) {
+ binder_inner_proc_lock(proc);
+ binder_pop_transaction_ilocked(thread, t);
+ binder_inner_proc_unlock(proc);
+ goto err_dead_proc_or_thread;
+ }
} else {
BUG_ON(target_node == NULL);
BUG_ON(t->buffer->async_transaction != 1);
- if (target_node->has_async_transaction) {
- target_list = &target_node->async_todo;
- target_wait = NULL;
- } else
- target_node->has_async_transaction = 1;
- }
- t->work.type = BINDER_WORK_TRANSACTION;
- list_add_tail(&t->work.entry, target_list);
- tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE;
- list_add_tail(&tcomplete->entry, &thread->todo);
- if (target_wait) {
- if (reply || !(t->flags & TF_ONE_WAY))
- wake_up_interruptible_sync(target_wait);
- else
- wake_up_interruptible(target_wait);
+ binder_enqueue_thread_work(thread, tcomplete);
+ if (!binder_proc_transaction(t, target_proc, NULL))
+ goto err_dead_proc_or_thread;
}
+ if (target_thread)
+ binder_thread_dec_tmpref(target_thread);
+ binder_proc_dec_tmpref(target_proc);
+ if (target_node)
+ binder_dec_node_tmpref(target_node);
+ /*
+ * write barrier to synchronize with initialization
+ * of log entry
+ */
+ smp_wmb();
+ WRITE_ONCE(e->debug_id_done, t_debug_id);
return;
-err_get_unused_fd_failed:
-err_fget_failed:
-err_fd_not_allowed:
-err_binder_get_ref_for_node_failed:
-err_binder_get_ref_failed:
-err_binder_new_node_failed:
+err_dead_proc_or_thread:
+ return_error = BR_DEAD_REPLY;
+ return_error_line = __LINE__;
+ binder_dequeue_work(proc, tcomplete);
+err_translate_failed:
err_bad_object_type:
err_bad_offset:
+err_bad_parent:
err_copy_data_failed:
trace_binder_transaction_failed_buffer_release(t->buffer);
binder_transaction_buffer_release(target_proc, t->buffer, offp);
+ if (target_node)
+ binder_dec_node_tmpref(target_node);
+ target_node = NULL;
t->buffer->transaction = NULL;
- binder_free_buf(target_proc, t->buffer);
+ binder_alloc_free_buf(&target_proc->alloc, t->buffer);
err_binder_alloc_buf_failed:
kfree(tcomplete);
binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
@@ -1768,25 +3417,48 @@ err_bad_call_stack:
err_empty_call_stack:
err_dead_binder:
err_invalid_target_handle:
-err_no_context_mgr_node:
+ if (target_thread)
+ binder_thread_dec_tmpref(target_thread);
+ if (target_proc)
+ binder_proc_dec_tmpref(target_proc);
+ if (target_node) {
+ binder_dec_node(target_node, 1, 0);
+ binder_dec_node_tmpref(target_node);
+ }
+
binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
- "%d:%d transaction failed %d, size %lld-%lld\n",
- proc->pid, thread->pid, return_error,
- (u64)tr->data_size, (u64)tr->offsets_size);
+ "%d:%d transaction failed %d/%d, size %lld-%lld line %d\n",
+ proc->pid, thread->pid, return_error, return_error_param,
+ (u64)tr->data_size, (u64)tr->offsets_size,
+ return_error_line);
{
struct binder_transaction_log_entry *fe;
+ e->return_error = return_error;
+ e->return_error_param = return_error_param;
+ e->return_error_line = return_error_line;
fe = binder_transaction_log_add(&binder_transaction_log_failed);
*fe = *e;
+ /*
+ * write barrier to synchronize with initialization
+ * of log entry
+ */
+ smp_wmb();
+ WRITE_ONCE(e->debug_id_done, t_debug_id);
+ WRITE_ONCE(fe->debug_id_done, t_debug_id);
}
- BUG_ON(thread->return_error != BR_OK);
+ BUG_ON(thread->return_error.cmd != BR_OK);
if (in_reply_to) {
- thread->return_error = BR_TRANSACTION_COMPLETE;
+ binder_restore_priority(current, in_reply_to->saved_priority);
+ thread->return_error.cmd = BR_TRANSACTION_COMPLETE;
+ binder_enqueue_thread_work(thread, &thread->return_error.work);
binder_send_failed_reply(in_reply_to, return_error);
- } else
- thread->return_error = return_error;
+ } else {
+ thread->return_error.cmd = return_error;
+ binder_enqueue_thread_work(thread, &thread->return_error.work);
+ }
}
static int binder_thread_write(struct binder_proc *proc,
@@ -1795,19 +3467,22 @@ static int binder_thread_write(struct binder_proc *proc,
binder_size_t *consumed)
{
uint32_t cmd;
+ struct binder_context *context = proc->context;
void __user *buffer = (void __user *)(uintptr_t)binder_buffer;
void __user *ptr = buffer + *consumed;
void __user *end = buffer + size;
- while (ptr < end && thread->return_error == BR_OK) {
+ while (ptr < end && thread->return_error.cmd == BR_OK) {
+ int ret;
+
if (get_user(cmd, (uint32_t __user *)ptr))
return -EFAULT;
ptr += sizeof(uint32_t);
trace_binder_command(cmd);
if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.bc)) {
- binder_stats.bc[_IOC_NR(cmd)]++;
- proc->stats.bc[_IOC_NR(cmd)]++;
- thread->stats.bc[_IOC_NR(cmd)]++;
+ atomic_inc(&binder_stats.bc[_IOC_NR(cmd)]);
+ atomic_inc(&proc->stats.bc[_IOC_NR(cmd)]);
+ atomic_inc(&thread->stats.bc[_IOC_NR(cmd)]);
}
switch (cmd) {
case BC_INCREFS:
@@ -1815,53 +3490,61 @@ static int binder_thread_write(struct binder_proc *proc,
case BC_RELEASE:
case BC_DECREFS: {
uint32_t target;
- struct binder_ref *ref;
const char *debug_string;
+ bool strong = cmd == BC_ACQUIRE || cmd == BC_RELEASE;
+ bool increment = cmd == BC_INCREFS || cmd == BC_ACQUIRE;
+ struct binder_ref_data rdata;
if (get_user(target, (uint32_t __user *)ptr))
return -EFAULT;
+
ptr += sizeof(uint32_t);
- if (target == 0 && binder_context_mgr_node &&
- (cmd == BC_INCREFS || cmd == BC_ACQUIRE)) {
- ref = binder_get_ref_for_node(proc,
- binder_context_mgr_node);
- if (ref->desc != target) {
- binder_user_error("%d:%d tried to acquire reference to desc 0, got %d instead\n",
- proc->pid, thread->pid,
- ref->desc);
- }
- } else
- ref = binder_get_ref(proc, target,
- cmd == BC_ACQUIRE ||
- cmd == BC_RELEASE);
- if (ref == NULL) {
- binder_user_error("%d:%d refcount change on invalid ref %d\n",
- proc->pid, thread->pid, target);
- break;
+ ret = -1;
+ if (increment && !target) {
+ struct binder_node *ctx_mgr_node;
+ mutex_lock(&context->context_mgr_node_lock);
+ ctx_mgr_node = context->binder_context_mgr_node;
+ if (ctx_mgr_node)
+ ret = binder_inc_ref_for_node(
+ proc, ctx_mgr_node,
+ strong, NULL, &rdata);
+ mutex_unlock(&context->context_mgr_node_lock);
+ }
+ if (ret)
+ ret = binder_update_ref_for_handle(
+ proc, target, increment, strong,
+ &rdata);
+ if (!ret && rdata.desc != target) {
+ binder_user_error("%d:%d tried to acquire reference to desc %d, got %d instead\n",
+ proc->pid, thread->pid,
+ target, rdata.desc);
}
switch (cmd) {
case BC_INCREFS:
debug_string = "IncRefs";
- binder_inc_ref(ref, 0, NULL);
break;
case BC_ACQUIRE:
debug_string = "Acquire";
- binder_inc_ref(ref, 1, NULL);
break;
case BC_RELEASE:
debug_string = "Release";
- binder_dec_ref(ref, 1);
break;
case BC_DECREFS:
default:
debug_string = "DecRefs";
- binder_dec_ref(ref, 0);
+ break;
+ }
+ if (ret) {
+ binder_user_error("%d:%d %s %d refcount change on invalid ref %d ret %d\n",
+ proc->pid, thread->pid, debug_string,
+ strong, target, ret);
break;
}
binder_debug(BINDER_DEBUG_USER_REFS,
- "%d:%d %s ref %d desc %d s %d w %d for node %d\n",
- proc->pid, thread->pid, debug_string, ref->debug_id,
- ref->desc, ref->strong, ref->weak, ref->node->debug_id);
+ "%d:%d %s ref %d desc %d s %d w %d\n",
+ proc->pid, thread->pid, debug_string,
+ rdata.debug_id, rdata.desc, rdata.strong,
+ rdata.weak);
break;
}
case BC_INCREFS_DONE:
@@ -1869,6 +3552,7 @@ static int binder_thread_write(struct binder_proc *proc,
binder_uintptr_t node_ptr;
binder_uintptr_t cookie;
struct binder_node *node;
+ bool free_node;
if (get_user(node_ptr, (binder_uintptr_t __user *)ptr))
return -EFAULT;
@@ -1893,13 +3577,17 @@ static int binder_thread_write(struct binder_proc *proc,
"BC_INCREFS_DONE" : "BC_ACQUIRE_DONE",
(u64)node_ptr, node->debug_id,
(u64)cookie, (u64)node->cookie);
+ binder_put_node(node);
break;
}
+ binder_node_inner_lock(node);
if (cmd == BC_ACQUIRE_DONE) {
if (node->pending_strong_ref == 0) {
binder_user_error("%d:%d BC_ACQUIRE_DONE node %d has no pending acquire request\n",
proc->pid, thread->pid,
node->debug_id);
+ binder_node_inner_unlock(node);
+ binder_put_node(node);
break;
}
node->pending_strong_ref = 0;
@@ -1908,16 +3596,23 @@ static int binder_thread_write(struct binder_proc *proc,
binder_user_error("%d:%d BC_INCREFS_DONE node %d has no pending increfs request\n",
proc->pid, thread->pid,
node->debug_id);
+ binder_node_inner_unlock(node);
+ binder_put_node(node);
break;
}
node->pending_weak_ref = 0;
}
- binder_dec_node(node, cmd == BC_ACQUIRE_DONE, 0);
+ free_node = binder_dec_node_nilocked(node,
+ cmd == BC_ACQUIRE_DONE, 0);
+ WARN_ON(free_node);
binder_debug(BINDER_DEBUG_USER_REFS,
- "%d:%d %s node %d ls %d lw %d\n",
+ "%d:%d %s node %d ls %d lw %d tr %d\n",
proc->pid, thread->pid,
cmd == BC_INCREFS_DONE ? "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE",
- node->debug_id, node->local_strong_refs, node->local_weak_refs);
+ node->debug_id, node->local_strong_refs,
+ node->local_weak_refs, node->tmp_refs);
+ binder_node_inner_unlock(node);
+ binder_put_node(node);
break;
}
case BC_ATTEMPT_ACQUIRE:
@@ -1935,15 +3630,20 @@ static int binder_thread_write(struct binder_proc *proc,
return -EFAULT;
ptr += sizeof(binder_uintptr_t);
- buffer = binder_buffer_lookup(proc, data_ptr);
- if (buffer == NULL) {
- binder_user_error("%d:%d BC_FREE_BUFFER u%016llx no match\n",
- proc->pid, thread->pid, (u64)data_ptr);
- break;
- }
- if (!buffer->allow_user_free) {
- binder_user_error("%d:%d BC_FREE_BUFFER u%016llx matched unreturned buffer\n",
- proc->pid, thread->pid, (u64)data_ptr);
+ buffer = binder_alloc_prepare_to_free(&proc->alloc,
+ data_ptr);
+ if (IS_ERR_OR_NULL(buffer)) {
+ if (PTR_ERR(buffer) == -EPERM) {
+ binder_user_error(
+ "%d:%d BC_FREE_BUFFER u%016llx matched unreturned or currently freeing buffer\n",
+ proc->pid, thread->pid,
+ (u64)data_ptr);
+ } else {
+ binder_user_error(
+ "%d:%d BC_FREE_BUFFER u%016llx no match\n",
+ proc->pid, thread->pid,
+ (u64)data_ptr);
+ }
break;
}
binder_debug(BINDER_DEBUG_FREE_BUFFER,
@@ -1957,18 +3657,41 @@ static int binder_thread_write(struct binder_proc *proc,
buffer->transaction = NULL;
}
if (buffer->async_transaction && buffer->target_node) {
- BUG_ON(!buffer->target_node->has_async_transaction);
- if (list_empty(&buffer->target_node->async_todo))
- buffer->target_node->has_async_transaction = 0;
- else
- list_move_tail(buffer->target_node->async_todo.next, &thread->todo);
+ struct binder_node *buf_node;
+ struct binder_work *w;
+
+ buf_node = buffer->target_node;
+ binder_node_inner_lock(buf_node);
+ BUG_ON(!buf_node->has_async_transaction);
+ BUG_ON(buf_node->proc != proc);
+ w = binder_dequeue_work_head_ilocked(
+ &buf_node->async_todo);
+ if (!w) {
+ buf_node->has_async_transaction = false;
+ } else {
+ binder_enqueue_work_ilocked(
+ w, &proc->todo);
+ binder_wakeup_proc_ilocked(proc);
+ }
+ binder_node_inner_unlock(buf_node);
}
trace_binder_transaction_buffer_release(buffer);
binder_transaction_buffer_release(proc, buffer, NULL);
- binder_free_buf(proc, buffer);
+ binder_alloc_free_buf(&proc->alloc, buffer);
break;
}
+ case BC_TRANSACTION_SG:
+ case BC_REPLY_SG: {
+ struct binder_transaction_data_sg tr;
+
+ if (copy_from_user(&tr, ptr, sizeof(tr)))
+ return -EFAULT;
+ ptr += sizeof(tr);
+ binder_transaction(proc, thread, &tr.transaction_data,
+ cmd == BC_REPLY_SG, tr.buffers_size);
+ break;
+ }
case BC_TRANSACTION:
case BC_REPLY: {
struct binder_transaction_data tr;
@@ -1976,7 +3699,8 @@ static int binder_thread_write(struct binder_proc *proc,
if (copy_from_user(&tr, ptr, sizeof(tr)))
return -EFAULT;
ptr += sizeof(tr);
- binder_transaction(proc, thread, &tr, cmd == BC_REPLY);
+ binder_transaction(proc, thread, &tr,
+ cmd == BC_REPLY, 0);
break;
}
@@ -1984,6 +3708,7 @@ static int binder_thread_write(struct binder_proc *proc,
binder_debug(BINDER_DEBUG_THREADS,
"%d:%d BC_REGISTER_LOOPER\n",
proc->pid, thread->pid);
+ binder_inner_proc_lock(proc);
if (thread->looper & BINDER_LOOPER_STATE_ENTERED) {
thread->looper |= BINDER_LOOPER_STATE_INVALID;
binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called after BC_ENTER_LOOPER\n",
@@ -1997,6 +3722,7 @@ static int binder_thread_write(struct binder_proc *proc,
proc->requested_threads_started++;
}
thread->looper |= BINDER_LOOPER_STATE_REGISTERED;
+ binder_inner_proc_unlock(proc);
break;
case BC_ENTER_LOOPER:
binder_debug(BINDER_DEBUG_THREADS,
@@ -2021,7 +3747,7 @@ static int binder_thread_write(struct binder_proc *proc,
uint32_t target;
binder_uintptr_t cookie;
struct binder_ref *ref;
- struct binder_ref_death *death;
+ struct binder_ref_death *death = NULL;
if (get_user(target, (uint32_t __user *)ptr))
return -EFAULT;
@@ -2029,7 +3755,28 @@ static int binder_thread_write(struct binder_proc *proc,
if (get_user(cookie, (binder_uintptr_t __user *)ptr))
return -EFAULT;
ptr += sizeof(binder_uintptr_t);
- ref = binder_get_ref(proc, target, false);
+ if (cmd == BC_REQUEST_DEATH_NOTIFICATION) {
+ /*
+ * Allocate memory for death notification
+ * before taking lock
+ */
+ death = kzalloc(sizeof(*death), GFP_KERNEL);
+ if (death == NULL) {
+ WARN_ON(thread->return_error.cmd !=
+ BR_OK);
+ thread->return_error.cmd = BR_ERROR;
+ binder_enqueue_thread_work(
+ thread,
+ &thread->return_error.work);
+ binder_debug(
+ BINDER_DEBUG_FAILED_TRANSACTION,
+ "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n",
+ proc->pid, thread->pid);
+ break;
+ }
+ }
+ binder_proc_lock(proc);
+ ref = binder_get_ref_olocked(proc, target, false);
if (ref == NULL) {
binder_user_error("%d:%d %s invalid ref %d\n",
proc->pid, thread->pid,
@@ -2037,6 +3784,8 @@ static int binder_thread_write(struct binder_proc *proc,
"BC_REQUEST_DEATH_NOTIFICATION" :
"BC_CLEAR_DEATH_NOTIFICATION",
target);
+ binder_proc_unlock(proc);
+ kfree(death);
break;
}
@@ -2046,21 +3795,18 @@ static int binder_thread_write(struct binder_proc *proc,
cmd == BC_REQUEST_DEATH_NOTIFICATION ?
"BC_REQUEST_DEATH_NOTIFICATION" :
"BC_CLEAR_DEATH_NOTIFICATION",
- (u64)cookie, ref->debug_id, ref->desc,
- ref->strong, ref->weak, ref->node->debug_id);
+ (u64)cookie, ref->data.debug_id,
+ ref->data.desc, ref->data.strong,
+ ref->data.weak, ref->node->debug_id);
+ binder_node_lock(ref->node);
if (cmd == BC_REQUEST_DEATH_NOTIFICATION) {
if (ref->death) {
binder_user_error("%d:%d BC_REQUEST_DEATH_NOTIFICATION death notification already set\n",
proc->pid, thread->pid);
- break;
- }
- death = kzalloc(sizeof(*death), GFP_KERNEL);
- if (death == NULL) {
- thread->return_error = BR_ERROR;
- binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
- "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n",
- proc->pid, thread->pid);
+ binder_node_unlock(ref->node);
+ binder_proc_unlock(proc);
+ kfree(death);
break;
}
binder_stats_created(BINDER_STAT_DEATH);
@@ -2069,17 +3815,19 @@ static int binder_thread_write(struct binder_proc *proc,
ref->death = death;
if (ref->node->proc == NULL) {
ref->death->work.type = BINDER_WORK_DEAD_BINDER;
- if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) {
- list_add_tail(&ref->death->work.entry, &thread->todo);
- } else {
- list_add_tail(&ref->death->work.entry, &proc->todo);
- wake_up_interruptible(&proc->wait);
- }
+
+ binder_inner_proc_lock(proc);
+ binder_enqueue_work_ilocked(
+ &ref->death->work, &proc->todo);
+ binder_wakeup_proc_ilocked(proc);
+ binder_inner_proc_unlock(proc);
}
} else {
if (ref->death == NULL) {
binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification not active\n",
proc->pid, thread->pid);
+ binder_node_unlock(ref->node);
+ binder_proc_unlock(proc);
break;
}
death = ref->death;
@@ -2088,22 +3836,35 @@ static int binder_thread_write(struct binder_proc *proc,
proc->pid, thread->pid,
(u64)death->cookie,
(u64)cookie);
+ binder_node_unlock(ref->node);
+ binder_proc_unlock(proc);
break;
}
ref->death = NULL;
+ binder_inner_proc_lock(proc);
if (list_empty(&death->work.entry)) {
death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION;
- if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) {
- list_add_tail(&death->work.entry, &thread->todo);
- } else {
- list_add_tail(&death->work.entry, &proc->todo);
- wake_up_interruptible(&proc->wait);
+ if (thread->looper &
+ (BINDER_LOOPER_STATE_REGISTERED |
+ BINDER_LOOPER_STATE_ENTERED))
+ binder_enqueue_thread_work_ilocked(
+ thread,
+ &death->work);
+ else {
+ binder_enqueue_work_ilocked(
+ &death->work,
+ &proc->todo);
+ binder_wakeup_proc_ilocked(
+ proc);
}
} else {
BUG_ON(death->work.type != BINDER_WORK_DEAD_BINDER);
death->work.type = BINDER_WORK_DEAD_BINDER_AND_CLEAR;
}
+ binder_inner_proc_unlock(proc);
}
+ binder_node_unlock(ref->node);
+ binder_proc_unlock(proc);
} break;
case BC_DEAD_BINDER_DONE: {
struct binder_work *w;
@@ -2114,8 +3875,13 @@ static int binder_thread_write(struct binder_proc *proc,
return -EFAULT;
ptr += sizeof(cookie);
- list_for_each_entry(w, &proc->delivered_death, entry) {
- struct binder_ref_death *tmp_death = container_of(w, struct binder_ref_death, work);
+ binder_inner_proc_lock(proc);
+ list_for_each_entry(w, &proc->delivered_death,
+ entry) {
+ struct binder_ref_death *tmp_death =
+ container_of(w,
+ struct binder_ref_death,
+ work);
if (tmp_death->cookie == cookie) {
death = tmp_death;
@@ -2123,25 +3889,31 @@ static int binder_thread_write(struct binder_proc *proc,
}
}
binder_debug(BINDER_DEBUG_DEAD_BINDER,
- "%d:%d BC_DEAD_BINDER_DONE %016llx found %p\n",
+ "%d:%d BC_DEAD_BINDER_DONE %016llx found %pK\n",
proc->pid, thread->pid, (u64)cookie,
death);
if (death == NULL) {
binder_user_error("%d:%d BC_DEAD_BINDER_DONE %016llx not found\n",
proc->pid, thread->pid, (u64)cookie);
+ binder_inner_proc_unlock(proc);
break;
}
-
- list_del_init(&death->work.entry);
+ binder_dequeue_work_ilocked(&death->work);
if (death->work.type == BINDER_WORK_DEAD_BINDER_AND_CLEAR) {
death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION;
- if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) {
- list_add_tail(&death->work.entry, &thread->todo);
- } else {
- list_add_tail(&death->work.entry, &proc->todo);
- wake_up_interruptible(&proc->wait);
+ if (thread->looper &
+ (BINDER_LOOPER_STATE_REGISTERED |
+ BINDER_LOOPER_STATE_ENTERED))
+ binder_enqueue_thread_work_ilocked(
+ thread, &death->work);
+ else {
+ binder_enqueue_work_ilocked(
+ &death->work,
+ &proc->todo);
+ binder_wakeup_proc_ilocked(proc);
}
}
+ binder_inner_proc_unlock(proc);
} break;
default:
@@ -2159,23 +3931,73 @@ static void binder_stat_br(struct binder_proc *proc,
{
trace_binder_return(cmd);
if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.br)) {
- binder_stats.br[_IOC_NR(cmd)]++;
- proc->stats.br[_IOC_NR(cmd)]++;
- thread->stats.br[_IOC_NR(cmd)]++;
+ atomic_inc(&binder_stats.br[_IOC_NR(cmd)]);
+ atomic_inc(&proc->stats.br[_IOC_NR(cmd)]);
+ atomic_inc(&thread->stats.br[_IOC_NR(cmd)]);
}
}
-static int binder_has_proc_work(struct binder_proc *proc,
- struct binder_thread *thread)
+static int binder_put_node_cmd(struct binder_proc *proc,
+ struct binder_thread *thread,
+ void __user **ptrp,
+ binder_uintptr_t node_ptr,
+ binder_uintptr_t node_cookie,
+ int node_debug_id,
+ uint32_t cmd, const char *cmd_name)
{
- return !list_empty(&proc->todo) ||
- (thread->looper & BINDER_LOOPER_STATE_NEED_RETURN);
+ void __user *ptr = *ptrp;
+
+ if (put_user(cmd, (uint32_t __user *)ptr))
+ return -EFAULT;
+ ptr += sizeof(uint32_t);
+
+ if (put_user(node_ptr, (binder_uintptr_t __user *)ptr))
+ return -EFAULT;
+ ptr += sizeof(binder_uintptr_t);
+
+ if (put_user(node_cookie, (binder_uintptr_t __user *)ptr))
+ return -EFAULT;
+ ptr += sizeof(binder_uintptr_t);
+
+ binder_stat_br(proc, thread, cmd);
+ binder_debug(BINDER_DEBUG_USER_REFS, "%d:%d %s %d u%016llx c%016llx\n",
+ proc->pid, thread->pid, cmd_name, node_debug_id,
+ (u64)node_ptr, (u64)node_cookie);
+
+ *ptrp = ptr;
+ return 0;
}
-static int binder_has_thread_work(struct binder_thread *thread)
+static int binder_wait_for_work(struct binder_thread *thread,
+ bool do_proc_work)
{
- return !list_empty(&thread->todo) || thread->return_error != BR_OK ||
- (thread->looper & BINDER_LOOPER_STATE_NEED_RETURN);
+ DEFINE_WAIT(wait);
+ struct binder_proc *proc = thread->proc;
+ int ret = 0;
+
+ freezer_do_not_count();
+ binder_inner_proc_lock(proc);
+ for (;;) {
+ prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE);
+ if (binder_has_work_ilocked(thread, do_proc_work))
+ break;
+ if (do_proc_work)
+ list_add(&thread->waiting_thread_node,
+ &proc->waiting_threads);
+ binder_inner_proc_unlock(proc);
+ schedule();
+ binder_inner_proc_lock(proc);
+ list_del_init(&thread->waiting_thread_node);
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ }
+ finish_wait(&thread->wait, &wait);
+ binder_inner_proc_unlock(proc);
+ freezer_count();
+
+ return ret;
}
static int binder_thread_read(struct binder_proc *proc,
@@ -2197,37 +4019,15 @@ static int binder_thread_read(struct binder_proc *proc,
}
retry:
- wait_for_proc_work = thread->transaction_stack == NULL &&
- list_empty(&thread->todo);
-
- if (thread->return_error != BR_OK && ptr < end) {
- if (thread->return_error2 != BR_OK) {
- if (put_user(thread->return_error2, (uint32_t __user *)ptr))
- return -EFAULT;
- ptr += sizeof(uint32_t);
- binder_stat_br(proc, thread, thread->return_error2);
- if (ptr == end)
- goto done;
- thread->return_error2 = BR_OK;
- }
- if (put_user(thread->return_error, (uint32_t __user *)ptr))
- return -EFAULT;
- ptr += sizeof(uint32_t);
- binder_stat_br(proc, thread, thread->return_error);
- thread->return_error = BR_OK;
- goto done;
- }
-
+ binder_inner_proc_lock(proc);
+ wait_for_proc_work = binder_available_for_proc_work_ilocked(thread);
+ binder_inner_proc_unlock(proc);
thread->looper |= BINDER_LOOPER_STATE_WAITING;
- if (wait_for_proc_work)
- proc->ready_threads++;
-
- binder_unlock(__func__);
trace_binder_wait_for_work(wait_for_proc_work,
!!thread->transaction_stack,
- !list_empty(&thread->todo));
+ !binder_worklist_empty(proc, &thread->todo));
if (wait_for_proc_work) {
if (!(thread->looper & (BINDER_LOOPER_STATE_REGISTERED |
BINDER_LOOPER_STATE_ENTERED))) {
@@ -2236,24 +4036,16 @@ retry:
wait_event_interruptible(binder_user_error_wait,
binder_stop_on_user_error < 2);
}
- binder_set_nice(proc->default_priority);
- if (non_block) {
- if (!binder_has_proc_work(proc, thread))
- ret = -EAGAIN;
- } else
- ret = wait_event_freezable_exclusive(proc->wait, binder_has_proc_work(proc, thread));
- } else {
- if (non_block) {
- if (!binder_has_thread_work(thread))
- ret = -EAGAIN;
- } else
- ret = wait_event_freezable(thread->wait, binder_has_thread_work(thread));
+ binder_restore_priority(current, proc->default_priority);
}
- binder_lock(__func__);
+ if (non_block) {
+ if (!binder_has_work(thread, wait_for_proc_work))
+ ret = -EAGAIN;
+ } else {
+ ret = binder_wait_for_work(thread, wait_for_proc_work);
+ }
- if (wait_for_proc_work)
- proc->ready_threads--;
thread->looper &= ~BINDER_LOOPER_STATE_WAITING;
if (ret)
@@ -2262,31 +4054,55 @@ retry:
while (1) {
uint32_t cmd;
struct binder_transaction_data tr;
- struct binder_work *w;
+ struct binder_work *w = NULL;
+ struct list_head *list = NULL;
struct binder_transaction *t = NULL;
+ struct binder_thread *t_from;
+
+ binder_inner_proc_lock(proc);
+ if (!binder_worklist_empty_ilocked(&thread->todo))
+ list = &thread->todo;
+ else if (!binder_worklist_empty_ilocked(&proc->todo) &&
+ wait_for_proc_work)
+ list = &proc->todo;
+ else {
+ binder_inner_proc_unlock(proc);
- if (!list_empty(&thread->todo)) {
- w = list_first_entry(&thread->todo, struct binder_work,
- entry);
- } else if (!list_empty(&proc->todo) && wait_for_proc_work) {
- w = list_first_entry(&proc->todo, struct binder_work,
- entry);
- } else {
/* no data added */
- if (ptr - buffer == 4 &&
- !(thread->looper & BINDER_LOOPER_STATE_NEED_RETURN))
+ if (ptr - buffer == 4 && !thread->looper_need_return)
goto retry;
break;
}
- if (end - ptr < sizeof(tr) + 4)
+ if (end - ptr < sizeof(tr) + 4) {
+ binder_inner_proc_unlock(proc);
break;
+ }
+ w = binder_dequeue_work_head_ilocked(list);
+ if (binder_worklist_empty_ilocked(&thread->todo))
+ thread->process_todo = false;
switch (w->type) {
case BINDER_WORK_TRANSACTION: {
+ binder_inner_proc_unlock(proc);
t = container_of(w, struct binder_transaction, work);
} break;
+ case BINDER_WORK_RETURN_ERROR: {
+ struct binder_error *e = container_of(
+ w, struct binder_error, work);
+
+ WARN_ON(e->cmd == BR_OK);
+ binder_inner_proc_unlock(proc);
+ if (put_user(e->cmd, (uint32_t __user *)ptr))
+ return -EFAULT;
+ cmd = e->cmd;
+ e->cmd = BR_OK;
+ ptr += sizeof(uint32_t);
+
+ binder_stat_br(proc, thread, cmd);
+ } break;
case BINDER_WORK_TRANSACTION_COMPLETE: {
+ binder_inner_proc_unlock(proc);
cmd = BR_TRANSACTION_COMPLETE;
if (put_user(cmd, (uint32_t __user *)ptr))
return -EFAULT;
@@ -2296,113 +4112,134 @@ retry:
binder_debug(BINDER_DEBUG_TRANSACTION_COMPLETE,
"%d:%d BR_TRANSACTION_COMPLETE\n",
proc->pid, thread->pid);
-
- list_del(&w->entry);
kfree(w);
binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
} break;
case BINDER_WORK_NODE: {
struct binder_node *node = container_of(w, struct binder_node, work);
- uint32_t cmd = BR_NOOP;
- const char *cmd_name;
- int strong = node->internal_strong_refs || node->local_strong_refs;
- int weak = !hlist_empty(&node->refs) || node->local_weak_refs || strong;
-
- if (weak && !node->has_weak_ref) {
- cmd = BR_INCREFS;
- cmd_name = "BR_INCREFS";
+ int strong, weak;
+ binder_uintptr_t node_ptr = node->ptr;
+ binder_uintptr_t node_cookie = node->cookie;
+ int node_debug_id = node->debug_id;
+ int has_weak_ref;
+ int has_strong_ref;
+ void __user *orig_ptr = ptr;
+
+ BUG_ON(proc != node->proc);
+ strong = node->internal_strong_refs ||
+ node->local_strong_refs;
+ weak = !hlist_empty(&node->refs) ||
+ node->local_weak_refs ||
+ node->tmp_refs || strong;
+ has_strong_ref = node->has_strong_ref;
+ has_weak_ref = node->has_weak_ref;
+
+ if (weak && !has_weak_ref) {
node->has_weak_ref = 1;
node->pending_weak_ref = 1;
node->local_weak_refs++;
- } else if (strong && !node->has_strong_ref) {
- cmd = BR_ACQUIRE;
- cmd_name = "BR_ACQUIRE";
+ }
+ if (strong && !has_strong_ref) {
node->has_strong_ref = 1;
node->pending_strong_ref = 1;
node->local_strong_refs++;
- } else if (!strong && node->has_strong_ref) {
- cmd = BR_RELEASE;
- cmd_name = "BR_RELEASE";
+ }
+ if (!strong && has_strong_ref)
node->has_strong_ref = 0;
- } else if (!weak && node->has_weak_ref) {
- cmd = BR_DECREFS;
- cmd_name = "BR_DECREFS";
+ if (!weak && has_weak_ref)
node->has_weak_ref = 0;
- }
- if (cmd != BR_NOOP) {
- if (put_user(cmd, (uint32_t __user *)ptr))
- return -EFAULT;
- ptr += sizeof(uint32_t);
- if (put_user(node->ptr,
- (binder_uintptr_t __user *)ptr))
- return -EFAULT;
- ptr += sizeof(binder_uintptr_t);
- if (put_user(node->cookie,
- (binder_uintptr_t __user *)ptr))
- return -EFAULT;
- ptr += sizeof(binder_uintptr_t);
-
- binder_stat_br(proc, thread, cmd);
- binder_debug(BINDER_DEBUG_USER_REFS,
- "%d:%d %s %d u%016llx c%016llx\n",
- proc->pid, thread->pid, cmd_name,
- node->debug_id,
- (u64)node->ptr, (u64)node->cookie);
- } else {
- list_del_init(&w->entry);
- if (!weak && !strong) {
- binder_debug(BINDER_DEBUG_INTERNAL_REFS,
- "%d:%d node %d u%016llx c%016llx deleted\n",
- proc->pid, thread->pid,
- node->debug_id,
- (u64)node->ptr,
- (u64)node->cookie);
- rb_erase(&node->rb_node, &proc->nodes);
- kfree(node);
- binder_stats_deleted(BINDER_STAT_NODE);
- } else {
- binder_debug(BINDER_DEBUG_INTERNAL_REFS,
- "%d:%d node %d u%016llx c%016llx state unchanged\n",
- proc->pid, thread->pid,
- node->debug_id,
- (u64)node->ptr,
- (u64)node->cookie);
- }
- }
+ if (!weak && !strong) {
+ binder_debug(BINDER_DEBUG_INTERNAL_REFS,
+ "%d:%d node %d u%016llx c%016llx deleted\n",
+ proc->pid, thread->pid,
+ node_debug_id,
+ (u64)node_ptr,
+ (u64)node_cookie);
+ rb_erase(&node->rb_node, &proc->nodes);
+ binder_inner_proc_unlock(proc);
+ binder_node_lock(node);
+ /*
+ * Acquire the node lock before freeing the
+ * node to serialize with other threads that
+ * may have been holding the node lock while
+ * decrementing this node (avoids race where
+ * this thread frees while the other thread
+ * is unlocking the node after the final
+ * decrement)
+ */
+ binder_node_unlock(node);
+ binder_free_node(node);
+ } else
+ binder_inner_proc_unlock(proc);
+
+ if (weak && !has_weak_ref)
+ ret = binder_put_node_cmd(
+ proc, thread, &ptr, node_ptr,
+ node_cookie, node_debug_id,
+ BR_INCREFS, "BR_INCREFS");
+ if (!ret && strong && !has_strong_ref)
+ ret = binder_put_node_cmd(
+ proc, thread, &ptr, node_ptr,
+ node_cookie, node_debug_id,
+ BR_ACQUIRE, "BR_ACQUIRE");
+ if (!ret && !strong && has_strong_ref)
+ ret = binder_put_node_cmd(
+ proc, thread, &ptr, node_ptr,
+ node_cookie, node_debug_id,
+ BR_RELEASE, "BR_RELEASE");
+ if (!ret && !weak && has_weak_ref)
+ ret = binder_put_node_cmd(
+ proc, thread, &ptr, node_ptr,
+ node_cookie, node_debug_id,
+ BR_DECREFS, "BR_DECREFS");
+ if (orig_ptr == ptr)
+ binder_debug(BINDER_DEBUG_INTERNAL_REFS,
+ "%d:%d node %d u%016llx c%016llx state unchanged\n",
+ proc->pid, thread->pid,
+ node_debug_id,
+ (u64)node_ptr,
+ (u64)node_cookie);
+ if (ret)
+ return ret;
} break;
case BINDER_WORK_DEAD_BINDER:
case BINDER_WORK_DEAD_BINDER_AND_CLEAR:
case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: {
struct binder_ref_death *death;
uint32_t cmd;
+ binder_uintptr_t cookie;
death = container_of(w, struct binder_ref_death, work);
if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION)
cmd = BR_CLEAR_DEATH_NOTIFICATION_DONE;
else
cmd = BR_DEAD_BINDER;
- if (put_user(cmd, (uint32_t __user *)ptr))
- return -EFAULT;
- ptr += sizeof(uint32_t);
- if (put_user(death->cookie,
- (binder_uintptr_t __user *)ptr))
- return -EFAULT;
- ptr += sizeof(binder_uintptr_t);
- binder_stat_br(proc, thread, cmd);
+ cookie = death->cookie;
+
binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION,
"%d:%d %s %016llx\n",
proc->pid, thread->pid,
cmd == BR_DEAD_BINDER ?
"BR_DEAD_BINDER" :
"BR_CLEAR_DEATH_NOTIFICATION_DONE",
- (u64)death->cookie);
-
+ (u64)cookie);
if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) {
- list_del(&w->entry);
+ binder_inner_proc_unlock(proc);
kfree(death);
binder_stats_deleted(BINDER_STAT_DEATH);
- } else
- list_move(&w->entry, &proc->delivered_death);
+ } else {
+ binder_enqueue_work_ilocked(
+ w, &proc->delivered_death);
+ binder_inner_proc_unlock(proc);
+ }
+ if (put_user(cmd, (uint32_t __user *)ptr))
+ return -EFAULT;
+ ptr += sizeof(uint32_t);
+ if (put_user(cookie,
+ (binder_uintptr_t __user *)ptr))
+ return -EFAULT;
+ ptr += sizeof(binder_uintptr_t);
+ binder_stat_br(proc, thread, cmd);
if (cmd == BR_DEAD_BINDER)
goto done; /* DEAD_BINDER notifications can cause transactions */
} break;
@@ -2414,16 +4251,14 @@ retry:
BUG_ON(t->buffer == NULL);
if (t->buffer->target_node) {
struct binder_node *target_node = t->buffer->target_node;
+ struct binder_priority node_prio;
tr.target.ptr = target_node->ptr;
tr.cookie = target_node->cookie;
- t->saved_priority = task_nice(current);
- if (t->priority < target_node->min_priority &&
- !(t->flags & TF_ONE_WAY))
- binder_set_nice(t->priority);
- else if (!(t->flags & TF_ONE_WAY) ||
- t->saved_priority > target_node->min_priority)
- binder_set_nice(target_node->min_priority);
+ node_prio.sched_policy = target_node->sched_policy;
+ node_prio.prio = target_node->min_priority;
+ binder_transaction_priority(current, t, node_prio,
+ target_node->inherit_rt);
cmd = BR_TRANSACTION;
} else {
tr.target.ptr = 0;
@@ -2434,8 +4269,9 @@ retry:
tr.flags = t->flags;
tr.sender_euid = from_kuid(current_user_ns(), t->sender_euid);
- if (t->from) {
- struct task_struct *sender = t->from->proc->tsk;
+ t_from = binder_get_txn_from(t);
+ if (t_from) {
+ struct task_struct *sender = t_from->proc->tsk;
tr.sender_pid = task_tgid_nr_ns(sender,
task_active_pid_ns(current));
@@ -2445,18 +4281,32 @@ retry:
tr.data_size = t->buffer->data_size;
tr.offsets_size = t->buffer->offsets_size;
- tr.data.ptr.buffer = (binder_uintptr_t)(
- (uintptr_t)t->buffer->data +
- proc->user_buffer_offset);
+ tr.data.ptr.buffer = (binder_uintptr_t)
+ ((uintptr_t)t->buffer->data +
+ binder_alloc_get_user_buffer_offset(&proc->alloc));
tr.data.ptr.offsets = tr.data.ptr.buffer +
ALIGN(t->buffer->data_size,
sizeof(void *));
- if (put_user(cmd, (uint32_t __user *)ptr))
+ if (put_user(cmd, (uint32_t __user *)ptr)) {
+ if (t_from)
+ binder_thread_dec_tmpref(t_from);
+
+ binder_cleanup_transaction(t, "put_user failed",
+ BR_FAILED_REPLY);
+
return -EFAULT;
+ }
ptr += sizeof(uint32_t);
- if (copy_to_user(ptr, &tr, sizeof(tr)))
+ if (copy_to_user(ptr, &tr, sizeof(tr))) {
+ if (t_from)
+ binder_thread_dec_tmpref(t_from);
+
+ binder_cleanup_transaction(t, "copy_to_user failed",
+ BR_FAILED_REPLY);
+
return -EFAULT;
+ }
ptr += sizeof(tr);
trace_binder_transaction_received(t);
@@ -2466,21 +4316,22 @@ retry:
proc->pid, thread->pid,
(cmd == BR_TRANSACTION) ? "BR_TRANSACTION" :
"BR_REPLY",
- t->debug_id, t->from ? t->from->proc->pid : 0,
- t->from ? t->from->pid : 0, cmd,
+ t->debug_id, t_from ? t_from->proc->pid : 0,
+ t_from ? t_from->pid : 0, cmd,
t->buffer->data_size, t->buffer->offsets_size,
(u64)tr.data.ptr.buffer, (u64)tr.data.ptr.offsets);
- list_del(&t->work.entry);
+ if (t_from)
+ binder_thread_dec_tmpref(t_from);
t->buffer->allow_user_free = 1;
if (cmd == BR_TRANSACTION && !(t->flags & TF_ONE_WAY)) {
+ binder_inner_proc_lock(thread->proc);
t->to_parent = thread->transaction_stack;
t->to_thread = thread;
thread->transaction_stack = t;
+ binder_inner_proc_unlock(thread->proc);
} else {
- t->buffer->transaction = NULL;
- kfree(t);
- binder_stats_deleted(BINDER_STAT_TRANSACTION);
+ binder_free_transaction(t);
}
break;
}
@@ -2488,45 +4339,52 @@ retry:
done:
*consumed = ptr - buffer;
- if (proc->requested_threads + proc->ready_threads == 0 &&
+ binder_inner_proc_lock(proc);
+ if (proc->requested_threads == 0 &&
+ list_empty(&thread->proc->waiting_threads) &&
proc->requested_threads_started < proc->max_threads &&
(thread->looper & (BINDER_LOOPER_STATE_REGISTERED |
BINDER_LOOPER_STATE_ENTERED)) /* the user-space code fails to */
/*spawn a new thread if we leave this out */) {
proc->requested_threads++;
+ binder_inner_proc_unlock(proc);
binder_debug(BINDER_DEBUG_THREADS,
"%d:%d BR_SPAWN_LOOPER\n",
proc->pid, thread->pid);
if (put_user(BR_SPAWN_LOOPER, (uint32_t __user *)buffer))
return -EFAULT;
binder_stat_br(proc, thread, BR_SPAWN_LOOPER);
- }
+ } else
+ binder_inner_proc_unlock(proc);
return 0;
}
-static void binder_release_work(struct list_head *list)
+static void binder_release_work(struct binder_proc *proc,
+ struct list_head *list)
{
struct binder_work *w;
- while (!list_empty(list)) {
- w = list_first_entry(list, struct binder_work, entry);
- list_del_init(&w->entry);
+ while (1) {
+ w = binder_dequeue_work_head(proc, list);
+ if (!w)
+ return;
+
switch (w->type) {
case BINDER_WORK_TRANSACTION: {
struct binder_transaction *t;
t = container_of(w, struct binder_transaction, work);
- if (t->buffer->target_node &&
- !(t->flags & TF_ONE_WAY)) {
- binder_send_failed_reply(t, BR_DEAD_REPLY);
- } else {
- binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
- "undelivered transaction %d\n",
- t->debug_id);
- t->buffer->transaction = NULL;
- kfree(t);
- binder_stats_deleted(BINDER_STAT_TRANSACTION);
- }
+
+ binder_cleanup_transaction(t, "process died.",
+ BR_DEAD_REPLY);
+ } break;
+ case BINDER_WORK_RETURN_ERROR: {
+ struct binder_error *e = container_of(
+ w, struct binder_error, work);
+
+ binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
+ "undelivered TRANSACTION_ERROR: %u\n",
+ e->cmd);
} break;
case BINDER_WORK_TRANSACTION_COMPLETE: {
binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
@@ -2554,7 +4412,8 @@ static void binder_release_work(struct list_head *list)
}
-static struct binder_thread *binder_get_thread(struct binder_proc *proc)
+static struct binder_thread *binder_get_thread_ilocked(
+ struct binder_proc *proc, struct binder_thread *new_thread)
{
struct binder_thread *thread = NULL;
struct rb_node *parent = NULL;
@@ -2569,38 +4428,102 @@ static struct binder_thread *binder_get_thread(struct binder_proc *proc)
else if (current->pid > thread->pid)
p = &(*p)->rb_right;
else
- break;
+ return thread;
}
- if (*p == NULL) {
- thread = kzalloc(sizeof(*thread), GFP_KERNEL);
- if (thread == NULL)
+ if (!new_thread)
+ return NULL;
+ thread = new_thread;
+ binder_stats_created(BINDER_STAT_THREAD);
+ thread->proc = proc;
+ thread->pid = current->pid;
+ get_task_struct(current);
+ thread->task = current;
+ atomic_set(&thread->tmp_ref, 0);
+ init_waitqueue_head(&thread->wait);
+ INIT_LIST_HEAD(&thread->todo);
+ rb_link_node(&thread->rb_node, parent, p);
+ rb_insert_color(&thread->rb_node, &proc->threads);
+ thread->looper_need_return = true;
+ thread->return_error.work.type = BINDER_WORK_RETURN_ERROR;
+ thread->return_error.cmd = BR_OK;
+ thread->reply_error.work.type = BINDER_WORK_RETURN_ERROR;
+ thread->reply_error.cmd = BR_OK;
+ INIT_LIST_HEAD(&new_thread->waiting_thread_node);
+ return thread;
+}
+
+static struct binder_thread *binder_get_thread(struct binder_proc *proc)
+{
+ struct binder_thread *thread;
+ struct binder_thread *new_thread;
+
+ binder_inner_proc_lock(proc);
+ thread = binder_get_thread_ilocked(proc, NULL);
+ binder_inner_proc_unlock(proc);
+ if (!thread) {
+ new_thread = kzalloc(sizeof(*thread), GFP_KERNEL);
+ if (new_thread == NULL)
return NULL;
- binder_stats_created(BINDER_STAT_THREAD);
- thread->proc = proc;
- thread->pid = current->pid;
- init_waitqueue_head(&thread->wait);
- INIT_LIST_HEAD(&thread->todo);
- rb_link_node(&thread->rb_node, parent, p);
- rb_insert_color(&thread->rb_node, &proc->threads);
- thread->looper |= BINDER_LOOPER_STATE_NEED_RETURN;
- thread->return_error = BR_OK;
- thread->return_error2 = BR_OK;
+ binder_inner_proc_lock(proc);
+ thread = binder_get_thread_ilocked(proc, new_thread);
+ binder_inner_proc_unlock(proc);
+ if (thread != new_thread)
+ kfree(new_thread);
}
return thread;
}
-static int binder_free_thread(struct binder_proc *proc,
- struct binder_thread *thread)
+static void binder_free_proc(struct binder_proc *proc)
+{
+ BUG_ON(!list_empty(&proc->todo));
+ BUG_ON(!list_empty(&proc->delivered_death));
+ binder_alloc_deferred_release(&proc->alloc);
+ put_task_struct(proc->tsk);
+ binder_stats_deleted(BINDER_STAT_PROC);
+ kfree(proc);
+}
+
+static void binder_free_thread(struct binder_thread *thread)
+{
+ BUG_ON(!list_empty(&thread->todo));
+ binder_stats_deleted(BINDER_STAT_THREAD);
+ binder_proc_dec_tmpref(thread->proc);
+ put_task_struct(thread->task);
+ kfree(thread);
+}
+
+static int binder_thread_release(struct binder_proc *proc,
+ struct binder_thread *thread)
{
struct binder_transaction *t;
struct binder_transaction *send_reply = NULL;
int active_transactions = 0;
-
+ struct binder_transaction *last_t = NULL;
+
+ binder_inner_proc_lock(thread->proc);
+ /*
+ * take a ref on the proc so it survives
+ * after we remove this thread from proc->threads.
+ * The corresponding dec is when we actually
+ * free the thread in binder_free_thread()
+ */
+ proc->tmp_ref++;
+ /*
+ * take a ref on this thread to ensure it
+ * survives while we are releasing it
+ */
+ atomic_inc(&thread->tmp_ref);
rb_erase(&thread->rb_node, &proc->threads);
t = thread->transaction_stack;
- if (t && t->to_thread == thread)
- send_reply = t;
+ if (t) {
+ spin_lock(&t->lock);
+ if (t->to_thread == thread)
+ send_reply = t;
+ }
+ thread->is_dead = true;
+
while (t) {
+ last_t = t;
active_transactions++;
binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
"release %d:%d transaction %d %s, still active\n",
@@ -2621,12 +4544,37 @@ static int binder_free_thread(struct binder_proc *proc,
t = t->from_parent;
} else
BUG();
+ spin_unlock(&last_t->lock);
+ if (t)
+ spin_lock(&t->lock);
}
+
+ /*
+ * If this thread used poll, make sure we remove the waitqueue
+ * from any epoll data structures holding it with POLLFREE.
+ * waitqueue_active() is safe to use here because we're holding
+ * the inner lock.
+ */
+ if ((thread->looper & BINDER_LOOPER_STATE_POLL) &&
+ waitqueue_active(&thread->wait)) {
+ wake_up_poll(&thread->wait, POLLHUP | POLLFREE);
+ }
+
+ binder_inner_proc_unlock(thread->proc);
+
+ /*
+ * This is needed to avoid races between wake_up_poll() above and
+ * and ep_remove_waitqueue() called for other reasons (eg the epoll file
+ * descriptor being closed); ep_remove_waitqueue() holds an RCU read
+ * lock, so we can be sure it's done after calling synchronize_rcu().
+ */
+ if (thread->looper & BINDER_LOOPER_STATE_POLL)
+ synchronize_rcu();
+
if (send_reply)
binder_send_failed_reply(send_reply, BR_DEAD_REPLY);
- binder_release_work(&thread->todo);
- kfree(thread);
- binder_stats_deleted(BINDER_STAT_THREAD);
+ binder_release_work(proc, &thread->todo);
+ binder_thread_dec_tmpref(thread);
return active_transactions;
}
@@ -2635,34 +4583,23 @@ static unsigned int binder_poll(struct file *filp,
{
struct binder_proc *proc = filp->private_data;
struct binder_thread *thread = NULL;
- int wait_for_proc_work;
-
- binder_lock(__func__);
+ bool wait_for_proc_work;
thread = binder_get_thread(proc);
- if (!thread) {
- binder_unlock(__func__);
+ if (!thread)
return POLLERR;
- }
- wait_for_proc_work = thread->transaction_stack == NULL &&
- list_empty(&thread->todo) && thread->return_error == BR_OK;
+ binder_inner_proc_lock(thread->proc);
+ thread->looper |= BINDER_LOOPER_STATE_POLL;
+ wait_for_proc_work = binder_available_for_proc_work_ilocked(thread);
- binder_unlock(__func__);
+ binder_inner_proc_unlock(thread->proc);
+
+ poll_wait(filp, &thread->wait, wait);
+
+ if (binder_has_work(thread, wait_for_proc_work))
+ return POLLIN;
- if (wait_for_proc_work) {
- if (binder_has_proc_work(proc, thread))
- return POLLIN;
- poll_wait(filp, &proc->wait, wait);
- if (binder_has_proc_work(proc, thread))
- return POLLIN;
- } else {
- if (binder_has_thread_work(thread))
- return POLLIN;
- poll_wait(filp, &thread->wait, wait);
- if (binder_has_thread_work(thread))
- return POLLIN;
- }
return 0;
}
@@ -2709,8 +4646,10 @@ static int binder_ioctl_write_read(struct file *filp,
&bwr.read_consumed,
filp->f_flags & O_NONBLOCK);
trace_binder_read_done(ret);
- if (!list_empty(&proc->todo))
- wake_up_interruptible(&proc->wait);
+ binder_inner_proc_lock(proc);
+ if (!binder_worklist_empty_ilocked(&proc->todo))
+ binder_wakeup_proc_ilocked(proc);
+ binder_inner_proc_unlock(proc);
if (ret < 0) {
if (copy_to_user(ubuf, &bwr, sizeof(bwr)))
ret = -EFAULT;
@@ -2734,9 +4673,12 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp)
{
int ret = 0;
struct binder_proc *proc = filp->private_data;
+ struct binder_context *context = proc->context;
+ struct binder_node *new_node;
kuid_t curr_euid = current_euid();
- if (binder_context_mgr_node != NULL) {
+ mutex_lock(&context->context_mgr_node_lock);
+ if (context->binder_context_mgr_node) {
pr_err("BINDER_SET_CONTEXT_MGR already set\n");
ret = -EBUSY;
goto out;
@@ -2744,31 +4686,96 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp)
ret = security_binder_set_context_mgr(proc->tsk);
if (ret < 0)
goto out;
- if (uid_valid(binder_context_mgr_uid)) {
- if (!uid_eq(binder_context_mgr_uid, curr_euid)) {
+ if (uid_valid(context->binder_context_mgr_uid)) {
+ if (!uid_eq(context->binder_context_mgr_uid, curr_euid)) {
pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n",
from_kuid(&init_user_ns, curr_euid),
from_kuid(&init_user_ns,
- binder_context_mgr_uid));
+ context->binder_context_mgr_uid));
ret = -EPERM;
goto out;
}
} else {
- binder_context_mgr_uid = curr_euid;
+ context->binder_context_mgr_uid = curr_euid;
}
- binder_context_mgr_node = binder_new_node(proc, 0, 0);
- if (binder_context_mgr_node == NULL) {
+ new_node = binder_new_node(proc, NULL);
+ if (!new_node) {
ret = -ENOMEM;
goto out;
}
- binder_context_mgr_node->local_weak_refs++;
- binder_context_mgr_node->local_strong_refs++;
- binder_context_mgr_node->has_strong_ref = 1;
- binder_context_mgr_node->has_weak_ref = 1;
+ binder_node_lock(new_node);
+ new_node->local_weak_refs++;
+ new_node->local_strong_refs++;
+ new_node->has_strong_ref = 1;
+ new_node->has_weak_ref = 1;
+ context->binder_context_mgr_node = new_node;
+ binder_node_unlock(new_node);
+ binder_put_node(new_node);
out:
+ mutex_unlock(&context->context_mgr_node_lock);
return ret;
}
+static int binder_ioctl_get_node_info_for_ref(struct binder_proc *proc,
+ struct binder_node_info_for_ref *info)
+{
+ struct binder_node *node;
+ struct binder_context *context = proc->context;
+ __u32 handle = info->handle;
+
+ if (info->strong_count || info->weak_count || info->reserved1 ||
+ info->reserved2 || info->reserved3) {
+ binder_user_error("%d BINDER_GET_NODE_INFO_FOR_REF: only handle may be non-zero.",
+ proc->pid);
+ return -EINVAL;
+ }
+
+ /* This ioctl may only be used by the context manager */
+ mutex_lock(&context->context_mgr_node_lock);
+ if (!context->binder_context_mgr_node ||
+ context->binder_context_mgr_node->proc != proc) {
+ mutex_unlock(&context->context_mgr_node_lock);
+ return -EPERM;
+ }
+ mutex_unlock(&context->context_mgr_node_lock);
+
+ node = binder_get_node_from_ref(proc, handle, true, NULL);
+ if (!node)
+ return -EINVAL;
+
+ info->strong_count = node->local_strong_refs +
+ node->internal_strong_refs;
+ info->weak_count = node->local_weak_refs;
+
+ binder_put_node(node);
+
+ return 0;
+}
+
+static int binder_ioctl_get_node_debug_info(struct binder_proc *proc,
+ struct binder_node_debug_info *info) {
+ struct rb_node *n;
+ binder_uintptr_t ptr = info->ptr;
+
+ memset(info, 0, sizeof(*info));
+
+ binder_inner_proc_lock(proc);
+ for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) {
+ struct binder_node *node = rb_entry(n, struct binder_node,
+ rb_node);
+ if (node->ptr > ptr) {
+ info->ptr = node->ptr;
+ info->cookie = node->cookie;
+ info->has_strong_ref = node->has_strong_ref;
+ info->has_weak_ref = node->has_weak_ref;
+ break;
+ }
+ }
+ binder_inner_proc_unlock(proc);
+
+ return 0;
+}
+
static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
int ret;
@@ -2780,13 +4787,14 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
/*pr_info("binder_ioctl: %d:%d %x %lx\n",
proc->pid, current->pid, cmd, arg);*/
+ binder_selftest_alloc(&proc->alloc);
+
trace_binder_ioctl(cmd, arg);
ret = wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2);
if (ret)
goto err_unlocked;
- binder_lock(__func__);
thread = binder_get_thread(proc);
if (thread == NULL) {
ret = -ENOMEM;
@@ -2799,12 +4807,19 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (ret)
goto err;
break;
- case BINDER_SET_MAX_THREADS:
- if (copy_from_user(&proc->max_threads, ubuf, sizeof(proc->max_threads))) {
+ case BINDER_SET_MAX_THREADS: {
+ int max_threads;
+
+ if (copy_from_user(&max_threads, ubuf,
+ sizeof(max_threads))) {
ret = -EINVAL;
goto err;
}
+ binder_inner_proc_lock(proc);
+ proc->max_threads = max_threads;
+ binder_inner_proc_unlock(proc);
break;
+ }
case BINDER_SET_CONTEXT_MGR:
ret = binder_ioctl_set_ctx_mgr(filp);
if (ret)
@@ -2813,7 +4828,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case BINDER_THREAD_EXIT:
binder_debug(BINDER_DEBUG_THREADS, "%d:%d exit\n",
proc->pid, thread->pid);
- binder_free_thread(proc, thread);
+ binder_thread_release(proc, thread);
thread = NULL;
break;
case BINDER_VERSION: {
@@ -2830,6 +4845,43 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
break;
}
+ case BINDER_GET_NODE_INFO_FOR_REF: {
+ struct binder_node_info_for_ref info;
+
+ if (copy_from_user(&info, ubuf, sizeof(info))) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ ret = binder_ioctl_get_node_info_for_ref(proc, &info);
+ if (ret < 0)
+ goto err;
+
+ if (copy_to_user(ubuf, &info, sizeof(info))) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ break;
+ }
+ case BINDER_GET_NODE_DEBUG_INFO: {
+ struct binder_node_debug_info info;
+
+ if (copy_from_user(&info, ubuf, sizeof(info))) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ ret = binder_ioctl_get_node_debug_info(proc, &info);
+ if (ret < 0)
+ goto err;
+
+ if (copy_to_user(ubuf, &info, sizeof(info))) {
+ ret = -EFAULT;
+ goto err;
+ }
+ break;
+ }
default:
ret = -EINVAL;
goto err;
@@ -2837,8 +4889,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
ret = 0;
err:
if (thread)
- thread->looper &= ~BINDER_LOOPER_STATE_NEED_RETURN;
- binder_unlock(__func__);
+ thread->looper_need_return = false;
wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2);
if (ret && ret != -ERESTARTSYS)
pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret);
@@ -2867,8 +4918,7 @@ static void binder_vma_close(struct vm_area_struct *vma)
proc->pid, vma->vm_start, vma->vm_end,
(vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags,
(unsigned long)pgprot_val(vma->vm_page_prot));
- proc->vma = NULL;
- proc->vma_vm_mm = NULL;
+ binder_alloc_vma_close(&proc->alloc);
binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES);
}
@@ -2886,10 +4936,8 @@ static const struct vm_operations_struct binder_vm_ops = {
static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
{
int ret;
- struct vm_struct *area;
struct binder_proc *proc = filp->private_data;
const char *failure_string;
- struct binder_buffer *buffer;
if (proc->tsk != current->group_leader)
return -EINVAL;
@@ -2898,8 +4946,8 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
vma->vm_end = vma->vm_start + SZ_4M;
binder_debug(BINDER_DEBUG_OPEN_CLOSE,
- "binder_mmap: %d %lx-%lx (%ld K) vma %lx pagep %lx\n",
- proc->pid, vma->vm_start, vma->vm_end,
+ "%s: %d %lx-%lx (%ld K) vma %lx pagep %lx\n",
+ __func__, proc->pid, vma->vm_start, vma->vm_end,
(vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags,
(unsigned long)pgprot_val(vma->vm_page_prot));
@@ -2908,78 +4956,22 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
failure_string = "bad vm_flags";
goto err_bad_arg;
}
- vma->vm_flags = (vma->vm_flags | VM_DONTCOPY) & ~VM_MAYWRITE;
-
- mutex_lock(&binder_mmap_lock);
- if (proc->buffer) {
- ret = -EBUSY;
- failure_string = "already mapped";
- goto err_already_mapped;
- }
-
- area = get_vm_area(vma->vm_end - vma->vm_start, VM_IOREMAP);
- if (area == NULL) {
- ret = -ENOMEM;
- failure_string = "get_vm_area";
- goto err_get_vm_area_failed;
- }
- proc->buffer = area->addr;
- proc->user_buffer_offset = vma->vm_start - (uintptr_t)proc->buffer;
- mutex_unlock(&binder_mmap_lock);
-
-#ifdef CONFIG_CPU_CACHE_VIPT
- if (cache_is_vipt_aliasing()) {
- while (CACHE_COLOUR((vma->vm_start ^ (uint32_t)proc->buffer))) {
- pr_info("binder_mmap: %d %lx-%lx maps %p bad alignment\n", proc->pid, vma->vm_start, vma->vm_end, proc->buffer);
- vma->vm_start += PAGE_SIZE;
- }
- }
-#endif
- proc->pages = kzalloc(sizeof(proc->pages[0]) * ((vma->vm_end - vma->vm_start) / PAGE_SIZE), GFP_KERNEL);
- if (proc->pages == NULL) {
- ret = -ENOMEM;
- failure_string = "alloc page array";
- goto err_alloc_pages_failed;
- }
- proc->buffer_size = vma->vm_end - vma->vm_start;
+ vma->vm_flags |= VM_DONTCOPY | VM_MIXEDMAP;
+ vma->vm_flags &= ~VM_MAYWRITE;
vma->vm_ops = &binder_vm_ops;
vma->vm_private_data = proc;
- if (binder_update_page_range(proc, 1, proc->buffer, proc->buffer + PAGE_SIZE, vma)) {
- ret = -ENOMEM;
- failure_string = "alloc small buf";
- goto err_alloc_small_buf_failed;
- }
- buffer = proc->buffer;
- INIT_LIST_HEAD(&proc->buffers);
- list_add(&buffer->entry, &proc->buffers);
- buffer->free = 1;
- binder_insert_free_buffer(proc, buffer);
- proc->free_async_space = proc->buffer_size / 2;
- barrier();
+ ret = binder_alloc_mmap_handler(&proc->alloc, vma);
+ if (ret)
+ return ret;
mutex_lock(&proc->files_lock);
proc->files = get_files_struct(current);
mutex_unlock(&proc->files_lock);
- proc->vma = vma;
- proc->vma_vm_mm = vma->vm_mm;
-
- /*pr_info("binder_mmap: %d %lx-%lx maps %p\n",
- proc->pid, vma->vm_start, vma->vm_end, proc->buffer);*/
return 0;
-err_alloc_small_buf_failed:
- kfree(proc->pages);
- proc->pages = NULL;
-err_alloc_pages_failed:
- mutex_lock(&binder_mmap_lock);
- vfree(proc->buffer);
- proc->buffer = NULL;
-err_get_vm_area_failed:
-err_already_mapped:
- mutex_unlock(&binder_mmap_lock);
err_bad_arg:
- pr_err("binder_mmap: %d %lx-%lx %s failed %d\n",
+ pr_err("%s: %d %lx-%lx %s failed %d\n", __func__,
proc->pid, vma->vm_start, vma->vm_end, failure_string, ret);
return ret;
}
@@ -2987,36 +4979,58 @@ err_bad_arg:
static int binder_open(struct inode *nodp, struct file *filp)
{
struct binder_proc *proc;
+ struct binder_device *binder_dev;
- binder_debug(BINDER_DEBUG_OPEN_CLOSE, "binder_open: %d:%d\n",
+ binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d:%d\n", __func__,
current->group_leader->pid, current->pid);
proc = kzalloc(sizeof(*proc), GFP_KERNEL);
if (proc == NULL)
return -ENOMEM;
+ spin_lock_init(&proc->inner_lock);
+ spin_lock_init(&proc->outer_lock);
get_task_struct(current->group_leader);
proc->tsk = current->group_leader;
mutex_init(&proc->files_lock);
INIT_LIST_HEAD(&proc->todo);
- init_waitqueue_head(&proc->wait);
- proc->default_priority = task_nice(current);
+ if (binder_supported_policy(current->policy)) {
+ proc->default_priority.sched_policy = current->policy;
+ proc->default_priority.prio = current->normal_prio;
+ } else {
+ proc->default_priority.sched_policy = SCHED_NORMAL;
+ proc->default_priority.prio = NICE_TO_PRIO(0);
+ }
- binder_lock(__func__);
+ binder_dev = container_of(filp->private_data, struct binder_device,
+ miscdev);
+ proc->context = &binder_dev->context;
+ binder_alloc_init(&proc->alloc);
binder_stats_created(BINDER_STAT_PROC);
- hlist_add_head(&proc->proc_node, &binder_procs);
proc->pid = current->group_leader->pid;
INIT_LIST_HEAD(&proc->delivered_death);
+ INIT_LIST_HEAD(&proc->waiting_threads);
filp->private_data = proc;
- binder_unlock(__func__);
+ mutex_lock(&binder_procs_lock);
+ hlist_add_head(&proc->proc_node, &binder_procs);
+ mutex_unlock(&binder_procs_lock);
if (binder_debugfs_dir_entry_proc) {
char strbuf[11];
snprintf(strbuf, sizeof(strbuf), "%u", proc->pid);
- proc->debugfs_entry = debugfs_create_file(strbuf, S_IRUGO,
- binder_debugfs_dir_entry_proc, proc, &binder_proc_fops);
+ /*
+ * proc debug entries are shared between contexts, so
+ * this will fail if the process tries to open the driver
+ * again with a different context. The priting code will
+ * anyway print all contexts that a given PID has, so this
+ * is not a problem.
+ */
+ proc->debugfs_entry = debugfs_create_file(strbuf, 0444,
+ binder_debugfs_dir_entry_proc,
+ (void *)(unsigned long)proc->pid,
+ &binder_proc_fops);
}
return 0;
@@ -3036,16 +5050,17 @@ static void binder_deferred_flush(struct binder_proc *proc)
struct rb_node *n;
int wake_count = 0;
+ binder_inner_proc_lock(proc);
for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) {
struct binder_thread *thread = rb_entry(n, struct binder_thread, rb_node);
- thread->looper |= BINDER_LOOPER_STATE_NEED_RETURN;
+ thread->looper_need_return = true;
if (thread->looper & BINDER_LOOPER_STATE_WAITING) {
wake_up_interruptible(&thread->wait);
wake_count++;
}
}
- wake_up_interruptible_all(&proc->wait);
+ binder_inner_proc_unlock(proc);
binder_debug(BINDER_DEBUG_OPEN_CLOSE,
"binder_flush: %d woke %d threads\n", proc->pid,
@@ -3066,13 +5081,21 @@ static int binder_node_release(struct binder_node *node, int refs)
{
struct binder_ref *ref;
int death = 0;
-
- list_del_init(&node->work.entry);
- binder_release_work(&node->async_todo);
-
- if (hlist_empty(&node->refs)) {
- kfree(node);
- binder_stats_deleted(BINDER_STAT_NODE);
+ struct binder_proc *proc = node->proc;
+
+ binder_release_work(proc, &node->async_todo);
+
+ binder_node_lock(node);
+ binder_inner_proc_lock(proc);
+ binder_dequeue_work_ilocked(&node->work);
+ /*
+ * The caller must have taken a temporary ref on the node,
+ */
+ BUG_ON(!node->tmp_refs);
+ if (hlist_empty(&node->refs) && node->tmp_refs == 1) {
+ binder_inner_proc_unlock(proc);
+ binder_node_unlock(node);
+ binder_free_node(node);
return refs;
}
@@ -3080,59 +5103,84 @@ static int binder_node_release(struct binder_node *node, int refs)
node->proc = NULL;
node->local_strong_refs = 0;
node->local_weak_refs = 0;
+ binder_inner_proc_unlock(proc);
+
+ spin_lock(&binder_dead_nodes_lock);
hlist_add_head(&node->dead_node, &binder_dead_nodes);
+ spin_unlock(&binder_dead_nodes_lock);
hlist_for_each_entry(ref, &node->refs, node_entry) {
refs++;
-
- if (!ref->death)
+ /*
+ * Need the node lock to synchronize
+ * with new notification requests and the
+ * inner lock to synchronize with queued
+ * death notifications.
+ */
+ binder_inner_proc_lock(ref->proc);
+ if (!ref->death) {
+ binder_inner_proc_unlock(ref->proc);
continue;
+ }
death++;
- if (list_empty(&ref->death->work.entry)) {
- ref->death->work.type = BINDER_WORK_DEAD_BINDER;
- list_add_tail(&ref->death->work.entry,
- &ref->proc->todo);
- wake_up_interruptible(&ref->proc->wait);
- } else
- BUG();
+ BUG_ON(!list_empty(&ref->death->work.entry));
+ ref->death->work.type = BINDER_WORK_DEAD_BINDER;
+ binder_enqueue_work_ilocked(&ref->death->work,
+ &ref->proc->todo);
+ binder_wakeup_proc_ilocked(ref->proc);
+ binder_inner_proc_unlock(ref->proc);
}
binder_debug(BINDER_DEBUG_DEAD_BINDER,
"node %d now dead, refs %d, death %d\n",
node->debug_id, refs, death);
+ binder_node_unlock(node);
+ binder_put_node(node);
return refs;
}
static void binder_deferred_release(struct binder_proc *proc)
{
- struct binder_transaction *t;
+ struct binder_context *context = proc->context;
struct rb_node *n;
- int threads, nodes, incoming_refs, outgoing_refs, buffers,
- active_transactions, page_count;
+ int threads, nodes, incoming_refs, outgoing_refs, active_transactions;
- BUG_ON(proc->vma);
BUG_ON(proc->files);
+ mutex_lock(&binder_procs_lock);
hlist_del(&proc->proc_node);
+ mutex_unlock(&binder_procs_lock);
- if (binder_context_mgr_node && binder_context_mgr_node->proc == proc) {
+ mutex_lock(&context->context_mgr_node_lock);
+ if (context->binder_context_mgr_node &&
+ context->binder_context_mgr_node->proc == proc) {
binder_debug(BINDER_DEBUG_DEAD_BINDER,
"%s: %d context_mgr_node gone\n",
__func__, proc->pid);
- binder_context_mgr_node = NULL;
+ context->binder_context_mgr_node = NULL;
}
-
+ mutex_unlock(&context->context_mgr_node_lock);
+ binder_inner_proc_lock(proc);
+ /*
+ * Make sure proc stays alive after we
+ * remove all the threads
+ */
+ proc->tmp_ref++;
+
+ proc->is_dead = true;
threads = 0;
active_transactions = 0;
while ((n = rb_first(&proc->threads))) {
struct binder_thread *thread;
thread = rb_entry(n, struct binder_thread, rb_node);
+ binder_inner_proc_unlock(proc);
threads++;
- active_transactions += binder_free_thread(proc, thread);
+ active_transactions += binder_thread_release(proc, thread);
+ binder_inner_proc_lock(proc);
}
nodes = 0;
@@ -3142,73 +5190,42 @@ static void binder_deferred_release(struct binder_proc *proc)
node = rb_entry(n, struct binder_node, rb_node);
nodes++;
+ /*
+ * take a temporary ref on the node before
+ * calling binder_node_release() which will either
+ * kfree() the node or call binder_put_node()
+ */
+ binder_inc_node_tmpref_ilocked(node);
rb_erase(&node->rb_node, &proc->nodes);
+ binder_inner_proc_unlock(proc);
incoming_refs = binder_node_release(node, incoming_refs);
+ binder_inner_proc_lock(proc);
}
+ binder_inner_proc_unlock(proc);
outgoing_refs = 0;
+ binder_proc_lock(proc);
while ((n = rb_first(&proc->refs_by_desc))) {
struct binder_ref *ref;
ref = rb_entry(n, struct binder_ref, rb_node_desc);
outgoing_refs++;
- binder_delete_ref(ref);
- }
-
- binder_release_work(&proc->todo);
- binder_release_work(&proc->delivered_death);
-
- buffers = 0;
- while ((n = rb_first(&proc->allocated_buffers))) {
- struct binder_buffer *buffer;
-
- buffer = rb_entry(n, struct binder_buffer, rb_node);
-
- t = buffer->transaction;
- if (t) {
- t->buffer = NULL;
- buffer->transaction = NULL;
- pr_err("release proc %d, transaction %d, not freed\n",
- proc->pid, t->debug_id);
- /*BUG();*/
- }
-
- binder_free_buf(proc, buffer);
- buffers++;
+ binder_cleanup_ref_olocked(ref);
+ binder_proc_unlock(proc);
+ binder_free_ref(ref);
+ binder_proc_lock(proc);
}
+ binder_proc_unlock(proc);
- binder_stats_deleted(BINDER_STAT_PROC);
-
- page_count = 0;
- if (proc->pages) {
- int i;
-
- for (i = 0; i < proc->buffer_size / PAGE_SIZE; i++) {
- void *page_addr;
-
- if (!proc->pages[i])
- continue;
-
- page_addr = proc->buffer + i * PAGE_SIZE;
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%s: %d: page %d at %p not freed\n",
- __func__, proc->pid, i, page_addr);
- unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE);
- __free_page(proc->pages[i]);
- page_count++;
- }
- kfree(proc->pages);
- vfree(proc->buffer);
- }
-
- put_task_struct(proc->tsk);
+ binder_release_work(proc, &proc->todo);
+ binder_release_work(proc, &proc->delivered_death);
binder_debug(BINDER_DEBUG_OPEN_CLOSE,
- "%s: %d threads %d, nodes %d (ref %d), refs %d, active transactions %d, buffers %d, pages %d\n",
+ "%s: %d threads %d, nodes %d (ref %d), refs %d, active transactions %d\n",
__func__, proc->pid, threads, nodes, incoming_refs,
- outgoing_refs, active_transactions, buffers, page_count);
+ outgoing_refs, active_transactions);
- kfree(proc);
+ binder_proc_dec_tmpref(proc);
}
static void binder_deferred_func(struct work_struct *work)
@@ -3219,7 +5236,6 @@ static void binder_deferred_func(struct work_struct *work)
int defer;
do {
- binder_lock(__func__);
mutex_lock(&binder_deferred_lock);
if (!hlist_empty(&binder_deferred_list)) {
proc = hlist_entry(binder_deferred_list.first,
@@ -3248,7 +5264,6 @@ static void binder_deferred_func(struct work_struct *work)
if (defer & BINDER_DEFERRED_RELEASE)
binder_deferred_release(proc); /* frees proc */
- binder_unlock(__func__);
if (files)
put_files_struct(files);
} while (proc);
@@ -3268,41 +5283,52 @@ binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer)
mutex_unlock(&binder_deferred_lock);
}
-static void print_binder_transaction(struct seq_file *m, const char *prefix,
- struct binder_transaction *t)
+static void print_binder_transaction_ilocked(struct seq_file *m,
+ struct binder_proc *proc,
+ const char *prefix,
+ struct binder_transaction *t)
{
+ struct binder_proc *to_proc;
+ struct binder_buffer *buffer = t->buffer;
+
+ spin_lock(&t->lock);
+ to_proc = t->to_proc;
seq_printf(m,
- "%s %d: %p from %d:%d to %d:%d code %x flags %x pri %ld r%d",
+ "%s %d: %pK from %d:%d to %d:%d code %x flags %x pri %d:%d r%d",
prefix, t->debug_id, t,
t->from ? t->from->proc->pid : 0,
t->from ? t->from->pid : 0,
- t->to_proc ? t->to_proc->pid : 0,
+ to_proc ? to_proc->pid : 0,
t->to_thread ? t->to_thread->pid : 0,
- t->code, t->flags, t->priority, t->need_reply);
- if (t->buffer == NULL) {
- seq_puts(m, " buffer free\n");
+ t->code, t->flags, t->priority.sched_policy,
+ t->priority.prio, t->need_reply);
+ spin_unlock(&t->lock);
+
+ if (proc != to_proc) {
+ /*
+ * Can only safely deref buffer if we are holding the
+ * correct proc inner lock for this node
+ */
+ seq_puts(m, "\n");
return;
}
- if (t->buffer->target_node)
- seq_printf(m, " node %d",
- t->buffer->target_node->debug_id);
- seq_printf(m, " size %zd:%zd data %p\n",
- t->buffer->data_size, t->buffer->offsets_size,
- t->buffer->data);
-}
-static void print_binder_buffer(struct seq_file *m, const char *prefix,
- struct binder_buffer *buffer)
-{
- seq_printf(m, "%s %d: %p size %zd:%zd %s\n",
- prefix, buffer->debug_id, buffer->data,
+ if (buffer == NULL) {
+ seq_puts(m, " buffer free\n");
+ return;
+ }
+ if (buffer->target_node)
+ seq_printf(m, " node %d", buffer->target_node->debug_id);
+ seq_printf(m, " size %zd:%zd data %pK\n",
buffer->data_size, buffer->offsets_size,
- buffer->transaction ? "active" : "delivered");
+ buffer->data);
}
-static void print_binder_work(struct seq_file *m, const char *prefix,
- const char *transaction_prefix,
- struct binder_work *w)
+static void print_binder_work_ilocked(struct seq_file *m,
+ struct binder_proc *proc,
+ const char *prefix,
+ const char *transaction_prefix,
+ struct binder_work *w)
{
struct binder_node *node;
struct binder_transaction *t;
@@ -3310,8 +5336,16 @@ static void print_binder_work(struct seq_file *m, const char *prefix,
switch (w->type) {
case BINDER_WORK_TRANSACTION:
t = container_of(w, struct binder_transaction, work);
- print_binder_transaction(m, transaction_prefix, t);
+ print_binder_transaction_ilocked(
+ m, proc, transaction_prefix, t);
break;
+ case BINDER_WORK_RETURN_ERROR: {
+ struct binder_error *e = container_of(
+ w, struct binder_error, work);
+
+ seq_printf(m, "%stransaction error: %u\n",
+ prefix, e->cmd);
+ } break;
case BINDER_WORK_TRANSACTION_COMPLETE:
seq_printf(m, "%stransaction complete\n", prefix);
break;
@@ -3336,40 +5370,46 @@ static void print_binder_work(struct seq_file *m, const char *prefix,
}
}
-static void print_binder_thread(struct seq_file *m,
- struct binder_thread *thread,
- int print_always)
+static void print_binder_thread_ilocked(struct seq_file *m,
+ struct binder_thread *thread,
+ int print_always)
{
struct binder_transaction *t;
struct binder_work *w;
size_t start_pos = m->count;
size_t header_pos;
- seq_printf(m, " thread %d: l %02x\n", thread->pid, thread->looper);
+ seq_printf(m, " thread %d: l %02x need_return %d tr %d\n",
+ thread->pid, thread->looper,
+ thread->looper_need_return,
+ atomic_read(&thread->tmp_ref));
header_pos = m->count;
t = thread->transaction_stack;
while (t) {
if (t->from == thread) {
- print_binder_transaction(m,
- " outgoing transaction", t);
+ print_binder_transaction_ilocked(m, thread->proc,
+ " outgoing transaction", t);
t = t->from_parent;
} else if (t->to_thread == thread) {
- print_binder_transaction(m,
+ print_binder_transaction_ilocked(m, thread->proc,
" incoming transaction", t);
t = t->to_parent;
} else {
- print_binder_transaction(m, " bad transaction", t);
+ print_binder_transaction_ilocked(m, thread->proc,
+ " bad transaction", t);
t = NULL;
}
}
list_for_each_entry(w, &thread->todo, entry) {
- print_binder_work(m, " ", " pending transaction", w);
+ print_binder_work_ilocked(m, thread->proc, " ",
+ " pending transaction", w);
}
if (!print_always && m->count == header_pos)
m->count = start_pos;
}
-static void print_binder_node(struct seq_file *m, struct binder_node *node)
+static void print_binder_node_nilocked(struct seq_file *m,
+ struct binder_node *node)
{
struct binder_ref *ref;
struct binder_work *w;
@@ -3379,27 +5419,35 @@ static void print_binder_node(struct seq_file *m, struct binder_node *node)
hlist_for_each_entry(ref, &node->refs, node_entry)
count++;
- seq_printf(m, " node %d: u%016llx c%016llx hs %d hw %d ls %d lw %d is %d iw %d",
+ seq_printf(m, " node %d: u%016llx c%016llx pri %d:%d hs %d hw %d ls %d lw %d is %d iw %d tr %d",
node->debug_id, (u64)node->ptr, (u64)node->cookie,
+ node->sched_policy, node->min_priority,
node->has_strong_ref, node->has_weak_ref,
node->local_strong_refs, node->local_weak_refs,
- node->internal_strong_refs, count);
+ node->internal_strong_refs, count, node->tmp_refs);
if (count) {
seq_puts(m, " proc");
hlist_for_each_entry(ref, &node->refs, node_entry)
seq_printf(m, " %d", ref->proc->pid);
}
seq_puts(m, "\n");
- list_for_each_entry(w, &node->async_todo, entry)
- print_binder_work(m, " ",
- " pending async transaction", w);
+ if (node->proc) {
+ list_for_each_entry(w, &node->async_todo, entry)
+ print_binder_work_ilocked(m, node->proc, " ",
+ " pending async transaction", w);
+ }
}
-static void print_binder_ref(struct seq_file *m, struct binder_ref *ref)
+static void print_binder_ref_olocked(struct seq_file *m,
+ struct binder_ref *ref)
{
- seq_printf(m, " ref %d: desc %d %snode %d s %d w %d d %p\n",
- ref->debug_id, ref->desc, ref->node->proc ? "" : "dead ",
- ref->node->debug_id, ref->strong, ref->weak, ref->death);
+ binder_node_lock(ref->node);
+ seq_printf(m, " ref %d: desc %d %snode %d s %d w %d d %pK\n",
+ ref->data.debug_id, ref->data.desc,
+ ref->node->proc ? "" : "dead ",
+ ref->node->debug_id, ref->data.strong,
+ ref->data.weak, ref->death);
+ binder_node_unlock(ref->node);
}
static void print_binder_proc(struct seq_file *m,
@@ -3409,35 +5457,60 @@ static void print_binder_proc(struct seq_file *m,
struct rb_node *n;
size_t start_pos = m->count;
size_t header_pos;
+ struct binder_node *last_node = NULL;
seq_printf(m, "proc %d\n", proc->pid);
+ seq_printf(m, "context %s\n", proc->context->name);
header_pos = m->count;
+ binder_inner_proc_lock(proc);
for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n))
- print_binder_thread(m, rb_entry(n, struct binder_thread,
+ print_binder_thread_ilocked(m, rb_entry(n, struct binder_thread,
rb_node), print_all);
+
for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) {
struct binder_node *node = rb_entry(n, struct binder_node,
rb_node);
- if (print_all || node->has_async_transaction)
- print_binder_node(m, node);
+ /*
+ * take a temporary reference on the node so it
+ * survives and isn't removed from the tree
+ * while we print it.
+ */
+ binder_inc_node_tmpref_ilocked(node);
+ /* Need to drop inner lock to take node lock */
+ binder_inner_proc_unlock(proc);
+ if (last_node)
+ binder_put_node(last_node);
+ binder_node_inner_lock(node);
+ print_binder_node_nilocked(m, node);
+ binder_node_inner_unlock(node);
+ last_node = node;
+ binder_inner_proc_lock(proc);
}
+ binder_inner_proc_unlock(proc);
+ if (last_node)
+ binder_put_node(last_node);
+
if (print_all) {
+ binder_proc_lock(proc);
for (n = rb_first(&proc->refs_by_desc);
n != NULL;
n = rb_next(n))
- print_binder_ref(m, rb_entry(n, struct binder_ref,
- rb_node_desc));
+ print_binder_ref_olocked(m, rb_entry(n,
+ struct binder_ref,
+ rb_node_desc));
+ binder_proc_unlock(proc);
}
- for (n = rb_first(&proc->allocated_buffers); n != NULL; n = rb_next(n))
- print_binder_buffer(m, " buffer",
- rb_entry(n, struct binder_buffer, rb_node));
+ binder_alloc_print_allocated(m, &proc->alloc);
+ binder_inner_proc_lock(proc);
list_for_each_entry(w, &proc->todo, entry)
- print_binder_work(m, " ", " pending transaction", w);
+ print_binder_work_ilocked(m, proc, " ",
+ " pending transaction", w);
list_for_each_entry(w, &proc->delivered_death, entry) {
seq_puts(m, " has delivered dead binder\n");
break;
}
+ binder_inner_proc_unlock(proc);
if (!print_all && m->count == header_pos)
m->count = start_pos;
}
@@ -3480,7 +5553,9 @@ static const char * const binder_command_strings[] = {
"BC_EXIT_LOOPER",
"BC_REQUEST_DEATH_NOTIFICATION",
"BC_CLEAR_DEATH_NOTIFICATION",
- "BC_DEAD_BINDER_DONE"
+ "BC_DEAD_BINDER_DONE",
+ "BC_TRANSACTION_SG",
+ "BC_REPLY_SG",
};
static const char * const binder_objstat_strings[] = {
@@ -3501,17 +5576,21 @@ static void print_binder_stats(struct seq_file *m, const char *prefix,
BUILD_BUG_ON(ARRAY_SIZE(stats->bc) !=
ARRAY_SIZE(binder_command_strings));
for (i = 0; i < ARRAY_SIZE(stats->bc); i++) {
- if (stats->bc[i])
+ int temp = atomic_read(&stats->bc[i]);
+
+ if (temp)
seq_printf(m, "%s%s: %d\n", prefix,
- binder_command_strings[i], stats->bc[i]);
+ binder_command_strings[i], temp);
}
BUILD_BUG_ON(ARRAY_SIZE(stats->br) !=
ARRAY_SIZE(binder_return_strings));
for (i = 0; i < ARRAY_SIZE(stats->br); i++) {
- if (stats->br[i])
+ int temp = atomic_read(&stats->br[i]);
+
+ if (temp)
seq_printf(m, "%s%s: %d\n", prefix,
- binder_return_strings[i], stats->br[i]);
+ binder_return_strings[i], temp);
}
BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) !=
@@ -3519,11 +5598,15 @@ static void print_binder_stats(struct seq_file *m, const char *prefix,
BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) !=
ARRAY_SIZE(stats->obj_deleted));
for (i = 0; i < ARRAY_SIZE(stats->obj_created); i++) {
- if (stats->obj_created[i] || stats->obj_deleted[i])
- seq_printf(m, "%s%s: active %d total %d\n", prefix,
+ int created = atomic_read(&stats->obj_created[i]);
+ int deleted = atomic_read(&stats->obj_deleted[i]);
+
+ if (created || deleted)
+ seq_printf(m, "%s%s: active %d total %d\n",
+ prefix,
binder_objstat_strings[i],
- stats->obj_created[i] - stats->obj_deleted[i],
- stats->obj_created[i]);
+ created - deleted,
+ created);
}
}
@@ -3531,50 +5614,61 @@ static void print_binder_proc_stats(struct seq_file *m,
struct binder_proc *proc)
{
struct binder_work *w;
+ struct binder_thread *thread;
struct rb_node *n;
- int count, strong, weak;
+ int count, strong, weak, ready_threads;
+ size_t free_async_space =
+ binder_alloc_get_free_async_space(&proc->alloc);
seq_printf(m, "proc %d\n", proc->pid);
+ seq_printf(m, "context %s\n", proc->context->name);
count = 0;
+ ready_threads = 0;
+ binder_inner_proc_lock(proc);
for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n))
count++;
+
+ list_for_each_entry(thread, &proc->waiting_threads, waiting_thread_node)
+ ready_threads++;
+
seq_printf(m, " threads: %d\n", count);
seq_printf(m, " requested threads: %d+%d/%d\n"
" ready threads %d\n"
" free async space %zd\n", proc->requested_threads,
proc->requested_threads_started, proc->max_threads,
- proc->ready_threads, proc->free_async_space);
+ ready_threads,
+ free_async_space);
count = 0;
for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n))
count++;
+ binder_inner_proc_unlock(proc);
seq_printf(m, " nodes: %d\n", count);
count = 0;
strong = 0;
weak = 0;
+ binder_proc_lock(proc);
for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) {
struct binder_ref *ref = rb_entry(n, struct binder_ref,
rb_node_desc);
count++;
- strong += ref->strong;
- weak += ref->weak;
+ strong += ref->data.strong;
+ weak += ref->data.weak;
}
+ binder_proc_unlock(proc);
seq_printf(m, " refs: %d s %d w %d\n", count, strong, weak);
- count = 0;
- for (n = rb_first(&proc->allocated_buffers); n != NULL; n = rb_next(n))
- count++;
+ count = binder_alloc_get_allocated_count(&proc->alloc);
seq_printf(m, " buffers: %d\n", count);
+ binder_alloc_print_pages(m, &proc->alloc);
+
count = 0;
+ binder_inner_proc_lock(proc);
list_for_each_entry(w, &proc->todo, entry) {
- switch (w->type) {
- case BINDER_WORK_TRANSACTION:
+ if (w->type == BINDER_WORK_TRANSACTION)
count++;
- break;
- default:
- break;
- }
}
+ binder_inner_proc_unlock(proc);
seq_printf(m, " pending transactions: %d\n", count);
print_binder_stats(m, " ", &proc->stats);
@@ -3585,107 +5679,131 @@ static int binder_state_show(struct seq_file *m, void *unused)
{
struct binder_proc *proc;
struct binder_node *node;
- int do_lock = !binder_debug_no_lock;
-
- if (do_lock)
- binder_lock(__func__);
+ struct binder_node *last_node = NULL;
seq_puts(m, "binder state:\n");
+ spin_lock(&binder_dead_nodes_lock);
if (!hlist_empty(&binder_dead_nodes))
seq_puts(m, "dead nodes:\n");
- hlist_for_each_entry(node, &binder_dead_nodes, dead_node)
- print_binder_node(m, node);
+ hlist_for_each_entry(node, &binder_dead_nodes, dead_node) {
+ /*
+ * take a temporary reference on the node so it
+ * survives and isn't removed from the list
+ * while we print it.
+ */
+ node->tmp_refs++;
+ spin_unlock(&binder_dead_nodes_lock);
+ if (last_node)
+ binder_put_node(last_node);
+ binder_node_lock(node);
+ print_binder_node_nilocked(m, node);
+ binder_node_unlock(node);
+ last_node = node;
+ spin_lock(&binder_dead_nodes_lock);
+ }
+ spin_unlock(&binder_dead_nodes_lock);
+ if (last_node)
+ binder_put_node(last_node);
+ mutex_lock(&binder_procs_lock);
hlist_for_each_entry(proc, &binder_procs, proc_node)
print_binder_proc(m, proc, 1);
- if (do_lock)
- binder_unlock(__func__);
+ mutex_unlock(&binder_procs_lock);
+
return 0;
}
static int binder_stats_show(struct seq_file *m, void *unused)
{
struct binder_proc *proc;
- int do_lock = !binder_debug_no_lock;
-
- if (do_lock)
- binder_lock(__func__);
seq_puts(m, "binder stats:\n");
print_binder_stats(m, "", &binder_stats);
+ mutex_lock(&binder_procs_lock);
hlist_for_each_entry(proc, &binder_procs, proc_node)
print_binder_proc_stats(m, proc);
- if (do_lock)
- binder_unlock(__func__);
+ mutex_unlock(&binder_procs_lock);
+
return 0;
}
static int binder_transactions_show(struct seq_file *m, void *unused)
{
struct binder_proc *proc;
- int do_lock = !binder_debug_no_lock;
-
- if (do_lock)
- binder_lock(__func__);
seq_puts(m, "binder transactions:\n");
+ mutex_lock(&binder_procs_lock);
hlist_for_each_entry(proc, &binder_procs, proc_node)
print_binder_proc(m, proc, 0);
- if (do_lock)
- binder_unlock(__func__);
+ mutex_unlock(&binder_procs_lock);
+
return 0;
}
static int binder_proc_show(struct seq_file *m, void *unused)
{
struct binder_proc *itr;
- struct binder_proc *proc = m->private;
- int do_lock = !binder_debug_no_lock;
- bool valid_proc = false;
-
- if (do_lock)
- binder_lock(__func__);
+ int pid = (unsigned long)m->private;
+ mutex_lock(&binder_procs_lock);
hlist_for_each_entry(itr, &binder_procs, proc_node) {
- if (itr == proc) {
- valid_proc = true;
- break;
+ if (itr->pid == pid) {
+ seq_puts(m, "binder proc state:\n");
+ print_binder_proc(m, itr, 1);
}
}
- if (valid_proc) {
- seq_puts(m, "binder proc state:\n");
- print_binder_proc(m, proc, 1);
- }
- if (do_lock)
- binder_unlock(__func__);
+ mutex_unlock(&binder_procs_lock);
+
return 0;
}
static void print_binder_transaction_log_entry(struct seq_file *m,
struct binder_transaction_log_entry *e)
{
+ int debug_id = READ_ONCE(e->debug_id_done);
+ /*
+ * read barrier to guarantee debug_id_done read before
+ * we print the log values
+ */
+ smp_rmb();
seq_printf(m,
- "%d: %s from %d:%d to %d:%d node %d handle %d size %d:%d\n",
+ "%d: %s from %d:%d to %d:%d context %s node %d handle %d size %d:%d ret %d/%d l=%d",
e->debug_id, (e->call_type == 2) ? "reply" :
((e->call_type == 1) ? "async" : "call "), e->from_proc,
- e->from_thread, e->to_proc, e->to_thread, e->to_node,
- e->target_handle, e->data_size, e->offsets_size);
+ e->from_thread, e->to_proc, e->to_thread, e->context_name,
+ e->to_node, e->target_handle, e->data_size, e->offsets_size,
+ e->return_error, e->return_error_param,
+ e->return_error_line);
+ /*
+ * read-barrier to guarantee read of debug_id_done after
+ * done printing the fields of the entry
+ */
+ smp_rmb();
+ seq_printf(m, debug_id && debug_id == READ_ONCE(e->debug_id_done) ?
+ "\n" : " (incomplete)\n");
}
static int binder_transaction_log_show(struct seq_file *m, void *unused)
{
struct binder_transaction_log *log = m->private;
+ unsigned int log_cur = atomic_read(&log->cur);
+ unsigned int count;
+ unsigned int cur;
int i;
- if (log->full) {
- for (i = log->next; i < ARRAY_SIZE(log->entry); i++)
- print_binder_transaction_log_entry(m, &log->entry[i]);
+ count = log_cur + 1;
+ cur = count < ARRAY_SIZE(log->entry) && !log->full ?
+ 0 : count % ARRAY_SIZE(log->entry);
+ if (count > ARRAY_SIZE(log->entry) || log->full)
+ count = ARRAY_SIZE(log->entry);
+ for (i = 0; i < count; i++) {
+ unsigned int index = cur++ % ARRAY_SIZE(log->entry);
+
+ print_binder_transaction_log_entry(m, &log->entry[index]);
}
- for (i = 0; i < log->next; i++)
- print_binder_transaction_log_entry(m, &log->entry[i]);
return 0;
}
@@ -3700,53 +5818,118 @@ static const struct file_operations binder_fops = {
.release = binder_release,
};
-static struct miscdevice binder_miscdev = {
- .minor = MISC_DYNAMIC_MINOR,
- .name = "binder",
- .fops = &binder_fops
-};
-
BINDER_DEBUG_ENTRY(state);
BINDER_DEBUG_ENTRY(stats);
BINDER_DEBUG_ENTRY(transactions);
BINDER_DEBUG_ENTRY(transaction_log);
+static int __init init_binder_device(const char *name)
+{
+ int ret;
+ struct binder_device *binder_device;
+
+ binder_device = kzalloc(sizeof(*binder_device), GFP_KERNEL);
+ if (!binder_device)
+ return -ENOMEM;
+
+ binder_device->miscdev.fops = &binder_fops;
+ binder_device->miscdev.minor = MISC_DYNAMIC_MINOR;
+ binder_device->miscdev.name = name;
+
+ binder_device->context.binder_context_mgr_uid = INVALID_UID;
+ binder_device->context.name = name;
+ mutex_init(&binder_device->context.context_mgr_node_lock);
+
+ ret = misc_register(&binder_device->miscdev);
+ if (ret < 0) {
+ kfree(binder_device);
+ return ret;
+ }
+
+ hlist_add_head(&binder_device->hlist, &binder_devices);
+
+ return ret;
+}
+
static int __init binder_init(void)
{
int ret;
+ char *device_name, *device_names, *device_tmp;
+ struct binder_device *device;
+ struct hlist_node *tmp;
+
+ ret = binder_alloc_shrinker_init();
+ if (ret)
+ return ret;
+
+ atomic_set(&binder_transaction_log.cur, ~0U);
+ atomic_set(&binder_transaction_log_failed.cur, ~0U);
binder_debugfs_dir_entry_root = debugfs_create_dir("binder", NULL);
if (binder_debugfs_dir_entry_root)
binder_debugfs_dir_entry_proc = debugfs_create_dir("proc",
binder_debugfs_dir_entry_root);
- ret = misc_register(&binder_miscdev);
+
if (binder_debugfs_dir_entry_root) {
debugfs_create_file("state",
- S_IRUGO,
+ 0444,
binder_debugfs_dir_entry_root,
NULL,
&binder_state_fops);
debugfs_create_file("stats",
- S_IRUGO,
+ 0444,
binder_debugfs_dir_entry_root,
NULL,
&binder_stats_fops);
debugfs_create_file("transactions",
- S_IRUGO,
+ 0444,
binder_debugfs_dir_entry_root,
NULL,
&binder_transactions_fops);
debugfs_create_file("transaction_log",
- S_IRUGO,
+ 0444,
binder_debugfs_dir_entry_root,
&binder_transaction_log,
&binder_transaction_log_fops);
debugfs_create_file("failed_transaction_log",
- S_IRUGO,
+ 0444,
binder_debugfs_dir_entry_root,
&binder_transaction_log_failed,
&binder_transaction_log_fops);
}
+
+ /*
+ * Copy the module_parameter string, because we don't want to
+ * tokenize it in-place.
+ */
+ device_names = kzalloc(strlen(binder_devices_param) + 1, GFP_KERNEL);
+ if (!device_names) {
+ ret = -ENOMEM;
+ goto err_alloc_device_names_failed;
+ }
+ strcpy(device_names, binder_devices_param);
+
+ device_tmp = device_names;
+ while ((device_name = strsep(&device_tmp, ","))) {
+ ret = init_binder_device(device_name);
+ if (ret)
+ goto err_init_binder_device_failed;
+ }
+
+ return ret;
+
+err_init_binder_device_failed:
+ hlist_for_each_entry_safe(device, tmp, &binder_devices, hlist) {
+ misc_deregister(&device->miscdev);
+ hlist_del(&device->hlist);
+ kfree(device);
+ }
+
+ kfree(device_names);
+
+err_alloc_device_names_failed:
+ debugfs_remove_recursive(binder_debugfs_dir_entry_root);
+
return ret;
}
diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
new file mode 100644
index 000000000000..38bfa7adfed5
--- /dev/null
+++ b/drivers/android/binder_alloc.c
@@ -0,0 +1,1020 @@
+/* binder_alloc.c
+ *
+ * Android IPC Subsystem
+ *
+ * Copyright (C) 2007-2017 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <asm/cacheflush.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/rtmutex.h>
+#include <linux/rbtree.h>
+#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/list_lru.h>
+#include "binder_alloc.h"
+#include "binder_trace.h"
+
+struct list_lru binder_alloc_lru;
+
+static DEFINE_MUTEX(binder_alloc_mmap_lock);
+
+enum {
+ BINDER_DEBUG_OPEN_CLOSE = 1U << 1,
+ BINDER_DEBUG_BUFFER_ALLOC = 1U << 2,
+ BINDER_DEBUG_BUFFER_ALLOC_ASYNC = 1U << 3,
+};
+static uint32_t binder_alloc_debug_mask;
+
+module_param_named(debug_mask, binder_alloc_debug_mask,
+ uint, 0644);
+
+#define binder_alloc_debug(mask, x...) \
+ do { \
+ if (binder_alloc_debug_mask & mask) \
+ pr_info(x); \
+ } while (0)
+
+static struct binder_buffer *binder_buffer_next(struct binder_buffer *buffer)
+{
+ return list_entry(buffer->entry.next, struct binder_buffer, entry);
+}
+
+static struct binder_buffer *binder_buffer_prev(struct binder_buffer *buffer)
+{
+ return list_entry(buffer->entry.prev, struct binder_buffer, entry);
+}
+
+static size_t binder_alloc_buffer_size(struct binder_alloc *alloc,
+ struct binder_buffer *buffer)
+{
+ if (list_is_last(&buffer->entry, &alloc->buffers))
+ return (u8 *)alloc->buffer +
+ alloc->buffer_size - (u8 *)buffer->data;
+ return (u8 *)binder_buffer_next(buffer)->data - (u8 *)buffer->data;
+}
+
+static void binder_insert_free_buffer(struct binder_alloc *alloc,
+ struct binder_buffer *new_buffer)
+{
+ struct rb_node **p = &alloc->free_buffers.rb_node;
+ struct rb_node *parent = NULL;
+ struct binder_buffer *buffer;
+ size_t buffer_size;
+ size_t new_buffer_size;
+
+ BUG_ON(!new_buffer->free);
+
+ new_buffer_size = binder_alloc_buffer_size(alloc, new_buffer);
+
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: add free buffer, size %zd, at %pK\n",
+ alloc->pid, new_buffer_size, new_buffer);
+
+ while (*p) {
+ parent = *p;
+ buffer = rb_entry(parent, struct binder_buffer, rb_node);
+ BUG_ON(!buffer->free);
+
+ buffer_size = binder_alloc_buffer_size(alloc, buffer);
+
+ if (new_buffer_size < buffer_size)
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+ }
+ rb_link_node(&new_buffer->rb_node, parent, p);
+ rb_insert_color(&new_buffer->rb_node, &alloc->free_buffers);
+}
+
+static void binder_insert_allocated_buffer_locked(
+ struct binder_alloc *alloc, struct binder_buffer *new_buffer)
+{
+ struct rb_node **p = &alloc->allocated_buffers.rb_node;
+ struct rb_node *parent = NULL;
+ struct binder_buffer *buffer;
+
+ BUG_ON(new_buffer->free);
+
+ while (*p) {
+ parent = *p;
+ buffer = rb_entry(parent, struct binder_buffer, rb_node);
+ BUG_ON(buffer->free);
+
+ if (new_buffer->data < buffer->data)
+ p = &parent->rb_left;
+ else if (new_buffer->data > buffer->data)
+ p = &parent->rb_right;
+ else
+ BUG();
+ }
+ rb_link_node(&new_buffer->rb_node, parent, p);
+ rb_insert_color(&new_buffer->rb_node, &alloc->allocated_buffers);
+}
+
+static struct binder_buffer *binder_alloc_prepare_to_free_locked(
+ struct binder_alloc *alloc,
+ uintptr_t user_ptr)
+{
+ struct rb_node *n = alloc->allocated_buffers.rb_node;
+ struct binder_buffer *buffer;
+ void *kern_ptr;
+
+ kern_ptr = (void *)(user_ptr - alloc->user_buffer_offset);
+
+ while (n) {
+ buffer = rb_entry(n, struct binder_buffer, rb_node);
+ BUG_ON(buffer->free);
+
+ if (kern_ptr < buffer->data)
+ n = n->rb_left;
+ else if (kern_ptr > buffer->data)
+ n = n->rb_right;
+ else {
+ /*
+ * Guard against user threads attempting to
+ * free the buffer when in use by kernel or
+ * after it's already been freed.
+ */
+ if (!buffer->allow_user_free)
+ return ERR_PTR(-EPERM);
+ buffer->allow_user_free = 0;
+ return buffer;
+ }
+ }
+ return NULL;
+}
+
+/**
+ * binder_alloc_buffer_lookup() - get buffer given user ptr
+ * @alloc: binder_alloc for this proc
+ * @user_ptr: User pointer to buffer data
+ *
+ * Validate userspace pointer to buffer data and return buffer corresponding to
+ * that user pointer. Search the rb tree for buffer that matches user data
+ * pointer.
+ *
+ * Return: Pointer to buffer or NULL
+ */
+struct binder_buffer *binder_alloc_prepare_to_free(struct binder_alloc *alloc,
+ uintptr_t user_ptr)
+{
+ struct binder_buffer *buffer;
+
+ mutex_lock(&alloc->mutex);
+ buffer = binder_alloc_prepare_to_free_locked(alloc, user_ptr);
+ mutex_unlock(&alloc->mutex);
+ return buffer;
+}
+
+static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
+ void *start, void *end)
+{
+ void *page_addr;
+ unsigned long user_page_addr;
+ struct binder_lru_page *page;
+ struct vm_area_struct *vma = NULL;
+ struct mm_struct *mm = NULL;
+ bool need_mm = false;
+
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: %s pages %pK-%pK\n", alloc->pid,
+ allocate ? "allocate" : "free", start, end);
+
+ if (end <= start)
+ return 0;
+
+ trace_binder_update_page_range(alloc, allocate, start, end);
+
+ if (allocate == 0)
+ goto free_range;
+
+ for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) {
+ page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE];
+ if (!page->page_ptr) {
+ need_mm = true;
+ break;
+ }
+ }
+
+ if (need_mm && mmget_not_zero(alloc->vma_vm_mm))
+ mm = alloc->vma_vm_mm;
+
+ if (mm) {
+ down_read(&mm->mmap_sem);
+ vma = alloc->vma;
+ }
+
+ if (!vma && need_mm) {
+ pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n",
+ alloc->pid);
+ goto err_no_vma;
+ }
+
+ for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) {
+ int ret;
+ bool on_lru;
+ size_t index;
+
+ index = (page_addr - alloc->buffer) / PAGE_SIZE;
+ page = &alloc->pages[index];
+
+ if (page->page_ptr) {
+ trace_binder_alloc_lru_start(alloc, index);
+
+ on_lru = list_lru_del(&binder_alloc_lru, &page->lru);
+ WARN_ON(!on_lru);
+
+ trace_binder_alloc_lru_end(alloc, index);
+ continue;
+ }
+
+ if (WARN_ON(!vma))
+ goto err_page_ptr_cleared;
+
+ trace_binder_alloc_page_start(alloc, index);
+ page->page_ptr = alloc_page(GFP_KERNEL |
+ __GFP_HIGHMEM |
+ __GFP_ZERO);
+ if (!page->page_ptr) {
+ pr_err("%d: binder_alloc_buf failed for page at %pK\n",
+ alloc->pid, page_addr);
+ goto err_alloc_page_failed;
+ }
+ page->alloc = alloc;
+ INIT_LIST_HEAD(&page->lru);
+
+ ret = map_kernel_range_noflush((unsigned long)page_addr,
+ PAGE_SIZE, PAGE_KERNEL,
+ &page->page_ptr);
+ flush_cache_vmap((unsigned long)page_addr,
+ (unsigned long)page_addr + PAGE_SIZE);
+ if (ret != 1) {
+ pr_err("%d: binder_alloc_buf failed to map page at %pK in kernel\n",
+ alloc->pid, page_addr);
+ goto err_map_kernel_failed;
+ }
+ user_page_addr =
+ (uintptr_t)page_addr + alloc->user_buffer_offset;
+ ret = vm_insert_page(vma, user_page_addr, page[0].page_ptr);
+ if (ret) {
+ pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n",
+ alloc->pid, user_page_addr);
+ goto err_vm_insert_page_failed;
+ }
+
+ if (index + 1 > alloc->pages_high)
+ alloc->pages_high = index + 1;
+
+ trace_binder_alloc_page_end(alloc, index);
+ /* vm_insert_page does not seem to increment the refcount */
+ }
+ if (mm) {
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ }
+ return 0;
+
+free_range:
+ for (page_addr = end - PAGE_SIZE; page_addr >= start;
+ page_addr -= PAGE_SIZE) {
+ bool ret;
+ size_t index;
+
+ index = (page_addr - alloc->buffer) / PAGE_SIZE;
+ page = &alloc->pages[index];
+
+ trace_binder_free_lru_start(alloc, index);
+
+ ret = list_lru_add(&binder_alloc_lru, &page->lru);
+ WARN_ON(!ret);
+
+ trace_binder_free_lru_end(alloc, index);
+ continue;
+
+err_vm_insert_page_failed:
+ unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE);
+err_map_kernel_failed:
+ __free_page(page->page_ptr);
+ page->page_ptr = NULL;
+err_alloc_page_failed:
+err_page_ptr_cleared:
+ ;
+ }
+err_no_vma:
+ if (mm) {
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ }
+ return vma ? -ENOMEM : -ESRCH;
+}
+
+static struct binder_buffer *binder_alloc_new_buf_locked(
+ struct binder_alloc *alloc,
+ size_t data_size,
+ size_t offsets_size,
+ size_t extra_buffers_size,
+ int is_async)
+{
+ struct rb_node *n = alloc->free_buffers.rb_node;
+ struct binder_buffer *buffer;
+ size_t buffer_size;
+ struct rb_node *best_fit = NULL;
+ void *has_page_addr;
+ void *end_page_addr;
+ size_t size, data_offsets_size;
+ int ret;
+
+ if (alloc->vma == NULL) {
+ pr_err("%d: binder_alloc_buf, no vma\n",
+ alloc->pid);
+ return ERR_PTR(-ESRCH);
+ }
+
+ data_offsets_size = ALIGN(data_size, sizeof(void *)) +
+ ALIGN(offsets_size, sizeof(void *));
+
+ if (data_offsets_size < data_size || data_offsets_size < offsets_size) {
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: got transaction with invalid size %zd-%zd\n",
+ alloc->pid, data_size, offsets_size);
+ return ERR_PTR(-EINVAL);
+ }
+ size = data_offsets_size + ALIGN(extra_buffers_size, sizeof(void *));
+ if (size < data_offsets_size || size < extra_buffers_size) {
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: got transaction with invalid extra_buffers_size %zd\n",
+ alloc->pid, extra_buffers_size);
+ return ERR_PTR(-EINVAL);
+ }
+ if (is_async &&
+ alloc->free_async_space < size + sizeof(struct binder_buffer)) {
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: binder_alloc_buf size %zd failed, no async space left\n",
+ alloc->pid, size);
+ return ERR_PTR(-ENOSPC);
+ }
+
+ /* Pad 0-size buffers so they get assigned unique addresses */
+ size = max(size, sizeof(void *));
+
+ while (n) {
+ buffer = rb_entry(n, struct binder_buffer, rb_node);
+ BUG_ON(!buffer->free);
+ buffer_size = binder_alloc_buffer_size(alloc, buffer);
+
+ if (size < buffer_size) {
+ best_fit = n;
+ n = n->rb_left;
+ } else if (size > buffer_size)
+ n = n->rb_right;
+ else {
+ best_fit = n;
+ break;
+ }
+ }
+ if (best_fit == NULL) {
+ size_t allocated_buffers = 0;
+ size_t largest_alloc_size = 0;
+ size_t total_alloc_size = 0;
+ size_t free_buffers = 0;
+ size_t largest_free_size = 0;
+ size_t total_free_size = 0;
+
+ for (n = rb_first(&alloc->allocated_buffers); n != NULL;
+ n = rb_next(n)) {
+ buffer = rb_entry(n, struct binder_buffer, rb_node);
+ buffer_size = binder_alloc_buffer_size(alloc, buffer);
+ allocated_buffers++;
+ total_alloc_size += buffer_size;
+ if (buffer_size > largest_alloc_size)
+ largest_alloc_size = buffer_size;
+ }
+ for (n = rb_first(&alloc->free_buffers); n != NULL;
+ n = rb_next(n)) {
+ buffer = rb_entry(n, struct binder_buffer, rb_node);
+ buffer_size = binder_alloc_buffer_size(alloc, buffer);
+ free_buffers++;
+ total_free_size += buffer_size;
+ if (buffer_size > largest_free_size)
+ largest_free_size = buffer_size;
+ }
+ pr_err("%d: binder_alloc_buf size %zd failed, no address space\n",
+ alloc->pid, size);
+ pr_err("allocated: %zd (num: %zd largest: %zd), free: %zd (num: %zd largest: %zd)\n",
+ total_alloc_size, allocated_buffers, largest_alloc_size,
+ total_free_size, free_buffers, largest_free_size);
+ return ERR_PTR(-ENOSPC);
+ }
+ if (n == NULL) {
+ buffer = rb_entry(best_fit, struct binder_buffer, rb_node);
+ buffer_size = binder_alloc_buffer_size(alloc, buffer);
+ }
+
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: binder_alloc_buf size %zd got buffer %pK size %zd\n",
+ alloc->pid, size, buffer, buffer_size);
+
+ has_page_addr =
+ (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK);
+ WARN_ON(n && buffer_size != size);
+ end_page_addr =
+ (void *)PAGE_ALIGN((uintptr_t)buffer->data + size);
+ if (end_page_addr > has_page_addr)
+ end_page_addr = has_page_addr;
+ ret = binder_update_page_range(alloc, 1,
+ (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (buffer_size != size) {
+ struct binder_buffer *new_buffer;
+
+ new_buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
+ if (!new_buffer) {
+ pr_err("%s: %d failed to alloc new buffer struct\n",
+ __func__, alloc->pid);
+ goto err_alloc_buf_struct_failed;
+ }
+ new_buffer->data = (u8 *)buffer->data + size;
+ list_add(&new_buffer->entry, &buffer->entry);
+ new_buffer->free = 1;
+ binder_insert_free_buffer(alloc, new_buffer);
+ }
+
+ rb_erase(best_fit, &alloc->free_buffers);
+ buffer->free = 0;
+ buffer->allow_user_free = 0;
+ binder_insert_allocated_buffer_locked(alloc, buffer);
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: binder_alloc_buf size %zd got %pK\n",
+ alloc->pid, size, buffer);
+ buffer->data_size = data_size;
+ buffer->offsets_size = offsets_size;
+ buffer->async_transaction = is_async;
+ buffer->extra_buffers_size = extra_buffers_size;
+ if (is_async) {
+ alloc->free_async_space -= size + sizeof(struct binder_buffer);
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC,
+ "%d: binder_alloc_buf size %zd async free %zd\n",
+ alloc->pid, size, alloc->free_async_space);
+ }
+ return buffer;
+
+err_alloc_buf_struct_failed:
+ binder_update_page_range(alloc, 0,
+ (void *)PAGE_ALIGN((uintptr_t)buffer->data),
+ end_page_addr);
+ return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * binder_alloc_new_buf() - Allocate a new binder buffer
+ * @alloc: binder_alloc for this proc
+ * @data_size: size of user data buffer
+ * @offsets_size: user specified buffer offset
+ * @extra_buffers_size: size of extra space for meta-data (eg, security context)
+ * @is_async: buffer for async transaction
+ *
+ * Allocate a new buffer given the requested sizes. Returns
+ * the kernel version of the buffer pointer. The size allocated
+ * is the sum of the three given sizes (each rounded up to
+ * pointer-sized boundary)
+ *
+ * Return: The allocated buffer or %NULL if error
+ */
+struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc,
+ size_t data_size,
+ size_t offsets_size,
+ size_t extra_buffers_size,
+ int is_async)
+{
+ struct binder_buffer *buffer;
+
+ mutex_lock(&alloc->mutex);
+ buffer = binder_alloc_new_buf_locked(alloc, data_size, offsets_size,
+ extra_buffers_size, is_async);
+ mutex_unlock(&alloc->mutex);
+ return buffer;
+}
+
+static void *buffer_start_page(struct binder_buffer *buffer)
+{
+ return (void *)((uintptr_t)buffer->data & PAGE_MASK);
+}
+
+static void *prev_buffer_end_page(struct binder_buffer *buffer)
+{
+ return (void *)(((uintptr_t)(buffer->data) - 1) & PAGE_MASK);
+}
+
+static void binder_delete_free_buffer(struct binder_alloc *alloc,
+ struct binder_buffer *buffer)
+{
+ struct binder_buffer *prev, *next = NULL;
+ bool to_free = true;
+ BUG_ON(alloc->buffers.next == &buffer->entry);
+ prev = binder_buffer_prev(buffer);
+ BUG_ON(!prev->free);
+ if (prev_buffer_end_page(prev) == buffer_start_page(buffer)) {
+ to_free = false;
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: merge free, buffer %pK share page with %pK\n",
+ alloc->pid, buffer->data, prev->data);
+ }
+
+ if (!list_is_last(&buffer->entry, &alloc->buffers)) {
+ next = binder_buffer_next(buffer);
+ if (buffer_start_page(next) == buffer_start_page(buffer)) {
+ to_free = false;
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: merge free, buffer %pK share page with %pK\n",
+ alloc->pid,
+ buffer->data,
+ next->data);
+ }
+ }
+
+ if (PAGE_ALIGNED(buffer->data)) {
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: merge free, buffer start %pK is page aligned\n",
+ alloc->pid, buffer->data);
+ to_free = false;
+ }
+
+ if (to_free) {
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: merge free, buffer %pK do not share page with %pK or %pK\n",
+ alloc->pid, buffer->data,
+ prev->data, next ? next->data : NULL);
+ binder_update_page_range(alloc, 0, buffer_start_page(buffer),
+ buffer_start_page(buffer) + PAGE_SIZE);
+ }
+ list_del(&buffer->entry);
+ kfree(buffer);
+}
+
+static void binder_free_buf_locked(struct binder_alloc *alloc,
+ struct binder_buffer *buffer)
+{
+ size_t size, buffer_size;
+
+ buffer_size = binder_alloc_buffer_size(alloc, buffer);
+
+ size = ALIGN(buffer->data_size, sizeof(void *)) +
+ ALIGN(buffer->offsets_size, sizeof(void *)) +
+ ALIGN(buffer->extra_buffers_size, sizeof(void *));
+
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: binder_free_buf %pK size %zd buffer_size %zd\n",
+ alloc->pid, buffer, size, buffer_size);
+
+ BUG_ON(buffer->free);
+ BUG_ON(size > buffer_size);
+ BUG_ON(buffer->transaction != NULL);
+ BUG_ON(buffer->data < alloc->buffer);
+ BUG_ON(buffer->data > alloc->buffer + alloc->buffer_size);
+
+ if (buffer->async_transaction) {
+ alloc->free_async_space += size + sizeof(struct binder_buffer);
+
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC,
+ "%d: binder_free_buf size %zd async free %zd\n",
+ alloc->pid, size, alloc->free_async_space);
+ }
+
+ binder_update_page_range(alloc, 0,
+ (void *)PAGE_ALIGN((uintptr_t)buffer->data),
+ (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK));
+
+ rb_erase(&buffer->rb_node, &alloc->allocated_buffers);
+ buffer->free = 1;
+ if (!list_is_last(&buffer->entry, &alloc->buffers)) {
+ struct binder_buffer *next = binder_buffer_next(buffer);
+
+ if (next->free) {
+ rb_erase(&next->rb_node, &alloc->free_buffers);
+ binder_delete_free_buffer(alloc, next);
+ }
+ }
+ if (alloc->buffers.next != &buffer->entry) {
+ struct binder_buffer *prev = binder_buffer_prev(buffer);
+
+ if (prev->free) {
+ binder_delete_free_buffer(alloc, buffer);
+ rb_erase(&prev->rb_node, &alloc->free_buffers);
+ buffer = prev;
+ }
+ }
+ binder_insert_free_buffer(alloc, buffer);
+}
+
+/**
+ * binder_alloc_free_buf() - free a binder buffer
+ * @alloc: binder_alloc for this proc
+ * @buffer: kernel pointer to buffer
+ *
+ * Free the buffer allocated via binder_alloc_new_buffer()
+ */
+void binder_alloc_free_buf(struct binder_alloc *alloc,
+ struct binder_buffer *buffer)
+{
+ mutex_lock(&alloc->mutex);
+ binder_free_buf_locked(alloc, buffer);
+ mutex_unlock(&alloc->mutex);
+}
+
+/**
+ * binder_alloc_mmap_handler() - map virtual address space for proc
+ * @alloc: alloc structure for this proc
+ * @vma: vma passed to mmap()
+ *
+ * Called by binder_mmap() to initialize the space specified in
+ * vma for allocating binder buffers
+ *
+ * Return:
+ * 0 = success
+ * -EBUSY = address space already mapped
+ * -ENOMEM = failed to map memory to given address space
+ */
+int binder_alloc_mmap_handler(struct binder_alloc *alloc,
+ struct vm_area_struct *vma)
+{
+ int ret;
+ struct vm_struct *area;
+ const char *failure_string;
+ struct binder_buffer *buffer;
+
+ mutex_lock(&binder_alloc_mmap_lock);
+ if (alloc->buffer) {
+ ret = -EBUSY;
+ failure_string = "already mapped";
+ goto err_already_mapped;
+ }
+
+ area = get_vm_area(vma->vm_end - vma->vm_start, VM_ALLOC);
+ if (area == NULL) {
+ ret = -ENOMEM;
+ failure_string = "get_vm_area";
+ goto err_get_vm_area_failed;
+ }
+ alloc->buffer = area->addr;
+ alloc->user_buffer_offset =
+ vma->vm_start - (uintptr_t)alloc->buffer;
+ mutex_unlock(&binder_alloc_mmap_lock);
+
+#ifdef CONFIG_CPU_CACHE_VIPT
+ if (cache_is_vipt_aliasing()) {
+ while (CACHE_COLOUR(
+ (vma->vm_start ^ (uint32_t)alloc->buffer))) {
+ pr_info("%s: %d %lx-%lx maps %pK bad alignment\n",
+ __func__, alloc->pid, vma->vm_start,
+ vma->vm_end, alloc->buffer);
+ vma->vm_start += PAGE_SIZE;
+ }
+ }
+#endif
+ alloc->pages = kzalloc(sizeof(alloc->pages[0]) *
+ ((vma->vm_end - vma->vm_start) / PAGE_SIZE),
+ GFP_KERNEL);
+ if (alloc->pages == NULL) {
+ ret = -ENOMEM;
+ failure_string = "alloc page array";
+ goto err_alloc_pages_failed;
+ }
+ alloc->buffer_size = vma->vm_end - vma->vm_start;
+
+ buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
+ if (!buffer) {
+ ret = -ENOMEM;
+ failure_string = "alloc buffer struct";
+ goto err_alloc_buf_struct_failed;
+ }
+
+ buffer->data = alloc->buffer;
+ list_add(&buffer->entry, &alloc->buffers);
+ buffer->free = 1;
+ binder_insert_free_buffer(alloc, buffer);
+ alloc->free_async_space = alloc->buffer_size / 2;
+ barrier();
+ alloc->vma = vma;
+ alloc->vma_vm_mm = vma->vm_mm;
+ /* Same as mmgrab() in later kernel versions */
+ atomic_inc(&alloc->vma_vm_mm->mm_count);
+
+ return 0;
+
+err_alloc_buf_struct_failed:
+ kfree(alloc->pages);
+ alloc->pages = NULL;
+err_alloc_pages_failed:
+ mutex_lock(&binder_alloc_mmap_lock);
+ vfree(alloc->buffer);
+ alloc->buffer = NULL;
+err_get_vm_area_failed:
+err_already_mapped:
+ mutex_unlock(&binder_alloc_mmap_lock);
+ pr_err("%s: %d %lx-%lx %s failed %d\n", __func__,
+ alloc->pid, vma->vm_start, vma->vm_end, failure_string, ret);
+ return ret;
+}
+
+
+void binder_alloc_deferred_release(struct binder_alloc *alloc)
+{
+ struct rb_node *n;
+ int buffers, page_count;
+ struct binder_buffer *buffer;
+
+ BUG_ON(alloc->vma);
+
+ buffers = 0;
+ mutex_lock(&alloc->mutex);
+ while ((n = rb_first(&alloc->allocated_buffers))) {
+ buffer = rb_entry(n, struct binder_buffer, rb_node);
+
+ /* Transaction should already have been freed */
+ BUG_ON(buffer->transaction);
+
+ binder_free_buf_locked(alloc, buffer);
+ buffers++;
+ }
+
+ while (!list_empty(&alloc->buffers)) {
+ buffer = list_first_entry(&alloc->buffers,
+ struct binder_buffer, entry);
+ WARN_ON(!buffer->free);
+
+ list_del(&buffer->entry);
+ WARN_ON_ONCE(!list_empty(&alloc->buffers));
+ kfree(buffer);
+ }
+
+ page_count = 0;
+ if (alloc->pages) {
+ int i;
+
+ for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) {
+ void *page_addr;
+ bool on_lru;
+
+ if (!alloc->pages[i].page_ptr)
+ continue;
+
+ on_lru = list_lru_del(&binder_alloc_lru,
+ &alloc->pages[i].lru);
+ page_addr = alloc->buffer + i * PAGE_SIZE;
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%s: %d: page %d at %pK %s\n",
+ __func__, alloc->pid, i, page_addr,
+ on_lru ? "on lru" : "active");
+ unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE);
+ __free_page(alloc->pages[i].page_ptr);
+ page_count++;
+ }
+ kfree(alloc->pages);
+ vfree(alloc->buffer);
+ }
+ mutex_unlock(&alloc->mutex);
+ if (alloc->vma_vm_mm)
+ mmdrop(alloc->vma_vm_mm);
+
+ binder_alloc_debug(BINDER_DEBUG_OPEN_CLOSE,
+ "%s: %d buffers %d, pages %d\n",
+ __func__, alloc->pid, buffers, page_count);
+}
+
+static void print_binder_buffer(struct seq_file *m, const char *prefix,
+ struct binder_buffer *buffer)
+{
+ seq_printf(m, "%s %d: %pK size %zd:%zd:%zd %s\n",
+ prefix, buffer->debug_id, buffer->data,
+ buffer->data_size, buffer->offsets_size,
+ buffer->extra_buffers_size,
+ buffer->transaction ? "active" : "delivered");
+}
+
+/**
+ * binder_alloc_print_allocated() - print buffer info
+ * @m: seq_file for output via seq_printf()
+ * @alloc: binder_alloc for this proc
+ *
+ * Prints information about every buffer associated with
+ * the binder_alloc state to the given seq_file
+ */
+void binder_alloc_print_allocated(struct seq_file *m,
+ struct binder_alloc *alloc)
+{
+ struct rb_node *n;
+
+ mutex_lock(&alloc->mutex);
+ for (n = rb_first(&alloc->allocated_buffers); n != NULL; n = rb_next(n))
+ print_binder_buffer(m, " buffer",
+ rb_entry(n, struct binder_buffer, rb_node));
+ mutex_unlock(&alloc->mutex);
+}
+
+/**
+ * binder_alloc_print_pages() - print page usage
+ * @m: seq_file for output via seq_printf()
+ * @alloc: binder_alloc for this proc
+ */
+void binder_alloc_print_pages(struct seq_file *m,
+ struct binder_alloc *alloc)
+{
+ struct binder_lru_page *page;
+ int i;
+ int active = 0;
+ int lru = 0;
+ int free = 0;
+
+ mutex_lock(&alloc->mutex);
+ for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) {
+ page = &alloc->pages[i];
+ if (!page->page_ptr)
+ free++;
+ else if (list_empty(&page->lru))
+ active++;
+ else
+ lru++;
+ }
+ mutex_unlock(&alloc->mutex);
+ seq_printf(m, " pages: %d:%d:%d\n", active, lru, free);
+ seq_printf(m, " pages high watermark: %zu\n", alloc->pages_high);
+}
+
+/**
+ * binder_alloc_get_allocated_count() - return count of buffers
+ * @alloc: binder_alloc for this proc
+ *
+ * Return: count of allocated buffers
+ */
+int binder_alloc_get_allocated_count(struct binder_alloc *alloc)
+{
+ struct rb_node *n;
+ int count = 0;
+
+ mutex_lock(&alloc->mutex);
+ for (n = rb_first(&alloc->allocated_buffers); n != NULL; n = rb_next(n))
+ count++;
+ mutex_unlock(&alloc->mutex);
+ return count;
+}
+
+
+/**
+ * binder_alloc_vma_close() - invalidate address space
+ * @alloc: binder_alloc for this proc
+ *
+ * Called from binder_vma_close() when releasing address space.
+ * Clears alloc->vma to prevent new incoming transactions from
+ * allocating more buffers.
+ */
+void binder_alloc_vma_close(struct binder_alloc *alloc)
+{
+ WRITE_ONCE(alloc->vma, NULL);
+}
+
+/**
+ * binder_alloc_free_page() - shrinker callback to free pages
+ * @item: item to free
+ * @lock: lock protecting the item
+ * @cb_arg: callback argument
+ *
+ * Called from list_lru_walk() in binder_shrink_scan() to free
+ * up pages when the system is under memory pressure.
+ */
+enum lru_status binder_alloc_free_page(struct list_head *item,
+ struct list_lru_one *lru,
+ spinlock_t *lock,
+ void *cb_arg)
+{
+ struct mm_struct *mm = NULL;
+ struct binder_lru_page *page = container_of(item,
+ struct binder_lru_page,
+ lru);
+ struct binder_alloc *alloc;
+ uintptr_t page_addr;
+ size_t index;
+ struct vm_area_struct *vma;
+
+ alloc = page->alloc;
+ if (!mutex_trylock(&alloc->mutex))
+ goto err_get_alloc_mutex_failed;
+
+ if (!page->page_ptr)
+ goto err_page_already_freed;
+
+ index = page - alloc->pages;
+ page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE;
+ vma = alloc->vma;
+ if (vma) {
+ if (!mmget_not_zero(alloc->vma_vm_mm))
+ goto err_mmget;
+ mm = alloc->vma_vm_mm;
+ if (!down_write_trylock(&mm->mmap_sem))
+ goto err_down_write_mmap_sem_failed;
+ }
+
+ list_lru_isolate(lru, item);
+ spin_unlock(lock);
+
+ if (vma) {
+ trace_binder_unmap_user_start(alloc, index);
+
+ zap_page_range(vma,
+ page_addr +
+ alloc->user_buffer_offset,
+ PAGE_SIZE, NULL);
+
+ trace_binder_unmap_user_end(alloc, index);
+
+ up_write(&mm->mmap_sem);
+ mmput(mm);
+ }
+
+ trace_binder_unmap_kernel_start(alloc, index);
+
+ unmap_kernel_range(page_addr, PAGE_SIZE);
+ __free_page(page->page_ptr);
+ page->page_ptr = NULL;
+
+ trace_binder_unmap_kernel_end(alloc, index);
+
+ spin_lock(lock);
+ mutex_unlock(&alloc->mutex);
+ return LRU_REMOVED_RETRY;
+
+err_down_write_mmap_sem_failed:
+ mmput_async(mm);
+err_mmget:
+err_page_already_freed:
+ mutex_unlock(&alloc->mutex);
+err_get_alloc_mutex_failed:
+ return LRU_SKIP;
+}
+
+static unsigned long
+binder_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ unsigned long ret = list_lru_count(&binder_alloc_lru);
+ return ret;
+}
+
+static unsigned long
+binder_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ unsigned long ret;
+
+ ret = list_lru_walk(&binder_alloc_lru, binder_alloc_free_page,
+ NULL, sc->nr_to_scan);
+ return ret;
+}
+
+static struct shrinker binder_shrinker = {
+ .count_objects = binder_shrink_count,
+ .scan_objects = binder_shrink_scan,
+ .seeks = DEFAULT_SEEKS,
+};
+
+/**
+ * binder_alloc_init() - called by binder_open() for per-proc initialization
+ * @alloc: binder_alloc for this proc
+ *
+ * Called from binder_open() to initialize binder_alloc fields for
+ * new binder proc
+ */
+void binder_alloc_init(struct binder_alloc *alloc)
+{
+ alloc->pid = current->group_leader->pid;
+ mutex_init(&alloc->mutex);
+ INIT_LIST_HEAD(&alloc->buffers);
+}
+
+int binder_alloc_shrinker_init(void)
+{
+ int ret = list_lru_init(&binder_alloc_lru);
+
+ if (ret == 0) {
+ ret = register_shrinker(&binder_shrinker);
+ if (ret)
+ list_lru_destroy(&binder_alloc_lru);
+ }
+ return ret;
+}
diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h
new file mode 100644
index 000000000000..fb3238c74c8a
--- /dev/null
+++ b/drivers/android/binder_alloc.h
@@ -0,0 +1,187 @@
+/*
+ * Copyright (C) 2017 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_BINDER_ALLOC_H
+#define _LINUX_BINDER_ALLOC_H
+
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/rtmutex.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/list_lru.h>
+
+extern struct list_lru binder_alloc_lru;
+struct binder_transaction;
+
+/**
+ * struct binder_buffer - buffer used for binder transactions
+ * @entry: entry alloc->buffers
+ * @rb_node: node for allocated_buffers/free_buffers rb trees
+ * @free: true if buffer is free
+ * @allow_user_free: describe the second member of struct blah,
+ * @async_transaction: describe the second member of struct blah,
+ * @debug_id: describe the second member of struct blah,
+ * @transaction: describe the second member of struct blah,
+ * @target_node: describe the second member of struct blah,
+ * @data_size: describe the second member of struct blah,
+ * @offsets_size: describe the second member of struct blah,
+ * @extra_buffers_size: describe the second member of struct blah,
+ * @data:i describe the second member of struct blah,
+ *
+ * Bookkeeping structure for binder transaction buffers
+ */
+struct binder_buffer {
+ struct list_head entry; /* free and allocated entries by address */
+ struct rb_node rb_node; /* free entry by size or allocated entry */
+ /* by address */
+ unsigned free:1;
+ unsigned allow_user_free:1;
+ unsigned async_transaction:1;
+ unsigned debug_id:29;
+
+ struct binder_transaction *transaction;
+
+ struct binder_node *target_node;
+ size_t data_size;
+ size_t offsets_size;
+ size_t extra_buffers_size;
+ void *data;
+};
+
+/**
+ * struct binder_lru_page - page object used for binder shrinker
+ * @page_ptr: pointer to physical page in mmap'd space
+ * @lru: entry in binder_alloc_lru
+ * @alloc: binder_alloc for a proc
+ */
+struct binder_lru_page {
+ struct list_head lru;
+ struct page *page_ptr;
+ struct binder_alloc *alloc;
+};
+
+/**
+ * struct binder_alloc - per-binder proc state for binder allocator
+ * @vma: vm_area_struct passed to mmap_handler
+ * (invarient after mmap)
+ * @tsk: tid for task that called init for this proc
+ * (invariant after init)
+ * @vma_vm_mm: copy of vma->vm_mm (invarient after mmap)
+ * @buffer: base of per-proc address space mapped via mmap
+ * @user_buffer_offset: offset between user and kernel VAs for buffer
+ * @buffers: list of all buffers for this proc
+ * @free_buffers: rb tree of buffers available for allocation
+ * sorted by size
+ * @allocated_buffers: rb tree of allocated buffers sorted by address
+ * @free_async_space: VA space available for async buffers. This is
+ * initialized at mmap time to 1/2 the full VA space
+ * @pages: array of binder_lru_page
+ * @buffer_size: size of address space specified via mmap
+ * @pid: pid for associated binder_proc (invariant after init)
+ * @pages_high: high watermark of offset in @pages
+ *
+ * Bookkeeping structure for per-proc address space management for binder
+ * buffers. It is normally initialized during binder_init() and binder_mmap()
+ * calls. The address space is used for both user-visible buffers and for
+ * struct binder_buffer objects used to track the user buffers
+ */
+struct binder_alloc {
+ struct mutex mutex;
+ struct vm_area_struct *vma;
+ struct mm_struct *vma_vm_mm;
+ void *buffer;
+ ptrdiff_t user_buffer_offset;
+ struct list_head buffers;
+ struct rb_root free_buffers;
+ struct rb_root allocated_buffers;
+ size_t free_async_space;
+ struct binder_lru_page *pages;
+ size_t buffer_size;
+ uint32_t buffer_free;
+ int pid;
+ size_t pages_high;
+};
+
+#ifdef CONFIG_ANDROID_BINDER_IPC_SELFTEST
+void binder_selftest_alloc(struct binder_alloc *alloc);
+#else
+static inline void binder_selftest_alloc(struct binder_alloc *alloc) {}
+#endif
+enum lru_status binder_alloc_free_page(struct list_head *item,
+ struct list_lru_one *lru,
+ spinlock_t *lock, void *cb_arg);
+extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc,
+ size_t data_size,
+ size_t offsets_size,
+ size_t extra_buffers_size,
+ int is_async);
+extern void binder_alloc_init(struct binder_alloc *alloc);
+extern int binder_alloc_shrinker_init(void);
+extern void binder_alloc_vma_close(struct binder_alloc *alloc);
+extern struct binder_buffer *
+binder_alloc_prepare_to_free(struct binder_alloc *alloc,
+ uintptr_t user_ptr);
+extern void binder_alloc_free_buf(struct binder_alloc *alloc,
+ struct binder_buffer *buffer);
+extern int binder_alloc_mmap_handler(struct binder_alloc *alloc,
+ struct vm_area_struct *vma);
+extern void binder_alloc_deferred_release(struct binder_alloc *alloc);
+extern int binder_alloc_get_allocated_count(struct binder_alloc *alloc);
+extern void binder_alloc_print_allocated(struct seq_file *m,
+ struct binder_alloc *alloc);
+void binder_alloc_print_pages(struct seq_file *m,
+ struct binder_alloc *alloc);
+
+/**
+ * binder_alloc_get_free_async_space() - get free space available for async
+ * @alloc: binder_alloc for this proc
+ *
+ * Return: the bytes remaining in the address-space for async transactions
+ */
+static inline size_t
+binder_alloc_get_free_async_space(struct binder_alloc *alloc)
+{
+ size_t free_async_space;
+
+ mutex_lock(&alloc->mutex);
+ free_async_space = alloc->free_async_space;
+ mutex_unlock(&alloc->mutex);
+ return free_async_space;
+}
+
+/**
+ * binder_alloc_get_user_buffer_offset() - get offset between kernel/user addrs
+ * @alloc: binder_alloc for this proc
+ *
+ * Return: the offset between kernel and user-space addresses to use for
+ * virtual address conversion
+ */
+static inline ptrdiff_t
+binder_alloc_get_user_buffer_offset(struct binder_alloc *alloc)
+{
+ /*
+ * user_buffer_offset is constant if vma is set and
+ * undefined if vma is not set. It is possible to
+ * get here with !alloc->vma if the target process
+ * is dying while a transaction is being initiated.
+ * Returning the old value is ok in this case and
+ * the transaction will fail.
+ */
+ return alloc->user_buffer_offset;
+}
+
+#endif /* _LINUX_BINDER_ALLOC_H */
+
diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c
new file mode 100644
index 000000000000..8bd7bcef967d
--- /dev/null
+++ b/drivers/android/binder_alloc_selftest.c
@@ -0,0 +1,310 @@
+/* binder_alloc_selftest.c
+ *
+ * Android IPC Subsystem
+ *
+ * Copyright (C) 2017 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mm_types.h>
+#include <linux/err.h>
+#include "binder_alloc.h"
+
+#define BUFFER_NUM 5
+#define BUFFER_MIN_SIZE (PAGE_SIZE / 8)
+
+static bool binder_selftest_run = true;
+static int binder_selftest_failures;
+static DEFINE_MUTEX(binder_selftest_lock);
+
+/**
+ * enum buf_end_align_type - Page alignment of a buffer
+ * end with regard to the end of the previous buffer.
+ *
+ * In the pictures below, buf2 refers to the buffer we
+ * are aligning. buf1 refers to previous buffer by addr.
+ * Symbol [ means the start of a buffer, ] means the end
+ * of a buffer, and | means page boundaries.
+ */
+enum buf_end_align_type {
+ /**
+ * @SAME_PAGE_UNALIGNED: The end of this buffer is on
+ * the same page as the end of the previous buffer and
+ * is not page aligned. Examples:
+ * buf1 ][ buf2 ][ ...
+ * buf1 ]|[ buf2 ][ ...
+ */
+ SAME_PAGE_UNALIGNED = 0,
+ /**
+ * @SAME_PAGE_ALIGNED: When the end of the previous buffer
+ * is not page aligned, the end of this buffer is on the
+ * same page as the end of the previous buffer and is page
+ * aligned. When the previous buffer is page aligned, the
+ * end of this buffer is aligned to the next page boundary.
+ * Examples:
+ * buf1 ][ buf2 ]| ...
+ * buf1 ]|[ buf2 ]| ...
+ */
+ SAME_PAGE_ALIGNED,
+ /**
+ * @NEXT_PAGE_UNALIGNED: The end of this buffer is on
+ * the page next to the end of the previous buffer and
+ * is not page aligned. Examples:
+ * buf1 ][ buf2 | buf2 ][ ...
+ * buf1 ]|[ buf2 | buf2 ][ ...
+ */
+ NEXT_PAGE_UNALIGNED,
+ /**
+ * @NEXT_PAGE_ALIGNED: The end of this buffer is on
+ * the page next to the end of the previous buffer and
+ * is page aligned. Examples:
+ * buf1 ][ buf2 | buf2 ]| ...
+ * buf1 ]|[ buf2 | buf2 ]| ...
+ */
+ NEXT_PAGE_ALIGNED,
+ /**
+ * @NEXT_NEXT_UNALIGNED: The end of this buffer is on
+ * the page that follows the page after the end of the
+ * previous buffer and is not page aligned. Examples:
+ * buf1 ][ buf2 | buf2 | buf2 ][ ...
+ * buf1 ]|[ buf2 | buf2 | buf2 ][ ...
+ */
+ NEXT_NEXT_UNALIGNED,
+ LOOP_END,
+};
+
+static void pr_err_size_seq(size_t *sizes, int *seq)
+{
+ int i;
+
+ pr_err("alloc sizes: ");
+ for (i = 0; i < BUFFER_NUM; i++)
+ pr_cont("[%zu]", sizes[i]);
+ pr_cont("\n");
+ pr_err("free seq: ");
+ for (i = 0; i < BUFFER_NUM; i++)
+ pr_cont("[%d]", seq[i]);
+ pr_cont("\n");
+}
+
+static bool check_buffer_pages_allocated(struct binder_alloc *alloc,
+ struct binder_buffer *buffer,
+ size_t size)
+{
+ void *page_addr, *end;
+ int page_index;
+
+ end = (void *)PAGE_ALIGN((uintptr_t)buffer->data + size);
+ page_addr = buffer->data;
+ for (; page_addr < end; page_addr += PAGE_SIZE) {
+ page_index = (page_addr - alloc->buffer) / PAGE_SIZE;
+ if (!alloc->pages[page_index].page_ptr ||
+ !list_empty(&alloc->pages[page_index].lru)) {
+ pr_err("expect alloc but is %s at page index %d\n",
+ alloc->pages[page_index].page_ptr ?
+ "lru" : "free", page_index);
+ return false;
+ }
+ }
+ return true;
+}
+
+static void binder_selftest_alloc_buf(struct binder_alloc *alloc,
+ struct binder_buffer *buffers[],
+ size_t *sizes, int *seq)
+{
+ int i;
+
+ for (i = 0; i < BUFFER_NUM; i++) {
+ buffers[i] = binder_alloc_new_buf(alloc, sizes[i], 0, 0, 0);
+ if (IS_ERR(buffers[i]) ||
+ !check_buffer_pages_allocated(alloc, buffers[i],
+ sizes[i])) {
+ pr_err_size_seq(sizes, seq);
+ binder_selftest_failures++;
+ }
+ }
+}
+
+static void binder_selftest_free_buf(struct binder_alloc *alloc,
+ struct binder_buffer *buffers[],
+ size_t *sizes, int *seq, size_t end)
+{
+ int i;
+
+ for (i = 0; i < BUFFER_NUM; i++)
+ binder_alloc_free_buf(alloc, buffers[seq[i]]);
+
+ for (i = 0; i < end / PAGE_SIZE; i++) {
+ /**
+ * Error message on a free page can be false positive
+ * if binder shrinker ran during binder_alloc_free_buf
+ * calls above.
+ */
+ if (list_empty(&alloc->pages[i].lru)) {
+ pr_err_size_seq(sizes, seq);
+ pr_err("expect lru but is %s at page index %d\n",
+ alloc->pages[i].page_ptr ? "alloc" : "free", i);
+ binder_selftest_failures++;
+ }
+ }
+}
+
+static void binder_selftest_free_page(struct binder_alloc *alloc)
+{
+ int i;
+ unsigned long count;
+
+ while ((count = list_lru_count(&binder_alloc_lru))) {
+ list_lru_walk(&binder_alloc_lru, binder_alloc_free_page,
+ NULL, count);
+ }
+
+ for (i = 0; i < (alloc->buffer_size / PAGE_SIZE); i++) {
+ if (alloc->pages[i].page_ptr) {
+ pr_err("expect free but is %s at page index %d\n",
+ list_empty(&alloc->pages[i].lru) ?
+ "alloc" : "lru", i);
+ binder_selftest_failures++;
+ }
+ }
+}
+
+static void binder_selftest_alloc_free(struct binder_alloc *alloc,
+ size_t *sizes, int *seq, size_t end)
+{
+ struct binder_buffer *buffers[BUFFER_NUM];
+
+ binder_selftest_alloc_buf(alloc, buffers, sizes, seq);
+ binder_selftest_free_buf(alloc, buffers, sizes, seq, end);
+
+ /* Allocate from lru. */
+ binder_selftest_alloc_buf(alloc, buffers, sizes, seq);
+ if (list_lru_count(&binder_alloc_lru))
+ pr_err("lru list should be empty but is not\n");
+
+ binder_selftest_free_buf(alloc, buffers, sizes, seq, end);
+ binder_selftest_free_page(alloc);
+}
+
+static bool is_dup(int *seq, int index, int val)
+{
+ int i;
+
+ for (i = 0; i < index; i++) {
+ if (seq[i] == val)
+ return true;
+ }
+ return false;
+}
+
+/* Generate BUFFER_NUM factorial free orders. */
+static void binder_selftest_free_seq(struct binder_alloc *alloc,
+ size_t *sizes, int *seq,
+ int index, size_t end)
+{
+ int i;
+
+ if (index == BUFFER_NUM) {
+ binder_selftest_alloc_free(alloc, sizes, seq, end);
+ return;
+ }
+ for (i = 0; i < BUFFER_NUM; i++) {
+ if (is_dup(seq, index, i))
+ continue;
+ seq[index] = i;
+ binder_selftest_free_seq(alloc, sizes, seq, index + 1, end);
+ }
+}
+
+static void binder_selftest_alloc_size(struct binder_alloc *alloc,
+ size_t *end_offset)
+{
+ int i;
+ int seq[BUFFER_NUM] = {0};
+ size_t front_sizes[BUFFER_NUM];
+ size_t back_sizes[BUFFER_NUM];
+ size_t last_offset, offset = 0;
+
+ for (i = 0; i < BUFFER_NUM; i++) {
+ last_offset = offset;
+ offset = end_offset[i];
+ front_sizes[i] = offset - last_offset;
+ back_sizes[BUFFER_NUM - i - 1] = front_sizes[i];
+ }
+ /*
+ * Buffers share the first or last few pages.
+ * Only BUFFER_NUM - 1 buffer sizes are adjustable since
+ * we need one giant buffer before getting to the last page.
+ */
+ back_sizes[0] += alloc->buffer_size - end_offset[BUFFER_NUM - 1];
+ binder_selftest_free_seq(alloc, front_sizes, seq, 0,
+ end_offset[BUFFER_NUM - 1]);
+ binder_selftest_free_seq(alloc, back_sizes, seq, 0, alloc->buffer_size);
+}
+
+static void binder_selftest_alloc_offset(struct binder_alloc *alloc,
+ size_t *end_offset, int index)
+{
+ int align;
+ size_t end, prev;
+
+ if (index == BUFFER_NUM) {
+ binder_selftest_alloc_size(alloc, end_offset);
+ return;
+ }
+ prev = index == 0 ? 0 : end_offset[index - 1];
+ end = prev;
+
+ BUILD_BUG_ON(BUFFER_MIN_SIZE * BUFFER_NUM >= PAGE_SIZE);
+
+ for (align = SAME_PAGE_UNALIGNED; align < LOOP_END; align++) {
+ if (align % 2)
+ end = ALIGN(end, PAGE_SIZE);
+ else
+ end += BUFFER_MIN_SIZE;
+ end_offset[index] = end;
+ binder_selftest_alloc_offset(alloc, end_offset, index + 1);
+ }
+}
+
+/**
+ * binder_selftest_alloc() - Test alloc and free of buffer pages.
+ * @alloc: Pointer to alloc struct.
+ *
+ * Allocate BUFFER_NUM buffers to cover all page alignment cases,
+ * then free them in all orders possible. Check that pages are
+ * correctly allocated, put onto lru when buffers are freed, and
+ * are freed when binder_alloc_free_page is called.
+ */
+void binder_selftest_alloc(struct binder_alloc *alloc)
+{
+ size_t end_offset[BUFFER_NUM];
+
+ if (!binder_selftest_run)
+ return;
+ mutex_lock(&binder_selftest_lock);
+ if (!binder_selftest_run || !alloc->vma)
+ goto done;
+ pr_info("STARTED\n");
+ binder_selftest_alloc_offset(alloc, end_offset, 0);
+ binder_selftest_run = false;
+ if (binder_selftest_failures > 0)
+ pr_info("%d tests FAILED\n", binder_selftest_failures);
+ else
+ pr_info("PASSED\n");
+
+done:
+ mutex_unlock(&binder_selftest_lock);
+}
diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h
index 7f20f3dc8369..b11dffc521e8 100644
--- a/drivers/android/binder_trace.h
+++ b/drivers/android/binder_trace.h
@@ -23,7 +23,8 @@
struct binder_buffer;
struct binder_node;
struct binder_proc;
-struct binder_ref;
+struct binder_alloc;
+struct binder_ref_data;
struct binder_thread;
struct binder_transaction;
@@ -84,6 +85,30 @@ DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_ioctl_done);
DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_write_done);
DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_read_done);
+TRACE_EVENT(binder_set_priority,
+ TP_PROTO(int proc, int thread, unsigned int old_prio,
+ unsigned int desired_prio, unsigned int new_prio),
+ TP_ARGS(proc, thread, old_prio, new_prio, desired_prio),
+
+ TP_STRUCT__entry(
+ __field(int, proc)
+ __field(int, thread)
+ __field(unsigned int, old_prio)
+ __field(unsigned int, new_prio)
+ __field(unsigned int, desired_prio)
+ ),
+ TP_fast_assign(
+ __entry->proc = proc;
+ __entry->thread = thread;
+ __entry->old_prio = old_prio;
+ __entry->new_prio = new_prio;
+ __entry->desired_prio = desired_prio;
+ ),
+ TP_printk("proc=%d thread=%d old=%d => new=%d desired=%d",
+ __entry->proc, __entry->thread, __entry->old_prio,
+ __entry->new_prio, __entry->desired_prio)
+);
+
TRACE_EVENT(binder_wait_for_work,
TP_PROTO(bool proc_work, bool transaction_stack, bool thread_todo),
TP_ARGS(proc_work, transaction_stack, thread_todo),
@@ -146,8 +171,8 @@ TRACE_EVENT(binder_transaction_received,
TRACE_EVENT(binder_transaction_node_to_ref,
TP_PROTO(struct binder_transaction *t, struct binder_node *node,
- struct binder_ref *ref),
- TP_ARGS(t, node, ref),
+ struct binder_ref_data *rdata),
+ TP_ARGS(t, node, rdata),
TP_STRUCT__entry(
__field(int, debug_id)
@@ -160,8 +185,8 @@ TRACE_EVENT(binder_transaction_node_to_ref,
__entry->debug_id = t->debug_id;
__entry->node_debug_id = node->debug_id;
__entry->node_ptr = node->ptr;
- __entry->ref_debug_id = ref->debug_id;
- __entry->ref_desc = ref->desc;
+ __entry->ref_debug_id = rdata->debug_id;
+ __entry->ref_desc = rdata->desc;
),
TP_printk("transaction=%d node=%d src_ptr=0x%016llx ==> dest_ref=%d dest_desc=%d",
__entry->debug_id, __entry->node_debug_id,
@@ -170,8 +195,9 @@ TRACE_EVENT(binder_transaction_node_to_ref,
);
TRACE_EVENT(binder_transaction_ref_to_node,
- TP_PROTO(struct binder_transaction *t, struct binder_ref *ref),
- TP_ARGS(t, ref),
+ TP_PROTO(struct binder_transaction *t, struct binder_node *node,
+ struct binder_ref_data *rdata),
+ TP_ARGS(t, node, rdata),
TP_STRUCT__entry(
__field(int, debug_id)
@@ -182,10 +208,10 @@ TRACE_EVENT(binder_transaction_ref_to_node,
),
TP_fast_assign(
__entry->debug_id = t->debug_id;
- __entry->ref_debug_id = ref->debug_id;
- __entry->ref_desc = ref->desc;
- __entry->node_debug_id = ref->node->debug_id;
- __entry->node_ptr = ref->node->ptr;
+ __entry->ref_debug_id = rdata->debug_id;
+ __entry->ref_desc = rdata->desc;
+ __entry->node_debug_id = node->debug_id;
+ __entry->node_ptr = node->ptr;
),
TP_printk("transaction=%d node=%d src_ref=%d src_desc=%d ==> dest_ptr=0x%016llx",
__entry->debug_id, __entry->node_debug_id,
@@ -194,9 +220,10 @@ TRACE_EVENT(binder_transaction_ref_to_node,
);
TRACE_EVENT(binder_transaction_ref_to_ref,
- TP_PROTO(struct binder_transaction *t, struct binder_ref *src_ref,
- struct binder_ref *dest_ref),
- TP_ARGS(t, src_ref, dest_ref),
+ TP_PROTO(struct binder_transaction *t, struct binder_node *node,
+ struct binder_ref_data *src_ref,
+ struct binder_ref_data *dest_ref),
+ TP_ARGS(t, node, src_ref, dest_ref),
TP_STRUCT__entry(
__field(int, debug_id)
@@ -208,7 +235,7 @@ TRACE_EVENT(binder_transaction_ref_to_ref,
),
TP_fast_assign(
__entry->debug_id = t->debug_id;
- __entry->node_debug_id = src_ref->node->debug_id;
+ __entry->node_debug_id = node->debug_id;
__entry->src_ref_debug_id = src_ref->debug_id;
__entry->src_ref_desc = src_ref->desc;
__entry->dest_ref_debug_id = dest_ref->debug_id;
@@ -268,9 +295,9 @@ DEFINE_EVENT(binder_buffer_class, binder_transaction_failed_buffer_release,
TP_ARGS(buffer));
TRACE_EVENT(binder_update_page_range,
- TP_PROTO(struct binder_proc *proc, bool allocate,
+ TP_PROTO(struct binder_alloc *alloc, bool allocate,
void *start, void *end),
- TP_ARGS(proc, allocate, start, end),
+ TP_ARGS(alloc, allocate, start, end),
TP_STRUCT__entry(
__field(int, proc)
__field(bool, allocate)
@@ -278,9 +305,9 @@ TRACE_EVENT(binder_update_page_range,
__field(size_t, size)
),
TP_fast_assign(
- __entry->proc = proc->pid;
+ __entry->proc = alloc->pid;
__entry->allocate = allocate;
- __entry->offset = start - proc->buffer;
+ __entry->offset = start - alloc->buffer;
__entry->size = end - start;
),
TP_printk("proc=%d allocate=%d offset=%zu size=%zu",
@@ -288,6 +315,61 @@ TRACE_EVENT(binder_update_page_range,
__entry->offset, __entry->size)
);
+DECLARE_EVENT_CLASS(binder_lru_page_class,
+ TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
+ TP_ARGS(alloc, page_index),
+ TP_STRUCT__entry(
+ __field(int, proc)
+ __field(size_t, page_index)
+ ),
+ TP_fast_assign(
+ __entry->proc = alloc->pid;
+ __entry->page_index = page_index;
+ ),
+ TP_printk("proc=%d page_index=%zu",
+ __entry->proc, __entry->page_index)
+);
+
+DEFINE_EVENT(binder_lru_page_class, binder_alloc_lru_start,
+ TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
+ TP_ARGS(alloc, page_index));
+
+DEFINE_EVENT(binder_lru_page_class, binder_alloc_lru_end,
+ TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
+ TP_ARGS(alloc, page_index));
+
+DEFINE_EVENT(binder_lru_page_class, binder_free_lru_start,
+ TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
+ TP_ARGS(alloc, page_index));
+
+DEFINE_EVENT(binder_lru_page_class, binder_free_lru_end,
+ TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
+ TP_ARGS(alloc, page_index));
+
+DEFINE_EVENT(binder_lru_page_class, binder_alloc_page_start,
+ TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
+ TP_ARGS(alloc, page_index));
+
+DEFINE_EVENT(binder_lru_page_class, binder_alloc_page_end,
+ TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
+ TP_ARGS(alloc, page_index));
+
+DEFINE_EVENT(binder_lru_page_class, binder_unmap_user_start,
+ TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
+ TP_ARGS(alloc, page_index));
+
+DEFINE_EVENT(binder_lru_page_class, binder_unmap_user_end,
+ TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
+ TP_ARGS(alloc, page_index));
+
+DEFINE_EVENT(binder_lru_page_class, binder_unmap_kernel_start,
+ TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
+ TP_ARGS(alloc, page_index));
+
+DEFINE_EVENT(binder_lru_page_class, binder_unmap_kernel_end,
+ TP_PROTO(const struct binder_alloc *alloc, size_t page_index),
+ TP_ARGS(alloc, page_index));
+
TRACE_EVENT(binder_command,
TP_PROTO(uint32_t cmd),
TP_ARGS(cmd),
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 98517216879d..574d08f1673e 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -33,6 +33,7 @@
#include <linux/cpufreq.h>
#include <linux/cpuidle.h>
#include <linux/timer.h>
+#include <linux/wakeup_reason.h>
#include "../base.h"
#include "power.h"
@@ -1353,6 +1354,7 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
pm_callback_t callback = NULL;
char *info = NULL;
int error = 0;
+ char suspend_abort[MAX_SUSPEND_ABORT_LEN];
DECLARE_DPM_WATCHDOG_ON_STACK(wd);
TRACE_DEVICE(dev);
@@ -1375,6 +1377,9 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
pm_wakeup_event(dev, 0);
if (pm_wakeup_pending()) {
+ pm_get_active_wakeup_sources(suspend_abort,
+ MAX_SUSPEND_ABORT_LEN);
+ log_suspend_abort_reason(suspend_abort);
dev->power.direct_complete = false;
async_error = -EBUSY;
goto Complete;
diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index 23ee46a0c78c..e494a938a9f1 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -708,7 +708,7 @@ static void _remove_opp_dev(struct opp_device *opp_dev,
struct opp_table *opp_table)
{
opp_debug_unregister(opp_dev, opp_table);
- list_del(&opp_dev->node);
+ list_del_rcu(&opp_dev->node);
call_srcu(&opp_table->srcu_head.srcu, &opp_dev->rcu_head,
_kfree_opp_dev_rcu);
}
diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index f98121f11f7c..90c16d8952aa 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -15,6 +15,7 @@
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/pm_wakeirq.h>
+#include <linux/types.h>
#include <trace/events/power.h>
#include "power.h"
@@ -804,6 +805,37 @@ void pm_wakeup_event(struct device *dev, unsigned int msec)
}
EXPORT_SYMBOL_GPL(pm_wakeup_event);
+void pm_get_active_wakeup_sources(char *pending_wakeup_source, size_t max)
+{
+ struct wakeup_source *ws, *last_active_ws = NULL;
+ int len = 0;
+ bool active = false;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
+ if (ws->active && len < max) {
+ if (!active)
+ len += scnprintf(pending_wakeup_source, max,
+ "Pending Wakeup Sources: ");
+ len += scnprintf(pending_wakeup_source + len, max - len,
+ "%s ", ws->name);
+ active = true;
+ } else if (!active &&
+ (!last_active_ws ||
+ ktime_to_ns(ws->last_time) >
+ ktime_to_ns(last_active_ws->last_time))) {
+ last_active_ws = ws;
+ }
+ }
+ if (!active && last_active_ws) {
+ scnprintf(pending_wakeup_source, max,
+ "Last active Wakeup Source: %s",
+ last_active_ws->name);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(pm_get_active_wakeup_sources);
+
void pm_print_active_wakeup_sources(void)
{
struct wakeup_source *ws;
@@ -1011,7 +1043,7 @@ static int print_wakeup_source_stats(struct seq_file *m,
active_time = ktime_set(0, 0);
}
- seq_printf(m, "%-12s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n",
+ seq_printf(m, "%-32s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n",
ws->name, active_count, ws->event_count,
ws->wakeup_count, ws->expire_count,
ktime_to_ms(active_time), ktime_to_ms(total_time),
@@ -1032,7 +1064,7 @@ static int wakeup_sources_stats_show(struct seq_file *m, void *unused)
struct wakeup_source *ws;
int srcuidx;
- seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
+ seq_puts(m, "name\t\t\t\t\tactive_count\tevent_count\twakeup_count\t"
"expire_count\tactive_since\ttotal_time\tmax_time\t"
"last_change\tprevent_suspend_time\n");
diff --git a/drivers/base/syscore.c b/drivers/base/syscore.c
index 8d98a329f6ea..96c34a95cc62 100644
--- a/drivers/base/syscore.c
+++ b/drivers/base/syscore.c
@@ -11,6 +11,7 @@
#include <linux/module.h>
#include <linux/suspend.h>
#include <trace/events/power.h>
+#include <linux/wakeup_reason.h>
static LIST_HEAD(syscore_ops_list);
static DEFINE_MUTEX(syscore_ops_lock);
@@ -75,6 +76,8 @@ int syscore_suspend(void)
return 0;
err_out:
+ log_suspend_abort_reason("System core suspend callback %pF failed",
+ ops->suspend);
pr_err("PM: System core suspend callback %pF failed.\n", ops->suspend);
list_for_each_entry_continue(ops, &syscore_ops_list, node)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 9f840d9fdfcb..a08c223e2f61 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1059,6 +1059,7 @@ static int loop_clr_fd(struct loop_device *lo)
memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
memset(lo->lo_file_name, 0, LO_NAME_SIZE);
+ blk_queue_logical_block_size(lo->lo_queue, 512);
if (bdev) {
bdput(bdev);
invalidate_bdev(bdev);
@@ -1344,6 +1345,24 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
return error;
}
+static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
+{
+ if (lo->lo_state != Lo_bound)
+ return -ENXIO;
+
+ if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg))
+ return -EINVAL;
+
+ blk_mq_freeze_queue(lo->lo_queue);
+
+ blk_queue_logical_block_size(lo->lo_queue, arg);
+ loop_update_dio(lo);
+
+ blk_mq_unfreeze_queue(lo->lo_queue);
+
+ return 0;
+}
+
static int lo_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
@@ -1392,6 +1411,11 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
err = loop_set_dio(lo, arg);
break;
+ case LOOP_SET_BLOCK_SIZE:
+ err = -EPERM;
+ if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+ err = loop_set_block_size(lo, arg);
+ break;
default:
err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
}
@@ -1546,6 +1570,7 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
arg = (unsigned long) compat_ptr(arg);
case LOOP_SET_FD:
case LOOP_CHANGE_FD:
+ case LOOP_SET_BLOCK_SIZE:
err = lo_ioctl(bdev, mode, cmd, arg);
break;
default:
@@ -1781,6 +1806,7 @@ static int loop_add(struct loop_device **l, int i)
}
lo->lo_queue->queuedata = lo;
+ blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS);
/*
* It doesn't make sense to enable merge because the I/O
* submitted to backing file is handled page by page.
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
index b8ecba6dcd3b..cb53957d58f9 100644
--- a/drivers/block/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -12,4 +12,26 @@ config ZRAM
It has several use cases, for example: /tmp storage, use as swap
disks and maybe many more.
- See zram.txt for more information.
+ See Documentation/blockdev/zram.txt for more information.
+
+config ZRAM_WRITEBACK
+ bool "Write back incompressible page to backing device"
+ depends on ZRAM
+ default n
+ help
+ With incompressible page, there is no memory saving to keep it
+ in memory. Instead, write it out to backing device.
+ For this feature, admin should set up backing device via
+ /sys/block/zramX/backing_dev.
+
+ See Documentation/blockdev/zram.txt for more information.
+
+config ZRAM_MEMORY_TRACKING
+ bool "Track zRam block status"
+ depends on ZRAM && DEBUG_FS
+ help
+ With this feature, admin can track the state of allocated blocks
+ of zRAM. Admin could see the information via
+ /sys/kernel/debug/zram/zramX/block_state.
+
+ See Documentation/blockdev/zram.txt for more information.
diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index 4b5cd3a7b2b6..c084a7f9763d 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c
@@ -32,6 +32,9 @@ static const char * const backends[] = {
#if IS_ENABLED(CONFIG_CRYPTO_842)
"842",
#endif
+#if IS_ENABLED(CONFIG_CRYPTO_ZSTD)
+ "zstd",
+#endif
NULL
};
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index d64a53d3270a..3b98b1609c25 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -31,6 +31,7 @@
#include <linux/err.h>
#include <linux/idr.h>
#include <linux/sysfs.h>
+#include <linux/debugfs.h>
#include "zram_drv.h"
@@ -43,82 +44,105 @@ static const char *default_compressor = "lzo";
/* Module params (documentation at end) */
static unsigned int num_devices = 1;
+/*
+ * Pages that compress to sizes equals or greater than this are stored
+ * uncompressed in memory.
+ */
+static size_t huge_class_size;
-static inline void deprecated_attr_warn(const char *name)
+static void zram_free_page(struct zram *zram, size_t index);
+
+static void zram_slot_lock(struct zram *zram, u32 index)
{
- pr_warn_once("%d (%s) Attribute %s (and others) will be removed. %s\n",
- task_pid_nr(current),
- current->comm,
- name,
- "See zram documentation.");
+ bit_spin_lock(ZRAM_LOCK, &zram->table[index].value);
}
-#define ZRAM_ATTR_RO(name) \
-static ssize_t name##_show(struct device *d, \
- struct device_attribute *attr, char *b) \
-{ \
- struct zram *zram = dev_to_zram(d); \
- \
- deprecated_attr_warn(__stringify(name)); \
- return scnprintf(b, PAGE_SIZE, "%llu\n", \
- (u64)atomic64_read(&zram->stats.name)); \
-} \
-static DEVICE_ATTR_RO(name);
+static void zram_slot_unlock(struct zram *zram, u32 index)
+{
+ bit_spin_unlock(ZRAM_LOCK, &zram->table[index].value);
+}
static inline bool init_done(struct zram *zram)
{
return zram->disksize;
}
+static inline bool zram_allocated(struct zram *zram, u32 index)
+{
+
+ return (zram->table[index].value >> (ZRAM_FLAG_SHIFT + 1)) ||
+ zram->table[index].handle;
+}
+
static inline struct zram *dev_to_zram(struct device *dev)
{
return (struct zram *)dev_to_disk(dev)->private_data;
}
+static unsigned long zram_get_handle(struct zram *zram, u32 index)
+{
+ return zram->table[index].handle;
+}
+
+static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle)
+{
+ zram->table[index].handle = handle;
+}
+
/* flag operations require table entry bit_spin_lock() being held */
-static int zram_test_flag(struct zram_meta *meta, u32 index,
+static bool zram_test_flag(struct zram *zram, u32 index,
enum zram_pageflags flag)
{
- return meta->table[index].value & BIT(flag);
+ return zram->table[index].value & BIT(flag);
}
-static void zram_set_flag(struct zram_meta *meta, u32 index,
+static void zram_set_flag(struct zram *zram, u32 index,
enum zram_pageflags flag)
{
- meta->table[index].value |= BIT(flag);
+ zram->table[index].value |= BIT(flag);
}
-static void zram_clear_flag(struct zram_meta *meta, u32 index,
+static void zram_clear_flag(struct zram *zram, u32 index,
enum zram_pageflags flag)
{
- meta->table[index].value &= ~BIT(flag);
+ zram->table[index].value &= ~BIT(flag);
+}
+
+static inline void zram_set_element(struct zram *zram, u32 index,
+ unsigned long element)
+{
+ zram->table[index].element = element;
+}
+
+static unsigned long zram_get_element(struct zram *zram, u32 index)
+{
+ return zram->table[index].element;
}
-static size_t zram_get_obj_size(struct zram_meta *meta, u32 index)
+static size_t zram_get_obj_size(struct zram *zram, u32 index)
{
- return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
+ return zram->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
}
-static void zram_set_obj_size(struct zram_meta *meta,
+static void zram_set_obj_size(struct zram *zram,
u32 index, size_t size)
{
- unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT;
+ unsigned long flags = zram->table[index].value >> ZRAM_FLAG_SHIFT;
- meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
+ zram->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
}
+#if PAGE_SIZE != 4096
static inline bool is_partial_io(struct bio_vec *bvec)
{
return bvec->bv_len != PAGE_SIZE;
}
-
-static void zram_revalidate_disk(struct zram *zram)
+#else
+static inline bool is_partial_io(struct bio_vec *bvec)
{
- revalidate_disk(zram->disk);
- /* revalidate_disk reset the BDI_CAP_STABLE_WRITES so set again */
- zram->disk->queue->backing_dev_info.capabilities |=
- BDI_CAP_STABLE_WRITES;
+ return false;
}
+#endif
/*
* Check if request is within bounds and aligned on zram logical blocks.
@@ -146,8 +170,7 @@ static inline bool valid_io_request(struct zram *zram,
static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
{
- if (*offset + bvec->bv_len >= PAGE_SIZE)
- (*index)++;
+ *index += (*offset + bvec->bv_len) / PAGE_SIZE;
*offset = (*offset + bvec->bv_len) % PAGE_SIZE;
}
@@ -166,34 +189,39 @@ static inline void update_used_max(struct zram *zram,
} while (old_max != cur_max);
}
-static bool page_zero_filled(void *ptr)
+static inline void zram_fill_page(char *ptr, unsigned long len,
+ unsigned long value)
+{
+ int i;
+ unsigned long *page = (unsigned long *)ptr;
+
+ WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long)));
+
+ if (likely(value == 0)) {
+ memset(ptr, 0, len);
+ } else {
+ for (i = 0; i < len / sizeof(*page); i++)
+ page[i] = value;
+ }
+}
+
+static bool page_same_filled(void *ptr, unsigned long *element)
{
unsigned int pos;
unsigned long *page;
+ unsigned long val;
page = (unsigned long *)ptr;
+ val = page[0];
- for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
- if (page[pos])
+ for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) {
+ if (val != page[pos])
return false;
}
- return true;
-}
-
-static void handle_zero_page(struct bio_vec *bvec)
-{
- struct page *page = bvec->bv_page;
- void *user_mem;
-
- user_mem = kmap_atomic(page);
- if (is_partial_io(bvec))
- memset(user_mem + bvec->bv_offset, 0, bvec->bv_len);
- else
- clear_page(user_mem);
- kunmap_atomic(user_mem);
+ *element = val;
- flush_dcache_page(page);
+ return true;
}
static ssize_t initstate_show(struct device *dev,
@@ -217,102 +245,516 @@ static ssize_t disksize_show(struct device *dev,
return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
}
-static ssize_t orig_data_size_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t mem_limit_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
{
+ u64 limit;
+ char *tmp;
struct zram *zram = dev_to_zram(dev);
- deprecated_attr_warn("orig_data_size");
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT);
+ limit = memparse(buf, &tmp);
+ if (buf == tmp) /* no chars parsed, invalid input */
+ return -EINVAL;
+
+ down_write(&zram->init_lock);
+ zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT;
+ up_write(&zram->init_lock);
+
+ return len;
}
-static ssize_t mem_used_total_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t mem_used_max_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
{
- u64 val = 0;
+ int err;
+ unsigned long val;
struct zram *zram = dev_to_zram(dev);
- deprecated_attr_warn("mem_used_total");
+ err = kstrtoul(buf, 10, &val);
+ if (err || val != 0)
+ return -EINVAL;
+
down_read(&zram->init_lock);
if (init_done(zram)) {
- struct zram_meta *meta = zram->meta;
- val = zs_get_total_pages(meta->mem_pool);
+ atomic_long_set(&zram->stats.max_used_pages,
+ zs_get_total_pages(zram->mem_pool));
}
up_read(&zram->init_lock);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
+ return len;
}
-static ssize_t mem_limit_show(struct device *dev,
+#ifdef CONFIG_ZRAM_WRITEBACK
+static bool zram_wb_enabled(struct zram *zram)
+{
+ return zram->backing_dev;
+}
+
+static void reset_bdev(struct zram *zram)
+{
+ struct block_device *bdev;
+
+ if (!zram_wb_enabled(zram))
+ return;
+
+ bdev = zram->bdev;
+ if (zram->old_block_size)
+ set_blocksize(bdev, zram->old_block_size);
+ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+ /* hope filp_close flush all of IO */
+ filp_close(zram->backing_dev, NULL);
+ zram->backing_dev = NULL;
+ zram->old_block_size = 0;
+ zram->bdev = NULL;
+
+ kvfree(zram->bitmap);
+ zram->bitmap = NULL;
+}
+
+static ssize_t backing_dev_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- u64 val;
struct zram *zram = dev_to_zram(dev);
+ struct file *file = zram->backing_dev;
+ char *p;
+ ssize_t ret;
- deprecated_attr_warn("mem_limit");
down_read(&zram->init_lock);
- val = zram->limit_pages;
- up_read(&zram->init_lock);
+ if (!zram_wb_enabled(zram)) {
+ memcpy(buf, "none\n", 5);
+ up_read(&zram->init_lock);
+ return 5;
+ }
- return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
+ p = file_path(file, buf, PAGE_SIZE - 1);
+ if (IS_ERR(p)) {
+ ret = PTR_ERR(p);
+ goto out;
+ }
+
+ ret = strlen(p);
+ memmove(buf, p, ret);
+ buf[ret++] = '\n';
+out:
+ up_read(&zram->init_lock);
+ return ret;
}
-static ssize_t mem_limit_store(struct device *dev,
+static ssize_t backing_dev_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
- u64 limit;
- char *tmp;
+ char *file_name;
+ size_t sz;
+ struct file *backing_dev = NULL;
+ struct inode *inode;
+ struct address_space *mapping;
+ unsigned int bitmap_sz, old_block_size = 0;
+ unsigned long nr_pages, *bitmap = NULL;
+ struct block_device *bdev = NULL;
+ int err;
struct zram *zram = dev_to_zram(dev);
+ gfp_t kmalloc_flags;
- limit = memparse(buf, &tmp);
- if (buf == tmp) /* no chars parsed, invalid input */
- return -EINVAL;
+ file_name = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!file_name)
+ return -ENOMEM;
down_write(&zram->init_lock);
- zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT;
+ if (init_done(zram)) {
+ pr_info("Can't setup backing device for initialized device\n");
+ err = -EBUSY;
+ goto out;
+ }
+
+ strlcpy(file_name, buf, PATH_MAX);
+ /* ignore trailing newline */
+ sz = strlen(file_name);
+ if (sz > 0 && file_name[sz - 1] == '\n')
+ file_name[sz - 1] = 0x00;
+
+ backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0);
+ if (IS_ERR(backing_dev)) {
+ err = PTR_ERR(backing_dev);
+ backing_dev = NULL;
+ goto out;
+ }
+
+ mapping = backing_dev->f_mapping;
+ inode = mapping->host;
+
+ /* Support only block device in this moment */
+ if (!S_ISBLK(inode->i_mode)) {
+ err = -ENOTBLK;
+ goto out;
+ }
+
+ bdev = bdgrab(I_BDEV(inode));
+ err = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram);
+ if (err < 0)
+ goto out;
+
+ nr_pages = i_size_read(inode) >> PAGE_SHIFT;
+ bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long);
+ kmalloc_flags = GFP_KERNEL | __GFP_ZERO;
+ if (bitmap_sz > PAGE_SIZE)
+ kmalloc_flags |= __GFP_NOWARN | __GFP_NORETRY;
+
+ bitmap = kmalloc_node(bitmap_sz, kmalloc_flags, NUMA_NO_NODE);
+ if (!bitmap && bitmap_sz > PAGE_SIZE)
+ bitmap = vzalloc(bitmap_sz);
+
+ if (!bitmap) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ old_block_size = block_size(bdev);
+ err = set_blocksize(bdev, PAGE_SIZE);
+ if (err)
+ goto out;
+
+ reset_bdev(zram);
+ spin_lock_init(&zram->bitmap_lock);
+
+ zram->old_block_size = old_block_size;
+ zram->bdev = bdev;
+ zram->backing_dev = backing_dev;
+ zram->bitmap = bitmap;
+ zram->nr_pages = nr_pages;
up_write(&zram->init_lock);
+ pr_info("setup backing device %s\n", file_name);
+ kfree(file_name);
+
return len;
+out:
+ if (bitmap)
+ kvfree(bitmap);
+
+ if (bdev)
+ blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+
+ if (backing_dev)
+ filp_close(backing_dev, NULL);
+
+ up_write(&zram->init_lock);
+
+ kfree(file_name);
+
+ return err;
}
-static ssize_t mem_used_max_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+static unsigned long get_entry_bdev(struct zram *zram)
{
- u64 val = 0;
- struct zram *zram = dev_to_zram(dev);
+ unsigned long entry;
- deprecated_attr_warn("mem_used_max");
- down_read(&zram->init_lock);
- if (init_done(zram))
- val = atomic_long_read(&zram->stats.max_used_pages);
- up_read(&zram->init_lock);
+ spin_lock(&zram->bitmap_lock);
+ /* skip 0 bit to confuse zram.handle = 0 */
+ entry = find_next_zero_bit(zram->bitmap, zram->nr_pages, 1);
+ if (entry == zram->nr_pages) {
+ spin_unlock(&zram->bitmap_lock);
+ return 0;
+ }
- return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
+ set_bit(entry, zram->bitmap);
+ spin_unlock(&zram->bitmap_lock);
+
+ return entry;
}
-static ssize_t mem_used_max_store(struct device *dev,
- struct device_attribute *attr, const char *buf, size_t len)
+static void put_entry_bdev(struct zram *zram, unsigned long entry)
{
- int err;
- unsigned long val;
- struct zram *zram = dev_to_zram(dev);
+ int was_set;
- err = kstrtoul(buf, 10, &val);
- if (err || val != 0)
- return -EINVAL;
+ spin_lock(&zram->bitmap_lock);
+ was_set = test_and_clear_bit(entry, zram->bitmap);
+ spin_unlock(&zram->bitmap_lock);
+ WARN_ON_ONCE(!was_set);
+}
+
+static void zram_page_end_io(struct bio *bio)
+{
+ struct page *page = bio->bi_io_vec[0].bv_page;
+
+ page_endio(page, op_is_write(bio_op(bio)), bio->bi_error);
+ bio_put(bio);
+}
+
+/*
+ * Returns 1 if the submission is successful.
+ */
+static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
+ unsigned long entry, struct bio *parent)
+{
+ struct bio *bio;
+
+ bio = bio_alloc(GFP_ATOMIC, 1);
+ if (!bio)
+ return -ENOMEM;
+
+ bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
+ bio->bi_bdev = zram->bdev;
+ if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) {
+ bio_put(bio);
+ return -EIO;
+ }
+
+ if (!parent) {
+ bio->bi_opf = REQ_OP_READ;
+ bio->bi_end_io = zram_page_end_io;
+ } else {
+ bio->bi_opf = parent->bi_opf;
+ bio_chain(bio, parent);
+ }
+
+ submit_bio(bio);
+ return 1;
+}
+
+struct zram_work {
+ struct work_struct work;
+ struct zram *zram;
+ unsigned long entry;
+ struct bio *bio;
+};
+
+#if PAGE_SIZE != 4096
+static void zram_sync_read(struct work_struct *work)
+{
+ struct bio_vec bvec;
+ struct zram_work *zw = container_of(work, struct zram_work, work);
+ struct zram *zram = zw->zram;
+ unsigned long entry = zw->entry;
+ struct bio *bio = zw->bio;
+
+ read_from_bdev_async(zram, &bvec, entry, bio);
+}
+
+/*
+ * Block layer want one ->make_request_fn to be active at a time
+ * so if we use chained IO with parent IO in same context,
+ * it's a deadlock. To avoid, it, it uses worker thread context.
+ */
+static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
+ unsigned long entry, struct bio *bio)
+{
+ struct zram_work work;
+
+ work.zram = zram;
+ work.entry = entry;
+ work.bio = bio;
+
+ INIT_WORK_ONSTACK(&work.work, zram_sync_read);
+ queue_work(system_unbound_wq, &work.work);
+ flush_work(&work.work);
+ destroy_work_on_stack(&work.work);
+
+ return 1;
+}
+#else
+static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
+ unsigned long entry, struct bio *bio)
+{
+ WARN_ON(1);
+ return -EIO;
+}
+#endif
+
+static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
+ unsigned long entry, struct bio *parent, bool sync)
+{
+ if (sync)
+ return read_from_bdev_sync(zram, bvec, entry, parent);
+ else
+ return read_from_bdev_async(zram, bvec, entry, parent);
+}
+
+static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
+ u32 index, struct bio *parent,
+ unsigned long *pentry)
+{
+ struct bio *bio;
+ unsigned long entry;
+
+ bio = bio_alloc(GFP_ATOMIC, 1);
+ if (!bio)
+ return -ENOMEM;
+
+ entry = get_entry_bdev(zram);
+ if (!entry) {
+ bio_put(bio);
+ return -ENOSPC;
+ }
+
+ bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
+ bio->bi_bdev = zram->bdev;
+ if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len,
+ bvec->bv_offset)) {
+ bio_put(bio);
+ put_entry_bdev(zram, entry);
+ return -EIO;
+ }
+
+ if (!parent) {
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
+ bio->bi_end_io = zram_page_end_io;
+ } else {
+ bio->bi_opf = parent->bi_opf;
+ bio_chain(bio, parent);
+ }
+
+ submit_bio(bio);
+ *pentry = entry;
+
+ return 0;
+}
+
+static void zram_wb_clear(struct zram *zram, u32 index)
+{
+ unsigned long entry;
+
+ zram_clear_flag(zram, index, ZRAM_WB);
+ entry = zram_get_element(zram, index);
+ zram_set_element(zram, index, 0);
+ put_entry_bdev(zram, entry);
+}
+
+#else
+static bool zram_wb_enabled(struct zram *zram) { return false; }
+static inline void reset_bdev(struct zram *zram) {};
+static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
+ u32 index, struct bio *parent,
+ unsigned long *pentry)
+
+{
+ return -EIO;
+}
+
+static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
+ unsigned long entry, struct bio *parent, bool sync)
+{
+ return -EIO;
+}
+static void zram_wb_clear(struct zram *zram, u32 index) {}
+#endif
+
+#ifdef CONFIG_ZRAM_MEMORY_TRACKING
+
+static struct dentry *zram_debugfs_root;
+
+static void zram_debugfs_create(void)
+{
+ zram_debugfs_root = debugfs_create_dir("zram", NULL);
+}
+
+static void zram_debugfs_destroy(void)
+{
+ debugfs_remove_recursive(zram_debugfs_root);
+}
+
+static void zram_accessed(struct zram *zram, u32 index)
+{
+ zram->table[index].ac_time = ktime_get_boottime();
+}
+
+static void zram_reset_access(struct zram *zram, u32 index)
+{
+ zram->table[index].ac_time.tv64 = 0;
+}
+
+static ssize_t read_block_state(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char *kbuf;
+ ssize_t index, written = 0;
+ struct zram *zram = file->private_data;
+ unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
+ struct timespec64 ts;
+
+ gfp_t kmalloc_flags;
+
+ kmalloc_flags = GFP_KERNEL;
+ if (count > PAGE_SIZE)
+ kmalloc_flags |= __GFP_NOWARN | __GFP_NORETRY;
+
+ kbuf = kmalloc_node(count, kmalloc_flags, NUMA_NO_NODE);
+ if (!kbuf && count > PAGE_SIZE)
+ kbuf = vmalloc(count);
+ if (!kbuf)
+ return -ENOMEM;
down_read(&zram->init_lock);
- if (init_done(zram)) {
- struct zram_meta *meta = zram->meta;
- atomic_long_set(&zram->stats.max_used_pages,
- zs_get_total_pages(meta->mem_pool));
+ if (!init_done(zram)) {
+ up_read(&zram->init_lock);
+ kvfree(kbuf);
+ return -EINVAL;
}
+
+ for (index = *ppos; index < nr_pages; index++) {
+ int copied;
+
+ zram_slot_lock(zram, index);
+ if (!zram_allocated(zram, index))
+ goto next;
+
+ ts = ktime_to_timespec64(zram->table[index].ac_time);
+ copied = snprintf(kbuf + written, count,
+ "%12zd %12lld.%06lu %c%c%c\n",
+ index, (s64)ts.tv_sec,
+ ts.tv_nsec / NSEC_PER_USEC,
+ zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.',
+ zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.',
+ zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.');
+
+ if (count < copied) {
+ zram_slot_unlock(zram, index);
+ break;
+ }
+ written += copied;
+ count -= copied;
+next:
+ zram_slot_unlock(zram, index);
+ *ppos += 1;
+ }
+
up_read(&zram->init_lock);
+ if (copy_to_user(buf, kbuf, written))
+ written = -EFAULT;
+ kvfree(kbuf);
- return len;
+ return written;
+}
+
+static const struct file_operations proc_zram_block_state_op = {
+ .open = simple_open,
+ .read = read_block_state,
+ .llseek = default_llseek,
+};
+
+static void zram_debugfs_register(struct zram *zram)
+{
+ if (!zram_debugfs_root)
+ return;
+
+ zram->debugfs_dir = debugfs_create_dir(zram->disk->disk_name,
+ zram_debugfs_root);
+ debugfs_create_file("block_state", 0400, zram->debugfs_dir,
+ zram, &proc_zram_block_state_op);
}
+static void zram_debugfs_unregister(struct zram *zram)
+{
+ debugfs_remove_recursive(zram->debugfs_dir);
+}
+#else
+static void zram_debugfs_create(void) {};
+static void zram_debugfs_destroy(void) {};
+static void zram_accessed(struct zram *zram, u32 index) {};
+static void zram_reset_access(struct zram *zram, u32 index) {};
+static void zram_debugfs_register(struct zram *zram) {};
+static void zram_debugfs_unregister(struct zram *zram) {};
+#endif
+
/*
* We switched to per-cpu streams and this attr is not needed anymore.
* However, we will keep it around for some time, because:
@@ -351,7 +793,7 @@ static ssize_t comp_algorithm_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
struct zram *zram = dev_to_zram(dev);
- char compressor[CRYPTO_MAX_ALG_NAME];
+ char compressor[ARRAY_SIZE(zram->compressor)];
size_t sz;
strlcpy(compressor, buf, sizeof(compressor));
@@ -370,7 +812,7 @@ static ssize_t comp_algorithm_store(struct device *dev,
return -EBUSY;
}
- strlcpy(zram->compressor, compressor, sizeof(compressor));
+ strcpy(zram->compressor, compressor);
up_write(&zram->init_lock);
return len;
}
@@ -379,7 +821,6 @@ static ssize_t compact_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
struct zram *zram = dev_to_zram(dev);
- struct zram_meta *meta;
down_read(&zram->init_lock);
if (!init_done(zram)) {
@@ -387,8 +828,7 @@ static ssize_t compact_store(struct device *dev,
return -EINVAL;
}
- meta = zram->meta;
- zs_compact(meta->mem_pool);
+ zs_compact(zram->mem_pool);
up_read(&zram->init_lock);
return len;
@@ -425,22 +865,23 @@ static ssize_t mm_stat_show(struct device *dev,
down_read(&zram->init_lock);
if (init_done(zram)) {
- mem_used = zs_get_total_pages(zram->meta->mem_pool);
- zs_pool_stats(zram->meta->mem_pool, &pool_stats);
+ mem_used = zs_get_total_pages(zram->mem_pool);
+ zs_pool_stats(zram->mem_pool, &pool_stats);
}
orig_size = atomic64_read(&zram->stats.pages_stored);
max_used = atomic_long_read(&zram->stats.max_used_pages);
ret = scnprintf(buf, PAGE_SIZE,
- "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
+ "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu\n",
orig_size << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.compr_data_size),
mem_used << PAGE_SHIFT,
zram->limit_pages << PAGE_SHIFT,
max_used << PAGE_SHIFT,
- (u64)atomic64_read(&zram->stats.zero_pages),
- pool_stats.pages_compacted);
+ (u64)atomic64_read(&zram->stats.same_pages),
+ pool_stats.pages_compacted,
+ (u64)atomic64_read(&zram->stats.huge_pages));
up_read(&zram->init_lock);
return ret;
@@ -466,74 +907,38 @@ static ssize_t debug_stat_show(struct device *dev,
static DEVICE_ATTR_RO(io_stat);
static DEVICE_ATTR_RO(mm_stat);
static DEVICE_ATTR_RO(debug_stat);
-ZRAM_ATTR_RO(num_reads);
-ZRAM_ATTR_RO(num_writes);
-ZRAM_ATTR_RO(failed_reads);
-ZRAM_ATTR_RO(failed_writes);
-ZRAM_ATTR_RO(invalid_io);
-ZRAM_ATTR_RO(notify_free);
-ZRAM_ATTR_RO(zero_pages);
-ZRAM_ATTR_RO(compr_data_size);
-
-static inline bool zram_meta_get(struct zram *zram)
-{
- if (atomic_inc_not_zero(&zram->refcount))
- return true;
- return false;
-}
-
-static inline void zram_meta_put(struct zram *zram)
-{
- atomic_dec(&zram->refcount);
-}
-static void zram_meta_free(struct zram_meta *meta, u64 disksize)
+static void zram_meta_free(struct zram *zram, u64 disksize)
{
size_t num_pages = disksize >> PAGE_SHIFT;
size_t index;
/* Free all pages that are still in this zram device */
- for (index = 0; index < num_pages; index++) {
- unsigned long handle = meta->table[index].handle;
-
- if (!handle)
- continue;
-
- zs_free(meta->mem_pool, handle);
- }
+ for (index = 0; index < num_pages; index++)
+ zram_free_page(zram, index);
- zs_destroy_pool(meta->mem_pool);
- vfree(meta->table);
- kfree(meta);
+ zs_destroy_pool(zram->mem_pool);
+ vfree(zram->table);
}
-static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
+static bool zram_meta_alloc(struct zram *zram, u64 disksize)
{
size_t num_pages;
- struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);
-
- if (!meta)
- return NULL;
num_pages = disksize >> PAGE_SHIFT;
- meta->table = vzalloc(num_pages * sizeof(*meta->table));
- if (!meta->table) {
- pr_err("Error allocating zram address table\n");
- goto out_error;
- }
+ zram->table = vzalloc(num_pages * sizeof(*zram->table));
+ if (!zram->table)
+ return false;
- meta->mem_pool = zs_create_pool(pool_name);
- if (!meta->mem_pool) {
- pr_err("Error creating memory pool\n");
- goto out_error;
+ zram->mem_pool = zs_create_pool(zram->disk->disk_name);
+ if (!zram->mem_pool) {
+ vfree(zram->table);
+ return false;
}
- return meta;
-
-out_error:
- vfree(meta->table);
- kfree(meta);
- return NULL;
+ if (!huge_class_size)
+ huge_class_size = zs_huge_class_size(zram->mem_pool);
+ return true;
}
/*
@@ -543,191 +948,195 @@ out_error:
*/
static void zram_free_page(struct zram *zram, size_t index)
{
- struct zram_meta *meta = zram->meta;
- unsigned long handle = meta->table[index].handle;
+ unsigned long handle;
- if (unlikely(!handle)) {
- /*
- * No memory is allocated for zero filled pages.
- * Simply clear zero page flag.
- */
- if (zram_test_flag(meta, index, ZRAM_ZERO)) {
- zram_clear_flag(meta, index, ZRAM_ZERO);
- atomic64_dec(&zram->stats.zero_pages);
- }
+ zram_reset_access(zram, index);
+
+ if (zram_test_flag(zram, index, ZRAM_HUGE)) {
+ zram_clear_flag(zram, index, ZRAM_HUGE);
+ atomic64_dec(&zram->stats.huge_pages);
+ }
+
+ if (zram_wb_enabled(zram) && zram_test_flag(zram, index, ZRAM_WB)) {
+ zram_wb_clear(zram, index);
+ atomic64_dec(&zram->stats.pages_stored);
return;
}
- zs_free(meta->mem_pool, handle);
+ /*
+ * No memory is allocated for same element filled pages.
+ * Simply clear same page flag.
+ */
+ if (zram_test_flag(zram, index, ZRAM_SAME)) {
+ zram_clear_flag(zram, index, ZRAM_SAME);
+ zram_set_element(zram, index, 0);
+ atomic64_dec(&zram->stats.same_pages);
+ atomic64_dec(&zram->stats.pages_stored);
+ return;
+ }
+
+ handle = zram_get_handle(zram, index);
+ if (!handle)
+ return;
+
+ zs_free(zram->mem_pool, handle);
- atomic64_sub(zram_get_obj_size(meta, index),
+ atomic64_sub(zram_get_obj_size(zram, index),
&zram->stats.compr_data_size);
atomic64_dec(&zram->stats.pages_stored);
- meta->table[index].handle = 0;
- zram_set_obj_size(meta, index, 0);
+ zram_set_handle(zram, index, 0);
+ zram_set_obj_size(zram, index, 0);
}
-static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
+static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
+ struct bio *bio, bool partial_io)
{
- int ret = 0;
- unsigned char *cmem;
- struct zram_meta *meta = zram->meta;
+ int ret;
unsigned long handle;
unsigned int size;
+ void *src, *dst;
- bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
- handle = meta->table[index].handle;
- size = zram_get_obj_size(meta, index);
+ if (zram_wb_enabled(zram)) {
+ zram_slot_lock(zram, index);
+ if (zram_test_flag(zram, index, ZRAM_WB)) {
+ struct bio_vec bvec;
- if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
- bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
- memset(mem, 0, PAGE_SIZE);
+ zram_slot_unlock(zram, index);
+
+ bvec.bv_page = page;
+ bvec.bv_len = PAGE_SIZE;
+ bvec.bv_offset = 0;
+ return read_from_bdev(zram, &bvec,
+ zram_get_element(zram, index),
+ bio, partial_io);
+ }
+ zram_slot_unlock(zram, index);
+ }
+
+ zram_slot_lock(zram, index);
+ handle = zram_get_handle(zram, index);
+ if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) {
+ unsigned long value;
+ void *mem;
+
+ value = handle ? zram_get_element(zram, index) : 0;
+ mem = kmap_atomic(page);
+ zram_fill_page(mem, PAGE_SIZE, value);
+ kunmap_atomic(mem);
+ zram_slot_unlock(zram, index);
return 0;
}
- cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
+ size = zram_get_obj_size(zram, index);
+
+ src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
if (size == PAGE_SIZE) {
- memcpy(mem, cmem, PAGE_SIZE);
+ dst = kmap_atomic(page);
+ memcpy(dst, src, PAGE_SIZE);
+ kunmap_atomic(dst);
+ ret = 0;
} else {
struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
- ret = zcomp_decompress(zstrm, cmem, size, mem);
+ dst = kmap_atomic(page);
+ ret = zcomp_decompress(zstrm, src, size, dst);
+ kunmap_atomic(dst);
zcomp_stream_put(zram->comp);
}
- zs_unmap_object(meta->mem_pool, handle);
- bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+ zs_unmap_object(zram->mem_pool, handle);
+ zram_slot_unlock(zram, index);
/* Should NEVER happen. Return bio error if it does. */
- if (unlikely(ret)) {
+ if (unlikely(ret))
pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
- return ret;
- }
- return 0;
+ return ret;
}
static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
- u32 index, int offset)
+ u32 index, int offset, struct bio *bio)
{
int ret;
struct page *page;
- unsigned char *user_mem, *uncmem = NULL;
- struct zram_meta *meta = zram->meta;
- page = bvec->bv_page;
- bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
- if (unlikely(!meta->table[index].handle) ||
- zram_test_flag(meta, index, ZRAM_ZERO)) {
- bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
- handle_zero_page(bvec);
- return 0;
+ page = bvec->bv_page;
+ if (is_partial_io(bvec)) {
+ /* Use a temporary buffer to decompress the page */
+ page = alloc_page(GFP_NOIO|__GFP_HIGHMEM);
+ if (!page)
+ return -ENOMEM;
}
- bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
- if (is_partial_io(bvec))
- /* Use a temporary buffer to decompress the page */
- uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
+ ret = __zram_bvec_read(zram, page, index, bio, is_partial_io(bvec));
+ if (unlikely(ret))
+ goto out;
- user_mem = kmap_atomic(page);
- if (!is_partial_io(bvec))
- uncmem = user_mem;
+ if (is_partial_io(bvec)) {
+ void *dst = kmap_atomic(bvec->bv_page);
+ void *src = kmap_atomic(page);
- if (!uncmem) {
- pr_err("Unable to allocate temp memory\n");
- ret = -ENOMEM;
- goto out_cleanup;
+ memcpy(dst + bvec->bv_offset, src + offset, bvec->bv_len);
+ kunmap_atomic(src);
+ kunmap_atomic(dst);
}
-
- ret = zram_decompress_page(zram, uncmem, index);
- /* Should NEVER happen. Return bio error if it does. */
- if (unlikely(ret))
- goto out_cleanup;
-
+out:
if (is_partial_io(bvec))
- memcpy(user_mem + bvec->bv_offset, uncmem + offset,
- bvec->bv_len);
+ __free_page(page);
- flush_dcache_page(page);
- ret = 0;
-out_cleanup:
- kunmap_atomic(user_mem);
- if (is_partial_io(bvec))
- kfree(uncmem);
return ret;
}
-static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
- int offset)
+static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
+ u32 index, struct bio *bio)
{
int ret = 0;
- unsigned int clen;
- unsigned long handle = 0;
- struct page *page;
- unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
- struct zram_meta *meta = zram->meta;
- struct zcomp_strm *zstrm = NULL;
unsigned long alloced_pages;
+ unsigned long handle = 0;
+ unsigned int comp_len = 0;
+ void *src, *dst, *mem;
+ struct zcomp_strm *zstrm;
+ struct page *page = bvec->bv_page;
+ unsigned long element = 0;
+ enum zram_pageflags flags = 0;
+ bool allow_wb = true;
- page = bvec->bv_page;
- if (is_partial_io(bvec)) {
- /*
- * This is a partial IO. We need to read the full page
- * before to write the changes.
- */
- uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
- if (!uncmem) {
- ret = -ENOMEM;
- goto out;
- }
- ret = zram_decompress_page(zram, uncmem, index);
- if (ret)
- goto out;
- }
-
-compress_again:
- user_mem = kmap_atomic(page);
- if (is_partial_io(bvec)) {
- memcpy(uncmem + offset, user_mem + bvec->bv_offset,
- bvec->bv_len);
- kunmap_atomic(user_mem);
- user_mem = NULL;
- } else {
- uncmem = user_mem;
- }
-
- if (page_zero_filled(uncmem)) {
- if (user_mem)
- kunmap_atomic(user_mem);
+ mem = kmap_atomic(page);
+ if (page_same_filled(mem, &element)) {
+ kunmap_atomic(mem);
/* Free memory associated with this sector now. */
- bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
- zram_free_page(zram, index);
- zram_set_flag(meta, index, ZRAM_ZERO);
- bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
-
- atomic64_inc(&zram->stats.zero_pages);
- ret = 0;
+ flags = ZRAM_SAME;
+ atomic64_inc(&zram->stats.same_pages);
goto out;
}
+ kunmap_atomic(mem);
+compress_again:
zstrm = zcomp_stream_get(zram->comp);
- ret = zcomp_compress(zstrm, uncmem, &clen);
- if (!is_partial_io(bvec)) {
- kunmap_atomic(user_mem);
- user_mem = NULL;
- uncmem = NULL;
- }
+ src = kmap_atomic(page);
+ ret = zcomp_compress(zstrm, src, &comp_len);
+ kunmap_atomic(src);
if (unlikely(ret)) {
+ zcomp_stream_put(zram->comp);
pr_err("Compression failed! err=%d\n", ret);
- goto out;
+ zs_free(zram->mem_pool, handle);
+ return ret;
}
- src = zstrm->buffer;
- if (unlikely(clen > max_zpage_size)) {
- clen = PAGE_SIZE;
- if (is_partial_io(bvec))
- src = uncmem;
+ if (unlikely(comp_len >= huge_class_size)) {
+ comp_len = PAGE_SIZE;
+ if (zram_wb_enabled(zram) && allow_wb) {
+ zcomp_stream_put(zram->comp);
+ ret = write_to_bdev(zram, bvec, index, bio, &element);
+ if (!ret) {
+ flags = ZRAM_WB;
+ ret = 1;
+ goto out;
+ }
+ allow_wb = false;
+ goto compress_again;
+ }
}
/*
@@ -744,71 +1153,108 @@ compress_again:
* from the slow path and handle has already been allocated.
*/
if (!handle)
- handle = zs_malloc(meta->mem_pool, clen,
+ handle = zs_malloc(zram->mem_pool, comp_len,
__GFP_KSWAPD_RECLAIM |
__GFP_NOWARN |
__GFP_HIGHMEM |
__GFP_MOVABLE);
if (!handle) {
zcomp_stream_put(zram->comp);
- zstrm = NULL;
-
atomic64_inc(&zram->stats.writestall);
-
- handle = zs_malloc(meta->mem_pool, clen,
+ handle = zs_malloc(zram->mem_pool, comp_len,
GFP_NOIO | __GFP_HIGHMEM |
__GFP_MOVABLE);
if (handle)
goto compress_again;
-
- pr_err("Error allocating memory for compressed page: %u, size=%u\n",
- index, clen);
- ret = -ENOMEM;
- goto out;
+ return -ENOMEM;
}
- alloced_pages = zs_get_total_pages(meta->mem_pool);
+ alloced_pages = zs_get_total_pages(zram->mem_pool);
update_used_max(zram, alloced_pages);
if (zram->limit_pages && alloced_pages > zram->limit_pages) {
- zs_free(meta->mem_pool, handle);
- ret = -ENOMEM;
- goto out;
+ zcomp_stream_put(zram->comp);
+ zs_free(zram->mem_pool, handle);
+ return -ENOMEM;
}
- cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
+ dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
- if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
+ src = zstrm->buffer;
+ if (comp_len == PAGE_SIZE)
src = kmap_atomic(page);
- memcpy(cmem, src, PAGE_SIZE);
+ memcpy(dst, src, comp_len);
+ if (comp_len == PAGE_SIZE)
kunmap_atomic(src);
- } else {
- memcpy(cmem, src, clen);
- }
zcomp_stream_put(zram->comp);
- zstrm = NULL;
- zs_unmap_object(meta->mem_pool, handle);
-
+ zs_unmap_object(zram->mem_pool, handle);
+ atomic64_add(comp_len, &zram->stats.compr_data_size);
+out:
/*
* Free memory associated with this sector
* before overwriting unused sectors.
*/
- bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+ zram_slot_lock(zram, index);
zram_free_page(zram, index);
- meta->table[index].handle = handle;
- zram_set_obj_size(meta, index, clen);
- bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+ if (comp_len == PAGE_SIZE) {
+ zram_set_flag(zram, index, ZRAM_HUGE);
+ atomic64_inc(&zram->stats.huge_pages);
+ }
+
+ if (flags) {
+ zram_set_flag(zram, index, flags);
+ zram_set_element(zram, index, element);
+ } else {
+ zram_set_handle(zram, index, handle);
+ zram_set_obj_size(zram, index, comp_len);
+ }
+ zram_slot_unlock(zram, index);
/* Update stats */
- atomic64_add(clen, &zram->stats.compr_data_size);
atomic64_inc(&zram->stats.pages_stored);
+ return ret;
+}
+
+static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
+ u32 index, int offset, struct bio *bio)
+{
+ int ret;
+ struct page *page = NULL;
+ void *src;
+ struct bio_vec vec;
+
+ vec = *bvec;
+ if (is_partial_io(bvec)) {
+ void *dst;
+ /*
+ * This is a partial IO. We need to read the full page
+ * before to write the changes.
+ */
+ page = alloc_page(GFP_NOIO|__GFP_HIGHMEM);
+ if (!page)
+ return -ENOMEM;
+
+ ret = __zram_bvec_read(zram, page, index, bio, true);
+ if (ret)
+ goto out;
+
+ src = kmap_atomic(bvec->bv_page);
+ dst = kmap_atomic(page);
+ memcpy(dst + offset, src + bvec->bv_offset, bvec->bv_len);
+ kunmap_atomic(dst);
+ kunmap_atomic(src);
+
+ vec.bv_page = page;
+ vec.bv_len = PAGE_SIZE;
+ vec.bv_offset = 0;
+ }
+
+ ret = __zram_bvec_write(zram, &vec, index, bio);
out:
- if (zstrm)
- zcomp_stream_put(zram->comp);
if (is_partial_io(bvec))
- kfree(uncmem);
+ __free_page(page);
return ret;
}
@@ -821,7 +1267,6 @@ static void zram_bio_discard(struct zram *zram, u32 index,
int offset, struct bio *bio)
{
size_t n = bio->bi_iter.bi_size;
- struct zram_meta *meta = zram->meta;
/*
* zram manages data in physical block size units. Because logical block
@@ -842,17 +1287,22 @@ static void zram_bio_discard(struct zram *zram, u32 index,
}
while (n >= PAGE_SIZE) {
- bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+ zram_slot_lock(zram, index);
zram_free_page(zram, index);
- bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+ zram_slot_unlock(zram, index);
atomic64_inc(&zram->stats.notify_free);
index++;
n -= PAGE_SIZE;
}
}
+/*
+ * Returns errno if it has some problem. Otherwise return 0 or 1.
+ * Returns 0 if IO request was done synchronously
+ * Returns 1 if IO request was successfully submitted.
+ */
static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
- int offset, bool is_write)
+ int offset, bool is_write, struct bio *bio)
{
unsigned long start_time = jiffies;
int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ;
@@ -863,15 +1313,20 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
if (!is_write) {
atomic64_inc(&zram->stats.num_reads);
- ret = zram_bvec_read(zram, bvec, index, offset);
+ ret = zram_bvec_read(zram, bvec, index, offset, bio);
+ flush_dcache_page(bvec->bv_page);
} else {
atomic64_inc(&zram->stats.num_writes);
- ret = zram_bvec_write(zram, bvec, index, offset);
+ ret = zram_bvec_write(zram, bvec, index, offset, bio);
}
generic_end_io_acct(rw_acct, &zram->disk->part0, start_time);
- if (unlikely(ret)) {
+ zram_slot_lock(zram, index);
+ zram_accessed(zram, index);
+ zram_slot_unlock(zram, index);
+
+ if (unlikely(ret < 0)) {
if (!is_write)
atomic64_inc(&zram->stats.failed_reads);
else
@@ -899,34 +1354,21 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
}
bio_for_each_segment(bvec, bio, iter) {
- int max_transfer_size = PAGE_SIZE - offset;
-
- if (bvec.bv_len > max_transfer_size) {
- /*
- * zram_bvec_rw() can only make operation on a single
- * zram page. Split the bio vector.
- */
- struct bio_vec bv;
-
- bv.bv_page = bvec.bv_page;
- bv.bv_len = max_transfer_size;
- bv.bv_offset = bvec.bv_offset;
+ struct bio_vec bv = bvec;
+ unsigned int unwritten = bvec.bv_len;
+ do {
+ bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
+ unwritten);
if (zram_bvec_rw(zram, &bv, index, offset,
- op_is_write(bio_op(bio))) < 0)
+ op_is_write(bio_op(bio)), bio) < 0)
goto out;
- bv.bv_len = bvec.bv_len - max_transfer_size;
- bv.bv_offset += max_transfer_size;
- if (zram_bvec_rw(zram, &bv, index + 1, 0,
- op_is_write(bio_op(bio))) < 0)
- goto out;
- } else
- if (zram_bvec_rw(zram, &bvec, index, offset,
- op_is_write(bio_op(bio))) < 0)
- goto out;
+ bv.bv_offset += bv.bv_len;
+ unwritten -= bv.bv_len;
- update_position(&index, &offset, &bvec);
+ update_position(&index, &offset, &bv);
+ } while (unwritten);
}
bio_endio(bio);
@@ -943,22 +1385,15 @@ static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio)
{
struct zram *zram = queue->queuedata;
- if (unlikely(!zram_meta_get(zram)))
- goto error;
-
- blk_queue_split(queue, &bio, queue->bio_split);
-
if (!valid_io_request(zram, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size)) {
atomic64_inc(&zram->stats.invalid_io);
- goto put_zram;
+ goto error;
}
__zram_make_request(zram, bio);
- zram_meta_put(zram);
return BLK_QC_T_NONE;
-put_zram:
- zram_meta_put(zram);
+
error:
bio_io_error(bio);
return BLK_QC_T_NONE;
@@ -968,45 +1403,39 @@ static void zram_slot_free_notify(struct block_device *bdev,
unsigned long index)
{
struct zram *zram;
- struct zram_meta *meta;
zram = bdev->bd_disk->private_data;
- meta = zram->meta;
- bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+ zram_slot_lock(zram, index);
zram_free_page(zram, index);
- bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+ zram_slot_unlock(zram, index);
atomic64_inc(&zram->stats.notify_free);
}
static int zram_rw_page(struct block_device *bdev, sector_t sector,
struct page *page, bool is_write)
{
- int offset, err = -EIO;
+ int offset, ret;
u32 index;
struct zram *zram;
struct bio_vec bv;
zram = bdev->bd_disk->private_data;
- if (unlikely(!zram_meta_get(zram)))
- goto out;
if (!valid_io_request(zram, sector, PAGE_SIZE)) {
atomic64_inc(&zram->stats.invalid_io);
- err = -EINVAL;
- goto put_zram;
+ ret = -EINVAL;
+ goto out;
}
index = sector >> SECTORS_PER_PAGE_SHIFT;
- offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;
+ offset = (sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
bv.bv_page = page;
bv.bv_len = PAGE_SIZE;
bv.bv_offset = 0;
- err = zram_bvec_rw(zram, &bv, index, offset, is_write);
-put_zram:
- zram_meta_put(zram);
+ ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL);
out:
/*
* If I/O fails, just return error(ie, non-zero) without
@@ -1016,14 +1445,24 @@ out:
* bio->bi_end_io does things to handle the error
* (e.g., SetPageError, set_page_dirty and extra works).
*/
- if (err == 0)
+ if (unlikely(ret < 0))
+ return ret;
+
+ switch (ret) {
+ case 0:
page_endio(page, is_write, 0);
- return err;
+ break;
+ case 1:
+ ret = 0;
+ break;
+ default:
+ WARN_ON(1);
+ }
+ return ret;
}
static void zram_reset_device(struct zram *zram)
{
- struct zram_meta *meta;
struct zcomp *comp;
u64 disksize;
@@ -1036,23 +1475,8 @@ static void zram_reset_device(struct zram *zram)
return;
}
- meta = zram->meta;
comp = zram->comp;
disksize = zram->disksize;
- /*
- * Refcount will go down to 0 eventually and r/w handler
- * cannot handle further I/O so it will bail out by
- * check zram_meta_get.
- */
- zram_meta_put(zram);
- /*
- * We want to free zram_meta in process context to avoid
- * deadlock between reclaim path and any other locks.
- */
- wait_event(zram->io_done, atomic_read(&zram->refcount) == 0);
-
- /* Reset stats */
- memset(&zram->stats, 0, sizeof(zram->stats));
zram->disksize = 0;
set_capacity(zram->disk, 0);
@@ -1060,8 +1484,10 @@ static void zram_reset_device(struct zram *zram)
up_write(&zram->init_lock);
/* I/O operation under all of CPU are done so let's free */
- zram_meta_free(meta, disksize);
+ zram_meta_free(zram, disksize);
+ memset(&zram->stats, 0, sizeof(zram->stats));
zcomp_destroy(comp);
+ reset_bdev(zram);
}
static ssize_t disksize_store(struct device *dev,
@@ -1069,7 +1495,6 @@ static ssize_t disksize_store(struct device *dev,
{
u64 disksize;
struct zcomp *comp;
- struct zram_meta *meta;
struct zram *zram = dev_to_zram(dev);
int err;
@@ -1077,10 +1502,18 @@ static ssize_t disksize_store(struct device *dev,
if (!disksize)
return -EINVAL;
+ down_write(&zram->init_lock);
+ if (init_done(zram)) {
+ pr_info("Cannot change disksize for initialized device\n");
+ err = -EBUSY;
+ goto out_unlock;
+ }
+
disksize = PAGE_ALIGN(disksize);
- meta = zram_meta_alloc(zram->disk->disk_name, disksize);
- if (!meta)
- return -ENOMEM;
+ if (!zram_meta_alloc(zram, disksize)) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
comp = zcomp_create(zram->compressor);
if (IS_ERR(comp)) {
@@ -1090,29 +1523,19 @@ static ssize_t disksize_store(struct device *dev,
goto out_free_meta;
}
- down_write(&zram->init_lock);
- if (init_done(zram)) {
- pr_info("Cannot change disksize for initialized device\n");
- err = -EBUSY;
- goto out_destroy_comp;
- }
-
- init_waitqueue_head(&zram->io_done);
- atomic_set(&zram->refcount, 1);
- zram->meta = meta;
zram->comp = comp;
zram->disksize = disksize;
set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
- zram_revalidate_disk(zram);
+
+ revalidate_disk(zram->disk);
up_write(&zram->init_lock);
return len;
-out_destroy_comp:
- up_write(&zram->init_lock);
- zcomp_destroy(comp);
out_free_meta:
- zram_meta_free(meta, disksize);
+ zram_meta_free(zram, disksize);
+out_unlock:
+ up_write(&zram->init_lock);
return err;
}
@@ -1151,7 +1574,7 @@ static ssize_t reset_store(struct device *dev,
/* Make sure all the pending I/O are finished */
fsync_bdev(bdev);
zram_reset_device(zram);
- zram_revalidate_disk(zram);
+ revalidate_disk(zram->disk);
bdput(bdev);
mutex_lock(&bdev->bd_mutex);
@@ -1187,39 +1610,33 @@ static DEVICE_ATTR_WO(compact);
static DEVICE_ATTR_RW(disksize);
static DEVICE_ATTR_RO(initstate);
static DEVICE_ATTR_WO(reset);
-static DEVICE_ATTR_RO(orig_data_size);
-static DEVICE_ATTR_RO(mem_used_total);
-static DEVICE_ATTR_RW(mem_limit);
-static DEVICE_ATTR_RW(mem_used_max);
+static DEVICE_ATTR_WO(mem_limit);
+static DEVICE_ATTR_WO(mem_used_max);
static DEVICE_ATTR_RW(max_comp_streams);
static DEVICE_ATTR_RW(comp_algorithm);
+#ifdef CONFIG_ZRAM_WRITEBACK
+static DEVICE_ATTR_RW(backing_dev);
+#endif
static struct attribute *zram_disk_attrs[] = {
&dev_attr_disksize.attr,
&dev_attr_initstate.attr,
&dev_attr_reset.attr,
- &dev_attr_num_reads.attr,
- &dev_attr_num_writes.attr,
- &dev_attr_failed_reads.attr,
- &dev_attr_failed_writes.attr,
&dev_attr_compact.attr,
- &dev_attr_invalid_io.attr,
- &dev_attr_notify_free.attr,
- &dev_attr_zero_pages.attr,
- &dev_attr_orig_data_size.attr,
- &dev_attr_compr_data_size.attr,
- &dev_attr_mem_used_total.attr,
&dev_attr_mem_limit.attr,
&dev_attr_mem_used_max.attr,
&dev_attr_max_comp_streams.attr,
&dev_attr_comp_algorithm.attr,
+#ifdef CONFIG_ZRAM_WRITEBACK
+ &dev_attr_backing_dev.attr,
+#endif
&dev_attr_io_stat.attr,
&dev_attr_mm_stat.attr,
&dev_attr_debug_stat.attr,
NULL,
};
-static struct attribute_group zram_disk_attr_group = {
+static const struct attribute_group zram_disk_attr_group = {
.attrs = zram_disk_attrs,
};
@@ -1281,6 +1698,7 @@ static int zram_add(void)
/* zram devices sort of resembles non-rotational disks */
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
+
/*
* To ensure that we always get PAGE_SIZE aligned
* and n*PAGE_SIZED sized I/O requests.
@@ -1291,8 +1709,6 @@ static int zram_add(void)
blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
- zram->disk->queue->limits.max_sectors = SECTORS_PER_PAGE;
- zram->disk->queue->limits.chunk_sectors = 0;
blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
/*
* zram_bio_discard() will clear all logical blocks if logical block
@@ -1308,12 +1724,15 @@ static int zram_add(void)
zram->disk->queue->limits.discard_zeroes_data = 0;
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);
+ zram->disk->queue->backing_dev_info.capabilities |=
+ BDI_CAP_STABLE_WRITES;
+
disk_to_dev(zram->disk)->groups = zram_disk_attr_groups;
add_disk(zram->disk);
strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
- zram->meta = NULL;
+ zram_debugfs_register(zram);
pr_info("Added device: %s\n", zram->disk->disk_name);
return device_id;
@@ -1344,6 +1763,8 @@ static int zram_remove(struct zram *zram)
zram->claim = true;
mutex_unlock(&bdev->bd_mutex);
+ zram_debugfs_unregister(zram);
+
/* Make sure all the pending I/O are finished */
fsync_bdev(bdev);
zram_reset_device(zram);
@@ -1351,8 +1772,8 @@ static int zram_remove(struct zram *zram)
pr_info("Removed device: %s\n", zram->disk->disk_name);
- blk_cleanup_queue(zram->disk->queue);
del_gendisk(zram->disk);
+ blk_cleanup_queue(zram->disk->queue);
put_disk(zram->disk);
kfree(zram);
return 0;
@@ -1432,6 +1853,7 @@ static void destroy_devices(void)
{
class_unregister(&zram_control_class);
idr_for_each(&zram_index_idr, &zram_remove_cb, NULL);
+ zram_debugfs_destroy();
idr_destroy(&zram_index_idr);
unregister_blkdev(zram_major, "zram");
}
@@ -1446,6 +1868,7 @@ static int __init zram_init(void)
return ret;
}
+ zram_debugfs_create();
zram_major = register_blkdev(0, "zram");
if (zram_major <= 0) {
pr_err("Unable to get major number\n");
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 74fcf10da374..3a1cac486e96 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -21,22 +21,6 @@
#include "zcomp.h"
-/*-- Configurable parameters */
-
-/*
- * Pages that compress to size greater than this are stored
- * uncompressed in memory.
- */
-static const size_t max_zpage_size = PAGE_SIZE / 4 * 3;
-
-/*
- * NOTE: max_zpage_size must be less than or equal to:
- * ZS_MAX_ALLOC_SIZE. Otherwise, zs_malloc() would
- * always return failure.
- */
-
-/*-- End of configurable params */
-
#define SECTOR_SHIFT 9
#define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
#define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT)
@@ -60,9 +44,11 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3;
/* Flags for zram pages (table[page_no].value) */
enum zram_pageflags {
- /* Page consists entirely of zeros */
- ZRAM_ZERO = ZRAM_FLAG_SHIFT,
- ZRAM_ACCESS, /* page is now accessed */
+ /* zram slot is locked */
+ ZRAM_LOCK = ZRAM_FLAG_SHIFT,
+ ZRAM_SAME, /* Page consists the same element */
+ ZRAM_WB, /* page is stored on backing_device */
+ ZRAM_HUGE, /* Incompressible page */
__NR_ZRAM_PAGEFLAGS,
};
@@ -71,8 +57,14 @@ enum zram_pageflags {
/* Allocated for each disk page */
struct zram_table_entry {
- unsigned long handle;
+ union {
+ unsigned long handle;
+ unsigned long element;
+ };
unsigned long value;
+#ifdef CONFIG_ZRAM_MEMORY_TRACKING
+ ktime_t ac_time;
+#endif
};
struct zram_stats {
@@ -83,19 +75,16 @@ struct zram_stats {
atomic64_t failed_writes; /* can happen when memory is too low */
atomic64_t invalid_io; /* non-page-aligned I/O requests */
atomic64_t notify_free; /* no. of swap slot free notifications */
- atomic64_t zero_pages; /* no. of zero filled pages */
+ atomic64_t same_pages; /* no. of same element filled pages */
+ atomic64_t huge_pages; /* no. of huge pages */
atomic64_t pages_stored; /* no. of pages currently stored */
atomic_long_t max_used_pages; /* no. of maximum pages stored */
atomic64_t writestall; /* no. of write slow paths */
};
-struct zram_meta {
+struct zram {
struct zram_table_entry *table;
struct zs_pool *mem_pool;
-};
-
-struct zram {
- struct zram_meta *meta;
struct zcomp *comp;
struct gendisk *disk;
/* Prevent concurrent execution of device init */
@@ -106,9 +95,6 @@ struct zram {
unsigned long limit_pages;
struct zram_stats stats;
- atomic_t refcount; /* refcount for zram_meta */
- /* wait all IO under all of cpu are done */
- wait_queue_head_t io_done;
/*
* This is the limit on amount of *uncompressed* worth of data
* we can store in a disk.
@@ -119,5 +105,16 @@ struct zram {
* zram is claimed so open request will be failed
*/
bool claim; /* Protected by bdev->bd_mutex */
+#ifdef CONFIG_ZRAM_WRITEBACK
+ struct file *backing_dev;
+ struct block_device *bdev;
+ unsigned int old_block_size;
+ unsigned long *bitmap;
+ unsigned long nr_pages;
+ spinlock_t bitmap_lock;
+#endif
+#ifdef CONFIG_ZRAM_MEMORY_TRACKING
+ struct dentry *debugfs_dir;
+#endif
};
#endif
diff --git a/drivers/char/ipmi/ipmi_poweroff.c b/drivers/char/ipmi/ipmi_poweroff.c
index 9f2e3be2c5b8..676c910e990f 100644
--- a/drivers/char/ipmi/ipmi_poweroff.c
+++ b/drivers/char/ipmi/ipmi_poweroff.c
@@ -66,7 +66,7 @@ static void (*specific_poweroff_func)(ipmi_user_t user);
/* Holds the old poweroff function so we can restore it on removal. */
static void (*old_poweroff_func)(void);
-static int set_param_ifnum(const char *val, struct kernel_param *kp)
+static int set_param_ifnum(const char *val, const struct kernel_param *kp)
{
int rv = param_set_int(val, kp);
if (rv)
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index e0a53156b782..89adeb4905f1 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -1344,7 +1344,7 @@ static unsigned int num_slave_addrs;
#define IPMI_MEM_ADDR_SPACE 1
static const char * const addr_space_to_str[] = { "i/o", "mem" };
-static int hotmod_handler(const char *val, struct kernel_param *kp);
+static int hotmod_handler(const char *val, const struct kernel_param *kp);
module_param_call(hotmod, hotmod_handler, NULL, NULL, 0200);
MODULE_PARM_DESC(hotmod, "Add and remove interfaces. See"
@@ -1814,7 +1814,7 @@ static struct smi_info *smi_info_alloc(void)
return info;
}
-static int hotmod_handler(const char *val, struct kernel_param *kp)
+static int hotmod_handler(const char *val, const struct kernel_param *kp)
{
char *str = kstrdup(val, GFP_KERNEL);
int rv;
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 81b65d0e7563..075306fdb1e3 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -263,7 +263,7 @@
#include <linux/syscalls.h>
#include <linux/completion.h>
#include <linux/uuid.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
#include <asm/processor.h>
#include <asm/uaccess.h>
@@ -438,11 +438,10 @@ static int crng_init = 0;
#define crng_ready() (likely(crng_init > 1))
static int crng_init_cnt = 0;
static unsigned long crng_global_init_time = 0;
-#define CRNG_INIT_CNT_THRESH (2*CHACHA20_KEY_SIZE)
-static void _extract_crng(struct crng_state *crng,
- __u8 out[CHACHA20_BLOCK_SIZE]);
+#define CRNG_INIT_CNT_THRESH (2*CHACHA_KEY_SIZE)
+static void _extract_crng(struct crng_state *crng, __u8 out[CHACHA_BLOCK_SIZE]);
static void _crng_backtrack_protect(struct crng_state *crng,
- __u8 tmp[CHACHA20_BLOCK_SIZE], int used);
+ __u8 tmp[CHACHA_BLOCK_SIZE], int used);
static void process_random_ready_list(void);
static struct ratelimit_state unseeded_warning =
@@ -818,7 +817,7 @@ static int crng_fast_load(const char *cp, size_t len)
}
p = (unsigned char *) &primary_crng.state[4];
while (len > 0 && crng_init_cnt < CRNG_INIT_CNT_THRESH) {
- p[crng_init_cnt % CHACHA20_KEY_SIZE] ^= *cp;
+ p[crng_init_cnt % CHACHA_KEY_SIZE] ^= *cp;
cp++; crng_init_cnt++; len--;
}
if (crng_init_cnt >= CRNG_INIT_CNT_THRESH) {
@@ -868,7 +867,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
unsigned long flags;
int i, num;
union {
- __u8 block[CHACHA20_BLOCK_SIZE];
+ __u8 block[CHACHA_BLOCK_SIZE];
__u32 key[8];
} buf;
@@ -879,7 +878,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
} else {
_extract_crng(&primary_crng, buf.block);
_crng_backtrack_protect(&primary_crng, buf.block,
- CHACHA20_KEY_SIZE);
+ CHACHA_KEY_SIZE);
}
spin_lock_irqsave(&crng->lock, flags);
for (i = 0; i < 8; i++) {
@@ -926,7 +925,7 @@ static inline void crng_wait_ready(void)
}
static void _extract_crng(struct crng_state *crng,
- __u8 out[CHACHA20_BLOCK_SIZE])
+ __u8 out[CHACHA_BLOCK_SIZE])
{
unsigned long v, flags;
@@ -943,7 +942,7 @@ static void _extract_crng(struct crng_state *crng,
spin_unlock_irqrestore(&crng->lock, flags);
}
-static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
+static void extract_crng(__u8 out[CHACHA_BLOCK_SIZE])
{
struct crng_state *crng = NULL;
@@ -961,14 +960,14 @@ static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
* enough) to mutate the CRNG key to provide backtracking protection.
*/
static void _crng_backtrack_protect(struct crng_state *crng,
- __u8 tmp[CHACHA20_BLOCK_SIZE], int used)
+ __u8 tmp[CHACHA_BLOCK_SIZE], int used)
{
unsigned long flags;
__u32 *s, *d;
int i;
used = round_up(used, sizeof(__u32));
- if (used + CHACHA20_KEY_SIZE > CHACHA20_BLOCK_SIZE) {
+ if (used + CHACHA_KEY_SIZE > CHACHA_BLOCK_SIZE) {
extract_crng(tmp);
used = 0;
}
@@ -980,7 +979,7 @@ static void _crng_backtrack_protect(struct crng_state *crng,
spin_unlock_irqrestore(&crng->lock, flags);
}
-static void crng_backtrack_protect(__u8 tmp[CHACHA20_BLOCK_SIZE], int used)
+static void crng_backtrack_protect(__u8 tmp[CHACHA_BLOCK_SIZE], int used)
{
struct crng_state *crng = NULL;
@@ -995,8 +994,8 @@ static void crng_backtrack_protect(__u8 tmp[CHACHA20_BLOCK_SIZE], int used)
static ssize_t extract_crng_user(void __user *buf, size_t nbytes)
{
- ssize_t ret = 0, i = CHACHA20_BLOCK_SIZE;
- __u8 tmp[CHACHA20_BLOCK_SIZE];
+ ssize_t ret = 0, i = CHACHA_BLOCK_SIZE;
+ __u8 tmp[CHACHA_BLOCK_SIZE] __aligned(4);
int large_request = (nbytes > 256);
while (nbytes) {
@@ -1010,7 +1009,7 @@ static ssize_t extract_crng_user(void __user *buf, size_t nbytes)
}
extract_crng(tmp);
- i = min_t(int, nbytes, CHACHA20_BLOCK_SIZE);
+ i = min_t(int, nbytes, CHACHA_BLOCK_SIZE);
if (copy_to_user(buf, tmp, i)) {
ret = -EFAULT;
break;
@@ -1564,7 +1563,7 @@ static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf,
*/
void get_random_bytes(void *buf, int nbytes)
{
- __u8 tmp[CHACHA20_BLOCK_SIZE];
+ __u8 tmp[CHACHA_BLOCK_SIZE] __aligned(4);
#if DEBUG_RANDOM_BOOT > 0
if (!crng_ready())
@@ -1573,10 +1572,10 @@ void get_random_bytes(void *buf, int nbytes)
#endif
trace_get_random_bytes(nbytes, _RET_IP_);
- while (nbytes >= CHACHA20_BLOCK_SIZE) {
+ while (nbytes >= CHACHA_BLOCK_SIZE) {
extract_crng(buf);
- buf += CHACHA20_BLOCK_SIZE;
- nbytes -= CHACHA20_BLOCK_SIZE;
+ buf += CHACHA_BLOCK_SIZE;
+ nbytes -= CHACHA_BLOCK_SIZE;
}
if (nbytes > 0) {
@@ -1584,7 +1583,7 @@ void get_random_bytes(void *buf, int nbytes)
memcpy(buf, tmp, nbytes);
crng_backtrack_protect(tmp, nbytes);
} else
- crng_backtrack_protect(tmp, CHACHA20_BLOCK_SIZE);
+ crng_backtrack_protect(tmp, CHACHA_BLOCK_SIZE);
memzero_explicit(tmp, sizeof(tmp));
}
EXPORT_SYMBOL(get_random_bytes);
@@ -2110,8 +2109,8 @@ struct ctl_table random_table[] = {
struct batched_entropy {
union {
- unsigned long entropy_long[CHACHA20_BLOCK_SIZE / sizeof(unsigned long)];
- unsigned int entropy_int[CHACHA20_BLOCK_SIZE / sizeof(unsigned int)];
+ unsigned long entropy_long[CHACHA_BLOCK_SIZE / sizeof(unsigned long)];
+ unsigned int entropy_int[CHACHA_BLOCK_SIZE / sizeof(unsigned int)];
};
unsigned int position;
};
diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index e2c6e43cf8ca..0bed22ff57a8 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -305,6 +305,14 @@ config ARM_ARCH_TIMER_EVTSTREAM
This must be disabled for hardware validation purposes to detect any
hardware anomalies of missing events.
+config ARM_ARCH_TIMER_VCT_ACCESS
+ bool "Support for ARM architected timer virtual counter access in userspace"
+ default !ARM64
+ depends on ARM_ARCH_TIMER
+ help
+ This option enables support for reading the ARM architected timer's
+ virtual counter in userspace.
+
config FSL_ERRATUM_A008585
bool "Workaround for Freescale/NXP Erratum A-008585"
default y
diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index a2503db7e533..e3bc592b1055 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -449,7 +449,10 @@ static void arch_counter_set_user_access(void)
| ARCH_TIMER_USR_PCT_ACCESS_EN);
/* Enable user access to the virtual counter */
- cntkctl |= ARCH_TIMER_USR_VCT_ACCESS_EN;
+ if (IS_ENABLED(CONFIG_ARM_ARCH_TIMER_VCT_ACCESS))
+ cntkctl |= ARCH_TIMER_USR_VCT_ACCESS_EN;
+ else
+ cntkctl &= ~ARCH_TIMER_USR_VCT_ACCESS_EN;
arch_timer_set_cntkctl(cntkctl);
}
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index cac26fb22891..c70ed51daa15 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -45,6 +45,15 @@ config CPU_FREQ_STAT_DETAILS
If in doubt, say N.
+config CPU_FREQ_TIMES
+ bool "CPU frequency time-in-state statistics"
+ default y
+ help
+ This driver exports CPU time-in-state information through procfs file
+ system.
+
+ If in doubt, say N.
+
choice
prompt "Default CPUFreq governor"
default CPU_FREQ_DEFAULT_GOV_USERSPACE if ARM_SA1100_CPUFREQ || ARM_SA1110_CPUFREQ
@@ -102,6 +111,16 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
governor. If unsure have a look at the help section of the
driver. Fallback governor will be the performance governor.
+config CPU_FREQ_DEFAULT_GOV_INTERACTIVE
+ bool "interactive"
+ select CPU_FREQ_GOV_INTERACTIVE
+ select CPU_FREQ_GOV_PERFORMANCE
+ help
+ Use the CPUFreq governor 'interactive' as default. This allows
+ you to get a full dynamic cpu frequency capable system by simply
+ loading your cpufreq low-level hardware driver, using the
+ 'interactive' governor for latency-sensitive workloads.
+
config CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
bool "schedutil"
depends on SMP
@@ -193,6 +212,26 @@ config CPU_FREQ_GOV_CONSERVATIVE
If in doubt, say N.
+config CPU_FREQ_GOV_INTERACTIVE
+ tristate "'interactive' cpufreq policy governor"
+ depends on CPU_FREQ
+ select CPU_FREQ_GOV_ATTR_SET
+ select IRQ_WORK
+ help
+ 'interactive' - This driver adds a dynamic cpufreq policy governor
+ designed for latency-sensitive workloads.
+
+ This governor attempts to reduce the latency of clock
+ increases so that the system is more responsive to
+ interactive workloads.
+
+ To compile this driver as a module, choose M here: the
+ module will be called cpufreq_interactive.
+
+ For details, take a look at linux/Documentation/cpu-freq.
+
+ If in doubt, say N.
+
config CPU_FREQ_GOV_SCHEDUTIL
bool "'schedutil' cpufreq policy governor"
depends on CPU_FREQ && SMP
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index 0a9b6a093646..f10a18dcd685 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -4,12 +4,16 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o freq_table.o
# CPUfreq stats
obj-$(CONFIG_CPU_FREQ_STAT) += cpufreq_stats.o
-# CPUfreq governors
+# CPUfreq times
+obj-$(CONFIG_CPU_FREQ_TIMES) += cpufreq_times.o
+
+# CPUfreq governors
obj-$(CONFIG_CPU_FREQ_GOV_PERFORMANCE) += cpufreq_performance.o
obj-$(CONFIG_CPU_FREQ_GOV_POWERSAVE) += cpufreq_powersave.o
obj-$(CONFIG_CPU_FREQ_GOV_USERSPACE) += cpufreq_userspace.o
obj-$(CONFIG_CPU_FREQ_GOV_ONDEMAND) += cpufreq_ondemand.o
obj-$(CONFIG_CPU_FREQ_GOV_CONSERVATIVE) += cpufreq_conservative.o
+obj-$(CONFIG_CPU_FREQ_GOV_INTERACTIVE) += cpufreq_interactive.o
obj-$(CONFIG_CPU_FREQ_GOV_COMMON) += cpufreq_governor.o
obj-$(CONFIG_CPU_FREQ_GOV_ATTR_SET) += cpufreq_governor_attr_set.o
diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 1496617b05d5..fbb278b7eed6 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -282,6 +282,13 @@ static int cpufreq_init(struct cpufreq_policy *policy)
policy->cpuinfo.transition_latency = transition_latency;
+ /*
+ * Android: set default parameters for parity between schedutil and
+ * schedfreq
+ */
+ policy->up_transition_delay_us = transition_latency / NSEC_PER_USEC;
+ policy->down_transition_delay_us = 50000; /* 50ms */
+
return 0;
out_free_cpufreq_table:
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index d6d91e8afa9e..249c5bebe211 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -19,6 +19,7 @@
#include <linux/cpu.h>
#include <linux/cpufreq.h>
+#include <linux/cpufreq_times.h>
#include <linux/delay.h>
#include <linux/device.h>
#include <linux/init.h>
@@ -29,6 +30,9 @@
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
#include <linux/tick.h>
+#ifdef CONFIG_SMP
+#include <linux/sched.h>
+#endif
#include <trace/events/power.h>
static LIST_HEAD(cpufreq_policy_list);
@@ -117,6 +121,12 @@ bool have_governor_per_policy(void)
}
EXPORT_SYMBOL_GPL(have_governor_per_policy);
+bool cpufreq_driver_is_slow(void)
+{
+ return !(cpufreq_driver->flags & CPUFREQ_DRIVER_FAST);
+}
+EXPORT_SYMBOL_GPL(cpufreq_driver_is_slow);
+
struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy)
{
if (have_governor_per_policy())
@@ -301,6 +311,92 @@ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci)
#endif
}
+/*********************************************************************
+ * FREQUENCY INVARIANT CPU CAPACITY *
+ *********************************************************************/
+
+static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
+static DEFINE_PER_CPU(unsigned long, max_freq_cpu);
+static DEFINE_PER_CPU(unsigned long, max_freq_scale) = SCHED_CAPACITY_SCALE;
+static DEFINE_PER_CPU(unsigned long, min_freq_scale);
+
+static void
+scale_freq_capacity(const cpumask_t *cpus, unsigned long cur_freq,
+ unsigned long max_freq)
+{
+ unsigned long scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq;
+ int cpu;
+
+ for_each_cpu(cpu, cpus) {
+ per_cpu(freq_scale, cpu) = scale;
+ per_cpu(max_freq_cpu, cpu) = max_freq;
+ }
+
+ pr_debug("cpus %*pbl cur freq/max freq %lu/%lu kHz freq scale %lu\n",
+ cpumask_pr_args(cpus), cur_freq, max_freq, scale);
+}
+
+unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ return per_cpu(freq_scale, cpu);
+}
+
+static void
+scale_max_freq_capacity(const cpumask_t *cpus, unsigned long policy_max_freq)
+{
+ unsigned long scale, max_freq;
+ int cpu = cpumask_first(cpus);
+
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ max_freq = per_cpu(max_freq_cpu, cpu);
+
+ if (!max_freq)
+ return;
+
+ scale = (policy_max_freq << SCHED_CAPACITY_SHIFT) / max_freq;
+
+ for_each_cpu(cpu, cpus)
+ per_cpu(max_freq_scale, cpu) = scale;
+
+ pr_debug("cpus %*pbl policy max freq/max freq %lu/%lu kHz max freq scale %lu\n",
+ cpumask_pr_args(cpus), policy_max_freq, max_freq, scale);
+}
+
+unsigned long cpufreq_scale_max_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ return per_cpu(max_freq_scale, cpu);
+}
+
+static void
+scale_min_freq_capacity(const cpumask_t *cpus, unsigned long policy_min_freq)
+{
+ unsigned long scale, max_freq;
+ int cpu = cpumask_first(cpus);
+
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ max_freq = per_cpu(max_freq_cpu, cpu);
+
+ if (!max_freq)
+ return;
+
+ scale = (policy_min_freq << SCHED_CAPACITY_SHIFT) / max_freq;
+
+ for_each_cpu(cpu, cpus)
+ per_cpu(min_freq_scale, cpu) = scale;
+
+ pr_debug("cpus %*pbl policy min freq/max freq %lu/%lu kHz min freq scale %lu\n",
+ cpumask_pr_args(cpus), policy_min_freq, max_freq, scale);
+}
+
+unsigned long cpufreq_scale_min_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ return per_cpu(min_freq_scale, cpu);
+}
+
static void __cpufreq_notify_transition(struct cpufreq_policy *policy,
struct cpufreq_freqs *freqs, unsigned int state)
{
@@ -339,6 +435,7 @@ static void __cpufreq_notify_transition(struct cpufreq_policy *policy,
(unsigned long)freqs->new, (unsigned long)freqs->cpu);
trace_cpu_frequency(freqs->new, freqs->cpu);
cpufreq_stats_record_transition(policy, freqs->new);
+ cpufreq_times_record_transition(freqs);
srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
CPUFREQ_POSTCHANGE, freqs);
if (likely(policy) && likely(policy->cpu == freqs->cpu))
@@ -378,6 +475,9 @@ static void cpufreq_notify_post_transition(struct cpufreq_policy *policy,
void cpufreq_freq_transition_begin(struct cpufreq_policy *policy,
struct cpufreq_freqs *freqs)
{
+#ifdef CONFIG_SMP
+ int cpu;
+#endif
/*
* Catch double invocations of _begin() which lead to self-deadlock.
@@ -405,6 +505,12 @@ wait:
spin_unlock(&policy->transition_lock);
+ scale_freq_capacity(policy->cpus, freqs->new, policy->cpuinfo.max_freq);
+#ifdef CONFIG_SMP
+ for_each_cpu(cpu, policy->cpus)
+ trace_cpu_capacity(capacity_curr_of(cpu), cpu);
+#endif
+
cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE);
}
EXPORT_SYMBOL_GPL(cpufreq_freq_transition_begin);
@@ -1257,6 +1363,7 @@ static int cpufreq_online(unsigned int cpu)
goto out_exit_policy;
cpufreq_stats_create_table(policy);
+ cpufreq_times_create_policy(policy);
blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
CPUFREQ_CREATE_POLICY, policy);
@@ -2201,8 +2308,12 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
CPUFREQ_NOTIFY, new_policy);
+ scale_max_freq_capacity(policy->cpus, policy->max);
+ scale_min_freq_capacity(policy->cpus, policy->min);
+
policy->min = new_policy->min;
policy->max = new_policy->max;
+ trace_cpu_frequency_limits(policy->max, policy->min, policy->cpu);
policy->cached_target_freq = UINT_MAX;
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 00a74351f623..0fe251865ac6 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -302,7 +302,10 @@ static void cs_start(struct cpufreq_policy *policy)
dbs_info->requested_freq = policy->cur;
}
-static struct dbs_governor cs_governor = {
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
+static
+#endif
+struct dbs_governor cs_governor = {
.gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("conservative"),
.kobj_type = { .default_attrs = cs_attributes },
.gov_dbs_timer = cs_dbs_timer,
diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c
new file mode 100644
index 000000000000..5a77d9129611
--- /dev/null
+++ b/drivers/cpufreq/cpufreq_interactive.c
@@ -0,0 +1,1411 @@
+/*
+ * drivers/cpufreq/cpufreq_interactive.c
+ *
+ * Copyright (C) 2010-2016 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Author: Mike Chan (mike@android.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpufreq.h>
+#include <linux/irq_work.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/tick.h>
+#include <linux/time.h>
+#include <linux/timer.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/cpufreq_interactive.h>
+
+#define gov_attr_ro(_name) \
+static struct governor_attr _name = \
+__ATTR(_name, 0444, show_##_name, NULL)
+
+#define gov_attr_wo(_name) \
+static struct governor_attr _name = \
+__ATTR(_name, 0200, NULL, store_##_name)
+
+#define gov_attr_rw(_name) \
+static struct governor_attr _name = \
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+/* Separate instance required for each 'interactive' directory in sysfs */
+struct interactive_tunables {
+ struct gov_attr_set attr_set;
+
+ /* Hi speed to bump to from lo speed when load burst (default max) */
+ unsigned int hispeed_freq;
+
+ /* Go to hi speed when CPU load at or above this value. */
+#define DEFAULT_GO_HISPEED_LOAD 99
+ unsigned long go_hispeed_load;
+
+ /* Target load. Lower values result in higher CPU speeds. */
+ spinlock_t target_loads_lock;
+ unsigned int *target_loads;
+ int ntarget_loads;
+
+ /*
+ * The minimum amount of time to spend at a frequency before we can ramp
+ * down.
+ */
+#define DEFAULT_MIN_SAMPLE_TIME (80 * USEC_PER_MSEC)
+ unsigned long min_sample_time;
+
+ /* The sample rate of the timer used to increase frequency */
+ unsigned long sampling_rate;
+
+ /*
+ * Wait this long before raising speed above hispeed, by default a
+ * single timer interval.
+ */
+ spinlock_t above_hispeed_delay_lock;
+ unsigned int *above_hispeed_delay;
+ int nabove_hispeed_delay;
+
+ /* Non-zero means indefinite speed boost active */
+ int boost;
+ /* Duration of a boot pulse in usecs */
+ int boostpulse_duration;
+ /* End time of boost pulse in ktime converted to usecs */
+ u64 boostpulse_endtime;
+ bool boosted;
+
+ /*
+ * Max additional time to wait in idle, beyond sampling_rate, at speeds
+ * above minimum before wakeup to reduce speed, or -1 if unnecessary.
+ */
+#define DEFAULT_TIMER_SLACK (4 * DEFAULT_SAMPLING_RATE)
+ unsigned long timer_slack_delay;
+ unsigned long timer_slack;
+ bool io_is_busy;
+};
+
+/* Separate instance required for each 'struct cpufreq_policy' */
+struct interactive_policy {
+ struct cpufreq_policy *policy;
+ struct interactive_tunables *tunables;
+ struct list_head tunables_hook;
+};
+
+/* Separate instance required for each CPU */
+struct interactive_cpu {
+ struct update_util_data update_util;
+ struct interactive_policy *ipolicy;
+
+ struct irq_work irq_work;
+ u64 last_sample_time;
+ unsigned long next_sample_jiffies;
+ bool work_in_progress;
+
+ struct rw_semaphore enable_sem;
+ struct timer_list slack_timer;
+
+ spinlock_t load_lock; /* protects the next 4 fields */
+ u64 time_in_idle;
+ u64 time_in_idle_timestamp;
+ u64 cputime_speedadj;
+ u64 cputime_speedadj_timestamp;
+
+ spinlock_t target_freq_lock; /*protects target freq */
+ unsigned int target_freq;
+
+ unsigned int floor_freq;
+ u64 pol_floor_val_time; /* policy floor_validate_time */
+ u64 loc_floor_val_time; /* per-cpu floor_validate_time */
+ u64 pol_hispeed_val_time; /* policy hispeed_validate_time */
+ u64 loc_hispeed_val_time; /* per-cpu hispeed_validate_time */
+};
+
+static DEFINE_PER_CPU(struct interactive_cpu, interactive_cpu);
+
+/* Realtime thread handles frequency scaling */
+static struct task_struct *speedchange_task;
+static cpumask_t speedchange_cpumask;
+static spinlock_t speedchange_cpumask_lock;
+
+/* Target load. Lower values result in higher CPU speeds. */
+#define DEFAULT_TARGET_LOAD 90
+static unsigned int default_target_loads[] = {DEFAULT_TARGET_LOAD};
+
+#define DEFAULT_SAMPLING_RATE (20 * USEC_PER_MSEC)
+#define DEFAULT_ABOVE_HISPEED_DELAY DEFAULT_SAMPLING_RATE
+static unsigned int default_above_hispeed_delay[] = {
+ DEFAULT_ABOVE_HISPEED_DELAY
+};
+
+/* Iterate over interactive policies for tunables */
+#define for_each_ipolicy(__ip) \
+ list_for_each_entry(__ip, &tunables->attr_set.policy_list, tunables_hook)
+
+static struct interactive_tunables *global_tunables;
+static DEFINE_MUTEX(global_tunables_lock);
+
+static inline void update_slack_delay(struct interactive_tunables *tunables)
+{
+ tunables->timer_slack_delay = usecs_to_jiffies(tunables->timer_slack +
+ tunables->sampling_rate);
+}
+
+static bool timer_slack_required(struct interactive_cpu *icpu)
+{
+ struct interactive_policy *ipolicy = icpu->ipolicy;
+ struct interactive_tunables *tunables = ipolicy->tunables;
+
+ if (tunables->timer_slack < 0)
+ return false;
+
+ if (icpu->target_freq > ipolicy->policy->min)
+ return true;
+
+ return false;
+}
+
+static void gov_slack_timer_start(struct interactive_cpu *icpu, int cpu)
+{
+ struct interactive_tunables *tunables = icpu->ipolicy->tunables;
+
+ icpu->slack_timer.expires = jiffies + tunables->timer_slack_delay;
+ add_timer_on(&icpu->slack_timer, cpu);
+}
+
+static void gov_slack_timer_modify(struct interactive_cpu *icpu)
+{
+ struct interactive_tunables *tunables = icpu->ipolicy->tunables;
+
+ mod_timer(&icpu->slack_timer, jiffies + tunables->timer_slack_delay);
+}
+
+static void slack_timer_resched(struct interactive_cpu *icpu, int cpu,
+ bool modify)
+{
+ struct interactive_tunables *tunables = icpu->ipolicy->tunables;
+ unsigned long flags;
+
+ spin_lock_irqsave(&icpu->load_lock, flags);
+
+ icpu->time_in_idle = get_cpu_idle_time(cpu,
+ &icpu->time_in_idle_timestamp,
+ tunables->io_is_busy);
+ icpu->cputime_speedadj = 0;
+ icpu->cputime_speedadj_timestamp = icpu->time_in_idle_timestamp;
+
+ if (timer_slack_required(icpu)) {
+ if (modify)
+ gov_slack_timer_modify(icpu);
+ else
+ gov_slack_timer_start(icpu, cpu);
+ }
+
+ spin_unlock_irqrestore(&icpu->load_lock, flags);
+}
+
+static unsigned int
+freq_to_above_hispeed_delay(struct interactive_tunables *tunables,
+ unsigned int freq)
+{
+ unsigned long flags;
+ unsigned int ret;
+ int i;
+
+ spin_lock_irqsave(&tunables->above_hispeed_delay_lock, flags);
+
+ for (i = 0; i < tunables->nabove_hispeed_delay - 1 &&
+ freq >= tunables->above_hispeed_delay[i + 1]; i += 2)
+ ;
+
+ ret = tunables->above_hispeed_delay[i];
+ spin_unlock_irqrestore(&tunables->above_hispeed_delay_lock, flags);
+
+ return ret;
+}
+
+static unsigned int freq_to_targetload(struct interactive_tunables *tunables,
+ unsigned int freq)
+{
+ unsigned long flags;
+ unsigned int ret;
+ int i;
+
+ spin_lock_irqsave(&tunables->target_loads_lock, flags);
+
+ for (i = 0; i < tunables->ntarget_loads - 1 &&
+ freq >= tunables->target_loads[i + 1]; i += 2)
+ ;
+
+ ret = tunables->target_loads[i];
+ spin_unlock_irqrestore(&tunables->target_loads_lock, flags);
+ return ret;
+}
+
+/*
+ * If increasing frequencies never map to a lower target load then
+ * choose_freq() will find the minimum frequency that does not exceed its
+ * target load given the current load.
+ */
+static unsigned int choose_freq(struct interactive_cpu *icpu,
+ unsigned int loadadjfreq)
+{
+ struct cpufreq_policy *policy = icpu->ipolicy->policy;
+ struct cpufreq_frequency_table *freq_table = policy->freq_table;
+ unsigned int prevfreq, freqmin = 0, freqmax = UINT_MAX, tl;
+ unsigned int freq = policy->cur;
+ int index;
+
+ do {
+ prevfreq = freq;
+ tl = freq_to_targetload(icpu->ipolicy->tunables, freq);
+
+ /*
+ * Find the lowest frequency where the computed load is less
+ * than or equal to the target load.
+ */
+
+ index = cpufreq_frequency_table_target(policy, loadadjfreq / tl,
+ CPUFREQ_RELATION_L);
+
+ freq = freq_table[index].frequency;
+
+ if (freq > prevfreq) {
+ /* The previous frequency is too low */
+ freqmin = prevfreq;
+
+ if (freq < freqmax)
+ continue;
+
+ /* Find highest frequency that is less than freqmax */
+ index = cpufreq_frequency_table_target(policy,
+ freqmax - 1, CPUFREQ_RELATION_H);
+
+ freq = freq_table[index].frequency;
+
+ if (freq == freqmin) {
+ /*
+ * The first frequency below freqmax has already
+ * been found to be too low. freqmax is the
+ * lowest speed we found that is fast enough.
+ */
+ freq = freqmax;
+ break;
+ }
+ } else if (freq < prevfreq) {
+ /* The previous frequency is high enough. */
+ freqmax = prevfreq;
+
+ if (freq > freqmin)
+ continue;
+
+ /* Find lowest frequency that is higher than freqmin */
+ index = cpufreq_frequency_table_target(policy,
+ freqmin + 1, CPUFREQ_RELATION_L);
+
+ freq = freq_table[index].frequency;
+
+ /*
+ * If freqmax is the first frequency above
+ * freqmin then we have already found that
+ * this speed is fast enough.
+ */
+ if (freq == freqmax)
+ break;
+ }
+
+ /* If same frequency chosen as previous then done. */
+ } while (freq != prevfreq);
+
+ return freq;
+}
+
+static u64 update_load(struct interactive_cpu *icpu, int cpu)
+{
+ struct interactive_tunables *tunables = icpu->ipolicy->tunables;
+ u64 now_idle, now, active_time, delta_idle, delta_time;
+
+ now_idle = get_cpu_idle_time(cpu, &now, tunables->io_is_busy);
+ delta_idle = (now_idle - icpu->time_in_idle);
+ delta_time = (now - icpu->time_in_idle_timestamp);
+
+ if (delta_time <= delta_idle)
+ active_time = 0;
+ else
+ active_time = delta_time - delta_idle;
+
+ icpu->cputime_speedadj += active_time * icpu->ipolicy->policy->cur;
+
+ icpu->time_in_idle = now_idle;
+ icpu->time_in_idle_timestamp = now;
+
+ return now;
+}
+
+/* Re-evaluate load to see if a frequency change is required or not */
+static void eval_target_freq(struct interactive_cpu *icpu)
+{
+ struct interactive_tunables *tunables = icpu->ipolicy->tunables;
+ struct cpufreq_policy *policy = icpu->ipolicy->policy;
+ struct cpufreq_frequency_table *freq_table = policy->freq_table;
+ u64 cputime_speedadj, now, max_fvtime;
+ unsigned int new_freq, loadadjfreq, index, delta_time;
+ unsigned long flags;
+ int cpu_load;
+ int cpu = smp_processor_id();
+
+ spin_lock_irqsave(&icpu->load_lock, flags);
+ now = update_load(icpu, smp_processor_id());
+ delta_time = (unsigned int)(now - icpu->cputime_speedadj_timestamp);
+ cputime_speedadj = icpu->cputime_speedadj;
+ spin_unlock_irqrestore(&icpu->load_lock, flags);
+
+ if (WARN_ON_ONCE(!delta_time))
+ return;
+
+ spin_lock_irqsave(&icpu->target_freq_lock, flags);
+ do_div(cputime_speedadj, delta_time);
+ loadadjfreq = (unsigned int)cputime_speedadj * 100;
+ cpu_load = loadadjfreq / policy->cur;
+ tunables->boosted = tunables->boost ||
+ now < tunables->boostpulse_endtime;
+
+ if (cpu_load >= tunables->go_hispeed_load || tunables->boosted) {
+ if (policy->cur < tunables->hispeed_freq) {
+ new_freq = tunables->hispeed_freq;
+ } else {
+ new_freq = choose_freq(icpu, loadadjfreq);
+
+ if (new_freq < tunables->hispeed_freq)
+ new_freq = tunables->hispeed_freq;
+ }
+ } else {
+ new_freq = choose_freq(icpu, loadadjfreq);
+ if (new_freq > tunables->hispeed_freq &&
+ policy->cur < tunables->hispeed_freq)
+ new_freq = tunables->hispeed_freq;
+ }
+
+ if (policy->cur >= tunables->hispeed_freq &&
+ new_freq > policy->cur &&
+ now - icpu->pol_hispeed_val_time < freq_to_above_hispeed_delay(tunables, policy->cur)) {
+ trace_cpufreq_interactive_notyet(cpu, cpu_load,
+ icpu->target_freq, policy->cur, new_freq);
+ goto exit;
+ }
+
+ icpu->loc_hispeed_val_time = now;
+
+ index = cpufreq_frequency_table_target(policy, new_freq,
+ CPUFREQ_RELATION_L);
+ new_freq = freq_table[index].frequency;
+
+ /*
+ * Do not scale below floor_freq unless we have been at or above the
+ * floor frequency for the minimum sample time since last validated.
+ */
+ max_fvtime = max(icpu->pol_floor_val_time, icpu->loc_floor_val_time);
+ if (new_freq < icpu->floor_freq && icpu->target_freq >= policy->cur) {
+ if (now - max_fvtime < tunables->min_sample_time) {
+ trace_cpufreq_interactive_notyet(cpu, cpu_load,
+ icpu->target_freq, policy->cur, new_freq);
+ goto exit;
+ }
+ }
+
+ /*
+ * Update the timestamp for checking whether speed has been held at
+ * or above the selected frequency for a minimum of min_sample_time,
+ * if not boosted to hispeed_freq. If boosted to hispeed_freq then we
+ * allow the speed to drop as soon as the boostpulse duration expires
+ * (or the indefinite boost is turned off).
+ */
+
+ if (!tunables->boosted || new_freq > tunables->hispeed_freq) {
+ icpu->floor_freq = new_freq;
+ if (icpu->target_freq >= policy->cur || new_freq >= policy->cur)
+ icpu->loc_floor_val_time = now;
+ }
+
+ if (icpu->target_freq == new_freq &&
+ icpu->target_freq <= policy->cur) {
+ trace_cpufreq_interactive_already(cpu, cpu_load,
+ icpu->target_freq, policy->cur, new_freq);
+ goto exit;
+ }
+
+ trace_cpufreq_interactive_target(cpu, cpu_load, icpu->target_freq,
+ policy->cur, new_freq);
+
+ icpu->target_freq = new_freq;
+ spin_unlock_irqrestore(&icpu->target_freq_lock, flags);
+
+ spin_lock_irqsave(&speedchange_cpumask_lock, flags);
+ cpumask_set_cpu(cpu, &speedchange_cpumask);
+ spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);
+
+ wake_up_process(speedchange_task);
+ return;
+
+exit:
+ spin_unlock_irqrestore(&icpu->target_freq_lock, flags);
+}
+
+static void cpufreq_interactive_update(struct interactive_cpu *icpu)
+{
+ eval_target_freq(icpu);
+ slack_timer_resched(icpu, smp_processor_id(), true);
+}
+
+static void cpufreq_interactive_idle_end(void)
+{
+ struct interactive_cpu *icpu = &per_cpu(interactive_cpu,
+ smp_processor_id());
+
+ if (!down_read_trylock(&icpu->enable_sem))
+ return;
+
+ if (icpu->ipolicy) {
+ /*
+ * We haven't sampled load for more than sampling_rate time, do
+ * it right now.
+ */
+ if (time_after_eq(jiffies, icpu->next_sample_jiffies))
+ cpufreq_interactive_update(icpu);
+ }
+
+ up_read(&icpu->enable_sem);
+}
+
+static void cpufreq_interactive_get_policy_info(struct cpufreq_policy *policy,
+ unsigned int *pmax_freq,
+ u64 *phvt, u64 *pfvt)
+{
+ struct interactive_cpu *icpu;
+ u64 hvt = ~0ULL, fvt = 0;
+ unsigned int max_freq = 0, i;
+
+ for_each_cpu(i, policy->cpus) {
+ icpu = &per_cpu(interactive_cpu, i);
+
+ fvt = max(fvt, icpu->loc_floor_val_time);
+ if (icpu->target_freq > max_freq) {
+ max_freq = icpu->target_freq;
+ hvt = icpu->loc_hispeed_val_time;
+ } else if (icpu->target_freq == max_freq) {
+ hvt = min(hvt, icpu->loc_hispeed_val_time);
+ }
+ }
+
+ *pmax_freq = max_freq;
+ *phvt = hvt;
+ *pfvt = fvt;
+}
+
+static void cpufreq_interactive_adjust_cpu(unsigned int cpu,
+ struct cpufreq_policy *policy)
+{
+ struct interactive_cpu *icpu;
+ u64 hvt, fvt;
+ unsigned int max_freq;
+ int i;
+
+ cpufreq_interactive_get_policy_info(policy, &max_freq, &hvt, &fvt);
+
+ for_each_cpu(i, policy->cpus) {
+ icpu = &per_cpu(interactive_cpu, i);
+ icpu->pol_floor_val_time = fvt;
+ }
+
+ if (max_freq != policy->cur) {
+ __cpufreq_driver_target(policy, max_freq, CPUFREQ_RELATION_H);
+ for_each_cpu(i, policy->cpus) {
+ icpu = &per_cpu(interactive_cpu, i);
+ icpu->pol_hispeed_val_time = hvt;
+ }
+ }
+
+ trace_cpufreq_interactive_setspeed(cpu, max_freq, policy->cur);
+}
+
+static int cpufreq_interactive_speedchange_task(void *data)
+{
+ unsigned int cpu;
+ cpumask_t tmp_mask;
+ unsigned long flags;
+
+again:
+ set_current_state(TASK_INTERRUPTIBLE);
+ spin_lock_irqsave(&speedchange_cpumask_lock, flags);
+
+ if (cpumask_empty(&speedchange_cpumask)) {
+ spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);
+ schedule();
+
+ if (kthread_should_stop())
+ return 0;
+
+ spin_lock_irqsave(&speedchange_cpumask_lock, flags);
+ }
+
+ set_current_state(TASK_RUNNING);
+ tmp_mask = speedchange_cpumask;
+ cpumask_clear(&speedchange_cpumask);
+ spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);
+
+ for_each_cpu(cpu, &tmp_mask) {
+ struct interactive_cpu *icpu = &per_cpu(interactive_cpu, cpu);
+ struct cpufreq_policy *policy;
+
+ if (unlikely(!down_read_trylock(&icpu->enable_sem)))
+ continue;
+
+ if (likely(icpu->ipolicy)) {
+ policy = icpu->ipolicy->policy;
+ cpufreq_interactive_adjust_cpu(cpu, policy);
+ }
+
+ up_read(&icpu->enable_sem);
+ }
+
+ goto again;
+}
+
+static void cpufreq_interactive_boost(struct interactive_tunables *tunables)
+{
+ struct interactive_policy *ipolicy;
+ struct cpufreq_policy *policy;
+ struct interactive_cpu *icpu;
+ unsigned long flags[2];
+ bool wakeup = false;
+ int i;
+
+ tunables->boosted = true;
+
+ spin_lock_irqsave(&speedchange_cpumask_lock, flags[0]);
+
+ for_each_ipolicy(ipolicy) {
+ policy = ipolicy->policy;
+
+ for_each_cpu(i, policy->cpus) {
+ icpu = &per_cpu(interactive_cpu, i);
+
+ if (!down_read_trylock(&icpu->enable_sem))
+ continue;
+
+ if (!icpu->ipolicy) {
+ up_read(&icpu->enable_sem);
+ continue;
+ }
+
+ spin_lock_irqsave(&icpu->target_freq_lock, flags[1]);
+ if (icpu->target_freq < tunables->hispeed_freq) {
+ icpu->target_freq = tunables->hispeed_freq;
+ cpumask_set_cpu(i, &speedchange_cpumask);
+ icpu->pol_hispeed_val_time = ktime_to_us(ktime_get());
+ wakeup = true;
+ }
+ spin_unlock_irqrestore(&icpu->target_freq_lock, flags[1]);
+
+ up_read(&icpu->enable_sem);
+ }
+ }
+
+ spin_unlock_irqrestore(&speedchange_cpumask_lock, flags[0]);
+
+ if (wakeup)
+ wake_up_process(speedchange_task);
+}
+
+static int cpufreq_interactive_notifier(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ struct interactive_cpu *icpu = &per_cpu(interactive_cpu, freq->cpu);
+ unsigned long flags;
+
+ if (val != CPUFREQ_POSTCHANGE)
+ return 0;
+
+ if (!down_read_trylock(&icpu->enable_sem))
+ return 0;
+
+ if (!icpu->ipolicy) {
+ up_read(&icpu->enable_sem);
+ return 0;
+ }
+
+ spin_lock_irqsave(&icpu->load_lock, flags);
+ update_load(icpu, freq->cpu);
+ spin_unlock_irqrestore(&icpu->load_lock, flags);
+
+ up_read(&icpu->enable_sem);
+
+ return 0;
+}
+
+static struct notifier_block cpufreq_notifier_block = {
+ .notifier_call = cpufreq_interactive_notifier,
+};
+
+static unsigned int *get_tokenized_data(const char *buf, int *num_tokens)
+{
+ const char *cp = buf;
+ int ntokens = 1, i = 0;
+ unsigned int *tokenized_data;
+ int err = -EINVAL;
+
+ while ((cp = strpbrk(cp + 1, " :")))
+ ntokens++;
+
+ if (!(ntokens & 0x1))
+ goto err;
+
+ tokenized_data = kcalloc(ntokens, sizeof(*tokenized_data), GFP_KERNEL);
+ if (!tokenized_data) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ cp = buf;
+ while (i < ntokens) {
+ if (kstrtouint(cp, 0, &tokenized_data[i++]) < 0)
+ goto err_kfree;
+
+ cp = strpbrk(cp, " :");
+ if (!cp)
+ break;
+ cp++;
+ }
+
+ if (i != ntokens)
+ goto err_kfree;
+
+ *num_tokens = ntokens;
+ return tokenized_data;
+
+err_kfree:
+ kfree(tokenized_data);
+err:
+ return ERR_PTR(err);
+}
+
+/* Interactive governor sysfs interface */
+static struct interactive_tunables *to_tunables(struct gov_attr_set *attr_set)
+{
+ return container_of(attr_set, struct interactive_tunables, attr_set);
+}
+
+#define show_one(file_name, type) \
+static ssize_t show_##file_name(struct gov_attr_set *attr_set, char *buf) \
+{ \
+ struct interactive_tunables *tunables = to_tunables(attr_set); \
+ return sprintf(buf, type "\n", tunables->file_name); \
+}
+
+static ssize_t show_target_loads(struct gov_attr_set *attr_set, char *buf)
+{
+ struct interactive_tunables *tunables = to_tunables(attr_set);
+ unsigned long flags;
+ ssize_t ret = 0;
+ int i;
+
+ spin_lock_irqsave(&tunables->target_loads_lock, flags);
+
+ for (i = 0; i < tunables->ntarget_loads; i++)
+ ret += sprintf(buf + ret, "%u%s", tunables->target_loads[i],
+ i & 0x1 ? ":" : " ");
+
+ sprintf(buf + ret - 1, "\n");
+ spin_unlock_irqrestore(&tunables->target_loads_lock, flags);
+
+ return ret;
+}
+
+static ssize_t store_target_loads(struct gov_attr_set *attr_set,
+ const char *buf, size_t count)
+{
+ struct interactive_tunables *tunables = to_tunables(attr_set);
+ unsigned int *new_target_loads;
+ unsigned long flags;
+ int ntokens;
+
+ new_target_loads = get_tokenized_data(buf, &ntokens);
+ if (IS_ERR(new_target_loads))
+ return PTR_ERR(new_target_loads);
+
+ spin_lock_irqsave(&tunables->target_loads_lock, flags);
+ if (tunables->target_loads != default_target_loads)
+ kfree(tunables->target_loads);
+ tunables->target_loads = new_target_loads;
+ tunables->ntarget_loads = ntokens;
+ spin_unlock_irqrestore(&tunables->target_loads_lock, flags);
+
+ return count;
+}
+
+static ssize_t show_above_hispeed_delay(struct gov_attr_set *attr_set,
+ char *buf)
+{
+ struct interactive_tunables *tunables = to_tunables(attr_set);
+ unsigned long flags;
+ ssize_t ret = 0;
+ int i;
+
+ spin_lock_irqsave(&tunables->above_hispeed_delay_lock, flags);
+
+ for (i = 0; i < tunables->nabove_hispeed_delay; i++)
+ ret += sprintf(buf + ret, "%u%s",
+ tunables->above_hispeed_delay[i],
+ i & 0x1 ? ":" : " ");
+
+ sprintf(buf + ret - 1, "\n");
+ spin_unlock_irqrestore(&tunables->above_hispeed_delay_lock, flags);
+
+ return ret;
+}
+
+static ssize_t store_above_hispeed_delay(struct gov_attr_set *attr_set,
+ const char *buf, size_t count)
+{
+ struct interactive_tunables *tunables = to_tunables(attr_set);
+ unsigned int *new_above_hispeed_delay = NULL;
+ unsigned long flags;
+ int ntokens;
+
+ new_above_hispeed_delay = get_tokenized_data(buf, &ntokens);
+ if (IS_ERR(new_above_hispeed_delay))
+ return PTR_ERR(new_above_hispeed_delay);
+
+ spin_lock_irqsave(&tunables->above_hispeed_delay_lock, flags);
+ if (tunables->above_hispeed_delay != default_above_hispeed_delay)
+ kfree(tunables->above_hispeed_delay);
+ tunables->above_hispeed_delay = new_above_hispeed_delay;
+ tunables->nabove_hispeed_delay = ntokens;
+ spin_unlock_irqrestore(&tunables->above_hispeed_delay_lock, flags);
+
+ return count;
+}
+
+static ssize_t store_hispeed_freq(struct gov_attr_set *attr_set,
+ const char *buf, size_t count)
+{
+ struct interactive_tunables *tunables = to_tunables(attr_set);
+ unsigned long int val;
+ int ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+
+ tunables->hispeed_freq = val;
+
+ return count;
+}
+
+static ssize_t store_go_hispeed_load(struct gov_attr_set *attr_set,
+ const char *buf, size_t count)
+{
+ struct interactive_tunables *tunables = to_tunables(attr_set);
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+
+ tunables->go_hispeed_load = val;
+
+ return count;
+}
+
+static ssize_t store_min_sample_time(struct gov_attr_set *attr_set,
+ const char *buf, size_t count)
+{
+ struct interactive_tunables *tunables = to_tunables(attr_set);
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+
+ tunables->min_sample_time = val;
+
+ return count;
+}
+
+static ssize_t show_timer_rate(struct gov_attr_set *attr_set, char *buf)
+{
+ struct interactive_tunables *tunables = to_tunables(attr_set);
+
+ return sprintf(buf, "%lu\n", tunables->sampling_rate);
+}
+
+static ssize_t store_timer_rate(struct gov_attr_set *attr_set, const char *buf,
+ size_t count)
+{
+ struct interactive_tunables *tunables = to_tunables(attr_set);
+ unsigned long val, val_round;
+ int ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+
+ val_round = jiffies_to_usecs(usecs_to_jiffies(val));
+ if (val != val_round)
+ pr_warn("timer_rate not aligned to jiffy. Rounded up to %lu\n",
+ val_round);
+
+ tunables->sampling_rate = val_round;
+
+ return count;
+}
+
+static ssize_t store_timer_slack(struct gov_attr_set *attr_set, const char *buf,
+ size_t count)
+{
+ struct interactive_tunables *tunables = to_tunables(attr_set);
+ unsigned long val;
+ int ret;
+
+ ret = kstrtol(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ tunables->timer_slack = val;
+ update_slack_delay(tunables);
+
+ return count;
+}
+
+static ssize_t store_boost(struct gov_attr_set *attr_set, const char *buf,
+ size_t count)
+{
+ struct interactive_tunables *tunables = to_tunables(attr_set);
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+
+ tunables->boost = val;
+
+ if (tunables->boost) {
+ trace_cpufreq_interactive_boost("on");
+ if (!tunables->boosted)
+ cpufreq_interactive_boost(tunables);
+ } else {
+ tunables->boostpulse_endtime = ktime_to_us(ktime_get());
+ trace_cpufreq_interactive_unboost("off");
+ }
+
+ return count;
+}
+
+static ssize_t store_boostpulse(struct gov_attr_set *attr_set, const char *buf,
+ size_t count)
+{
+ struct interactive_tunables *tunables = to_tunables(attr_set);
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+
+ tunables->boostpulse_endtime = ktime_to_us(ktime_get()) +
+ tunables->boostpulse_duration;
+ trace_cpufreq_interactive_boost("pulse");
+ if (!tunables->boosted)
+ cpufreq_interactive_boost(tunables);
+
+ return count;
+}
+
+static ssize_t store_boostpulse_duration(struct gov_attr_set *attr_set,
+ const char *buf, size_t count)
+{
+ struct interactive_tunables *tunables = to_tunables(attr_set);
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+
+ tunables->boostpulse_duration = val;
+
+ return count;
+}
+
+static ssize_t store_io_is_busy(struct gov_attr_set *attr_set, const char *buf,
+ size_t count)
+{
+ struct interactive_tunables *tunables = to_tunables(attr_set);
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+
+ tunables->io_is_busy = val;
+
+ return count;
+}
+
+show_one(hispeed_freq, "%u");
+show_one(go_hispeed_load, "%lu");
+show_one(min_sample_time, "%lu");
+show_one(timer_slack, "%lu");
+show_one(boost, "%u");
+show_one(boostpulse_duration, "%u");
+show_one(io_is_busy, "%u");
+
+gov_attr_rw(target_loads);
+gov_attr_rw(above_hispeed_delay);
+gov_attr_rw(hispeed_freq);
+gov_attr_rw(go_hispeed_load);
+gov_attr_rw(min_sample_time);
+gov_attr_rw(timer_rate);
+gov_attr_rw(timer_slack);
+gov_attr_rw(boost);
+gov_attr_wo(boostpulse);
+gov_attr_rw(boostpulse_duration);
+gov_attr_rw(io_is_busy);
+
+static struct attribute *interactive_attributes[] = {
+ &target_loads.attr,
+ &above_hispeed_delay.attr,
+ &hispeed_freq.attr,
+ &go_hispeed_load.attr,
+ &min_sample_time.attr,
+ &timer_rate.attr,
+ &timer_slack.attr,
+ &boost.attr,
+ &boostpulse.attr,
+ &boostpulse_duration.attr,
+ &io_is_busy.attr,
+ NULL
+};
+
+static struct kobj_type interactive_tunables_ktype = {
+ .default_attrs = interactive_attributes,
+ .sysfs_ops = &governor_sysfs_ops,
+};
+
+static int cpufreq_interactive_idle_notifier(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ if (val == IDLE_END)
+ cpufreq_interactive_idle_end();
+
+ return 0;
+}
+
+static struct notifier_block cpufreq_interactive_idle_nb = {
+ .notifier_call = cpufreq_interactive_idle_notifier,
+};
+
+/* Interactive Governor callbacks */
+struct interactive_governor {
+ struct cpufreq_governor gov;
+ unsigned int usage_count;
+};
+
+static struct interactive_governor interactive_gov;
+
+#define CPU_FREQ_GOV_INTERACTIVE (&interactive_gov.gov)
+
+static void irq_work(struct irq_work *irq_work)
+{
+ struct interactive_cpu *icpu = container_of(irq_work, struct
+ interactive_cpu, irq_work);
+
+ cpufreq_interactive_update(icpu);
+ icpu->work_in_progress = false;
+}
+
+static void update_util_handler(struct update_util_data *data, u64 time,
+ unsigned int flags)
+{
+ struct interactive_cpu *icpu = container_of(data,
+ struct interactive_cpu, update_util);
+ struct interactive_policy *ipolicy = icpu->ipolicy;
+ struct interactive_tunables *tunables = ipolicy->tunables;
+ u64 delta_ns;
+
+ /*
+ * The irq-work may not be allowed to be queued up right now.
+ * Possible reasons:
+ * - Work has already been queued up or is in progress.
+ * - It is too early (too little time from the previous sample).
+ */
+ if (icpu->work_in_progress)
+ return;
+
+ delta_ns = time - icpu->last_sample_time;
+ if ((s64)delta_ns < tunables->sampling_rate * NSEC_PER_USEC)
+ return;
+
+ icpu->last_sample_time = time;
+ icpu->next_sample_jiffies = usecs_to_jiffies(tunables->sampling_rate) +
+ jiffies;
+
+ icpu->work_in_progress = true;
+ irq_work_queue(&icpu->irq_work);
+}
+
+static void gov_set_update_util(struct interactive_policy *ipolicy)
+{
+ struct cpufreq_policy *policy = ipolicy->policy;
+ struct interactive_cpu *icpu;
+ int cpu;
+
+ for_each_cpu(cpu, policy->cpus) {
+ icpu = &per_cpu(interactive_cpu, cpu);
+
+ icpu->last_sample_time = 0;
+ icpu->next_sample_jiffies = 0;
+ cpufreq_add_update_util_hook(cpu, &icpu->update_util,
+ update_util_handler);
+ }
+}
+
+static inline void gov_clear_update_util(struct cpufreq_policy *policy)
+{
+ int i;
+
+ for_each_cpu(i, policy->cpus)
+ cpufreq_remove_update_util_hook(i);
+
+ synchronize_sched();
+}
+
+static void icpu_cancel_work(struct interactive_cpu *icpu)
+{
+ irq_work_sync(&icpu->irq_work);
+ icpu->work_in_progress = false;
+ del_timer_sync(&icpu->slack_timer);
+}
+
+static struct interactive_policy *
+interactive_policy_alloc(struct cpufreq_policy *policy)
+{
+ struct interactive_policy *ipolicy;
+
+ ipolicy = kzalloc(sizeof(*ipolicy), GFP_KERNEL);
+ if (!ipolicy)
+ return NULL;
+
+ ipolicy->policy = policy;
+
+ return ipolicy;
+}
+
+static void interactive_policy_free(struct interactive_policy *ipolicy)
+{
+ kfree(ipolicy);
+}
+
+static struct interactive_tunables *
+interactive_tunables_alloc(struct interactive_policy *ipolicy)
+{
+ struct interactive_tunables *tunables;
+
+ tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
+ if (!tunables)
+ return NULL;
+
+ gov_attr_set_init(&tunables->attr_set, &ipolicy->tunables_hook);
+ if (!have_governor_per_policy())
+ global_tunables = tunables;
+
+ ipolicy->tunables = tunables;
+
+ return tunables;
+}
+
+static void interactive_tunables_free(struct interactive_tunables *tunables)
+{
+ if (!have_governor_per_policy())
+ global_tunables = NULL;
+
+ kfree(tunables);
+}
+
+int cpufreq_interactive_init(struct cpufreq_policy *policy)
+{
+ struct interactive_policy *ipolicy;
+ struct interactive_tunables *tunables;
+ int ret;
+
+ /* State should be equivalent to EXIT */
+ if (policy->governor_data)
+ return -EBUSY;
+
+ ipolicy = interactive_policy_alloc(policy);
+ if (!ipolicy)
+ return -ENOMEM;
+
+ mutex_lock(&global_tunables_lock);
+
+ if (global_tunables) {
+ if (WARN_ON(have_governor_per_policy())) {
+ ret = -EINVAL;
+ goto free_int_policy;
+ }
+
+ policy->governor_data = ipolicy;
+ ipolicy->tunables = global_tunables;
+
+ gov_attr_set_get(&global_tunables->attr_set,
+ &ipolicy->tunables_hook);
+ goto out;
+ }
+
+ tunables = interactive_tunables_alloc(ipolicy);
+ if (!tunables) {
+ ret = -ENOMEM;
+ goto free_int_policy;
+ }
+
+ tunables->hispeed_freq = policy->max;
+ tunables->above_hispeed_delay = default_above_hispeed_delay;
+ tunables->nabove_hispeed_delay =
+ ARRAY_SIZE(default_above_hispeed_delay);
+ tunables->go_hispeed_load = DEFAULT_GO_HISPEED_LOAD;
+ tunables->target_loads = default_target_loads;
+ tunables->ntarget_loads = ARRAY_SIZE(default_target_loads);
+ tunables->min_sample_time = DEFAULT_MIN_SAMPLE_TIME;
+ tunables->boostpulse_duration = DEFAULT_MIN_SAMPLE_TIME;
+ tunables->sampling_rate = DEFAULT_SAMPLING_RATE;
+ tunables->timer_slack = DEFAULT_TIMER_SLACK;
+ update_slack_delay(tunables);
+
+ spin_lock_init(&tunables->target_loads_lock);
+ spin_lock_init(&tunables->above_hispeed_delay_lock);
+
+ policy->governor_data = ipolicy;
+
+ ret = kobject_init_and_add(&tunables->attr_set.kobj,
+ &interactive_tunables_ktype,
+ get_governor_parent_kobj(policy), "%s",
+ interactive_gov.gov.name);
+ if (ret)
+ goto fail;
+
+ /* One time initialization for governor */
+ if (!interactive_gov.usage_count++) {
+ idle_notifier_register(&cpufreq_interactive_idle_nb);
+ cpufreq_register_notifier(&cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ }
+
+ out:
+ mutex_unlock(&global_tunables_lock);
+ return 0;
+
+ fail:
+ policy->governor_data = NULL;
+ interactive_tunables_free(tunables);
+
+ free_int_policy:
+ mutex_unlock(&global_tunables_lock);
+
+ interactive_policy_free(ipolicy);
+ pr_err("governor initialization failed (%d)\n", ret);
+
+ return ret;
+}
+
+void cpufreq_interactive_exit(struct cpufreq_policy *policy)
+{
+ struct interactive_policy *ipolicy = policy->governor_data;
+ struct interactive_tunables *tunables = ipolicy->tunables;
+ unsigned int count;
+
+ mutex_lock(&global_tunables_lock);
+
+ /* Last policy using the governor ? */
+ if (!--interactive_gov.usage_count) {
+ cpufreq_unregister_notifier(&cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ idle_notifier_unregister(&cpufreq_interactive_idle_nb);
+ }
+
+ count = gov_attr_set_put(&tunables->attr_set, &ipolicy->tunables_hook);
+ policy->governor_data = NULL;
+ if (!count)
+ interactive_tunables_free(tunables);
+
+ mutex_unlock(&global_tunables_lock);
+
+ interactive_policy_free(ipolicy);
+}
+
+int cpufreq_interactive_start(struct cpufreq_policy *policy)
+{
+ struct interactive_policy *ipolicy = policy->governor_data;
+ struct interactive_cpu *icpu;
+ unsigned int cpu;
+
+ for_each_cpu(cpu, policy->cpus) {
+ icpu = &per_cpu(interactive_cpu, cpu);
+
+ icpu->target_freq = policy->cur;
+ icpu->floor_freq = icpu->target_freq;
+ icpu->pol_floor_val_time = ktime_to_us(ktime_get());
+ icpu->loc_floor_val_time = icpu->pol_floor_val_time;
+ icpu->pol_hispeed_val_time = icpu->pol_floor_val_time;
+ icpu->loc_hispeed_val_time = icpu->pol_floor_val_time;
+
+ down_write(&icpu->enable_sem);
+ icpu->ipolicy = ipolicy;
+ up_write(&icpu->enable_sem);
+
+ slack_timer_resched(icpu, cpu, false);
+ }
+
+ gov_set_update_util(ipolicy);
+ return 0;
+}
+
+void cpufreq_interactive_stop(struct cpufreq_policy *policy)
+{
+ struct interactive_policy *ipolicy = policy->governor_data;
+ struct interactive_cpu *icpu;
+ unsigned int cpu;
+
+ gov_clear_update_util(ipolicy->policy);
+
+ for_each_cpu(cpu, policy->cpus) {
+ icpu = &per_cpu(interactive_cpu, cpu);
+
+ icpu_cancel_work(icpu);
+
+ down_write(&icpu->enable_sem);
+ icpu->ipolicy = NULL;
+ up_write(&icpu->enable_sem);
+ }
+}
+
+void cpufreq_interactive_limits(struct cpufreq_policy *policy)
+{
+ struct interactive_cpu *icpu;
+ unsigned int cpu;
+ unsigned long flags;
+
+ cpufreq_policy_apply_limits(policy);
+
+ for_each_cpu(cpu, policy->cpus) {
+ icpu = &per_cpu(interactive_cpu, cpu);
+
+ spin_lock_irqsave(&icpu->target_freq_lock, flags);
+
+ if (policy->max < icpu->target_freq)
+ icpu->target_freq = policy->max;
+ else if (policy->min > icpu->target_freq)
+ icpu->target_freq = policy->min;
+
+ spin_unlock_irqrestore(&icpu->target_freq_lock, flags);
+ }
+}
+
+static struct interactive_governor interactive_gov = {
+ .gov = {
+ .name = "interactive",
+ .max_transition_latency = TRANSITION_LATENCY_LIMIT,
+ .owner = THIS_MODULE,
+ .init = cpufreq_interactive_init,
+ .exit = cpufreq_interactive_exit,
+ .start = cpufreq_interactive_start,
+ .stop = cpufreq_interactive_stop,
+ .limits = cpufreq_interactive_limits,
+ }
+};
+
+static void cpufreq_interactive_nop_timer(unsigned long data)
+{
+ /*
+ * The purpose of slack-timer is to wake up the CPU from IDLE, in order
+ * to decrease its frequency if it is not set to minimum already.
+ *
+ * This is important for platforms where CPU with higher frequencies
+ * consume higher power even at IDLE.
+ */
+}
+
+static int __init cpufreq_interactive_gov_init(void)
+{
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+ struct interactive_cpu *icpu;
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ icpu = &per_cpu(interactive_cpu, cpu);
+
+ init_irq_work(&icpu->irq_work, irq_work);
+ spin_lock_init(&icpu->load_lock);
+ spin_lock_init(&icpu->target_freq_lock);
+ init_rwsem(&icpu->enable_sem);
+
+ /* Initialize per-cpu slack-timer */
+ init_timer_pinned(&icpu->slack_timer);
+ icpu->slack_timer.function = cpufreq_interactive_nop_timer;
+ }
+
+ spin_lock_init(&speedchange_cpumask_lock);
+ speedchange_task = kthread_create(cpufreq_interactive_speedchange_task,
+ NULL, "cfinteractive");
+ if (IS_ERR(speedchange_task))
+ return PTR_ERR(speedchange_task);
+
+ sched_setscheduler_nocheck(speedchange_task, SCHED_FIFO, &param);
+ get_task_struct(speedchange_task);
+
+ /* wake up so the thread does not look hung to the freezer */
+ wake_up_process(speedchange_task);
+
+ return cpufreq_register_governor(CPU_FREQ_GOV_INTERACTIVE);
+}
+
+#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE
+struct cpufreq_governor *cpufreq_default_governor(void)
+{
+ return CPU_FREQ_GOV_INTERACTIVE;
+}
+
+fs_initcall(cpufreq_interactive_gov_init);
+#else
+module_init(cpufreq_interactive_gov_init);
+#endif
+
+static void __exit cpufreq_interactive_gov_exit(void)
+{
+ cpufreq_unregister_governor(CPU_FREQ_GOV_INTERACTIVE);
+ kthread_stop(speedchange_task);
+ put_task_struct(speedchange_task);
+}
+module_exit(cpufreq_interactive_gov_exit);
+
+MODULE_AUTHOR("Mike Chan <mike@android.com>");
+MODULE_DESCRIPTION("'cpufreq_interactive' - A dynamic cpufreq governor for Latency sensitive workloads");
+MODULE_LICENSE("GPL");
diff --git a/drivers/cpufreq/cpufreq_performance.c b/drivers/cpufreq/cpufreq_performance.c
index dafb679adc58..399428e40e89 100644
--- a/drivers/cpufreq/cpufreq_performance.c
+++ b/drivers/cpufreq/cpufreq_performance.c
@@ -22,7 +22,10 @@ static void cpufreq_gov_performance_limits(struct cpufreq_policy *policy)
__cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
}
-static struct cpufreq_governor cpufreq_gov_performance = {
+#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE_MODULE
+static
+#endif
+struct cpufreq_governor cpufreq_gov_performance = {
.name = "performance",
.owner = THIS_MODULE,
.limits = cpufreq_gov_performance_limits,
diff --git a/drivers/cpufreq/cpufreq_powersave.c b/drivers/cpufreq/cpufreq_powersave.c
index 78a651038faf..5daa500fb0a9 100644
--- a/drivers/cpufreq/cpufreq_powersave.c
+++ b/drivers/cpufreq/cpufreq_powersave.c
@@ -22,7 +22,10 @@ static void cpufreq_gov_powersave_limits(struct cpufreq_policy *policy)
__cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L);
}
-static struct cpufreq_governor cpufreq_gov_powersave = {
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE
+static
+#endif
+struct cpufreq_governor cpufreq_gov_powersave = {
.name = "powersave",
.limits = cpufreq_gov_powersave_limits,
.owner = THIS_MODULE,
diff --git a/drivers/cpufreq/cpufreq_times.c b/drivers/cpufreq/cpufreq_times.c
new file mode 100644
index 000000000000..cd0df201594c
--- /dev/null
+++ b/drivers/cpufreq/cpufreq_times.c
@@ -0,0 +1,650 @@
+/* drivers/cpufreq/cpufreq_times.c
+ *
+ * Copyright (C) 2018 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/cpufreq_times.h>
+#include <linux/cputime.h>
+#include <linux/hashtable.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/threads.h>
+
+#define UID_HASH_BITS 10
+
+static DECLARE_HASHTABLE(uid_hash_table, UID_HASH_BITS);
+
+static DEFINE_SPINLOCK(task_time_in_state_lock); /* task->time_in_state */
+static DEFINE_SPINLOCK(uid_lock); /* uid_hash_table */
+
+struct concurrent_times {
+ atomic64_t active[NR_CPUS];
+ atomic64_t policy[NR_CPUS];
+};
+
+struct uid_entry {
+ uid_t uid;
+ unsigned int max_state;
+ struct hlist_node hash;
+ struct rcu_head rcu;
+ struct concurrent_times *concurrent_times;
+ u64 time_in_state[0];
+};
+
+/**
+ * struct cpu_freqs - per-cpu frequency information
+ * @offset: start of these freqs' stats in task time_in_state array
+ * @max_state: number of entries in freq_table
+ * @last_index: index in freq_table of last frequency switched to
+ * @freq_table: list of available frequencies
+ */
+struct cpu_freqs {
+ unsigned int offset;
+ unsigned int max_state;
+ unsigned int last_index;
+ unsigned int freq_table[0];
+};
+
+static struct cpu_freqs *all_freqs[NR_CPUS];
+
+static unsigned int next_offset;
+
+
+/* Caller must hold rcu_read_lock() */
+static struct uid_entry *find_uid_entry_rcu(uid_t uid)
+{
+ struct uid_entry *uid_entry;
+
+ hash_for_each_possible_rcu(uid_hash_table, uid_entry, hash, uid) {
+ if (uid_entry->uid == uid)
+ return uid_entry;
+ }
+ return NULL;
+}
+
+/* Caller must hold uid lock */
+static struct uid_entry *find_uid_entry_locked(uid_t uid)
+{
+ struct uid_entry *uid_entry;
+
+ hash_for_each_possible(uid_hash_table, uid_entry, hash, uid) {
+ if (uid_entry->uid == uid)
+ return uid_entry;
+ }
+ return NULL;
+}
+
+/* Caller must hold uid lock */
+static struct uid_entry *find_or_register_uid_locked(uid_t uid)
+{
+ struct uid_entry *uid_entry, *temp;
+ struct concurrent_times *times;
+ unsigned int max_state = READ_ONCE(next_offset);
+ size_t alloc_size = sizeof(*uid_entry) + max_state *
+ sizeof(uid_entry->time_in_state[0]);
+
+ uid_entry = find_uid_entry_locked(uid);
+ if (uid_entry) {
+ if (uid_entry->max_state == max_state)
+ return uid_entry;
+ /* uid_entry->time_in_state is too small to track all freqs, so
+ * expand it.
+ */
+ temp = __krealloc(uid_entry, alloc_size, GFP_ATOMIC);
+ if (!temp)
+ return uid_entry;
+ temp->max_state = max_state;
+ memset(temp->time_in_state + uid_entry->max_state, 0,
+ (max_state - uid_entry->max_state) *
+ sizeof(uid_entry->time_in_state[0]));
+ if (temp != uid_entry) {
+ hlist_replace_rcu(&uid_entry->hash, &temp->hash);
+ kfree_rcu(uid_entry, rcu);
+ }
+ return temp;
+ }
+
+ uid_entry = kzalloc(alloc_size, GFP_ATOMIC);
+ if (!uid_entry)
+ return NULL;
+ times = kzalloc(sizeof(*times), GFP_ATOMIC);
+ if (!times) {
+ kfree(uid_entry);
+ return NULL;
+ }
+
+ uid_entry->uid = uid;
+ uid_entry->max_state = max_state;
+ uid_entry->concurrent_times = times;
+
+ hash_add_rcu(uid_hash_table, &uid_entry->hash, uid);
+
+ return uid_entry;
+}
+
+static bool freq_index_invalid(unsigned int index)
+{
+ unsigned int cpu;
+ struct cpu_freqs *freqs;
+
+ for_each_possible_cpu(cpu) {
+ freqs = all_freqs[cpu];
+ if (!freqs || index < freqs->offset ||
+ freqs->offset + freqs->max_state <= index)
+ continue;
+ return freqs->freq_table[index - freqs->offset] ==
+ CPUFREQ_ENTRY_INVALID;
+ }
+ return true;
+}
+
+static int single_uid_time_in_state_show(struct seq_file *m, void *ptr)
+{
+ struct uid_entry *uid_entry;
+ unsigned int i;
+ u64 time;
+ uid_t uid = from_kuid_munged(current_user_ns(), *(kuid_t *)m->private);
+
+ if (uid == overflowuid)
+ return -EINVAL;
+
+ rcu_read_lock();
+
+ uid_entry = find_uid_entry_rcu(uid);
+ if (!uid_entry) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ for (i = 0; i < uid_entry->max_state; ++i) {
+ if (freq_index_invalid(i))
+ continue;
+ time = cputime_to_clock_t(uid_entry->time_in_state[i]);
+ seq_write(m, &time, sizeof(time));
+ }
+
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static void *uid_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos >= HASH_SIZE(uid_hash_table))
+ return NULL;
+
+ return &uid_hash_table[*pos];
+}
+
+static void *uid_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ (*pos)++;
+
+ if (*pos >= HASH_SIZE(uid_hash_table))
+ return NULL;
+
+ return &uid_hash_table[*pos];
+}
+
+static void uid_seq_stop(struct seq_file *seq, void *v) { }
+
+static int uid_time_in_state_seq_show(struct seq_file *m, void *v)
+{
+ struct uid_entry *uid_entry;
+ struct cpu_freqs *freqs, *last_freqs = NULL;
+ int i, cpu;
+
+ if (v == uid_hash_table) {
+ seq_puts(m, "uid:");
+ for_each_possible_cpu(cpu) {
+ freqs = all_freqs[cpu];
+ if (!freqs || freqs == last_freqs)
+ continue;
+ last_freqs = freqs;
+ for (i = 0; i < freqs->max_state; i++) {
+ if (freqs->freq_table[i] ==
+ CPUFREQ_ENTRY_INVALID)
+ continue;
+ seq_printf(m, " %d", freqs->freq_table[i]);
+ }
+ }
+ seq_putc(m, '\n');
+ }
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(uid_entry, (struct hlist_head *)v, hash) {
+ if (uid_entry->max_state)
+ seq_printf(m, "%d:", uid_entry->uid);
+ for (i = 0; i < uid_entry->max_state; ++i) {
+ if (freq_index_invalid(i))
+ continue;
+ seq_printf(m, " %lu", (unsigned long)cputime_to_clock_t(
+ uid_entry->time_in_state[i]));
+ }
+ if (uid_entry->max_state)
+ seq_putc(m, '\n');
+ }
+
+ rcu_read_unlock();
+ return 0;
+}
+
+static int concurrent_time_seq_show(struct seq_file *m, void *v,
+ atomic64_t *(*get_times)(struct concurrent_times *))
+{
+ struct uid_entry *uid_entry;
+ int i, num_possible_cpus = num_possible_cpus();
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(uid_entry, (struct hlist_head *)v, hash) {
+ atomic64_t *times = get_times(uid_entry->concurrent_times);
+
+ seq_put_decimal_ull(m, "", (u64)uid_entry->uid);
+ seq_putc(m, ':');
+
+ for (i = 0; i < num_possible_cpus; ++i) {
+ u64 time = cputime_to_clock_t(atomic64_read(&times[i]));
+
+ seq_put_decimal_ull(m, " ", time);
+ }
+ seq_putc(m, '\n');
+ }
+
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static inline atomic64_t *get_active_times(struct concurrent_times *times)
+{
+ return times->active;
+}
+
+static int concurrent_active_time_seq_show(struct seq_file *m, void *v)
+{
+ if (v == uid_hash_table) {
+ seq_put_decimal_ull(m, "cpus: ", num_possible_cpus());
+ seq_putc(m, '\n');
+ }
+
+ return concurrent_time_seq_show(m, v, get_active_times);
+}
+
+static inline atomic64_t *get_policy_times(struct concurrent_times *times)
+{
+ return times->policy;
+}
+
+static int concurrent_policy_time_seq_show(struct seq_file *m, void *v)
+{
+ int i;
+ struct cpu_freqs *freqs, *last_freqs = NULL;
+
+ if (v == uid_hash_table) {
+ int cnt = 0;
+
+ for_each_possible_cpu(i) {
+ freqs = all_freqs[i];
+ if (!freqs)
+ continue;
+ if (freqs != last_freqs) {
+ if (last_freqs) {
+ seq_put_decimal_ull(m, ": ", cnt);
+ seq_putc(m, ' ');
+ cnt = 0;
+ }
+ seq_put_decimal_ull(m, "policy", i);
+
+ last_freqs = freqs;
+ }
+ cnt++;
+ }
+ if (last_freqs) {
+ seq_put_decimal_ull(m, ": ", cnt);
+ seq_putc(m, '\n');
+ }
+ }
+
+ return concurrent_time_seq_show(m, v, get_policy_times);
+}
+
+void cpufreq_task_times_init(struct task_struct *p)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&task_time_in_state_lock, flags);
+ p->time_in_state = NULL;
+ spin_unlock_irqrestore(&task_time_in_state_lock, flags);
+ p->max_state = 0;
+}
+
+void cpufreq_task_times_alloc(struct task_struct *p)
+{
+ void *temp;
+ unsigned long flags;
+ unsigned int max_state = READ_ONCE(next_offset);
+
+ /* We use one array to avoid multiple allocs per task */
+ temp = kcalloc(max_state, sizeof(p->time_in_state[0]), GFP_ATOMIC);
+ if (!temp)
+ return;
+
+ spin_lock_irqsave(&task_time_in_state_lock, flags);
+ p->time_in_state = temp;
+ spin_unlock_irqrestore(&task_time_in_state_lock, flags);
+ p->max_state = max_state;
+}
+
+/* Caller must hold task_time_in_state_lock */
+static int cpufreq_task_times_realloc_locked(struct task_struct *p)
+{
+ void *temp;
+ unsigned int max_state = READ_ONCE(next_offset);
+
+ temp = krealloc(p->time_in_state, max_state * sizeof(u64), GFP_ATOMIC);
+ if (!temp)
+ return -ENOMEM;
+ p->time_in_state = temp;
+ memset(p->time_in_state + p->max_state, 0,
+ (max_state - p->max_state) * sizeof(u64));
+ p->max_state = max_state;
+ return 0;
+}
+
+void cpufreq_task_times_exit(struct task_struct *p)
+{
+ unsigned long flags;
+ void *temp;
+
+ if (!p->time_in_state)
+ return;
+
+ spin_lock_irqsave(&task_time_in_state_lock, flags);
+ temp = p->time_in_state;
+ p->time_in_state = NULL;
+ spin_unlock_irqrestore(&task_time_in_state_lock, flags);
+ kfree(temp);
+}
+
+int proc_time_in_state_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *p)
+{
+ unsigned int cpu, i;
+ cputime_t cputime;
+ unsigned long flags;
+ struct cpu_freqs *freqs;
+ struct cpu_freqs *last_freqs = NULL;
+
+ spin_lock_irqsave(&task_time_in_state_lock, flags);
+ for_each_possible_cpu(cpu) {
+ freqs = all_freqs[cpu];
+ if (!freqs || freqs == last_freqs)
+ continue;
+ last_freqs = freqs;
+
+ seq_printf(m, "cpu%u\n", cpu);
+ for (i = 0; i < freqs->max_state; i++) {
+ if (freqs->freq_table[i] == CPUFREQ_ENTRY_INVALID)
+ continue;
+ cputime = 0;
+ if (freqs->offset + i < p->max_state &&
+ p->time_in_state)
+ cputime = p->time_in_state[freqs->offset + i];
+ seq_printf(m, "%u %lu\n", freqs->freq_table[i],
+ (unsigned long)cputime_to_clock_t(cputime));
+ }
+ }
+ spin_unlock_irqrestore(&task_time_in_state_lock, flags);
+ return 0;
+}
+
+void cpufreq_acct_update_power(struct task_struct *p, cputime_t cputime)
+{
+ unsigned long flags;
+ unsigned int state;
+ unsigned int active_cpu_cnt = 0;
+ unsigned int policy_cpu_cnt = 0;
+ unsigned int policy_first_cpu;
+ struct uid_entry *uid_entry;
+ struct cpu_freqs *freqs = all_freqs[task_cpu(p)];
+ struct cpufreq_policy *policy;
+ uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
+ int cpu = 0;
+
+ if (!freqs || is_idle_task(p) || p->flags & PF_EXITING)
+ return;
+
+ state = freqs->offset + READ_ONCE(freqs->last_index);
+
+ spin_lock_irqsave(&task_time_in_state_lock, flags);
+ if ((state < p->max_state || !cpufreq_task_times_realloc_locked(p)) &&
+ p->time_in_state)
+ p->time_in_state[state] += cputime;
+ spin_unlock_irqrestore(&task_time_in_state_lock, flags);
+
+ spin_lock_irqsave(&uid_lock, flags);
+ uid_entry = find_or_register_uid_locked(uid);
+ if (uid_entry && state < uid_entry->max_state)
+ uid_entry->time_in_state[state] += cputime;
+ spin_unlock_irqrestore(&uid_lock, flags);
+
+ rcu_read_lock();
+ uid_entry = find_uid_entry_rcu(uid);
+ if (!uid_entry) {
+ rcu_read_unlock();
+ return;
+ }
+
+ for_each_possible_cpu(cpu)
+ if (!idle_cpu(cpu))
+ ++active_cpu_cnt;
+
+ atomic64_add(cputime,
+ &uid_entry->concurrent_times->active[active_cpu_cnt - 1]);
+
+ policy = cpufreq_cpu_get(task_cpu(p));
+ if (!policy) {
+ /*
+ * This CPU may have just come up and not have a cpufreq policy
+ * yet.
+ */
+ rcu_read_unlock();
+ return;
+ }
+
+ for_each_cpu(cpu, policy->related_cpus)
+ if (!idle_cpu(cpu))
+ ++policy_cpu_cnt;
+
+ policy_first_cpu = cpumask_first(policy->related_cpus);
+ cpufreq_cpu_put(policy);
+
+ atomic64_add(cputime,
+ &uid_entry->concurrent_times->policy[policy_first_cpu +
+ policy_cpu_cnt - 1]);
+ rcu_read_unlock();
+}
+
+void cpufreq_times_create_policy(struct cpufreq_policy *policy)
+{
+ int cpu, index;
+ unsigned int count = 0;
+ struct cpufreq_frequency_table *pos, *table;
+ struct cpu_freqs *freqs;
+ void *tmp;
+
+ if (all_freqs[policy->cpu])
+ return;
+
+ table = policy->freq_table;
+ if (!table)
+ return;
+
+ cpufreq_for_each_entry(pos, table)
+ count++;
+
+ tmp = kzalloc(sizeof(*freqs) + sizeof(freqs->freq_table[0]) * count,
+ GFP_KERNEL);
+ if (!tmp)
+ return;
+
+ freqs = tmp;
+ freqs->max_state = count;
+
+ index = cpufreq_frequency_table_get_index(policy, policy->cur);
+ if (index >= 0)
+ WRITE_ONCE(freqs->last_index, index);
+
+ cpufreq_for_each_entry(pos, table)
+ freqs->freq_table[pos - table] = pos->frequency;
+
+ freqs->offset = next_offset;
+ WRITE_ONCE(next_offset, freqs->offset + count);
+ for_each_cpu(cpu, policy->related_cpus)
+ all_freqs[cpu] = freqs;
+}
+
+static void uid_entry_reclaim(struct rcu_head *rcu)
+{
+ struct uid_entry *uid_entry = container_of(rcu, struct uid_entry, rcu);
+
+ kfree(uid_entry->concurrent_times);
+ kfree(uid_entry);
+}
+
+void cpufreq_task_times_remove_uids(uid_t uid_start, uid_t uid_end)
+{
+ struct uid_entry *uid_entry;
+ struct hlist_node *tmp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&uid_lock, flags);
+
+ for (; uid_start <= uid_end; uid_start++) {
+ hash_for_each_possible_safe(uid_hash_table, uid_entry, tmp,
+ hash, uid_start) {
+ if (uid_start == uid_entry->uid) {
+ hash_del_rcu(&uid_entry->hash);
+ call_rcu(&uid_entry->rcu, uid_entry_reclaim);
+ }
+ }
+ }
+
+ spin_unlock_irqrestore(&uid_lock, flags);
+}
+
+void cpufreq_times_record_transition(struct cpufreq_freqs *freq)
+{
+ int index;
+ struct cpu_freqs *freqs = all_freqs[freq->cpu];
+ struct cpufreq_policy *policy;
+
+ if (!freqs)
+ return;
+
+ policy = cpufreq_cpu_get(freq->cpu);
+ if (!policy)
+ return;
+
+ index = cpufreq_frequency_table_get_index(policy, freq->new);
+ if (index >= 0)
+ WRITE_ONCE(freqs->last_index, index);
+
+ cpufreq_cpu_put(policy);
+}
+
+static const struct seq_operations uid_time_in_state_seq_ops = {
+ .start = uid_seq_start,
+ .next = uid_seq_next,
+ .stop = uid_seq_stop,
+ .show = uid_time_in_state_seq_show,
+};
+
+static int uid_time_in_state_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &uid_time_in_state_seq_ops);
+}
+
+int single_uid_time_in_state_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, single_uid_time_in_state_show,
+ &(inode->i_uid));
+}
+
+static const struct file_operations uid_time_in_state_fops = {
+ .open = uid_time_in_state_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct seq_operations concurrent_active_time_seq_ops = {
+ .start = uid_seq_start,
+ .next = uid_seq_next,
+ .stop = uid_seq_stop,
+ .show = concurrent_active_time_seq_show,
+};
+
+static int concurrent_active_time_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &concurrent_active_time_seq_ops);
+}
+
+static const struct file_operations concurrent_active_time_fops = {
+ .open = concurrent_active_time_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct seq_operations concurrent_policy_time_seq_ops = {
+ .start = uid_seq_start,
+ .next = uid_seq_next,
+ .stop = uid_seq_stop,
+ .show = concurrent_policy_time_seq_show,
+};
+
+static int concurrent_policy_time_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &concurrent_policy_time_seq_ops);
+}
+
+static const struct file_operations concurrent_policy_time_fops = {
+ .open = concurrent_policy_time_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init cpufreq_times_init(void)
+{
+ proc_create_data("uid_time_in_state", 0444, NULL,
+ &uid_time_in_state_fops, NULL);
+
+ proc_create_data("uid_concurrent_active_time", 0444, NULL,
+ &concurrent_active_time_fops, NULL);
+
+ proc_create_data("uid_concurrent_policy_time", 0444, NULL,
+ &concurrent_policy_time_fops, NULL);
+
+ return 0;
+}
+
+early_initcall(cpufreq_times_init);
diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c
index bd897e3e134d..765166d881bb 100644
--- a/drivers/cpufreq/cpufreq_userspace.c
+++ b/drivers/cpufreq/cpufreq_userspace.c
@@ -118,7 +118,10 @@ static void cpufreq_userspace_policy_limits(struct cpufreq_policy *policy)
mutex_unlock(&userspace_mutex);
}
-static struct cpufreq_governor cpufreq_gov_userspace = {
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE
+static
+#endif
+struct cpufreq_governor cpufreq_gov_userspace = {
.name = "userspace",
.init = cpufreq_userspace_policy_init,
.exit = cpufreq_userspace_policy_exit,
diff --git a/drivers/cpuidle/Kconfig.arm b/drivers/cpuidle/Kconfig.arm
index 21340e0be73e..f52144808455 100644
--- a/drivers/cpuidle/Kconfig.arm
+++ b/drivers/cpuidle/Kconfig.arm
@@ -4,6 +4,7 @@
config ARM_CPUIDLE
bool "Generic ARM/ARM64 CPU idle Driver"
select DT_IDLE_STATES
+ select CPU_IDLE_MULTIPLE_DRIVERS
help
Select this to enable generic cpuidle driver for ARM.
It provides a generic idle driver whose idle states are configured
diff --git a/drivers/cpuidle/cpuidle-arm.c b/drivers/cpuidle/cpuidle-arm.c
index f440d385ed34..f47c54546752 100644
--- a/drivers/cpuidle/cpuidle-arm.c
+++ b/drivers/cpuidle/cpuidle-arm.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/of.h>
#include <linux/slab.h>
+#include <linux/topology.h>
#include <asm/cpuidle.h>
@@ -44,7 +45,7 @@ static int arm_enter_idle_state(struct cpuidle_device *dev,
return CPU_PM_CPU_IDLE_ENTER(arm_cpuidle_suspend, idx);
}
-static struct cpuidle_driver arm_idle_driver = {
+static struct cpuidle_driver arm_idle_driver __initdata = {
.name = "arm_idle",
.owner = THIS_MODULE,
/*
@@ -80,30 +81,42 @@ static const struct of_device_id arm_idle_state_match[] __initconst = {
static int __init arm_idle_init(void)
{
int cpu, ret;
- struct cpuidle_driver *drv = &arm_idle_driver;
+ struct cpuidle_driver *drv;
struct cpuidle_device *dev;
- /*
- * Initialize idle states data, starting at index 1.
- * This driver is DT only, if no DT idle states are detected (ret == 0)
- * let the driver initialization fail accordingly since there is no
- * reason to initialize the idle driver if only wfi is supported.
- */
- ret = dt_init_idle_driver(drv, arm_idle_state_match, 1);
- if (ret <= 0)
- return ret ? : -ENODEV;
-
- ret = cpuidle_register_driver(drv);
- if (ret) {
- pr_err("Failed to register cpuidle driver\n");
- return ret;
- }
-
- /*
- * Call arch CPU operations in order to initialize
- * idle states suspend back-end specific data
- */
for_each_possible_cpu(cpu) {
+
+ drv = kmemdup(&arm_idle_driver, sizeof(*drv), GFP_KERNEL);
+ if (!drv) {
+ ret = -ENOMEM;
+ goto out_fail;
+ }
+
+ drv->cpumask = (struct cpumask *)cpumask_of(cpu);
+
+ /*
+ * Initialize idle states data, starting at index 1. This
+ * driver is DT only, if no DT idle states are detected (ret
+ * == 0) let the driver initialization fail accordingly since
+ * there is no reason to initialize the idle driver if only
+ * wfi is supported.
+ */
+ ret = dt_init_idle_driver(drv, arm_idle_state_match, 1);
+ if (ret <= 0) {
+ ret = ret ? : -ENODEV;
+ goto out_kfree_drv;
+ }
+
+ ret = cpuidle_register_driver(drv);
+ if (ret) {
+ pr_err("Failed to register cpuidle driver\n");
+ goto out_kfree_drv;
+ }
+
+ /*
+ * Call arch CPU operations in order to initialize
+ * idle states suspend back-end specific data
+ */
ret = arm_cpuidle_init(cpu);
/*
@@ -115,14 +128,14 @@ static int __init arm_idle_init(void)
if (ret) {
pr_err("CPU %d failed to init idle CPU ops\n", cpu);
- goto out_fail;
+ goto out_unregister_drv;
}
dev = kzalloc(sizeof(*dev), GFP_KERNEL);
if (!dev) {
pr_err("Failed to allocate cpuidle device\n");
ret = -ENOMEM;
- goto out_fail;
+ goto out_unregister_drv;
}
dev->cpu = cpu;
@@ -130,21 +143,28 @@ static int __init arm_idle_init(void)
if (ret) {
pr_err("Failed to register cpuidle device for CPU %d\n",
cpu);
- kfree(dev);
- goto out_fail;
+ goto out_kfree_dev;
}
}
return 0;
+
+out_kfree_dev:
+ kfree(dev);
+out_unregister_drv:
+ cpuidle_unregister_driver(drv);
+out_kfree_drv:
+ kfree(drv);
out_fail:
while (--cpu >= 0) {
dev = per_cpu(cpuidle_devices, cpu);
+ drv = cpuidle_get_cpu_driver(dev);
cpuidle_unregister_device(dev);
+ cpuidle_unregister_driver(drv);
kfree(dev);
+ kfree(drv);
}
- cpuidle_unregister_driver(drv);
-
return ret;
}
device_initcall(arm_idle_init);
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 35237c8d5206..439f460ac082 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -193,7 +193,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
}
/* Take note of the planned idle state. */
- sched_idle_set_state(target_state);
+ sched_idle_set_state(target_state, index);
trace_cpu_idle_rcuidle(index, dev->cpu);
time_start = ns_to_ktime(local_clock());
@@ -206,7 +206,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
/* The cpu is no longer idle or about to enter idle. */
- sched_idle_set_state(NULL);
+ sched_idle_set_state(NULL, -1);
if (broadcast) {
if (WARN_ON_ONCE(!irqs_disabled()))
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 03d38c291de6..65bb6fd70439 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -178,7 +178,12 @@ static inline int performance_multiplier(unsigned long nr_iowaiters, unsigned lo
/* for higher loadavg, we are more reluctant */
- mult += 2 * get_loadavg(load);
+ /*
+ * this doesn't work as intended - it is almost always 0, but can
+ * sometimes, depending on workload, spike very high into the hundreds
+ * even when the average cpu load is under 10%.
+ */
+ /* mult += 2 * get_loadavg(); */
/* for IO wait tasks (per cpu!) we add 5x each */
mult += 10 * nr_iowaiters;
diff --git a/drivers/dma-buf/fence.c b/drivers/dma-buf/fence.c
index 04bf29808200..883b3bea143c 100644
--- a/drivers/dma-buf/fence.c
+++ b/drivers/dma-buf/fence.c
@@ -68,6 +68,8 @@ int fence_signal_locked(struct fence *fence)
struct fence_cb *cur, *tmp;
int ret = 0;
+ lockdep_assert_held(fence->lock);
+
if (WARN_ON(!fence))
return -EINVAL;
@@ -159,9 +161,6 @@ fence_wait_timeout(struct fence *fence, bool intr, signed long timeout)
if (WARN_ON(timeout < 0))
return -EINVAL;
- if (timeout == 0)
- return fence_is_signaled(fence);
-
trace_fence_wait_start(fence);
ret = fence->ops->wait(fence, intr, timeout);
trace_fence_wait_end(fence);
@@ -329,8 +328,12 @@ fence_remove_callback(struct fence *fence, struct fence_cb *cb)
spin_lock_irqsave(fence->lock, flags);
ret = !list_empty(&cb->node);
- if (ret)
+ if (ret) {
list_del_init(&cb->node);
+ if (list_empty(&fence->cb_list))
+ if (fence->ops->disable_signaling)
+ fence->ops->disable_signaling(fence);
+ }
spin_unlock_irqrestore(fence->lock, flags);
diff --git a/drivers/dma-buf/reservation.c b/drivers/dma-buf/reservation.c
index 723d8af988e5..82f35a4ab390 100644
--- a/drivers/dma-buf/reservation.c
+++ b/drivers/dma-buf/reservation.c
@@ -280,18 +280,24 @@ int reservation_object_get_fences_rcu(struct reservation_object *obj,
unsigned *pshared_count,
struct fence ***pshared)
{
- unsigned shared_count = 0;
- unsigned retry = 1;
- struct fence **shared = NULL, *fence_excl = NULL;
- int ret = 0;
+ struct fence **shared = NULL;
+ struct fence *fence_excl;
+ unsigned int shared_count;
+ int ret = 1;
- while (retry) {
+ do {
struct reservation_object_list *fobj;
unsigned seq;
+ unsigned int i;
- seq = read_seqcount_begin(&obj->seq);
+ shared_count = i = 0;
rcu_read_lock();
+ seq = read_seqcount_begin(&obj->seq);
+
+ fence_excl = rcu_dereference(obj->fence_excl);
+ if (fence_excl && !fence_get_rcu(fence_excl))
+ goto unlock;
fobj = rcu_dereference(obj->fence);
if (fobj) {
@@ -309,52 +315,37 @@ int reservation_object_get_fences_rcu(struct reservation_object *obj,
}
ret = -ENOMEM;
- shared_count = 0;
break;
}
shared = nshared;
- memcpy(shared, fobj->shared, sz);
shared_count = fobj->shared_count;
- } else
- shared_count = 0;
- fence_excl = rcu_dereference(obj->fence_excl);
-
- retry = read_seqcount_retry(&obj->seq, seq);
- if (retry)
- goto unlock;
-
- if (!fence_excl || fence_get_rcu(fence_excl)) {
- unsigned i;
for (i = 0; i < shared_count; ++i) {
- if (fence_get_rcu(shared[i]))
- continue;
-
- /* uh oh, refcount failed, abort and retry */
- while (i--)
- fence_put(shared[i]);
-
- if (fence_excl) {
- fence_put(fence_excl);
- fence_excl = NULL;
- }
-
- retry = 1;
- break;
+ shared[i] = rcu_dereference(fobj->shared[i]);
+ if (!fence_get_rcu(shared[i]))
+ break;
}
- } else
- retry = 1;
+ }
+ if (i != shared_count || read_seqcount_retry(&obj->seq, seq)) {
+ while (i--)
+ fence_put(shared[i]);
+ fence_put(fence_excl);
+ goto unlock;
+ }
+
+ ret = 0;
unlock:
rcu_read_unlock();
- }
- *pshared_count = shared_count;
- if (shared_count)
- *pshared = shared;
- else {
- *pshared = NULL;
+ } while (ret);
+
+ if (!shared_count) {
kfree(shared);
+ shared = NULL;
}
+
+ *pshared_count = shared_count;
+ *pshared = shared;
*pfence_excl = fence_excl;
return ret;
@@ -379,10 +370,7 @@ long reservation_object_wait_timeout_rcu(struct reservation_object *obj,
{
struct fence *fence;
unsigned seq, shared_count, i = 0;
- long ret = timeout;
-
- if (!timeout)
- return reservation_object_test_signaled_rcu(obj, wait_all);
+ long ret = timeout ? timeout : 1;
retry:
fence = NULL;
@@ -397,9 +385,6 @@ retry:
if (fobj)
shared_count = fobj->shared_count;
- if (read_seqcount_retry(&obj->seq, seq))
- goto unlock_retry;
-
for (i = 0; i < shared_count; ++i) {
struct fence *lfence = rcu_dereference(fobj->shared[i]);
@@ -422,9 +407,6 @@ retry:
if (!shared_count) {
struct fence *fence_excl = rcu_dereference(obj->fence_excl);
- if (read_seqcount_retry(&obj->seq, seq))
- goto unlock_retry;
-
if (fence_excl &&
!test_bit(FENCE_FLAG_SIGNALED_BIT, &fence_excl->flags)) {
if (!fence_get_rcu(fence_excl))
@@ -439,6 +421,11 @@ retry:
rcu_read_unlock();
if (fence) {
+ if (read_seqcount_retry(&obj->seq, seq)) {
+ fence_put(fence);
+ goto retry;
+ }
+
ret = fence_wait_timeout(fence, intr, ret);
fence_put(fence);
if (ret > 0 && wait_all && (i + 1 < shared_count))
@@ -484,12 +471,13 @@ bool reservation_object_test_signaled_rcu(struct reservation_object *obj,
bool test_all)
{
unsigned seq, shared_count;
- int ret = true;
+ int ret;
+ rcu_read_lock();
retry:
+ ret = true;
shared_count = 0;
seq = read_seqcount_begin(&obj->seq);
- rcu_read_lock();
if (test_all) {
unsigned i;
@@ -500,46 +488,35 @@ retry:
if (fobj)
shared_count = fobj->shared_count;
- if (read_seqcount_retry(&obj->seq, seq))
- goto unlock_retry;
-
for (i = 0; i < shared_count; ++i) {
struct fence *fence = rcu_dereference(fobj->shared[i]);
ret = reservation_object_test_signaled_single(fence);
if (ret < 0)
- goto unlock_retry;
+ goto retry;
else if (!ret)
break;
}
- /*
- * There could be a read_seqcount_retry here, but nothing cares
- * about whether it's the old or newer fence pointers that are
- * signaled. That race could still have happened after checking
- * read_seqcount_retry. If you care, use ww_mutex_lock.
- */
+ if (read_seqcount_retry(&obj->seq, seq))
+ goto retry;
}
if (!shared_count) {
struct fence *fence_excl = rcu_dereference(obj->fence_excl);
- if (read_seqcount_retry(&obj->seq, seq))
- goto unlock_retry;
-
if (fence_excl) {
ret = reservation_object_test_signaled_single(
fence_excl);
if (ret < 0)
- goto unlock_retry;
+ goto retry;
+
+ if (read_seqcount_retry(&obj->seq, seq))
+ goto retry;
}
}
rcu_read_unlock();
return ret;
-
-unlock_retry:
- rcu_read_unlock();
- goto retry;
}
EXPORT_SYMBOL_GPL(reservation_object_test_signaled_rcu);
diff --git a/drivers/dma-buf/sw_sync.c b/drivers/dma-buf/sw_sync.c
index 4f3511415b29..9dc86d3303ae 100644
--- a/drivers/dma-buf/sw_sync.c
+++ b/drivers/dma-buf/sw_sync.c
@@ -169,6 +169,13 @@ static bool timeline_fence_enable_signaling(struct fence *fence)
return true;
}
+static void timeline_fence_disable_signaling(struct fence *fence)
+{
+ struct sync_pt *pt = container_of(fence, struct sync_pt, base);
+
+ list_del_init(&pt->link);
+}
+
static void timeline_fence_value_str(struct fence *fence,
char *str, int size)
{
@@ -187,6 +194,7 @@ static const struct fence_ops timeline_fence_ops = {
.get_driver_name = timeline_fence_get_driver_name,
.get_timeline_name = timeline_fence_get_timeline_name,
.enable_signaling = timeline_fence_enable_signaling,
+ .disable_signaling = timeline_fence_disable_signaling,
.signaled = timeline_fence_signaled,
.wait = fence_default_wait,
.release = timeline_fence_release,
@@ -360,8 +368,8 @@ static long sw_sync_ioctl_create_fence(struct sync_timeline *obj,
}
sync_file = sync_file_create(&pt->base);
+ fence_put(&pt->base);
if (!sync_file) {
- fence_put(&pt->base);
err = -ENOMEM;
goto err;
}
diff --git a/drivers/dma-buf/sync_file.c b/drivers/dma-buf/sync_file.c
index f0c374d6ab40..e16849fd89aa 100644
--- a/drivers/dma-buf/sync_file.c
+++ b/drivers/dma-buf/sync_file.c
@@ -279,7 +279,7 @@ static void sync_file_free(struct kref *kref)
struct sync_file *sync_file = container_of(kref, struct sync_file,
kref);
- if (test_bit(POLL_ENABLED, &sync_file->fence->flags))
+ if (test_bit(POLL_ENABLED, &sync_file->flags))
fence_remove_callback(sync_file->fence, &sync_file->cb);
fence_put(sync_file->fence);
kfree(sync_file);
@@ -299,10 +299,10 @@ static unsigned int sync_file_poll(struct file *file, poll_table *wait)
poll_wait(file, &sync_file->wq, wait);
- if (!poll_does_not_wait(wait) &&
- !test_and_set_bit(POLL_ENABLED, &sync_file->fence->flags)) {
+ if (list_empty(&sync_file->cb.node) &&
+ !test_and_set_bit(POLL_ENABLED, &sync_file->flags)) {
if (fence_add_callback(sync_file->fence, &sync_file->cb,
- fence_check_cb_func) < 0)
+ fence_check_cb_func) < 0)
wake_up_all(&sync_file->wq);
}
diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
index 40d792e96b75..7335a8661280 100644
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -50,7 +50,7 @@ int edac_mc_get_poll_msec(void)
return edac_mc_poll_msec;
}
-static int edac_set_poll_msec(const char *val, struct kernel_param *kp)
+static int edac_set_poll_msec(const char *val, const struct kernel_param *kp)
{
unsigned long l;
int ret;
diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c
index 5f8543be995a..b0d32843d58b 100644
--- a/drivers/edac/edac_module.c
+++ b/drivers/edac/edac_module.c
@@ -19,7 +19,8 @@
#ifdef CONFIG_EDAC_DEBUG
-static int edac_set_debug_level(const char *buf, struct kernel_param *kp)
+static int edac_set_debug_level(const char *buf,
+ const struct kernel_param *kp)
{
unsigned long val;
int ret;
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index 310f8feb5174..f82d11a099db 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -19,7 +19,8 @@ cflags-$(CONFIG_EFI_ARMSTUB) += -I$(srctree)/scripts/dtc/libfdt
KBUILD_CFLAGS := $(cflags-y) -DDISABLE_BRANCH_PROFILING \
$(call cc-option,-ffreestanding) \
- $(call cc-option,-fno-stack-protector)
+ $(call cc-option,-fno-stack-protector) \
+ $(DISABLE_LTO)
GCOV_PROFILE := n
KASAN_SANITIZE := n
diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c
index 993aa56755f6..7ec09ea81b94 100644
--- a/drivers/firmware/efi/libstub/arm-stub.c
+++ b/drivers/firmware/efi/libstub/arm-stub.c
@@ -18,8 +18,6 @@
#include "efistub.h"
-bool __nokaslr;
-
static int efi_get_secureboot(efi_system_table_t *sys_table_arg)
{
static efi_char16_t const sb_var_name[] = {
@@ -268,18 +266,6 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
goto fail;
}
- /* check whether 'nokaslr' was passed on the command line */
- if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
- static const u8 default_cmdline[] = CONFIG_CMDLINE;
- const u8 *str, *cmdline = cmdline_ptr;
-
- if (IS_ENABLED(CONFIG_CMDLINE_FORCE))
- cmdline = default_cmdline;
- str = strstr(cmdline, "nokaslr");
- if (str == cmdline || (str > cmdline && *(str - 1) == ' '))
- __nokaslr = true;
- }
-
si = setup_graphics(sys_table);
status = handle_kernel_image(sys_table, image_addr, &image_size,
@@ -291,9 +277,13 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
goto fail_free_cmdline;
}
- status = efi_parse_options(cmdline_ptr);
- if (status != EFI_SUCCESS)
- pr_efi_err(sys_table, "Failed to parse EFI cmdline options\n");
+ if (IS_ENABLED(CONFIG_CMDLINE_EXTEND) ||
+ IS_ENABLED(CONFIG_CMDLINE_FORCE) ||
+ cmdline_size == 0)
+ efi_parse_options(CONFIG_CMDLINE);
+
+ if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && cmdline_size > 0)
+ efi_parse_options(cmdline_ptr);
secure_boot = efi_get_secureboot(sys_table);
if (secure_boot > 0)
diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c
index 959d9b8d4845..f7a6970e9abc 100644
--- a/drivers/firmware/efi/libstub/arm64-stub.c
+++ b/drivers/firmware/efi/libstub/arm64-stub.c
@@ -24,8 +24,6 @@
#include "efistub.h"
-extern bool __nokaslr;
-
efi_status_t check_platform_features(efi_system_table_t *sys_table_arg)
{
u64 tg;
@@ -60,7 +58,7 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table_arg,
u64 phys_seed = 0;
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
- if (!__nokaslr) {
+ if (!nokaslr()) {
status = efi_get_random_bytes(sys_table_arg,
sizeof(phys_seed),
(u8 *)&phys_seed);
diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
index 09d10dcf1fc6..d9843329e9b2 100644
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -41,6 +41,13 @@ static unsigned long __chunk_size = EFI_READ_CHUNK_SIZE;
#define EFI_ALLOC_ALIGN EFI_PAGE_SIZE
#endif
+static int __section(.data) __nokaslr;
+
+int __pure nokaslr(void)
+{
+ return __nokaslr;
+}
+
#define EFI_MMAP_NR_SLACK_SLOTS 8
struct file_info {
@@ -351,10 +358,14 @@ void efi_free(efi_system_table_t *sys_table_arg, unsigned long size,
* environments, first in the early boot environment of the EFI boot
* stub, and subsequently during the kernel boot.
*/
-efi_status_t efi_parse_options(char *cmdline)
+efi_status_t efi_parse_options(char const *cmdline)
{
char *str;
+ str = strstr(cmdline, "nokaslr");
+ if (str == cmdline || (str && str > cmdline && *(str - 1) == ' '))
+ __nokaslr = 1;
+
/*
* Currently, the only efi= option we look for is 'nochunk', which
* is intended to work around known issues on certain x86 UEFI
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index fac67992bede..d0e5acab547f 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -15,6 +15,8 @@
*/
#undef __init
+extern int __pure nokaslr(void);
+
void efi_char16_printk(efi_system_table_t *, efi_char16_t *);
efi_status_t efi_open_volume(efi_system_table_t *sys_table_arg, void *__image,
diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index 483059a22b1b..43cb33dc8333 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -12,6 +12,7 @@ menuconfig DRM
select I2C
select I2C_ALGOBIT
select DMA_SHARED_BUFFER
+ select SYNC_FILE
help
Kernel-level support for the Direct Rendering Infrastructure (DRI)
introduced in XFree86 4.0. If you say Y here, you need to select
diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c
index dd6fff1c98d6..4e4043fca223 100644
--- a/drivers/gpu/drm/drm_atomic.c
+++ b/drivers/gpu/drm/drm_atomic.c
@@ -30,6 +30,7 @@
#include <drm/drm_atomic.h>
#include <drm/drm_mode.h>
#include <drm/drm_plane_helper.h>
+#include <linux/sync_file.h>
#include "drm_crtc_internal.h"
@@ -292,6 +293,23 @@ drm_atomic_get_crtc_state(struct drm_atomic_state *state,
}
EXPORT_SYMBOL(drm_atomic_get_crtc_state);
+static void set_out_fence_for_crtc(struct drm_atomic_state *state,
+ struct drm_crtc *crtc, s32 __user *fence_ptr)
+{
+ state->crtcs[drm_crtc_index(crtc)].out_fence_ptr = fence_ptr;
+}
+
+static s32 __user *get_out_fence_for_crtc(struct drm_atomic_state *state,
+ struct drm_crtc *crtc)
+{
+ s32 __user *fence_ptr;
+
+ fence_ptr = state->crtcs[drm_crtc_index(crtc)].out_fence_ptr;
+ state->crtcs[drm_crtc_index(crtc)].out_fence_ptr = NULL;
+
+ return fence_ptr;
+}
+
/**
* drm_atomic_set_mode_for_crtc - set mode for CRTC
* @state: the CRTC whose incoming state to update
@@ -496,6 +514,16 @@ int drm_atomic_crtc_set_property(struct drm_crtc *crtc,
&replaced);
state->color_mgmt_changed |= replaced;
return ret;
+ } else if (property == config->prop_out_fence_ptr) {
+ s32 __user *fence_ptr = u64_to_user_ptr(val);
+
+ if (!fence_ptr)
+ return 0;
+
+ if (put_user(-1, fence_ptr))
+ return -EFAULT;
+
+ set_out_fence_for_crtc(state->state, crtc, fence_ptr);
} else if (crtc->funcs->atomic_set_property)
return crtc->funcs->atomic_set_property(crtc, state, property, val);
else
@@ -538,6 +566,8 @@ drm_atomic_crtc_get_property(struct drm_crtc *crtc,
*val = (state->ctm) ? state->ctm->base.id : 0;
else if (property == config->gamma_lut_property)
*val = (state->gamma_lut) ? state->gamma_lut->base.id : 0;
+ else if (property == config->prop_out_fence_ptr)
+ *val = 0;
else if (crtc->funcs->atomic_get_property)
return crtc->funcs->atomic_get_property(crtc, state, property, val);
else
@@ -693,6 +723,17 @@ int drm_atomic_plane_set_property(struct drm_plane *plane,
drm_atomic_set_fb_for_plane(state, fb);
if (fb)
drm_framebuffer_unreference(fb);
+ } else if (property == config->prop_in_fence_fd) {
+ if (state->fence)
+ return -EINVAL;
+
+ if (U642I64(val) == -1)
+ return 0;
+
+ state->fence = sync_file_get_fence(val);
+ if (!state->fence)
+ return -EINVAL;
+
} else if (property == config->prop_crtc_id) {
struct drm_crtc *crtc = drm_crtc_find(dev, val);
return drm_atomic_set_crtc_for_plane(state, crtc);
@@ -752,6 +793,8 @@ drm_atomic_plane_get_property(struct drm_plane *plane,
if (property == config->prop_fb_id) {
*val = (state->fb) ? state->fb->base.id : 0;
+ } else if (property == config->prop_in_fence_fd) {
+ *val = -1;
} else if (property == config->prop_crtc_id) {
*val = (state->crtc) ? state->crtc->base.id : 0;
} else if (property == config->prop_crtc_x) {
@@ -1154,6 +1197,36 @@ drm_atomic_set_fb_for_plane(struct drm_plane_state *plane_state,
EXPORT_SYMBOL(drm_atomic_set_fb_for_plane);
/**
+ * drm_atomic_set_fence_for_plane - set fence for plane
+ * @plane_state: atomic state object for the plane
+ * @fence: fence to use for the plane
+ *
+ * Helper to setup the plane_state fence in case it is not set yet.
+ * By using this drivers doesn't need to worry if the user choose
+ * implicit or explicit fencing.
+ *
+ * This function will not set the fence to the state if it was set
+ * via explicit fencing interfaces on the atomic ioctl. It will
+ * all drope the reference to the fence as we not storing it
+ * anywhere.
+ *
+ * Otherwise, if plane_state->fence is not set this function we
+ * just set it with the received implict fence.
+ */
+void
+drm_atomic_set_fence_for_plane(struct drm_plane_state *plane_state,
+ struct fence *fence)
+{
+ if (plane_state->fence) {
+ fence_put(fence);
+ return;
+ }
+
+ plane_state->fence = fence;
+}
+EXPORT_SYMBOL(drm_atomic_set_fence_for_plane);
+
+/**
* drm_atomic_set_crtc_for_connector - set crtc for connector
* @conn_state: atomic state object for the connector
* @crtc: crtc to use for the connector
@@ -1472,11 +1545,9 @@ EXPORT_SYMBOL(drm_atomic_nonblocking_commit);
*/
static struct drm_pending_vblank_event *create_vblank_event(
- struct drm_device *dev, struct drm_file *file_priv,
- struct fence *fence, uint64_t user_data)
+ struct drm_device *dev, uint64_t user_data)
{
struct drm_pending_vblank_event *e = NULL;
- int ret;
e = kzalloc(sizeof *e, GFP_KERNEL);
if (!e)
@@ -1486,17 +1557,6 @@ static struct drm_pending_vblank_event *create_vblank_event(
e->event.base.length = sizeof(e->event);
e->event.user_data = user_data;
- if (file_priv) {
- ret = drm_event_reserve_init(dev, file_priv, &e->base,
- &e->event.base);
- if (ret) {
- kfree(e);
- return NULL;
- }
- }
-
- e->base.fence = fence;
-
return e;
}
@@ -1601,6 +1661,206 @@ void drm_atomic_clean_old_fb(struct drm_device *dev,
}
EXPORT_SYMBOL(drm_atomic_clean_old_fb);
+/**
+ * DOC: explicit fencing properties
+ *
+ * Explicit fencing allows userspace to control the buffer synchronization
+ * between devices. A Fence or a group of fences are transfered to/from
+ * userspace using Sync File fds and there are two DRM properties for that.
+ * IN_FENCE_FD on each DRM Plane to send fences to the kernel and
+ * OUT_FENCE_PTR on each DRM CRTC to receive fences from the kernel.
+ *
+ * As a contrast, with implicit fencing the kernel keeps track of any
+ * ongoing rendering, and automatically ensures that the atomic update waits
+ * for any pending rendering to complete. For shared buffers represented with
+ * a struct &dma_buf this is tracked in &reservation_object structures.
+ * Implicit syncing is how Linux traditionally worked (e.g. DRI2/3 on X.org),
+ * whereas explicit fencing is what Android wants.
+ *
+ * "IN_FENCE_FD”:
+ * Use this property to pass a fence that DRM should wait on before
+ * proceeding with the Atomic Commit request and show the framebuffer for
+ * the plane on the screen. The fence can be either a normal fence or a
+ * merged one, the sync_file framework will handle both cases and use a
+ * fence_array if a merged fence is received. Passing -1 here means no
+ * fences to wait on.
+ *
+ * If the Atomic Commit request has the DRM_MODE_ATOMIC_TEST_ONLY flag
+ * it will only check if the Sync File is a valid one.
+ *
+ * On the driver side the fence is stored on the @fence parameter of
+ * struct &drm_plane_state. Drivers which also support implicit fencing
+ * should set the implicit fence using drm_atomic_set_fence_for_plane(),
+ * to make sure there's consistent behaviour between drivers in precedence
+ * of implicit vs. explicit fencing.
+ *
+ * "OUT_FENCE_PTR”:
+ * Use this property to pass a file descriptor pointer to DRM. Once the
+ * Atomic Commit request call returns OUT_FENCE_PTR will be filled with
+ * the file descriptor number of a Sync File. This Sync File contains the
+ * CRTC fence that will be signaled when all framebuffers present on the
+ * Atomic Commit * request for that given CRTC are scanned out on the
+ * screen.
+ *
+ * The Atomic Commit request fails if a invalid pointer is passed. If the
+ * Atomic Commit request fails for any other reason the out fence fd
+ * returned will be -1. On a Atomic Commit with the
+ * DRM_MODE_ATOMIC_TEST_ONLY flag the out fence will also be set to -1.
+ *
+ * Note that out-fences don't have a special interface to drivers and are
+ * internally represented by a struct &drm_pending_vblank_event in struct
+ * &drm_crtc_state, which is also used by the nonblocking atomic commit
+ * helpers and for the DRM event handling for existing userspace.
+ */
+
+struct drm_out_fence_state {
+ s32 __user *out_fence_ptr;
+ struct sync_file *sync_file;
+ int fd;
+};
+
+static int setup_out_fence(struct drm_out_fence_state *fence_state,
+ struct fence *fence)
+{
+ fence_state->fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fence_state->fd < 0)
+ return fence_state->fd;
+
+ if (put_user(fence_state->fd, fence_state->out_fence_ptr))
+ return -EFAULT;
+
+ fence_state->sync_file = sync_file_create(fence);
+ if (!fence_state->sync_file)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int prepare_crtc_signaling(struct drm_device *dev,
+ struct drm_atomic_state *state,
+ struct drm_mode_atomic *arg,
+ struct drm_file *file_priv,
+ struct drm_out_fence_state **fence_state,
+ unsigned int *num_fences)
+{
+ struct drm_crtc *crtc;
+ struct drm_crtc_state *crtc_state;
+ int i, ret;
+
+ if (arg->flags & DRM_MODE_ATOMIC_TEST_ONLY)
+ return 0;
+
+ for_each_crtc_in_state(state, crtc, crtc_state, i) {
+ s32 __user *fence_ptr;
+
+ fence_ptr = get_out_fence_for_crtc(crtc_state->state, crtc);
+
+ if (arg->flags & DRM_MODE_PAGE_FLIP_EVENT || fence_ptr) {
+ struct drm_pending_vblank_event *e;
+
+ e = create_vblank_event(dev, arg->user_data);
+ if (!e)
+ return -ENOMEM;
+
+ crtc_state->event = e;
+ }
+
+ if (arg->flags & DRM_MODE_PAGE_FLIP_EVENT) {
+ struct drm_pending_vblank_event *e = crtc_state->event;
+
+ if (!file_priv)
+ continue;
+
+ ret = drm_event_reserve_init(dev, file_priv, &e->base,
+ &e->event.base);
+ if (ret) {
+ kfree(e);
+ crtc_state->event = NULL;
+ return ret;
+ }
+ }
+
+ if (fence_ptr) {
+ struct fence *fence;
+ struct drm_out_fence_state *f;
+
+ f = krealloc(*fence_state, sizeof(**fence_state) *
+ (*num_fences + 1), GFP_KERNEL);
+ if (!f)
+ return -ENOMEM;
+
+ memset(&f[*num_fences], 0, sizeof(*f));
+
+ f[*num_fences].out_fence_ptr = fence_ptr;
+ *fence_state = f;
+
+ fence = drm_crtc_create_fence(crtc);
+ if (!fence)
+ return -ENOMEM;
+
+ ret = setup_out_fence(&f[(*num_fences)++], fence);
+ if (ret) {
+ fence_put(fence);
+ return ret;
+ }
+
+ crtc_state->event->base.fence = fence;
+ }
+ }
+
+ return 0;
+}
+
+static void complete_crtc_signaling(struct drm_device *dev,
+ struct drm_atomic_state *state,
+ struct drm_out_fence_state *fence_state,
+ unsigned int num_fences,
+ bool install_fds)
+{
+ struct drm_crtc *crtc;
+ struct drm_crtc_state *crtc_state;
+ int i;
+
+ if (install_fds) {
+ for (i = 0; i < num_fences; i++)
+ fd_install(fence_state[i].fd,
+ fence_state[i].sync_file->file);
+
+ kfree(fence_state);
+ return;
+ }
+
+ for_each_crtc_in_state(state, crtc, crtc_state, i) {
+ struct drm_pending_vblank_event *event = crtc_state->event;
+ /*
+ * Free the allocated event. drm_atomic_helper_setup_commit
+ * can allocate an event too, so only free it if it's ours
+ * to prevent a double free in drm_atomic_state_clear.
+ */
+ if (event && (event->base.fence || event->base.file_priv)) {
+ drm_event_cancel_free(dev, &event->base);
+ crtc_state->event = NULL;
+ }
+ }
+
+ if (!fence_state)
+ return;
+
+ for (i = 0; i < num_fences; i++) {
+ if (fence_state[i].sync_file)
+ fput(fence_state[i].sync_file->file);
+ if (fence_state[i].fd >= 0)
+ put_unused_fd(fence_state[i].fd);
+
+ /* If this fails log error to the user */
+ if (fence_state[i].out_fence_ptr &&
+ put_user(-1, fence_state[i].out_fence_ptr))
+ DRM_DEBUG_ATOMIC("Couldn't clear out_fence_ptr\n");
+ }
+
+ kfree(fence_state);
+}
+
int drm_mode_atomic_ioctl(struct drm_device *dev,
void *data, struct drm_file *file_priv)
{
@@ -1613,11 +1873,10 @@ int drm_mode_atomic_ioctl(struct drm_device *dev,
struct drm_atomic_state *state;
struct drm_modeset_acquire_ctx ctx;
struct drm_plane *plane;
- struct drm_crtc *crtc;
- struct drm_crtc_state *crtc_state;
+ struct drm_out_fence_state *fence_state;
unsigned plane_mask;
int ret = 0;
- unsigned int i, j;
+ unsigned int i, j, num_fences;
/* disallow for drivers not supporting atomic: */
if (!drm_core_check_feature(dev, DRIVER_ATOMIC))
@@ -1658,6 +1917,8 @@ retry:
plane_mask = 0;
copied_objs = 0;
copied_props = 0;
+ fence_state = NULL;
+ num_fences = 0;
for (i = 0; i < arg->count_objs; i++) {
uint32_t obj_id, count_props;
@@ -1732,20 +1993,10 @@ retry:
drm_mode_object_unreference(obj);
}
- if (arg->flags & DRM_MODE_PAGE_FLIP_EVENT) {
- for_each_crtc_in_state(state, crtc, crtc_state, i) {
- struct drm_pending_vblank_event *e;
-
- e = create_vblank_event(dev, file_priv, NULL,
- arg->user_data);
- if (!e) {
- ret = -ENOMEM;
- goto out;
- }
-
- crtc_state->event = e;
- }
- }
+ ret = prepare_crtc_signaling(dev, state, arg, file_priv, &fence_state,
+ &num_fences);
+ if (ret)
+ goto out;
if (arg->flags & DRM_MODE_ATOMIC_TEST_ONLY) {
/*
@@ -1762,20 +2013,7 @@ retry:
out:
drm_atomic_clean_old_fb(dev, plane_mask, ret);
- if (ret && arg->flags & DRM_MODE_PAGE_FLIP_EVENT) {
- /*
- * Free the allocated event. drm_atomic_helper_setup_commit
- * can allocate an event too, so only free it if it's ours
- * to prevent a double free in drm_atomic_state_clear.
- */
- for_each_crtc_in_state(state, crtc, crtc_state, i) {
- struct drm_pending_vblank_event *event = crtc_state->event;
- if (event && (event->base.fence || event->base.file_priv)) {
- drm_event_cancel_free(dev, &event->base);
- crtc_state->event = NULL;
- }
- }
- }
+ complete_crtc_signaling(dev, state, fence_state, num_fences, !ret);
if (ret == -EDEADLK) {
drm_atomic_state_clear(state);
diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
index 50acd799babe..f34b4e8455a6 100644
--- a/drivers/gpu/drm/drm_atomic_helper.c
+++ b/drivers/gpu/drm/drm_atomic_helper.c
@@ -3166,6 +3166,9 @@ void __drm_atomic_helper_plane_destroy_state(struct drm_plane_state *state)
{
if (state->fb)
drm_framebuffer_unreference(state->fb);
+
+ if (state->fence)
+ fence_put(state->fence);
}
EXPORT_SYMBOL(__drm_atomic_helper_plane_destroy_state);
diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c
index 2d7bedf28647..79b3d521c388 100644
--- a/drivers/gpu/drm/drm_crtc.c
+++ b/drivers/gpu/drm/drm_crtc.c
@@ -33,6 +33,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/fence.h>
#include <drm/drmP.h>
#include <drm/drm_crtc.h>
#include <drm/drm_edid.h>
@@ -141,6 +142,54 @@ static void drm_crtc_unregister_all(struct drm_device *dev)
}
}
+static const struct fence_ops drm_crtc_fence_ops;
+
+static struct drm_crtc *fence_to_crtc(struct fence *fence)
+{
+ BUG_ON(fence->ops != &drm_crtc_fence_ops);
+ return container_of(fence->lock, struct drm_crtc, fence_lock);
+}
+
+static const char *drm_crtc_fence_get_driver_name(struct fence *fence)
+{
+ struct drm_crtc *crtc = fence_to_crtc(fence);
+
+ return crtc->dev->driver->name;
+}
+
+static const char *drm_crtc_fence_get_timeline_name(struct fence *fence)
+{
+ struct drm_crtc *crtc = fence_to_crtc(fence);
+
+ return crtc->timeline_name;
+}
+
+static bool drm_crtc_fence_enable_signaling(struct fence *fence)
+{
+ return true;
+}
+
+static const struct fence_ops drm_crtc_fence_ops = {
+ .get_driver_name = drm_crtc_fence_get_driver_name,
+ .get_timeline_name = drm_crtc_fence_get_timeline_name,
+ .enable_signaling = drm_crtc_fence_enable_signaling,
+ .wait = fence_default_wait,
+};
+
+struct fence *drm_crtc_create_fence(struct drm_crtc *crtc)
+{
+ struct fence *fence;
+
+ fence = kzalloc(sizeof(*fence), GFP_KERNEL);
+ if (!fence)
+ return NULL;
+
+ fence_init(fence, &drm_crtc_fence_ops, &crtc->fence_lock,
+ crtc->fence_context, ++crtc->fence_seqno);
+
+ return fence;
+}
+
/**
* drm_crtc_init_with_planes - Initialise a new CRTC object with
* specified primary and cursor planes.
@@ -198,6 +247,11 @@ int drm_crtc_init_with_planes(struct drm_device *dev, struct drm_crtc *crtc,
return -ENOMEM;
}
+ crtc->fence_context = fence_context_alloc(1);
+ spin_lock_init(&crtc->fence_lock);
+ snprintf(crtc->timeline_name, sizeof(crtc->timeline_name),
+ "CRTC:%d-%s", crtc->base.id, crtc->name);
+
crtc->base.properties = &crtc->properties;
list_add_tail(&crtc->head, &config->crtc_list);
@@ -213,6 +267,8 @@ int drm_crtc_init_with_planes(struct drm_device *dev, struct drm_crtc *crtc,
if (drm_core_check_feature(dev, DRIVER_ATOMIC)) {
drm_object_attach_property(&crtc->base, config->prop_active, 0);
drm_object_attach_property(&crtc->base, config->prop_mode_id, 0);
+ drm_object_attach_property(&crtc->base,
+ config->prop_out_fence_ptr, 0);
}
return 0;
@@ -365,6 +421,18 @@ static int drm_mode_create_standard_properties(struct drm_device *dev)
return -ENOMEM;
dev->mode_config.prop_fb_id = prop;
+ prop = drm_property_create_signed_range(dev, DRM_MODE_PROP_ATOMIC,
+ "IN_FENCE_FD", -1, INT_MAX);
+ if (!prop)
+ return -ENOMEM;
+ dev->mode_config.prop_in_fence_fd = prop;
+
+ prop = drm_property_create_range(dev, DRM_MODE_PROP_ATOMIC,
+ "OUT_FENCE_PTR", 0, U64_MAX);
+ if (!prop)
+ return -ENOMEM;
+ dev->mode_config.prop_out_fence_ptr = prop;
+
prop = drm_property_create_object(dev, DRM_MODE_PROP_ATOMIC,
"CRTC_ID", DRM_MODE_OBJECT_CRTC);
if (!prop)
diff --git a/drivers/gpu/drm/drm_crtc_internal.h b/drivers/gpu/drm/drm_crtc_internal.h
index c48ba02c5365..df2b51a4f75e 100644
--- a/drivers/gpu/drm/drm_crtc_internal.h
+++ b/drivers/gpu/drm/drm_crtc_internal.h
@@ -41,6 +41,8 @@ int drm_crtc_check_viewport(const struct drm_crtc *crtc,
const struct drm_display_mode *mode,
const struct drm_framebuffer *fb);
+struct fence *drm_crtc_create_fence(struct drm_crtc *crtc);
+
void drm_fb_release(struct drm_file *file_priv);
/* dumb buffer support IOCTLs */
diff --git a/drivers/gpu/drm/drm_fb_cma_helper.c b/drivers/gpu/drm/drm_fb_cma_helper.c
index 1fd6eac1400c..52629b62b002 100644
--- a/drivers/gpu/drm/drm_fb_cma_helper.c
+++ b/drivers/gpu/drm/drm_fb_cma_helper.c
@@ -18,13 +18,16 @@
*/
#include <drm/drmP.h>
+#include <drm/drm_atomic.h>
#include <drm/drm_crtc.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_gem_cma_helper.h>
#include <drm/drm_fb_cma_helper.h>
+#include <linux/dma-buf.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
+#include <linux/reservation.h>
#define DEFAULT_FBDEFIO_DELAY_MS 50
@@ -265,6 +268,38 @@ struct drm_gem_cma_object *drm_fb_cma_get_gem_obj(struct drm_framebuffer *fb,
}
EXPORT_SYMBOL_GPL(drm_fb_cma_get_gem_obj);
+/**
+ * drm_fb_cma_prepare_fb() - Prepare CMA framebuffer
+ * @plane: Which plane
+ * @state: Plane state attach fence to
+ *
+ * This should be put into prepare_fb hook of struct &drm_plane_helper_funcs .
+ *
+ * This function checks if the plane FB has an dma-buf attached, extracts
+ * the exclusive fence and attaches it to plane state for the atomic helper
+ * to wait on.
+ *
+ * There is no need for cleanup_fb for CMA based framebuffer drivers.
+ */
+int drm_fb_cma_prepare_fb(struct drm_plane *plane,
+ struct drm_plane_state *state)
+{
+ struct dma_buf *dma_buf;
+ struct fence *fence;
+
+ if ((plane->state->fb == state->fb) || !state->fb)
+ return 0;
+
+ dma_buf = drm_fb_cma_get_gem_obj(state->fb, 0)->base.dma_buf;
+ if (dma_buf) {
+ fence = reservation_object_get_excl_rcu(dma_buf->resv);
+ drm_atomic_set_fence_for_plane(state, fence);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(drm_fb_cma_prepare_fb);
+
#ifdef CONFIG_DEBUG_FS
static void drm_fb_cma_describe(struct drm_framebuffer *fb, struct seq_file *m)
{
diff --git a/drivers/gpu/drm/drm_fops.c b/drivers/gpu/drm/drm_fops.c
index c37b7b5f1dd3..129b80b12984 100644
--- a/drivers/gpu/drm/drm_fops.c
+++ b/drivers/gpu/drm/drm_fops.c
@@ -664,6 +664,10 @@ void drm_event_cancel_free(struct drm_device *dev,
list_del(&p->pending_link);
}
spin_unlock_irqrestore(&dev->event_lock, flags);
+
+ if (p->fence)
+ fence_put(p->fence);
+
kfree(p);
}
EXPORT_SYMBOL(drm_event_cancel_free);
diff --git a/drivers/gpu/drm/drm_plane.c b/drivers/gpu/drm/drm_plane.c
index 249c0ae52c6d..3957ef8f026b 100644
--- a/drivers/gpu/drm/drm_plane.c
+++ b/drivers/gpu/drm/drm_plane.c
@@ -137,6 +137,7 @@ int drm_universal_plane_init(struct drm_device *dev, struct drm_plane *plane,
if (drm_core_check_feature(dev, DRIVER_ATOMIC)) {
drm_object_attach_property(&plane->base, config->prop_fb_id, 0);
+ drm_object_attach_property(&plane->base, config->prop_in_fence_fd, -1);
drm_object_attach_property(&plane->base, config->prop_crtc_id, 0);
drm_object_attach_property(&plane->base, config->prop_crtc_x, 0);
drm_object_attach_property(&plane->base, config->prop_crtc_y, 0);
diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c
index 83bf997dda03..5e67e8b2b685 100644
--- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c
+++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c
@@ -218,9 +218,10 @@ mdp5_plane_duplicate_state(struct drm_plane *plane)
mdp5_state = kmemdup(to_mdp5_plane_state(plane->state),
sizeof(*mdp5_state), GFP_KERNEL);
+ if (!mdp5_state)
+ return NULL;
- if (mdp5_state && mdp5_state->base.fb)
- drm_framebuffer_reference(mdp5_state->base.fb);
+ __drm_atomic_helper_plane_duplicate_state(plane, &mdp5_state->base);
mdp5_state->mode_changed = false;
mdp5_state->pending = false;
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 52a2a1a75682..5a184089dfa9 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -1606,7 +1606,14 @@ EXPORT_SYMBOL(ttm_bo_unmap_virtual);
int ttm_bo_wait(struct ttm_buffer_object *bo,
bool interruptible, bool no_wait)
{
- long timeout = no_wait ? 0 : 15 * HZ;
+ long timeout = 15 * HZ;
+
+ if (no_wait) {
+ if (reservation_object_test_signaled_rcu(bo->resv, true))
+ return 0;
+ else
+ return -EBUSY;
+ }
timeout = reservation_object_wait_timeout_rcu(bo->resv, true,
interruptible, timeout);
diff --git a/drivers/hid/hid-magicmouse.c b/drivers/hid/hid-magicmouse.c
index 20b40ad26325..42ed887ba0be 100644
--- a/drivers/hid/hid-magicmouse.c
+++ b/drivers/hid/hid-magicmouse.c
@@ -34,7 +34,8 @@ module_param(emulate_scroll_wheel, bool, 0644);
MODULE_PARM_DESC(emulate_scroll_wheel, "Emulate a scroll wheel");
static unsigned int scroll_speed = 32;
-static int param_set_scroll_speed(const char *val, struct kernel_param *kp) {
+static int param_set_scroll_speed(const char *val,
+ const struct kernel_param *kp) {
unsigned long speed;
if (!val || kstrtoul(val, 0, &speed) || speed > 63)
return -EINVAL;
diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c
index eee58d15e745..d03203a82e8f 100644
--- a/drivers/hid/hid-sony.c
+++ b/drivers/hid/hid-sony.c
@@ -36,6 +36,8 @@
#include <linux/list.h>
#include <linux/idr.h>
#include <linux/input/mt.h>
+#include <linux/crc32.h>
+#include <asm/unaligned.h>
#include "hid-ids.h"
@@ -46,19 +48,21 @@
#define PS3REMOTE BIT(4)
#define DUALSHOCK4_CONTROLLER_USB BIT(5)
#define DUALSHOCK4_CONTROLLER_BT BIT(6)
-#define MOTION_CONTROLLER_USB BIT(7)
-#define MOTION_CONTROLLER_BT BIT(8)
-#define NAVIGATION_CONTROLLER_USB BIT(9)
-#define NAVIGATION_CONTROLLER_BT BIT(10)
-#define SINO_LITE_CONTROLLER BIT(11)
-#define FUTUREMAX_DANCE_MAT BIT(12)
+#define DUALSHOCK4_DONGLE BIT(7)
+#define MOTION_CONTROLLER_USB BIT(8)
+#define MOTION_CONTROLLER_BT BIT(9)
+#define NAVIGATION_CONTROLLER_USB BIT(10)
+#define NAVIGATION_CONTROLLER_BT BIT(11)
+#define SINO_LITE_CONTROLLER BIT(12)
+#define FUTUREMAX_DANCE_MAT BIT(13)
#define SIXAXIS_CONTROLLER (SIXAXIS_CONTROLLER_USB | SIXAXIS_CONTROLLER_BT)
#define MOTION_CONTROLLER (MOTION_CONTROLLER_USB | MOTION_CONTROLLER_BT)
#define NAVIGATION_CONTROLLER (NAVIGATION_CONTROLLER_USB |\
NAVIGATION_CONTROLLER_BT)
#define DUALSHOCK4_CONTROLLER (DUALSHOCK4_CONTROLLER_USB |\
- DUALSHOCK4_CONTROLLER_BT)
+ DUALSHOCK4_CONTROLLER_BT | \
+ DUALSHOCK4_DONGLE)
#define SONY_LED_SUPPORT (SIXAXIS_CONTROLLER | BUZZ_CONTROLLER |\
DUALSHOCK4_CONTROLLER | MOTION_CONTROLLER |\
NAVIGATION_CONTROLLER)
@@ -71,89 +75,6 @@
#define MAX_LEDS 4
-/*
- * The Sixaxis reports both digital and analog values for each button on the
- * controller except for Start, Select and the PS button. The controller ends
- * up reporting 27 axes which causes them to spill over into the multi-touch
- * axis values. Additionally, the controller only has 20 actual, physical axes
- * so there are several unused axes in between the used ones.
- */
-static u8 sixaxis_rdesc[] = {
- 0x05, 0x01, /* Usage Page (Desktop), */
- 0x09, 0x04, /* Usage (Joystick), */
- 0xA1, 0x01, /* Collection (Application), */
- 0xA1, 0x02, /* Collection (Logical), */
- 0x85, 0x01, /* Report ID (1), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x01, /* Report Count (1), */
- 0x15, 0x00, /* Logical Minimum (0), */
- 0x26, 0xFF, 0x00, /* Logical Maximum (255), */
- 0x81, 0x03, /* Input (Constant, Variable), */
- 0x75, 0x01, /* Report Size (1), */
- 0x95, 0x13, /* Report Count (19), */
- 0x15, 0x00, /* Logical Minimum (0), */
- 0x25, 0x01, /* Logical Maximum (1), */
- 0x35, 0x00, /* Physical Minimum (0), */
- 0x45, 0x01, /* Physical Maximum (1), */
- 0x05, 0x09, /* Usage Page (Button), */
- 0x19, 0x01, /* Usage Minimum (01h), */
- 0x29, 0x13, /* Usage Maximum (13h), */
- 0x81, 0x02, /* Input (Variable), */
- 0x75, 0x01, /* Report Size (1), */
- 0x95, 0x0D, /* Report Count (13), */
- 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */
- 0x81, 0x03, /* Input (Constant, Variable), */
- 0x15, 0x00, /* Logical Minimum (0), */
- 0x26, 0xFF, 0x00, /* Logical Maximum (255), */
- 0x05, 0x01, /* Usage Page (Desktop), */
- 0x09, 0x01, /* Usage (Pointer), */
- 0xA1, 0x00, /* Collection (Physical), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x04, /* Report Count (4), */
- 0x35, 0x00, /* Physical Minimum (0), */
- 0x46, 0xFF, 0x00, /* Physical Maximum (255), */
- 0x09, 0x30, /* Usage (X), */
- 0x09, 0x31, /* Usage (Y), */
- 0x09, 0x32, /* Usage (Z), */
- 0x09, 0x35, /* Usage (Rz), */
- 0x81, 0x02, /* Input (Variable), */
- 0xC0, /* End Collection, */
- 0x05, 0x01, /* Usage Page (Desktop), */
- 0x95, 0x13, /* Report Count (19), */
- 0x09, 0x01, /* Usage (Pointer), */
- 0x81, 0x02, /* Input (Variable), */
- 0x95, 0x0C, /* Report Count (12), */
- 0x81, 0x01, /* Input (Constant), */
- 0x75, 0x10, /* Report Size (16), */
- 0x95, 0x04, /* Report Count (4), */
- 0x26, 0xFF, 0x03, /* Logical Maximum (1023), */
- 0x46, 0xFF, 0x03, /* Physical Maximum (1023), */
- 0x09, 0x01, /* Usage (Pointer), */
- 0x81, 0x02, /* Input (Variable), */
- 0xC0, /* End Collection, */
- 0xA1, 0x02, /* Collection (Logical), */
- 0x85, 0x02, /* Report ID (2), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x30, /* Report Count (48), */
- 0x09, 0x01, /* Usage (Pointer), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0xC0, /* End Collection, */
- 0xA1, 0x02, /* Collection (Logical), */
- 0x85, 0xEE, /* Report ID (238), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x30, /* Report Count (48), */
- 0x09, 0x01, /* Usage (Pointer), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0xC0, /* End Collection, */
- 0xA1, 0x02, /* Collection (Logical), */
- 0x85, 0xEF, /* Report ID (239), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x30, /* Report Count (48), */
- 0x09, 0x01, /* Usage (Pointer), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0xC0, /* End Collection, */
- 0xC0 /* End Collection */
-};
/* PS/3 Motion controller */
static u8 motion_rdesc[] = {
@@ -252,567 +173,6 @@ static u8 motion_rdesc[] = {
0xC0 /* End Collection */
};
-/* PS/3 Navigation controller */
-static u8 navigation_rdesc[] = {
- 0x05, 0x01, /* Usage Page (Desktop), */
- 0x09, 0x04, /* Usage (Joystick), */
- 0xA1, 0x01, /* Collection (Application), */
- 0xA1, 0x02, /* Collection (Logical), */
- 0x85, 0x01, /* Report ID (1), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x01, /* Report Count (1), */
- 0x15, 0x00, /* Logical Minimum (0), */
- 0x26, 0xFF, 0x00, /* Logical Maximum (255), */
- 0x81, 0x03, /* Input (Constant, Variable), */
- 0x75, 0x01, /* Report Size (1), */
- 0x95, 0x13, /* Report Count (19), */
- 0x15, 0x00, /* Logical Minimum (0), */
- 0x25, 0x01, /* Logical Maximum (1), */
- 0x35, 0x00, /* Physical Minimum (0), */
- 0x45, 0x01, /* Physical Maximum (1), */
- 0x05, 0x09, /* Usage Page (Button), */
- 0x19, 0x01, /* Usage Minimum (01h), */
- 0x29, 0x13, /* Usage Maximum (13h), */
- 0x81, 0x02, /* Input (Variable), */
- 0x75, 0x01, /* Report Size (1), */
- 0x95, 0x0D, /* Report Count (13), */
- 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */
- 0x81, 0x03, /* Input (Constant, Variable), */
- 0x15, 0x00, /* Logical Minimum (0), */
- 0x26, 0xFF, 0x00, /* Logical Maximum (255), */
- 0x05, 0x01, /* Usage Page (Desktop), */
- 0x09, 0x01, /* Usage (Pointer), */
- 0xA1, 0x00, /* Collection (Physical), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x02, /* Report Count (2), */
- 0x35, 0x00, /* Physical Minimum (0), */
- 0x46, 0xFF, 0x00, /* Physical Maximum (255), */
- 0x09, 0x30, /* Usage (X), */
- 0x09, 0x31, /* Usage (Y), */
- 0x81, 0x02, /* Input (Variable), */
- 0xC0, /* End Collection, */
- 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */
- 0x95, 0x06, /* Report Count (6), */
- 0x81, 0x03, /* Input (Constant, Variable), */
- 0x05, 0x01, /* Usage Page (Desktop), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x05, /* Report Count (5), */
- 0x09, 0x01, /* Usage (Pointer), */
- 0x81, 0x02, /* Input (Variable), */
- 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */
- 0x95, 0x01, /* Report Count (1), */
- 0x81, 0x02, /* Input (Variable), */
- 0x05, 0x01, /* Usage Page (Desktop), */
- 0x95, 0x01, /* Report Count (1), */
- 0x09, 0x01, /* Usage (Pointer), */
- 0x81, 0x02, /* Input (Variable), */
- 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */
- 0x95, 0x1E, /* Report Count (24), */
- 0x81, 0x02, /* Input (Variable), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x30, /* Report Count (48), */
- 0x09, 0x01, /* Usage (Pointer), */
- 0x91, 0x02, /* Output (Variable), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x30, /* Report Count (48), */
- 0x09, 0x01, /* Usage (Pointer), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0xC0, /* End Collection, */
- 0xA1, 0x02, /* Collection (Logical), */
- 0x85, 0x02, /* Report ID (2), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x30, /* Report Count (48), */
- 0x09, 0x01, /* Usage (Pointer), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0xC0, /* End Collection, */
- 0xA1, 0x02, /* Collection (Logical), */
- 0x85, 0xEE, /* Report ID (238), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x30, /* Report Count (48), */
- 0x09, 0x01, /* Usage (Pointer), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0xC0, /* End Collection, */
- 0xA1, 0x02, /* Collection (Logical), */
- 0x85, 0xEF, /* Report ID (239), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x30, /* Report Count (48), */
- 0x09, 0x01, /* Usage (Pointer), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0xC0, /* End Collection, */
- 0xC0 /* End Collection */
-};
-
-/*
- * The default descriptor doesn't provide mapping for the accelerometers
- * or orientation sensors. This fixed descriptor maps the accelerometers
- * to usage values 0x40, 0x41 and 0x42 and maps the orientation sensors
- * to usage values 0x43, 0x44 and 0x45.
- */
-static u8 dualshock4_usb_rdesc[] = {
- 0x05, 0x01, /* Usage Page (Desktop), */
- 0x09, 0x05, /* Usage (Gamepad), */
- 0xA1, 0x01, /* Collection (Application), */
- 0x85, 0x01, /* Report ID (1), */
- 0x09, 0x30, /* Usage (X), */
- 0x09, 0x31, /* Usage (Y), */
- 0x09, 0x32, /* Usage (Z), */
- 0x09, 0x35, /* Usage (Rz), */
- 0x15, 0x00, /* Logical Minimum (0), */
- 0x26, 0xFF, 0x00, /* Logical Maximum (255), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x04, /* Report Count (4), */
- 0x81, 0x02, /* Input (Variable), */
- 0x09, 0x39, /* Usage (Hat Switch), */
- 0x15, 0x00, /* Logical Minimum (0), */
- 0x25, 0x07, /* Logical Maximum (7), */
- 0x35, 0x00, /* Physical Minimum (0), */
- 0x46, 0x3B, 0x01, /* Physical Maximum (315), */
- 0x65, 0x14, /* Unit (Degrees), */
- 0x75, 0x04, /* Report Size (4), */
- 0x95, 0x01, /* Report Count (1), */
- 0x81, 0x42, /* Input (Variable, Null State), */
- 0x65, 0x00, /* Unit, */
- 0x05, 0x09, /* Usage Page (Button), */
- 0x19, 0x01, /* Usage Minimum (01h), */
- 0x29, 0x0E, /* Usage Maximum (0Eh), */
- 0x15, 0x00, /* Logical Minimum (0), */
- 0x25, 0x01, /* Logical Maximum (1), */
- 0x75, 0x01, /* Report Size (1), */
- 0x95, 0x0E, /* Report Count (14), */
- 0x81, 0x02, /* Input (Variable), */
- 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */
- 0x09, 0x20, /* Usage (20h), */
- 0x75, 0x06, /* Report Size (6), */
- 0x95, 0x01, /* Report Count (1), */
- 0x15, 0x00, /* Logical Minimum (0), */
- 0x25, 0x3F, /* Logical Maximum (63), */
- 0x81, 0x02, /* Input (Variable), */
- 0x05, 0x01, /* Usage Page (Desktop), */
- 0x09, 0x33, /* Usage (Rx), */
- 0x09, 0x34, /* Usage (Ry), */
- 0x15, 0x00, /* Logical Minimum (0), */
- 0x26, 0xFF, 0x00, /* Logical Maximum (255), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x02, /* Report Count (2), */
- 0x81, 0x02, /* Input (Variable), */
- 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */
- 0x09, 0x21, /* Usage (21h), */
- 0x95, 0x03, /* Report Count (3), */
- 0x81, 0x02, /* Input (Variable), */
- 0x05, 0x01, /* Usage Page (Desktop), */
- 0x19, 0x40, /* Usage Minimum (40h), */
- 0x29, 0x42, /* Usage Maximum (42h), */
- 0x16, 0x00, 0x80, /* Logical Minimum (-32768), */
- 0x26, 0x00, 0x7F, /* Logical Maximum (32767), */
- 0x75, 0x10, /* Report Size (16), */
- 0x95, 0x03, /* Report Count (3), */
- 0x81, 0x02, /* Input (Variable), */
- 0x19, 0x43, /* Usage Minimum (43h), */
- 0x29, 0x45, /* Usage Maximum (45h), */
- 0x16, 0x00, 0xE0, /* Logical Minimum (-8192), */
- 0x26, 0xFF, 0x1F, /* Logical Maximum (8191), */
- 0x95, 0x03, /* Report Count (3), */
- 0x81, 0x02, /* Input (Variable), */
- 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */
- 0x09, 0x21, /* Usage (21h), */
- 0x15, 0x00, /* Logical Minimum (0), */
- 0x26, 0xFF, 0x00, /* Logical Maximum (255), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x27, /* Report Count (39), */
- 0x81, 0x02, /* Input (Variable), */
- 0x85, 0x05, /* Report ID (5), */
- 0x09, 0x22, /* Usage (22h), */
- 0x95, 0x1F, /* Report Count (31), */
- 0x91, 0x02, /* Output (Variable), */
- 0x85, 0x04, /* Report ID (4), */
- 0x09, 0x23, /* Usage (23h), */
- 0x95, 0x24, /* Report Count (36), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x02, /* Report ID (2), */
- 0x09, 0x24, /* Usage (24h), */
- 0x95, 0x24, /* Report Count (36), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x08, /* Report ID (8), */
- 0x09, 0x25, /* Usage (25h), */
- 0x95, 0x03, /* Report Count (3), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x10, /* Report ID (16), */
- 0x09, 0x26, /* Usage (26h), */
- 0x95, 0x04, /* Report Count (4), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x11, /* Report ID (17), */
- 0x09, 0x27, /* Usage (27h), */
- 0x95, 0x02, /* Report Count (2), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x12, /* Report ID (18), */
- 0x06, 0x02, 0xFF, /* Usage Page (FF02h), */
- 0x09, 0x21, /* Usage (21h), */
- 0x95, 0x0F, /* Report Count (15), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x13, /* Report ID (19), */
- 0x09, 0x22, /* Usage (22h), */
- 0x95, 0x16, /* Report Count (22), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x14, /* Report ID (20), */
- 0x06, 0x05, 0xFF, /* Usage Page (FF05h), */
- 0x09, 0x20, /* Usage (20h), */
- 0x95, 0x10, /* Report Count (16), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x15, /* Report ID (21), */
- 0x09, 0x21, /* Usage (21h), */
- 0x95, 0x2C, /* Report Count (44), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x06, 0x80, 0xFF, /* Usage Page (FF80h), */
- 0x85, 0x80, /* Report ID (128), */
- 0x09, 0x20, /* Usage (20h), */
- 0x95, 0x06, /* Report Count (6), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x81, /* Report ID (129), */
- 0x09, 0x21, /* Usage (21h), */
- 0x95, 0x06, /* Report Count (6), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x82, /* Report ID (130), */
- 0x09, 0x22, /* Usage (22h), */
- 0x95, 0x05, /* Report Count (5), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x83, /* Report ID (131), */
- 0x09, 0x23, /* Usage (23h), */
- 0x95, 0x01, /* Report Count (1), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x84, /* Report ID (132), */
- 0x09, 0x24, /* Usage (24h), */
- 0x95, 0x04, /* Report Count (4), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x85, /* Report ID (133), */
- 0x09, 0x25, /* Usage (25h), */
- 0x95, 0x06, /* Report Count (6), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x86, /* Report ID (134), */
- 0x09, 0x26, /* Usage (26h), */
- 0x95, 0x06, /* Report Count (6), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x87, /* Report ID (135), */
- 0x09, 0x27, /* Usage (27h), */
- 0x95, 0x23, /* Report Count (35), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x88, /* Report ID (136), */
- 0x09, 0x28, /* Usage (28h), */
- 0x95, 0x22, /* Report Count (34), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x89, /* Report ID (137), */
- 0x09, 0x29, /* Usage (29h), */
- 0x95, 0x02, /* Report Count (2), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x90, /* Report ID (144), */
- 0x09, 0x30, /* Usage (30h), */
- 0x95, 0x05, /* Report Count (5), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x91, /* Report ID (145), */
- 0x09, 0x31, /* Usage (31h), */
- 0x95, 0x03, /* Report Count (3), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x92, /* Report ID (146), */
- 0x09, 0x32, /* Usage (32h), */
- 0x95, 0x03, /* Report Count (3), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x93, /* Report ID (147), */
- 0x09, 0x33, /* Usage (33h), */
- 0x95, 0x0C, /* Report Count (12), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xA0, /* Report ID (160), */
- 0x09, 0x40, /* Usage (40h), */
- 0x95, 0x06, /* Report Count (6), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xA1, /* Report ID (161), */
- 0x09, 0x41, /* Usage (41h), */
- 0x95, 0x01, /* Report Count (1), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xA2, /* Report ID (162), */
- 0x09, 0x42, /* Usage (42h), */
- 0x95, 0x01, /* Report Count (1), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xA3, /* Report ID (163), */
- 0x09, 0x43, /* Usage (43h), */
- 0x95, 0x30, /* Report Count (48), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xA4, /* Report ID (164), */
- 0x09, 0x44, /* Usage (44h), */
- 0x95, 0x0D, /* Report Count (13), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xA5, /* Report ID (165), */
- 0x09, 0x45, /* Usage (45h), */
- 0x95, 0x15, /* Report Count (21), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xA6, /* Report ID (166), */
- 0x09, 0x46, /* Usage (46h), */
- 0x95, 0x15, /* Report Count (21), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xF0, /* Report ID (240), */
- 0x09, 0x47, /* Usage (47h), */
- 0x95, 0x3F, /* Report Count (63), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xF1, /* Report ID (241), */
- 0x09, 0x48, /* Usage (48h), */
- 0x95, 0x3F, /* Report Count (63), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xF2, /* Report ID (242), */
- 0x09, 0x49, /* Usage (49h), */
- 0x95, 0x0F, /* Report Count (15), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xA7, /* Report ID (167), */
- 0x09, 0x4A, /* Usage (4Ah), */
- 0x95, 0x01, /* Report Count (1), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xA8, /* Report ID (168), */
- 0x09, 0x4B, /* Usage (4Bh), */
- 0x95, 0x01, /* Report Count (1), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xA9, /* Report ID (169), */
- 0x09, 0x4C, /* Usage (4Ch), */
- 0x95, 0x08, /* Report Count (8), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xAA, /* Report ID (170), */
- 0x09, 0x4E, /* Usage (4Eh), */
- 0x95, 0x01, /* Report Count (1), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xAB, /* Report ID (171), */
- 0x09, 0x4F, /* Usage (4Fh), */
- 0x95, 0x39, /* Report Count (57), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xAC, /* Report ID (172), */
- 0x09, 0x50, /* Usage (50h), */
- 0x95, 0x39, /* Report Count (57), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xAD, /* Report ID (173), */
- 0x09, 0x51, /* Usage (51h), */
- 0x95, 0x0B, /* Report Count (11), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xAE, /* Report ID (174), */
- 0x09, 0x52, /* Usage (52h), */
- 0x95, 0x01, /* Report Count (1), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xAF, /* Report ID (175), */
- 0x09, 0x53, /* Usage (53h), */
- 0x95, 0x02, /* Report Count (2), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xB0, /* Report ID (176), */
- 0x09, 0x54, /* Usage (54h), */
- 0x95, 0x3F, /* Report Count (63), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0xC0 /* End Collection */
-};
-
-/*
- * The default behavior of the Dualshock 4 is to send reports using report
- * type 1 when running over Bluetooth. However, when feature report 2 is
- * requested during the controller initialization it starts sending input
- * reports in report 17. Since report 17 is undefined in the default HID
- * descriptor the button and axis definitions must be moved to report 17 or
- * the HID layer won't process the received input.
- */
-static u8 dualshock4_bt_rdesc[] = {
- 0x05, 0x01, /* Usage Page (Desktop), */
- 0x09, 0x05, /* Usage (Gamepad), */
- 0xA1, 0x01, /* Collection (Application), */
- 0x85, 0x01, /* Report ID (1), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x0A, /* Report Count (9), */
- 0x81, 0x02, /* Input (Variable), */
- 0x06, 0x04, 0xFF, /* Usage Page (FF04h), */
- 0x85, 0x02, /* Report ID (2), */
- 0x09, 0x24, /* Usage (24h), */
- 0x95, 0x24, /* Report Count (36), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xA3, /* Report ID (163), */
- 0x09, 0x25, /* Usage (25h), */
- 0x95, 0x30, /* Report Count (48), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x05, /* Report ID (5), */
- 0x09, 0x26, /* Usage (26h), */
- 0x95, 0x28, /* Report Count (40), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x06, /* Report ID (6), */
- 0x09, 0x27, /* Usage (27h), */
- 0x95, 0x34, /* Report Count (52), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x07, /* Report ID (7), */
- 0x09, 0x28, /* Usage (28h), */
- 0x95, 0x30, /* Report Count (48), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x08, /* Report ID (8), */
- 0x09, 0x29, /* Usage (29h), */
- 0x95, 0x2F, /* Report Count (47), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x06, 0x03, 0xFF, /* Usage Page (FF03h), */
- 0x85, 0x03, /* Report ID (3), */
- 0x09, 0x21, /* Usage (21h), */
- 0x95, 0x26, /* Report Count (38), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x04, /* Report ID (4), */
- 0x09, 0x22, /* Usage (22h), */
- 0x95, 0x2E, /* Report Count (46), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xF0, /* Report ID (240), */
- 0x09, 0x47, /* Usage (47h), */
- 0x95, 0x3F, /* Report Count (63), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xF1, /* Report ID (241), */
- 0x09, 0x48, /* Usage (48h), */
- 0x95, 0x3F, /* Report Count (63), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xF2, /* Report ID (242), */
- 0x09, 0x49, /* Usage (49h), */
- 0x95, 0x0F, /* Report Count (15), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x11, /* Report ID (17), */
- 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */
- 0x09, 0x20, /* Usage (20h), */
- 0x95, 0x02, /* Report Count (2), */
- 0x81, 0x02, /* Input (Variable), */
- 0x05, 0x01, /* Usage Page (Desktop), */
- 0x09, 0x30, /* Usage (X), */
- 0x09, 0x31, /* Usage (Y), */
- 0x09, 0x32, /* Usage (Z), */
- 0x09, 0x35, /* Usage (Rz), */
- 0x15, 0x00, /* Logical Minimum (0), */
- 0x26, 0xFF, 0x00, /* Logical Maximum (255), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x04, /* Report Count (4), */
- 0x81, 0x02, /* Input (Variable), */
- 0x09, 0x39, /* Usage (Hat Switch), */
- 0x15, 0x00, /* Logical Minimum (0), */
- 0x25, 0x07, /* Logical Maximum (7), */
- 0x75, 0x04, /* Report Size (4), */
- 0x95, 0x01, /* Report Count (1), */
- 0x81, 0x42, /* Input (Variable, Null State), */
- 0x05, 0x09, /* Usage Page (Button), */
- 0x19, 0x01, /* Usage Minimum (01h), */
- 0x29, 0x0E, /* Usage Maximum (0Eh), */
- 0x15, 0x00, /* Logical Minimum (0), */
- 0x25, 0x01, /* Logical Maximum (1), */
- 0x75, 0x01, /* Report Size (1), */
- 0x95, 0x0E, /* Report Count (14), */
- 0x81, 0x02, /* Input (Variable), */
- 0x75, 0x06, /* Report Size (6), */
- 0x95, 0x01, /* Report Count (1), */
- 0x81, 0x01, /* Input (Constant), */
- 0x05, 0x01, /* Usage Page (Desktop), */
- 0x09, 0x33, /* Usage (Rx), */
- 0x09, 0x34, /* Usage (Ry), */
- 0x15, 0x00, /* Logical Minimum (0), */
- 0x26, 0xFF, 0x00, /* Logical Maximum (255), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x02, /* Report Count (2), */
- 0x81, 0x02, /* Input (Variable), */
- 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */
- 0x09, 0x20, /* Usage (20h), */
- 0x95, 0x03, /* Report Count (3), */
- 0x81, 0x02, /* Input (Variable), */
- 0x05, 0x01, /* Usage Page (Desktop), */
- 0x19, 0x40, /* Usage Minimum (40h), */
- 0x29, 0x42, /* Usage Maximum (42h), */
- 0x16, 0x00, 0x80, /* Logical Minimum (-32768), */
- 0x26, 0x00, 0x7F, /* Logical Maximum (32767), */
- 0x75, 0x10, /* Report Size (16), */
- 0x95, 0x03, /* Report Count (3), */
- 0x81, 0x02, /* Input (Variable), */
- 0x19, 0x43, /* Usage Minimum (43h), */
- 0x29, 0x45, /* Usage Maximum (45h), */
- 0x16, 0x00, 0xE0, /* Logical Minimum (-8192), */
- 0x26, 0xFF, 0x1F, /* Logical Maximum (8191), */
- 0x95, 0x03, /* Report Count (3), */
- 0x81, 0x02, /* Input (Variable), */
- 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */
- 0x09, 0x20, /* Usage (20h), */
- 0x15, 0x00, /* Logical Minimum (0), */
- 0x26, 0xFF, 0x00, /* Logical Maximum (255), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x31, /* Report Count (51), */
- 0x81, 0x02, /* Input (Variable), */
- 0x09, 0x21, /* Usage (21h), */
- 0x75, 0x08, /* Report Size (8), */
- 0x95, 0x4D, /* Report Count (77), */
- 0x91, 0x02, /* Output (Variable), */
- 0x85, 0x12, /* Report ID (18), */
- 0x09, 0x22, /* Usage (22h), */
- 0x95, 0x8D, /* Report Count (141), */
- 0x81, 0x02, /* Input (Variable), */
- 0x09, 0x23, /* Usage (23h), */
- 0x91, 0x02, /* Output (Variable), */
- 0x85, 0x13, /* Report ID (19), */
- 0x09, 0x24, /* Usage (24h), */
- 0x95, 0xCD, /* Report Count (205), */
- 0x81, 0x02, /* Input (Variable), */
- 0x09, 0x25, /* Usage (25h), */
- 0x91, 0x02, /* Output (Variable), */
- 0x85, 0x14, /* Report ID (20), */
- 0x09, 0x26, /* Usage (26h), */
- 0x96, 0x0D, 0x01, /* Report Count (269), */
- 0x81, 0x02, /* Input (Variable), */
- 0x09, 0x27, /* Usage (27h), */
- 0x91, 0x02, /* Output (Variable), */
- 0x85, 0x15, /* Report ID (21), */
- 0x09, 0x28, /* Usage (28h), */
- 0x96, 0x4D, 0x01, /* Report Count (333), */
- 0x81, 0x02, /* Input (Variable), */
- 0x09, 0x29, /* Usage (29h), */
- 0x91, 0x02, /* Output (Variable), */
- 0x85, 0x16, /* Report ID (22), */
- 0x09, 0x2A, /* Usage (2Ah), */
- 0x96, 0x8D, 0x01, /* Report Count (397), */
- 0x81, 0x02, /* Input (Variable), */
- 0x09, 0x2B, /* Usage (2Bh), */
- 0x91, 0x02, /* Output (Variable), */
- 0x85, 0x17, /* Report ID (23), */
- 0x09, 0x2C, /* Usage (2Ch), */
- 0x96, 0xCD, 0x01, /* Report Count (461), */
- 0x81, 0x02, /* Input (Variable), */
- 0x09, 0x2D, /* Usage (2Dh), */
- 0x91, 0x02, /* Output (Variable), */
- 0x85, 0x18, /* Report ID (24), */
- 0x09, 0x2E, /* Usage (2Eh), */
- 0x96, 0x0D, 0x02, /* Report Count (525), */
- 0x81, 0x02, /* Input (Variable), */
- 0x09, 0x2F, /* Usage (2Fh), */
- 0x91, 0x02, /* Output (Variable), */
- 0x85, 0x19, /* Report ID (25), */
- 0x09, 0x30, /* Usage (30h), */
- 0x96, 0x22, 0x02, /* Report Count (546), */
- 0x81, 0x02, /* Input (Variable), */
- 0x09, 0x31, /* Usage (31h), */
- 0x91, 0x02, /* Output (Variable), */
- 0x06, 0x80, 0xFF, /* Usage Page (FF80h), */
- 0x85, 0x82, /* Report ID (130), */
- 0x09, 0x22, /* Usage (22h), */
- 0x95, 0x3F, /* Report Count (63), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x83, /* Report ID (131), */
- 0x09, 0x23, /* Usage (23h), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x84, /* Report ID (132), */
- 0x09, 0x24, /* Usage (24h), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x90, /* Report ID (144), */
- 0x09, 0x30, /* Usage (30h), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x91, /* Report ID (145), */
- 0x09, 0x31, /* Usage (31h), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x92, /* Report ID (146), */
- 0x09, 0x32, /* Usage (32h), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0x93, /* Report ID (147), */
- 0x09, 0x33, /* Usage (33h), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xA0, /* Report ID (160), */
- 0x09, 0x40, /* Usage (40h), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0x85, 0xA4, /* Report ID (164), */
- 0x09, 0x44, /* Usage (44h), */
- 0xB1, 0x02, /* Feature (Variable), */
- 0xC0 /* End Collection */
-};
-
static u8 ps3remote_rdesc[] = {
0x05, 0x01, /* GUsagePage Generic Desktop */
0x09, 0x05, /* LUsage 0x05 [Game Pad] */
@@ -975,6 +335,97 @@ static const unsigned int buzz_keymap[] = {
[20] = BTN_TRIGGER_HAPPY20,
};
+/* The Navigation controller is a partial DS3 and uses the same HID report
+ * and hence the same keymap indices, however not not all axes/buttons
+ * are physically present. We use the same axis and button mapping as
+ * the DS3, which uses the Linux gamepad spec.
+ */
+static const unsigned int navigation_absmap[] = {
+ [0x30] = ABS_X,
+ [0x31] = ABS_Y,
+ [0x33] = ABS_Z, /* L2 */
+};
+
+/* Buttons not physically available on the device, but still available
+ * in the reports are explicitly set to 0 for documentation purposes.
+ */
+static const unsigned int navigation_keymap[] = {
+ [0x01] = 0, /* Select */
+ [0x02] = BTN_THUMBL, /* L3 */
+ [0x03] = 0, /* R3 */
+ [0x04] = 0, /* Start */
+ [0x05] = BTN_DPAD_UP, /* Up */
+ [0x06] = BTN_DPAD_RIGHT, /* Right */
+ [0x07] = BTN_DPAD_DOWN, /* Down */
+ [0x08] = BTN_DPAD_LEFT, /* Left */
+ [0x09] = BTN_TL2, /* L2 */
+ [0x0a] = 0, /* R2 */
+ [0x0b] = BTN_TL, /* L1 */
+ [0x0c] = 0, /* R1 */
+ [0x0d] = BTN_NORTH, /* Triangle */
+ [0x0e] = BTN_EAST, /* Circle */
+ [0x0f] = BTN_SOUTH, /* Cross */
+ [0x10] = BTN_WEST, /* Square */
+ [0x11] = BTN_MODE, /* PS */
+};
+
+static const unsigned int sixaxis_absmap[] = {
+ [0x30] = ABS_X,
+ [0x31] = ABS_Y,
+ [0x32] = ABS_RX, /* right stick X */
+ [0x35] = ABS_RY, /* right stick Y */
+};
+
+static const unsigned int sixaxis_keymap[] = {
+ [0x01] = BTN_SELECT, /* Select */
+ [0x02] = BTN_THUMBL, /* L3 */
+ [0x03] = BTN_THUMBR, /* R3 */
+ [0x04] = BTN_START, /* Start */
+ [0x05] = BTN_DPAD_UP, /* Up */
+ [0x06] = BTN_DPAD_RIGHT, /* Right */
+ [0x07] = BTN_DPAD_DOWN, /* Down */
+ [0x08] = BTN_DPAD_LEFT, /* Left */
+ [0x09] = BTN_TL2, /* L2 */
+ [0x0a] = BTN_TR2, /* R2 */
+ [0x0b] = BTN_TL, /* L1 */
+ [0x0c] = BTN_TR, /* R1 */
+ [0x0d] = BTN_NORTH, /* Triangle */
+ [0x0e] = BTN_EAST, /* Circle */
+ [0x0f] = BTN_SOUTH, /* Cross */
+ [0x10] = BTN_WEST, /* Square */
+ [0x11] = BTN_MODE, /* PS */
+};
+
+static const unsigned int ds4_absmap[] = {
+ [0x30] = ABS_X,
+ [0x31] = ABS_Y,
+ [0x32] = ABS_RX, /* right stick X */
+ [0x33] = ABS_Z, /* L2 */
+ [0x34] = ABS_RZ, /* R2 */
+ [0x35] = ABS_RY, /* right stick Y */
+};
+
+static const unsigned int ds4_keymap[] = {
+ [0x1] = BTN_WEST, /* Square */
+ [0x2] = BTN_SOUTH, /* Cross */
+ [0x3] = BTN_EAST, /* Circle */
+ [0x4] = BTN_NORTH, /* Triangle */
+ [0x5] = BTN_TL, /* L1 */
+ [0x6] = BTN_TR, /* R1 */
+ [0x7] = BTN_TL2, /* L2 */
+ [0x8] = BTN_TR2, /* R2 */
+ [0x9] = BTN_SELECT, /* Share */
+ [0xa] = BTN_START, /* Options */
+ [0xb] = BTN_THUMBL, /* L3 */
+ [0xc] = BTN_THUMBR, /* R3 */
+ [0xd] = BTN_MODE, /* PS */
+};
+
+static const struct {int x; int y; } ds4_hat_mapping[] = {
+ {0, -1}, {1, -1}, {1, 0}, {1, 1}, {0, 1}, {-1, 1}, {-1, 0}, {-1, -1},
+ {0, 0}
+};
+
static enum power_supply_property sony_battery_props[] = {
POWER_SUPPLY_PROP_PRESENT,
POWER_SUPPLY_PROP_CAPACITY,
@@ -1019,24 +470,75 @@ struct motion_output_report_02 {
u8 rumble;
};
-#define DS4_REPORT_0x02_SIZE 37
-#define DS4_REPORT_0x05_SIZE 32
-#define DS4_REPORT_0x11_SIZE 78
-#define DS4_REPORT_0x81_SIZE 7
+#define DS4_FEATURE_REPORT_0x02_SIZE 37
+#define DS4_FEATURE_REPORT_0x05_SIZE 41
+#define DS4_FEATURE_REPORT_0x81_SIZE 7
+#define DS4_INPUT_REPORT_0x11_SIZE 78
+#define DS4_OUTPUT_REPORT_0x05_SIZE 32
+#define DS4_OUTPUT_REPORT_0x11_SIZE 78
#define SIXAXIS_REPORT_0xF2_SIZE 17
#define SIXAXIS_REPORT_0xF5_SIZE 8
#define MOTION_REPORT_0x02_SIZE 49
+/* Offsets relative to USB input report (0x1). Bluetooth (0x11) requires an
+ * additional +2.
+ */
+#define DS4_INPUT_REPORT_AXIS_OFFSET 1
+#define DS4_INPUT_REPORT_BUTTON_OFFSET 5
+#define DS4_INPUT_REPORT_TIMESTAMP_OFFSET 10
+#define DS4_INPUT_REPORT_GYRO_X_OFFSET 13
+#define DS4_INPUT_REPORT_BATTERY_OFFSET 30
+#define DS4_INPUT_REPORT_TOUCHPAD_OFFSET 33
+
+#define SENSOR_SUFFIX " Motion Sensors"
+#define DS4_TOUCHPAD_SUFFIX " Touchpad"
+
+/* Default to 4ms poll interval, which is same as USB (not adjustable). */
+#define DS4_BT_DEFAULT_POLL_INTERVAL_MS 4
+#define DS4_BT_MAX_POLL_INTERVAL_MS 62
+#define DS4_GYRO_RES_PER_DEG_S 1024
+#define DS4_ACC_RES_PER_G 8192
+
+#define SIXAXIS_INPUT_REPORT_ACC_X_OFFSET 41
+#define SIXAXIS_ACC_RES_PER_G 113
+
static DEFINE_SPINLOCK(sony_dev_list_lock);
static LIST_HEAD(sony_device_list);
static DEFINE_IDA(sony_device_id_allocator);
+/* Used for calibration of DS4 accelerometer and gyro. */
+struct ds4_calibration_data {
+ int abs_code;
+ short bias;
+ /* Calibration requires scaling against a sensitivity value, which is a
+ * float. Store sensitivity as a fraction to limit floating point
+ * calculations until final calibration.
+ */
+ int sens_numer;
+ int sens_denom;
+};
+
+enum ds4_dongle_state {
+ DONGLE_DISCONNECTED,
+ DONGLE_CALIBRATING,
+ DONGLE_CONNECTED,
+ DONGLE_DISABLED
+};
+
+enum sony_worker {
+ SONY_WORKER_STATE,
+ SONY_WORKER_HOTPLUG
+};
+
struct sony_sc {
spinlock_t lock;
struct list_head list_node;
struct hid_device *hdev;
+ struct input_dev *touchpad;
+ struct input_dev *sensor_dev;
struct led_classdev *leds[MAX_LEDS];
unsigned long quirks;
+ struct work_struct hotplug_worker;
struct work_struct state_worker;
void (*send_output_report)(struct sony_sc *);
struct power_supply *battery;
@@ -1050,7 +552,8 @@ struct sony_sc {
#endif
u8 mac_address[6];
- u8 worker_initialized;
+ u8 hotplug_worker_initialized;
+ u8 state_worker_initialized;
u8 defer_initialization;
u8 cable_state;
u8 battery_charging;
@@ -1059,33 +562,77 @@ struct sony_sc {
u8 led_delay_on[MAX_LEDS];
u8 led_delay_off[MAX_LEDS];
u8 led_count;
+
+ bool timestamp_initialized;
+ u16 prev_timestamp;
+ unsigned int timestamp_us;
+
+ u8 ds4_bt_poll_interval;
+ enum ds4_dongle_state ds4_dongle_state;
+ /* DS4 calibration data */
+ struct ds4_calibration_data ds4_calib_data[6];
};
-static inline void sony_schedule_work(struct sony_sc *sc)
+static void sony_set_leds(struct sony_sc *sc);
+
+static inline void sony_schedule_work(struct sony_sc *sc,
+ enum sony_worker which)
{
- if (!sc->defer_initialization)
- schedule_work(&sc->state_worker);
+ switch (which) {
+ case SONY_WORKER_STATE:
+ if (!sc->defer_initialization)
+ schedule_work(&sc->state_worker);
+ break;
+ case SONY_WORKER_HOTPLUG:
+ if (sc->hotplug_worker_initialized)
+ schedule_work(&sc->hotplug_worker);
+ break;
+ }
}
-static u8 *sixaxis_fixup(struct hid_device *hdev, u8 *rdesc,
- unsigned int *rsize)
+static ssize_t ds4_show_poll_interval(struct device *dev,
+ struct device_attribute
+ *attr, char *buf)
{
- *rsize = sizeof(sixaxis_rdesc);
- return sixaxis_rdesc;
+ struct hid_device *hdev = to_hid_device(dev);
+ struct sony_sc *sc = hid_get_drvdata(hdev);
+
+ return snprintf(buf, PAGE_SIZE, "%i\n", sc->ds4_bt_poll_interval);
}
-static u8 *motion_fixup(struct hid_device *hdev, u8 *rdesc,
- unsigned int *rsize)
+static ssize_t ds4_store_poll_interval(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
- *rsize = sizeof(motion_rdesc);
- return motion_rdesc;
+ struct hid_device *hdev = to_hid_device(dev);
+ struct sony_sc *sc = hid_get_drvdata(hdev);
+ unsigned long flags;
+ u8 interval;
+
+ if (kstrtou8(buf, 0, &interval))
+ return -EINVAL;
+
+ if (interval > DS4_BT_MAX_POLL_INTERVAL_MS)
+ return -EINVAL;
+
+ spin_lock_irqsave(&sc->lock, flags);
+ sc->ds4_bt_poll_interval = interval;
+ spin_unlock_irqrestore(&sc->lock, flags);
+
+ sony_schedule_work(sc, SONY_WORKER_STATE);
+
+ return count;
}
-static u8 *navigation_fixup(struct hid_device *hdev, u8 *rdesc,
+static DEVICE_ATTR(bt_poll_interval, 0644, ds4_show_poll_interval,
+ ds4_store_poll_interval);
+
+
+static u8 *motion_fixup(struct hid_device *hdev, u8 *rdesc,
unsigned int *rsize)
{
- *rsize = sizeof(navigation_rdesc);
- return navigation_rdesc;
+ *rsize = sizeof(motion_rdesc);
+ return motion_rdesc;
}
static u8 *ps3remote_fixup(struct hid_device *hdev, u8 *rdesc,
@@ -1129,6 +676,133 @@ static int ps3remote_mapping(struct hid_device *hdev, struct hid_input *hi,
return 1;
}
+static int navigation_mapping(struct hid_device *hdev, struct hid_input *hi,
+ struct hid_field *field, struct hid_usage *usage,
+ unsigned long **bit, int *max)
+{
+ if ((usage->hid & HID_USAGE_PAGE) == HID_UP_BUTTON) {
+ unsigned int key = usage->hid & HID_USAGE;
+
+ if (key >= ARRAY_SIZE(sixaxis_keymap))
+ return -1;
+
+ key = navigation_keymap[key];
+ if (!key)
+ return -1;
+
+ hid_map_usage_clear(hi, usage, bit, max, EV_KEY, key);
+ return 1;
+ } else if (usage->hid == HID_GD_POINTER) {
+ /* See comment in sixaxis_mapping, basically the L2 (and R2)
+ * triggers are reported through GD Pointer.
+ * In addition we ignore any analog button 'axes' and only
+ * support digital buttons.
+ */
+ switch (usage->usage_index) {
+ case 8: /* L2 */
+ usage->hid = HID_GD_Z;
+ break;
+ default:
+ return -1;
+ }
+
+ hid_map_usage_clear(hi, usage, bit, max, EV_ABS, usage->hid & 0xf);
+ return 1;
+ } else if ((usage->hid & HID_USAGE_PAGE) == HID_UP_GENDESK) {
+ unsigned int abs = usage->hid & HID_USAGE;
+
+ if (abs >= ARRAY_SIZE(navigation_absmap))
+ return -1;
+
+ abs = navigation_absmap[abs];
+
+ hid_map_usage_clear(hi, usage, bit, max, EV_ABS, abs);
+ return 1;
+ }
+
+ return -1;
+}
+
+
+static int sixaxis_mapping(struct hid_device *hdev, struct hid_input *hi,
+ struct hid_field *field, struct hid_usage *usage,
+ unsigned long **bit, int *max)
+{
+ if ((usage->hid & HID_USAGE_PAGE) == HID_UP_BUTTON) {
+ unsigned int key = usage->hid & HID_USAGE;
+
+ if (key >= ARRAY_SIZE(sixaxis_keymap))
+ return -1;
+
+ key = sixaxis_keymap[key];
+ hid_map_usage_clear(hi, usage, bit, max, EV_KEY, key);
+ return 1;
+ } else if (usage->hid == HID_GD_POINTER) {
+ /* The DS3 provides analog values for most buttons and even
+ * for HAT axes through GD Pointer. L2 and R2 are reported
+ * among these as well instead of as GD Z / RZ. Remap L2
+ * and R2 and ignore other analog 'button axes' as there is
+ * no good way for reporting them.
+ */
+ switch (usage->usage_index) {
+ case 8: /* L2 */
+ usage->hid = HID_GD_Z;
+ break;
+ case 9: /* R2 */
+ usage->hid = HID_GD_RZ;
+ break;
+ default:
+ return -1;
+ }
+
+ hid_map_usage_clear(hi, usage, bit, max, EV_ABS, usage->hid & 0xf);
+ return 1;
+ } else if ((usage->hid & HID_USAGE_PAGE) == HID_UP_GENDESK) {
+ unsigned int abs = usage->hid & HID_USAGE;
+
+ if (abs >= ARRAY_SIZE(sixaxis_absmap))
+ return -1;
+
+ abs = sixaxis_absmap[abs];
+
+ hid_map_usage_clear(hi, usage, bit, max, EV_ABS, abs);
+ return 1;
+ }
+
+ return -1;
+}
+
+static int ds4_mapping(struct hid_device *hdev, struct hid_input *hi,
+ struct hid_field *field, struct hid_usage *usage,
+ unsigned long **bit, int *max)
+{
+ if ((usage->hid & HID_USAGE_PAGE) == HID_UP_BUTTON) {
+ unsigned int key = usage->hid & HID_USAGE;
+
+ if (key >= ARRAY_SIZE(ds4_keymap))
+ return -1;
+
+ key = ds4_keymap[key];
+ hid_map_usage_clear(hi, usage, bit, max, EV_KEY, key);
+ return 1;
+ } else if ((usage->hid & HID_USAGE_PAGE) == HID_UP_GENDESK) {
+ unsigned int abs = usage->hid & HID_USAGE;
+
+ /* Let the HID parser deal with the HAT. */
+ if (usage->hid == HID_GD_HATSWITCH)
+ return 0;
+
+ if (abs >= ARRAY_SIZE(ds4_absmap))
+ return -1;
+
+ abs = ds4_absmap[abs];
+ hid_map_usage_clear(hi, usage, bit, max, EV_ABS, abs);
+ return 1;
+ }
+
+ return 0;
+}
+
static u8 *sony_report_fixup(struct hid_device *hdev, u8 *rdesc,
unsigned int *rsize)
{
@@ -1153,30 +827,9 @@ static u8 *sony_report_fixup(struct hid_device *hdev, u8 *rdesc,
rdesc[55] = 0x06;
}
- /*
- * The default Dualshock 4 USB descriptor doesn't assign
- * the gyroscope values to corresponding axes so we need a
- * modified one.
- */
- if (sc->quirks & DUALSHOCK4_CONTROLLER_USB) {
- hid_info(hdev, "Using modified Dualshock 4 report descriptor with gyroscope axes\n");
- rdesc = dualshock4_usb_rdesc;
- *rsize = sizeof(dualshock4_usb_rdesc);
- } else if (sc->quirks & DUALSHOCK4_CONTROLLER_BT) {
- hid_info(hdev, "Using modified Dualshock 4 Bluetooth report descriptor\n");
- rdesc = dualshock4_bt_rdesc;
- *rsize = sizeof(dualshock4_bt_rdesc);
- }
-
- if (sc->quirks & SIXAXIS_CONTROLLER)
- return sixaxis_fixup(hdev, rdesc, rsize);
-
if (sc->quirks & MOTION_CONTROLLER)
return motion_fixup(hdev, rdesc, rsize);
- if (sc->quirks & NAVIGATION_CONTROLLER)
- return navigation_fixup(hdev, rdesc, rsize);
-
if (sc->quirks & PS3REMOTE)
return ps3remote_fixup(hdev, rdesc, rsize);
@@ -1214,6 +867,23 @@ static void sixaxis_parse_report(struct sony_sc *sc, u8 *rd, int size)
sc->battery_capacity = battery_capacity;
sc->battery_charging = battery_charging;
spin_unlock_irqrestore(&sc->lock, flags);
+
+ if (sc->quirks & SIXAXIS_CONTROLLER) {
+ int val;
+
+ offset = SIXAXIS_INPUT_REPORT_ACC_X_OFFSET;
+ val = ((rd[offset+1] << 8) | rd[offset]) - 511;
+ input_report_abs(sc->sensor_dev, ABS_X, val);
+
+ /* Y and Z are swapped and inversed */
+ val = 511 - ((rd[offset+5] << 8) | rd[offset+4]);
+ input_report_abs(sc->sensor_dev, ABS_Y, val);
+
+ val = 511 - ((rd[offset+3] << 8) | rd[offset+2]);
+ input_report_abs(sc->sensor_dev, ABS_Z, val);
+
+ input_sync(sc->sensor_dev);
+ }
}
static void dualshock4_parse_report(struct sony_sc *sc, u8 *rd, int size)
@@ -1222,19 +892,111 @@ static void dualshock4_parse_report(struct sony_sc *sc, u8 *rd, int size)
struct hid_input, list);
struct input_dev *input_dev = hidinput->input;
unsigned long flags;
- int n, offset;
+ int n, m, offset, num_touch_data, max_touch_data;
u8 cable_state, battery_capacity, battery_charging;
+ u16 timestamp;
+
+ /* When using Bluetooth the header is 2 bytes longer, so skip these. */
+ int data_offset = (sc->quirks & DUALSHOCK4_CONTROLLER_BT) ? 2 : 0;
+
+ /* Second bit of third button byte is for the touchpad button. */
+ offset = data_offset + DS4_INPUT_REPORT_BUTTON_OFFSET;
+ input_report_key(sc->touchpad, BTN_LEFT, rd[offset+2] & 0x2);
/*
- * Battery and touchpad data starts at byte 30 in the USB report and
- * 32 in Bluetooth report.
+ * The default behavior of the Dualshock 4 is to send reports using
+ * report type 1 when running over Bluetooth. However, when feature
+ * report 2 is requested during the controller initialization it starts
+ * sending input reports in report 17. Since report 17 is undefined
+ * in the default HID descriptor, the HID layer won't generate events.
+ * While it is possible (and this was done before) to fixup the HID
+ * descriptor to add this mapping, it was better to do this manually.
+ * The reason is there were various pieces software both open and closed
+ * source, relying on the descriptors to be the same across various
+ * operating systems. If the descriptors wouldn't match some
+ * applications e.g. games on Wine would not be able to function due
+ * to different descriptors, which such applications are not parsing.
*/
- offset = (sc->quirks & DUALSHOCK4_CONTROLLER_USB) ? 30 : 32;
+ if (rd[0] == 17) {
+ int value;
+
+ offset = data_offset + DS4_INPUT_REPORT_AXIS_OFFSET;
+ input_report_abs(input_dev, ABS_X, rd[offset]);
+ input_report_abs(input_dev, ABS_Y, rd[offset+1]);
+ input_report_abs(input_dev, ABS_RX, rd[offset+2]);
+ input_report_abs(input_dev, ABS_RY, rd[offset+3]);
+
+ value = rd[offset+4] & 0xf;
+ if (value > 7)
+ value = 8; /* Center 0, 0 */
+ input_report_abs(input_dev, ABS_HAT0X, ds4_hat_mapping[value].x);
+ input_report_abs(input_dev, ABS_HAT0Y, ds4_hat_mapping[value].y);
+
+ input_report_key(input_dev, BTN_WEST, rd[offset+4] & 0x10);
+ input_report_key(input_dev, BTN_SOUTH, rd[offset+4] & 0x20);
+ input_report_key(input_dev, BTN_EAST, rd[offset+4] & 0x40);
+ input_report_key(input_dev, BTN_NORTH, rd[offset+4] & 0x80);
+
+ input_report_key(input_dev, BTN_TL, rd[offset+5] & 0x1);
+ input_report_key(input_dev, BTN_TR, rd[offset+5] & 0x2);
+ input_report_key(input_dev, BTN_TL2, rd[offset+5] & 0x4);
+ input_report_key(input_dev, BTN_TR2, rd[offset+5] & 0x8);
+ input_report_key(input_dev, BTN_SELECT, rd[offset+5] & 0x10);
+ input_report_key(input_dev, BTN_START, rd[offset+5] & 0x20);
+ input_report_key(input_dev, BTN_THUMBL, rd[offset+5] & 0x40);
+ input_report_key(input_dev, BTN_THUMBR, rd[offset+5] & 0x80);
+
+ input_report_key(input_dev, BTN_MODE, rd[offset+6] & 0x1);
+
+ input_report_abs(input_dev, ABS_Z, rd[offset+7]);
+ input_report_abs(input_dev, ABS_RZ, rd[offset+8]);
+
+ input_sync(input_dev);
+ }
+
+ /* Convert timestamp (in 5.33us unit) to timestamp_us */
+ offset = data_offset + DS4_INPUT_REPORT_TIMESTAMP_OFFSET;
+ timestamp = get_unaligned_le16(&rd[offset]);
+ if (!sc->timestamp_initialized) {
+ sc->timestamp_us = ((unsigned int)timestamp * 16) / 3;
+ sc->timestamp_initialized = true;
+ } else {
+ u16 delta;
+
+ if (sc->prev_timestamp > timestamp)
+ delta = (U16_MAX - sc->prev_timestamp + timestamp + 1);
+ else
+ delta = timestamp - sc->prev_timestamp;
+ sc->timestamp_us += (delta * 16) / 3;
+ }
+ sc->prev_timestamp = timestamp;
+ input_event(sc->sensor_dev, EV_MSC, MSC_TIMESTAMP, sc->timestamp_us);
+
+ offset = data_offset + DS4_INPUT_REPORT_GYRO_X_OFFSET;
+ for (n = 0; n < 6; n++) {
+ /* Store data in int for more precision during mult_frac. */
+ int raw_data = (short)((rd[offset+1] << 8) | rd[offset]);
+ struct ds4_calibration_data *calib = &sc->ds4_calib_data[n];
+
+ /* High precision is needed during calibration, but the
+ * calibrated values are within 32-bit.
+ * Note: we swap numerator 'x' and 'numer' in mult_frac for
+ * precision reasons so we don't need 64-bit.
+ */
+ int calib_data = mult_frac(calib->sens_numer,
+ raw_data - calib->bias,
+ calib->sens_denom);
+
+ input_report_abs(sc->sensor_dev, calib->abs_code, calib_data);
+ offset += 2;
+ }
+ input_sync(sc->sensor_dev);
/*
- * The lower 4 bits of byte 30 contain the battery level
+ * The lower 4 bits of byte 30 (or 32 for BT) contain the battery level
* and the 5th bit contains the USB cable state.
*/
+ offset = data_offset + DS4_INPUT_REPORT_BATTERY_OFFSET;
cable_state = (rd[offset] >> 4) & 0x01;
battery_capacity = rd[offset] & 0x0F;
@@ -1261,30 +1023,52 @@ static void dualshock4_parse_report(struct sony_sc *sc, u8 *rd, int size)
sc->battery_charging = battery_charging;
spin_unlock_irqrestore(&sc->lock, flags);
- offset += 5;
-
/*
- * The Dualshock 4 multi-touch trackpad data starts at offset 35 on USB
- * and 37 on Bluetooth.
- * The first 7 bits of the first byte is a counter and bit 8 is a touch
- * indicator that is 0 when pressed and 1 when not pressed.
- * The next 3 bytes are two 12 bit touch coordinates, X and Y.
- * The data for the second touch is in the same format and immediatly
- * follows the data for the first.
+ * The Dualshock 4 multi-touch trackpad data starts at offset 33 on USB
+ * and 35 on Bluetooth.
+ * The first byte indicates the number of touch data in the report.
+ * Trackpad data starts 2 bytes later (e.g. 35 for USB).
*/
- for (n = 0; n < 2; n++) {
- u16 x, y;
+ offset = data_offset + DS4_INPUT_REPORT_TOUCHPAD_OFFSET;
+ max_touch_data = (sc->quirks & DUALSHOCK4_CONTROLLER_BT) ? 4 : 3;
+ if (rd[offset] > 0 && rd[offset] <= max_touch_data)
+ num_touch_data = rd[offset];
+ else
+ num_touch_data = 1;
+ offset += 1;
- x = rd[offset+1] | ((rd[offset+2] & 0xF) << 8);
- y = ((rd[offset+2] & 0xF0) >> 4) | (rd[offset+3] << 4);
+ for (m = 0; m < num_touch_data; m++) {
+ /* Skip past timestamp */
+ offset += 1;
- input_mt_slot(input_dev, n);
- input_mt_report_slot_state(input_dev, MT_TOOL_FINGER,
- !(rd[offset] >> 7));
- input_report_abs(input_dev, ABS_MT_POSITION_X, x);
- input_report_abs(input_dev, ABS_MT_POSITION_Y, y);
+ /*
+ * The first 7 bits of the first byte is a counter and bit 8 is
+ * a touch indicator that is 0 when pressed and 1 when not
+ * pressed.
+ * The next 3 bytes are two 12 bit touch coordinates, X and Y.
+ * The data for the second touch is in the same format and
+ * immediately follows the data for the first.
+ */
+ for (n = 0; n < 2; n++) {
+ u16 x, y;
+ bool active;
- offset += 4;
+ x = rd[offset+1] | ((rd[offset+2] & 0xF) << 8);
+ y = ((rd[offset+2] & 0xF0) >> 4) | (rd[offset+3] << 4);
+
+ active = !(rd[offset] >> 7);
+ input_mt_slot(sc->touchpad, n);
+ input_mt_report_slot_state(sc->touchpad, MT_TOOL_FINGER, active);
+
+ if (active) {
+ input_report_abs(sc->touchpad, ABS_MT_POSITION_X, x);
+ input_report_abs(sc->touchpad, ABS_MT_POSITION_Y, y);
+ }
+
+ offset += 4;
+ }
+ input_mt_sync_frame(sc->touchpad);
+ input_sync(sc->touchpad);
}
}
@@ -1320,15 +1104,87 @@ static int sony_raw_event(struct hid_device *hdev, struct hid_report *report,
} else if ((sc->quirks & NAVIGATION_CONTROLLER) && rd[0] == 0x01 &&
size == 49) {
sixaxis_parse_report(sc, rd, size);
- } else if (((sc->quirks & DUALSHOCK4_CONTROLLER_USB) && rd[0] == 0x01 &&
- size == 64) || ((sc->quirks & DUALSHOCK4_CONTROLLER_BT)
- && rd[0] == 0x11 && size == 78)) {
+ } else if ((sc->quirks & DUALSHOCK4_CONTROLLER_USB) && rd[0] == 0x01 &&
+ size == 64) {
+ dualshock4_parse_report(sc, rd, size);
+ } else if (((sc->quirks & DUALSHOCK4_CONTROLLER_BT) && rd[0] == 0x11 &&
+ size == 78)) {
+ /* CRC check */
+ u8 bthdr = 0xA1;
+ u32 crc;
+ u32 report_crc;
+
+ crc = crc32_le(0xFFFFFFFF, &bthdr, 1);
+ crc = ~crc32_le(crc, rd, DS4_INPUT_REPORT_0x11_SIZE-4);
+ report_crc = get_unaligned_le32(&rd[DS4_INPUT_REPORT_0x11_SIZE-4]);
+ if (crc != report_crc) {
+ hid_dbg(sc->hdev, "DualShock 4 input report's CRC check failed, received crc 0x%0x != 0x%0x\n",
+ report_crc, crc);
+ return -EILSEQ;
+ }
+
+ dualshock4_parse_report(sc, rd, size);
+ } else if ((sc->quirks & DUALSHOCK4_DONGLE) && rd[0] == 0x01 &&
+ size == 64) {
+ unsigned long flags;
+ enum ds4_dongle_state dongle_state;
+
+ /*
+ * In the case of a DS4 USB dongle, bit[2] of byte 31 indicates
+ * if a DS4 is actually connected (indicated by '0').
+ * For non-dongle, this bit is always 0 (connected).
+ */
+ bool connected = (rd[31] & 0x04) ? false : true;
+
+ spin_lock_irqsave(&sc->lock, flags);
+ dongle_state = sc->ds4_dongle_state;
+ spin_unlock_irqrestore(&sc->lock, flags);
+
+ /*
+ * The dongle always sends input reports even when no
+ * DS4 is attached. When a DS4 is connected, we need to
+ * obtain calibration data before we can use it.
+ * The code below tracks dongle state and kicks of
+ * calibration when needed and only allows us to process
+ * input if a DS4 is actually connected.
+ */
+ if (dongle_state == DONGLE_DISCONNECTED && connected) {
+ hid_info(sc->hdev, "DualShock 4 USB dongle: controller connected\n");
+ sony_set_leds(sc);
+
+ spin_lock_irqsave(&sc->lock, flags);
+ sc->ds4_dongle_state = DONGLE_CALIBRATING;
+ spin_unlock_irqrestore(&sc->lock, flags);
+
+ sony_schedule_work(sc, SONY_WORKER_HOTPLUG);
+
+ /* Don't process the report since we don't have
+ * calibration data, but let hidraw have it anyway.
+ */
+ return 0;
+ } else if ((dongle_state == DONGLE_CONNECTED ||
+ dongle_state == DONGLE_DISABLED) && !connected) {
+ hid_info(sc->hdev, "DualShock 4 USB dongle: controller disconnected\n");
+
+ spin_lock_irqsave(&sc->lock, flags);
+ sc->ds4_dongle_state = DONGLE_DISCONNECTED;
+ spin_unlock_irqrestore(&sc->lock, flags);
+
+ /* Return 0, so hidraw can get the report. */
+ return 0;
+ } else if (dongle_state == DONGLE_CALIBRATING ||
+ dongle_state == DONGLE_DISABLED ||
+ dongle_state == DONGLE_DISCONNECTED) {
+ /* Return 0, so hidraw can get the report. */
+ return 0;
+ }
+
dualshock4_parse_report(sc, rd, size);
}
if (sc->defer_initialization) {
sc->defer_initialization = 0;
- sony_schedule_work(sc);
+ sony_schedule_work(sc, SONY_WORKER_STATE);
}
return 0;
@@ -1366,49 +1222,189 @@ static int sony_mapping(struct hid_device *hdev, struct hid_input *hi,
if (sc->quirks & PS3REMOTE)
return ps3remote_mapping(hdev, hi, field, usage, bit, max);
+ if (sc->quirks & NAVIGATION_CONTROLLER)
+ return navigation_mapping(hdev, hi, field, usage, bit, max);
+
+ if (sc->quirks & SIXAXIS_CONTROLLER)
+ return sixaxis_mapping(hdev, hi, field, usage, bit, max);
+
+ if (sc->quirks & DUALSHOCK4_CONTROLLER)
+ return ds4_mapping(hdev, hi, field, usage, bit, max);
+
+
/* Let hid-core decide for the others */
return 0;
}
-static int sony_register_touchpad(struct hid_input *hi, int touch_count,
+static int sony_register_touchpad(struct sony_sc *sc, int touch_count,
int w, int h)
{
- struct input_dev *input_dev = hi->input;
+ size_t name_sz;
+ char *name;
int ret;
- ret = input_mt_init_slots(input_dev, touch_count, 0);
+ sc->touchpad = input_allocate_device();
+ if (!sc->touchpad)
+ return -ENOMEM;
+
+ input_set_drvdata(sc->touchpad, sc);
+ sc->touchpad->dev.parent = &sc->hdev->dev;
+ sc->touchpad->phys = sc->hdev->phys;
+ sc->touchpad->uniq = sc->hdev->uniq;
+ sc->touchpad->id.bustype = sc->hdev->bus;
+ sc->touchpad->id.vendor = sc->hdev->vendor;
+ sc->touchpad->id.product = sc->hdev->product;
+ sc->touchpad->id.version = sc->hdev->version;
+
+ /* Append a suffix to the controller name as there are various
+ * DS4 compatible non-Sony devices with different names.
+ */
+ name_sz = strlen(sc->hdev->name) + sizeof(DS4_TOUCHPAD_SUFFIX);
+ name = kzalloc(name_sz, GFP_KERNEL);
+ if (!name) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ snprintf(name, name_sz, "%s" DS4_TOUCHPAD_SUFFIX, sc->hdev->name);
+ sc->touchpad->name = name;
+
+ ret = input_mt_init_slots(sc->touchpad, touch_count, INPUT_MT_POINTER);
if (ret < 0)
- return ret;
+ goto err;
+
+ /* We map the button underneath the touchpad to BTN_LEFT. */
+ __set_bit(EV_KEY, sc->touchpad->evbit);
+ __set_bit(BTN_LEFT, sc->touchpad->keybit);
+ __set_bit(INPUT_PROP_BUTTONPAD, sc->touchpad->propbit);
+
+ input_set_abs_params(sc->touchpad, ABS_MT_POSITION_X, 0, w, 0, 0);
+ input_set_abs_params(sc->touchpad, ABS_MT_POSITION_Y, 0, h, 0, 0);
- input_set_abs_params(input_dev, ABS_MT_POSITION_X, 0, w, 0, 0);
- input_set_abs_params(input_dev, ABS_MT_POSITION_Y, 0, h, 0, 0);
+ ret = input_register_device(sc->touchpad);
+ if (ret < 0)
+ goto err;
return 0;
+
+err:
+ kfree(sc->touchpad->name);
+ sc->touchpad->name = NULL;
+
+ input_free_device(sc->touchpad);
+ sc->touchpad = NULL;
+
+ return ret;
}
-static int sony_input_configured(struct hid_device *hdev,
- struct hid_input *hidinput)
+static void sony_unregister_touchpad(struct sony_sc *sc)
{
- struct sony_sc *sc = hid_get_drvdata(hdev);
+ if (!sc->touchpad)
+ return;
+
+ kfree(sc->touchpad->name);
+ sc->touchpad->name = NULL;
+
+ input_unregister_device(sc->touchpad);
+ sc->touchpad = NULL;
+}
+
+static int sony_register_sensors(struct sony_sc *sc)
+{
+ size_t name_sz;
+ char *name;
int ret;
+ int range;
- /*
- * The Dualshock 4 touchpad supports 2 touches and has a
- * resolution of 1920x942 (44.86 dots/mm).
+ sc->sensor_dev = input_allocate_device();
+ if (!sc->sensor_dev)
+ return -ENOMEM;
+
+ input_set_drvdata(sc->sensor_dev, sc);
+ sc->sensor_dev->dev.parent = &sc->hdev->dev;
+ sc->sensor_dev->phys = sc->hdev->phys;
+ sc->sensor_dev->uniq = sc->hdev->uniq;
+ sc->sensor_dev->id.bustype = sc->hdev->bus;
+ sc->sensor_dev->id.vendor = sc->hdev->vendor;
+ sc->sensor_dev->id.product = sc->hdev->product;
+ sc->sensor_dev->id.version = sc->hdev->version;
+
+ /* Append a suffix to the controller name as there are various
+ * DS4 compatible non-Sony devices with different names.
*/
- if (sc->quirks & DUALSHOCK4_CONTROLLER) {
- ret = sony_register_touchpad(hidinput, 2, 1920, 942);
- if (ret) {
- hid_err(sc->hdev,
- "Unable to initialize multi-touch slots: %d\n",
- ret);
- return ret;
- }
+ name_sz = strlen(sc->hdev->name) + sizeof(SENSOR_SUFFIX);
+ name = kzalloc(name_sz, GFP_KERNEL);
+ if (!name) {
+ ret = -ENOMEM;
+ goto err;
}
+ snprintf(name, name_sz, "%s" SENSOR_SUFFIX, sc->hdev->name);
+ sc->sensor_dev->name = name;
+
+ if (sc->quirks & SIXAXIS_CONTROLLER) {
+ /* For the DS3 we only support the accelerometer, which works
+ * quite well even without calibration. The device also has
+ * a 1-axis gyro, but it is very difficult to manage from within
+ * the driver even to get data, the sensor is inaccurate and
+ * the behavior is very different between hardware revisions.
+ */
+ input_set_abs_params(sc->sensor_dev, ABS_X, -512, 511, 4, 0);
+ input_set_abs_params(sc->sensor_dev, ABS_Y, -512, 511, 4, 0);
+ input_set_abs_params(sc->sensor_dev, ABS_Z, -512, 511, 4, 0);
+ input_abs_set_res(sc->sensor_dev, ABS_X, SIXAXIS_ACC_RES_PER_G);
+ input_abs_set_res(sc->sensor_dev, ABS_Y, SIXAXIS_ACC_RES_PER_G);
+ input_abs_set_res(sc->sensor_dev, ABS_Z, SIXAXIS_ACC_RES_PER_G);
+ } else if (sc->quirks & DUALSHOCK4_CONTROLLER) {
+ range = DS4_ACC_RES_PER_G*4;
+ input_set_abs_params(sc->sensor_dev, ABS_X, -range, range, 16, 0);
+ input_set_abs_params(sc->sensor_dev, ABS_Y, -range, range, 16, 0);
+ input_set_abs_params(sc->sensor_dev, ABS_Z, -range, range, 16, 0);
+ input_abs_set_res(sc->sensor_dev, ABS_X, DS4_ACC_RES_PER_G);
+ input_abs_set_res(sc->sensor_dev, ABS_Y, DS4_ACC_RES_PER_G);
+ input_abs_set_res(sc->sensor_dev, ABS_Z, DS4_ACC_RES_PER_G);
+
+ range = DS4_GYRO_RES_PER_DEG_S*2048;
+ input_set_abs_params(sc->sensor_dev, ABS_RX, -range, range, 16, 0);
+ input_set_abs_params(sc->sensor_dev, ABS_RY, -range, range, 16, 0);
+ input_set_abs_params(sc->sensor_dev, ABS_RZ, -range, range, 16, 0);
+ input_abs_set_res(sc->sensor_dev, ABS_RX, DS4_GYRO_RES_PER_DEG_S);
+ input_abs_set_res(sc->sensor_dev, ABS_RY, DS4_GYRO_RES_PER_DEG_S);
+ input_abs_set_res(sc->sensor_dev, ABS_RZ, DS4_GYRO_RES_PER_DEG_S);
+
+ __set_bit(EV_MSC, sc->sensor_dev->evbit);
+ __set_bit(MSC_TIMESTAMP, sc->sensor_dev->mscbit);
+ }
+
+ __set_bit(INPUT_PROP_ACCELEROMETER, sc->sensor_dev->propbit);
+
+ ret = input_register_device(sc->sensor_dev);
+ if (ret < 0)
+ goto err;
return 0;
+
+err:
+ kfree(sc->sensor_dev->name);
+ sc->sensor_dev->name = NULL;
+
+ input_free_device(sc->sensor_dev);
+ sc->sensor_dev = NULL;
+
+ return ret;
}
+static void sony_unregister_sensors(struct sony_sc *sc)
+{
+ if (!sc->sensor_dev)
+ return;
+
+ kfree(sc->sensor_dev->name);
+ sc->sensor_dev->name = NULL;
+
+ input_unregister_device(sc->sensor_dev);
+ sc->sensor_dev = NULL;
+}
+
+
/*
* Sending HID_REQ_GET_REPORT changes the operation mode of the ps3 controller
* to "operational". Without this, the ps3 controller will not report any
@@ -1474,26 +1470,176 @@ static int sixaxis_set_operational_bt(struct hid_device *hdev)
}
/*
- * Requesting feature report 0x02 in Bluetooth mode changes the state of the
- * controller so that it sends full input reports of type 0x11.
+ * Request DS4 calibration data for the motion sensors.
+ * For Bluetooth this also affects the operating mode (see below).
*/
-static int dualshock4_set_operational_bt(struct hid_device *hdev)
+static int dualshock4_get_calibration_data(struct sony_sc *sc)
{
u8 *buf;
int ret;
+ short gyro_pitch_bias, gyro_pitch_plus, gyro_pitch_minus;
+ short gyro_yaw_bias, gyro_yaw_plus, gyro_yaw_minus;
+ short gyro_roll_bias, gyro_roll_plus, gyro_roll_minus;
+ short gyro_speed_plus, gyro_speed_minus;
+ short acc_x_plus, acc_x_minus;
+ short acc_y_plus, acc_y_minus;
+ short acc_z_plus, acc_z_minus;
+ int speed_2x;
+ int range_2g;
+
+ /* For Bluetooth we use a different request, which supports CRC.
+ * Note: in Bluetooth mode feature report 0x02 also changes the state
+ * of the controller, so that it sends input reports of type 0x11.
+ */
+ if (sc->quirks & (DUALSHOCK4_CONTROLLER_USB | DUALSHOCK4_DONGLE)) {
+ buf = kmalloc(DS4_FEATURE_REPORT_0x02_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
- buf = kmalloc(DS4_REPORT_0x02_SIZE, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
+ ret = hid_hw_raw_request(sc->hdev, 0x02, buf,
+ DS4_FEATURE_REPORT_0x02_SIZE,
+ HID_FEATURE_REPORT,
+ HID_REQ_GET_REPORT);
+ if (ret < 0)
+ goto err_stop;
+ } else {
+ u8 bthdr = 0xA3;
+ u32 crc;
+ u32 report_crc;
+ int retries;
- ret = hid_hw_raw_request(hdev, 0x02, buf, DS4_REPORT_0x02_SIZE,
- HID_FEATURE_REPORT, HID_REQ_GET_REPORT);
+ buf = kmalloc(DS4_FEATURE_REPORT_0x05_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
- kfree(buf);
+ for (retries = 0; retries < 3; retries++) {
+ ret = hid_hw_raw_request(sc->hdev, 0x05, buf,
+ DS4_FEATURE_REPORT_0x05_SIZE,
+ HID_FEATURE_REPORT,
+ HID_REQ_GET_REPORT);
+ if (ret < 0)
+ goto err_stop;
+ /* CRC check */
+ crc = crc32_le(0xFFFFFFFF, &bthdr, 1);
+ crc = ~crc32_le(crc, buf, DS4_FEATURE_REPORT_0x05_SIZE-4);
+ report_crc = get_unaligned_le32(&buf[DS4_FEATURE_REPORT_0x05_SIZE-4]);
+ if (crc != report_crc) {
+ hid_warn(sc->hdev, "DualShock 4 calibration report's CRC check failed, received crc 0x%0x != 0x%0x\n",
+ report_crc, crc);
+ if (retries < 2) {
+ hid_warn(sc->hdev, "Retrying DualShock 4 get calibration report request\n");
+ continue;
+ } else {
+ ret = -EILSEQ;
+ goto err_stop;
+ }
+ } else {
+ break;
+ }
+ }
+ }
+
+ gyro_pitch_bias = get_unaligned_le16(&buf[1]);
+ gyro_yaw_bias = get_unaligned_le16(&buf[3]);
+ gyro_roll_bias = get_unaligned_le16(&buf[5]);
+ if (sc->quirks & DUALSHOCK4_CONTROLLER_USB) {
+ gyro_pitch_plus = get_unaligned_le16(&buf[7]);
+ gyro_pitch_minus = get_unaligned_le16(&buf[9]);
+ gyro_yaw_plus = get_unaligned_le16(&buf[11]);
+ gyro_yaw_minus = get_unaligned_le16(&buf[13]);
+ gyro_roll_plus = get_unaligned_le16(&buf[15]);
+ gyro_roll_minus = get_unaligned_le16(&buf[17]);
+ } else {
+ /* BT + Dongle */
+ gyro_pitch_plus = get_unaligned_le16(&buf[7]);
+ gyro_yaw_plus = get_unaligned_le16(&buf[9]);
+ gyro_roll_plus = get_unaligned_le16(&buf[11]);
+ gyro_pitch_minus = get_unaligned_le16(&buf[13]);
+ gyro_yaw_minus = get_unaligned_le16(&buf[15]);
+ gyro_roll_minus = get_unaligned_le16(&buf[17]);
+ }
+ gyro_speed_plus = get_unaligned_le16(&buf[19]);
+ gyro_speed_minus = get_unaligned_le16(&buf[21]);
+ acc_x_plus = get_unaligned_le16(&buf[23]);
+ acc_x_minus = get_unaligned_le16(&buf[25]);
+ acc_y_plus = get_unaligned_le16(&buf[27]);
+ acc_y_minus = get_unaligned_le16(&buf[29]);
+ acc_z_plus = get_unaligned_le16(&buf[31]);
+ acc_z_minus = get_unaligned_le16(&buf[33]);
+
+ /* Set gyroscope calibration and normalization parameters.
+ * Data values will be normalized to 1/DS4_GYRO_RES_PER_DEG_S degree/s.
+ */
+ speed_2x = (gyro_speed_plus + gyro_speed_minus);
+ sc->ds4_calib_data[0].abs_code = ABS_RX;
+ sc->ds4_calib_data[0].bias = gyro_pitch_bias;
+ sc->ds4_calib_data[0].sens_numer = speed_2x*DS4_GYRO_RES_PER_DEG_S;
+ sc->ds4_calib_data[0].sens_denom = gyro_pitch_plus - gyro_pitch_minus;
+
+ sc->ds4_calib_data[1].abs_code = ABS_RY;
+ sc->ds4_calib_data[1].bias = gyro_yaw_bias;
+ sc->ds4_calib_data[1].sens_numer = speed_2x*DS4_GYRO_RES_PER_DEG_S;
+ sc->ds4_calib_data[1].sens_denom = gyro_yaw_plus - gyro_yaw_minus;
+
+ sc->ds4_calib_data[2].abs_code = ABS_RZ;
+ sc->ds4_calib_data[2].bias = gyro_roll_bias;
+ sc->ds4_calib_data[2].sens_numer = speed_2x*DS4_GYRO_RES_PER_DEG_S;
+ sc->ds4_calib_data[2].sens_denom = gyro_roll_plus - gyro_roll_minus;
+
+ /* Set accelerometer calibration and normalization parameters.
+ * Data values will be normalized to 1/DS4_ACC_RES_PER_G G.
+ */
+ range_2g = acc_x_plus - acc_x_minus;
+ sc->ds4_calib_data[3].abs_code = ABS_X;
+ sc->ds4_calib_data[3].bias = acc_x_plus - range_2g / 2;
+ sc->ds4_calib_data[3].sens_numer = 2*DS4_ACC_RES_PER_G;
+ sc->ds4_calib_data[3].sens_denom = range_2g;
+
+ range_2g = acc_y_plus - acc_y_minus;
+ sc->ds4_calib_data[4].abs_code = ABS_Y;
+ sc->ds4_calib_data[4].bias = acc_y_plus - range_2g / 2;
+ sc->ds4_calib_data[4].sens_numer = 2*DS4_ACC_RES_PER_G;
+ sc->ds4_calib_data[4].sens_denom = range_2g;
+
+ range_2g = acc_z_plus - acc_z_minus;
+ sc->ds4_calib_data[5].abs_code = ABS_Z;
+ sc->ds4_calib_data[5].bias = acc_z_plus - range_2g / 2;
+ sc->ds4_calib_data[5].sens_numer = 2*DS4_ACC_RES_PER_G;
+ sc->ds4_calib_data[5].sens_denom = range_2g;
+
+err_stop:
+ kfree(buf);
return ret;
}
+static void dualshock4_calibration_work(struct work_struct *work)
+{
+ struct sony_sc *sc = container_of(work, struct sony_sc, hotplug_worker);
+ unsigned long flags;
+ enum ds4_dongle_state dongle_state;
+ int ret;
+
+ ret = dualshock4_get_calibration_data(sc);
+ if (ret < 0) {
+ /* This call is very unlikely to fail for the dongle. When it
+ * fails we are probably in a very bad state, so mark the
+ * dongle as disabled. We will re-enable the dongle if a new
+ * DS4 hotplug is detect from sony_raw_event as any issues
+ * are likely resolved then (the dongle is quite stupid).
+ */
+ hid_err(sc->hdev, "DualShock 4 USB dongle: calibration failed, disabling device\n");
+ dongle_state = DONGLE_DISABLED;
+ } else {
+ hid_info(sc->hdev, "DualShock 4 USB dongle: calibration completed\n");
+ dongle_state = DONGLE_CONNECTED;
+ }
+
+ spin_lock_irqsave(&sc->lock, flags);
+ sc->ds4_dongle_state = dongle_state;
+ spin_unlock_irqrestore(&sc->lock, flags);
+}
+
static void sixaxis_set_leds_from_id(struct sony_sc *sc)
{
static const u8 sixaxis_leds[10][4] = {
@@ -1524,10 +1670,10 @@ static void dualshock4_set_leds_from_id(struct sony_sc *sc)
{
/* The first 4 color/index entries match what the PS4 assigns */
static const u8 color_code[7][3] = {
- /* Blue */ { 0x00, 0x00, 0x01 },
- /* Red */ { 0x01, 0x00, 0x00 },
- /* Green */ { 0x00, 0x01, 0x00 },
- /* Pink */ { 0x02, 0x00, 0x01 },
+ /* Blue */ { 0x00, 0x00, 0x40 },
+ /* Red */ { 0x40, 0x00, 0x00 },
+ /* Green */ { 0x00, 0x40, 0x00 },
+ /* Pink */ { 0x20, 0x00, 0x20 },
/* Orange */ { 0x02, 0x01, 0x00 },
/* Teal */ { 0x00, 0x01, 0x01 },
/* White */ { 0x01, 0x01, 0x01 }
@@ -1568,7 +1714,7 @@ static void buzz_set_leds(struct sony_sc *sc)
static void sony_set_leds(struct sony_sc *sc)
{
if (!(sc->quirks & BUZZ_CONTROLLER))
- sony_schedule_work(sc);
+ sony_schedule_work(sc, SONY_WORKER_STATE);
else
buzz_set_leds(sc);
}
@@ -1679,7 +1825,7 @@ static int sony_led_blink_set(struct led_classdev *led, unsigned long *delay_on,
new_off != drv_data->led_delay_off[n]) {
drv_data->led_delay_on[n] = new_on;
drv_data->led_delay_off[n] = new_off;
- sony_schedule_work(drv_data);
+ sony_schedule_work(drv_data, SONY_WORKER_STATE);
}
return 0;
@@ -1881,26 +2027,24 @@ static void dualshock4_send_output_report(struct sony_sc *sc)
int offset;
/*
- * NOTE: The buf[1] field of the Bluetooth report controls
- * the Dualshock 4 reporting rate.
- *
- * Known values include:
- *
- * 0x80 - 1000hz (full speed)
- * 0xA0 - 31hz
- * 0xB0 - 20hz
- * 0xD0 - 66hz
+ * NOTE: The lower 6 bits of buf[1] field of the Bluetooth report
+ * control the interval at which Dualshock 4 reports data:
+ * 0x00 - 1ms
+ * 0x01 - 1ms
+ * 0x02 - 2ms
+ * 0x3E - 62ms
+ * 0x3F - disabled
*/
- if (sc->quirks & DUALSHOCK4_CONTROLLER_USB) {
- memset(buf, 0, DS4_REPORT_0x05_SIZE);
+ if (sc->quirks & (DUALSHOCK4_CONTROLLER_USB | DUALSHOCK4_DONGLE)) {
+ memset(buf, 0, DS4_OUTPUT_REPORT_0x05_SIZE);
buf[0] = 0x05;
- buf[1] = 0xFF;
+ buf[1] = 0x07; /* blink + LEDs + motor */
offset = 4;
} else {
- memset(buf, 0, DS4_REPORT_0x11_SIZE);
+ memset(buf, 0, DS4_OUTPUT_REPORT_0x11_SIZE);
buf[0] = 0x11;
- buf[1] = 0x80;
- buf[3] = 0x0F;
+ buf[1] = 0xC0 /* HID + CRC */ | sc->ds4_bt_poll_interval;
+ buf[3] = 0x07; /* blink + LEDs + motor */
offset = 6;
}
@@ -1924,11 +2068,18 @@ static void dualshock4_send_output_report(struct sony_sc *sc)
buf[offset++] = sc->led_delay_on[3];
buf[offset++] = sc->led_delay_off[3];
- if (sc->quirks & DUALSHOCK4_CONTROLLER_USB)
- hid_hw_output_report(hdev, buf, DS4_REPORT_0x05_SIZE);
- else
- hid_hw_raw_request(hdev, 0x11, buf, DS4_REPORT_0x11_SIZE,
- HID_OUTPUT_REPORT, HID_REQ_SET_REPORT);
+ if (sc->quirks & (DUALSHOCK4_CONTROLLER_USB | DUALSHOCK4_DONGLE))
+ hid_hw_output_report(hdev, buf, DS4_OUTPUT_REPORT_0x05_SIZE);
+ else {
+ /* CRC generation */
+ u8 bthdr = 0xA2;
+ u32 crc;
+
+ crc = crc32_le(0xFFFFFFFF, &bthdr, 1);
+ crc = ~crc32_le(crc, buf, DS4_OUTPUT_REPORT_0x11_SIZE-4);
+ put_unaligned_le32(crc, &buf[74]);
+ hid_hw_output_report(hdev, buf, DS4_OUTPUT_REPORT_0x11_SIZE);
+ }
}
static void motion_send_output_report(struct sony_sc *sc)
@@ -1972,10 +2123,10 @@ static int sony_allocate_output_report(struct sony_sc *sc)
kmalloc(sizeof(union sixaxis_output_report_01),
GFP_KERNEL);
else if (sc->quirks & DUALSHOCK4_CONTROLLER_BT)
- sc->output_report_dmabuf = kmalloc(DS4_REPORT_0x11_SIZE,
+ sc->output_report_dmabuf = kmalloc(DS4_OUTPUT_REPORT_0x11_SIZE,
GFP_KERNEL);
- else if (sc->quirks & DUALSHOCK4_CONTROLLER_USB)
- sc->output_report_dmabuf = kmalloc(DS4_REPORT_0x05_SIZE,
+ else if (sc->quirks & (DUALSHOCK4_CONTROLLER_USB | DUALSHOCK4_DONGLE))
+ sc->output_report_dmabuf = kmalloc(DS4_OUTPUT_REPORT_0x05_SIZE,
GFP_KERNEL);
else if (sc->quirks & MOTION_CONTROLLER)
sc->output_report_dmabuf = kmalloc(MOTION_REPORT_0x02_SIZE,
@@ -2002,7 +2153,7 @@ static int sony_play_effect(struct input_dev *dev, void *data,
sc->left = effect->u.rumble.strong_magnitude / 256;
sc->right = effect->u.rumble.weak_magnitude / 256;
- sony_schedule_work(sc);
+ sony_schedule_work(sc, SONY_WORKER_STATE);
return 0;
}
@@ -2219,8 +2370,8 @@ static int sony_check_add(struct sony_sc *sc)
hid_warn(sc->hdev, "UNIQ does not contain a MAC address; duplicate check skipped\n");
return 0;
}
- } else if (sc->quirks & DUALSHOCK4_CONTROLLER_USB) {
- buf = kmalloc(DS4_REPORT_0x81_SIZE, GFP_KERNEL);
+ } else if (sc->quirks & (DUALSHOCK4_CONTROLLER_USB | DUALSHOCK4_DONGLE)) {
+ buf = kmalloc(DS4_FEATURE_REPORT_0x81_SIZE, GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -2230,16 +2381,22 @@ static int sony_check_add(struct sony_sc *sc)
* offset 1.
*/
ret = hid_hw_raw_request(sc->hdev, 0x81, buf,
- DS4_REPORT_0x81_SIZE, HID_FEATURE_REPORT,
+ DS4_FEATURE_REPORT_0x81_SIZE, HID_FEATURE_REPORT,
HID_REQ_GET_REPORT);
- if (ret != DS4_REPORT_0x81_SIZE) {
+ if (ret != DS4_FEATURE_REPORT_0x81_SIZE) {
hid_err(sc->hdev, "failed to retrieve feature report 0x81 with the DualShock 4 MAC address\n");
ret = ret < 0 ? ret : -EINVAL;
goto out_free;
}
memcpy(sc->mac_address, &buf[1], sizeof(sc->mac_address));
+
+ snprintf(sc->hdev->uniq, sizeof(sc->hdev->uniq),
+ "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx",
+ sc->mac_address[5], sc->mac_address[4],
+ sc->mac_address[3], sc->mac_address[2],
+ sc->mac_address[1], sc->mac_address[0]);
} else if ((sc->quirks & SIXAXIS_CONTROLLER_USB) ||
(sc->quirks & NAVIGATION_CONTROLLER_USB)) {
buf = kmalloc(SIXAXIS_REPORT_0xF2_SIZE, GFP_KERNEL);
@@ -2267,6 +2424,12 @@ static int sony_check_add(struct sony_sc *sc)
*/
for (n = 0; n < 6; n++)
sc->mac_address[5-n] = buf[4+n];
+
+ snprintf(sc->hdev->uniq, sizeof(sc->hdev->uniq),
+ "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx",
+ sc->mac_address[5], sc->mac_address[4],
+ sc->mac_address[3], sc->mac_address[2],
+ sc->mac_address[1], sc->mac_address[0]);
} else {
return 0;
}
@@ -2317,57 +2480,27 @@ static inline void sony_init_output_report(struct sony_sc *sc,
{
sc->send_output_report = send_output_report;
- if (!sc->worker_initialized)
+ if (!sc->state_worker_initialized)
INIT_WORK(&sc->state_worker, sony_state_worker);
- sc->worker_initialized = 1;
+ sc->state_worker_initialized = 1;
}
static inline void sony_cancel_work_sync(struct sony_sc *sc)
{
- if (sc->worker_initialized)
+ if (sc->hotplug_worker_initialized)
+ cancel_work_sync(&sc->hotplug_worker);
+ if (sc->state_worker_initialized)
cancel_work_sync(&sc->state_worker);
}
-static int sony_probe(struct hid_device *hdev, const struct hid_device_id *id)
+
+static int sony_input_configured(struct hid_device *hdev,
+ struct hid_input *hidinput)
{
- int ret;
+ struct sony_sc *sc = hid_get_drvdata(hdev);
int append_dev_id;
- unsigned long quirks = id->driver_data;
- struct sony_sc *sc;
- unsigned int connect_mask = HID_CONNECT_DEFAULT;
-
- if (!strcmp(hdev->name, "FutureMax Dance Mat"))
- quirks |= FUTUREMAX_DANCE_MAT;
-
- sc = devm_kzalloc(&hdev->dev, sizeof(*sc), GFP_KERNEL);
- if (sc == NULL) {
- hid_err(hdev, "can't alloc sony descriptor\n");
- return -ENOMEM;
- }
-
- spin_lock_init(&sc->lock);
-
- sc->quirks = quirks;
- hid_set_drvdata(hdev, sc);
- sc->hdev = hdev;
-
- ret = hid_parse(hdev);
- if (ret) {
- hid_err(hdev, "parse failed\n");
- return ret;
- }
-
- if (sc->quirks & VAIO_RDESC_CONSTANT)
- connect_mask |= HID_CONNECT_HIDDEV_FORCE;
- else if (sc->quirks & SIXAXIS_CONTROLLER)
- connect_mask |= HID_CONNECT_HIDDEV_FORCE;
-
- ret = hid_hw_start(hdev, connect_mask);
- if (ret) {
- hid_err(hdev, "hw start failed\n");
- return ret;
- }
+ int ret;
ret = sony_set_device_id(sc);
if (ret < 0) {
@@ -2375,14 +2508,17 @@ static int sony_probe(struct hid_device *hdev, const struct hid_device_id *id)
goto err_stop;
}
+ ret = append_dev_id = sony_check_add(sc);
+ if (ret < 0)
+ goto err_stop;
+
ret = sony_allocate_output_report(sc);
if (ret < 0) {
hid_err(hdev, "failed to allocate the output report buffer\n");
goto err_stop;
}
- if ((sc->quirks & SIXAXIS_CONTROLLER_USB) ||
- (sc->quirks & NAVIGATION_CONTROLLER_USB)) {
+ if (sc->quirks & NAVIGATION_CONTROLLER_USB) {
/*
* The Sony Sixaxis does not handle HID Output Reports on the
* Interrupt EP like it could, so we need to force HID Output
@@ -2402,29 +2538,113 @@ static int sony_probe(struct hid_device *hdev, const struct hid_device_id *id)
hdev->quirks |= HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP;
hdev->quirks |= HID_QUIRK_SKIP_OUTPUT_REPORT_ID;
sc->defer_initialization = 1;
+
+ ret = sixaxis_set_operational_usb(hdev);
+ if (ret < 0) {
+ hid_err(hdev, "Failed to set controller into operational mode\n");
+ goto err_stop;
+ }
+
+ sony_init_output_report(sc, sixaxis_send_output_report);
+ } else if (sc->quirks & NAVIGATION_CONTROLLER_BT) {
+ /*
+ * The Navigation controller wants output reports sent on the ctrl
+ * endpoint when connected via Bluetooth.
+ */
+ hdev->quirks |= HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP;
+
+ ret = sixaxis_set_operational_bt(hdev);
+ if (ret < 0) {
+ hid_err(hdev, "Failed to set controller into operational mode\n");
+ goto err_stop;
+ }
+
+ sony_init_output_report(sc, sixaxis_send_output_report);
+ } else if (sc->quirks & SIXAXIS_CONTROLLER_USB) {
+ /*
+ * The Sony Sixaxis does not handle HID Output Reports on the
+ * Interrupt EP and the device only becomes active when the
+ * PS button is pressed. See comment for Navigation controller
+ * above for more details.
+ */
+ hdev->quirks |= HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP;
+ hdev->quirks |= HID_QUIRK_SKIP_OUTPUT_REPORT_ID;
+ sc->defer_initialization = 1;
+
ret = sixaxis_set_operational_usb(hdev);
+ if (ret < 0) {
+ hid_err(hdev, "Failed to set controller into operational mode\n");
+ goto err_stop;
+ }
+
+ ret = sony_register_sensors(sc);
+ if (ret) {
+ hid_err(sc->hdev,
+ "Unable to initialize motion sensors: %d\n", ret);
+ goto err_stop;
+ }
+
sony_init_output_report(sc, sixaxis_send_output_report);
- } else if ((sc->quirks & SIXAXIS_CONTROLLER_BT) ||
- (sc->quirks & NAVIGATION_CONTROLLER_BT)) {
+ } else if (sc->quirks & SIXAXIS_CONTROLLER_BT) {
/*
* The Sixaxis wants output reports sent on the ctrl endpoint
* when connected via Bluetooth.
*/
hdev->quirks |= HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP;
+
ret = sixaxis_set_operational_bt(hdev);
+ if (ret < 0) {
+ hid_err(hdev, "Failed to set controller into operational mode\n");
+ goto err_stop;
+ }
+
+ ret = sony_register_sensors(sc);
+ if (ret) {
+ hid_err(sc->hdev,
+ "Unable to initialize motion sensors: %d\n", ret);
+ goto err_stop;
+ }
+
sony_init_output_report(sc, sixaxis_send_output_report);
} else if (sc->quirks & DUALSHOCK4_CONTROLLER) {
+ ret = dualshock4_get_calibration_data(sc);
+ if (ret < 0) {
+ hid_err(hdev, "Failed to get calibration data from Dualshock 4\n");
+ goto err_stop;
+ }
+
+ /*
+ * The Dualshock 4 touchpad supports 2 touches and has a
+ * resolution of 1920x942 (44.86 dots/mm).
+ */
+ ret = sony_register_touchpad(sc, 2, 1920, 942);
+ if (ret) {
+ hid_err(sc->hdev,
+ "Unable to initialize multi-touch slots: %d\n",
+ ret);
+ goto err_stop;
+ }
+
+ ret = sony_register_sensors(sc);
+ if (ret) {
+ hid_err(sc->hdev,
+ "Unable to initialize motion sensors: %d\n", ret);
+ goto err_stop;
+ }
+
if (sc->quirks & DUALSHOCK4_CONTROLLER_BT) {
- /*
- * The DualShock 4 wants output reports sent on the ctrl
- * endpoint when connected via Bluetooth.
- */
- hdev->quirks |= HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP;
- ret = dualshock4_set_operational_bt(hdev);
- if (ret < 0) {
- hid_err(hdev, "failed to set the Dualshock 4 operational mode\n");
- goto err_stop;
- }
+ sc->ds4_bt_poll_interval = DS4_BT_DEFAULT_POLL_INTERVAL_MS;
+ ret = device_create_file(&sc->hdev->dev, &dev_attr_bt_poll_interval);
+ if (ret)
+ hid_warn(sc->hdev,
+ "can't create sysfs bt_poll_interval attribute err: %d\n",
+ ret);
+ }
+
+ if (sc->quirks & DUALSHOCK4_DONGLE) {
+ INIT_WORK(&sc->hotplug_worker, dualshock4_calibration_work);
+ sc->hotplug_worker_initialized = 1;
+ sc->ds4_dongle_state = DONGLE_DISCONNECTED;
}
sony_init_output_report(sc, dualshock4_send_output_report);
@@ -2434,13 +2654,6 @@ static int sony_probe(struct hid_device *hdev, const struct hid_device_id *id)
ret = 0;
}
- if (ret < 0)
- goto err_stop;
-
- ret = append_dev_id = sony_check_add(sc);
- if (ret < 0)
- goto err_stop;
-
if (sc->quirks & SONY_LED_SUPPORT) {
ret = sony_leds_init(sc);
if (ret < 0)
@@ -2470,10 +2683,20 @@ static int sony_probe(struct hid_device *hdev, const struct hid_device_id *id)
err_close:
hid_hw_close(hdev);
err_stop:
+ /* Piggy back on the default ds4_bt_ poll_interval to determine
+ * if we need to remove the file as we don't know for sure if we
+ * executed that logic.
+ */
+ if (sc->ds4_bt_poll_interval)
+ device_remove_file(&sc->hdev->dev, &dev_attr_bt_poll_interval);
if (sc->quirks & SONY_LED_SUPPORT)
sony_leds_remove(sc);
if (sc->quirks & SONY_BATTERY_SUPPORT)
sony_battery_remove(sc);
+ if (sc->touchpad)
+ sony_unregister_touchpad(sc);
+ if (sc->sensor_dev)
+ sony_unregister_sensors(sc);
sony_cancel_work_sync(sc);
kfree(sc->output_report_dmabuf);
sony_remove_dev_list(sc);
@@ -2482,17 +2705,90 @@ err_stop:
return ret;
}
+static int sony_probe(struct hid_device *hdev, const struct hid_device_id *id)
+{
+ int ret;
+ unsigned long quirks = id->driver_data;
+ struct sony_sc *sc;
+ unsigned int connect_mask = HID_CONNECT_DEFAULT;
+
+ if (!strcmp(hdev->name, "FutureMax Dance Mat"))
+ quirks |= FUTUREMAX_DANCE_MAT;
+
+ sc = devm_kzalloc(&hdev->dev, sizeof(*sc), GFP_KERNEL);
+ if (sc == NULL) {
+ hid_err(hdev, "can't alloc sony descriptor\n");
+ return -ENOMEM;
+ }
+
+ spin_lock_init(&sc->lock);
+
+ sc->quirks = quirks;
+ hid_set_drvdata(hdev, sc);
+ sc->hdev = hdev;
+
+ ret = hid_parse(hdev);
+ if (ret) {
+ hid_err(hdev, "parse failed\n");
+ return ret;
+ }
+
+ if (sc->quirks & VAIO_RDESC_CONSTANT)
+ connect_mask |= HID_CONNECT_HIDDEV_FORCE;
+ else if (sc->quirks & SIXAXIS_CONTROLLER)
+ connect_mask |= HID_CONNECT_HIDDEV_FORCE;
+
+ /* Patch the hw version on DS3/4 compatible devices, so applications can
+ * distinguish between the default HID mappings and the mappings defined
+ * by the Linux game controller spec. This is important for the SDL2
+ * library, which has a game controller database, which uses device ids
+ * in combination with version as a key.
+ */
+ if (sc->quirks & (SIXAXIS_CONTROLLER | DUALSHOCK4_CONTROLLER))
+ hdev->version |= 0x8000;
+
+ ret = hid_hw_start(hdev, connect_mask);
+ if (ret) {
+ hid_err(hdev, "hw start failed\n");
+ return ret;
+ }
+
+ /* sony_input_configured can fail, but this doesn't result
+ * in hid_hw_start failures (intended). Check whether
+ * the HID layer claimed the device else fail.
+ * We don't know the actual reason for the failure, most
+ * likely it is due to EEXIST in case of double connection
+ * of USB and Bluetooth, but could have been due to ENOMEM
+ * or other reasons as well.
+ */
+ if (!(hdev->claimed & HID_CLAIMED_INPUT)) {
+ hid_err(hdev, "failed to claim input\n");
+ return -ENODEV;
+ }
+
+ return ret;
+}
+
static void sony_remove(struct hid_device *hdev)
{
struct sony_sc *sc = hid_get_drvdata(hdev);
+ hid_hw_close(hdev);
+
if (sc->quirks & SONY_LED_SUPPORT)
sony_leds_remove(sc);
- if (sc->quirks & SONY_BATTERY_SUPPORT) {
- hid_hw_close(hdev);
+ if (sc->quirks & SONY_BATTERY_SUPPORT)
sony_battery_remove(sc);
- }
+
+ if (sc->touchpad)
+ sony_unregister_touchpad(sc);
+
+ if (sc->sensor_dev)
+ sony_unregister_sensors(sc);
+
+ if (sc->quirks & DUALSHOCK4_CONTROLLER_BT)
+ device_remove_file(&sc->hdev->dev, &dev_attr_bt_poll_interval);
sony_cancel_work_sync(sc);
@@ -2586,7 +2882,7 @@ static const struct hid_device_id sony_devices[] = {
{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER_2),
.driver_data = DUALSHOCK4_CONTROLLER_BT },
{ HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER_DONGLE),
- .driver_data = DUALSHOCK4_CONTROLLER_USB },
+ .driver_data = DUALSHOCK4_DONGLE },
/* Nyko Core Controller for PS3 */
{ HID_USB_DEVICE(USB_VENDOR_ID_SINO_LITE, USB_DEVICE_ID_SINO_LITE_CONTROLLER),
.driver_data = SIXAXIS_CONTROLLER_USB | SINO_LITE_CONTROLLER },
diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c
index d02ee5304217..7256647d9418 100644
--- a/drivers/hid/uhid.c
+++ b/drivers/hid/uhid.c
@@ -30,6 +30,8 @@
#define UHID_NAME "uhid"
#define UHID_BUFSIZE 32
+static DEFINE_MUTEX(uhid_open_mutex);
+
struct uhid_device {
struct mutex devlock;
bool running;
@@ -144,15 +146,26 @@ static void uhid_hid_stop(struct hid_device *hid)
static int uhid_hid_open(struct hid_device *hid)
{
struct uhid_device *uhid = hid->driver_data;
+ int retval = 0;
- return uhid_queue_event(uhid, UHID_OPEN);
+ mutex_lock(&uhid_open_mutex);
+ if (!hid->open++) {
+ retval = uhid_queue_event(uhid, UHID_OPEN);
+ if (retval)
+ hid->open--;
+ }
+ mutex_unlock(&uhid_open_mutex);
+ return retval;
}
static void uhid_hid_close(struct hid_device *hid)
{
struct uhid_device *uhid = hid->driver_data;
- uhid_queue_event(uhid, UHID_CLOSE);
+ mutex_lock(&uhid_open_mutex);
+ if (!--hid->open)
+ uhid_queue_event(uhid, UHID_CLOSE);
+ mutex_unlock(&uhid_open_mutex);
}
static int uhid_hid_parse(struct hid_device *hid)
diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c
index 2cd7c718198a..df6331536938 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -201,9 +201,23 @@ static void *etm_setup_aux(int event_cpu, void **pages,
event_data = alloc_event_data(event_cpu);
if (!event_data)
return NULL;
-
INIT_WORK(&event_data->work, free_event_data);
+ /*
+ * In theory nothing prevent tracers in a trace session from being
+ * associated with different sinks, nor having a sink per tracer. But
+ * until we have HW with this kind of topology we need to assume tracers
+ * in a trace session are using the same sink. Therefore go through
+ * the coresight bus and pick the first enabled sink.
+ *
+ * When operated from sysFS users are responsible to enable the sink
+ * while from perf, the perf tools will do it based on the choice made
+ * on the cmd line. As such the "enable_sink" flag in sysFS is reset.
+ */
+ sink = coresight_get_enabled_sink(true);
+ if (!sink)
+ goto err;
+
mask = &event_data->mask;
/* Setup the path for each CPU in a trace session */
@@ -219,28 +233,15 @@ static void *etm_setup_aux(int event_cpu, void **pages,
* list of devices from source to sink that can be
* referenced later when the path is actually needed.
*/
- event_data->path[cpu] = coresight_build_path(csdev);
+ event_data->path[cpu] = coresight_build_path(csdev, sink);
if (IS_ERR(event_data->path[cpu]))
goto err;
}
- /*
- * In theory nothing prevent tracers in a trace session from being
- * associated with different sinks, nor having a sink per tracer. But
- * until we have HW with this kind of topology and a way to convey
- * sink assignement from the perf cmd line we need to assume tracers
- * in a trace session are using the same sink. Therefore pick the sink
- * found at the end of the first available path.
- */
- cpu = cpumask_first(mask);
- /* Grab the sink at the end of the path */
- sink = coresight_get_sink(event_data->path[cpu]);
- if (!sink)
- goto err;
-
if (!sink_ops(sink)->alloc_buffer)
goto err;
+ cpu = cpumask_first(mask);
/* Get the AUX specific data from the sink buffer */
event_data->snk_config =
sink_ops(sink)->alloc_buffer(sink, cpu, pages,
diff --git a/drivers/hwtracing/coresight/coresight-etm4x.c b/drivers/hwtracing/coresight/coresight-etm4x.c
index 4db8d6a4d0cb..83358ebad762 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x.c
@@ -1026,7 +1026,8 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id)
}
pm_runtime_put(&adev->dev);
- dev_info(dev, "%s initialized\n", (char *)id->data);
+ dev_info(dev, "CPU%d: %s initialized\n",
+ drvdata->cpu, (char *)id->data);
if (boot_enable) {
coresight_enable(drvdata->csdev);
@@ -1045,20 +1046,25 @@ err_arch_supported:
}
static struct amba_id etm4_ids[] = {
- { /* ETM 4.0 - Cortex-A53 */
+ {
.id = 0x000bb95d,
.mask = 0x000fffff,
- .data = "ETM 4.0",
+ .data = "Cortex-A53 ETM v4.0",
},
- { /* ETM 4.0 - Cortex-A57 */
+ {
.id = 0x000bb95e,
.mask = 0x000fffff,
- .data = "ETM 4.0",
+ .data = "Cortex-A57 ETM v4.0",
},
- { /* ETM 4.0 - A72, Maia, HiSilicon */
+ {
.id = 0x000bb95a,
.mask = 0x000fffff,
- .data = "ETM 4.0",
+ .data = "Cortex-A72 ETM v4.0",
+ },
+ {
+ .id = 0x000bb959,
+ .mask = 0x000fffff,
+ .data = "Cortex-A73 ETM v4.0",
},
{ 0, 0},
};
diff --git a/drivers/hwtracing/coresight/coresight-priv.h b/drivers/hwtracing/coresight/coresight-priv.h
index 196a14be4b3d..ef9d8e93e3b2 100644
--- a/drivers/hwtracing/coresight/coresight-priv.h
+++ b/drivers/hwtracing/coresight/coresight-priv.h
@@ -111,7 +111,9 @@ static inline void CS_UNLOCK(void __iomem *addr)
void coresight_disable_path(struct list_head *path);
int coresight_enable_path(struct list_head *path, u32 mode);
struct coresight_device *coresight_get_sink(struct list_head *path);
-struct list_head *coresight_build_path(struct coresight_device *csdev);
+struct coresight_device *coresight_get_enabled_sink(bool reset);
+struct list_head *coresight_build_path(struct coresight_device *csdev,
+ struct coresight_device *sink);
void coresight_release_path(struct list_head *path);
#ifdef CONFIG_CORESIGHT_SOURCE_ETM3X
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etf.c b/drivers/hwtracing/coresight/coresight-tmc-etf.c
index d6941ea24d8d..1549436e2492 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etf.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etf.c
@@ -70,7 +70,7 @@ static void tmc_etb_disable_hw(struct tmc_drvdata *drvdata)
* When operating in sysFS mode the content of the buffer needs to be
* read before the TMC is disabled.
*/
- if (local_read(&drvdata->mode) == CS_MODE_SYSFS)
+ if (drvdata->mode == CS_MODE_SYSFS)
tmc_etb_dump_hw(drvdata);
tmc_disable_hw(drvdata);
@@ -103,19 +103,14 @@ static void tmc_etf_disable_hw(struct tmc_drvdata *drvdata)
CS_LOCK(drvdata->base);
}
-static int tmc_enable_etf_sink_sysfs(struct coresight_device *csdev, u32 mode)
+static int tmc_enable_etf_sink_sysfs(struct coresight_device *csdev)
{
int ret = 0;
bool used = false;
char *buf = NULL;
- long val;
unsigned long flags;
struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
- /* This shouldn't be happening */
- if (WARN_ON(mode != CS_MODE_SYSFS))
- return -EINVAL;
-
/*
* If we don't have a buffer release the lock and allocate memory.
* Otherwise keep the lock and move along.
@@ -138,13 +133,12 @@ static int tmc_enable_etf_sink_sysfs(struct coresight_device *csdev, u32 mode)
goto out;
}
- val = local_xchg(&drvdata->mode, mode);
/*
* In sysFS mode we can have multiple writers per sink. Since this
* sink is already enabled no memory is needed and the HW need not be
* touched.
*/
- if (val == CS_MODE_SYSFS)
+ if (drvdata->mode == CS_MODE_SYSFS)
goto out;
/*
@@ -163,6 +157,7 @@ static int tmc_enable_etf_sink_sysfs(struct coresight_device *csdev, u32 mode)
drvdata->buf = buf;
}
+ drvdata->mode = CS_MODE_SYSFS;
tmc_etb_enable_hw(drvdata);
out:
spin_unlock_irqrestore(&drvdata->spinlock, flags);
@@ -177,34 +172,29 @@ out:
return ret;
}
-static int tmc_enable_etf_sink_perf(struct coresight_device *csdev, u32 mode)
+static int tmc_enable_etf_sink_perf(struct coresight_device *csdev)
{
int ret = 0;
- long val;
unsigned long flags;
struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
- /* This shouldn't be happening */
- if (WARN_ON(mode != CS_MODE_PERF))
- return -EINVAL;
-
spin_lock_irqsave(&drvdata->spinlock, flags);
if (drvdata->reading) {
ret = -EINVAL;
goto out;
}
- val = local_xchg(&drvdata->mode, mode);
/*
* In Perf mode there can be only one writer per sink. There
* is also no need to continue if the ETB/ETR is already operated
* from sysFS.
*/
- if (val != CS_MODE_DISABLED) {
+ if (drvdata->mode != CS_MODE_DISABLED) {
ret = -EINVAL;
goto out;
}
+ drvdata->mode = CS_MODE_PERF;
tmc_etb_enable_hw(drvdata);
out:
spin_unlock_irqrestore(&drvdata->spinlock, flags);
@@ -216,9 +206,9 @@ static int tmc_enable_etf_sink(struct coresight_device *csdev, u32 mode)
{
switch (mode) {
case CS_MODE_SYSFS:
- return tmc_enable_etf_sink_sysfs(csdev, mode);
+ return tmc_enable_etf_sink_sysfs(csdev);
case CS_MODE_PERF:
- return tmc_enable_etf_sink_perf(csdev, mode);
+ return tmc_enable_etf_sink_perf(csdev);
}
/* We shouldn't be here */
@@ -227,7 +217,6 @@ static int tmc_enable_etf_sink(struct coresight_device *csdev, u32 mode)
static void tmc_disable_etf_sink(struct coresight_device *csdev)
{
- long val;
unsigned long flags;
struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
@@ -237,10 +226,11 @@ static void tmc_disable_etf_sink(struct coresight_device *csdev)
return;
}
- val = local_xchg(&drvdata->mode, CS_MODE_DISABLED);
/* Disable the TMC only if it needs to */
- if (val != CS_MODE_DISABLED)
+ if (drvdata->mode != CS_MODE_DISABLED) {
tmc_etb_disable_hw(drvdata);
+ drvdata->mode = CS_MODE_DISABLED;
+ }
spin_unlock_irqrestore(&drvdata->spinlock, flags);
@@ -260,7 +250,7 @@ static int tmc_enable_etf_link(struct coresight_device *csdev,
}
tmc_etf_enable_hw(drvdata);
- local_set(&drvdata->mode, CS_MODE_SYSFS);
+ drvdata->mode = CS_MODE_SYSFS;
spin_unlock_irqrestore(&drvdata->spinlock, flags);
dev_info(drvdata->dev, "TMC-ETF enabled\n");
@@ -280,7 +270,7 @@ static void tmc_disable_etf_link(struct coresight_device *csdev,
}
tmc_etf_disable_hw(drvdata);
- local_set(&drvdata->mode, CS_MODE_DISABLED);
+ drvdata->mode = CS_MODE_DISABLED;
spin_unlock_irqrestore(&drvdata->spinlock, flags);
dev_info(drvdata->dev, "TMC disabled\n");
@@ -383,7 +373,7 @@ static void tmc_update_etf_buffer(struct coresight_device *csdev,
return;
/* This shouldn't happen */
- if (WARN_ON_ONCE(local_read(&drvdata->mode) != CS_MODE_PERF))
+ if (WARN_ON_ONCE(drvdata->mode != CS_MODE_PERF))
return;
CS_UNLOCK(drvdata->base);
@@ -504,7 +494,6 @@ const struct coresight_ops tmc_etf_cs_ops = {
int tmc_read_prepare_etb(struct tmc_drvdata *drvdata)
{
- long val;
enum tmc_mode mode;
int ret = 0;
unsigned long flags;
@@ -528,9 +517,8 @@ int tmc_read_prepare_etb(struct tmc_drvdata *drvdata)
goto out;
}
- val = local_read(&drvdata->mode);
/* Don't interfere if operated from Perf */
- if (val == CS_MODE_PERF) {
+ if (drvdata->mode == CS_MODE_PERF) {
ret = -EINVAL;
goto out;
}
@@ -542,7 +530,7 @@ int tmc_read_prepare_etb(struct tmc_drvdata *drvdata)
}
/* Disable the TMC if need be */
- if (val == CS_MODE_SYSFS)
+ if (drvdata->mode == CS_MODE_SYSFS)
tmc_etb_disable_hw(drvdata);
drvdata->reading = true;
@@ -573,7 +561,7 @@ int tmc_read_unprepare_etb(struct tmc_drvdata *drvdata)
}
/* Re-enable the TMC if need be */
- if (local_read(&drvdata->mode) == CS_MODE_SYSFS) {
+ if (drvdata->mode == CS_MODE_SYSFS) {
/*
* The trace run will continue with the same allocated trace
* buffer. As such zero-out the buffer so that we don't end
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index 886ea83c68e0..2db48576be5b 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -15,11 +15,30 @@
* this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include <linux/circ_buf.h>
#include <linux/coresight.h>
#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+
#include "coresight-priv.h"
#include "coresight-tmc.h"
+/**
+ * struct cs_etr_buffer - keep track of a recording session' specifics
+ * @tmc: generic portion of the TMC buffers
+ * @paddr: the physical address of a DMA'able contiguous memory area
+ * @vaddr: the virtual address associated to @paddr
+ * @size: how much memory we have, starting at @paddr
+ * @dev: the device @vaddr has been tied to
+ */
+struct cs_etr_buffers {
+ struct cs_buffers tmc;
+ dma_addr_t paddr;
+ void __iomem *vaddr;
+ u32 size;
+ struct device *dev;
+};
+
static void tmc_etr_enable_hw(struct tmc_drvdata *drvdata)
{
u32 axictl;
@@ -86,26 +105,22 @@ static void tmc_etr_disable_hw(struct tmc_drvdata *drvdata)
* When operating in sysFS mode the content of the buffer needs to be
* read before the TMC is disabled.
*/
- if (local_read(&drvdata->mode) == CS_MODE_SYSFS)
+ if (drvdata->mode == CS_MODE_SYSFS)
tmc_etr_dump_hw(drvdata);
tmc_disable_hw(drvdata);
CS_LOCK(drvdata->base);
}
-static int tmc_enable_etr_sink_sysfs(struct coresight_device *csdev, u32 mode)
+static int tmc_enable_etr_sink_sysfs(struct coresight_device *csdev)
{
int ret = 0;
bool used = false;
- long val;
unsigned long flags;
void __iomem *vaddr = NULL;
dma_addr_t paddr;
struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
- /* This shouldn't be happening */
- if (WARN_ON(mode != CS_MODE_SYSFS))
- return -EINVAL;
/*
* If we don't have a buffer release the lock and allocate memory.
@@ -134,13 +149,12 @@ static int tmc_enable_etr_sink_sysfs(struct coresight_device *csdev, u32 mode)
goto out;
}
- val = local_xchg(&drvdata->mode, mode);
/*
* In sysFS mode we can have multiple writers per sink. Since this
* sink is already enabled no memory is needed and the HW need not be
* touched.
*/
- if (val == CS_MODE_SYSFS)
+ if (drvdata->mode == CS_MODE_SYSFS)
goto out;
/*
@@ -155,8 +169,7 @@ static int tmc_enable_etr_sink_sysfs(struct coresight_device *csdev, u32 mode)
drvdata->buf = drvdata->vaddr;
}
- memset(drvdata->vaddr, 0, drvdata->size);
-
+ drvdata->mode = CS_MODE_SYSFS;
tmc_etr_enable_hw(drvdata);
out:
spin_unlock_irqrestore(&drvdata->spinlock, flags);
@@ -171,34 +184,29 @@ out:
return ret;
}
-static int tmc_enable_etr_sink_perf(struct coresight_device *csdev, u32 mode)
+static int tmc_enable_etr_sink_perf(struct coresight_device *csdev)
{
int ret = 0;
- long val;
unsigned long flags;
struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
- /* This shouldn't be happening */
- if (WARN_ON(mode != CS_MODE_PERF))
- return -EINVAL;
-
spin_lock_irqsave(&drvdata->spinlock, flags);
if (drvdata->reading) {
ret = -EINVAL;
goto out;
}
- val = local_xchg(&drvdata->mode, mode);
/*
* In Perf mode there can be only one writer per sink. There
* is also no need to continue if the ETR is already operated
* from sysFS.
*/
- if (val != CS_MODE_DISABLED) {
+ if (drvdata->mode != CS_MODE_DISABLED) {
ret = -EINVAL;
goto out;
}
+ drvdata->mode = CS_MODE_PERF;
tmc_etr_enable_hw(drvdata);
out:
spin_unlock_irqrestore(&drvdata->spinlock, flags);
@@ -210,9 +218,9 @@ static int tmc_enable_etr_sink(struct coresight_device *csdev, u32 mode)
{
switch (mode) {
case CS_MODE_SYSFS:
- return tmc_enable_etr_sink_sysfs(csdev, mode);
+ return tmc_enable_etr_sink_sysfs(csdev);
case CS_MODE_PERF:
- return tmc_enable_etr_sink_perf(csdev, mode);
+ return tmc_enable_etr_sink_perf(csdev);
}
/* We shouldn't be here */
@@ -221,7 +229,6 @@ static int tmc_enable_etr_sink(struct coresight_device *csdev, u32 mode)
static void tmc_disable_etr_sink(struct coresight_device *csdev)
{
- long val;
unsigned long flags;
struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
@@ -231,19 +238,244 @@ static void tmc_disable_etr_sink(struct coresight_device *csdev)
return;
}
- val = local_xchg(&drvdata->mode, CS_MODE_DISABLED);
/* Disable the TMC only if it needs to */
- if (val != CS_MODE_DISABLED)
+ if (drvdata->mode != CS_MODE_DISABLED) {
tmc_etr_disable_hw(drvdata);
+ drvdata->mode = CS_MODE_DISABLED;
+ }
spin_unlock_irqrestore(&drvdata->spinlock, flags);
dev_info(drvdata->dev, "TMC-ETR disabled\n");
}
+static void *tmc_alloc_etr_buffer(struct coresight_device *csdev, int cpu,
+ void **pages, int nr_pages, bool overwrite)
+{
+ int node;
+ struct cs_etr_buffers *buf;
+ struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+ if (cpu == -1)
+ cpu = smp_processor_id();
+ node = cpu_to_node(cpu);
+
+ /* Allocate memory structure for interaction with Perf */
+ buf = kzalloc_node(sizeof(struct cs_etr_buffers), GFP_KERNEL, node);
+ if (!buf)
+ return NULL;
+
+ buf->dev = drvdata->dev;
+ buf->size = drvdata->size;
+ buf->vaddr = dma_alloc_coherent(buf->dev, buf->size,
+ &buf->paddr, GFP_KERNEL);
+ if (!buf->vaddr) {
+ kfree(buf);
+ return NULL;
+ }
+
+ buf->tmc.snapshot = overwrite;
+ buf->tmc.nr_pages = nr_pages;
+ buf->tmc.data_pages = pages;
+
+ return buf;
+}
+
+static void tmc_free_etr_buffer(void *config)
+{
+ struct cs_etr_buffers *buf = config;
+
+ dma_free_coherent(buf->dev, buf->size, buf->vaddr, buf->paddr);
+ kfree(buf);
+}
+
+static int tmc_set_etr_buffer(struct coresight_device *csdev,
+ struct perf_output_handle *handle,
+ void *sink_config)
+{
+ int ret = 0;
+ unsigned long head;
+ struct cs_etr_buffers *buf = sink_config;
+ struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+ /* wrap head around to the amount of space we have */
+ head = handle->head & ((buf->tmc.nr_pages << PAGE_SHIFT) - 1);
+
+ /* find the page to write to */
+ buf->tmc.cur = head / PAGE_SIZE;
+
+ /* and offset within that page */
+ buf->tmc.offset = head % PAGE_SIZE;
+
+ local_set(&buf->tmc.data_size, 0);
+
+ /* Tell the HW where to put the trace data */
+ drvdata->vaddr = buf->vaddr;
+ drvdata->paddr = buf->paddr;
+ memset(drvdata->vaddr, 0, drvdata->size);
+
+ return ret;
+}
+
+static unsigned long tmc_reset_etr_buffer(struct coresight_device *csdev,
+ struct perf_output_handle *handle,
+ void *sink_config, bool *lost)
+{
+ long size = 0;
+ struct cs_etr_buffers *buf = sink_config;
+ struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+ if (buf) {
+ /*
+ * In snapshot mode ->data_size holds the new address of the
+ * ring buffer's head. The size itself is the whole address
+ * range since we want the latest information.
+ */
+ if (buf->tmc.snapshot) {
+ size = buf->tmc.nr_pages << PAGE_SHIFT;
+ handle->head = local_xchg(&buf->tmc.data_size, size);
+ }
+
+ /*
+ * Tell the tracer PMU how much we got in this run and if
+ * something went wrong along the way. Nobody else can use
+ * this cs_etr_buffers instance until we are done. As such
+ * resetting parameters here and squaring off with the ring
+ * buffer API in the tracer PMU is fine.
+ */
+ *lost = !!local_xchg(&buf->tmc.lost, 0);
+ size = local_xchg(&buf->tmc.data_size, 0);
+ }
+
+ /* Get ready for another run */
+ drvdata->vaddr = NULL;
+ drvdata->paddr = 0;
+
+ return size;
+}
+
+static void tmc_update_etr_buffer(struct coresight_device *csdev,
+ struct perf_output_handle *handle,
+ void *sink_config)
+{
+ int i, cur;
+ u32 *buf_ptr;
+ u32 read_ptr, write_ptr;
+ u32 status, to_read;
+ unsigned long offset;
+ struct cs_buffers *buf = sink_config;
+ struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+ if (!buf)
+ return;
+
+ /* This shouldn't happen */
+ if (WARN_ON_ONCE(drvdata->mode != CS_MODE_PERF))
+ return;
+
+ CS_UNLOCK(drvdata->base);
+
+ tmc_flush_and_stop(drvdata);
+
+ read_ptr = readl_relaxed(drvdata->base + TMC_RRP);
+ write_ptr = readl_relaxed(drvdata->base + TMC_RWP);
+
+ /*
+ * Get a hold of the status register and see if a wrap around
+ * has occurred. If so adjust things accordingly.
+ */
+ status = readl_relaxed(drvdata->base + TMC_STS);
+ if (status & TMC_STS_FULL) {
+ local_inc(&buf->lost);
+ to_read = drvdata->size;
+ } else {
+ to_read = CIRC_CNT(write_ptr, read_ptr, drvdata->size);
+ }
+
+ /*
+ * The TMC RAM buffer may be bigger than the space available in the
+ * perf ring buffer (handle->size). If so advance the RRP so that we
+ * get the latest trace data.
+ */
+ if (to_read > handle->size) {
+ u32 buffer_start, mask = 0;
+
+ /* Read buffer start address in system memory */
+ buffer_start = readl_relaxed(drvdata->base + TMC_DBALO);
+
+ /*
+ * The value written to RRP must be byte-address aligned to
+ * the width of the trace memory databus _and_ to a frame
+ * boundary (16 byte), whichever is the biggest. For example,
+ * for 32-bit, 64-bit and 128-bit wide trace memory, the four
+ * LSBs must be 0s. For 256-bit wide trace memory, the five
+ * LSBs must be 0s.
+ */
+ switch (drvdata->memwidth) {
+ case TMC_MEM_INTF_WIDTH_32BITS:
+ case TMC_MEM_INTF_WIDTH_64BITS:
+ case TMC_MEM_INTF_WIDTH_128BITS:
+ mask = GENMASK(31, 5);
+ break;
+ case TMC_MEM_INTF_WIDTH_256BITS:
+ mask = GENMASK(31, 6);
+ break;
+ }
+
+ /*
+ * Make sure the new size is aligned in accordance with the
+ * requirement explained above.
+ */
+ to_read = handle->size & mask;
+ /* Move the RAM read pointer up */
+ read_ptr = (write_ptr + drvdata->size) - to_read;
+ /* Make sure we are still within our limits */
+ if (read_ptr > (buffer_start + (drvdata->size - 1)))
+ read_ptr -= drvdata->size;
+ /* Tell the HW */
+ writel_relaxed(read_ptr, drvdata->base + TMC_RRP);
+ local_inc(&buf->lost);
+ }
+
+ cur = buf->cur;
+ offset = buf->offset;
+
+ /* for every byte to read */
+ for (i = 0; i < to_read; i += 4) {
+ buf_ptr = buf->data_pages[cur] + offset;
+ *buf_ptr = readl_relaxed(drvdata->base + TMC_RRD);
+
+ offset += 4;
+ if (offset >= PAGE_SIZE) {
+ offset = 0;
+ cur++;
+ /* wrap around at the end of the buffer */
+ cur &= buf->nr_pages - 1;
+ }
+ }
+
+ /*
+ * In snapshot mode all we have to do is communicate to
+ * perf_aux_output_end() the address of the current head. In full
+ * trace mode the same function expects a size to move rb->aux_head
+ * forward.
+ */
+ if (buf->snapshot)
+ local_set(&buf->data_size, (cur * PAGE_SIZE) + offset);
+ else
+ local_add(to_read, &buf->data_size);
+
+ CS_LOCK(drvdata->base);
+}
+
static const struct coresight_ops_sink tmc_etr_sink_ops = {
.enable = tmc_enable_etr_sink,
.disable = tmc_disable_etr_sink,
+ .alloc_buffer = tmc_alloc_etr_buffer,
+ .free_buffer = tmc_free_etr_buffer,
+ .set_buffer = tmc_set_etr_buffer,
+ .reset_buffer = tmc_reset_etr_buffer,
+ .update_buffer = tmc_update_etr_buffer,
};
const struct coresight_ops tmc_etr_cs_ops = {
@@ -253,7 +485,6 @@ const struct coresight_ops tmc_etr_cs_ops = {
int tmc_read_prepare_etr(struct tmc_drvdata *drvdata)
{
int ret = 0;
- long val;
unsigned long flags;
/* config types are set a boot time and never change */
@@ -266,9 +497,8 @@ int tmc_read_prepare_etr(struct tmc_drvdata *drvdata)
goto out;
}
- val = local_read(&drvdata->mode);
/* Don't interfere if operated from Perf */
- if (val == CS_MODE_PERF) {
+ if (drvdata->mode == CS_MODE_PERF) {
ret = -EINVAL;
goto out;
}
@@ -280,7 +510,7 @@ int tmc_read_prepare_etr(struct tmc_drvdata *drvdata)
}
/* Disable the TMC if need be */
- if (val == CS_MODE_SYSFS)
+ if (drvdata->mode == CS_MODE_SYSFS)
tmc_etr_disable_hw(drvdata);
drvdata->reading = true;
@@ -303,7 +533,7 @@ int tmc_read_unprepare_etr(struct tmc_drvdata *drvdata)
spin_lock_irqsave(&drvdata->spinlock, flags);
/* RE-enable the TMC if need be */
- if (local_read(&drvdata->mode) == CS_MODE_SYSFS) {
+ if (drvdata->mode == CS_MODE_SYSFS) {
/*
* The trace run will continue with the same allocated trace
* buffer. The trace buffer is cleared in tmc_etr_enable_hw(),
diff --git a/drivers/hwtracing/coresight/coresight-tmc.h b/drivers/hwtracing/coresight/coresight-tmc.h
index 44b3ae346118..51c01851533e 100644
--- a/drivers/hwtracing/coresight/coresight-tmc.h
+++ b/drivers/hwtracing/coresight/coresight-tmc.h
@@ -117,7 +117,7 @@ struct tmc_drvdata {
void __iomem *vaddr;
u32 size;
u32 len;
- local_t mode;
+ u32 mode;
enum tmc_config_type config_type;
enum tmc_mem_intf_width memwidth;
u32 trigger_cntr;
diff --git a/drivers/hwtracing/coresight/coresight.c b/drivers/hwtracing/coresight/coresight.c
index 398e44a9ec45..cd41f401e8b2 100644
--- a/drivers/hwtracing/coresight/coresight.c
+++ b/drivers/hwtracing/coresight/coresight.c
@@ -371,6 +371,52 @@ struct coresight_device *coresight_get_sink(struct list_head *path)
return csdev;
}
+static int coresight_enabled_sink(struct device *dev, void *data)
+{
+ bool *reset = data;
+ struct coresight_device *csdev = to_coresight_device(dev);
+
+ if ((csdev->type == CORESIGHT_DEV_TYPE_SINK ||
+ csdev->type == CORESIGHT_DEV_TYPE_LINKSINK) &&
+ csdev->activated) {
+ /*
+ * Now that we have a handle on the sink for this session,
+ * disable the sysFS "enable_sink" flag so that possible
+ * concurrent perf session that wish to use another sink don't
+ * trip on it. Doing so has no ramification for the current
+ * session.
+ */
+ if (*reset)
+ csdev->activated = false;
+
+ return 1;
+ }
+
+ return 0;
+}
+
+/**
+ * coresight_get_enabled_sink - returns the first enabled sink found on the bus
+ * @deactivate: Whether the 'enable_sink' flag should be reset
+ *
+ * When operated from perf the deactivate parameter should be set to 'true'.
+ * That way the "enabled_sink" flag of the sink that was selected can be reset,
+ * allowing for other concurrent perf sessions to choose a different sink.
+ *
+ * When operated from sysFS users have full control and as such the deactivate
+ * parameter should be set to 'false', hence mandating users to explicitly
+ * clear the flag.
+ */
+struct coresight_device *coresight_get_enabled_sink(bool deactivate)
+{
+ struct device *dev = NULL;
+
+ dev = bus_find_device(&coresight_bustype, NULL, &deactivate,
+ coresight_enabled_sink);
+
+ return dev ? to_coresight_device(dev) : NULL;
+}
+
/**
* _coresight_build_path - recursively build a path from a @csdev to a sink.
* @csdev: The device to start from.
@@ -383,6 +429,7 @@ struct coresight_device *coresight_get_sink(struct list_head *path)
* last one.
*/
static int _coresight_build_path(struct coresight_device *csdev,
+ struct coresight_device *sink,
struct list_head *path)
{
int i;
@@ -390,15 +437,15 @@ static int _coresight_build_path(struct coresight_device *csdev,
struct coresight_node *node;
/* An activated sink has been found. Enqueue the element */
- if ((csdev->type == CORESIGHT_DEV_TYPE_SINK ||
- csdev->type == CORESIGHT_DEV_TYPE_LINKSINK) && csdev->activated)
+ if (csdev == sink)
goto out;
/* Not a sink - recursively explore each port found on this element */
for (i = 0; i < csdev->nr_outport; i++) {
struct coresight_device *child_dev = csdev->conns[i].child_dev;
- if (child_dev && _coresight_build_path(child_dev, path) == 0) {
+ if (child_dev &&
+ _coresight_build_path(child_dev, sink, path) == 0) {
found = true;
break;
}
@@ -425,18 +472,22 @@ out:
return 0;
}
-struct list_head *coresight_build_path(struct coresight_device *csdev)
+struct list_head *coresight_build_path(struct coresight_device *source,
+ struct coresight_device *sink)
{
struct list_head *path;
int rc;
+ if (!sink)
+ return ERR_PTR(-EINVAL);
+
path = kzalloc(sizeof(struct list_head), GFP_KERNEL);
if (!path)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(path);
- rc = _coresight_build_path(csdev, path);
+ rc = _coresight_build_path(source, sink, path);
if (rc) {
kfree(path);
return ERR_PTR(rc);
@@ -500,6 +551,7 @@ static int coresight_validate_source(struct coresight_device *csdev,
int coresight_enable(struct coresight_device *csdev)
{
int cpu, ret = 0;
+ struct coresight_device *sink;
struct list_head *path;
enum coresight_dev_subtype_source subtype;
@@ -522,7 +574,17 @@ int coresight_enable(struct coresight_device *csdev)
goto out;
}
- path = coresight_build_path(csdev);
+ /*
+ * Search for a valid sink for this session but don't reset the
+ * "enable_sink" flag in sysFS. Users get to do that explicitly.
+ */
+ sink = coresight_get_enabled_sink(false);
+ if (!sink) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ path = coresight_build_path(csdev, sink);
if (IS_ERR(path)) {
pr_err("building path(s) failed\n");
ret = PTR_ERR(path);
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index d127ace6aa57..6ee866fcc5dd 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -244,7 +244,7 @@ struct chs_geom {
static unsigned int ide_disks;
static struct chs_geom ide_disks_chs[MAX_HWIFS * MAX_DRIVES];
-static int ide_set_disk_chs(const char *str, struct kernel_param *kp)
+static int ide_set_disk_chs(const char *str, const struct kernel_param *kp)
{
unsigned int a, b, c = 0, h = 0, s = 0, i, j = 1;
@@ -328,7 +328,7 @@ static void ide_dev_apply_params(ide_drive_t *drive, u8 unit)
static unsigned int ide_ignore_cable;
-static int ide_set_ignore_cable(const char *s, struct kernel_param *kp)
+static int ide_set_ignore_cable(const char *s, const struct kernel_param *kp)
{
int i, j = 1;
diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c
index cedb447d0162..228cb4c70363 100644
--- a/drivers/infiniband/hw/qib/qib_iba7322.c
+++ b/drivers/infiniband/hw/qib/qib_iba7322.c
@@ -150,7 +150,7 @@ static struct kparam_string kp_txselect = {
.string = txselect_list,
.maxlen = MAX_ATTEN_LEN
};
-static int setup_txselect(const char *, struct kernel_param *);
+static int setup_txselect(const char *, const struct kernel_param *);
module_param_call(txselect, setup_txselect, param_get_string,
&kp_txselect, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(txselect,
@@ -6177,7 +6177,7 @@ static void set_no_qsfp_atten(struct qib_devdata *dd, int change)
}
/* handle the txselect parameter changing */
-static int setup_txselect(const char *str, struct kernel_param *kp)
+static int setup_txselect(const char *str, const struct kernel_param *kp)
{
struct qib_devdata *dd;
unsigned long val;
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index 2a44a2c3e859..1c206df10a20 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -80,7 +80,7 @@ module_param(srpt_srq_size, int, 0444);
MODULE_PARM_DESC(srpt_srq_size,
"Shared receive queue (SRQ) size.");
-static int srpt_get_u64_x(char *buffer, struct kernel_param *kp)
+static int srpt_get_u64_x(char *buffer, const struct kernel_param *kp)
{
return sprintf(buffer, "0x%016llx", *(u64 *)kp->arg);
}
diff --git a/drivers/input/Kconfig b/drivers/input/Kconfig
index 6261874c07c9..34ffa0257b4b 100644
--- a/drivers/input/Kconfig
+++ b/drivers/input/Kconfig
@@ -187,6 +187,19 @@ config INPUT_APMPOWER
To compile this driver as a module, choose M here: the
module will be called apm-power.
+config INPUT_KEYRESET
+ bool "Reset key"
+ depends on INPUT
+ select INPUT_KEYCOMBO
+ ---help---
+ Say Y here if you want to reboot when some keys are pressed;
+
+config INPUT_KEYCOMBO
+ bool "Key combo"
+ depends on INPUT
+ ---help---
+ Say Y here if you want to take action when some keys are pressed;
+
comment "Input Device Drivers"
source "drivers/input/keyboard/Kconfig"
diff --git a/drivers/input/Makefile b/drivers/input/Makefile
index 595820bbabe9..6a3281ca3306 100644
--- a/drivers/input/Makefile
+++ b/drivers/input/Makefile
@@ -26,5 +26,7 @@ obj-$(CONFIG_INPUT_TOUCHSCREEN) += touchscreen/
obj-$(CONFIG_INPUT_MISC) += misc/
obj-$(CONFIG_INPUT_APMPOWER) += apm-power.o
+obj-$(CONFIG_INPUT_KEYRESET) += keyreset.o
+obj-$(CONFIG_INPUT_KEYCOMBO) += keycombo.o
obj-$(CONFIG_RMI4_CORE) += rmi4/
diff --git a/drivers/input/keyboard/goldfish_events.c b/drivers/input/keyboard/goldfish_events.c
index f6e643b589b6..c877e56a9bd5 100644
--- a/drivers/input/keyboard/goldfish_events.c
+++ b/drivers/input/keyboard/goldfish_events.c
@@ -17,6 +17,7 @@
#include <linux/interrupt.h>
#include <linux/types.h>
#include <linux/input.h>
+#include <linux/input/mt.h>
#include <linux/kernel.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
@@ -24,6 +25,8 @@
#include <linux/io.h>
#include <linux/acpi.h>
+#define GOLDFISH_MAX_FINGERS 5
+
enum {
REG_READ = 0x00,
REG_SET_PAGE = 0x00,
@@ -52,7 +55,21 @@ static irqreturn_t events_interrupt(int irq, void *dev_id)
value = __raw_readl(edev->addr + REG_READ);
input_event(edev->input, type, code, value);
- input_sync(edev->input);
+ // Send an extra (EV_SYN, SYN_REPORT, 0x0) event
+ // if a key was pressed. Some keyboard device
+ // drivers may only send the EV_KEY event and
+ // not EV_SYN.
+ // Note that sending an extra SYN_REPORT is not
+ // necessary nor correct protocol with other
+ // devices such as touchscreens, which will send
+ // their own SYN_REPORT's when sufficient event
+ // information has been collected (e.g., for
+ // touchscreens, when pressure and X/Y coordinates
+ // have been received). Hence, we will only send
+ // this extra SYN_REPORT if type == EV_KEY.
+ if (type == EV_KEY) {
+ input_sync(edev->input);
+ }
return IRQ_HANDLED;
}
@@ -154,6 +171,15 @@ static int events_probe(struct platform_device *pdev)
input_dev->name = edev->name;
input_dev->id.bustype = BUS_HOST;
+ // Set the Goldfish Device to be multi-touch.
+ // In the Ranchu kernel, there is multi-touch-specific
+ // code for handling ABS_MT_SLOT events.
+ // See drivers/input/input.c:input_handle_abs_event.
+ // If we do not issue input_mt_init_slots,
+ // the kernel will filter out needed ABS_MT_SLOT
+ // events when we touch the screen in more than one place,
+ // preventing multi-touch with more than one finger from working.
+ input_mt_init_slots(input_dev, GOLDFISH_MAX_FINGERS, 0);
events_import_bits(edev, input_dev->evbit, EV_SYN, EV_MAX);
events_import_bits(edev, input_dev->keybit, EV_KEY, KEY_MAX);
diff --git a/drivers/input/keycombo.c b/drivers/input/keycombo.c
new file mode 100644
index 000000000000..2fba451b91d5
--- /dev/null
+++ b/drivers/input/keycombo.c
@@ -0,0 +1,261 @@
+/* drivers/input/keycombo.c
+ *
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/input.h>
+#include <linux/keycombo.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+struct keycombo_state {
+ struct input_handler input_handler;
+ unsigned long keybit[BITS_TO_LONGS(KEY_CNT)];
+ unsigned long upbit[BITS_TO_LONGS(KEY_CNT)];
+ unsigned long key[BITS_TO_LONGS(KEY_CNT)];
+ spinlock_t lock;
+ struct workqueue_struct *wq;
+ int key_down_target;
+ int key_down;
+ int key_up;
+ struct delayed_work key_down_work;
+ int delay;
+ struct work_struct key_up_work;
+ void (*key_up_fn)(void *);
+ void (*key_down_fn)(void *);
+ void *priv;
+ int key_is_down;
+ struct wakeup_source combo_held_wake_source;
+ struct wakeup_source combo_up_wake_source;
+};
+
+static void do_key_down(struct work_struct *work)
+{
+ struct delayed_work *dwork = container_of(work, struct delayed_work,
+ work);
+ struct keycombo_state *state = container_of(dwork,
+ struct keycombo_state, key_down_work);
+ if (state->key_down_fn)
+ state->key_down_fn(state->priv);
+}
+
+static void do_key_up(struct work_struct *work)
+{
+ struct keycombo_state *state = container_of(work, struct keycombo_state,
+ key_up_work);
+ if (state->key_up_fn)
+ state->key_up_fn(state->priv);
+ __pm_relax(&state->combo_up_wake_source);
+}
+
+static void keycombo_event(struct input_handle *handle, unsigned int type,
+ unsigned int code, int value)
+{
+ unsigned long flags;
+ struct keycombo_state *state = handle->private;
+
+ if (type != EV_KEY)
+ return;
+
+ if (code >= KEY_MAX)
+ return;
+
+ if (!test_bit(code, state->keybit))
+ return;
+
+ spin_lock_irqsave(&state->lock, flags);
+ if (!test_bit(code, state->key) == !value)
+ goto done;
+ __change_bit(code, state->key);
+ if (test_bit(code, state->upbit)) {
+ if (value)
+ state->key_up++;
+ else
+ state->key_up--;
+ } else {
+ if (value)
+ state->key_down++;
+ else
+ state->key_down--;
+ }
+ if (state->key_down == state->key_down_target && state->key_up == 0) {
+ __pm_stay_awake(&state->combo_held_wake_source);
+ state->key_is_down = 1;
+ if (queue_delayed_work(state->wq, &state->key_down_work,
+ state->delay))
+ pr_debug("Key down work already queued!");
+ } else if (state->key_is_down) {
+ if (!cancel_delayed_work(&state->key_down_work)) {
+ __pm_stay_awake(&state->combo_up_wake_source);
+ queue_work(state->wq, &state->key_up_work);
+ }
+ __pm_relax(&state->combo_held_wake_source);
+ state->key_is_down = 0;
+ }
+done:
+ spin_unlock_irqrestore(&state->lock, flags);
+}
+
+static int keycombo_connect(struct input_handler *handler,
+ struct input_dev *dev,
+ const struct input_device_id *id)
+{
+ int i;
+ int ret;
+ struct input_handle *handle;
+ struct keycombo_state *state =
+ container_of(handler, struct keycombo_state, input_handler);
+ for (i = 0; i < KEY_MAX; i++) {
+ if (test_bit(i, state->keybit) && test_bit(i, dev->keybit))
+ break;
+ }
+ if (i == KEY_MAX)
+ return -ENODEV;
+
+ handle = kzalloc(sizeof(*handle), GFP_KERNEL);
+ if (!handle)
+ return -ENOMEM;
+
+ handle->dev = dev;
+ handle->handler = handler;
+ handle->name = KEYCOMBO_NAME;
+ handle->private = state;
+
+ ret = input_register_handle(handle);
+ if (ret)
+ goto err_input_register_handle;
+
+ ret = input_open_device(handle);
+ if (ret)
+ goto err_input_open_device;
+
+ return 0;
+
+err_input_open_device:
+ input_unregister_handle(handle);
+err_input_register_handle:
+ kfree(handle);
+ return ret;
+}
+
+static void keycombo_disconnect(struct input_handle *handle)
+{
+ input_close_device(handle);
+ input_unregister_handle(handle);
+ kfree(handle);
+}
+
+static const struct input_device_id keycombo_ids[] = {
+ {
+ .flags = INPUT_DEVICE_ID_MATCH_EVBIT,
+ .evbit = { BIT_MASK(EV_KEY) },
+ },
+ { },
+};
+MODULE_DEVICE_TABLE(input, keycombo_ids);
+
+static int keycombo_probe(struct platform_device *pdev)
+{
+ int ret;
+ int key, *keyp;
+ struct keycombo_state *state;
+ struct keycombo_platform_data *pdata = pdev->dev.platform_data;
+
+ if (!pdata)
+ return -EINVAL;
+
+ state = kzalloc(sizeof(*state), GFP_KERNEL);
+ if (!state)
+ return -ENOMEM;
+
+ spin_lock_init(&state->lock);
+ keyp = pdata->keys_down;
+ while ((key = *keyp++)) {
+ if (key >= KEY_MAX)
+ continue;
+ state->key_down_target++;
+ __set_bit(key, state->keybit);
+ }
+ if (pdata->keys_up) {
+ keyp = pdata->keys_up;
+ while ((key = *keyp++)) {
+ if (key >= KEY_MAX)
+ continue;
+ __set_bit(key, state->keybit);
+ __set_bit(key, state->upbit);
+ }
+ }
+
+ state->wq = alloc_ordered_workqueue("keycombo", 0);
+ if (!state->wq)
+ return -ENOMEM;
+
+ state->priv = pdata->priv;
+
+ if (pdata->key_down_fn)
+ state->key_down_fn = pdata->key_down_fn;
+ INIT_DELAYED_WORK(&state->key_down_work, do_key_down);
+
+ if (pdata->key_up_fn)
+ state->key_up_fn = pdata->key_up_fn;
+ INIT_WORK(&state->key_up_work, do_key_up);
+
+ wakeup_source_init(&state->combo_held_wake_source, "key combo");
+ wakeup_source_init(&state->combo_up_wake_source, "key combo up");
+ state->delay = msecs_to_jiffies(pdata->key_down_delay);
+
+ state->input_handler.event = keycombo_event;
+ state->input_handler.connect = keycombo_connect;
+ state->input_handler.disconnect = keycombo_disconnect;
+ state->input_handler.name = KEYCOMBO_NAME;
+ state->input_handler.id_table = keycombo_ids;
+ ret = input_register_handler(&state->input_handler);
+ if (ret) {
+ kfree(state);
+ return ret;
+ }
+ platform_set_drvdata(pdev, state);
+ return 0;
+}
+
+int keycombo_remove(struct platform_device *pdev)
+{
+ struct keycombo_state *state = platform_get_drvdata(pdev);
+ input_unregister_handler(&state->input_handler);
+ destroy_workqueue(state->wq);
+ kfree(state);
+ return 0;
+}
+
+
+struct platform_driver keycombo_driver = {
+ .driver.name = KEYCOMBO_NAME,
+ .probe = keycombo_probe,
+ .remove = keycombo_remove,
+};
+
+static int __init keycombo_init(void)
+{
+ return platform_driver_register(&keycombo_driver);
+}
+
+static void __exit keycombo_exit(void)
+{
+ return platform_driver_unregister(&keycombo_driver);
+}
+
+module_init(keycombo_init);
+module_exit(keycombo_exit);
diff --git a/drivers/input/keyreset.c b/drivers/input/keyreset.c
new file mode 100644
index 000000000000..7e5222aec7c1
--- /dev/null
+++ b/drivers/input/keyreset.c
@@ -0,0 +1,144 @@
+/* drivers/input/keyreset.c
+ *
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/input.h>
+#include <linux/keyreset.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/keycombo.h>
+
+struct keyreset_state {
+ int restart_requested;
+ int (*reset_fn)(void);
+ struct platform_device *pdev_child;
+ struct work_struct restart_work;
+};
+
+static void do_restart(struct work_struct *unused)
+{
+ orderly_reboot();
+}
+
+static void do_reset_fn(void *priv)
+{
+ struct keyreset_state *state = priv;
+ if (state->restart_requested)
+ panic("keyboard reset failed, %d", state->restart_requested);
+ if (state->reset_fn) {
+ state->restart_requested = state->reset_fn();
+ } else {
+ pr_info("keyboard reset\n");
+ schedule_work(&state->restart_work);
+ state->restart_requested = 1;
+ }
+}
+
+static int keyreset_probe(struct platform_device *pdev)
+{
+ int ret = -ENOMEM;
+ struct keycombo_platform_data *pdata_child;
+ struct keyreset_platform_data *pdata = pdev->dev.platform_data;
+ int up_size = 0, down_size = 0, size;
+ int key, *keyp;
+ struct keyreset_state *state;
+
+ if (!pdata)
+ return -EINVAL;
+ state = devm_kzalloc(&pdev->dev, sizeof(*state), GFP_KERNEL);
+ if (!state)
+ return -ENOMEM;
+
+ state->pdev_child = platform_device_alloc(KEYCOMBO_NAME,
+ PLATFORM_DEVID_AUTO);
+ if (!state->pdev_child)
+ return -ENOMEM;
+ state->pdev_child->dev.parent = &pdev->dev;
+ INIT_WORK(&state->restart_work, do_restart);
+
+ keyp = pdata->keys_down;
+ while ((key = *keyp++)) {
+ if (key >= KEY_MAX)
+ continue;
+ down_size++;
+ }
+ if (pdata->keys_up) {
+ keyp = pdata->keys_up;
+ while ((key = *keyp++)) {
+ if (key >= KEY_MAX)
+ continue;
+ up_size++;
+ }
+ }
+ size = sizeof(struct keycombo_platform_data)
+ + sizeof(int) * (down_size + 1);
+ pdata_child = devm_kzalloc(&pdev->dev, size, GFP_KERNEL);
+ if (!pdata_child)
+ goto error;
+ memcpy(pdata_child->keys_down, pdata->keys_down,
+ sizeof(int) * down_size);
+ if (up_size > 0) {
+ pdata_child->keys_up = devm_kzalloc(&pdev->dev, up_size + 1,
+ GFP_KERNEL);
+ if (!pdata_child->keys_up)
+ goto error;
+ memcpy(pdata_child->keys_up, pdata->keys_up,
+ sizeof(int) * up_size);
+ if (!pdata_child->keys_up)
+ goto error;
+ }
+ state->reset_fn = pdata->reset_fn;
+ pdata_child->key_down_fn = do_reset_fn;
+ pdata_child->priv = state;
+ pdata_child->key_down_delay = pdata->key_down_delay;
+ ret = platform_device_add_data(state->pdev_child, pdata_child, size);
+ if (ret)
+ goto error;
+ platform_set_drvdata(pdev, state);
+ return platform_device_add(state->pdev_child);
+error:
+ platform_device_put(state->pdev_child);
+ return ret;
+}
+
+int keyreset_remove(struct platform_device *pdev)
+{
+ struct keyreset_state *state = platform_get_drvdata(pdev);
+ platform_device_put(state->pdev_child);
+ return 0;
+}
+
+
+struct platform_driver keyreset_driver = {
+ .driver.name = KEYRESET_NAME,
+ .probe = keyreset_probe,
+ .remove = keyreset_remove,
+};
+
+static int __init keyreset_init(void)
+{
+ return platform_driver_register(&keyreset_driver);
+}
+
+static void __exit keyreset_exit(void)
+{
+ return platform_driver_unregister(&keyreset_driver);
+}
+
+module_init(keyreset_init);
+module_exit(keyreset_exit);
diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig
index 7ffb614ce566..94360fe63f41 100644
--- a/drivers/input/misc/Kconfig
+++ b/drivers/input/misc/Kconfig
@@ -367,6 +367,17 @@ config INPUT_ATI_REMOTE2
To compile this driver as a module, choose M here: the module will be
called ati_remote2.
+config INPUT_KEYCHORD
+ tristate "Key chord input driver support"
+ help
+ Say Y here if you want to enable the key chord driver
+ accessible at /dev/keychord. This driver can be used
+ for receiving notifications when client specified key
+ combinations are pressed.
+
+ To compile this driver as a module, choose M here: the
+ module will be called keychord.
+
config INPUT_KEYSPAN_REMOTE
tristate "Keyspan DMR USB remote control"
depends on USB_ARCH_HAS_HCD
@@ -535,6 +546,11 @@ config INPUT_SGI_BTNS
To compile this driver as a module, choose M here: the
module will be called sgi_btns.
+config INPUT_GPIO
+ tristate "GPIO driver support"
+ help
+ Say Y here if you want to support gpio based keys, wheels etc...
+
config HP_SDC_RTC
tristate "HP SDC Real Time Clock"
depends on (GSC || HP300) && SERIO
diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile
index 0b6d025f0487..64bf231faf8c 100644
--- a/drivers/input/misc/Makefile
+++ b/drivers/input/misc/Makefile
@@ -36,10 +36,12 @@ obj-$(CONFIG_INPUT_GP2A) += gp2ap002a00f.o
obj-$(CONFIG_INPUT_GPIO_BEEPER) += gpio-beeper.o
obj-$(CONFIG_INPUT_GPIO_TILT_POLLED) += gpio_tilt_polled.o
obj-$(CONFIG_INPUT_GPIO_DECODER) += gpio_decoder.o
+obj-$(CONFIG_INPUT_GPIO) += gpio_event.o gpio_matrix.o gpio_input.o gpio_output.o gpio_axis.o
obj-$(CONFIG_INPUT_HISI_POWERKEY) += hisi_powerkey.o
obj-$(CONFIG_HP_SDC_RTC) += hp_sdc_rtc.o
obj-$(CONFIG_INPUT_IMS_PCU) += ims-pcu.o
obj-$(CONFIG_INPUT_IXP4XX_BEEPER) += ixp4xx-beeper.o
+obj-$(CONFIG_INPUT_KEYCHORD) += keychord.o
obj-$(CONFIG_INPUT_KEYSPAN_REMOTE) += keyspan_remote.o
obj-$(CONFIG_INPUT_KXTJ9) += kxtj9.o
obj-$(CONFIG_INPUT_M68K_BEEP) += m68kspkr.o
diff --git a/drivers/input/misc/gpio_axis.c b/drivers/input/misc/gpio_axis.c
new file mode 100644
index 000000000000..0acf4a576f53
--- /dev/null
+++ b/drivers/input/misc/gpio_axis.c
@@ -0,0 +1,192 @@
+/* drivers/input/misc/gpio_axis.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/gpio_event.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+struct gpio_axis_state {
+ struct gpio_event_input_devs *input_devs;
+ struct gpio_event_axis_info *info;
+ uint32_t pos;
+};
+
+uint16_t gpio_axis_4bit_gray_map_table[] = {
+ [0x0] = 0x0, [0x1] = 0x1, /* 0000 0001 */
+ [0x3] = 0x2, [0x2] = 0x3, /* 0011 0010 */
+ [0x6] = 0x4, [0x7] = 0x5, /* 0110 0111 */
+ [0x5] = 0x6, [0x4] = 0x7, /* 0101 0100 */
+ [0xc] = 0x8, [0xd] = 0x9, /* 1100 1101 */
+ [0xf] = 0xa, [0xe] = 0xb, /* 1111 1110 */
+ [0xa] = 0xc, [0xb] = 0xd, /* 1010 1011 */
+ [0x9] = 0xe, [0x8] = 0xf, /* 1001 1000 */
+};
+uint16_t gpio_axis_4bit_gray_map(struct gpio_event_axis_info *info, uint16_t in)
+{
+ return gpio_axis_4bit_gray_map_table[in];
+}
+
+uint16_t gpio_axis_5bit_singletrack_map_table[] = {
+ [0x10] = 0x00, [0x14] = 0x01, [0x1c] = 0x02, /* 10000 10100 11100 */
+ [0x1e] = 0x03, [0x1a] = 0x04, [0x18] = 0x05, /* 11110 11010 11000 */
+ [0x08] = 0x06, [0x0a] = 0x07, [0x0e] = 0x08, /* 01000 01010 01110 */
+ [0x0f] = 0x09, [0x0d] = 0x0a, [0x0c] = 0x0b, /* 01111 01101 01100 */
+ [0x04] = 0x0c, [0x05] = 0x0d, [0x07] = 0x0e, /* 00100 00101 00111 */
+ [0x17] = 0x0f, [0x16] = 0x10, [0x06] = 0x11, /* 10111 10110 00110 */
+ [0x02] = 0x12, [0x12] = 0x13, [0x13] = 0x14, /* 00010 10010 10011 */
+ [0x1b] = 0x15, [0x0b] = 0x16, [0x03] = 0x17, /* 11011 01011 00011 */
+ [0x01] = 0x18, [0x09] = 0x19, [0x19] = 0x1a, /* 00001 01001 11001 */
+ [0x1d] = 0x1b, [0x15] = 0x1c, [0x11] = 0x1d, /* 11101 10101 10001 */
+};
+uint16_t gpio_axis_5bit_singletrack_map(
+ struct gpio_event_axis_info *info, uint16_t in)
+{
+ return gpio_axis_5bit_singletrack_map_table[in];
+}
+
+static void gpio_event_update_axis(struct gpio_axis_state *as, int report)
+{
+ struct gpio_event_axis_info *ai = as->info;
+ int i;
+ int change;
+ uint16_t state = 0;
+ uint16_t pos;
+ uint16_t old_pos = as->pos;
+ for (i = ai->count - 1; i >= 0; i--)
+ state = (state << 1) | gpio_get_value(ai->gpio[i]);
+ pos = ai->map(ai, state);
+ if (ai->flags & GPIOEAF_PRINT_RAW)
+ pr_info("axis %d-%d raw %x, pos %d -> %d\n",
+ ai->type, ai->code, state, old_pos, pos);
+ if (report && pos != old_pos) {
+ if (ai->type == EV_REL) {
+ change = (ai->decoded_size + pos - old_pos) %
+ ai->decoded_size;
+ if (change > ai->decoded_size / 2)
+ change -= ai->decoded_size;
+ if (change == ai->decoded_size / 2) {
+ if (ai->flags & GPIOEAF_PRINT_EVENT)
+ pr_info("axis %d-%d unknown direction, "
+ "pos %d -> %d\n", ai->type,
+ ai->code, old_pos, pos);
+ change = 0; /* no closest direction */
+ }
+ if (ai->flags & GPIOEAF_PRINT_EVENT)
+ pr_info("axis %d-%d change %d\n",
+ ai->type, ai->code, change);
+ input_report_rel(as->input_devs->dev[ai->dev],
+ ai->code, change);
+ } else {
+ if (ai->flags & GPIOEAF_PRINT_EVENT)
+ pr_info("axis %d-%d now %d\n",
+ ai->type, ai->code, pos);
+ input_event(as->input_devs->dev[ai->dev],
+ ai->type, ai->code, pos);
+ }
+ input_sync(as->input_devs->dev[ai->dev]);
+ }
+ as->pos = pos;
+}
+
+static irqreturn_t gpio_axis_irq_handler(int irq, void *dev_id)
+{
+ struct gpio_axis_state *as = dev_id;
+ gpio_event_update_axis(as, 1);
+ return IRQ_HANDLED;
+}
+
+int gpio_event_axis_func(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data, int func)
+{
+ int ret;
+ int i;
+ int irq;
+ struct gpio_event_axis_info *ai;
+ struct gpio_axis_state *as;
+
+ ai = container_of(info, struct gpio_event_axis_info, info);
+ if (func == GPIO_EVENT_FUNC_SUSPEND) {
+ for (i = 0; i < ai->count; i++)
+ disable_irq(gpio_to_irq(ai->gpio[i]));
+ return 0;
+ }
+ if (func == GPIO_EVENT_FUNC_RESUME) {
+ for (i = 0; i < ai->count; i++)
+ enable_irq(gpio_to_irq(ai->gpio[i]));
+ return 0;
+ }
+
+ if (func == GPIO_EVENT_FUNC_INIT) {
+ *data = as = kmalloc(sizeof(*as), GFP_KERNEL);
+ if (as == NULL) {
+ ret = -ENOMEM;
+ goto err_alloc_axis_state_failed;
+ }
+ as->input_devs = input_devs;
+ as->info = ai;
+ if (ai->dev >= input_devs->count) {
+ pr_err("gpio_event_axis: bad device index %d >= %d "
+ "for %d:%d\n", ai->dev, input_devs->count,
+ ai->type, ai->code);
+ ret = -EINVAL;
+ goto err_bad_device_index;
+ }
+
+ input_set_capability(input_devs->dev[ai->dev],
+ ai->type, ai->code);
+ if (ai->type == EV_ABS) {
+ input_set_abs_params(input_devs->dev[ai->dev], ai->code,
+ 0, ai->decoded_size - 1, 0, 0);
+ }
+ for (i = 0; i < ai->count; i++) {
+ ret = gpio_request(ai->gpio[i], "gpio_event_axis");
+ if (ret < 0)
+ goto err_request_gpio_failed;
+ ret = gpio_direction_input(ai->gpio[i]);
+ if (ret < 0)
+ goto err_gpio_direction_input_failed;
+ ret = irq = gpio_to_irq(ai->gpio[i]);
+ if (ret < 0)
+ goto err_get_irq_num_failed;
+ ret = request_irq(irq, gpio_axis_irq_handler,
+ IRQF_TRIGGER_RISING |
+ IRQF_TRIGGER_FALLING,
+ "gpio_event_axis", as);
+ if (ret < 0)
+ goto err_request_irq_failed;
+ }
+ gpio_event_update_axis(as, 0);
+ return 0;
+ }
+
+ ret = 0;
+ as = *data;
+ for (i = ai->count - 1; i >= 0; i--) {
+ free_irq(gpio_to_irq(ai->gpio[i]), as);
+err_request_irq_failed:
+err_get_irq_num_failed:
+err_gpio_direction_input_failed:
+ gpio_free(ai->gpio[i]);
+err_request_gpio_failed:
+ ;
+ }
+err_bad_device_index:
+ kfree(as);
+ *data = NULL;
+err_alloc_axis_state_failed:
+ return ret;
+}
diff --git a/drivers/input/misc/gpio_event.c b/drivers/input/misc/gpio_event.c
new file mode 100644
index 000000000000..90f07eba3ce9
--- /dev/null
+++ b/drivers/input/misc/gpio_event.c
@@ -0,0 +1,228 @@
+/* drivers/input/misc/gpio_event.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/input.h>
+#include <linux/gpio_event.h>
+#include <linux/hrtimer.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+struct gpio_event {
+ struct gpio_event_input_devs *input_devs;
+ const struct gpio_event_platform_data *info;
+ void *state[0];
+};
+
+static int gpio_input_event(
+ struct input_dev *dev, unsigned int type, unsigned int code, int value)
+{
+ int i;
+ int devnr;
+ int ret = 0;
+ int tmp_ret;
+ struct gpio_event_info **ii;
+ struct gpio_event *ip = input_get_drvdata(dev);
+
+ for (devnr = 0; devnr < ip->input_devs->count; devnr++)
+ if (ip->input_devs->dev[devnr] == dev)
+ break;
+ if (devnr == ip->input_devs->count) {
+ pr_err("gpio_input_event: unknown device %p\n", dev);
+ return -EIO;
+ }
+
+ for (i = 0, ii = ip->info->info; i < ip->info->info_count; i++, ii++) {
+ if ((*ii)->event) {
+ tmp_ret = (*ii)->event(ip->input_devs, *ii,
+ &ip->state[i],
+ devnr, type, code, value);
+ if (tmp_ret)
+ ret = tmp_ret;
+ }
+ }
+ return ret;
+}
+
+static int gpio_event_call_all_func(struct gpio_event *ip, int func)
+{
+ int i;
+ int ret;
+ struct gpio_event_info **ii;
+
+ if (func == GPIO_EVENT_FUNC_INIT || func == GPIO_EVENT_FUNC_RESUME) {
+ ii = ip->info->info;
+ for (i = 0; i < ip->info->info_count; i++, ii++) {
+ if ((*ii)->func == NULL) {
+ ret = -ENODEV;
+ pr_err("gpio_event_probe: Incomplete pdata, "
+ "no function\n");
+ goto err_no_func;
+ }
+ if (func == GPIO_EVENT_FUNC_RESUME && (*ii)->no_suspend)
+ continue;
+ ret = (*ii)->func(ip->input_devs, *ii, &ip->state[i],
+ func);
+ if (ret) {
+ pr_err("gpio_event_probe: function failed\n");
+ goto err_func_failed;
+ }
+ }
+ return 0;
+ }
+
+ ret = 0;
+ i = ip->info->info_count;
+ ii = ip->info->info + i;
+ while (i > 0) {
+ i--;
+ ii--;
+ if ((func & ~1) == GPIO_EVENT_FUNC_SUSPEND && (*ii)->no_suspend)
+ continue;
+ (*ii)->func(ip->input_devs, *ii, &ip->state[i], func & ~1);
+err_func_failed:
+err_no_func:
+ ;
+ }
+ return ret;
+}
+
+static void __maybe_unused gpio_event_suspend(struct gpio_event *ip)
+{
+ gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_SUSPEND);
+ if (ip->info->power)
+ ip->info->power(ip->info, 0);
+}
+
+static void __maybe_unused gpio_event_resume(struct gpio_event *ip)
+{
+ if (ip->info->power)
+ ip->info->power(ip->info, 1);
+ gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_RESUME);
+}
+
+static int gpio_event_probe(struct platform_device *pdev)
+{
+ int err;
+ struct gpio_event *ip;
+ struct gpio_event_platform_data *event_info;
+ int dev_count = 1;
+ int i;
+ int registered = 0;
+
+ event_info = pdev->dev.platform_data;
+ if (event_info == NULL) {
+ pr_err("gpio_event_probe: No pdata\n");
+ return -ENODEV;
+ }
+ if ((!event_info->name && !event_info->names[0]) ||
+ !event_info->info || !event_info->info_count) {
+ pr_err("gpio_event_probe: Incomplete pdata\n");
+ return -ENODEV;
+ }
+ if (!event_info->name)
+ while (event_info->names[dev_count])
+ dev_count++;
+ ip = kzalloc(sizeof(*ip) +
+ sizeof(ip->state[0]) * event_info->info_count +
+ sizeof(*ip->input_devs) +
+ sizeof(ip->input_devs->dev[0]) * dev_count, GFP_KERNEL);
+ if (ip == NULL) {
+ err = -ENOMEM;
+ pr_err("gpio_event_probe: Failed to allocate private data\n");
+ goto err_kp_alloc_failed;
+ }
+ ip->input_devs = (void*)&ip->state[event_info->info_count];
+ platform_set_drvdata(pdev, ip);
+
+ for (i = 0; i < dev_count; i++) {
+ struct input_dev *input_dev = input_allocate_device();
+ if (input_dev == NULL) {
+ err = -ENOMEM;
+ pr_err("gpio_event_probe: "
+ "Failed to allocate input device\n");
+ goto err_input_dev_alloc_failed;
+ }
+ input_set_drvdata(input_dev, ip);
+ input_dev->name = event_info->name ?
+ event_info->name : event_info->names[i];
+ input_dev->event = gpio_input_event;
+ ip->input_devs->dev[i] = input_dev;
+ }
+ ip->input_devs->count = dev_count;
+ ip->info = event_info;
+ if (event_info->power)
+ ip->info->power(ip->info, 1);
+
+ err = gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_INIT);
+ if (err)
+ goto err_call_all_func_failed;
+
+ for (i = 0; i < dev_count; i++) {
+ err = input_register_device(ip->input_devs->dev[i]);
+ if (err) {
+ pr_err("gpio_event_probe: Unable to register %s "
+ "input device\n", ip->input_devs->dev[i]->name);
+ goto err_input_register_device_failed;
+ }
+ registered++;
+ }
+
+ return 0;
+
+err_input_register_device_failed:
+ gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_UNINIT);
+err_call_all_func_failed:
+ if (event_info->power)
+ ip->info->power(ip->info, 0);
+ for (i = 0; i < registered; i++)
+ input_unregister_device(ip->input_devs->dev[i]);
+ for (i = dev_count - 1; i >= registered; i--) {
+ input_free_device(ip->input_devs->dev[i]);
+err_input_dev_alloc_failed:
+ ;
+ }
+ kfree(ip);
+err_kp_alloc_failed:
+ return err;
+}
+
+static int gpio_event_remove(struct platform_device *pdev)
+{
+ struct gpio_event *ip = platform_get_drvdata(pdev);
+ int i;
+
+ gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_UNINIT);
+ if (ip->info->power)
+ ip->info->power(ip->info, 0);
+ for (i = 0; i < ip->input_devs->count; i++)
+ input_unregister_device(ip->input_devs->dev[i]);
+ kfree(ip);
+ return 0;
+}
+
+static struct platform_driver gpio_event_driver = {
+ .probe = gpio_event_probe,
+ .remove = gpio_event_remove,
+ .driver = {
+ .name = GPIO_EVENT_DEV_NAME,
+ },
+};
+
+module_platform_driver(gpio_event_driver);
+
+MODULE_DESCRIPTION("GPIO Event Driver");
+MODULE_LICENSE("GPL");
+
diff --git a/drivers/input/misc/gpio_input.c b/drivers/input/misc/gpio_input.c
new file mode 100644
index 000000000000..eefd02725aff
--- /dev/null
+++ b/drivers/input/misc/gpio_input.c
@@ -0,0 +1,390 @@
+/* drivers/input/misc/gpio_input.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/gpio_event.h>
+#include <linux/hrtimer.h>
+#include <linux/input.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/pm_wakeup.h>
+
+enum {
+ DEBOUNCE_UNSTABLE = BIT(0), /* Got irq, while debouncing */
+ DEBOUNCE_PRESSED = BIT(1),
+ DEBOUNCE_NOTPRESSED = BIT(2),
+ DEBOUNCE_WAIT_IRQ = BIT(3), /* Stable irq state */
+ DEBOUNCE_POLL = BIT(4), /* Stable polling state */
+
+ DEBOUNCE_UNKNOWN =
+ DEBOUNCE_PRESSED | DEBOUNCE_NOTPRESSED,
+};
+
+struct gpio_key_state {
+ struct gpio_input_state *ds;
+ uint8_t debounce;
+};
+
+struct gpio_input_state {
+ struct gpio_event_input_devs *input_devs;
+ const struct gpio_event_input_info *info;
+ struct hrtimer timer;
+ int use_irq;
+ int debounce_count;
+ spinlock_t irq_lock;
+ struct wakeup_source *ws;
+ struct gpio_key_state key_state[0];
+};
+
+static enum hrtimer_restart gpio_event_input_timer_func(struct hrtimer *timer)
+{
+ int i;
+ int pressed;
+ struct gpio_input_state *ds =
+ container_of(timer, struct gpio_input_state, timer);
+ unsigned gpio_flags = ds->info->flags;
+ unsigned npolarity;
+ int nkeys = ds->info->keymap_size;
+ const struct gpio_event_direct_entry *key_entry;
+ struct gpio_key_state *key_state;
+ unsigned long irqflags;
+ uint8_t debounce;
+ bool sync_needed;
+
+#if 0
+ key_entry = kp->keys_info->keymap;
+ key_state = kp->key_state;
+ for (i = 0; i < nkeys; i++, key_entry++, key_state++)
+ pr_info("gpio_read_detect_status %d %d\n", key_entry->gpio,
+ gpio_read_detect_status(key_entry->gpio));
+#endif
+ key_entry = ds->info->keymap;
+ key_state = ds->key_state;
+ sync_needed = false;
+ spin_lock_irqsave(&ds->irq_lock, irqflags);
+ for (i = 0; i < nkeys; i++, key_entry++, key_state++) {
+ debounce = key_state->debounce;
+ if (debounce & DEBOUNCE_WAIT_IRQ)
+ continue;
+ if (key_state->debounce & DEBOUNCE_UNSTABLE) {
+ debounce = key_state->debounce = DEBOUNCE_UNKNOWN;
+ enable_irq(gpio_to_irq(key_entry->gpio));
+ if (gpio_flags & GPIOEDF_PRINT_KEY_UNSTABLE)
+ pr_info("gpio_keys_scan_keys: key %x-%x, %d "
+ "(%d) continue debounce\n",
+ ds->info->type, key_entry->code,
+ i, key_entry->gpio);
+ }
+ npolarity = !(gpio_flags & GPIOEDF_ACTIVE_HIGH);
+ pressed = gpio_get_value(key_entry->gpio) ^ npolarity;
+ if (debounce & DEBOUNCE_POLL) {
+ if (pressed == !(debounce & DEBOUNCE_PRESSED)) {
+ ds->debounce_count++;
+ key_state->debounce = DEBOUNCE_UNKNOWN;
+ if (gpio_flags & GPIOEDF_PRINT_KEY_DEBOUNCE)
+ pr_info("gpio_keys_scan_keys: key %x-"
+ "%x, %d (%d) start debounce\n",
+ ds->info->type, key_entry->code,
+ i, key_entry->gpio);
+ }
+ continue;
+ }
+ if (pressed && (debounce & DEBOUNCE_NOTPRESSED)) {
+ if (gpio_flags & GPIOEDF_PRINT_KEY_DEBOUNCE)
+ pr_info("gpio_keys_scan_keys: key %x-%x, %d "
+ "(%d) debounce pressed 1\n",
+ ds->info->type, key_entry->code,
+ i, key_entry->gpio);
+ key_state->debounce = DEBOUNCE_PRESSED;
+ continue;
+ }
+ if (!pressed && (debounce & DEBOUNCE_PRESSED)) {
+ if (gpio_flags & GPIOEDF_PRINT_KEY_DEBOUNCE)
+ pr_info("gpio_keys_scan_keys: key %x-%x, %d "
+ "(%d) debounce pressed 0\n",
+ ds->info->type, key_entry->code,
+ i, key_entry->gpio);
+ key_state->debounce = DEBOUNCE_NOTPRESSED;
+ continue;
+ }
+ /* key is stable */
+ ds->debounce_count--;
+ if (ds->use_irq)
+ key_state->debounce |= DEBOUNCE_WAIT_IRQ;
+ else
+ key_state->debounce |= DEBOUNCE_POLL;
+ if (gpio_flags & GPIOEDF_PRINT_KEYS)
+ pr_info("gpio_keys_scan_keys: key %x-%x, %d (%d) "
+ "changed to %d\n", ds->info->type,
+ key_entry->code, i, key_entry->gpio, pressed);
+ input_event(ds->input_devs->dev[key_entry->dev], ds->info->type,
+ key_entry->code, pressed);
+ sync_needed = true;
+ }
+ if (sync_needed) {
+ for (i = 0; i < ds->input_devs->count; i++)
+ input_sync(ds->input_devs->dev[i]);
+ }
+
+#if 0
+ key_entry = kp->keys_info->keymap;
+ key_state = kp->key_state;
+ for (i = 0; i < nkeys; i++, key_entry++, key_state++) {
+ pr_info("gpio_read_detect_status %d %d\n", key_entry->gpio,
+ gpio_read_detect_status(key_entry->gpio));
+ }
+#endif
+
+ if (ds->debounce_count)
+ hrtimer_start(timer, ds->info->debounce_time, HRTIMER_MODE_REL);
+ else if (!ds->use_irq)
+ hrtimer_start(timer, ds->info->poll_time, HRTIMER_MODE_REL);
+ else
+ __pm_relax(ds->ws);
+
+ spin_unlock_irqrestore(&ds->irq_lock, irqflags);
+
+ return HRTIMER_NORESTART;
+}
+
+static irqreturn_t gpio_event_input_irq_handler(int irq, void *dev_id)
+{
+ struct gpio_key_state *ks = dev_id;
+ struct gpio_input_state *ds = ks->ds;
+ int keymap_index = ks - ds->key_state;
+ const struct gpio_event_direct_entry *key_entry;
+ unsigned long irqflags;
+ int pressed;
+
+ if (!ds->use_irq)
+ return IRQ_HANDLED;
+
+ key_entry = &ds->info->keymap[keymap_index];
+
+ if (ds->info->debounce_time.tv64) {
+ spin_lock_irqsave(&ds->irq_lock, irqflags);
+ if (ks->debounce & DEBOUNCE_WAIT_IRQ) {
+ ks->debounce = DEBOUNCE_UNKNOWN;
+ if (ds->debounce_count++ == 0) {
+ __pm_stay_awake(ds->ws);
+ hrtimer_start(
+ &ds->timer, ds->info->debounce_time,
+ HRTIMER_MODE_REL);
+ }
+ if (ds->info->flags & GPIOEDF_PRINT_KEY_DEBOUNCE)
+ pr_info("gpio_event_input_irq_handler: "
+ "key %x-%x, %d (%d) start debounce\n",
+ ds->info->type, key_entry->code,
+ keymap_index, key_entry->gpio);
+ } else {
+ disable_irq_nosync(irq);
+ ks->debounce = DEBOUNCE_UNSTABLE;
+ }
+ spin_unlock_irqrestore(&ds->irq_lock, irqflags);
+ } else {
+ pressed = gpio_get_value(key_entry->gpio) ^
+ !(ds->info->flags & GPIOEDF_ACTIVE_HIGH);
+ if (ds->info->flags & GPIOEDF_PRINT_KEYS)
+ pr_info("gpio_event_input_irq_handler: key %x-%x, %d "
+ "(%d) changed to %d\n",
+ ds->info->type, key_entry->code, keymap_index,
+ key_entry->gpio, pressed);
+ input_event(ds->input_devs->dev[key_entry->dev], ds->info->type,
+ key_entry->code, pressed);
+ input_sync(ds->input_devs->dev[key_entry->dev]);
+ }
+ return IRQ_HANDLED;
+}
+
+static int gpio_event_input_request_irqs(struct gpio_input_state *ds)
+{
+ int i;
+ int err;
+ unsigned int irq;
+ unsigned long req_flags = IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING;
+
+ for (i = 0; i < ds->info->keymap_size; i++) {
+ err = irq = gpio_to_irq(ds->info->keymap[i].gpio);
+ if (err < 0)
+ goto err_gpio_get_irq_num_failed;
+ err = request_irq(irq, gpio_event_input_irq_handler,
+ req_flags, "gpio_keys", &ds->key_state[i]);
+ if (err) {
+ pr_err("gpio_event_input_request_irqs: request_irq "
+ "failed for input %d, irq %d\n",
+ ds->info->keymap[i].gpio, irq);
+ goto err_request_irq_failed;
+ }
+ if (ds->info->info.no_suspend) {
+ err = enable_irq_wake(irq);
+ if (err) {
+ pr_err("gpio_event_input_request_irqs: "
+ "enable_irq_wake failed for input %d, "
+ "irq %d\n",
+ ds->info->keymap[i].gpio, irq);
+ goto err_enable_irq_wake_failed;
+ }
+ }
+ }
+ return 0;
+
+ for (i = ds->info->keymap_size - 1; i >= 0; i--) {
+ irq = gpio_to_irq(ds->info->keymap[i].gpio);
+ if (ds->info->info.no_suspend)
+ disable_irq_wake(irq);
+err_enable_irq_wake_failed:
+ free_irq(irq, &ds->key_state[i]);
+err_request_irq_failed:
+err_gpio_get_irq_num_failed:
+ ;
+ }
+ return err;
+}
+
+int gpio_event_input_func(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data, int func)
+{
+ int ret;
+ int i;
+ unsigned long irqflags;
+ struct gpio_event_input_info *di;
+ struct gpio_input_state *ds = *data;
+ char *wlname;
+
+ di = container_of(info, struct gpio_event_input_info, info);
+
+ if (func == GPIO_EVENT_FUNC_SUSPEND) {
+ if (ds->use_irq)
+ for (i = 0; i < di->keymap_size; i++)
+ disable_irq(gpio_to_irq(di->keymap[i].gpio));
+ hrtimer_cancel(&ds->timer);
+ return 0;
+ }
+ if (func == GPIO_EVENT_FUNC_RESUME) {
+ spin_lock_irqsave(&ds->irq_lock, irqflags);
+ if (ds->use_irq)
+ for (i = 0; i < di->keymap_size; i++)
+ enable_irq(gpio_to_irq(di->keymap[i].gpio));
+ hrtimer_start(&ds->timer, ktime_set(0, 0), HRTIMER_MODE_REL);
+ spin_unlock_irqrestore(&ds->irq_lock, irqflags);
+ return 0;
+ }
+
+ if (func == GPIO_EVENT_FUNC_INIT) {
+ if (ktime_to_ns(di->poll_time) <= 0)
+ di->poll_time = ktime_set(0, 20 * NSEC_PER_MSEC);
+
+ *data = ds = kzalloc(sizeof(*ds) + sizeof(ds->key_state[0]) *
+ di->keymap_size, GFP_KERNEL);
+ if (ds == NULL) {
+ ret = -ENOMEM;
+ pr_err("gpio_event_input_func: "
+ "Failed to allocate private data\n");
+ goto err_ds_alloc_failed;
+ }
+ ds->debounce_count = di->keymap_size;
+ ds->input_devs = input_devs;
+ ds->info = di;
+ wlname = kasprintf(GFP_KERNEL, "gpio_input:%s%s",
+ input_devs->dev[0]->name,
+ (input_devs->count > 1) ? "..." : "");
+
+ ds->ws = wakeup_source_register(wlname);
+ kfree(wlname);
+ if (!ds->ws) {
+ ret = -ENOMEM;
+ pr_err("gpio_event_input_func: "
+ "Failed to allocate wakeup source\n");
+ goto err_ws_failed;
+ }
+
+ spin_lock_init(&ds->irq_lock);
+
+ for (i = 0; i < di->keymap_size; i++) {
+ int dev = di->keymap[i].dev;
+ if (dev >= input_devs->count) {
+ pr_err("gpio_event_input_func: bad device "
+ "index %d >= %d for key code %d\n",
+ dev, input_devs->count,
+ di->keymap[i].code);
+ ret = -EINVAL;
+ goto err_bad_keymap;
+ }
+ input_set_capability(input_devs->dev[dev], di->type,
+ di->keymap[i].code);
+ ds->key_state[i].ds = ds;
+ ds->key_state[i].debounce = DEBOUNCE_UNKNOWN;
+ }
+
+ for (i = 0; i < di->keymap_size; i++) {
+ ret = gpio_request(di->keymap[i].gpio, "gpio_kp_in");
+ if (ret) {
+ pr_err("gpio_event_input_func: gpio_request "
+ "failed for %d\n", di->keymap[i].gpio);
+ goto err_gpio_request_failed;
+ }
+ ret = gpio_direction_input(di->keymap[i].gpio);
+ if (ret) {
+ pr_err("gpio_event_input_func: "
+ "gpio_direction_input failed for %d\n",
+ di->keymap[i].gpio);
+ goto err_gpio_configure_failed;
+ }
+ }
+
+ ret = gpio_event_input_request_irqs(ds);
+
+ spin_lock_irqsave(&ds->irq_lock, irqflags);
+ ds->use_irq = ret == 0;
+
+ pr_info("GPIO Input Driver: Start gpio inputs for %s%s in %s "
+ "mode\n", input_devs->dev[0]->name,
+ (input_devs->count > 1) ? "..." : "",
+ ret == 0 ? "interrupt" : "polling");
+
+ hrtimer_init(&ds->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ ds->timer.function = gpio_event_input_timer_func;
+ hrtimer_start(&ds->timer, ktime_set(0, 0), HRTIMER_MODE_REL);
+ spin_unlock_irqrestore(&ds->irq_lock, irqflags);
+ return 0;
+ }
+
+ ret = 0;
+ spin_lock_irqsave(&ds->irq_lock, irqflags);
+ hrtimer_cancel(&ds->timer);
+ if (ds->use_irq) {
+ for (i = di->keymap_size - 1; i >= 0; i--) {
+ int irq = gpio_to_irq(di->keymap[i].gpio);
+ if (ds->info->info.no_suspend)
+ disable_irq_wake(irq);
+ free_irq(irq, &ds->key_state[i]);
+ }
+ }
+ spin_unlock_irqrestore(&ds->irq_lock, irqflags);
+
+ for (i = di->keymap_size - 1; i >= 0; i--) {
+err_gpio_configure_failed:
+ gpio_free(di->keymap[i].gpio);
+err_gpio_request_failed:
+ ;
+ }
+err_bad_keymap:
+ wakeup_source_unregister(ds->ws);
+err_ws_failed:
+ kfree(ds);
+err_ds_alloc_failed:
+ return ret;
+}
diff --git a/drivers/input/misc/gpio_matrix.c b/drivers/input/misc/gpio_matrix.c
new file mode 100644
index 000000000000..08769dd88f56
--- /dev/null
+++ b/drivers/input/misc/gpio_matrix.c
@@ -0,0 +1,440 @@
+/* drivers/input/misc/gpio_matrix.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/gpio_event.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+struct gpio_kp {
+ struct gpio_event_input_devs *input_devs;
+ struct gpio_event_matrix_info *keypad_info;
+ struct hrtimer timer;
+ struct wakeup_source wake_src;
+ int current_output;
+ unsigned int use_irq:1;
+ unsigned int key_state_changed:1;
+ unsigned int last_key_state_changed:1;
+ unsigned int some_keys_pressed:2;
+ unsigned int disabled_irq:1;
+ unsigned long keys_pressed[0];
+};
+
+static void clear_phantom_key(struct gpio_kp *kp, int out, int in)
+{
+ struct gpio_event_matrix_info *mi = kp->keypad_info;
+ int key_index = out * mi->ninputs + in;
+ unsigned short keyentry = mi->keymap[key_index];
+ unsigned short keycode = keyentry & MATRIX_KEY_MASK;
+ unsigned short dev = keyentry >> MATRIX_CODE_BITS;
+
+ if (!test_bit(keycode, kp->input_devs->dev[dev]->key)) {
+ if (mi->flags & GPIOKPF_PRINT_PHANTOM_KEYS)
+ pr_info("gpiomatrix: phantom key %x, %d-%d (%d-%d) "
+ "cleared\n", keycode, out, in,
+ mi->output_gpios[out], mi->input_gpios[in]);
+ __clear_bit(key_index, kp->keys_pressed);
+ } else {
+ if (mi->flags & GPIOKPF_PRINT_PHANTOM_KEYS)
+ pr_info("gpiomatrix: phantom key %x, %d-%d (%d-%d) "
+ "not cleared\n", keycode, out, in,
+ mi->output_gpios[out], mi->input_gpios[in]);
+ }
+}
+
+static int restore_keys_for_input(struct gpio_kp *kp, int out, int in)
+{
+ int rv = 0;
+ int key_index;
+
+ key_index = out * kp->keypad_info->ninputs + in;
+ while (out < kp->keypad_info->noutputs) {
+ if (test_bit(key_index, kp->keys_pressed)) {
+ rv = 1;
+ clear_phantom_key(kp, out, in);
+ }
+ key_index += kp->keypad_info->ninputs;
+ out++;
+ }
+ return rv;
+}
+
+static void remove_phantom_keys(struct gpio_kp *kp)
+{
+ int out, in, inp;
+ int key_index;
+
+ if (kp->some_keys_pressed < 3)
+ return;
+
+ for (out = 0; out < kp->keypad_info->noutputs; out++) {
+ inp = -1;
+ key_index = out * kp->keypad_info->ninputs;
+ for (in = 0; in < kp->keypad_info->ninputs; in++, key_index++) {
+ if (test_bit(key_index, kp->keys_pressed)) {
+ if (inp == -1) {
+ inp = in;
+ continue;
+ }
+ if (inp >= 0) {
+ if (!restore_keys_for_input(kp, out + 1,
+ inp))
+ break;
+ clear_phantom_key(kp, out, inp);
+ inp = -2;
+ }
+ restore_keys_for_input(kp, out, in);
+ }
+ }
+ }
+}
+
+static void report_key(struct gpio_kp *kp, int key_index, int out, int in)
+{
+ struct gpio_event_matrix_info *mi = kp->keypad_info;
+ int pressed = test_bit(key_index, kp->keys_pressed);
+ unsigned short keyentry = mi->keymap[key_index];
+ unsigned short keycode = keyentry & MATRIX_KEY_MASK;
+ unsigned short dev = keyentry >> MATRIX_CODE_BITS;
+
+ if (pressed != test_bit(keycode, kp->input_devs->dev[dev]->key)) {
+ if (keycode == KEY_RESERVED) {
+ if (mi->flags & GPIOKPF_PRINT_UNMAPPED_KEYS)
+ pr_info("gpiomatrix: unmapped key, %d-%d "
+ "(%d-%d) changed to %d\n",
+ out, in, mi->output_gpios[out],
+ mi->input_gpios[in], pressed);
+ } else {
+ if (mi->flags & GPIOKPF_PRINT_MAPPED_KEYS)
+ pr_info("gpiomatrix: key %x, %d-%d (%d-%d) "
+ "changed to %d\n", keycode,
+ out, in, mi->output_gpios[out],
+ mi->input_gpios[in], pressed);
+ input_report_key(kp->input_devs->dev[dev], keycode, pressed);
+ }
+ }
+}
+
+static void report_sync(struct gpio_kp *kp)
+{
+ int i;
+
+ for (i = 0; i < kp->input_devs->count; i++)
+ input_sync(kp->input_devs->dev[i]);
+}
+
+static enum hrtimer_restart gpio_keypad_timer_func(struct hrtimer *timer)
+{
+ int out, in;
+ int key_index;
+ int gpio;
+ struct gpio_kp *kp = container_of(timer, struct gpio_kp, timer);
+ struct gpio_event_matrix_info *mi = kp->keypad_info;
+ unsigned gpio_keypad_flags = mi->flags;
+ unsigned polarity = !!(gpio_keypad_flags & GPIOKPF_ACTIVE_HIGH);
+
+ out = kp->current_output;
+ if (out == mi->noutputs) {
+ out = 0;
+ kp->last_key_state_changed = kp->key_state_changed;
+ kp->key_state_changed = 0;
+ kp->some_keys_pressed = 0;
+ } else {
+ key_index = out * mi->ninputs;
+ for (in = 0; in < mi->ninputs; in++, key_index++) {
+ gpio = mi->input_gpios[in];
+ if (gpio_get_value(gpio) ^ !polarity) {
+ if (kp->some_keys_pressed < 3)
+ kp->some_keys_pressed++;
+ kp->key_state_changed |= !__test_and_set_bit(
+ key_index, kp->keys_pressed);
+ } else
+ kp->key_state_changed |= __test_and_clear_bit(
+ key_index, kp->keys_pressed);
+ }
+ gpio = mi->output_gpios[out];
+ if (gpio_keypad_flags & GPIOKPF_DRIVE_INACTIVE)
+ gpio_set_value(gpio, !polarity);
+ else
+ gpio_direction_input(gpio);
+ out++;
+ }
+ kp->current_output = out;
+ if (out < mi->noutputs) {
+ gpio = mi->output_gpios[out];
+ if (gpio_keypad_flags & GPIOKPF_DRIVE_INACTIVE)
+ gpio_set_value(gpio, polarity);
+ else
+ gpio_direction_output(gpio, polarity);
+ hrtimer_start(timer, mi->settle_time, HRTIMER_MODE_REL);
+ return HRTIMER_NORESTART;
+ }
+ if (gpio_keypad_flags & GPIOKPF_DEBOUNCE) {
+ if (kp->key_state_changed) {
+ hrtimer_start(&kp->timer, mi->debounce_delay,
+ HRTIMER_MODE_REL);
+ return HRTIMER_NORESTART;
+ }
+ kp->key_state_changed = kp->last_key_state_changed;
+ }
+ if (kp->key_state_changed) {
+ if (gpio_keypad_flags & GPIOKPF_REMOVE_SOME_PHANTOM_KEYS)
+ remove_phantom_keys(kp);
+ key_index = 0;
+ for (out = 0; out < mi->noutputs; out++)
+ for (in = 0; in < mi->ninputs; in++, key_index++)
+ report_key(kp, key_index, out, in);
+ report_sync(kp);
+ }
+ if (!kp->use_irq || kp->some_keys_pressed) {
+ hrtimer_start(timer, mi->poll_time, HRTIMER_MODE_REL);
+ return HRTIMER_NORESTART;
+ }
+
+ /* No keys are pressed, reenable interrupt */
+ for (out = 0; out < mi->noutputs; out++) {
+ if (gpio_keypad_flags & GPIOKPF_DRIVE_INACTIVE)
+ gpio_set_value(mi->output_gpios[out], polarity);
+ else
+ gpio_direction_output(mi->output_gpios[out], polarity);
+ }
+ for (in = 0; in < mi->ninputs; in++)
+ enable_irq(gpio_to_irq(mi->input_gpios[in]));
+ __pm_relax(&kp->wake_src);
+ return HRTIMER_NORESTART;
+}
+
+static irqreturn_t gpio_keypad_irq_handler(int irq_in, void *dev_id)
+{
+ int i;
+ struct gpio_kp *kp = dev_id;
+ struct gpio_event_matrix_info *mi = kp->keypad_info;
+ unsigned gpio_keypad_flags = mi->flags;
+
+ if (!kp->use_irq) {
+ /* ignore interrupt while registering the handler */
+ kp->disabled_irq = 1;
+ disable_irq_nosync(irq_in);
+ return IRQ_HANDLED;
+ }
+
+ for (i = 0; i < mi->ninputs; i++)
+ disable_irq_nosync(gpio_to_irq(mi->input_gpios[i]));
+ for (i = 0; i < mi->noutputs; i++) {
+ if (gpio_keypad_flags & GPIOKPF_DRIVE_INACTIVE)
+ gpio_set_value(mi->output_gpios[i],
+ !(gpio_keypad_flags & GPIOKPF_ACTIVE_HIGH));
+ else
+ gpio_direction_input(mi->output_gpios[i]);
+ }
+ __pm_stay_awake(&kp->wake_src);
+ hrtimer_start(&kp->timer, ktime_set(0, 0), HRTIMER_MODE_REL);
+ return IRQ_HANDLED;
+}
+
+static int gpio_keypad_request_irqs(struct gpio_kp *kp)
+{
+ int i;
+ int err;
+ unsigned int irq;
+ unsigned long request_flags;
+ struct gpio_event_matrix_info *mi = kp->keypad_info;
+
+ switch (mi->flags & (GPIOKPF_ACTIVE_HIGH|GPIOKPF_LEVEL_TRIGGERED_IRQ)) {
+ default:
+ request_flags = IRQF_TRIGGER_FALLING;
+ break;
+ case GPIOKPF_ACTIVE_HIGH:
+ request_flags = IRQF_TRIGGER_RISING;
+ break;
+ case GPIOKPF_LEVEL_TRIGGERED_IRQ:
+ request_flags = IRQF_TRIGGER_LOW;
+ break;
+ case GPIOKPF_LEVEL_TRIGGERED_IRQ | GPIOKPF_ACTIVE_HIGH:
+ request_flags = IRQF_TRIGGER_HIGH;
+ break;
+ }
+
+ for (i = 0; i < mi->ninputs; i++) {
+ err = irq = gpio_to_irq(mi->input_gpios[i]);
+ if (err < 0)
+ goto err_gpio_get_irq_num_failed;
+ err = request_irq(irq, gpio_keypad_irq_handler, request_flags,
+ "gpio_kp", kp);
+ if (err) {
+ pr_err("gpiomatrix: request_irq failed for input %d, "
+ "irq %d\n", mi->input_gpios[i], irq);
+ goto err_request_irq_failed;
+ }
+ err = enable_irq_wake(irq);
+ if (err) {
+ pr_err("gpiomatrix: set_irq_wake failed for input %d, "
+ "irq %d\n", mi->input_gpios[i], irq);
+ }
+ disable_irq(irq);
+ if (kp->disabled_irq) {
+ kp->disabled_irq = 0;
+ enable_irq(irq);
+ }
+ }
+ return 0;
+
+ for (i = mi->noutputs - 1; i >= 0; i--) {
+ free_irq(gpio_to_irq(mi->input_gpios[i]), kp);
+err_request_irq_failed:
+err_gpio_get_irq_num_failed:
+ ;
+ }
+ return err;
+}
+
+int gpio_event_matrix_func(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data, int func)
+{
+ int i;
+ int err;
+ int key_count;
+ struct gpio_kp *kp;
+ struct gpio_event_matrix_info *mi;
+
+ mi = container_of(info, struct gpio_event_matrix_info, info);
+ if (func == GPIO_EVENT_FUNC_SUSPEND || func == GPIO_EVENT_FUNC_RESUME) {
+ /* TODO: disable scanning */
+ return 0;
+ }
+
+ if (func == GPIO_EVENT_FUNC_INIT) {
+ if (mi->keymap == NULL ||
+ mi->input_gpios == NULL ||
+ mi->output_gpios == NULL) {
+ err = -ENODEV;
+ pr_err("gpiomatrix: Incomplete pdata\n");
+ goto err_invalid_platform_data;
+ }
+ key_count = mi->ninputs * mi->noutputs;
+
+ *data = kp = kzalloc(sizeof(*kp) + sizeof(kp->keys_pressed[0]) *
+ BITS_TO_LONGS(key_count), GFP_KERNEL);
+ if (kp == NULL) {
+ err = -ENOMEM;
+ pr_err("gpiomatrix: Failed to allocate private data\n");
+ goto err_kp_alloc_failed;
+ }
+ kp->input_devs = input_devs;
+ kp->keypad_info = mi;
+ for (i = 0; i < key_count; i++) {
+ unsigned short keyentry = mi->keymap[i];
+ unsigned short keycode = keyentry & MATRIX_KEY_MASK;
+ unsigned short dev = keyentry >> MATRIX_CODE_BITS;
+ if (dev >= input_devs->count) {
+ pr_err("gpiomatrix: bad device index %d >= "
+ "%d for key code %d\n",
+ dev, input_devs->count, keycode);
+ err = -EINVAL;
+ goto err_bad_keymap;
+ }
+ if (keycode && keycode <= KEY_MAX)
+ input_set_capability(input_devs->dev[dev],
+ EV_KEY, keycode);
+ }
+
+ for (i = 0; i < mi->noutputs; i++) {
+ err = gpio_request(mi->output_gpios[i], "gpio_kp_out");
+ if (err) {
+ pr_err("gpiomatrix: gpio_request failed for "
+ "output %d\n", mi->output_gpios[i]);
+ goto err_request_output_gpio_failed;
+ }
+ if (gpio_cansleep(mi->output_gpios[i])) {
+ pr_err("gpiomatrix: unsupported output gpio %d,"
+ " can sleep\n", mi->output_gpios[i]);
+ err = -EINVAL;
+ goto err_output_gpio_configure_failed;
+ }
+ if (mi->flags & GPIOKPF_DRIVE_INACTIVE)
+ err = gpio_direction_output(mi->output_gpios[i],
+ !(mi->flags & GPIOKPF_ACTIVE_HIGH));
+ else
+ err = gpio_direction_input(mi->output_gpios[i]);
+ if (err) {
+ pr_err("gpiomatrix: gpio_configure failed for "
+ "output %d\n", mi->output_gpios[i]);
+ goto err_output_gpio_configure_failed;
+ }
+ }
+ for (i = 0; i < mi->ninputs; i++) {
+ err = gpio_request(mi->input_gpios[i], "gpio_kp_in");
+ if (err) {
+ pr_err("gpiomatrix: gpio_request failed for "
+ "input %d\n", mi->input_gpios[i]);
+ goto err_request_input_gpio_failed;
+ }
+ err = gpio_direction_input(mi->input_gpios[i]);
+ if (err) {
+ pr_err("gpiomatrix: gpio_direction_input failed"
+ " for input %d\n", mi->input_gpios[i]);
+ goto err_gpio_direction_input_failed;
+ }
+ }
+ kp->current_output = mi->noutputs;
+ kp->key_state_changed = 1;
+
+ hrtimer_init(&kp->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ kp->timer.function = gpio_keypad_timer_func;
+ wakeup_source_init(&kp->wake_src, "gpio_kp");
+ err = gpio_keypad_request_irqs(kp);
+ kp->use_irq = err == 0;
+
+ pr_info("GPIO Matrix Keypad Driver: Start keypad matrix for "
+ "%s%s in %s mode\n", input_devs->dev[0]->name,
+ (input_devs->count > 1) ? "..." : "",
+ kp->use_irq ? "interrupt" : "polling");
+
+ if (kp->use_irq)
+ __pm_stay_awake(&kp->wake_src);
+ hrtimer_start(&kp->timer, ktime_set(0, 0), HRTIMER_MODE_REL);
+
+ return 0;
+ }
+
+ err = 0;
+ kp = *data;
+
+ if (kp->use_irq)
+ for (i = mi->noutputs - 1; i >= 0; i--)
+ free_irq(gpio_to_irq(mi->input_gpios[i]), kp);
+
+ hrtimer_cancel(&kp->timer);
+ wakeup_source_trash(&kp->wake_src);
+ for (i = mi->noutputs - 1; i >= 0; i--) {
+err_gpio_direction_input_failed:
+ gpio_free(mi->input_gpios[i]);
+err_request_input_gpio_failed:
+ ;
+ }
+ for (i = mi->noutputs - 1; i >= 0; i--) {
+err_output_gpio_configure_failed:
+ gpio_free(mi->output_gpios[i]);
+err_request_output_gpio_failed:
+ ;
+ }
+err_bad_keymap:
+ kfree(kp);
+err_kp_alloc_failed:
+err_invalid_platform_data:
+ return err;
+}
diff --git a/drivers/input/misc/gpio_output.c b/drivers/input/misc/gpio_output.c
new file mode 100644
index 000000000000..2aac2fad0a17
--- /dev/null
+++ b/drivers/input/misc/gpio_output.c
@@ -0,0 +1,97 @@
+/* drivers/input/misc/gpio_output.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/gpio_event.h>
+
+int gpio_event_output_event(
+ struct gpio_event_input_devs *input_devs, struct gpio_event_info *info,
+ void **data, unsigned int dev, unsigned int type,
+ unsigned int code, int value)
+{
+ int i;
+ struct gpio_event_output_info *oi;
+ oi = container_of(info, struct gpio_event_output_info, info);
+ if (type != oi->type)
+ return 0;
+ if (!(oi->flags & GPIOEDF_ACTIVE_HIGH))
+ value = !value;
+ for (i = 0; i < oi->keymap_size; i++)
+ if (dev == oi->keymap[i].dev && code == oi->keymap[i].code)
+ gpio_set_value(oi->keymap[i].gpio, value);
+ return 0;
+}
+
+int gpio_event_output_func(
+ struct gpio_event_input_devs *input_devs, struct gpio_event_info *info,
+ void **data, int func)
+{
+ int ret;
+ int i;
+ struct gpio_event_output_info *oi;
+ oi = container_of(info, struct gpio_event_output_info, info);
+
+ if (func == GPIO_EVENT_FUNC_SUSPEND || func == GPIO_EVENT_FUNC_RESUME)
+ return 0;
+
+ if (func == GPIO_EVENT_FUNC_INIT) {
+ int output_level = !(oi->flags & GPIOEDF_ACTIVE_HIGH);
+
+ for (i = 0; i < oi->keymap_size; i++) {
+ int dev = oi->keymap[i].dev;
+ if (dev >= input_devs->count) {
+ pr_err("gpio_event_output_func: bad device "
+ "index %d >= %d for key code %d\n",
+ dev, input_devs->count,
+ oi->keymap[i].code);
+ ret = -EINVAL;
+ goto err_bad_keymap;
+ }
+ input_set_capability(input_devs->dev[dev], oi->type,
+ oi->keymap[i].code);
+ }
+
+ for (i = 0; i < oi->keymap_size; i++) {
+ ret = gpio_request(oi->keymap[i].gpio,
+ "gpio_event_output");
+ if (ret) {
+ pr_err("gpio_event_output_func: gpio_request "
+ "failed for %d\n", oi->keymap[i].gpio);
+ goto err_gpio_request_failed;
+ }
+ ret = gpio_direction_output(oi->keymap[i].gpio,
+ output_level);
+ if (ret) {
+ pr_err("gpio_event_output_func: "
+ "gpio_direction_output failed for %d\n",
+ oi->keymap[i].gpio);
+ goto err_gpio_direction_output_failed;
+ }
+ }
+ return 0;
+ }
+
+ ret = 0;
+ for (i = oi->keymap_size - 1; i >= 0; i--) {
+err_gpio_direction_output_failed:
+ gpio_free(oi->keymap[i].gpio);
+err_gpio_request_failed:
+ ;
+ }
+err_bad_keymap:
+ return ret;
+}
+
diff --git a/drivers/input/misc/keychord.c b/drivers/input/misc/keychord.c
new file mode 100644
index 000000000000..82fefdff366a
--- /dev/null
+++ b/drivers/input/misc/keychord.c
@@ -0,0 +1,467 @@
+/*
+ * drivers/input/misc/keychord.c
+ *
+ * Copyright (C) 2008 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+*/
+
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/keychord.h>
+#include <linux/sched.h>
+
+#define KEYCHORD_NAME "keychord"
+#define BUFFER_SIZE 16
+
+MODULE_AUTHOR("Mike Lockwood <lockwood@android.com>");
+MODULE_DESCRIPTION("Key chord input driver");
+MODULE_SUPPORTED_DEVICE("keychord");
+MODULE_LICENSE("GPL");
+
+#define NEXT_KEYCHORD(kc) ((struct input_keychord *) \
+ ((char *)kc + sizeof(struct input_keychord) + \
+ kc->count * sizeof(kc->keycodes[0])))
+
+struct keychord_device {
+ struct input_handler input_handler;
+ int registered;
+
+ /* list of keychords to monitor */
+ struct input_keychord *keychords;
+ int keychord_count;
+
+ /* bitmask of keys contained in our keychords */
+ unsigned long keybit[BITS_TO_LONGS(KEY_CNT)];
+ /* current state of the keys */
+ unsigned long keystate[BITS_TO_LONGS(KEY_CNT)];
+ /* number of keys that are currently pressed */
+ int key_down;
+
+ /* second input_device_id is needed for null termination */
+ struct input_device_id device_ids[2];
+
+ spinlock_t lock;
+ wait_queue_head_t waitq;
+ unsigned char head;
+ unsigned char tail;
+ __u16 buff[BUFFER_SIZE];
+ /* Bit to serialize writes to this device */
+#define KEYCHORD_BUSY 0x01
+ unsigned long flags;
+ wait_queue_head_t write_waitq;
+};
+
+static int check_keychord(struct keychord_device *kdev,
+ struct input_keychord *keychord)
+{
+ int i;
+
+ if (keychord->count != kdev->key_down)
+ return 0;
+
+ for (i = 0; i < keychord->count; i++) {
+ if (!test_bit(keychord->keycodes[i], kdev->keystate))
+ return 0;
+ }
+
+ /* we have a match */
+ return 1;
+}
+
+static void keychord_event(struct input_handle *handle, unsigned int type,
+ unsigned int code, int value)
+{
+ struct keychord_device *kdev = handle->private;
+ struct input_keychord *keychord;
+ unsigned long flags;
+ int i, got_chord = 0;
+
+ if (type != EV_KEY || code >= KEY_MAX)
+ return;
+
+ spin_lock_irqsave(&kdev->lock, flags);
+ /* do nothing if key state did not change */
+ if (!test_bit(code, kdev->keystate) == !value)
+ goto done;
+ __change_bit(code, kdev->keystate);
+ if (value)
+ kdev->key_down++;
+ else
+ kdev->key_down--;
+
+ /* don't notify on key up */
+ if (!value)
+ goto done;
+ /* ignore this event if it is not one of the keys we are monitoring */
+ if (!test_bit(code, kdev->keybit))
+ goto done;
+
+ keychord = kdev->keychords;
+ if (!keychord)
+ goto done;
+
+ /* check to see if the keyboard state matches any keychords */
+ for (i = 0; i < kdev->keychord_count; i++) {
+ if (check_keychord(kdev, keychord)) {
+ kdev->buff[kdev->head] = keychord->id;
+ kdev->head = (kdev->head + 1) % BUFFER_SIZE;
+ got_chord = 1;
+ break;
+ }
+ /* skip to next keychord */
+ keychord = NEXT_KEYCHORD(keychord);
+ }
+
+done:
+ spin_unlock_irqrestore(&kdev->lock, flags);
+
+ if (got_chord) {
+ pr_info("keychord: got keychord id %d. Any tasks: %d\n",
+ keychord->id,
+ !list_empty_careful(&kdev->waitq.task_list));
+ wake_up_interruptible(&kdev->waitq);
+ }
+}
+
+static int keychord_connect(struct input_handler *handler,
+ struct input_dev *dev,
+ const struct input_device_id *id)
+{
+ int i, ret;
+ struct input_handle *handle;
+ struct keychord_device *kdev =
+ container_of(handler, struct keychord_device, input_handler);
+
+ /*
+ * ignore this input device if it does not contain any keycodes
+ * that we are monitoring
+ */
+ for (i = 0; i < KEY_MAX; i++) {
+ if (test_bit(i, kdev->keybit) && test_bit(i, dev->keybit))
+ break;
+ }
+ if (i == KEY_MAX)
+ return -ENODEV;
+
+ handle = kzalloc(sizeof(*handle), GFP_KERNEL);
+ if (!handle)
+ return -ENOMEM;
+
+ handle->dev = dev;
+ handle->handler = handler;
+ handle->name = KEYCHORD_NAME;
+ handle->private = kdev;
+
+ ret = input_register_handle(handle);
+ if (ret)
+ goto err_input_register_handle;
+
+ ret = input_open_device(handle);
+ if (ret)
+ goto err_input_open_device;
+
+ pr_info("keychord: using input dev %s for fevent\n", dev->name);
+ return 0;
+
+err_input_open_device:
+ input_unregister_handle(handle);
+err_input_register_handle:
+ kfree(handle);
+ return ret;
+}
+
+static void keychord_disconnect(struct input_handle *handle)
+{
+ input_close_device(handle);
+ input_unregister_handle(handle);
+ kfree(handle);
+}
+
+/*
+ * keychord_read is used to read keychord events from the driver
+ */
+static ssize_t keychord_read(struct file *file, char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ struct keychord_device *kdev = file->private_data;
+ __u16 id;
+ int retval;
+ unsigned long flags;
+
+ if (count < sizeof(id))
+ return -EINVAL;
+ count = sizeof(id);
+
+ if (kdev->head == kdev->tail && (file->f_flags & O_NONBLOCK))
+ return -EAGAIN;
+
+ retval = wait_event_interruptible(kdev->waitq,
+ kdev->head != kdev->tail);
+ if (retval)
+ return retval;
+
+ spin_lock_irqsave(&kdev->lock, flags);
+ /* pop a keychord ID off the queue */
+ id = kdev->buff[kdev->tail];
+ kdev->tail = (kdev->tail + 1) % BUFFER_SIZE;
+ spin_unlock_irqrestore(&kdev->lock, flags);
+
+ if (copy_to_user(buffer, &id, count))
+ return -EFAULT;
+
+ return count;
+}
+
+/*
+ * serializes writes on a device. can use mutex_lock_interruptible()
+ * for this particular use case as well - a matter of preference.
+ */
+static int
+keychord_write_lock(struct keychord_device *kdev)
+{
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&kdev->lock, flags);
+ while (kdev->flags & KEYCHORD_BUSY) {
+ spin_unlock_irqrestore(&kdev->lock, flags);
+ ret = wait_event_interruptible(kdev->write_waitq,
+ ((kdev->flags & KEYCHORD_BUSY) == 0));
+ if (ret)
+ return ret;
+ spin_lock_irqsave(&kdev->lock, flags);
+ }
+ kdev->flags |= KEYCHORD_BUSY;
+ spin_unlock_irqrestore(&kdev->lock, flags);
+ return 0;
+}
+
+static void
+keychord_write_unlock(struct keychord_device *kdev)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&kdev->lock, flags);
+ kdev->flags &= ~KEYCHORD_BUSY;
+ spin_unlock_irqrestore(&kdev->lock, flags);
+ wake_up_interruptible(&kdev->write_waitq);
+}
+
+/*
+ * keychord_write is used to configure the driver
+ */
+static ssize_t keychord_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ struct keychord_device *kdev = file->private_data;
+ struct input_keychord *keychords = 0;
+ struct input_keychord *keychord;
+ int ret, i, key;
+ unsigned long flags;
+ size_t resid = count;
+ size_t key_bytes;
+
+ if (count < sizeof(struct input_keychord) || count > PAGE_SIZE)
+ return -EINVAL;
+ keychords = kzalloc(count, GFP_KERNEL);
+ if (!keychords)
+ return -ENOMEM;
+
+ /* read list of keychords from userspace */
+ if (copy_from_user(keychords, buffer, count)) {
+ kfree(keychords);
+ return -EFAULT;
+ }
+
+ /*
+ * Serialize writes to this device to prevent various races.
+ * 1) writers racing here could do duplicate input_unregister_handler()
+ * calls, resulting in attempting to unlink a node from a list that
+ * does not exist.
+ * 2) writers racing here could do duplicate input_register_handler() calls
+ * below, resulting in a duplicate insertion of a node into the list.
+ * 3) a double kfree of keychords can occur (in the event that
+ * input_register_handler() fails below.
+ */
+ ret = keychord_write_lock(kdev);
+ if (ret) {
+ kfree(keychords);
+ return ret;
+ }
+
+ /* unregister handler before changing configuration */
+ if (kdev->registered) {
+ input_unregister_handler(&kdev->input_handler);
+ kdev->registered = 0;
+ }
+
+ spin_lock_irqsave(&kdev->lock, flags);
+ /* clear any existing configuration */
+ kfree(kdev->keychords);
+ kdev->keychords = 0;
+ kdev->keychord_count = 0;
+ kdev->key_down = 0;
+ memset(kdev->keybit, 0, sizeof(kdev->keybit));
+ memset(kdev->keystate, 0, sizeof(kdev->keystate));
+ kdev->head = kdev->tail = 0;
+
+ keychord = keychords;
+
+ while (resid > 0) {
+ /* Is the entire keychord entry header present ? */
+ if (resid < sizeof(struct input_keychord)) {
+ pr_err("keychord: Insufficient bytes present for header %zu\n",
+ resid);
+ goto err_unlock_return;
+ }
+ resid -= sizeof(struct input_keychord);
+ if (keychord->count <= 0) {
+ pr_err("keychord: invalid keycode count %d\n",
+ keychord->count);
+ goto err_unlock_return;
+ }
+ key_bytes = keychord->count * sizeof(keychord->keycodes[0]);
+ /* Do we have all the expected keycodes ? */
+ if (resid < key_bytes) {
+ pr_err("keychord: Insufficient bytes present for keycount %zu\n",
+ resid);
+ goto err_unlock_return;
+ }
+ resid -= key_bytes;
+
+ if (keychord->version != KEYCHORD_VERSION) {
+ pr_err("keychord: unsupported version %d\n",
+ keychord->version);
+ goto err_unlock_return;
+ }
+
+ /* keep track of the keys we are monitoring in keybit */
+ for (i = 0; i < keychord->count; i++) {
+ key = keychord->keycodes[i];
+ if (key < 0 || key >= KEY_CNT) {
+ pr_err("keychord: keycode %d out of range\n",
+ key);
+ goto err_unlock_return;
+ }
+ __set_bit(key, kdev->keybit);
+ }
+
+ kdev->keychord_count++;
+ keychord = NEXT_KEYCHORD(keychord);
+ }
+
+ kdev->keychords = keychords;
+ spin_unlock_irqrestore(&kdev->lock, flags);
+
+ ret = input_register_handler(&kdev->input_handler);
+ if (ret) {
+ kfree(keychords);
+ kdev->keychords = 0;
+ keychord_write_unlock(kdev);
+ return ret;
+ }
+ kdev->registered = 1;
+
+ keychord_write_unlock(kdev);
+
+ return count;
+
+err_unlock_return:
+ spin_unlock_irqrestore(&kdev->lock, flags);
+ kfree(keychords);
+ keychord_write_unlock(kdev);
+ return -EINVAL;
+}
+
+static unsigned int keychord_poll(struct file *file, poll_table *wait)
+{
+ struct keychord_device *kdev = file->private_data;
+
+ poll_wait(file, &kdev->waitq, wait);
+
+ if (kdev->head != kdev->tail)
+ return POLLIN | POLLRDNORM;
+
+ return 0;
+}
+
+static int keychord_open(struct inode *inode, struct file *file)
+{
+ struct keychord_device *kdev;
+
+ kdev = kzalloc(sizeof(struct keychord_device), GFP_KERNEL);
+ if (!kdev)
+ return -ENOMEM;
+
+ spin_lock_init(&kdev->lock);
+ init_waitqueue_head(&kdev->waitq);
+ init_waitqueue_head(&kdev->write_waitq);
+
+ kdev->input_handler.event = keychord_event;
+ kdev->input_handler.connect = keychord_connect;
+ kdev->input_handler.disconnect = keychord_disconnect;
+ kdev->input_handler.name = KEYCHORD_NAME;
+ kdev->input_handler.id_table = kdev->device_ids;
+
+ kdev->device_ids[0].flags = INPUT_DEVICE_ID_MATCH_EVBIT;
+ __set_bit(EV_KEY, kdev->device_ids[0].evbit);
+
+ file->private_data = kdev;
+
+ return 0;
+}
+
+static int keychord_release(struct inode *inode, struct file *file)
+{
+ struct keychord_device *kdev = file->private_data;
+
+ if (kdev->registered)
+ input_unregister_handler(&kdev->input_handler);
+ kfree(kdev->keychords);
+ kfree(kdev);
+
+ return 0;
+}
+
+static const struct file_operations keychord_fops = {
+ .owner = THIS_MODULE,
+ .open = keychord_open,
+ .release = keychord_release,
+ .read = keychord_read,
+ .write = keychord_write,
+ .poll = keychord_poll,
+};
+
+static struct miscdevice keychord_misc = {
+ .fops = &keychord_fops,
+ .name = KEYCHORD_NAME,
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
+static int __init keychord_init(void)
+{
+ return misc_register(&keychord_misc);
+}
+
+static void __exit keychord_exit(void)
+{
+ misc_deregister(&keychord_misc);
+}
+
+module_init(keychord_init);
+module_exit(keychord_exit);
diff --git a/drivers/isdn/hardware/mISDN/avmfritz.c b/drivers/isdn/hardware/mISDN/avmfritz.c
index e3fa1cd64470..a57b04f749fe 100644
--- a/drivers/isdn/hardware/mISDN/avmfritz.c
+++ b/drivers/isdn/hardware/mISDN/avmfritz.c
@@ -156,7 +156,7 @@ _set_debug(struct fritzcard *card)
}
static int
-set_debug(const char *val, struct kernel_param *kp)
+set_debug(const char *val, const struct kernel_param *kp)
{
int ret;
struct fritzcard *card;
diff --git a/drivers/isdn/hardware/mISDN/mISDNinfineon.c b/drivers/isdn/hardware/mISDN/mISDNinfineon.c
index d5bdbaf93a1a..1fc290659e94 100644
--- a/drivers/isdn/hardware/mISDN/mISDNinfineon.c
+++ b/drivers/isdn/hardware/mISDN/mISDNinfineon.c
@@ -244,7 +244,7 @@ _set_debug(struct inf_hw *card)
}
static int
-set_debug(const char *val, struct kernel_param *kp)
+set_debug(const char *val, const struct kernel_param *kp)
{
int ret;
struct inf_hw *card;
diff --git a/drivers/isdn/hardware/mISDN/netjet.c b/drivers/isdn/hardware/mISDN/netjet.c
index afde4edef9ae..e9fcae4569af 100644
--- a/drivers/isdn/hardware/mISDN/netjet.c
+++ b/drivers/isdn/hardware/mISDN/netjet.c
@@ -111,7 +111,7 @@ _set_debug(struct tiger_hw *card)
}
static int
-set_debug(const char *val, struct kernel_param *kp)
+set_debug(const char *val, const struct kernel_param *kp)
{
int ret;
struct tiger_hw *card;
diff --git a/drivers/isdn/hardware/mISDN/speedfax.c b/drivers/isdn/hardware/mISDN/speedfax.c
index 9815bb4eec9c..1f1446ed8d5f 100644
--- a/drivers/isdn/hardware/mISDN/speedfax.c
+++ b/drivers/isdn/hardware/mISDN/speedfax.c
@@ -94,7 +94,7 @@ _set_debug(struct sfax_hw *card)
}
static int
-set_debug(const char *val, struct kernel_param *kp)
+set_debug(const char *val, const struct kernel_param *kp)
{
int ret;
struct sfax_hw *card;
diff --git a/drivers/isdn/hardware/mISDN/w6692.c b/drivers/isdn/hardware/mISDN/w6692.c
index 3b067ea656bd..0db6783a139f 100644
--- a/drivers/isdn/hardware/mISDN/w6692.c
+++ b/drivers/isdn/hardware/mISDN/w6692.c
@@ -101,7 +101,7 @@ _set_debug(struct w6692_hw *card)
}
static int
-set_debug(const char *val, struct kernel_param *kp)
+set_debug(const char *val, const struct kernel_param *kp)
{
int ret;
struct w6692_hw *card;
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 197e29d1c2e6..57f740d76b4f 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -501,4 +501,52 @@ config DM_LOG_WRITES
If unsure, say N.
+config DM_VERITY_AVB
+ tristate "Support AVB specific verity error behavior"
+ depends on DM_VERITY
+ ---help---
+ Enables Android Verified Boot platform-specific error
+ behavior. In particular, it will modify the vbmeta partition
+ specified on the kernel command-line when non-transient error
+ occurs (followed by a panic).
+
+ If unsure, say N.
+
+config DM_ANDROID_VERITY
+ bool "Android verity target support"
+ depends on BLK_DEV_DM=y
+ depends on DM_VERITY=y
+ depends on X509_CERTIFICATE_PARSER
+ depends on SYSTEM_TRUSTED_KEYRING
+ depends on CRYPTO_RSA
+ depends on KEYS
+ depends on ASYMMETRIC_KEY_TYPE
+ depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE
+ ---help---
+ This device-mapper target is virtually a VERITY target. This
+ target is setup by reading the metadata contents piggybacked
+ to the actual data blocks in the block device. The signature
+ of the metadata contents are verified against the key included
+ in the system keyring. Upon success, the underlying verity
+ target is setup.
+
+config DM_ANDROID_VERITY_AT_MOST_ONCE_DEFAULT_ENABLED
+ bool "Verity will validate blocks at most once"
+ depends on DM_VERITY
+ ---help---
+ Default enables at_most_once option for dm-verity
+
+ Verify data blocks only the first time they are read from the
+ data device, rather than every time. This reduces the overhead
+ of dm-verity so that it can be used on systems that are memory
+ and/or CPU constrained. However, it provides a reduced level
+ of security because only offline tampering of the data device's
+ content will be detected, not online tampering.
+
+ Hash blocks are still verified each time they are read from the
+ hash device, since verification of hash blocks is less performance
+ critical than data blocks, and a hash block will not be verified
+ any more after all the data blocks it covers have been verified anyway.
+
+ If unsure, say N.
endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 3cbda1af87a0..63c317063efa 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -59,6 +59,7 @@ obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
obj-$(CONFIG_DM_ERA) += dm-era.o
obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
+obj-$(CONFIG_DM_ANDROID_VERITY) += dm-android-verity.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
@@ -67,3 +68,7 @@ endif
ifeq ($(CONFIG_DM_VERITY_FEC),y)
dm-verity-objs += dm-verity-fec.o
endif
+
+ifeq ($(CONFIG_DM_VERITY_AVB),y)
+dm-verity-objs += dm-verity-avb.o
+endif
diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c
new file mode 100644
index 000000000000..f9491dec3fcc
--- /dev/null
+++ b/drivers/md/dm-android-verity.c
@@ -0,0 +1,923 @@
+/*
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/device-mapper.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/key.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/of.h>
+#include <linux/reboot.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+
+#include <asm/setup.h>
+#include <crypto/hash.h>
+#include <crypto/hash_info.h>
+#include <crypto/public_key.h>
+#include <crypto/sha.h>
+#include <keys/asymmetric-type.h>
+#include <keys/system_keyring.h>
+
+#include "dm-verity.h"
+#include "dm-android-verity.h"
+
+static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH];
+static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH];
+static char veritykeyid[VERITY_DEFAULT_KEY_ID_LENGTH];
+static char buildvariant[BUILD_VARIANT];
+
+static bool target_added;
+static bool verity_enabled = true;
+struct dentry *debug_dir;
+static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv);
+
+static struct target_type android_verity_target = {
+ .name = "android-verity",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = android_verity_ctr,
+ .dtr = verity_dtr,
+ .map = verity_map,
+ .status = verity_status,
+ .prepare_ioctl = verity_prepare_ioctl,
+ .iterate_devices = verity_iterate_devices,
+ .io_hints = verity_io_hints,
+};
+
+static int __init verified_boot_state_param(char *line)
+{
+ strlcpy(verifiedbootstate, line, sizeof(verifiedbootstate));
+ return 1;
+}
+
+__setup("androidboot.verifiedbootstate=", verified_boot_state_param);
+
+static int __init verity_mode_param(char *line)
+{
+ strlcpy(veritymode, line, sizeof(veritymode));
+ return 1;
+}
+
+__setup("androidboot.veritymode=", verity_mode_param);
+
+static int __init verity_keyid_param(char *line)
+{
+ strlcpy(veritykeyid, line, sizeof(veritykeyid));
+ return 1;
+}
+
+__setup("veritykeyid=", verity_keyid_param);
+
+static int __init verity_buildvariant(char *line)
+{
+ strlcpy(buildvariant, line, sizeof(buildvariant));
+ return 1;
+}
+
+__setup("buildvariant=", verity_buildvariant);
+
+static inline bool default_verity_key_id(void)
+{
+ return veritykeyid[0] != '\0';
+}
+
+static inline bool is_eng(void)
+{
+ static const char typeeng[] = "eng";
+
+ return !strncmp(buildvariant, typeeng, sizeof(typeeng));
+}
+
+static inline bool is_userdebug(void)
+{
+ static const char typeuserdebug[] = "userdebug";
+
+ return !strncmp(buildvariant, typeuserdebug, sizeof(typeuserdebug));
+}
+
+static inline bool is_unlocked(void)
+{
+ static const char unlocked[] = "orange";
+
+ return !strncmp(verifiedbootstate, unlocked, sizeof(unlocked));
+}
+
+static int read_block_dev(struct bio_read *payload, struct block_device *bdev,
+ sector_t offset, int length)
+{
+ struct bio *bio;
+ int err = 0, i;
+
+ payload->number_of_pages = DIV_ROUND_UP(length, PAGE_SIZE);
+
+ bio = bio_alloc(GFP_KERNEL, payload->number_of_pages);
+ if (!bio) {
+ DMERR("Error while allocating bio");
+ return -ENOMEM;
+ }
+
+ bio->bi_bdev = bdev;
+ bio->bi_iter.bi_sector = offset;
+ bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC);
+
+ payload->page_io = kzalloc(sizeof(struct page *) *
+ payload->number_of_pages, GFP_KERNEL);
+ if (!payload->page_io) {
+ DMERR("page_io array alloc failed");
+ err = -ENOMEM;
+ goto free_bio;
+ }
+
+ for (i = 0; i < payload->number_of_pages; i++) {
+ payload->page_io[i] = alloc_page(GFP_KERNEL);
+ if (!payload->page_io[i]) {
+ DMERR("alloc_page failed");
+ err = -ENOMEM;
+ goto free_pages;
+ }
+ if (!bio_add_page(bio, payload->page_io[i], PAGE_SIZE, 0)) {
+ DMERR("bio_add_page error");
+ err = -EIO;
+ goto free_pages;
+ }
+ }
+
+ if (!submit_bio_wait(bio))
+ /* success */
+ goto free_bio;
+ DMERR("bio read failed");
+ err = -EIO;
+
+free_pages:
+ for (i = 0; i < payload->number_of_pages; i++)
+ if (payload->page_io[i])
+ __free_page(payload->page_io[i]);
+ kfree(payload->page_io);
+free_bio:
+ bio_put(bio);
+ return err;
+}
+
+static inline u64 fec_div_round_up(u64 x, u64 y)
+{
+ u64 remainder;
+
+ return div64_u64_rem(x, y, &remainder) +
+ (remainder > 0 ? 1 : 0);
+}
+
+static inline void populate_fec_metadata(struct fec_header *header,
+ struct fec_ecc_metadata *ecc)
+{
+ ecc->blocks = fec_div_round_up(le64_to_cpu(header->inp_size),
+ FEC_BLOCK_SIZE);
+ ecc->roots = le32_to_cpu(header->roots);
+ ecc->start = le64_to_cpu(header->inp_size);
+}
+
+static inline int validate_fec_header(struct fec_header *header, u64 offset)
+{
+ /* move offset to make the sanity check work for backup header
+ * as well. */
+ offset -= offset % FEC_BLOCK_SIZE;
+ if (le32_to_cpu(header->magic) != FEC_MAGIC ||
+ le32_to_cpu(header->version) != FEC_VERSION ||
+ le32_to_cpu(header->size) != sizeof(struct fec_header) ||
+ le32_to_cpu(header->roots) == 0 ||
+ le32_to_cpu(header->roots) >= FEC_RSM)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int extract_fec_header(dev_t dev, struct fec_header *fec,
+ struct fec_ecc_metadata *ecc)
+{
+ u64 device_size;
+ struct bio_read payload;
+ int i, err = 0;
+ struct block_device *bdev;
+
+ bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+
+ if (IS_ERR_OR_NULL(bdev)) {
+ DMERR("bdev get error");
+ return PTR_ERR(bdev);
+ }
+
+ device_size = i_size_read(bdev->bd_inode);
+
+ /* fec metadata size is a power of 2 and PAGE_SIZE
+ * is a power of 2 as well.
+ */
+ BUG_ON(FEC_BLOCK_SIZE > PAGE_SIZE);
+ /* 512 byte sector alignment */
+ BUG_ON(((device_size - FEC_BLOCK_SIZE) % (1 << SECTOR_SHIFT)) != 0);
+
+ err = read_block_dev(&payload, bdev, (device_size -
+ FEC_BLOCK_SIZE) / (1 << SECTOR_SHIFT), FEC_BLOCK_SIZE);
+ if (err) {
+ DMERR("Error while reading verity metadata");
+ goto error;
+ }
+
+ BUG_ON(sizeof(struct fec_header) > PAGE_SIZE);
+ memcpy(fec, page_address(payload.page_io[0]),
+ sizeof(*fec));
+
+ ecc->valid = true;
+ if (validate_fec_header(fec, device_size - FEC_BLOCK_SIZE)) {
+ /* Try the backup header */
+ memcpy(fec, page_address(payload.page_io[0]) + FEC_BLOCK_SIZE
+ - sizeof(*fec) ,
+ sizeof(*fec));
+ if (validate_fec_header(fec, device_size -
+ sizeof(struct fec_header)))
+ ecc->valid = false;
+ }
+
+ if (ecc->valid)
+ populate_fec_metadata(fec, ecc);
+
+ for (i = 0; i < payload.number_of_pages; i++)
+ __free_page(payload.page_io[i]);
+ kfree(payload.page_io);
+
+error:
+ blkdev_put(bdev, FMODE_READ);
+ return err;
+}
+static void find_metadata_offset(struct fec_header *fec,
+ struct block_device *bdev, u64 *metadata_offset)
+{
+ u64 device_size;
+
+ device_size = i_size_read(bdev->bd_inode);
+
+ if (le32_to_cpu(fec->magic) == FEC_MAGIC)
+ *metadata_offset = le64_to_cpu(fec->inp_size) -
+ VERITY_METADATA_SIZE;
+ else
+ *metadata_offset = device_size - VERITY_METADATA_SIZE;
+}
+
+static int find_size(dev_t dev, u64 *device_size)
+{
+ struct block_device *bdev;
+
+ bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+ if (IS_ERR_OR_NULL(bdev)) {
+ DMERR("blkdev_get_by_dev failed");
+ return PTR_ERR(bdev);
+ }
+
+ *device_size = i_size_read(bdev->bd_inode);
+ *device_size >>= SECTOR_SHIFT;
+
+ DMINFO("blkdev size in sectors: %llu", *device_size);
+ blkdev_put(bdev, FMODE_READ);
+ return 0;
+}
+
+static int verify_header(struct android_metadata_header *header)
+{
+ int retval = -EINVAL;
+
+ if (is_userdebug() && le32_to_cpu(header->magic_number) ==
+ VERITY_METADATA_MAGIC_DISABLE)
+ return VERITY_STATE_DISABLE;
+
+ if (!(le32_to_cpu(header->magic_number) ==
+ VERITY_METADATA_MAGIC_NUMBER) ||
+ (le32_to_cpu(header->magic_number) ==
+ VERITY_METADATA_MAGIC_DISABLE)) {
+ DMERR("Incorrect magic number");
+ return retval;
+ }
+
+ if (le32_to_cpu(header->protocol_version) !=
+ VERITY_METADATA_VERSION) {
+ DMERR("Unsupported version %u",
+ le32_to_cpu(header->protocol_version));
+ return retval;
+ }
+
+ return 0;
+}
+
+static int extract_metadata(dev_t dev, struct fec_header *fec,
+ struct android_metadata **metadata,
+ bool *verity_enabled)
+{
+ struct block_device *bdev;
+ struct android_metadata_header *header;
+ int i;
+ u32 table_length, copy_length, offset;
+ u64 metadata_offset;
+ struct bio_read payload;
+ int err = 0;
+
+ bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+
+ if (IS_ERR_OR_NULL(bdev)) {
+ DMERR("blkdev_get_by_dev failed");
+ return -ENODEV;
+ }
+
+ find_metadata_offset(fec, bdev, &metadata_offset);
+
+ /* Verity metadata size is a power of 2 and PAGE_SIZE
+ * is a power of 2 as well.
+ * PAGE_SIZE is also a multiple of 512 bytes.
+ */
+ if (VERITY_METADATA_SIZE > PAGE_SIZE)
+ BUG_ON(VERITY_METADATA_SIZE % PAGE_SIZE != 0);
+ /* 512 byte sector alignment */
+ BUG_ON(metadata_offset % (1 << SECTOR_SHIFT) != 0);
+
+ err = read_block_dev(&payload, bdev, metadata_offset /
+ (1 << SECTOR_SHIFT), VERITY_METADATA_SIZE);
+ if (err) {
+ DMERR("Error while reading verity metadata");
+ goto blkdev_release;
+ }
+
+ header = kzalloc(sizeof(*header), GFP_KERNEL);
+ if (!header) {
+ DMERR("kzalloc failed for header");
+ err = -ENOMEM;
+ goto free_payload;
+ }
+
+ memcpy(header, page_address(payload.page_io[0]),
+ sizeof(*header));
+
+ DMINFO("bio magic_number:%u protocol_version:%d table_length:%u",
+ le32_to_cpu(header->magic_number),
+ le32_to_cpu(header->protocol_version),
+ le32_to_cpu(header->table_length));
+
+ err = verify_header(header);
+
+ if (err == VERITY_STATE_DISABLE) {
+ DMERR("Mounting root with verity disabled");
+ *verity_enabled = false;
+ /* we would still have to read the metadata to figure out
+ * the data blocks size. Or may be could map the entire
+ * partition similar to mounting the device.
+ *
+ * Reset error as well as the verity_enabled flag is changed.
+ */
+ err = 0;
+ } else if (err)
+ goto free_header;
+
+ *metadata = kzalloc(sizeof(**metadata), GFP_KERNEL);
+ if (!*metadata) {
+ DMERR("kzalloc for metadata failed");
+ err = -ENOMEM;
+ goto free_header;
+ }
+
+ (*metadata)->header = header;
+ table_length = le32_to_cpu(header->table_length);
+
+ if (table_length == 0 ||
+ table_length > (VERITY_METADATA_SIZE -
+ sizeof(struct android_metadata_header))) {
+ DMERR("table_length too long");
+ err = -EINVAL;
+ goto free_metadata;
+ }
+
+ (*metadata)->verity_table = kzalloc(table_length + 1, GFP_KERNEL);
+
+ if (!(*metadata)->verity_table) {
+ DMERR("kzalloc verity_table failed");
+ err = -ENOMEM;
+ goto free_metadata;
+ }
+
+ if (sizeof(struct android_metadata_header) +
+ table_length <= PAGE_SIZE) {
+ memcpy((*metadata)->verity_table,
+ page_address(payload.page_io[0])
+ + sizeof(struct android_metadata_header),
+ table_length);
+ } else {
+ copy_length = PAGE_SIZE -
+ sizeof(struct android_metadata_header);
+ memcpy((*metadata)->verity_table,
+ page_address(payload.page_io[0])
+ + sizeof(struct android_metadata_header),
+ copy_length);
+ table_length -= copy_length;
+ offset = copy_length;
+ i = 1;
+ while (table_length != 0) {
+ if (table_length > PAGE_SIZE) {
+ memcpy((*metadata)->verity_table + offset,
+ page_address(payload.page_io[i]),
+ PAGE_SIZE);
+ offset += PAGE_SIZE;
+ table_length -= PAGE_SIZE;
+ } else {
+ memcpy((*metadata)->verity_table + offset,
+ page_address(payload.page_io[i]),
+ table_length);
+ table_length = 0;
+ }
+ i++;
+ }
+ }
+ (*metadata)->verity_table[table_length] = '\0';
+
+ DMINFO("verity_table: %s", (*metadata)->verity_table);
+ goto free_payload;
+
+free_metadata:
+ kfree(*metadata);
+free_header:
+ kfree(header);
+free_payload:
+ for (i = 0; i < payload.number_of_pages; i++)
+ if (payload.page_io[i])
+ __free_page(payload.page_io[i]);
+ kfree(payload.page_io);
+blkdev_release:
+ blkdev_put(bdev, FMODE_READ);
+ return err;
+}
+
+/* helper functions to extract properties from dts */
+const char *find_dt_value(const char *name)
+{
+ struct device_node *firmware;
+ const char *value;
+
+ firmware = of_find_node_by_path("/firmware/android");
+ if (!firmware)
+ return NULL;
+ value = of_get_property(firmware, name, NULL);
+ of_node_put(firmware);
+
+ return value;
+}
+
+static int verity_mode(void)
+{
+ static const char enforcing[] = "enforcing";
+ static const char verified_mode_prop[] = "veritymode";
+ const char *value;
+
+ value = find_dt_value(verified_mode_prop);
+ if (!value)
+ value = veritymode;
+ if (!strncmp(value, enforcing, sizeof(enforcing) - 1))
+ return DM_VERITY_MODE_RESTART;
+
+ return DM_VERITY_MODE_EIO;
+}
+
+static void handle_error(void)
+{
+ int mode = verity_mode();
+ if (mode == DM_VERITY_MODE_RESTART) {
+ DMERR("triggering restart");
+ kernel_restart("dm-verity device corrupted");
+ } else {
+ DMERR("Mounting verity root failed");
+ }
+}
+
+static struct public_key_signature *table_make_digest(
+ enum hash_algo hash,
+ const void *table,
+ unsigned long table_len)
+{
+ struct public_key_signature *pks = NULL;
+ struct crypto_shash *tfm;
+ struct shash_desc *desc;
+ size_t digest_size, desc_size;
+ int ret;
+
+ /* Allocate the hashing algorithm we're going to need and find out how
+ * big the hash operational data will be.
+ */
+ tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);
+ if (IS_ERR(tfm))
+ return ERR_CAST(tfm);
+
+ desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+ digest_size = crypto_shash_digestsize(tfm);
+
+ /* We allocate the hash operational data storage on the end of out
+ * context data and the digest output buffer on the end of that.
+ */
+ ret = -ENOMEM;
+ pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
+ if (!pks)
+ goto error;
+
+ pks->pkey_algo = "rsa";
+ pks->hash_algo = hash_algo_name[hash];
+ pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
+ pks->digest_size = digest_size;
+
+ desc = (struct shash_desc *)(pks + 1);
+ desc->tfm = tfm;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ ret = crypto_shash_init(desc);
+ if (ret < 0)
+ goto error;
+
+ ret = crypto_shash_finup(desc, table, table_len, pks->digest);
+ if (ret < 0)
+ goto error;
+
+ crypto_free_shash(tfm);
+ return pks;
+
+error:
+ kfree(pks);
+ crypto_free_shash(tfm);
+ return ERR_PTR(ret);
+}
+
+
+static int verify_verity_signature(char *key_id,
+ struct android_metadata *metadata)
+{
+ struct public_key_signature *pks = NULL;
+ int retval = -EINVAL;
+
+ if (!key_id)
+ goto error;
+
+ pks = table_make_digest(HASH_ALGO_SHA256,
+ (const void *)metadata->verity_table,
+ le32_to_cpu(metadata->header->table_length));
+ if (IS_ERR(pks)) {
+ DMERR("hashing failed");
+ retval = PTR_ERR(pks);
+ pks = NULL;
+ goto error;
+ }
+
+ pks->s = kmemdup(&metadata->header->signature[0], RSANUMBYTES, GFP_KERNEL);
+ if (!pks->s) {
+ DMERR("Error allocating memory for signature");
+ goto error;
+ }
+ pks->s_size = RSANUMBYTES;
+
+ retval = verify_signature_one(pks, NULL, key_id);
+ kfree(pks->s);
+error:
+ kfree(pks);
+ return retval;
+}
+
+static inline bool test_mult_overflow(sector_t a, u32 b)
+{
+ sector_t r = (sector_t)~0ULL;
+
+ sector_div(r, b);
+ return a > r;
+}
+
+static int add_as_linear_device(struct dm_target *ti, char *dev)
+{
+ /*Move to linear mapping defines*/
+ char *linear_table_args[DM_LINEAR_ARGS] = {dev,
+ DM_LINEAR_TARGET_OFFSET};
+ int err = 0;
+
+ android_verity_target.dtr = dm_linear_dtr,
+ android_verity_target.map = dm_linear_map,
+ android_verity_target.status = dm_linear_status,
+ android_verity_target.prepare_ioctl = dm_linear_prepare_ioctl,
+ android_verity_target.iterate_devices = dm_linear_iterate_devices,
+ android_verity_target.direct_access = dm_linear_direct_access,
+ android_verity_target.io_hints = NULL;
+
+ set_disk_ro(dm_disk(dm_table_get_md(ti->table)), 0);
+
+ err = dm_linear_ctr(ti, DM_LINEAR_ARGS, linear_table_args);
+
+ if (!err) {
+ DMINFO("Added android-verity as a linear target");
+ target_added = true;
+ } else
+ DMERR("Failed to add android-verity as linear target");
+
+ return err;
+}
+
+static int create_linear_device(struct dm_target *ti, dev_t dev,
+ char *target_device)
+{
+ u64 device_size = 0;
+ int err = find_size(dev, &device_size);
+
+ if (err) {
+ DMERR("error finding bdev size");
+ handle_error();
+ return err;
+ }
+
+ ti->len = device_size;
+ err = add_as_linear_device(ti, target_device);
+ if (err) {
+ handle_error();
+ return err;
+ }
+ verity_enabled = false;
+ return 0;
+}
+
+/*
+ * Target parameters:
+ * <key id> Key id of the public key in the system keyring.
+ * Verity metadata's signature would be verified against
+ * this. If the key id contains spaces, replace them
+ * with '#'.
+ * <block device> The block device for which dm-verity is being setup.
+ */
+static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ dev_t uninitialized_var(dev);
+ struct android_metadata *metadata = NULL;
+ int err = 0, i, mode;
+ char *key_id = NULL, *table_ptr, dummy, *target_device;
+ char *verity_table_args[VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS];
+ /* One for specifying number of opt args and one for mode */
+ sector_t data_sectors;
+ u32 data_block_size;
+ unsigned int no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS;
+ struct fec_header uninitialized_var(fec);
+ struct fec_ecc_metadata uninitialized_var(ecc);
+ char buf[FEC_ARG_LENGTH], *buf_ptr;
+ unsigned long long tmpll;
+
+ if (argc == 1) {
+ /* Use the default keyid */
+ if (default_verity_key_id())
+ key_id = veritykeyid;
+ else if (!is_eng()) {
+ DMERR("veritykeyid= is not set");
+ handle_error();
+ return -EINVAL;
+ }
+ target_device = argv[0];
+ } else if (argc == 2) {
+ key_id = argv[0];
+ target_device = argv[1];
+ } else {
+ DMERR("Incorrect number of arguments");
+ handle_error();
+ return -EINVAL;
+ }
+
+ dev = name_to_dev_t(target_device);
+ if (!dev) {
+ DMERR("no dev found for %s", target_device);
+ handle_error();
+ return -EINVAL;
+ }
+
+ if (is_eng())
+ return create_linear_device(ti, dev, target_device);
+
+ strreplace(key_id, '#', ' ');
+
+ DMINFO("key:%s dev:%s", key_id, target_device);
+
+ if (extract_fec_header(dev, &fec, &ecc)) {
+ DMERR("Error while extracting fec header");
+ handle_error();
+ return -EINVAL;
+ }
+
+ err = extract_metadata(dev, &fec, &metadata, &verity_enabled);
+
+ if (err) {
+ /* Allow invalid metadata when the device is unlocked */
+ if (is_unlocked()) {
+ DMWARN("Allow invalid metadata when unlocked");
+ return create_linear_device(ti, dev, target_device);
+ }
+ DMERR("Error while extracting metadata");
+ handle_error();
+ goto free_metadata;
+ }
+
+ if (verity_enabled) {
+ err = verify_verity_signature(key_id, metadata);
+
+ if (err) {
+ DMERR("Signature verification failed");
+ handle_error();
+ goto free_metadata;
+ } else
+ DMINFO("Signature verification success");
+ }
+
+ table_ptr = metadata->verity_table;
+
+ for (i = 0; i < VERITY_TABLE_ARGS; i++) {
+ verity_table_args[i] = strsep(&table_ptr, " ");
+ if (verity_table_args[i] == NULL)
+ break;
+ }
+
+ if (i != VERITY_TABLE_ARGS) {
+ DMERR("Verity table not in the expected format");
+ err = -EINVAL;
+ handle_error();
+ goto free_metadata;
+ }
+
+ if (sscanf(verity_table_args[5], "%llu%c", &tmpll, &dummy)
+ != 1) {
+ DMERR("Verity table not in the expected format");
+ handle_error();
+ err = -EINVAL;
+ goto free_metadata;
+ }
+
+ if (tmpll > ULONG_MAX) {
+ DMERR("<num_data_blocks> too large. Forgot to turn on CONFIG_LBDAF?");
+ handle_error();
+ err = -EINVAL;
+ goto free_metadata;
+ }
+
+ data_sectors = tmpll;
+
+ if (sscanf(verity_table_args[3], "%u%c", &data_block_size, &dummy)
+ != 1) {
+ DMERR("Verity table not in the expected format");
+ handle_error();
+ err = -EINVAL;
+ goto free_metadata;
+ }
+
+ if (test_mult_overflow(data_sectors, data_block_size >>
+ SECTOR_SHIFT)) {
+ DMERR("data_sectors too large");
+ handle_error();
+ err = -EOVERFLOW;
+ goto free_metadata;
+ }
+
+ data_sectors *= data_block_size >> SECTOR_SHIFT;
+ DMINFO("Data sectors %llu", (unsigned long long)data_sectors);
+
+ /* update target length */
+ ti->len = data_sectors;
+
+ /* Setup linear target and free */
+ if (!verity_enabled) {
+ err = add_as_linear_device(ti, target_device);
+ goto free_metadata;
+ }
+
+ /*substitute data_dev and hash_dev*/
+ verity_table_args[1] = target_device;
+ verity_table_args[2] = target_device;
+
+ mode = verity_mode();
+
+ if (ecc.valid && IS_BUILTIN(CONFIG_DM_VERITY_FEC)) {
+ if (mode) {
+ err = snprintf(buf, FEC_ARG_LENGTH,
+ "%u %s " VERITY_TABLE_OPT_FEC_FORMAT,
+ 1 + VERITY_TABLE_OPT_FEC_ARGS,
+ mode == DM_VERITY_MODE_RESTART ?
+ VERITY_TABLE_OPT_RESTART :
+ VERITY_TABLE_OPT_LOGGING,
+ target_device,
+ ecc.start / FEC_BLOCK_SIZE, ecc.blocks,
+ ecc.roots);
+ } else {
+ err = snprintf(buf, FEC_ARG_LENGTH,
+ "%u " VERITY_TABLE_OPT_FEC_FORMAT,
+ VERITY_TABLE_OPT_FEC_ARGS, target_device,
+ ecc.start / FEC_BLOCK_SIZE, ecc.blocks,
+ ecc.roots);
+ }
+ } else if (mode) {
+ err = snprintf(buf, FEC_ARG_LENGTH,
+ "2 " VERITY_TABLE_OPT_IGNZERO " %s",
+ mode == DM_VERITY_MODE_RESTART ?
+ VERITY_TABLE_OPT_RESTART : VERITY_TABLE_OPT_LOGGING);
+ } else {
+ err = snprintf(buf, FEC_ARG_LENGTH, "1 %s",
+ "ignore_zero_blocks");
+ }
+
+ if (err < 0 || err >= FEC_ARG_LENGTH)
+ goto free_metadata;
+
+ buf_ptr = buf;
+
+ for (i = VERITY_TABLE_ARGS; i < (VERITY_TABLE_ARGS +
+ VERITY_TABLE_OPT_FEC_ARGS + 2); i++) {
+ verity_table_args[i] = strsep(&buf_ptr, " ");
+ if (verity_table_args[i] == NULL) {
+ no_of_args = i;
+ break;
+ }
+ }
+
+ err = verity_ctr(ti, no_of_args, verity_table_args);
+ if (err) {
+ DMERR("android-verity failed to create a verity target");
+ } else {
+ target_added = true;
+ DMINFO("android-verity created as verity target");
+ }
+
+free_metadata:
+ if (metadata) {
+ kfree(metadata->header);
+ kfree(metadata->verity_table);
+ }
+ kfree(metadata);
+ return err;
+}
+
+static int __init dm_android_verity_init(void)
+{
+ int r;
+ struct dentry *file;
+
+ r = dm_register_target(&android_verity_target);
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ /* Tracks the status of the last added target */
+ debug_dir = debugfs_create_dir("android_verity", NULL);
+
+ if (IS_ERR_OR_NULL(debug_dir)) {
+ DMERR("Cannot create android_verity debugfs directory: %ld",
+ PTR_ERR(debug_dir));
+ goto end;
+ }
+
+ file = debugfs_create_bool("target_added", S_IRUGO, debug_dir,
+ &target_added);
+
+ if (IS_ERR_OR_NULL(file)) {
+ DMERR("Cannot create android_verity debugfs directory: %ld",
+ PTR_ERR(debug_dir));
+ debugfs_remove_recursive(debug_dir);
+ goto end;
+ }
+
+ file = debugfs_create_bool("verity_enabled", S_IRUGO, debug_dir,
+ &verity_enabled);
+
+ if (IS_ERR_OR_NULL(file)) {
+ DMERR("Cannot create android_verity debugfs directory: %ld",
+ PTR_ERR(debug_dir));
+ debugfs_remove_recursive(debug_dir);
+ }
+
+end:
+ return r;
+}
+
+static void __exit dm_android_verity_exit(void)
+{
+ if (!IS_ERR_OR_NULL(debug_dir))
+ debugfs_remove_recursive(debug_dir);
+
+ dm_unregister_target(&android_verity_target);
+}
+
+module_init(dm_android_verity_init);
+module_exit(dm_android_verity_exit);
diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h
new file mode 100644
index 000000000000..c8d7ab642780
--- /dev/null
+++ b/drivers/md/dm-android-verity.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef DM_ANDROID_VERITY_H
+#define DM_ANDROID_VERITY_H
+
+#include <crypto/sha.h>
+
+#define RSANUMBYTES 256
+#define VERITY_METADATA_MAGIC_NUMBER 0xb001b001
+#define VERITY_METADATA_MAGIC_DISABLE 0x46464f56
+#define VERITY_METADATA_VERSION 0
+#define VERITY_STATE_DISABLE 1
+#define DATA_BLOCK_SIZE (4 * 1024)
+#define VERITY_METADATA_SIZE (8 * DATA_BLOCK_SIZE)
+#define VERITY_TABLE_ARGS 10
+#define VERITY_COMMANDLINE_PARAM_LENGTH 20
+#define BUILD_VARIANT 20
+
+/*
+ * <subject>:<sha1-id> is the format for the identifier.
+ * subject can either be the Common Name(CN) + Organization Name(O) or
+ * just the CN if the it is prefixed with O
+ * From https://tools.ietf.org/html/rfc5280#appendix-A
+ * ub-organization-name-length INTEGER ::= 64
+ * ub-common-name-length INTEGER ::= 64
+ *
+ * http://lxr.free-electrons.com/source/crypto/asymmetric_keys/x509_cert_parser.c?v=3.9#L278
+ * ctx->o_size + 2 + ctx->cn_size + 1
+ * + 41 characters for ":" and sha1 id
+ * 64 + 2 + 64 + 1 + 1 + 40 (172)
+ * setting VERITY_DEFAULT_KEY_ID_LENGTH to 200 characters.
+ */
+#define VERITY_DEFAULT_KEY_ID_LENGTH 200
+
+#define FEC_MAGIC 0xFECFECFE
+#define FEC_BLOCK_SIZE (4 * 1024)
+#define FEC_VERSION 0
+#define FEC_RSM 255
+#define FEC_ARG_LENGTH 300
+
+#define VERITY_TABLE_OPT_RESTART "restart_on_corruption"
+#define VERITY_TABLE_OPT_LOGGING "ignore_corruption"
+#define VERITY_TABLE_OPT_IGNZERO "ignore_zero_blocks"
+
+#define VERITY_TABLE_OPT_FEC_FORMAT \
+ "use_fec_from_device %s fec_start %llu fec_blocks %llu fec_roots %u ignore_zero_blocks"
+#define VERITY_TABLE_OPT_FEC_ARGS 9
+
+#define VERITY_DEBUG 0
+
+#define DM_MSG_PREFIX "android-verity"
+
+#define DM_LINEAR_ARGS 2
+#define DM_LINEAR_TARGET_OFFSET "0"
+
+/*
+ * There can be two formats.
+ * if fec is present
+ * <data_blocks> <verity_tree> <verity_metdata_32K><fec_data><fec_data_4K>
+ * if fec is not present
+ * <data_blocks> <verity_tree> <verity_metdata_32K>
+ */
+struct fec_header {
+ __le32 magic;
+ __le32 version;
+ __le32 size;
+ __le32 roots;
+ __le32 fec_size;
+ __le64 inp_size;
+ u8 hash[SHA256_DIGEST_SIZE];
+} __attribute__((packed));
+
+struct android_metadata_header {
+ __le32 magic_number;
+ __le32 protocol_version;
+ char signature[RSANUMBYTES];
+ __le32 table_length;
+};
+
+struct android_metadata {
+ struct android_metadata_header *header;
+ char *verity_table;
+};
+
+struct fec_ecc_metadata {
+ bool valid;
+ u32 roots;
+ u64 blocks;
+ u64 rounds;
+ u64 start;
+};
+
+struct bio_read {
+ struct page **page_io;
+ int number_of_pages;
+};
+
+extern struct target_type linear_target;
+
+extern void dm_linear_dtr(struct dm_target *ti);
+extern int dm_linear_map(struct dm_target *ti, struct bio *bio);
+extern void dm_linear_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen);
+extern int dm_linear_prepare_ioctl(struct dm_target *ti,
+ struct block_device **bdev, fmode_t *mode);
+extern int dm_linear_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data);
+extern int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv);
+extern long dm_linear_direct_access(struct dm_target *ti, sector_t sector,
+ void **kaddr, pfn_t *pfn, long size);
+#endif /* DM_ANDROID_VERITY_H */
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 0aedd0ebccec..45a1f9232dca 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -115,6 +115,10 @@ struct iv_tcw_private {
enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD };
+enum cipher_flags {
+ CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */
+};
+
/*
* The fields in here must be read only after initialization.
*/
@@ -150,11 +154,14 @@ struct crypt_config {
} iv_gen_private;
sector_t iv_offset;
unsigned int iv_size;
+ unsigned short int sector_size;
+ unsigned char sector_shift;
/* ESSIV: struct crypto_cipher *essiv_tfm */
void *iv_private;
struct crypto_skcipher **tfms;
unsigned tfms_count;
+ unsigned long cipher_flags;
/*
* Layout of each crypto request:
@@ -484,6 +491,11 @@ static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti,
{
struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+ if (cc->sector_size != (1 << SECTOR_SHIFT)) {
+ ti->error = "Unsupported sector size for LMK";
+ return -EINVAL;
+ }
+
lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0);
if (IS_ERR(lmk->hash_tfm)) {
ti->error = "Error initializing LMK hash";
@@ -633,6 +645,11 @@ static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti,
{
struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+ if (cc->sector_size != (1 << SECTOR_SHIFT)) {
+ ti->error = "Unsupported sector size for TCW";
+ return -EINVAL;
+ }
+
if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) {
ti->error = "Wrong key size for TCW";
return -EINVAL;
@@ -846,21 +863,27 @@ static int crypt_convert_block(struct crypt_config *cc,
u8 *iv;
int r;
+ /* Reject unexpected unaligned bio. */
+ if (unlikely(bv_in.bv_len & (cc->sector_size - 1)))
+ return -EIO;
+
dmreq = dmreq_of_req(cc, req);
iv = iv_of_dmreq(cc, dmreq);
dmreq->iv_sector = ctx->cc_sector;
+ if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags))
+ dmreq->iv_sector >>= cc->sector_shift;
dmreq->ctx = ctx;
sg_init_table(&dmreq->sg_in, 1);
- sg_set_page(&dmreq->sg_in, bv_in.bv_page, 1 << SECTOR_SHIFT,
+ sg_set_page(&dmreq->sg_in, bv_in.bv_page, cc->sector_size,
bv_in.bv_offset);
sg_init_table(&dmreq->sg_out, 1);
- sg_set_page(&dmreq->sg_out, bv_out.bv_page, 1 << SECTOR_SHIFT,
+ sg_set_page(&dmreq->sg_out, bv_out.bv_page, cc->sector_size,
bv_out.bv_offset);
- bio_advance_iter(ctx->bio_in, &ctx->iter_in, 1 << SECTOR_SHIFT);
- bio_advance_iter(ctx->bio_out, &ctx->iter_out, 1 << SECTOR_SHIFT);
+ bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size);
+ bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size);
if (cc->iv_gen_ops) {
r = cc->iv_gen_ops->generator(cc, iv, dmreq);
@@ -869,7 +892,7 @@ static int crypt_convert_block(struct crypt_config *cc,
}
skcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out,
- 1 << SECTOR_SHIFT, iv);
+ cc->sector_size, iv);
if (bio_data_dir(ctx->bio_in) == WRITE)
r = crypto_skcipher_encrypt(req);
@@ -919,6 +942,7 @@ static void crypt_free_req(struct crypt_config *cc,
static int crypt_convert(struct crypt_config *cc,
struct convert_context *ctx)
{
+ unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT;
int r;
atomic_set(&ctx->cc_pending, 1);
@@ -926,7 +950,6 @@ static int crypt_convert(struct crypt_config *cc,
while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) {
crypt_alloc_req(cc, ctx);
-
atomic_inc(&ctx->cc_pending);
r = crypt_convert_block(cc, ctx, ctx->req);
@@ -946,14 +969,14 @@ static int crypt_convert(struct crypt_config *cc,
*/
case -EINPROGRESS:
ctx->req = NULL;
- ctx->cc_sector++;
+ ctx->cc_sector += sector_step;
continue;
/*
* The request was already processed (synchronously).
*/
case 0:
atomic_dec(&ctx->cc_pending);
- ctx->cc_sector++;
+ ctx->cc_sector += sector_step;
cond_resched();
continue;
@@ -1468,6 +1491,13 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
}
}
+ /*
+ * dm-crypt performance can vary greatly depending on which crypto
+ * algorithm implementation is used. Help people debug performance
+ * problems by logging the ->cra_driver_name.
+ */
+ DMINFO("%s using implementation \"%s\"", ciphermode,
+ crypto_skcipher_alg(any_tfm(cc))->base.cra_driver_name);
return 0;
}
@@ -1743,7 +1773,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
char dummy;
static struct dm_arg _args[] = {
- {0, 3, "Invalid number of feature args"},
+ {0, 5, "Invalid number of feature args"},
};
if (argc < 5) {
@@ -1759,6 +1789,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return -ENOMEM;
}
cc->key_size = key_size;
+ cc->sector_size = (1 << SECTOR_SHIFT);
+ cc->sector_shift = 0;
ti->private = cc;
ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
@@ -1810,7 +1842,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
mutex_init(&cc->bio_alloc_lock);
ret = -EINVAL;
- if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) {
+ if ((sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) ||
+ (tmpll & ((cc->sector_size >> SECTOR_SHIFT) - 1))) {
ti->error = "Invalid iv_offset sector";
goto bad;
}
@@ -1858,6 +1891,21 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
else if (!strcasecmp(opt_string, "submit_from_crypt_cpus"))
set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+ else if (sscanf(opt_string, "sector_size:%hu%c",
+ &cc->sector_size, &dummy) == 1) {
+ if (cc->sector_size < (1 << SECTOR_SHIFT) ||
+ cc->sector_size > 4096 ||
+ (cc->sector_size & (cc->sector_size - 1))) {
+ ti->error = "Invalid feature value for sector_size";
+ goto bad;
+ }
+ if (ti->len & ((cc->sector_size >> SECTOR_SHIFT) - 1)) {
+ ti->error = "Device size is not multiple of sector_size feature";
+ goto bad;
+ }
+ cc->sector_shift = __ffs(cc->sector_size) - SECTOR_SHIFT;
+ } else if (!strcasecmp(opt_string, "iv_large_sectors"))
+ set_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags);
else {
ti->error = "Invalid feature arguments";
goto bad;
@@ -1866,16 +1914,24 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
ret = -ENOMEM;
- cc->io_queue = alloc_workqueue("kcryptd_io", WQ_MEM_RECLAIM, 1);
+ cc->io_queue = alloc_workqueue("kcryptd_io",
+ WQ_HIGHPRI |
+ WQ_MEM_RECLAIM,
+ 1);
if (!cc->io_queue) {
ti->error = "Couldn't create kcryptd io queue";
goto bad;
}
if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
- cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+ cc->crypt_queue = alloc_workqueue("kcryptd",
+ WQ_HIGHPRI |
+ WQ_MEM_RECLAIM, 1);
else
- cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
+ cc->crypt_queue = alloc_workqueue("kcryptd",
+ WQ_HIGHPRI |
+ WQ_MEM_RECLAIM |
+ WQ_UNBOUND,
num_online_cpus());
if (!cc->crypt_queue) {
ti->error = "Couldn't create kcryptd queue";
@@ -1930,6 +1986,16 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
bio_data_dir(bio) == WRITE)
dm_accept_partial_bio(bio, ((BIO_MAX_PAGES << PAGE_SHIFT) >> SECTOR_SHIFT));
+ /*
+ * Ensure that bio is a multiple of internal sector encryption size
+ * and is aligned to this size as defined in IO hints.
+ */
+ if (unlikely((bio->bi_iter.bi_sector & ((cc->sector_size >> SECTOR_SHIFT) - 1)) != 0))
+ return -EIO;
+
+ if (unlikely(bio->bi_iter.bi_size & (cc->sector_size - 1)))
+ return -EIO;
+
io = dm_per_bio_data(bio, cc->per_bio_data_size);
crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
io->ctx.req = (struct skcipher_request *)(io + 1);
@@ -1970,6 +2036,8 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
num_feature_args += !!ti->num_discard_bios;
num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags);
num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+ num_feature_args += cc->sector_size != (1 << SECTOR_SHIFT);
+ num_feature_args += test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags);
if (num_feature_args) {
DMEMIT(" %d", num_feature_args);
if (ti->num_discard_bios)
@@ -1978,6 +2046,10 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
DMEMIT(" same_cpu_crypt");
if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags))
DMEMIT(" submit_from_crypt_cpus");
+ if (cc->sector_size != (1 << SECTOR_SHIFT))
+ DMEMIT(" sector_size:%d", cc->sector_size);
+ if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags))
+ DMEMIT(" iv_large_sectors");
}
break;
@@ -2060,6 +2132,8 @@ static int crypt_iterate_devices(struct dm_target *ti,
static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
+ struct crypt_config *cc = ti->private;
+
/*
* Unfortunate constraint that is required to avoid the potential
* for exceeding underlying device's max_segments limits -- due to
@@ -2067,11 +2141,17 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
* bio that are not as physically contiguous as the original bio.
*/
limits->max_segment_size = PAGE_SIZE;
+
+ limits->logical_block_size =
+ max_t(unsigned short, limits->logical_block_size, cc->sector_size);
+ limits->physical_block_size =
+ max_t(unsigned, limits->physical_block_size, cc->sector_size);
+ limits->io_min = max_t(unsigned, limits->io_min, cc->sector_size);
}
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 14, 1},
+ .version = {1, 17, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 6964b252952a..446d76e14c58 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1921,6 +1921,45 @@ void dm_interface_exit(void)
dm_hash_exit();
}
+
+/**
+ * dm_ioctl_export - Permanently export a mapped device via the ioctl interface
+ * @md: Pointer to mapped_device
+ * @name: Buffer (size DM_NAME_LEN) for name
+ * @uuid: Buffer (size DM_UUID_LEN) for uuid or NULL if not desired
+ */
+int dm_ioctl_export(struct mapped_device *md, const char *name,
+ const char *uuid)
+{
+ int r = 0;
+ struct hash_cell *hc;
+
+ if (!md) {
+ r = -ENXIO;
+ goto out;
+ }
+
+ /* The name and uuid can only be set once. */
+ mutex_lock(&dm_hash_cells_mutex);
+ hc = dm_get_mdptr(md);
+ mutex_unlock(&dm_hash_cells_mutex);
+ if (hc) {
+ DMERR("%s: already exported", dm_device_name(md));
+ r = -ENXIO;
+ goto out;
+ }
+
+ r = dm_hash_insert(name, uuid, md);
+ if (r) {
+ DMERR("%s: could not bind to '%s'", dm_device_name(md), name);
+ goto out;
+ }
+
+ /* Let udev know we've changed. */
+ dm_kobject_uevent(md, KOBJ_CHANGE, dm_get_event_nr(md));
+out:
+ return r;
+}
/**
* dm_copy_name_and_uuid - Copy mapped device name & uuid into supplied buffers
* @md: Pointer to mapped_device
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 4788b0b989a9..4ad62d680547 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -25,7 +25,7 @@ struct linear_c {
/*
* Construct a linear mapping: <dev_path> <offset>
*/
-static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct linear_c *lc;
unsigned long long tmp;
@@ -66,14 +66,16 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
kfree(lc);
return ret;
}
+EXPORT_SYMBOL_GPL(dm_linear_ctr);
-static void linear_dtr(struct dm_target *ti)
+void dm_linear_dtr(struct dm_target *ti)
{
struct linear_c *lc = (struct linear_c *) ti->private;
dm_put_device(ti, lc->dev);
kfree(lc);
}
+EXPORT_SYMBOL_GPL(dm_linear_dtr);
static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector)
{
@@ -92,14 +94,15 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
linear_map_sector(ti, bio->bi_iter.bi_sector);
}
-static int linear_map(struct dm_target *ti, struct bio *bio)
+int dm_linear_map(struct dm_target *ti, struct bio *bio)
{
linear_map_bio(ti, bio);
return DM_MAPIO_REMAPPED;
}
+EXPORT_SYMBOL_GPL(dm_linear_map);
-static void linear_status(struct dm_target *ti, status_type_t type,
+void dm_linear_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
{
struct linear_c *lc = (struct linear_c *) ti->private;
@@ -115,8 +118,9 @@ static void linear_status(struct dm_target *ti, status_type_t type,
break;
}
}
+EXPORT_SYMBOL_GPL(dm_linear_status);
-static int linear_prepare_ioctl(struct dm_target *ti,
+int dm_linear_prepare_ioctl(struct dm_target *ti,
struct block_device **bdev, fmode_t *mode)
{
struct linear_c *lc = (struct linear_c *) ti->private;
@@ -132,16 +136,18 @@ static int linear_prepare_ioctl(struct dm_target *ti,
return 1;
return 0;
}
+EXPORT_SYMBOL_GPL(dm_linear_prepare_ioctl);
-static int linear_iterate_devices(struct dm_target *ti,
+int dm_linear_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
struct linear_c *lc = ti->private;
return fn(ti, lc->dev, lc->start, ti->len, data);
}
+EXPORT_SYMBOL_GPL(dm_linear_iterate_devices);
-static long linear_direct_access(struct dm_target *ti, sector_t sector,
+long dm_linear_direct_access(struct dm_target *ti, sector_t sector,
void **kaddr, pfn_t *pfn, long size)
{
struct linear_c *lc = ti->private;
@@ -158,18 +164,19 @@ static long linear_direct_access(struct dm_target *ti, sector_t sector,
return ret;
}
+EXPORT_SYMBOL_GPL(dm_linear_direct_access);
static struct target_type linear_target = {
.name = "linear",
.version = {1, 3, 0},
.module = THIS_MODULE,
- .ctr = linear_ctr,
- .dtr = linear_dtr,
- .map = linear_map,
- .status = linear_status,
- .prepare_ioctl = linear_prepare_ioctl,
- .iterate_devices = linear_iterate_devices,
- .direct_access = linear_direct_access,
+ .ctr = dm_linear_ctr,
+ .dtr = dm_linear_dtr,
+ .map = dm_linear_map,
+ .status = dm_linear_status,
+ .prepare_ioctl = dm_linear_prepare_ioctl,
+ .iterate_devices = dm_linear_iterate_devices,
+ .direct_access = dm_linear_direct_access,
};
int __init dm_linear_init(void)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 5ac239d0f787..0e79a435e8bd 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -11,6 +11,7 @@
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/namei.h>
+#include <linux/mount.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/slab.h>
@@ -510,14 +511,14 @@ static int adjoin(struct dm_table *table, struct dm_target *ti)
* On the other hand, dm-switch needs to process bulk data using messages and
* excessive use of GFP_NOIO could cause trouble.
*/
-static char **realloc_argv(unsigned *array_size, char **old_argv)
+static char **realloc_argv(unsigned *size, char **old_argv)
{
char **argv;
unsigned new_size;
gfp_t gfp;
- if (*array_size) {
- new_size = *array_size * 2;
+ if (*size) {
+ new_size = *size * 2;
gfp = GFP_KERNEL;
} else {
new_size = 8;
@@ -525,8 +526,8 @@ static char **realloc_argv(unsigned *array_size, char **old_argv)
}
argv = kmalloc(new_size * sizeof(*argv), gfp);
if (argv) {
- memcpy(argv, old_argv, *array_size * sizeof(*argv));
- *array_size = new_size;
+ memcpy(argv, old_argv, *size * sizeof(*argv));
+ *size = new_size;
}
kfree(old_argv);
diff --git a/drivers/md/dm-verity-avb.c b/drivers/md/dm-verity-avb.c
new file mode 100644
index 000000000000..89f95e459e5a
--- /dev/null
+++ b/drivers/md/dm-verity-avb.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2017 Google.
+ *
+ * This file is released under the GPLv2.
+ *
+ * Based on drivers/md/dm-verity-chromeos.c
+ */
+
+#include <linux/device-mapper.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+
+#define DM_MSG_PREFIX "verity-avb"
+
+/* Set via module parameters. */
+static char avb_vbmeta_device[64];
+static char avb_invalidate_on_error[4];
+
+static void invalidate_vbmeta_endio(struct bio *bio)
+{
+ if (bio->bi_error)
+ DMERR("invalidate_vbmeta_endio: error %d", bio->bi_error);
+ complete(bio->bi_private);
+}
+
+static int invalidate_vbmeta_submit(struct bio *bio,
+ struct block_device *bdev,
+ int op, int access_last_sector,
+ struct page *page)
+{
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ bio->bi_private = &wait;
+ bio->bi_end_io = invalidate_vbmeta_endio;
+ bio->bi_bdev = bdev;
+ bio_set_op_attrs(bio, op, REQ_SYNC | REQ_NOIDLE);
+
+ bio->bi_iter.bi_sector = 0;
+ if (access_last_sector) {
+ sector_t last_sector;
+
+ last_sector = (i_size_read(bdev->bd_inode)>>SECTOR_SHIFT) - 1;
+ bio->bi_iter.bi_sector = last_sector;
+ }
+ if (!bio_add_page(bio, page, PAGE_SIZE, 0)) {
+ DMERR("invalidate_vbmeta_submit: bio_add_page error");
+ return -EIO;
+ }
+
+ submit_bio(bio);
+ /* Wait up to 2 seconds for completion or fail. */
+ if (!wait_for_completion_timeout(&wait, msecs_to_jiffies(2000)))
+ return -EIO;
+ return 0;
+}
+
+static int invalidate_vbmeta(dev_t vbmeta_devt)
+{
+ int ret = 0;
+ struct block_device *bdev;
+ struct bio *bio;
+ struct page *page;
+ fmode_t dev_mode;
+ /* Ensure we do synchronous unblocked I/O. We may also need
+ * sync_bdev() on completion, but it really shouldn't.
+ */
+ int access_last_sector = 0;
+
+ DMINFO("invalidate_vbmeta: acting on device %d:%d",
+ MAJOR(vbmeta_devt), MINOR(vbmeta_devt));
+
+ /* First we open the device for reading. */
+ dev_mode = FMODE_READ | FMODE_EXCL;
+ bdev = blkdev_get_by_dev(vbmeta_devt, dev_mode,
+ invalidate_vbmeta);
+ if (IS_ERR(bdev)) {
+ DMERR("invalidate_kernel: could not open device for reading");
+ dev_mode = 0;
+ ret = -ENOENT;
+ goto failed_to_read;
+ }
+
+ bio = bio_alloc(GFP_NOIO, 1);
+ if (!bio) {
+ ret = -ENOMEM;
+ goto failed_bio_alloc;
+ }
+
+ page = alloc_page(GFP_NOIO);
+ if (!page) {
+ ret = -ENOMEM;
+ goto failed_to_alloc_page;
+ }
+
+ access_last_sector = 0;
+ ret = invalidate_vbmeta_submit(bio, bdev, REQ_OP_READ,
+ access_last_sector, page);
+ if (ret) {
+ DMERR("invalidate_vbmeta: error reading");
+ goto failed_to_submit_read;
+ }
+
+ /* We have a page. Let's make sure it looks right. */
+ if (memcmp("AVB0", page_address(page), 4) == 0) {
+ /* Stamp it. */
+ memcpy(page_address(page), "AVE0", 4);
+ DMINFO("invalidate_vbmeta: found vbmeta partition");
+ } else {
+ /* Could be this is on a AVB footer, check. Also, since the
+ * AVB footer is in the last 64 bytes, adjust for the fact that
+ * we're dealing with 512-byte sectors.
+ */
+ size_t offset = (1<<SECTOR_SHIFT) - 64;
+
+ access_last_sector = 1;
+ ret = invalidate_vbmeta_submit(bio, bdev, REQ_OP_READ,
+ access_last_sector, page);
+ if (ret) {
+ DMERR("invalidate_vbmeta: error reading");
+ goto failed_to_submit_read;
+ }
+ if (memcmp("AVBf", page_address(page) + offset, 4) != 0) {
+ DMERR("invalidate_vbmeta on non-vbmeta partition");
+ ret = -EINVAL;
+ goto invalid_header;
+ }
+ /* Stamp it. */
+ memcpy(page_address(page) + offset, "AVE0", 4);
+ DMINFO("invalidate_vbmeta: found vbmeta footer partition");
+ }
+
+ /* Now rewrite the changed page - the block dev was being
+ * changed on read. Let's reopen here.
+ */
+ blkdev_put(bdev, dev_mode);
+ dev_mode = FMODE_WRITE | FMODE_EXCL;
+ bdev = blkdev_get_by_dev(vbmeta_devt, dev_mode,
+ invalidate_vbmeta);
+ if (IS_ERR(bdev)) {
+ DMERR("invalidate_vbmeta: could not open device for writing");
+ dev_mode = 0;
+ ret = -ENOENT;
+ goto failed_to_write;
+ }
+
+ /* We re-use the same bio to do the write after the read. Need to reset
+ * it to initialize bio->bi_remaining.
+ */
+ bio_reset(bio);
+
+ ret = invalidate_vbmeta_submit(bio, bdev, REQ_OP_WRITE,
+ access_last_sector, page);
+ if (ret) {
+ DMERR("invalidate_vbmeta: error writing");
+ goto failed_to_submit_write;
+ }
+
+ DMERR("invalidate_vbmeta: completed.");
+ ret = 0;
+failed_to_submit_write:
+failed_to_write:
+invalid_header:
+ __free_page(page);
+failed_to_submit_read:
+ /* Technically, we'll leak a page with the pending bio, but
+ * we're about to reboot anyway.
+ */
+failed_to_alloc_page:
+ bio_put(bio);
+failed_bio_alloc:
+ if (dev_mode)
+ blkdev_put(bdev, dev_mode);
+failed_to_read:
+ return ret;
+}
+
+void dm_verity_avb_error_handler(void)
+{
+ dev_t dev;
+
+ DMINFO("AVB error handler called for %s", avb_vbmeta_device);
+
+ if (strcmp(avb_invalidate_on_error, "yes") != 0) {
+ DMINFO("Not configured to invalidate");
+ return;
+ }
+
+ if (avb_vbmeta_device[0] == '\0') {
+ DMERR("avb_vbmeta_device parameter not set");
+ goto fail_no_dev;
+ }
+
+ dev = name_to_dev_t(avb_vbmeta_device);
+ if (!dev) {
+ DMERR("No matching partition for device: %s",
+ avb_vbmeta_device);
+ goto fail_no_dev;
+ }
+
+ invalidate_vbmeta(dev);
+
+fail_no_dev:
+ ;
+}
+
+static int __init dm_verity_avb_init(void)
+{
+ DMINFO("AVB error handler initialized with vbmeta device: %s",
+ avb_vbmeta_device);
+ return 0;
+}
+
+static void __exit dm_verity_avb_exit(void)
+{
+}
+
+module_init(dm_verity_avb_init);
+module_exit(dm_verity_avb_exit);
+
+MODULE_AUTHOR("David Zeuthen <zeuthen@google.com>");
+MODULE_DESCRIPTION("AVB-specific error handler for dm-verity");
+MODULE_LICENSE("GPL");
+
+/* Declare parameter with no module prefix */
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "androidboot.vbmeta."
+module_param_string(device, avb_vbmeta_device, sizeof(avb_vbmeta_device), 0);
+module_param_string(invalidate_on_error, avb_invalidate_on_error,
+ sizeof(avb_invalidate_on_error), 0);
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 78f36012eaca..3b6231596284 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -11,6 +11,7 @@
#include "dm-verity-fec.h"
#include <linux/math64.h>
+#include <linux/sysfs.h>
#define DM_MSG_PREFIX "verity-fec"
@@ -175,9 +176,11 @@ error:
if (r < 0 && neras)
DMERR_LIMIT("%s: FEC %llu: failed to correct: %d",
v->data_dev->name, (unsigned long long)rsb, r);
- else if (r > 0)
+ else if (r > 0) {
DMWARN_LIMIT("%s: FEC %llu: corrected %d errors",
v->data_dev->name, (unsigned long long)rsb, r);
+ atomic_add_unless(&v->fec->corrected, 1, INT_MAX);
+ }
return r;
}
@@ -556,6 +559,7 @@ unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz,
void verity_fec_dtr(struct dm_verity *v)
{
struct dm_verity_fec *f = v->fec;
+ struct kobject *kobj = &f->kobj_holder.kobj;
if (!verity_fec_is_enabled(v))
goto out;
@@ -572,6 +576,12 @@ void verity_fec_dtr(struct dm_verity *v)
if (f->dev)
dm_put_device(v->ti, f->dev);
+
+ if (kobj->state_initialized) {
+ kobject_put(kobj);
+ wait_for_completion(dm_get_completion_from_kobject(kobj));
+ }
+
out:
kfree(f);
v->fec = NULL;
@@ -660,6 +670,28 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
return 0;
}
+static ssize_t corrected_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct dm_verity_fec *f = container_of(kobj, struct dm_verity_fec,
+ kobj_holder.kobj);
+
+ return sprintf(buf, "%d\n", atomic_read(&f->corrected));
+}
+
+static struct kobj_attribute attr_corrected = __ATTR_RO(corrected);
+
+static struct attribute *fec_attrs[] = {
+ &attr_corrected.attr,
+ NULL
+};
+
+static struct kobj_type fec_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_attrs = fec_attrs,
+ .release = dm_kobject_release
+};
+
/*
* Allocate dm_verity_fec for v->fec. Must be called before verity_fec_ctr.
*/
@@ -683,8 +715,10 @@ int verity_fec_ctr_alloc(struct dm_verity *v)
*/
int verity_fec_ctr(struct dm_verity *v)
{
+ int r;
struct dm_verity_fec *f = v->fec;
struct dm_target *ti = v->ti;
+ struct mapped_device *md = dm_table_get_md(ti->table);
u64 hash_blocks;
if (!verity_fec_is_enabled(v)) {
@@ -692,6 +726,16 @@ int verity_fec_ctr(struct dm_verity *v)
return 0;
}
+ /* Create a kobject and sysfs attributes */
+ init_completion(&f->kobj_holder.completion);
+
+ r = kobject_init_and_add(&f->kobj_holder.kobj, &fec_ktype,
+ &disk_to_dev(dm_disk(md))->kobj, "%s", "fec");
+ if (r) {
+ ti->error = "Cannot create kobject";
+ return r;
+ }
+
/*
* FEC is computed over data blocks, possible metadata, and
* hash blocks. In other words, FEC covers total of fec_blocks
diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h
index bb31ce87a933..4db0cae262eb 100644
--- a/drivers/md/dm-verity-fec.h
+++ b/drivers/md/dm-verity-fec.h
@@ -12,6 +12,8 @@
#ifndef DM_VERITY_FEC_H
#define DM_VERITY_FEC_H
+#include "dm.h"
+#include "dm-core.h"
#include "dm-verity.h"
#include <linux/rslib.h>
@@ -51,6 +53,8 @@ struct dm_verity_fec {
mempool_t *extra_pool; /* mempool for extra buffers */
mempool_t *output_pool; /* mempool for output */
struct kmem_cache *cache; /* cache for buffers */
+ atomic_t corrected; /* corrected errors */
+ struct dm_kobject_holder kobj_holder; /* for sysfs attributes */
};
/* per-bio data */
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 0aba34a7b3b3..1a1fac47e4c7 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -19,6 +19,7 @@
#include <linux/module.h>
#include <linux/reboot.h>
+#include <linux/vmalloc.h>
#define DM_MSG_PREFIX "verity"
@@ -32,6 +33,7 @@
#define DM_VERITY_OPT_LOGGING "ignore_corruption"
#define DM_VERITY_OPT_RESTART "restart_on_corruption"
#define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks"
+#define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once"
#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC)
@@ -233,8 +235,12 @@ out:
if (v->mode == DM_VERITY_MODE_LOGGING)
return 0;
- if (v->mode == DM_VERITY_MODE_RESTART)
+ if (v->mode == DM_VERITY_MODE_RESTART) {
+#ifdef CONFIG_DM_VERITY_AVB
+ dm_verity_avb_error_handler();
+#endif
kernel_restart("dm-verity device corrupted");
+ }
return 1;
}
@@ -395,6 +401,18 @@ static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io,
}
/*
+ * Moves the bio iter one data block forward.
+ */
+static inline void verity_bv_skip_block(struct dm_verity *v,
+ struct dm_verity_io *io,
+ struct bvec_iter *iter)
+{
+ struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
+
+ bio_advance_iter(bio, iter, 1 << v->data_dev_block_bits);
+}
+
+/*
* Verify one "dm_verity_io" structure.
*/
static int verity_verify_io(struct dm_verity_io *io)
@@ -406,9 +424,16 @@ static int verity_verify_io(struct dm_verity_io *io)
for (b = 0; b < io->n_blocks; b++) {
int r;
+ sector_t cur_block = io->block + b;
struct shash_desc *desc = verity_io_hash_desc(v, io);
- r = verity_hash_for_block(v, io, io->block + b,
+ if (v->validated_blocks &&
+ likely(test_bit(cur_block, v->validated_blocks))) {
+ verity_bv_skip_block(v, io, &io->iter);
+ continue;
+ }
+
+ r = verity_hash_for_block(v, io, cur_block,
verity_io_want_digest(v, io),
&is_zero);
if (unlikely(r < 0))
@@ -441,13 +466,16 @@ static int verity_verify_io(struct dm_verity_io *io)
return r;
if (likely(memcmp(verity_io_real_digest(v, io),
- verity_io_want_digest(v, io), v->digest_size) == 0))
+ verity_io_want_digest(v, io), v->digest_size) == 0)) {
+ if (v->validated_blocks)
+ set_bit(cur_block, v->validated_blocks);
continue;
+ }
else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA,
- io->block + b, NULL, &start) == 0)
+ cur_block, NULL, &start) == 0)
continue;
else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
- io->block + b))
+ cur_block))
return -EIO;
}
@@ -551,7 +579,7 @@ static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
* Bio map function. It allocates dm_verity_io structure and bio vector and
* fills them. Then it issues prefetches and the I/O.
*/
-static int verity_map(struct dm_target *ti, struct bio *bio)
+int verity_map(struct dm_target *ti, struct bio *bio)
{
struct dm_verity *v = ti->private;
struct dm_verity_io *io;
@@ -592,11 +620,12 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED;
}
+EXPORT_SYMBOL_GPL(verity_map);
/*
* Status: V (valid) or C (corruption found)
*/
-static void verity_status(struct dm_target *ti, status_type_t type,
+void verity_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
{
struct dm_verity *v = ti->private;
@@ -633,6 +662,8 @@ static void verity_status(struct dm_target *ti, status_type_t type,
args += DM_VERITY_OPTS_FEC;
if (v->zero_digest)
args++;
+ if (v->validated_blocks)
+ args++;
if (!args)
return;
DMEMIT(" %u", args);
@@ -651,12 +682,15 @@ static void verity_status(struct dm_target *ti, status_type_t type,
}
if (v->zero_digest)
DMEMIT(" " DM_VERITY_OPT_IGN_ZEROES);
+ if (v->validated_blocks)
+ DMEMIT(" " DM_VERITY_OPT_AT_MOST_ONCE);
sz = verity_fec_status_table(v, sz, result, maxlen);
break;
}
}
+EXPORT_SYMBOL_GPL(verity_status);
-static int verity_prepare_ioctl(struct dm_target *ti,
+int verity_prepare_ioctl(struct dm_target *ti,
struct block_device **bdev, fmode_t *mode)
{
struct dm_verity *v = ti->private;
@@ -668,16 +702,18 @@ static int verity_prepare_ioctl(struct dm_target *ti,
return 1;
return 0;
}
+EXPORT_SYMBOL_GPL(verity_prepare_ioctl);
-static int verity_iterate_devices(struct dm_target *ti,
+int verity_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
struct dm_verity *v = ti->private;
return fn(ti, v->data_dev, v->data_start, ti->len, data);
}
+EXPORT_SYMBOL_GPL(verity_iterate_devices);
-static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
+void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct dm_verity *v = ti->private;
@@ -689,8 +725,9 @@ static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
blk_limits_io_min(limits, limits->logical_block_size);
}
+EXPORT_SYMBOL_GPL(verity_io_hints);
-static void verity_dtr(struct dm_target *ti)
+void verity_dtr(struct dm_target *ti)
{
struct dm_verity *v = ti->private;
@@ -700,6 +737,7 @@ static void verity_dtr(struct dm_target *ti)
if (v->bufio)
dm_bufio_client_destroy(v->bufio);
+ vfree(v->validated_blocks);
kfree(v->salt);
kfree(v->root_digest);
kfree(v->zero_digest);
@@ -719,6 +757,27 @@ static void verity_dtr(struct dm_target *ti)
kfree(v);
}
+EXPORT_SYMBOL_GPL(verity_dtr);
+
+static int verity_alloc_most_once(struct dm_verity *v)
+{
+ struct dm_target *ti = v->ti;
+
+ /* the bitset can only handle INT_MAX blocks */
+ if (v->data_blocks > INT_MAX) {
+ ti->error = "device too large to use check_at_most_once";
+ return -E2BIG;
+ }
+
+ v->validated_blocks = vzalloc(BITS_TO_LONGS(v->data_blocks) *
+ sizeof(unsigned long));
+ if (!v->validated_blocks) {
+ ti->error = "failed to allocate bitset for check_at_most_once";
+ return -ENOMEM;
+ }
+
+ return 0;
+}
static int verity_alloc_zero_digest(struct dm_verity *v)
{
@@ -789,6 +848,12 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v)
}
continue;
+ } else if (!strcasecmp(arg_name, DM_VERITY_OPT_AT_MOST_ONCE)) {
+ r = verity_alloc_most_once(v);
+ if (r)
+ return r;
+ continue;
+
} else if (verity_is_fec_opt_arg(arg_name)) {
r = verity_fec_parse_opt_args(as, v, &argc, arg_name);
if (r)
@@ -817,7 +882,7 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v)
* <digest>
* <salt> Hex string or "-" if no salt.
*/
-static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
+int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
struct dm_verity *v;
struct dm_arg_set as;
@@ -930,6 +995,15 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
v->tfm = NULL;
goto bad;
}
+
+ /*
+ * dm-verity performance can vary greatly depending on which hash
+ * algorithm implementation is used. Help people debug performance
+ * problems by logging the ->cra_driver_name.
+ */
+ DMINFO("%s using implementation \"%s\"", v->alg_name,
+ crypto_shash_alg(v->tfm)->base.cra_driver_name);
+
v->digest_size = crypto_shash_digestsize(v->tfm);
if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) {
ti->error = "Digest size too big";
@@ -981,6 +1055,14 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
+#ifdef CONFIG_DM_ANDROID_VERITY_AT_MOST_ONCE_DEFAULT_ENABLED
+ if (!v->validated_blocks) {
+ r = verity_alloc_most_once(v);
+ if (r)
+ goto bad;
+ }
+#endif
+
v->hash_per_block_bits =
__fls((1 << v->hash_dev_block_bits) / v->digest_size);
@@ -1053,10 +1135,11 @@ bad:
return r;
}
+EXPORT_SYMBOL_GPL(verity_ctr);
static struct target_type verity_target = {
.name = "verity",
- .version = {1, 3, 0},
+ .version = {1, 4, 0},
.module = THIS_MODULE,
.ctr = verity_ctr,
.dtr = verity_dtr,
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index fb419f422d73..d216fc76d350 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -63,6 +63,7 @@ struct dm_verity {
sector_t hash_level_block[DM_VERITY_MAX_LEVELS];
struct dm_verity_fec *fec; /* forward error correction */
+ unsigned long *validated_blocks; /* bitset blocks validated */
};
struct dm_verity_io {
@@ -126,4 +127,15 @@ extern int verity_hash(struct dm_verity *v, struct shash_desc *desc,
extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
sector_t block, u8 *digest, bool *is_zero);
+extern void verity_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen);
+extern int verity_prepare_ioctl(struct dm_target *ti,
+ struct block_device **bdev, fmode_t *mode);
+extern int verity_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data);
+extern void verity_io_hints(struct dm_target *ti, struct queue_limits *limits);
+extern void verity_dtr(struct dm_target *ti);
+extern int verity_ctr(struct dm_target *ti, unsigned argc, char **argv);
+extern int verity_map(struct dm_target *ti, struct bio *bio);
+extern void dm_verity_avb_error_handler(void);
#endif /* DM_VERITY_H */
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index f0aad08b9654..ed25f30a7550 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -80,8 +80,6 @@ void dm_set_md_type(struct mapped_device *md, unsigned type);
unsigned dm_get_md_type(struct mapped_device *md);
struct target_type *dm_get_immutable_target_type(struct mapped_device *md);
-int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
-
/*
* To check the return value from dm_table_find_target().
*/
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a7a0e3acdb2f..ff538293bc11 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5098,7 +5098,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
return NULL;
}
-static int add_named_array(const char *val, struct kernel_param *kp)
+static int add_named_array(const char *val, const struct kernel_param *kp)
{
/* val must be "md_*" where * is not all digits.
* We allocate an array with a large free minor number, and
@@ -8989,11 +8989,11 @@ static __exit void md_exit(void)
subsys_initcall(md_init);
module_exit(md_exit)
-static int get_ro(char *buffer, struct kernel_param *kp)
+static int get_ro(char *buffer, const struct kernel_param *kp)
{
return sprintf(buffer, "%d", start_readonly);
}
-static int set_ro(const char *val, struct kernel_param *kp)
+static int set_ro(const char *val, const struct kernel_param *kp)
{
return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}
diff --git a/drivers/media/pci/tw686x/tw686x-core.c b/drivers/media/pci/tw686x/tw686x-core.c
index 71a0453b1af1..279d4478c2a1 100644
--- a/drivers/media/pci/tw686x/tw686x-core.c
+++ b/drivers/media/pci/tw686x/tw686x-core.c
@@ -72,12 +72,12 @@ static const char *dma_mode_name(unsigned int mode)
}
}
-static int tw686x_dma_mode_get(char *buffer, struct kernel_param *kp)
+static int tw686x_dma_mode_get(char *buffer, const struct kernel_param *kp)
{
return sprintf(buffer, dma_mode_name(dma_mode));
}
-static int tw686x_dma_mode_set(const char *val, struct kernel_param *kp)
+static int tw686x_dma_mode_set(const char *val, const struct kernel_param *kp)
{
if (!strcasecmp(val, dma_mode_name(TW686X_DMA_MODE_MEMCPY)))
dma_mode = TW686X_DMA_MODE_MEMCPY;
diff --git a/drivers/media/usb/uvc/uvc_driver.c b/drivers/media/usb/uvc/uvc_driver.c
index cde43b63c3da..8412dfc02b58 100644
--- a/drivers/media/usb/uvc/uvc_driver.c
+++ b/drivers/media/usb/uvc/uvc_driver.c
@@ -2184,7 +2184,7 @@ static int uvc_reset_resume(struct usb_interface *intf)
* Module parameters
*/
-static int uvc_clock_param_get(char *buffer, struct kernel_param *kp)
+static int uvc_clock_param_get(char *buffer, const struct kernel_param *kp)
{
if (uvc_clock_param == CLOCK_MONOTONIC)
return sprintf(buffer, "CLOCK_MONOTONIC");
@@ -2192,7 +2192,7 @@ static int uvc_clock_param_get(char *buffer, struct kernel_param *kp)
return sprintf(buffer, "CLOCK_REALTIME");
}
-static int uvc_clock_param_set(const char *val, struct kernel_param *kp)
+static int uvc_clock_param_set(const char *val, const struct kernel_param *kp)
{
if (strncasecmp(val, "clock_", strlen("clock_")) == 0)
val += strlen("clock_");
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 4510e8a37244..610caa10481b 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -2455,11 +2455,8 @@ struct v4l2_ioctl_info {
unsigned int ioctl;
u32 flags;
const char * const name;
- union {
- u32 offset;
- int (*func)(const struct v4l2_ioctl_ops *ops,
- struct file *file, void *fh, void *p);
- } u;
+ int (*func)(const struct v4l2_ioctl_ops *ops, struct file *file,
+ void *fh, void *p);
void (*debug)(const void *arg, bool write_only);
};
@@ -2467,25 +2464,21 @@ struct v4l2_ioctl_info {
#define INFO_FL_PRIO (1 << 0)
/* This control can be valid if the filehandle passes a control handler. */
#define INFO_FL_CTRL (1 << 1)
-/* This is a standard ioctl, no need for special code */
-#define INFO_FL_STD (1 << 2)
/* This is ioctl has its own function */
-#define INFO_FL_FUNC (1 << 3)
+#define INFO_FL_FUNC (1 << 2)
/* Queuing ioctl */
-#define INFO_FL_QUEUE (1 << 4)
+#define INFO_FL_QUEUE (1 << 3)
/* Zero struct from after the field to the end */
#define INFO_FL_CLEAR(v4l2_struct, field) \
((offsetof(struct v4l2_struct, field) + \
sizeof(((struct v4l2_struct *)0)->field)) << 16)
#define INFO_FL_CLEAR_MASK (_IOC_SIZEMASK << 16)
-#define IOCTL_INFO_STD(_ioctl, _vidioc, _debug, _flags) \
- [_IOC_NR(_ioctl)] = { \
- .ioctl = _ioctl, \
- .flags = _flags | INFO_FL_STD, \
- .name = #_ioctl, \
- .u.offset = offsetof(struct v4l2_ioctl_ops, _vidioc), \
- .debug = _debug, \
+#define DEFINE_IOCTL_STD_FNC(_vidioc) \
+ static int __v4l_ ## _vidioc ## _fnc( \
+ const struct v4l2_ioctl_ops *ops, \
+ struct file *file, void *fh, void *p) { \
+ return ops->_vidioc(file, fh, p); \
}
#define IOCTL_INFO_FNC(_ioctl, _func, _debug, _flags) \
@@ -2493,10 +2486,44 @@ struct v4l2_ioctl_info {
.ioctl = _ioctl, \
.flags = _flags | INFO_FL_FUNC, \
.name = #_ioctl, \
- .u.func = _func, \
+ .func = _func, \
.debug = _debug, \
}
+#define IOCTL_INFO_STD(_ioctl, _vidioc, _debug, _flags) \
+ IOCTL_INFO_FNC(_ioctl, __v4l_ ## _vidioc ## _fnc, _debug, _flags)
+
+DEFINE_IOCTL_STD_FNC(vidioc_g_fbuf)
+DEFINE_IOCTL_STD_FNC(vidioc_s_fbuf)
+DEFINE_IOCTL_STD_FNC(vidioc_expbuf)
+DEFINE_IOCTL_STD_FNC(vidioc_g_std)
+DEFINE_IOCTL_STD_FNC(vidioc_g_audio)
+DEFINE_IOCTL_STD_FNC(vidioc_s_audio)
+DEFINE_IOCTL_STD_FNC(vidioc_g_input)
+DEFINE_IOCTL_STD_FNC(vidioc_g_edid)
+DEFINE_IOCTL_STD_FNC(vidioc_s_edid)
+DEFINE_IOCTL_STD_FNC(vidioc_g_output)
+DEFINE_IOCTL_STD_FNC(vidioc_g_audout)
+DEFINE_IOCTL_STD_FNC(vidioc_s_audout)
+DEFINE_IOCTL_STD_FNC(vidioc_g_selection)
+DEFINE_IOCTL_STD_FNC(vidioc_s_selection)
+DEFINE_IOCTL_STD_FNC(vidioc_g_jpegcomp)
+DEFINE_IOCTL_STD_FNC(vidioc_s_jpegcomp)
+DEFINE_IOCTL_STD_FNC(vidioc_enumaudio)
+DEFINE_IOCTL_STD_FNC(vidioc_enumaudout)
+DEFINE_IOCTL_STD_FNC(vidioc_enum_framesizes)
+DEFINE_IOCTL_STD_FNC(vidioc_enum_frameintervals)
+DEFINE_IOCTL_STD_FNC(vidioc_g_enc_index)
+DEFINE_IOCTL_STD_FNC(vidioc_encoder_cmd)
+DEFINE_IOCTL_STD_FNC(vidioc_try_encoder_cmd)
+DEFINE_IOCTL_STD_FNC(vidioc_decoder_cmd)
+DEFINE_IOCTL_STD_FNC(vidioc_try_decoder_cmd)
+DEFINE_IOCTL_STD_FNC(vidioc_s_dv_timings)
+DEFINE_IOCTL_STD_FNC(vidioc_g_dv_timings)
+DEFINE_IOCTL_STD_FNC(vidioc_enum_dv_timings)
+DEFINE_IOCTL_STD_FNC(vidioc_query_dv_timings)
+DEFINE_IOCTL_STD_FNC(vidioc_dv_timings_cap)
+
static struct v4l2_ioctl_info v4l2_ioctls[] = {
IOCTL_INFO_FNC(VIDIOC_QUERYCAP, v4l_querycap, v4l_print_querycap, 0),
IOCTL_INFO_FNC(VIDIOC_ENUM_FMT, v4l_enum_fmt, v4l_print_fmtdesc, INFO_FL_CLEAR(v4l2_fmtdesc, type)),
@@ -2681,14 +2708,8 @@ static long __video_do_ioctl(struct file *file,
}
write_only = _IOC_DIR(cmd) == _IOC_WRITE;
- if (info->flags & INFO_FL_STD) {
- typedef int (*vidioc_op)(struct file *file, void *fh, void *p);
- const void *p = vfd->ioctl_ops;
- const vidioc_op *vidioc = p + info->u.offset;
-
- ret = (*vidioc)(file, fh, arg);
- } else if (info->flags & INFO_FL_FUNC) {
- ret = info->u.func(ops, file, fh, arg);
+ if (info->flags & INFO_FL_FUNC) {
+ ret = info->func(ops, file, fh, arg);
} else if (!ops->vidioc_default) {
ret = -ENOTTY;
} else {
diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c
index 89c7ed16b4df..1cdab6d00b43 100644
--- a/drivers/message/fusion/mptbase.c
+++ b/drivers/message/fusion/mptbase.c
@@ -99,7 +99,7 @@ module_param(mpt_channel_mapping, int, 0);
MODULE_PARM_DESC(mpt_channel_mapping, " Mapping id's to channels (default=0)");
static int mpt_debug_level;
-static int mpt_set_debug_level(const char *val, struct kernel_param *kp);
+static int mpt_set_debug_level(const char *val, const struct kernel_param *kp);
module_param_call(mpt_debug_level, mpt_set_debug_level, param_get_int,
&mpt_debug_level, 0600);
MODULE_PARM_DESC(mpt_debug_level,
@@ -242,7 +242,7 @@ pci_enable_io_access(struct pci_dev *pdev)
pci_write_config_word(pdev, PCI_COMMAND, command_reg);
}
-static int mpt_set_debug_level(const char *val, struct kernel_param *kp)
+static int mpt_set_debug_level(const char *val, const struct kernel_param *kp)
{
int ret = param_set_int(val, kp);
MPT_ADAPTER *ioc;
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 64971baf11fa..9360e6ebb4ea 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -766,6 +766,27 @@ config PANEL_BOOT_MESSAGE
An empty message will only clear the display at driver init time. Any other
printf()-formatted message is valid with newline and escape codes.
+config UID_SYS_STATS
+ bool "Per-UID statistics"
+ depends on PROFILING && TASK_XACCT && TASK_IO_ACCOUNTING
+ help
+ Per UID based cpu time statistics exported to /proc/uid_cputime
+ Per UID based io statistics exported to /proc/uid_io
+ Per UID based procstat control in /proc/uid_procstat
+
+config UID_SYS_STATS_DEBUG
+ bool "Per-TASK statistics"
+ depends on UID_SYS_STATS
+ default n
+ help
+ Per TASK based io statistics exported to /proc/uid_io
+
+config MEMORY_STATE_TIME
+ tristate "Memory freq/bandwidth time statistics"
+ depends on PROFILING
+ help
+ Memory time statistics exported to /sys/kernel/memory_state_time
+
source "drivers/misc/c2port/Kconfig"
source "drivers/misc/eeprom/Kconfig"
source "drivers/misc/cb710/Kconfig"
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 2bf79ba4a39e..564406bf24a0 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -54,6 +54,9 @@ obj-$(CONFIG_VEXPRESS_SYSCFG) += vexpress-syscfg.o
obj-$(CONFIG_CXL_BASE) += cxl/
obj-$(CONFIG_PANEL) += panel.o
+obj-$(CONFIG_UID_SYS_STATS) += uid_sys_stats.o
+obj-$(CONFIG_MEMORY_STATE_TIME) += memory_state_time.o
+
lkdtm-$(CONFIG_LKDTM) += lkdtm_core.o
lkdtm-$(CONFIG_LKDTM) += lkdtm_bugs.o
lkdtm-$(CONFIG_LKDTM) += lkdtm_heap.o
@@ -61,6 +64,7 @@ lkdtm-$(CONFIG_LKDTM) += lkdtm_perms.o
lkdtm-$(CONFIG_LKDTM) += lkdtm_rodata_objcopy.o
lkdtm-$(CONFIG_LKDTM) += lkdtm_usercopy.o
+CFLAGS_lkdtm_rodata.o += $(DISABLE_LTO)
KCOV_INSTRUMENT_lkdtm_rodata.o := n
OBJCOPYFLAGS :=
diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c
index 99635dd9dbac..2290845eaba5 100644
--- a/drivers/misc/kgdbts.c
+++ b/drivers/misc/kgdbts.c
@@ -1130,7 +1130,8 @@ static void kgdbts_put_char(u8 chr)
ts.run_test(0, chr);
}
-static int param_set_kgdbts_var(const char *kmessage, struct kernel_param *kp)
+static int param_set_kgdbts_var(const char *kmessage,
+ const struct kernel_param *kp)
{
int len = strlen(kmessage);
diff --git a/drivers/misc/lkdtm.h b/drivers/misc/lkdtm.h
index fdf954c2107f..996689142e2d 100644
--- a/drivers/misc/lkdtm.h
+++ b/drivers/misc/lkdtm.h
@@ -21,6 +21,9 @@ void lkdtm_SPINLOCKUP(void);
void lkdtm_HUNG_TASK(void);
void lkdtm_ATOMIC_UNDERFLOW(void);
void lkdtm_ATOMIC_OVERFLOW(void);
+void lkdtm_CORRUPT_LIST_ADD(void);
+void lkdtm_CORRUPT_LIST_DEL(void);
+void lkdtm_CORRUPT_USER_DS(void);
/* lkdtm_heap.c */
void lkdtm_OVERWRITE_ALLOCATION(void);
diff --git a/drivers/misc/lkdtm_bugs.c b/drivers/misc/lkdtm_bugs.c
index 182ae1894b32..da1cf47e554a 100644
--- a/drivers/misc/lkdtm_bugs.c
+++ b/drivers/misc/lkdtm_bugs.c
@@ -5,7 +5,13 @@
* test source files.
*/
#include "lkdtm.h"
+#include <linux/list.h>
#include <linux/sched.h>
+#include <linux/uaccess.h>
+
+struct lkdtm_list {
+ struct list_head node;
+};
/*
* Make sure our attempts to over run the kernel stack doesn't trigger
@@ -146,3 +152,75 @@ void lkdtm_ATOMIC_OVERFLOW(void)
pr_info("attempting bad atomic overflow\n");
atomic_inc(&over);
}
+
+void lkdtm_CORRUPT_LIST_ADD(void)
+{
+ /*
+ * Initially, an empty list via LIST_HEAD:
+ * test_head.next = &test_head
+ * test_head.prev = &test_head
+ */
+ LIST_HEAD(test_head);
+ struct lkdtm_list good, bad;
+ void *target[2] = { };
+ void *redirection = &target;
+
+ pr_info("attempting good list addition\n");
+
+ /*
+ * Adding to the list performs these actions:
+ * test_head.next->prev = &good.node
+ * good.node.next = test_head.next
+ * good.node.prev = test_head
+ * test_head.next = good.node
+ */
+ list_add(&good.node, &test_head);
+
+ pr_info("attempting corrupted list addition\n");
+ /*
+ * In simulating this "write what where" primitive, the "what" is
+ * the address of &bad.node, and the "where" is the address held
+ * by "redirection".
+ */
+ test_head.next = redirection;
+ list_add(&bad.node, &test_head);
+
+ if (target[0] == NULL && target[1] == NULL)
+ pr_err("Overwrite did not happen, but no BUG?!\n");
+ else
+ pr_err("list_add() corruption not detected!\n");
+}
+
+void lkdtm_CORRUPT_LIST_DEL(void)
+{
+ LIST_HEAD(test_head);
+ struct lkdtm_list item;
+ void *target[2] = { };
+ void *redirection = &target;
+
+ list_add(&item.node, &test_head);
+
+ pr_info("attempting good list removal\n");
+ list_del(&item.node);
+
+ pr_info("attempting corrupted list removal\n");
+ list_add(&item.node, &test_head);
+
+ /* As with the list_add() test above, this corrupts "next". */
+ item.node.next = redirection;
+ list_del(&item.node);
+
+ if (target[0] == NULL && target[1] == NULL)
+ pr_err("Overwrite did not happen, but no BUG?!\n");
+ else
+ pr_err("list_del() corruption not detected!\n");
+}
+
+void lkdtm_CORRUPT_USER_DS(void)
+{
+ pr_info("setting bad task size limit\n");
+ set_fs(KERNEL_DS);
+
+ /* Make sure we do not keep running with a KERNEL_DS! */
+ force_sig(SIGKILL, current);
+}
diff --git a/drivers/misc/lkdtm_core.c b/drivers/misc/lkdtm_core.c
index b2989f2d3126..b72fb64ae351 100644
--- a/drivers/misc/lkdtm_core.c
+++ b/drivers/misc/lkdtm_core.c
@@ -197,6 +197,9 @@ struct crashtype crashtypes[] = {
CRASHTYPE(EXCEPTION),
CRASHTYPE(LOOP),
CRASHTYPE(OVERFLOW),
+ CRASHTYPE(CORRUPT_LIST_ADD),
+ CRASHTYPE(CORRUPT_LIST_DEL),
+ CRASHTYPE(CORRUPT_USER_DS),
CRASHTYPE(CORRUPT_STACK),
CRASHTYPE(UNALIGNED_LOAD_STORE_WRITE),
CRASHTYPE(OVERWRITE_ALLOCATION),
diff --git a/drivers/misc/memory_state_time.c b/drivers/misc/memory_state_time.c
new file mode 100644
index 000000000000..ba94dcf09169
--- /dev/null
+++ b/drivers/misc/memory_state_time.c
@@ -0,0 +1,462 @@
+/* drivers/misc/memory_state_time.c
+ *
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/hashtable.h>
+#include <linux/kconfig.h>
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/memory-state-time.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/of_platform.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/time.h>
+#include <linux/timekeeping.h>
+#include <linux/workqueue.h>
+
+#define KERNEL_ATTR_RO(_name) \
+static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+#define KERNEL_ATTR_RW(_name) \
+static struct kobj_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+#define FREQ_HASH_BITS 4
+DECLARE_HASHTABLE(freq_hash_table, FREQ_HASH_BITS);
+
+static DEFINE_MUTEX(mem_lock);
+
+#define TAG "memory_state_time"
+#define BW_NODE "/soc/memory-state-time"
+#define FREQ_TBL "freq-tbl"
+#define BW_TBL "bw-buckets"
+#define NUM_SOURCES "num-sources"
+
+#define LOWEST_FREQ 2
+
+static int curr_bw;
+static int curr_freq;
+static u32 *bw_buckets;
+static u32 *freq_buckets;
+static int num_freqs;
+static int num_buckets;
+static int registered_bw_sources;
+static u64 last_update;
+static bool init_success;
+static struct workqueue_struct *memory_wq;
+static u32 num_sources = 10;
+static int *bandwidths;
+
+struct freq_entry {
+ int freq;
+ u64 *buckets; /* Bandwidth buckets. */
+ struct hlist_node hash;
+};
+
+struct queue_container {
+ struct work_struct update_state;
+ int value;
+ u64 time_now;
+ int id;
+ struct mutex *lock;
+};
+
+static int find_bucket(int bw)
+{
+ int i;
+
+ if (bw_buckets != NULL) {
+ for (i = 0; i < num_buckets; i++) {
+ if (bw_buckets[i] > bw) {
+ pr_debug("Found bucket %d for bandwidth %d\n",
+ i, bw);
+ return i;
+ }
+ }
+ return num_buckets - 1;
+ }
+ return 0;
+}
+
+static u64 get_time_diff(u64 time_now)
+{
+ u64 ms;
+
+ ms = time_now - last_update;
+ last_update = time_now;
+ return ms;
+}
+
+static ssize_t show_stat_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int i, j;
+ int len = 0;
+ struct freq_entry *freq_entry;
+
+ for (i = 0; i < num_freqs; i++) {
+ hash_for_each_possible(freq_hash_table, freq_entry, hash,
+ freq_buckets[i]) {
+ if (freq_entry->freq == freq_buckets[i]) {
+ len += scnprintf(buf + len, PAGE_SIZE - len,
+ "%d ", freq_buckets[i]);
+ if (len >= PAGE_SIZE)
+ break;
+ for (j = 0; j < num_buckets; j++) {
+ len += scnprintf(buf + len,
+ PAGE_SIZE - len,
+ "%llu ",
+ freq_entry->buckets[j]);
+ }
+ len += scnprintf(buf + len, PAGE_SIZE - len,
+ "\n");
+ }
+ }
+ }
+ pr_debug("Current Time: %llu\n", ktime_get_boot_ns());
+ return len;
+}
+KERNEL_ATTR_RO(show_stat);
+
+static void update_table(u64 time_now)
+{
+ struct freq_entry *freq_entry;
+
+ pr_debug("Last known bw %d freq %d\n", curr_bw, curr_freq);
+ hash_for_each_possible(freq_hash_table, freq_entry, hash, curr_freq) {
+ if (curr_freq == freq_entry->freq) {
+ freq_entry->buckets[find_bucket(curr_bw)]
+ += get_time_diff(time_now);
+ break;
+ }
+ }
+}
+
+static bool freq_exists(int freq)
+{
+ int i;
+
+ for (i = 0; i < num_freqs; i++) {
+ if (freq == freq_buckets[i])
+ return true;
+ }
+ return false;
+}
+
+static int calculate_total_bw(int bw, int index)
+{
+ int i;
+ int total_bw = 0;
+
+ pr_debug("memory_state_time New bw %d for id %d\n", bw, index);
+ bandwidths[index] = bw;
+ for (i = 0; i < registered_bw_sources; i++)
+ total_bw += bandwidths[i];
+ return total_bw;
+}
+
+static void freq_update_do_work(struct work_struct *work)
+{
+ struct queue_container *freq_state_update
+ = container_of(work, struct queue_container,
+ update_state);
+ if (freq_state_update) {
+ mutex_lock(&mem_lock);
+ update_table(freq_state_update->time_now);
+ curr_freq = freq_state_update->value;
+ mutex_unlock(&mem_lock);
+ kfree(freq_state_update);
+ }
+}
+
+static void bw_update_do_work(struct work_struct *work)
+{
+ struct queue_container *bw_state_update
+ = container_of(work, struct queue_container,
+ update_state);
+ if (bw_state_update) {
+ mutex_lock(&mem_lock);
+ update_table(bw_state_update->time_now);
+ curr_bw = calculate_total_bw(bw_state_update->value,
+ bw_state_update->id);
+ mutex_unlock(&mem_lock);
+ kfree(bw_state_update);
+ }
+}
+
+static void memory_state_freq_update(struct memory_state_update_block *ub,
+ int value)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) {
+ if (freq_exists(value) && init_success) {
+ struct queue_container *freq_container
+ = kmalloc(sizeof(struct queue_container),
+ GFP_KERNEL);
+ if (!freq_container)
+ return;
+ INIT_WORK(&freq_container->update_state,
+ freq_update_do_work);
+ freq_container->time_now = ktime_get_boot_ns();
+ freq_container->value = value;
+ pr_debug("Scheduling freq update in work queue\n");
+ queue_work(memory_wq, &freq_container->update_state);
+ } else {
+ pr_debug("Freq does not exist.\n");
+ }
+ }
+}
+
+static void memory_state_bw_update(struct memory_state_update_block *ub,
+ int value)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) {
+ if (init_success) {
+ struct queue_container *bw_container
+ = kmalloc(sizeof(struct queue_container),
+ GFP_KERNEL);
+ if (!bw_container)
+ return;
+ INIT_WORK(&bw_container->update_state,
+ bw_update_do_work);
+ bw_container->time_now = ktime_get_boot_ns();
+ bw_container->value = value;
+ bw_container->id = ub->id;
+ pr_debug("Scheduling bandwidth update in work queue\n");
+ queue_work(memory_wq, &bw_container->update_state);
+ }
+ }
+}
+
+struct memory_state_update_block *memory_state_register_frequency_source(void)
+{
+ struct memory_state_update_block *block;
+
+ if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) {
+ pr_debug("Allocating frequency source\n");
+ block = kmalloc(sizeof(struct memory_state_update_block),
+ GFP_KERNEL);
+ if (!block)
+ return NULL;
+ block->update_call = memory_state_freq_update;
+ return block;
+ }
+ pr_err("Config option disabled.\n");
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(memory_state_register_frequency_source);
+
+struct memory_state_update_block *memory_state_register_bandwidth_source(void)
+{
+ struct memory_state_update_block *block;
+
+ if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) {
+ pr_debug("Allocating bandwidth source %d\n",
+ registered_bw_sources);
+ block = kmalloc(sizeof(struct memory_state_update_block),
+ GFP_KERNEL);
+ if (!block)
+ return NULL;
+ block->update_call = memory_state_bw_update;
+ if (registered_bw_sources < num_sources) {
+ block->id = registered_bw_sources++;
+ } else {
+ pr_err("Unable to allocate source; max number reached\n");
+ kfree(block);
+ return NULL;
+ }
+ return block;
+ }
+ pr_err("Config option disabled.\n");
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(memory_state_register_bandwidth_source);
+
+/* Buckets are designated by their maximum.
+ * Returns the buckets decided by the capability of the device.
+ */
+static int get_bw_buckets(struct device *dev)
+{
+ int ret, lenb;
+ struct device_node *node = dev->of_node;
+
+ of_property_read_u32(node, NUM_SOURCES, &num_sources);
+ if (!of_find_property(node, BW_TBL, &lenb)) {
+ pr_err("Missing %s property\n", BW_TBL);
+ return -ENODATA;
+ }
+
+ bandwidths = devm_kzalloc(dev,
+ sizeof(*bandwidths) * num_sources, GFP_KERNEL);
+ if (!bandwidths)
+ return -ENOMEM;
+ lenb /= sizeof(*bw_buckets);
+ bw_buckets = devm_kzalloc(dev, lenb * sizeof(*bw_buckets),
+ GFP_KERNEL);
+ if (!bw_buckets) {
+ devm_kfree(dev, bandwidths);
+ return -ENOMEM;
+ }
+ ret = of_property_read_u32_array(node, BW_TBL, bw_buckets,
+ lenb);
+ if (ret < 0) {
+ devm_kfree(dev, bandwidths);
+ devm_kfree(dev, bw_buckets);
+ pr_err("Unable to read bandwidth table from device tree.\n");
+ return ret;
+ }
+
+ curr_bw = 0;
+ num_buckets = lenb;
+ return 0;
+}
+
+/* Adds struct freq_entry nodes to the hashtable for each compatible frequency.
+ * Returns the supported number of frequencies.
+ */
+static int freq_buckets_init(struct device *dev)
+{
+ struct freq_entry *freq_entry;
+ int i;
+ int ret, lenf;
+ struct device_node *node = dev->of_node;
+
+ if (!of_find_property(node, FREQ_TBL, &lenf)) {
+ pr_err("Missing %s property\n", FREQ_TBL);
+ return -ENODATA;
+ }
+
+ lenf /= sizeof(*freq_buckets);
+ freq_buckets = devm_kzalloc(dev, lenf * sizeof(*freq_buckets),
+ GFP_KERNEL);
+ if (!freq_buckets)
+ return -ENOMEM;
+ pr_debug("freqs found len %d\n", lenf);
+ ret = of_property_read_u32_array(node, FREQ_TBL, freq_buckets,
+ lenf);
+ if (ret < 0) {
+ devm_kfree(dev, freq_buckets);
+ pr_err("Unable to read frequency table from device tree.\n");
+ return ret;
+ }
+ pr_debug("ret freq %d\n", ret);
+
+ num_freqs = lenf;
+ curr_freq = freq_buckets[LOWEST_FREQ];
+
+ for (i = 0; i < num_freqs; i++) {
+ freq_entry = devm_kzalloc(dev, sizeof(struct freq_entry),
+ GFP_KERNEL);
+ if (!freq_entry)
+ return -ENOMEM;
+ freq_entry->buckets = devm_kzalloc(dev, sizeof(u64)*num_buckets,
+ GFP_KERNEL);
+ if (!freq_entry->buckets) {
+ devm_kfree(dev, freq_entry);
+ return -ENOMEM;
+ }
+ pr_debug("memory_state_time Adding freq to ht %d\n",
+ freq_buckets[i]);
+ freq_entry->freq = freq_buckets[i];
+ hash_add(freq_hash_table, &freq_entry->hash, freq_buckets[i]);
+ }
+ return 0;
+}
+
+struct kobject *memory_kobj;
+EXPORT_SYMBOL_GPL(memory_kobj);
+
+static struct attribute *memory_attrs[] = {
+ &show_stat_attr.attr,
+ NULL
+};
+
+static struct attribute_group memory_attr_group = {
+ .attrs = memory_attrs,
+};
+
+static int memory_state_time_probe(struct platform_device *pdev)
+{
+ int error;
+
+ error = get_bw_buckets(&pdev->dev);
+ if (error)
+ return error;
+ error = freq_buckets_init(&pdev->dev);
+ if (error)
+ return error;
+ last_update = ktime_get_boot_ns();
+ init_success = true;
+
+ pr_debug("memory_state_time initialized with num_freqs %d\n",
+ num_freqs);
+ return 0;
+}
+
+static const struct of_device_id match_table[] = {
+ { .compatible = "memory-state-time" },
+ {}
+};
+
+static struct platform_driver memory_state_time_driver = {
+ .probe = memory_state_time_probe,
+ .driver = {
+ .name = "memory-state-time",
+ .of_match_table = match_table,
+ .owner = THIS_MODULE,
+ },
+};
+
+static int __init memory_state_time_init(void)
+{
+ int error;
+
+ hash_init(freq_hash_table);
+ memory_wq = create_singlethread_workqueue("memory_wq");
+ if (!memory_wq) {
+ pr_err("Unable to create workqueue.\n");
+ return -EINVAL;
+ }
+ /*
+ * Create sys/kernel directory for memory_state_time.
+ */
+ memory_kobj = kobject_create_and_add(TAG, kernel_kobj);
+ if (!memory_kobj) {
+ pr_err("Unable to allocate memory_kobj for sysfs directory.\n");
+ error = -ENOMEM;
+ goto wq;
+ }
+ error = sysfs_create_group(memory_kobj, &memory_attr_group);
+ if (error) {
+ pr_err("Unable to create sysfs folder.\n");
+ goto kobj;
+ }
+
+ error = platform_driver_register(&memory_state_time_driver);
+ if (error) {
+ pr_err("Unable to register memory_state_time platform driver.\n");
+ goto group;
+ }
+ return 0;
+
+group: sysfs_remove_group(memory_kobj, &memory_attr_group);
+kobj: kobject_put(memory_kobj);
+wq: destroy_workqueue(memory_wq);
+ return error;
+}
+module_init(memory_state_time_init);
diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c
new file mode 100644
index 000000000000..345f2292fa09
--- /dev/null
+++ b/drivers/misc/uid_sys_stats.c
@@ -0,0 +1,706 @@
+/* drivers/misc/uid_sys_stats.c
+ *
+ * Copyright (C) 2014 - 2015 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/atomic.h>
+#include <linux/cpufreq_times.h>
+#include <linux/err.h>
+#include <linux/hashtable.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/profile.h>
+#include <linux/rtmutex.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+
+#define UID_HASH_BITS 10
+DECLARE_HASHTABLE(hash_table, UID_HASH_BITS);
+
+static DEFINE_RT_MUTEX(uid_lock);
+static struct proc_dir_entry *cpu_parent;
+static struct proc_dir_entry *io_parent;
+static struct proc_dir_entry *proc_parent;
+
+struct io_stats {
+ u64 read_bytes;
+ u64 write_bytes;
+ u64 rchar;
+ u64 wchar;
+ u64 fsync;
+};
+
+#define UID_STATE_FOREGROUND 0
+#define UID_STATE_BACKGROUND 1
+#define UID_STATE_BUCKET_SIZE 2
+
+#define UID_STATE_TOTAL_CURR 2
+#define UID_STATE_TOTAL_LAST 3
+#define UID_STATE_DEAD_TASKS 4
+#define UID_STATE_SIZE 5
+
+#define MAX_TASK_COMM_LEN 256
+
+struct task_entry {
+ char comm[MAX_TASK_COMM_LEN];
+ pid_t pid;
+ struct io_stats io[UID_STATE_SIZE];
+ struct hlist_node hash;
+};
+
+struct uid_entry {
+ uid_t uid;
+ cputime_t utime;
+ cputime_t stime;
+ cputime_t active_utime;
+ cputime_t active_stime;
+ int state;
+ struct io_stats io[UID_STATE_SIZE];
+ struct hlist_node hash;
+#ifdef CONFIG_UID_SYS_STATS_DEBUG
+ DECLARE_HASHTABLE(task_entries, UID_HASH_BITS);
+#endif
+};
+
+static u64 compute_write_bytes(struct task_struct *task)
+{
+ if (task->ioac.write_bytes <= task->ioac.cancelled_write_bytes)
+ return 0;
+
+ return task->ioac.write_bytes - task->ioac.cancelled_write_bytes;
+}
+
+static void compute_io_bucket_stats(struct io_stats *io_bucket,
+ struct io_stats *io_curr,
+ struct io_stats *io_last,
+ struct io_stats *io_dead)
+{
+ /* tasks could switch to another uid group, but its io_last in the
+ * previous uid group could still be positive.
+ * therefore before each update, do an overflow check first
+ */
+ int64_t delta;
+
+ delta = io_curr->read_bytes + io_dead->read_bytes -
+ io_last->read_bytes;
+ io_bucket->read_bytes += delta > 0 ? delta : 0;
+ delta = io_curr->write_bytes + io_dead->write_bytes -
+ io_last->write_bytes;
+ io_bucket->write_bytes += delta > 0 ? delta : 0;
+ delta = io_curr->rchar + io_dead->rchar - io_last->rchar;
+ io_bucket->rchar += delta > 0 ? delta : 0;
+ delta = io_curr->wchar + io_dead->wchar - io_last->wchar;
+ io_bucket->wchar += delta > 0 ? delta : 0;
+ delta = io_curr->fsync + io_dead->fsync - io_last->fsync;
+ io_bucket->fsync += delta > 0 ? delta : 0;
+
+ io_last->read_bytes = io_curr->read_bytes;
+ io_last->write_bytes = io_curr->write_bytes;
+ io_last->rchar = io_curr->rchar;
+ io_last->wchar = io_curr->wchar;
+ io_last->fsync = io_curr->fsync;
+
+ memset(io_dead, 0, sizeof(struct io_stats));
+}
+
+#ifdef CONFIG_UID_SYS_STATS_DEBUG
+static void get_full_task_comm(struct task_entry *task_entry,
+ struct task_struct *task)
+{
+ int i = 0, offset = 0, len = 0;
+ /* save one byte for terminating null character */
+ int unused_len = MAX_TASK_COMM_LEN - TASK_COMM_LEN - 1;
+ char buf[unused_len];
+ struct mm_struct *mm = task->mm;
+
+ /* fill the first TASK_COMM_LEN bytes with thread name */
+ __get_task_comm(task_entry->comm, TASK_COMM_LEN, task);
+ i = strlen(task_entry->comm);
+ while (i < TASK_COMM_LEN)
+ task_entry->comm[i++] = ' ';
+
+ /* next the executable file name */
+ if (mm) {
+ down_read(&mm->mmap_sem);
+ if (mm->exe_file) {
+ char *pathname = d_path(&mm->exe_file->f_path, buf,
+ unused_len);
+
+ if (!IS_ERR(pathname)) {
+ len = strlcpy(task_entry->comm + i, pathname,
+ unused_len);
+ i += len;
+ task_entry->comm[i++] = ' ';
+ unused_len--;
+ }
+ }
+ up_read(&mm->mmap_sem);
+ }
+ unused_len -= len;
+
+ /* fill the rest with command line argument
+ * replace each null or new line character
+ * between args in argv with whitespace */
+ len = get_cmdline(task, buf, unused_len);
+ while (offset < len) {
+ if (buf[offset] != '\0' && buf[offset] != '\n')
+ task_entry->comm[i++] = buf[offset];
+ else
+ task_entry->comm[i++] = ' ';
+ offset++;
+ }
+
+ /* get rid of trailing whitespaces in case when arg is memset to
+ * zero before being reset in userspace
+ */
+ while (task_entry->comm[i-1] == ' ')
+ i--;
+ task_entry->comm[i] = '\0';
+}
+
+static struct task_entry *find_task_entry(struct uid_entry *uid_entry,
+ struct task_struct *task)
+{
+ struct task_entry *task_entry;
+
+ hash_for_each_possible(uid_entry->task_entries, task_entry, hash,
+ task->pid) {
+ if (task->pid == task_entry->pid) {
+ /* if thread name changed, update the entire command */
+ int len = strnchr(task_entry->comm, ' ', TASK_COMM_LEN)
+ - task_entry->comm;
+
+ if (strncmp(task_entry->comm, task->comm, len))
+ get_full_task_comm(task_entry, task);
+ return task_entry;
+ }
+ }
+ return NULL;
+}
+
+static struct task_entry *find_or_register_task(struct uid_entry *uid_entry,
+ struct task_struct *task)
+{
+ struct task_entry *task_entry;
+ pid_t pid = task->pid;
+
+ task_entry = find_task_entry(uid_entry, task);
+ if (task_entry)
+ return task_entry;
+
+ task_entry = kzalloc(sizeof(struct task_entry), GFP_ATOMIC);
+ if (!task_entry)
+ return NULL;
+
+ get_full_task_comm(task_entry, task);
+
+ task_entry->pid = pid;
+ hash_add(uid_entry->task_entries, &task_entry->hash, (unsigned int)pid);
+
+ return task_entry;
+}
+
+static void remove_uid_tasks(struct uid_entry *uid_entry)
+{
+ struct task_entry *task_entry;
+ unsigned long bkt_task;
+ struct hlist_node *tmp_task;
+
+ hash_for_each_safe(uid_entry->task_entries, bkt_task,
+ tmp_task, task_entry, hash) {
+ hash_del(&task_entry->hash);
+ kfree(task_entry);
+ }
+}
+
+static void set_io_uid_tasks_zero(struct uid_entry *uid_entry)
+{
+ struct task_entry *task_entry;
+ unsigned long bkt_task;
+
+ hash_for_each(uid_entry->task_entries, bkt_task, task_entry, hash) {
+ memset(&task_entry->io[UID_STATE_TOTAL_CURR], 0,
+ sizeof(struct io_stats));
+ }
+}
+
+static void add_uid_tasks_io_stats(struct uid_entry *uid_entry,
+ struct task_struct *task, int slot)
+{
+ struct task_entry *task_entry = find_or_register_task(uid_entry, task);
+ struct io_stats *task_io_slot = &task_entry->io[slot];
+
+ task_io_slot->read_bytes += task->ioac.read_bytes;
+ task_io_slot->write_bytes += compute_write_bytes(task);
+ task_io_slot->rchar += task->ioac.rchar;
+ task_io_slot->wchar += task->ioac.wchar;
+ task_io_slot->fsync += task->ioac.syscfs;
+}
+
+static void compute_io_uid_tasks(struct uid_entry *uid_entry)
+{
+ struct task_entry *task_entry;
+ unsigned long bkt_task;
+
+ hash_for_each(uid_entry->task_entries, bkt_task, task_entry, hash) {
+ compute_io_bucket_stats(&task_entry->io[uid_entry->state],
+ &task_entry->io[UID_STATE_TOTAL_CURR],
+ &task_entry->io[UID_STATE_TOTAL_LAST],
+ &task_entry->io[UID_STATE_DEAD_TASKS]);
+ }
+}
+
+static void show_io_uid_tasks(struct seq_file *m, struct uid_entry *uid_entry)
+{
+ struct task_entry *task_entry;
+ unsigned long bkt_task;
+
+ hash_for_each(uid_entry->task_entries, bkt_task, task_entry, hash) {
+ /* Separated by comma because space exists in task comm */
+ seq_printf(m, "task,%s,%lu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu\n",
+ task_entry->comm,
+ (unsigned long)task_entry->pid,
+ task_entry->io[UID_STATE_FOREGROUND].rchar,
+ task_entry->io[UID_STATE_FOREGROUND].wchar,
+ task_entry->io[UID_STATE_FOREGROUND].read_bytes,
+ task_entry->io[UID_STATE_FOREGROUND].write_bytes,
+ task_entry->io[UID_STATE_BACKGROUND].rchar,
+ task_entry->io[UID_STATE_BACKGROUND].wchar,
+ task_entry->io[UID_STATE_BACKGROUND].read_bytes,
+ task_entry->io[UID_STATE_BACKGROUND].write_bytes,
+ task_entry->io[UID_STATE_FOREGROUND].fsync,
+ task_entry->io[UID_STATE_BACKGROUND].fsync);
+ }
+}
+#else
+static void remove_uid_tasks(struct uid_entry *uid_entry) {};
+static void set_io_uid_tasks_zero(struct uid_entry *uid_entry) {};
+static void add_uid_tasks_io_stats(struct uid_entry *uid_entry,
+ struct task_struct *task, int slot) {};
+static void compute_io_uid_tasks(struct uid_entry *uid_entry) {};
+static void show_io_uid_tasks(struct seq_file *m,
+ struct uid_entry *uid_entry) {}
+#endif
+
+static struct uid_entry *find_uid_entry(uid_t uid)
+{
+ struct uid_entry *uid_entry;
+ hash_for_each_possible(hash_table, uid_entry, hash, uid) {
+ if (uid_entry->uid == uid)
+ return uid_entry;
+ }
+ return NULL;
+}
+
+static struct uid_entry *find_or_register_uid(uid_t uid)
+{
+ struct uid_entry *uid_entry;
+
+ uid_entry = find_uid_entry(uid);
+ if (uid_entry)
+ return uid_entry;
+
+ uid_entry = kzalloc(sizeof(struct uid_entry), GFP_ATOMIC);
+ if (!uid_entry)
+ return NULL;
+
+ uid_entry->uid = uid;
+#ifdef CONFIG_UID_SYS_STATS_DEBUG
+ hash_init(uid_entry->task_entries);
+#endif
+ hash_add(hash_table, &uid_entry->hash, uid);
+
+ return uid_entry;
+}
+
+static int uid_cputime_show(struct seq_file *m, void *v)
+{
+ struct uid_entry *uid_entry = NULL;
+ struct task_struct *task, *temp;
+ struct user_namespace *user_ns = current_user_ns();
+ cputime_t utime;
+ cputime_t stime;
+ unsigned long bkt;
+ uid_t uid;
+
+ rt_mutex_lock(&uid_lock);
+
+ hash_for_each(hash_table, bkt, uid_entry, hash) {
+ uid_entry->active_stime = 0;
+ uid_entry->active_utime = 0;
+ }
+
+ rcu_read_lock();
+ do_each_thread(temp, task) {
+ uid = from_kuid_munged(user_ns, task_uid(task));
+ if (!uid_entry || uid_entry->uid != uid)
+ uid_entry = find_or_register_uid(uid);
+ if (!uid_entry) {
+ rcu_read_unlock();
+ rt_mutex_unlock(&uid_lock);
+ pr_err("%s: failed to find the uid_entry for uid %d\n",
+ __func__, uid);
+ return -ENOMEM;
+ }
+ task_cputime_adjusted(task, &utime, &stime);
+ uid_entry->active_utime += utime;
+ uid_entry->active_stime += stime;
+ } while_each_thread(temp, task);
+ rcu_read_unlock();
+
+ hash_for_each(hash_table, bkt, uid_entry, hash) {
+ cputime_t total_utime = uid_entry->utime +
+ uid_entry->active_utime;
+ cputime_t total_stime = uid_entry->stime +
+ uid_entry->active_stime;
+ seq_printf(m, "%d: %llu %llu\n", uid_entry->uid,
+ (unsigned long long)jiffies_to_msecs(
+ cputime_to_jiffies(total_utime)) * USEC_PER_MSEC,
+ (unsigned long long)jiffies_to_msecs(
+ cputime_to_jiffies(total_stime)) * USEC_PER_MSEC);
+ }
+
+ rt_mutex_unlock(&uid_lock);
+ return 0;
+}
+
+static int uid_cputime_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, uid_cputime_show, PDE_DATA(inode));
+}
+
+static const struct file_operations uid_cputime_fops = {
+ .open = uid_cputime_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int uid_remove_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, NULL, NULL);
+}
+
+static ssize_t uid_remove_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos)
+{
+ struct uid_entry *uid_entry;
+ struct hlist_node *tmp;
+ char uids[128];
+ char *start_uid, *end_uid = NULL;
+ long int uid_start = 0, uid_end = 0;
+
+ if (count >= sizeof(uids))
+ count = sizeof(uids) - 1;
+
+ if (copy_from_user(uids, buffer, count))
+ return -EFAULT;
+
+ uids[count] = '\0';
+ end_uid = uids;
+ start_uid = strsep(&end_uid, "-");
+
+ if (!start_uid || !end_uid)
+ return -EINVAL;
+
+ if (kstrtol(start_uid, 10, &uid_start) != 0 ||
+ kstrtol(end_uid, 10, &uid_end) != 0) {
+ return -EINVAL;
+ }
+
+ /* Also remove uids from /proc/uid_time_in_state */
+ cpufreq_task_times_remove_uids(uid_start, uid_end);
+
+ rt_mutex_lock(&uid_lock);
+
+ for (; uid_start <= uid_end; uid_start++) {
+ hash_for_each_possible_safe(hash_table, uid_entry, tmp,
+ hash, (uid_t)uid_start) {
+ if (uid_start == uid_entry->uid) {
+ remove_uid_tasks(uid_entry);
+ hash_del(&uid_entry->hash);
+ kfree(uid_entry);
+ }
+ }
+ }
+
+ rt_mutex_unlock(&uid_lock);
+ return count;
+}
+
+static const struct file_operations uid_remove_fops = {
+ .open = uid_remove_open,
+ .release = single_release,
+ .write = uid_remove_write,
+};
+
+
+static void add_uid_io_stats(struct uid_entry *uid_entry,
+ struct task_struct *task, int slot)
+{
+ struct io_stats *io_slot = &uid_entry->io[slot];
+
+ io_slot->read_bytes += task->ioac.read_bytes;
+ io_slot->write_bytes += compute_write_bytes(task);
+ io_slot->rchar += task->ioac.rchar;
+ io_slot->wchar += task->ioac.wchar;
+ io_slot->fsync += task->ioac.syscfs;
+
+ add_uid_tasks_io_stats(uid_entry, task, slot);
+}
+
+static void update_io_stats_all_locked(void)
+{
+ struct uid_entry *uid_entry = NULL;
+ struct task_struct *task, *temp;
+ struct user_namespace *user_ns = current_user_ns();
+ unsigned long bkt;
+ uid_t uid;
+
+ hash_for_each(hash_table, bkt, uid_entry, hash) {
+ memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0,
+ sizeof(struct io_stats));
+ set_io_uid_tasks_zero(uid_entry);
+ }
+
+ rcu_read_lock();
+ do_each_thread(temp, task) {
+ uid = from_kuid_munged(user_ns, task_uid(task));
+ if (!uid_entry || uid_entry->uid != uid)
+ uid_entry = find_or_register_uid(uid);
+ if (!uid_entry)
+ continue;
+ add_uid_io_stats(uid_entry, task, UID_STATE_TOTAL_CURR);
+ } while_each_thread(temp, task);
+ rcu_read_unlock();
+
+ hash_for_each(hash_table, bkt, uid_entry, hash) {
+ compute_io_bucket_stats(&uid_entry->io[uid_entry->state],
+ &uid_entry->io[UID_STATE_TOTAL_CURR],
+ &uid_entry->io[UID_STATE_TOTAL_LAST],
+ &uid_entry->io[UID_STATE_DEAD_TASKS]);
+ compute_io_uid_tasks(uid_entry);
+ }
+}
+
+static void update_io_stats_uid_locked(struct uid_entry *uid_entry)
+{
+ struct task_struct *task, *temp;
+ struct user_namespace *user_ns = current_user_ns();
+
+ memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0,
+ sizeof(struct io_stats));
+ set_io_uid_tasks_zero(uid_entry);
+
+ rcu_read_lock();
+ do_each_thread(temp, task) {
+ if (from_kuid_munged(user_ns, task_uid(task)) != uid_entry->uid)
+ continue;
+ add_uid_io_stats(uid_entry, task, UID_STATE_TOTAL_CURR);
+ } while_each_thread(temp, task);
+ rcu_read_unlock();
+
+ compute_io_bucket_stats(&uid_entry->io[uid_entry->state],
+ &uid_entry->io[UID_STATE_TOTAL_CURR],
+ &uid_entry->io[UID_STATE_TOTAL_LAST],
+ &uid_entry->io[UID_STATE_DEAD_TASKS]);
+ compute_io_uid_tasks(uid_entry);
+}
+
+
+static int uid_io_show(struct seq_file *m, void *v)
+{
+ struct uid_entry *uid_entry;
+ unsigned long bkt;
+
+ rt_mutex_lock(&uid_lock);
+
+ update_io_stats_all_locked();
+
+ hash_for_each(hash_table, bkt, uid_entry, hash) {
+ seq_printf(m, "%d %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+ uid_entry->uid,
+ uid_entry->io[UID_STATE_FOREGROUND].rchar,
+ uid_entry->io[UID_STATE_FOREGROUND].wchar,
+ uid_entry->io[UID_STATE_FOREGROUND].read_bytes,
+ uid_entry->io[UID_STATE_FOREGROUND].write_bytes,
+ uid_entry->io[UID_STATE_BACKGROUND].rchar,
+ uid_entry->io[UID_STATE_BACKGROUND].wchar,
+ uid_entry->io[UID_STATE_BACKGROUND].read_bytes,
+ uid_entry->io[UID_STATE_BACKGROUND].write_bytes,
+ uid_entry->io[UID_STATE_FOREGROUND].fsync,
+ uid_entry->io[UID_STATE_BACKGROUND].fsync);
+
+ show_io_uid_tasks(m, uid_entry);
+ }
+
+ rt_mutex_unlock(&uid_lock);
+ return 0;
+}
+
+static int uid_io_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, uid_io_show, PDE_DATA(inode));
+}
+
+static const struct file_operations uid_io_fops = {
+ .open = uid_io_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int uid_procstat_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, NULL, NULL);
+}
+
+static ssize_t uid_procstat_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos)
+{
+ struct uid_entry *uid_entry;
+ uid_t uid;
+ int argc, state;
+ char input[128];
+
+ if (count >= sizeof(input))
+ return -EINVAL;
+
+ if (copy_from_user(input, buffer, count))
+ return -EFAULT;
+
+ input[count] = '\0';
+
+ argc = sscanf(input, "%u %d", &uid, &state);
+ if (argc != 2)
+ return -EINVAL;
+
+ if (state != UID_STATE_BACKGROUND && state != UID_STATE_FOREGROUND)
+ return -EINVAL;
+
+ rt_mutex_lock(&uid_lock);
+
+ uid_entry = find_or_register_uid(uid);
+ if (!uid_entry) {
+ rt_mutex_unlock(&uid_lock);
+ return -EINVAL;
+ }
+
+ if (uid_entry->state == state) {
+ rt_mutex_unlock(&uid_lock);
+ return count;
+ }
+
+ update_io_stats_uid_locked(uid_entry);
+
+ uid_entry->state = state;
+
+ rt_mutex_unlock(&uid_lock);
+
+ return count;
+}
+
+static const struct file_operations uid_procstat_fops = {
+ .open = uid_procstat_open,
+ .release = single_release,
+ .write = uid_procstat_write,
+};
+
+static int process_notifier(struct notifier_block *self,
+ unsigned long cmd, void *v)
+{
+ struct task_struct *task = v;
+ struct uid_entry *uid_entry;
+ cputime_t utime, stime;
+ uid_t uid;
+
+ if (!task)
+ return NOTIFY_OK;
+
+ rt_mutex_lock(&uid_lock);
+ uid = from_kuid_munged(current_user_ns(), task_uid(task));
+ uid_entry = find_or_register_uid(uid);
+ if (!uid_entry) {
+ pr_err("%s: failed to find uid %d\n", __func__, uid);
+ goto exit;
+ }
+
+ task_cputime_adjusted(task, &utime, &stime);
+ uid_entry->utime += utime;
+ uid_entry->stime += stime;
+
+ add_uid_io_stats(uid_entry, task, UID_STATE_DEAD_TASKS);
+
+exit:
+ rt_mutex_unlock(&uid_lock);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block process_notifier_block = {
+ .notifier_call = process_notifier,
+};
+
+static int __init proc_uid_sys_stats_init(void)
+{
+ hash_init(hash_table);
+
+ cpu_parent = proc_mkdir("uid_cputime", NULL);
+ if (!cpu_parent) {
+ pr_err("%s: failed to create uid_cputime proc entry\n",
+ __func__);
+ goto err;
+ }
+
+ proc_create_data("remove_uid_range", 0222, cpu_parent,
+ &uid_remove_fops, NULL);
+ proc_create_data("show_uid_stat", 0444, cpu_parent,
+ &uid_cputime_fops, NULL);
+
+ io_parent = proc_mkdir("uid_io", NULL);
+ if (!io_parent) {
+ pr_err("%s: failed to create uid_io proc entry\n",
+ __func__);
+ goto err;
+ }
+
+ proc_create_data("stats", 0444, io_parent,
+ &uid_io_fops, NULL);
+
+ proc_parent = proc_mkdir("uid_procstat", NULL);
+ if (!proc_parent) {
+ pr_err("%s: failed to create uid_procstat proc entry\n",
+ __func__);
+ goto err;
+ }
+
+ proc_create_data("set", 0222, proc_parent,
+ &uid_procstat_fops, NULL);
+
+ profile_event_register(PROFILE_TASK_EXIT, &process_notifier_block);
+
+ return 0;
+
+err:
+ remove_proc_subtree("uid_cputime", NULL);
+ remove_proc_subtree("uid_io", NULL);
+ remove_proc_subtree("uid_procstat", NULL);
+ return -ENOMEM;
+}
+
+early_initcall(proc_uid_sys_stats_init);
diff --git a/drivers/mmc/card/Kconfig b/drivers/mmc/card/Kconfig
index 5562308699bc..6142ec1b9dfb 100644
--- a/drivers/mmc/card/Kconfig
+++ b/drivers/mmc/card/Kconfig
@@ -68,3 +68,15 @@ config MMC_TEST
This driver is only of interest to those developing or
testing a host driver. Most people should say N here.
+
+config MMC_SIMULATE_MAX_SPEED
+ bool "Turn on maximum speed control per block device"
+ depends on MMC_BLOCK
+ help
+ Say Y here to enable MMC device speed limiting. Used to test and
+ simulate the behavior of the system when confronted with a slow MMC.
+
+ Enables max_read_speed, max_write_speed and cache_size attributes to
+ control the write or read maximum KB/second speed behaviors.
+
+ If unsure, say N here.
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index 709a872ed484..817fcf8c0ac6 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -287,6 +287,250 @@ out:
return ret;
}
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+
+static int max_read_speed, max_write_speed, cache_size = 4;
+
+module_param(max_read_speed, int, S_IRUSR | S_IRGRP);
+MODULE_PARM_DESC(max_read_speed, "maximum KB/s read speed 0=off");
+module_param(max_write_speed, int, S_IRUSR | S_IRGRP);
+MODULE_PARM_DESC(max_write_speed, "maximum KB/s write speed 0=off");
+module_param(cache_size, int, S_IRUSR | S_IRGRP);
+MODULE_PARM_DESC(cache_size, "MB high speed memory or SLC cache");
+
+/*
+ * helper macros and expectations:
+ * size - unsigned long number of bytes
+ * jiffies - unsigned long HZ timestamp difference
+ * speed - unsigned KB/s transfer rate
+ */
+#define size_and_speed_to_jiffies(size, speed) \
+ ((size) * HZ / (speed) / 1024UL)
+#define jiffies_and_speed_to_size(jiffies, speed) \
+ (((speed) * (jiffies) * 1024UL) / HZ)
+#define jiffies_and_size_to_speed(jiffies, size) \
+ ((size) * HZ / (jiffies) / 1024UL)
+
+/* Limits to report warning */
+/* jiffies_and_size_to_speed(10*HZ, queue_max_hw_sectors(q) * 512UL) ~ 25 */
+#define MIN_SPEED(q) 250 /* 10 times faster than a floppy disk */
+#define MAX_SPEED(q) jiffies_and_size_to_speed(1, queue_max_sectors(q) * 512UL)
+
+#define speed_valid(speed) ((speed) > 0)
+
+static const char off[] = "off\n";
+
+static int max_speed_show(int speed, char *buf)
+{
+ if (speed)
+ return scnprintf(buf, PAGE_SIZE, "%uKB/s\n", speed);
+ else
+ return scnprintf(buf, PAGE_SIZE, off);
+}
+
+static int max_speed_store(const char *buf, struct request_queue *q)
+{
+ unsigned int limit, set = 0;
+
+ if (!strncasecmp(off, buf, sizeof(off) - 2))
+ return set;
+ if (kstrtouint(buf, 0, &set) || (set > INT_MAX))
+ return -EINVAL;
+ if (set == 0)
+ return set;
+ limit = MAX_SPEED(q);
+ if (set > limit)
+ pr_warn("max speed %u ineffective above %u\n", set, limit);
+ limit = MIN_SPEED(q);
+ if (set < limit)
+ pr_warn("max speed %u painful below %u\n", set, limit);
+ return set;
+}
+
+static ssize_t max_write_speed_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev));
+ int ret = max_speed_show(atomic_read(&md->queue.max_write_speed), buf);
+
+ mmc_blk_put(md);
+ return ret;
+}
+
+static ssize_t max_write_speed_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev));
+ int set = max_speed_store(buf, md->queue.queue);
+
+ if (set < 0) {
+ mmc_blk_put(md);
+ return set;
+ }
+
+ atomic_set(&md->queue.max_write_speed, set);
+ mmc_blk_put(md);
+ return count;
+}
+
+static const DEVICE_ATTR(max_write_speed, S_IRUGO | S_IWUSR,
+ max_write_speed_show, max_write_speed_store);
+
+static ssize_t max_read_speed_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev));
+ int ret = max_speed_show(atomic_read(&md->queue.max_read_speed), buf);
+
+ mmc_blk_put(md);
+ return ret;
+}
+
+static ssize_t max_read_speed_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev));
+ int set = max_speed_store(buf, md->queue.queue);
+
+ if (set < 0) {
+ mmc_blk_put(md);
+ return set;
+ }
+
+ atomic_set(&md->queue.max_read_speed, set);
+ mmc_blk_put(md);
+ return count;
+}
+
+static const DEVICE_ATTR(max_read_speed, S_IRUGO | S_IWUSR,
+ max_read_speed_show, max_read_speed_store);
+
+static ssize_t cache_size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev));
+ struct mmc_queue *mq = &md->queue;
+ int cache_size = atomic_read(&mq->cache_size);
+ int ret;
+
+ if (!cache_size)
+ ret = scnprintf(buf, PAGE_SIZE, off);
+ else {
+ int speed = atomic_read(&mq->max_write_speed);
+
+ if (!speed_valid(speed))
+ ret = scnprintf(buf, PAGE_SIZE, "%uMB\n", cache_size);
+ else { /* We accept race between cache_jiffies and cache_used */
+ unsigned long size = jiffies_and_speed_to_size(
+ jiffies - mq->cache_jiffies, speed);
+ long used = atomic_long_read(&mq->cache_used);
+
+ if (size >= used)
+ size = 0;
+ else
+ size = (used - size) * 100 / cache_size
+ / 1024UL / 1024UL;
+
+ ret = scnprintf(buf, PAGE_SIZE, "%uMB %lu%% used\n",
+ cache_size, size);
+ }
+ }
+
+ mmc_blk_put(md);
+ return ret;
+}
+
+static ssize_t cache_size_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct mmc_blk_data *md;
+ unsigned int set = 0;
+
+ if (strncasecmp(off, buf, sizeof(off) - 2)
+ && (kstrtouint(buf, 0, &set) || (set > INT_MAX)))
+ return -EINVAL;
+
+ md = mmc_blk_get(dev_to_disk(dev));
+ atomic_set(&md->queue.cache_size, set);
+ mmc_blk_put(md);
+ return count;
+}
+
+static const DEVICE_ATTR(cache_size, S_IRUGO | S_IWUSR,
+ cache_size_show, cache_size_store);
+
+/* correct for write-back */
+static long mmc_blk_cache_used(struct mmc_queue *mq, unsigned long waitfor)
+{
+ long used = 0;
+ int speed = atomic_read(&mq->max_write_speed);
+
+ if (speed_valid(speed)) {
+ unsigned long size = jiffies_and_speed_to_size(
+ waitfor - mq->cache_jiffies, speed);
+ used = atomic_long_read(&mq->cache_used);
+
+ if (size >= used)
+ used = 0;
+ else
+ used -= size;
+ }
+
+ atomic_long_set(&mq->cache_used, used);
+ mq->cache_jiffies = waitfor;
+
+ return used;
+}
+
+static void mmc_blk_simulate_delay(
+ struct mmc_queue *mq,
+ struct request *req,
+ unsigned long waitfor)
+{
+ int max_speed;
+
+ if (!req)
+ return;
+
+ max_speed = (rq_data_dir(req) == READ)
+ ? atomic_read(&mq->max_read_speed)
+ : atomic_read(&mq->max_write_speed);
+ if (speed_valid(max_speed)) {
+ unsigned long bytes = blk_rq_bytes(req);
+
+ if (rq_data_dir(req) != READ) {
+ int cache_size = atomic_read(&mq->cache_size);
+
+ if (cache_size) {
+ unsigned long size = cache_size * 1024L * 1024L;
+ long used = mmc_blk_cache_used(mq, waitfor);
+
+ used += bytes;
+ atomic_long_set(&mq->cache_used, used);
+ bytes = 0;
+ if (used > size)
+ bytes = used - size;
+ }
+ }
+ waitfor += size_and_speed_to_jiffies(bytes, max_speed);
+ if (time_is_after_jiffies(waitfor)) {
+ long msecs = jiffies_to_msecs(waitfor - jiffies);
+
+ if (likely(msecs > 0))
+ msleep(msecs);
+ }
+ }
+}
+
+#else
+
+#define mmc_blk_simulate_delay(mq, req, waitfor)
+
+#endif
+
static int mmc_blk_open(struct block_device *bdev, fmode_t mode)
{
struct mmc_blk_data *md = mmc_blk_get(bdev->bd_disk);
@@ -1284,6 +1528,23 @@ static int mmc_blk_issue_flush(struct mmc_queue *mq, struct request *req)
if (ret)
ret = -EIO;
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+ else if (atomic_read(&mq->cache_size)) {
+ long used = mmc_blk_cache_used(mq, jiffies);
+
+ if (used) {
+ int speed = atomic_read(&mq->max_write_speed);
+
+ if (speed_valid(speed)) {
+ unsigned long msecs = jiffies_to_msecs(
+ size_and_speed_to_jiffies(
+ used, speed));
+ if (msecs)
+ msleep(msecs);
+ }
+ }
+ }
+#endif
blk_end_request_all(req, ret);
return ret ? 0 : 1;
@@ -1965,6 +2226,9 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc)
struct mmc_async_req *areq;
const u8 packed_nr = 2;
u8 reqs = 0;
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+ unsigned long waitfor = jiffies;
+#endif
if (!rqc && !mq->mqrq_prev->req)
return 0;
@@ -2015,6 +2279,8 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc)
*/
mmc_blk_reset_success(md, type);
+ mmc_blk_simulate_delay(mq, rqc, waitfor);
+
if (mmc_packed_cmd(mq_rq->cmd_type)) {
ret = mmc_blk_end_packed_req(mq_rq);
break;
@@ -2437,6 +2703,14 @@ static void mmc_blk_remove_req(struct mmc_blk_data *md)
card->ext_csd.boot_ro_lockable)
device_remove_file(disk_to_dev(md->disk),
&md->power_ro_lock);
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+ device_remove_file(disk_to_dev(md->disk),
+ &dev_attr_max_write_speed);
+ device_remove_file(disk_to_dev(md->disk),
+ &dev_attr_max_read_speed);
+ device_remove_file(disk_to_dev(md->disk),
+ &dev_attr_cache_size);
+#endif
del_gendisk(md->disk);
}
@@ -2471,6 +2745,24 @@ static int mmc_add_disk(struct mmc_blk_data *md)
ret = device_create_file(disk_to_dev(md->disk), &md->force_ro);
if (ret)
goto force_ro_fail;
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+ atomic_set(&md->queue.max_write_speed, max_write_speed);
+ ret = device_create_file(disk_to_dev(md->disk),
+ &dev_attr_max_write_speed);
+ if (ret)
+ goto max_write_speed_fail;
+ atomic_set(&md->queue.max_read_speed, max_read_speed);
+ ret = device_create_file(disk_to_dev(md->disk),
+ &dev_attr_max_read_speed);
+ if (ret)
+ goto max_read_speed_fail;
+ atomic_set(&md->queue.cache_size, cache_size);
+ atomic_long_set(&md->queue.cache_used, 0);
+ md->queue.cache_jiffies = jiffies;
+ ret = device_create_file(disk_to_dev(md->disk), &dev_attr_cache_size);
+ if (ret)
+ goto cache_size_fail;
+#endif
if ((md->area_type & MMC_BLK_DATA_AREA_BOOT) &&
card->ext_csd.boot_ro_lockable) {
@@ -2495,6 +2787,14 @@ static int mmc_add_disk(struct mmc_blk_data *md)
return ret;
power_ro_lock_fail:
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+ device_remove_file(disk_to_dev(md->disk), &dev_attr_cache_size);
+cache_size_fail:
+ device_remove_file(disk_to_dev(md->disk), &dev_attr_max_read_speed);
+max_read_speed_fail:
+ device_remove_file(disk_to_dev(md->disk), &dev_attr_max_write_speed);
+max_write_speed_fail:
+#endif
device_remove_file(disk_to_dev(md->disk), &md->force_ro);
force_ro_fail:
del_gendisk(md->disk);
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index 8037f73a109a..1810f765f0d1 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -19,6 +19,7 @@
#include <linux/mmc/card.h>
#include <linux/mmc/host.h>
+#include <linux/sched/rt.h>
#include "queue.h"
#include "block.h"
@@ -53,6 +54,11 @@ static int mmc_queue_thread(void *d)
{
struct mmc_queue *mq = d;
struct request_queue *q = mq->queue;
+ struct sched_param scheduler_params = {0};
+
+ scheduler_params.sched_priority = 1;
+
+ sched_setscheduler(current, SCHED_FIFO, &scheduler_params);
current->flags |= PF_MEMALLOC;
diff --git a/drivers/mmc/card/queue.h b/drivers/mmc/card/queue.h
index 342f1e3f301e..fe58d31cbc7e 100644
--- a/drivers/mmc/card/queue.h
+++ b/drivers/mmc/card/queue.h
@@ -62,6 +62,14 @@ struct mmc_queue {
struct mmc_queue_req mqrq[2];
struct mmc_queue_req *mqrq_cur;
struct mmc_queue_req *mqrq_prev;
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+ atomic_t max_write_speed;
+ atomic_t max_read_speed;
+ atomic_t cache_size;
+ /* i/o tracking */
+ atomic_long_t cache_used;
+ unsigned long cache_jiffies;
+#endif
};
extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *,
diff --git a/drivers/mmc/core/Kconfig b/drivers/mmc/core/Kconfig
index 250f223aaa80..daad32f85033 100644
--- a/drivers/mmc/core/Kconfig
+++ b/drivers/mmc/core/Kconfig
@@ -22,3 +22,18 @@ config PWRSEQ_SIMPLE
This driver can also be built as a module. If so, the module
will be called pwrseq_simple.
+
+config MMC_EMBEDDED_SDIO
+ boolean "MMC embedded SDIO device support (EXPERIMENTAL)"
+ help
+ If you say Y here, support will be added for embedded SDIO
+ devices which do not contain the necessary enumeration
+ support in hardware to be properly detected.
+
+config MMC_PARANOID_SD_INIT
+ bool "Enable paranoid SD card initialization (EXPERIMENTAL)"
+ help
+ If you say Y here, the MMC layer will be extra paranoid
+ about re-trying SD init requests. This can be a useful
+ work-around for buggy controllers and hardware. Enable
+ if you are experiencing issues with SD detection.
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index cff5829790c9..e19c370fe1c9 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -201,6 +201,20 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq)
pr_debug("%s: %d bytes transferred: %d\n",
mmc_hostname(host),
mrq->data->bytes_xfered, mrq->data->error);
+#ifdef CONFIG_BLOCK
+ if (mrq->lat_hist_enabled) {
+ ktime_t completion;
+ u_int64_t delta_us;
+
+ completion = ktime_get();
+ delta_us = ktime_us_delta(completion,
+ mrq->io_start);
+ blk_update_latency_hist(
+ (mrq->data->flags & MMC_DATA_READ) ?
+ &host->io_lat_read :
+ &host->io_lat_write, delta_us);
+ }
+#endif
}
if (mrq->stop) {
@@ -699,8 +713,16 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host,
}
}
- if (!err && areq)
+ if (!err && areq) {
+#ifdef CONFIG_BLOCK
+ if (host->latency_hist_enabled) {
+ areq->mrq->io_start = ktime_get();
+ areq->mrq->lat_hist_enabled = 1;
+ } else
+ areq->mrq->lat_hist_enabled = 0;
+#endif
start_err = __mmc_start_data_req(host, areq->mrq);
+ }
if (host->areq)
mmc_post_req(host, host->areq->mrq, 0);
@@ -2051,7 +2073,7 @@ void mmc_init_erase(struct mmc_card *card)
}
static unsigned int mmc_mmc_erase_timeout(struct mmc_card *card,
- unsigned int arg, unsigned int qty)
+ unsigned int arg, unsigned int qty)
{
unsigned int erase_timeout;
@@ -3034,6 +3056,22 @@ void mmc_init_context_info(struct mmc_host *host)
init_waitqueue_head(&host->context_info.wait);
}
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+void mmc_set_embedded_sdio_data(struct mmc_host *host,
+ struct sdio_cis *cis,
+ struct sdio_cccr *cccr,
+ struct sdio_embedded_func *funcs,
+ int num_funcs)
+{
+ host->embedded_sdio_data.cis = cis;
+ host->embedded_sdio_data.cccr = cccr;
+ host->embedded_sdio_data.funcs = funcs;
+ host->embedded_sdio_data.num_funcs = num_funcs;
+}
+
+EXPORT_SYMBOL(mmc_set_embedded_sdio_data);
+#endif
+
static int __init mmc_init(void)
{
int ret;
@@ -3066,6 +3104,63 @@ static void __exit mmc_exit(void)
mmc_unregister_bus();
}
+#ifdef CONFIG_BLOCK
+static ssize_t
+latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ struct mmc_host *host = cls_dev_to_mmc_host(dev);
+ size_t written_bytes;
+
+ written_bytes = blk_latency_hist_show("Read", &host->io_lat_read,
+ buf, PAGE_SIZE);
+ written_bytes += blk_latency_hist_show("Write", &host->io_lat_write,
+ buf + written_bytes, PAGE_SIZE - written_bytes);
+
+ return written_bytes;
+}
+
+/*
+ * Values permitted 0, 1, 2.
+ * 0 -> Disable IO latency histograms (default)
+ * 1 -> Enable IO latency histograms
+ * 2 -> Zero out IO latency histograms
+ */
+static ssize_t
+latency_hist_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct mmc_host *host = cls_dev_to_mmc_host(dev);
+ long value;
+
+ if (kstrtol(buf, 0, &value))
+ return -EINVAL;
+ if (value == BLK_IO_LAT_HIST_ZERO) {
+ memset(&host->io_lat_read, 0, sizeof(host->io_lat_read));
+ memset(&host->io_lat_write, 0, sizeof(host->io_lat_write));
+ } else if (value == BLK_IO_LAT_HIST_ENABLE ||
+ value == BLK_IO_LAT_HIST_DISABLE)
+ host->latency_hist_enabled = value;
+ return count;
+}
+
+static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR,
+ latency_hist_show, latency_hist_store);
+
+void
+mmc_latency_hist_sysfs_init(struct mmc_host *host)
+{
+ if (device_create_file(&host->class_dev, &dev_attr_latency_hist))
+ dev_err(&host->class_dev,
+ "Failed to create latency_hist sysfs entry\n");
+}
+
+void
+mmc_latency_hist_sysfs_exit(struct mmc_host *host)
+{
+ device_remove_file(&host->class_dev, &dev_attr_latency_hist);
+}
+#endif
+
subsys_initcall(mmc_init);
module_exit(mmc_exit);
diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c
index 848b3453517e..07f88919693d 100644
--- a/drivers/mmc/core/host.c
+++ b/drivers/mmc/core/host.c
@@ -31,8 +31,6 @@
#include "slot-gpio.h"
#include "pwrseq.h"
-#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev)
-
static DEFINE_IDA(mmc_host_ida);
static DEFINE_SPINLOCK(mmc_host_lock);
@@ -428,8 +426,13 @@ int mmc_add_host(struct mmc_host *host)
mmc_add_host_debugfs(host);
#endif
+#ifdef CONFIG_BLOCK
+ mmc_latency_hist_sysfs_init(host);
+#endif
+
mmc_start_host(host);
- mmc_register_pm_notifier(host);
+ if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY))
+ mmc_register_pm_notifier(host);
return 0;
}
@@ -446,13 +449,18 @@ EXPORT_SYMBOL(mmc_add_host);
*/
void mmc_remove_host(struct mmc_host *host)
{
- mmc_unregister_pm_notifier(host);
+ if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY))
+ mmc_unregister_pm_notifier(host);
mmc_stop_host(host);
#ifdef CONFIG_DEBUG_FS
mmc_remove_host_debugfs(host);
#endif
+#ifdef CONFIG_BLOCK
+ mmc_latency_hist_sysfs_exit(host);
+#endif
+
device_del(&host->class_dev);
led_trigger_unregister_simple(host->led);
diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h
index 992bf5397633..bf38533406fd 100644
--- a/drivers/mmc/core/host.h
+++ b/drivers/mmc/core/host.h
@@ -12,6 +12,8 @@
#define _MMC_CORE_HOST_H
#include <linux/mmc/host.h>
+#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev)
+
int mmc_register_host_class(void);
void mmc_unregister_host_class(void);
@@ -21,5 +23,8 @@ void mmc_retune_hold(struct mmc_host *host);
void mmc_retune_release(struct mmc_host *host);
int mmc_retune(struct mmc_host *host);
+void mmc_latency_hist_sysfs_init(struct mmc_host *host);
+void mmc_latency_hist_sysfs_exit(struct mmc_host *host);
+
#endif
diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 97e51309d7af..675c6f2882f6 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -617,6 +617,12 @@ static int mmc_decode_ext_csd(struct mmc_card *card, u8 *ext_csd)
card->ext_csd.ffu_capable =
(ext_csd[EXT_CSD_SUPPORTED_MODE] & 0x1) &&
!(ext_csd[EXT_CSD_FW_CONFIG] & 0x1);
+
+ card->ext_csd.pre_eol_info = ext_csd[EXT_CSD_PRE_EOL_INFO];
+ card->ext_csd.device_life_time_est_typ_a =
+ ext_csd[EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_A];
+ card->ext_csd.device_life_time_est_typ_b =
+ ext_csd[EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_B];
}
out:
return err;
@@ -746,6 +752,11 @@ MMC_DEV_ATTR(manfid, "0x%06x\n", card->cid.manfid);
MMC_DEV_ATTR(name, "%s\n", card->cid.prod_name);
MMC_DEV_ATTR(oemid, "0x%04x\n", card->cid.oemid);
MMC_DEV_ATTR(prv, "0x%x\n", card->cid.prv);
+MMC_DEV_ATTR(rev, "0x%x\n", card->ext_csd.rev);
+MMC_DEV_ATTR(pre_eol_info, "%02x\n", card->ext_csd.pre_eol_info);
+MMC_DEV_ATTR(life_time, "0x%02x 0x%02x\n",
+ card->ext_csd.device_life_time_est_typ_a,
+ card->ext_csd.device_life_time_est_typ_b);
MMC_DEV_ATTR(serial, "0x%08x\n", card->cid.serial);
MMC_DEV_ATTR(enhanced_area_offset, "%llu\n",
card->ext_csd.enhanced_area_offset);
@@ -799,6 +810,9 @@ static struct attribute *mmc_std_attrs[] = {
&dev_attr_name.attr,
&dev_attr_oemid.attr,
&dev_attr_prv.attr,
+ &dev_attr_rev.attr,
+ &dev_attr_pre_eol_info.attr,
+ &dev_attr_life_time.attr,
&dev_attr_serial.attr,
&dev_attr_enhanced_area_offset.attr,
&dev_attr_enhanced_area_size.attr,
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index f09148a4ab55..ad709340e663 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -847,6 +847,9 @@ int mmc_sd_setup_card(struct mmc_host *host, struct mmc_card *card,
bool reinit)
{
int err;
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ int retries;
+#endif
if (!reinit) {
/*
@@ -873,7 +876,26 @@ int mmc_sd_setup_card(struct mmc_host *host, struct mmc_card *card,
/*
* Fetch switch information from card.
*/
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ for (retries = 1; retries <= 3; retries++) {
+ err = mmc_read_switch(card);
+ if (!err) {
+ if (retries > 1) {
+ printk(KERN_WARNING
+ "%s: recovered\n",
+ mmc_hostname(host));
+ }
+ break;
+ } else {
+ printk(KERN_WARNING
+ "%s: read switch failed (attempt %d)\n",
+ mmc_hostname(host), retries);
+ }
+ }
+#else
err = mmc_read_switch(card);
+#endif
+
if (err)
return err;
}
@@ -1071,7 +1093,10 @@ static int mmc_sd_alive(struct mmc_host *host)
*/
static void mmc_sd_detect(struct mmc_host *host)
{
- int err;
+ int err = 0;
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ int retries = 5;
+#endif
BUG_ON(!host);
BUG_ON(!host->card);
@@ -1081,7 +1106,23 @@ static void mmc_sd_detect(struct mmc_host *host)
/*
* Just check if our card has been removed.
*/
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ while(retries) {
+ err = mmc_send_status(host->card, NULL);
+ if (err) {
+ retries--;
+ udelay(5);
+ continue;
+ }
+ break;
+ }
+ if (!retries) {
+ printk(KERN_ERR "%s(%s): Unable to re-detect card (%d)\n",
+ __func__, mmc_hostname(host), err);
+ }
+#else
err = _mmc_detect_card_removed(host);
+#endif
mmc_put_card(host->card);
@@ -1143,6 +1184,9 @@ static int mmc_sd_suspend(struct mmc_host *host)
static int _mmc_sd_resume(struct mmc_host *host)
{
int err = 0;
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ int retries;
+#endif
BUG_ON(!host);
BUG_ON(!host->card);
@@ -1153,7 +1197,23 @@ static int _mmc_sd_resume(struct mmc_host *host)
goto out;
mmc_power_up(host, host->card->ocr);
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ retries = 5;
+ while (retries) {
+ err = mmc_sd_init_card(host, host->card->ocr, host->card);
+
+ if (err) {
+ printk(KERN_ERR "%s: Re-init card rc = %d (retries = %d)\n",
+ mmc_hostname(host), err, retries);
+ mdelay(5);
+ retries--;
+ continue;
+ }
+ break;
+ }
+#else
err = mmc_sd_init_card(host, host->card->ocr, host->card);
+#endif
mmc_card_clr_suspended(host->card);
out:
@@ -1228,6 +1288,9 @@ int mmc_attach_sd(struct mmc_host *host)
{
int err;
u32 ocr, rocr;
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ int retries;
+#endif
BUG_ON(!host);
WARN_ON(!host->claimed);
@@ -1264,9 +1327,27 @@ int mmc_attach_sd(struct mmc_host *host)
/*
* Detect and init the card.
*/
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ retries = 5;
+ while (retries) {
+ err = mmc_sd_init_card(host, rocr, NULL);
+ if (err) {
+ retries--;
+ continue;
+ }
+ break;
+ }
+
+ if (!retries) {
+ printk(KERN_ERR "%s: mmc_sd_init_card() failure (err = %d)\n",
+ mmc_hostname(host), err);
+ goto err;
+ }
+#else
err = mmc_sd_init_card(host, rocr, NULL);
if (err)
goto err;
+#endif
mmc_release_host(host);
err = mmc_add_card(host->card);
diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index bd44ba8116d1..b5ec3c8cf580 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -10,6 +10,7 @@
*/
#include <linux/err.h>
+#include <linux/module.h>
#include <linux/pm_runtime.h>
#include <linux/mmc/host.h>
@@ -21,6 +22,7 @@
#include "core.h"
#include "bus.h"
+#include "host.h"
#include "sd.h"
#include "sdio_bus.h"
#include "mmc_ops.h"
@@ -28,6 +30,10 @@
#include "sdio_ops.h"
#include "sdio_cis.h"
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+#include <linux/mmc/sdio_ids.h>
+#endif
+
static int sdio_read_fbr(struct sdio_func *func)
{
int ret;
@@ -697,19 +703,35 @@ try_again:
goto finish;
}
- /*
- * Read the common registers.
- */
- err = sdio_read_cccr(card, ocr);
- if (err)
- goto remove;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ if (host->embedded_sdio_data.cccr)
+ memcpy(&card->cccr, host->embedded_sdio_data.cccr, sizeof(struct sdio_cccr));
+ else {
+#endif
+ /*
+ * Read the common registers.
+ */
+ err = sdio_read_cccr(card, ocr);
+ if (err)
+ goto remove;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ }
+#endif
- /*
- * Read the common CIS tuples.
- */
- err = sdio_read_common_cis(card);
- if (err)
- goto remove;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ if (host->embedded_sdio_data.cis)
+ memcpy(&card->cis, host->embedded_sdio_data.cis, sizeof(struct sdio_cis));
+ else {
+#endif
+ /*
+ * Read the common CIS tuples.
+ */
+ err = sdio_read_common_cis(card);
+ if (err)
+ goto remove;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ }
+#endif
if (oldcard) {
int same = (card->cis.vendor == oldcard->cis.vendor &&
@@ -1118,14 +1140,36 @@ int mmc_attach_sdio(struct mmc_host *host)
funcs = (ocr & 0x70000000) >> 28;
card->sdio_funcs = 0;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ if (host->embedded_sdio_data.funcs)
+ card->sdio_funcs = funcs = host->embedded_sdio_data.num_funcs;
+#endif
+
/*
* Initialize (but don't add) all present functions.
*/
for (i = 0; i < funcs; i++, card->sdio_funcs++) {
- err = sdio_init_func(host->card, i + 1);
- if (err)
- goto remove;
-
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ if (host->embedded_sdio_data.funcs) {
+ struct sdio_func *tmp;
+
+ tmp = sdio_alloc_func(host->card);
+ if (IS_ERR(tmp))
+ goto remove;
+ tmp->num = (i + 1);
+ card->sdio_func[i] = tmp;
+ tmp->class = host->embedded_sdio_data.funcs[i].f_class;
+ tmp->max_blksize = host->embedded_sdio_data.funcs[i].f_maxblksize;
+ tmp->vendor = card->cis.vendor;
+ tmp->device = card->cis.device;
+ } else {
+#endif
+ err = sdio_init_func(host->card, i + 1);
+ if (err)
+ goto remove;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ }
+#endif
/*
* Enable Runtime PM for this func (if supported)
*/
@@ -1173,3 +1217,42 @@ err:
return err;
}
+int sdio_reset_comm(struct mmc_card *card)
+{
+ struct mmc_host *host = card->host;
+ u32 ocr;
+ u32 rocr;
+ int err;
+
+ printk("%s():\n", __func__);
+ mmc_claim_host(host);
+
+ mmc_retune_disable(host);
+
+ mmc_go_idle(host);
+
+ mmc_set_clock(host, host->f_min);
+
+ err = mmc_send_io_op_cond(host, 0, &ocr);
+ if (err)
+ goto err;
+
+ rocr = mmc_select_voltage(host, ocr);
+ if (!rocr) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ err = mmc_sdio_init_card(host, rocr, card, 0);
+ if (err)
+ goto err;
+
+ mmc_release_host(host);
+ return 0;
+err:
+ printk("%s: Error resetting SDIO communications (%d)\n",
+ mmc_hostname(host), err);
+ mmc_release_host(host);
+ return err;
+}
+EXPORT_SYMBOL(sdio_reset_comm);
diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c
index d56a3b6c2fb9..528524a22c80 100644
--- a/drivers/mmc/core/sdio_bus.c
+++ b/drivers/mmc/core/sdio_bus.c
@@ -28,6 +28,10 @@
#include "sdio_cis.h"
#include "sdio_bus.h"
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+#include <linux/mmc/host.h>
+#endif
+
#define to_sdio_driver(d) container_of(d, struct sdio_driver, drv)
/* show configuration fields */
@@ -263,7 +267,14 @@ static void sdio_release_func(struct device *dev)
{
struct sdio_func *func = dev_to_sdio_func(dev);
- sdio_free_func_cis(func);
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ /*
+ * If this device is embedded then we never allocated
+ * cis tables for this func
+ */
+ if (!func->card->host->embedded_sdio_data.funcs)
+#endif
+ sdio_free_func_cis(func);
kfree(func->info);
kfree(func->tmpbuf);
diff --git a/drivers/mmc/core/sdio_io.c b/drivers/mmc/core/sdio_io.c
index 406e5f037e32..3734cba53dbb 100644
--- a/drivers/mmc/core/sdio_io.c
+++ b/drivers/mmc/core/sdio_io.c
@@ -390,6 +390,39 @@ u8 sdio_readb(struct sdio_func *func, unsigned int addr, int *err_ret)
EXPORT_SYMBOL_GPL(sdio_readb);
/**
+ * sdio_readb_ext - read a single byte from a SDIO function
+ * @func: SDIO function to access
+ * @addr: address to read
+ * @err_ret: optional status value from transfer
+ * @in: value to add to argument
+ *
+ * Reads a single byte from the address space of a given SDIO
+ * function. If there is a problem reading the address, 0xff
+ * is returned and @err_ret will contain the error code.
+ */
+unsigned char sdio_readb_ext(struct sdio_func *func, unsigned int addr,
+ int *err_ret, unsigned in)
+{
+ int ret;
+ unsigned char val;
+
+ BUG_ON(!func);
+
+ if (err_ret)
+ *err_ret = 0;
+
+ ret = mmc_io_rw_direct(func->card, 0, func->num, addr, (u8)in, &val);
+ if (ret) {
+ if (err_ret)
+ *err_ret = ret;
+ return 0xFF;
+ }
+
+ return val;
+}
+EXPORT_SYMBOL_GPL(sdio_readb_ext);
+
+/**
* sdio_writeb - write a single byte to a SDIO function
* @func: SDIO function to access
* @b: byte to write
diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c
index 7c887f111a7d..62fd6905c648 100644
--- a/drivers/mtd/devices/block2mtd.c
+++ b/drivers/mtd/devices/block2mtd.c
@@ -431,7 +431,7 @@ static int block2mtd_setup2(const char *val)
}
-static int block2mtd_setup(const char *val, struct kernel_param *kp)
+static int block2mtd_setup(const char *val, const struct kernel_param *kp)
{
#ifdef MODULE
return block2mtd_setup2(val);
diff --git a/drivers/mtd/devices/phram.c b/drivers/mtd/devices/phram.c
index 8b66e52ca3cc..7287696a21f9 100644
--- a/drivers/mtd/devices/phram.c
+++ b/drivers/mtd/devices/phram.c
@@ -266,7 +266,7 @@ static int phram_setup(const char *val)
return ret;
}
-static int phram_param_call(const char *val, struct kernel_param *kp)
+static int phram_param_call(const char *val, const struct kernel_param *kp)
{
#ifdef MODULE
return phram_setup(val);
diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
index b254090b8a1b..50ee1bad3690 100644
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -1,3 +1,10 @@
+config MTD_NAND_IDS
+ tristate "Include chip ids for known NAND devices."
+ depends on MTD
+ help
+ Useful for NAND drivers that do not use the NAND subsystem but
+ still like to take advantage of the known chip information.
+
config MTD_NAND_ECC
tristate
@@ -109,9 +116,6 @@ config MTD_NAND_OMAP_BCH
config MTD_NAND_OMAP_BCH_BUILD
def_tristate MTD_NAND_OMAP2 && MTD_NAND_OMAP_BCH
-config MTD_NAND_IDS
- tristate
-
config MTD_NAND_RICOH
tristate "Ricoh xD card reader"
default n
diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c
index ad2b57c6b13f..541c17917a4d 100644
--- a/drivers/mtd/ubi/build.c
+++ b/drivers/mtd/ubi/build.c
@@ -1403,7 +1403,7 @@ static int __init bytes_str_to_int(const char *str)
* This function returns zero in case of success and a negative error code in
* case of error.
*/
-static int __init ubi_mtd_param_parse(const char *val, struct kernel_param *kp)
+static int __init ubi_mtd_param_parse(const char *val, const struct kernel_param *kp)
{
int i, len;
struct mtd_dev_param *p;
diff --git a/drivers/net/ppp/Kconfig b/drivers/net/ppp/Kconfig
index 1373c6d7278d..282aec4860eb 100644
--- a/drivers/net/ppp/Kconfig
+++ b/drivers/net/ppp/Kconfig
@@ -149,6 +149,23 @@ config PPPOL2TP
tunnels. L2TP is replacing PPTP for VPN uses.
if TTY
+config PPPOLAC
+ tristate "PPP on L2TP Access Concentrator"
+ depends on PPP && INET
+ help
+ L2TP (RFC 2661) is a tunneling protocol widely used in virtual private
+ networks. This driver handles L2TP data packets between a UDP socket
+ and a PPP channel, but only permits one session per socket. Thus it is
+ fairly simple and suited for clients.
+
+config PPPOPNS
+ tristate "PPP on PPTP Network Server"
+ depends on PPP && INET
+ help
+ PPTP (RFC 2637) is a tunneling protocol widely used in virtual private
+ networks. This driver handles PPTP data packets between a RAW socket
+ and a PPP channel. It is fairly simple and easy to use.
+
config PPP_ASYNC
tristate "PPP support for async serial ports"
depends on PPP
diff --git a/drivers/net/ppp/Makefile b/drivers/net/ppp/Makefile
index a6b6297b0066..d283d03c4683 100644
--- a/drivers/net/ppp/Makefile
+++ b/drivers/net/ppp/Makefile
@@ -11,3 +11,5 @@ obj-$(CONFIG_PPP_SYNC_TTY) += ppp_synctty.o
obj-$(CONFIG_PPPOE) += pppox.o pppoe.o
obj-$(CONFIG_PPPOL2TP) += pppox.o
obj-$(CONFIG_PPTP) += pppox.o pptp.o
+obj-$(CONFIG_PPPOLAC) += pppox.o pppolac.o
+obj-$(CONFIG_PPPOPNS) += pppox.o pppopns.o
diff --git a/drivers/net/ppp/pppolac.c b/drivers/net/ppp/pppolac.c
new file mode 100644
index 000000000000..3a45cf805288
--- /dev/null
+++ b/drivers/net/ppp/pppolac.c
@@ -0,0 +1,450 @@
+/* drivers/net/pppolac.c
+ *
+ * Driver for PPP on L2TP Access Concentrator / PPPoLAC Socket (RFC 2661)
+ *
+ * Copyright (C) 2009 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/* This driver handles L2TP data packets between a UDP socket and a PPP channel.
+ * The socket must keep connected, and only one session per socket is permitted.
+ * Sequencing of outgoing packets is controlled by LNS. Incoming packets with
+ * sequences are reordered within a sliding window of one second. Currently
+ * reordering only happens when a packet is received. It is done for simplicity
+ * since no additional locks or threads are required. This driver only works on
+ * IPv4 due to the lack of UDP encapsulation support in IPv6. */
+
+#include <linux/module.h>
+#include <linux/jiffies.h>
+#include <linux/workqueue.h>
+#include <linux/skbuff.h>
+#include <linux/file.h>
+#include <linux/netdevice.h>
+#include <linux/net.h>
+#include <linux/udp.h>
+#include <linux/ppp_defs.h>
+#include <linux/if_ppp.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_channel.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+
+#define L2TP_CONTROL_BIT 0x80
+#define L2TP_LENGTH_BIT 0x40
+#define L2TP_SEQUENCE_BIT 0x08
+#define L2TP_OFFSET_BIT 0x02
+#define L2TP_VERSION 0x02
+#define L2TP_VERSION_MASK 0x0F
+
+#define PPP_ADDR 0xFF
+#define PPP_CTRL 0x03
+
+union unaligned {
+ __u32 u32;
+} __attribute__((packed));
+
+static inline union unaligned *unaligned(void *ptr)
+{
+ return (union unaligned *)ptr;
+}
+
+struct meta {
+ __u32 sequence;
+ __u32 timestamp;
+};
+
+static inline struct meta *skb_meta(struct sk_buff *skb)
+{
+ return (struct meta *)skb->cb;
+}
+
+/******************************************************************************/
+
+static int pppolac_recv_core(struct sock *sk_udp, struct sk_buff *skb)
+{
+ struct sock *sk = (struct sock *)sk_udp->sk_user_data;
+ struct pppolac_opt *opt = &pppox_sk(sk)->proto.lac;
+ struct meta *meta = skb_meta(skb);
+ __u32 now = jiffies;
+ __u8 bits;
+ __u8 *ptr;
+
+ /* Drop the packet if L2TP header is missing. */
+ if (skb->len < sizeof(struct udphdr) + 6)
+ goto drop;
+
+ /* Put it back if it is a control packet. */
+ if (skb->data[sizeof(struct udphdr)] & L2TP_CONTROL_BIT)
+ return opt->backlog_rcv(sk_udp, skb);
+
+ /* Skip UDP header. */
+ skb_pull(skb, sizeof(struct udphdr));
+
+ /* Check the version. */
+ if ((skb->data[1] & L2TP_VERSION_MASK) != L2TP_VERSION)
+ goto drop;
+ bits = skb->data[0];
+ ptr = &skb->data[2];
+
+ /* Check the length if it is present. */
+ if (bits & L2TP_LENGTH_BIT) {
+ if ((ptr[0] << 8 | ptr[1]) != skb->len)
+ goto drop;
+ ptr += 2;
+ }
+
+ /* Skip all fields including optional ones. */
+ if (!skb_pull(skb, 6 + (bits & L2TP_SEQUENCE_BIT ? 4 : 0) +
+ (bits & L2TP_LENGTH_BIT ? 2 : 0) +
+ (bits & L2TP_OFFSET_BIT ? 2 : 0)))
+ goto drop;
+
+ /* Skip the offset padding if it is present. */
+ if (bits & L2TP_OFFSET_BIT &&
+ !skb_pull(skb, skb->data[-2] << 8 | skb->data[-1]))
+ goto drop;
+
+ /* Check the tunnel and the session. */
+ if (unaligned(ptr)->u32 != opt->local)
+ goto drop;
+
+ /* Check the sequence if it is present. */
+ if (bits & L2TP_SEQUENCE_BIT) {
+ meta->sequence = ptr[4] << 8 | ptr[5];
+ if ((__s16)(meta->sequence - opt->recv_sequence) < 0)
+ goto drop;
+ }
+
+ /* Skip PPP address and control if they are present. */
+ if (skb->len >= 2 && skb->data[0] == PPP_ADDR &&
+ skb->data[1] == PPP_CTRL)
+ skb_pull(skb, 2);
+
+ /* Fix PPP protocol if it is compressed. */
+ if (skb->len >= 1 && skb->data[0] & 1)
+ skb_push(skb, 1)[0] = 0;
+
+ /* Drop the packet if PPP protocol is missing. */
+ if (skb->len < 2)
+ goto drop;
+
+ /* Perform reordering if sequencing is enabled. */
+ atomic_set(&opt->sequencing, bits & L2TP_SEQUENCE_BIT);
+ if (bits & L2TP_SEQUENCE_BIT) {
+ struct sk_buff *skb1;
+
+ /* Insert the packet into receive queue in order. */
+ skb_set_owner_r(skb, sk);
+ skb_queue_walk(&sk->sk_receive_queue, skb1) {
+ struct meta *meta1 = skb_meta(skb1);
+ __s16 order = meta->sequence - meta1->sequence;
+ if (order == 0)
+ goto drop;
+ if (order < 0) {
+ meta->timestamp = meta1->timestamp;
+ skb_insert(skb1, skb, &sk->sk_receive_queue);
+ skb = NULL;
+ break;
+ }
+ }
+ if (skb) {
+ meta->timestamp = now;
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ }
+
+ /* Remove packets from receive queue as long as
+ * 1. the receive buffer is full,
+ * 2. they are queued longer than one second, or
+ * 3. there are no missing packets before them. */
+ skb_queue_walk_safe(&sk->sk_receive_queue, skb, skb1) {
+ meta = skb_meta(skb);
+ if (atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
+ now - meta->timestamp < HZ &&
+ meta->sequence != opt->recv_sequence)
+ break;
+ skb_unlink(skb, &sk->sk_receive_queue);
+ opt->recv_sequence = (__u16)(meta->sequence + 1);
+ skb_orphan(skb);
+ ppp_input(&pppox_sk(sk)->chan, skb);
+ }
+ return NET_RX_SUCCESS;
+ }
+
+ /* Flush receive queue if sequencing is disabled. */
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_orphan(skb);
+ ppp_input(&pppox_sk(sk)->chan, skb);
+ return NET_RX_SUCCESS;
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static int pppolac_recv(struct sock *sk_udp, struct sk_buff *skb)
+{
+ sock_hold(sk_udp);
+ sk_receive_skb(sk_udp, skb, 0);
+ return 0;
+}
+
+static struct sk_buff_head delivery_queue;
+
+static void pppolac_xmit_core(struct work_struct *delivery_work)
+{
+ mm_segment_t old_fs = get_fs();
+ struct sk_buff *skb;
+
+ set_fs(KERNEL_DS);
+ while ((skb = skb_dequeue(&delivery_queue))) {
+ struct sock *sk_udp = skb->sk;
+ struct kvec iov = {.iov_base = skb->data, .iov_len = skb->len};
+ struct msghdr msg = {
+ .msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT,
+ };
+
+ iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iov, 1,
+ skb->len);
+ sk_udp->sk_prot->sendmsg(sk_udp, &msg, skb->len);
+ kfree_skb(skb);
+ }
+ set_fs(old_fs);
+}
+
+static DECLARE_WORK(delivery_work, pppolac_xmit_core);
+
+static int pppolac_xmit(struct ppp_channel *chan, struct sk_buff *skb)
+{
+ struct sock *sk_udp = (struct sock *)chan->private;
+ struct pppolac_opt *opt = &pppox_sk(sk_udp->sk_user_data)->proto.lac;
+
+ /* Install PPP address and control. */
+ skb_push(skb, 2);
+ skb->data[0] = PPP_ADDR;
+ skb->data[1] = PPP_CTRL;
+
+ /* Install L2TP header. */
+ if (atomic_read(&opt->sequencing)) {
+ skb_push(skb, 10);
+ skb->data[0] = L2TP_SEQUENCE_BIT;
+ skb->data[6] = opt->xmit_sequence >> 8;
+ skb->data[7] = opt->xmit_sequence;
+ skb->data[8] = 0;
+ skb->data[9] = 0;
+ opt->xmit_sequence++;
+ } else {
+ skb_push(skb, 6);
+ skb->data[0] = 0;
+ }
+ skb->data[1] = L2TP_VERSION;
+ unaligned(&skb->data[2])->u32 = opt->remote;
+
+ /* Now send the packet via the delivery queue. */
+ skb_set_owner_w(skb, sk_udp);
+ skb_queue_tail(&delivery_queue, skb);
+ schedule_work(&delivery_work);
+ return 1;
+}
+
+/******************************************************************************/
+
+static struct ppp_channel_ops pppolac_channel_ops = {
+ .start_xmit = pppolac_xmit,
+};
+
+static int pppolac_connect(struct socket *sock, struct sockaddr *useraddr,
+ int addrlen, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct pppox_sock *po = pppox_sk(sk);
+ struct sockaddr_pppolac *addr = (struct sockaddr_pppolac *)useraddr;
+ struct socket *sock_udp = NULL;
+ struct sock *sk_udp;
+ int error;
+
+ if (addrlen != sizeof(struct sockaddr_pppolac) ||
+ !addr->local.tunnel || !addr->local.session ||
+ !addr->remote.tunnel || !addr->remote.session) {
+ return -EINVAL;
+ }
+
+ lock_sock(sk);
+ error = -EALREADY;
+ if (sk->sk_state != PPPOX_NONE)
+ goto out;
+
+ sock_udp = sockfd_lookup(addr->udp_socket, &error);
+ if (!sock_udp)
+ goto out;
+ sk_udp = sock_udp->sk;
+ lock_sock(sk_udp);
+
+ /* Remove this check when IPv6 supports UDP encapsulation. */
+ error = -EAFNOSUPPORT;
+ if (sk_udp->sk_family != AF_INET)
+ goto out;
+ error = -EPROTONOSUPPORT;
+ if (sk_udp->sk_protocol != IPPROTO_UDP)
+ goto out;
+ error = -EDESTADDRREQ;
+ if (sk_udp->sk_state != TCP_ESTABLISHED)
+ goto out;
+ error = -EBUSY;
+ if (udp_sk(sk_udp)->encap_type || sk_udp->sk_user_data)
+ goto out;
+ if (!sk_udp->sk_bound_dev_if) {
+ struct dst_entry *dst = sk_dst_get(sk_udp);
+ error = -ENODEV;
+ if (!dst)
+ goto out;
+ sk_udp->sk_bound_dev_if = dst->dev->ifindex;
+ dst_release(dst);
+ }
+
+ po->chan.hdrlen = 12;
+ po->chan.private = sk_udp;
+ po->chan.ops = &pppolac_channel_ops;
+ po->chan.mtu = PPP_MRU - 80;
+ po->proto.lac.local = unaligned(&addr->local)->u32;
+ po->proto.lac.remote = unaligned(&addr->remote)->u32;
+ atomic_set(&po->proto.lac.sequencing, 1);
+ po->proto.lac.backlog_rcv = sk_udp->sk_backlog_rcv;
+
+ error = ppp_register_channel(&po->chan);
+ if (error)
+ goto out;
+
+ sk->sk_state = PPPOX_CONNECTED;
+ udp_sk(sk_udp)->encap_type = UDP_ENCAP_L2TPINUDP;
+ udp_sk(sk_udp)->encap_rcv = pppolac_recv;
+ sk_udp->sk_backlog_rcv = pppolac_recv_core;
+ sk_udp->sk_user_data = sk;
+out:
+ if (sock_udp) {
+ release_sock(sk_udp);
+ if (error)
+ sockfd_put(sock_udp);
+ }
+ release_sock(sk);
+ return error;
+}
+
+static int pppolac_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (!sk)
+ return 0;
+
+ lock_sock(sk);
+ if (sock_flag(sk, SOCK_DEAD)) {
+ release_sock(sk);
+ return -EBADF;
+ }
+
+ if (sk->sk_state != PPPOX_NONE) {
+ struct sock *sk_udp = (struct sock *)pppox_sk(sk)->chan.private;
+ lock_sock(sk_udp);
+ skb_queue_purge(&sk->sk_receive_queue);
+ pppox_unbind_sock(sk);
+ udp_sk(sk_udp)->encap_type = 0;
+ udp_sk(sk_udp)->encap_rcv = NULL;
+ sk_udp->sk_backlog_rcv = pppox_sk(sk)->proto.lac.backlog_rcv;
+ sk_udp->sk_user_data = NULL;
+ release_sock(sk_udp);
+ sockfd_put(sk_udp->sk_socket);
+ }
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+ release_sock(sk);
+ sock_put(sk);
+ return 0;
+}
+
+/******************************************************************************/
+
+static struct proto pppolac_proto = {
+ .name = "PPPOLAC",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct pppox_sock),
+};
+
+static struct proto_ops pppolac_proto_ops = {
+ .family = PF_PPPOX,
+ .owner = THIS_MODULE,
+ .release = pppolac_release,
+ .bind = sock_no_bind,
+ .connect = pppolac_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = sock_no_poll,
+ .ioctl = pppox_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = sock_no_sendmsg,
+ .recvmsg = sock_no_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+static int pppolac_create(struct net *net, struct socket *sock, int kern)
+{
+ struct sock *sk;
+
+ sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppolac_proto, kern);
+ if (!sk)
+ return -ENOMEM;
+
+ sock_init_data(sock, sk);
+ sock->state = SS_UNCONNECTED;
+ sock->ops = &pppolac_proto_ops;
+ sk->sk_protocol = PX_PROTO_OLAC;
+ sk->sk_state = PPPOX_NONE;
+ return 0;
+}
+
+/******************************************************************************/
+
+static struct pppox_proto pppolac_pppox_proto = {
+ .create = pppolac_create,
+ .owner = THIS_MODULE,
+};
+
+static int __init pppolac_init(void)
+{
+ int error;
+
+ error = proto_register(&pppolac_proto, 0);
+ if (error)
+ return error;
+
+ error = register_pppox_proto(PX_PROTO_OLAC, &pppolac_pppox_proto);
+ if (error)
+ proto_unregister(&pppolac_proto);
+ else
+ skb_queue_head_init(&delivery_queue);
+ return error;
+}
+
+static void __exit pppolac_exit(void)
+{
+ unregister_pppox_proto(PX_PROTO_OLAC);
+ proto_unregister(&pppolac_proto);
+}
+
+module_init(pppolac_init);
+module_exit(pppolac_exit);
+
+MODULE_DESCRIPTION("PPP on L2TP Access Concentrator (PPPoLAC)");
+MODULE_AUTHOR("Chia-chi Yeh <chiachi@android.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/ppp/pppopns.c b/drivers/net/ppp/pppopns.c
new file mode 100644
index 000000000000..cdb4fa1af734
--- /dev/null
+++ b/drivers/net/ppp/pppopns.c
@@ -0,0 +1,429 @@
+/* drivers/net/pppopns.c
+ *
+ * Driver for PPP on PPTP Network Server / PPPoPNS Socket (RFC 2637)
+ *
+ * Copyright (C) 2009 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/* This driver handles PPTP data packets between a RAW socket and a PPP channel.
+ * The socket is created in the kernel space and connected to the same address
+ * of the control socket. Outgoing packets are always sent with sequences but
+ * without acknowledgements. Incoming packets with sequences are reordered
+ * within a sliding window of one second. Currently reordering only happens when
+ * a packet is received. It is done for simplicity since no additional locks or
+ * threads are required. This driver should work on both IPv4 and IPv6. */
+
+#include <linux/module.h>
+#include <linux/jiffies.h>
+#include <linux/workqueue.h>
+#include <linux/skbuff.h>
+#include <linux/file.h>
+#include <linux/netdevice.h>
+#include <linux/net.h>
+#include <linux/ppp_defs.h>
+#include <linux/if.h>
+#include <linux/if_ppp.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_channel.h>
+#include <asm/uaccess.h>
+
+#define GRE_HEADER_SIZE 8
+
+#define PPTP_GRE_BITS htons(0x2001)
+#define PPTP_GRE_BITS_MASK htons(0xEF7F)
+#define PPTP_GRE_SEQ_BIT htons(0x1000)
+#define PPTP_GRE_ACK_BIT htons(0x0080)
+#define PPTP_GRE_TYPE htons(0x880B)
+
+#define PPP_ADDR 0xFF
+#define PPP_CTRL 0x03
+
+struct header {
+ __u16 bits;
+ __u16 type;
+ __u16 length;
+ __u16 call;
+ __u32 sequence;
+} __attribute__((packed));
+
+struct meta {
+ __u32 sequence;
+ __u32 timestamp;
+};
+
+static inline struct meta *skb_meta(struct sk_buff *skb)
+{
+ return (struct meta *)skb->cb;
+}
+
+/******************************************************************************/
+
+static int pppopns_recv_core(struct sock *sk_raw, struct sk_buff *skb)
+{
+ struct sock *sk = (struct sock *)sk_raw->sk_user_data;
+ struct pppopns_opt *opt = &pppox_sk(sk)->proto.pns;
+ struct meta *meta = skb_meta(skb);
+ __u32 now = jiffies;
+ struct header *hdr;
+
+ /* Skip transport header */
+ skb_pull(skb, skb_transport_header(skb) - skb->data);
+
+ /* Drop the packet if GRE header is missing. */
+ if (skb->len < GRE_HEADER_SIZE)
+ goto drop;
+ hdr = (struct header *)skb->data;
+
+ /* Check the header. */
+ if (hdr->type != PPTP_GRE_TYPE || hdr->call != opt->local ||
+ (hdr->bits & PPTP_GRE_BITS_MASK) != PPTP_GRE_BITS)
+ goto drop;
+
+ /* Skip all fields including optional ones. */
+ if (!skb_pull(skb, GRE_HEADER_SIZE +
+ (hdr->bits & PPTP_GRE_SEQ_BIT ? 4 : 0) +
+ (hdr->bits & PPTP_GRE_ACK_BIT ? 4 : 0)))
+ goto drop;
+
+ /* Check the length. */
+ if (skb->len != ntohs(hdr->length))
+ goto drop;
+
+ /* Check the sequence if it is present. */
+ if (hdr->bits & PPTP_GRE_SEQ_BIT) {
+ meta->sequence = ntohl(hdr->sequence);
+ if ((__s32)(meta->sequence - opt->recv_sequence) < 0)
+ goto drop;
+ }
+
+ /* Skip PPP address and control if they are present. */
+ if (skb->len >= 2 && skb->data[0] == PPP_ADDR &&
+ skb->data[1] == PPP_CTRL)
+ skb_pull(skb, 2);
+
+ /* Fix PPP protocol if it is compressed. */
+ if (skb->len >= 1 && skb->data[0] & 1)
+ skb_push(skb, 1)[0] = 0;
+
+ /* Drop the packet if PPP protocol is missing. */
+ if (skb->len < 2)
+ goto drop;
+
+ /* Perform reordering if sequencing is enabled. */
+ if (hdr->bits & PPTP_GRE_SEQ_BIT) {
+ struct sk_buff *skb1;
+
+ /* Insert the packet into receive queue in order. */
+ skb_set_owner_r(skb, sk);
+ skb_queue_walk(&sk->sk_receive_queue, skb1) {
+ struct meta *meta1 = skb_meta(skb1);
+ __s32 order = meta->sequence - meta1->sequence;
+ if (order == 0)
+ goto drop;
+ if (order < 0) {
+ meta->timestamp = meta1->timestamp;
+ skb_insert(skb1, skb, &sk->sk_receive_queue);
+ skb = NULL;
+ break;
+ }
+ }
+ if (skb) {
+ meta->timestamp = now;
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ }
+
+ /* Remove packets from receive queue as long as
+ * 1. the receive buffer is full,
+ * 2. they are queued longer than one second, or
+ * 3. there are no missing packets before them. */
+ skb_queue_walk_safe(&sk->sk_receive_queue, skb, skb1) {
+ meta = skb_meta(skb);
+ if (atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
+ now - meta->timestamp < HZ &&
+ meta->sequence != opt->recv_sequence)
+ break;
+ skb_unlink(skb, &sk->sk_receive_queue);
+ opt->recv_sequence = meta->sequence + 1;
+ skb_orphan(skb);
+ ppp_input(&pppox_sk(sk)->chan, skb);
+ }
+ return NET_RX_SUCCESS;
+ }
+
+ /* Flush receive queue if sequencing is disabled. */
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_orphan(skb);
+ ppp_input(&pppox_sk(sk)->chan, skb);
+ return NET_RX_SUCCESS;
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static void pppopns_recv(struct sock *sk_raw)
+{
+ struct sk_buff *skb;
+ while ((skb = skb_dequeue(&sk_raw->sk_receive_queue))) {
+ sock_hold(sk_raw);
+ sk_receive_skb(sk_raw, skb, 0);
+ }
+}
+
+static struct sk_buff_head delivery_queue;
+
+static void pppopns_xmit_core(struct work_struct *delivery_work)
+{
+ mm_segment_t old_fs = get_fs();
+ struct sk_buff *skb;
+
+ set_fs(KERNEL_DS);
+ while ((skb = skb_dequeue(&delivery_queue))) {
+ struct sock *sk_raw = skb->sk;
+ struct kvec iov = {.iov_base = skb->data, .iov_len = skb->len};
+ struct msghdr msg = {
+ .msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT,
+ };
+
+ iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iov, 1,
+ skb->len);
+ sk_raw->sk_prot->sendmsg(sk_raw, &msg, skb->len);
+ kfree_skb(skb);
+ }
+ set_fs(old_fs);
+}
+
+static DECLARE_WORK(delivery_work, pppopns_xmit_core);
+
+static int pppopns_xmit(struct ppp_channel *chan, struct sk_buff *skb)
+{
+ struct sock *sk_raw = (struct sock *)chan->private;
+ struct pppopns_opt *opt = &pppox_sk(sk_raw->sk_user_data)->proto.pns;
+ struct header *hdr;
+ __u16 length;
+
+ /* Install PPP address and control. */
+ skb_push(skb, 2);
+ skb->data[0] = PPP_ADDR;
+ skb->data[1] = PPP_CTRL;
+ length = skb->len;
+
+ /* Install PPTP GRE header. */
+ hdr = (struct header *)skb_push(skb, 12);
+ hdr->bits = PPTP_GRE_BITS | PPTP_GRE_SEQ_BIT;
+ hdr->type = PPTP_GRE_TYPE;
+ hdr->length = htons(length);
+ hdr->call = opt->remote;
+ hdr->sequence = htonl(opt->xmit_sequence);
+ opt->xmit_sequence++;
+
+ /* Now send the packet via the delivery queue. */
+ skb_set_owner_w(skb, sk_raw);
+ skb_queue_tail(&delivery_queue, skb);
+ schedule_work(&delivery_work);
+ return 1;
+}
+
+/******************************************************************************/
+
+static struct ppp_channel_ops pppopns_channel_ops = {
+ .start_xmit = pppopns_xmit,
+};
+
+static int pppopns_connect(struct socket *sock, struct sockaddr *useraddr,
+ int addrlen, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct pppox_sock *po = pppox_sk(sk);
+ struct sockaddr_pppopns *addr = (struct sockaddr_pppopns *)useraddr;
+ struct sockaddr_storage ss;
+ struct socket *sock_tcp = NULL;
+ struct socket *sock_raw = NULL;
+ struct sock *sk_tcp;
+ struct sock *sk_raw;
+ int error;
+
+ if (addrlen != sizeof(struct sockaddr_pppopns))
+ return -EINVAL;
+
+ lock_sock(sk);
+ error = -EALREADY;
+ if (sk->sk_state != PPPOX_NONE)
+ goto out;
+
+ sock_tcp = sockfd_lookup(addr->tcp_socket, &error);
+ if (!sock_tcp)
+ goto out;
+ sk_tcp = sock_tcp->sk;
+ error = -EPROTONOSUPPORT;
+ if (sk_tcp->sk_protocol != IPPROTO_TCP)
+ goto out;
+ addrlen = sizeof(struct sockaddr_storage);
+ error = kernel_getpeername(sock_tcp, (struct sockaddr *)&ss, &addrlen);
+ if (error)
+ goto out;
+ if (!sk_tcp->sk_bound_dev_if) {
+ struct dst_entry *dst = sk_dst_get(sk_tcp);
+ error = -ENODEV;
+ if (!dst)
+ goto out;
+ sk_tcp->sk_bound_dev_if = dst->dev->ifindex;
+ dst_release(dst);
+ }
+
+ error = sock_create(ss.ss_family, SOCK_RAW, IPPROTO_GRE, &sock_raw);
+ if (error)
+ goto out;
+ sk_raw = sock_raw->sk;
+ sk_raw->sk_bound_dev_if = sk_tcp->sk_bound_dev_if;
+ error = kernel_connect(sock_raw, (struct sockaddr *)&ss, addrlen, 0);
+ if (error)
+ goto out;
+
+ po->chan.hdrlen = 14;
+ po->chan.private = sk_raw;
+ po->chan.ops = &pppopns_channel_ops;
+ po->chan.mtu = PPP_MRU - 80;
+ po->proto.pns.local = addr->local;
+ po->proto.pns.remote = addr->remote;
+ po->proto.pns.data_ready = sk_raw->sk_data_ready;
+ po->proto.pns.backlog_rcv = sk_raw->sk_backlog_rcv;
+
+ error = ppp_register_channel(&po->chan);
+ if (error)
+ goto out;
+
+ sk->sk_state = PPPOX_CONNECTED;
+ lock_sock(sk_raw);
+ sk_raw->sk_data_ready = pppopns_recv;
+ sk_raw->sk_backlog_rcv = pppopns_recv_core;
+ sk_raw->sk_user_data = sk;
+ release_sock(sk_raw);
+out:
+ if (sock_tcp)
+ sockfd_put(sock_tcp);
+ if (error && sock_raw)
+ sock_release(sock_raw);
+ release_sock(sk);
+ return error;
+}
+
+static int pppopns_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (!sk)
+ return 0;
+
+ lock_sock(sk);
+ if (sock_flag(sk, SOCK_DEAD)) {
+ release_sock(sk);
+ return -EBADF;
+ }
+
+ if (sk->sk_state != PPPOX_NONE) {
+ struct sock *sk_raw = (struct sock *)pppox_sk(sk)->chan.private;
+ lock_sock(sk_raw);
+ skb_queue_purge(&sk->sk_receive_queue);
+ pppox_unbind_sock(sk);
+ sk_raw->sk_data_ready = pppox_sk(sk)->proto.pns.data_ready;
+ sk_raw->sk_backlog_rcv = pppox_sk(sk)->proto.pns.backlog_rcv;
+ sk_raw->sk_user_data = NULL;
+ release_sock(sk_raw);
+ sock_release(sk_raw->sk_socket);
+ }
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+ release_sock(sk);
+ sock_put(sk);
+ return 0;
+}
+
+/******************************************************************************/
+
+static struct proto pppopns_proto = {
+ .name = "PPPOPNS",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct pppox_sock),
+};
+
+static struct proto_ops pppopns_proto_ops = {
+ .family = PF_PPPOX,
+ .owner = THIS_MODULE,
+ .release = pppopns_release,
+ .bind = sock_no_bind,
+ .connect = pppopns_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = sock_no_poll,
+ .ioctl = pppox_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = sock_no_sendmsg,
+ .recvmsg = sock_no_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+static int pppopns_create(struct net *net, struct socket *sock, int kern)
+{
+ struct sock *sk;
+
+ sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppopns_proto, kern);
+ if (!sk)
+ return -ENOMEM;
+
+ sock_init_data(sock, sk);
+ sock->state = SS_UNCONNECTED;
+ sock->ops = &pppopns_proto_ops;
+ sk->sk_protocol = PX_PROTO_OPNS;
+ sk->sk_state = PPPOX_NONE;
+ return 0;
+}
+
+/******************************************************************************/
+
+static struct pppox_proto pppopns_pppox_proto = {
+ .create = pppopns_create,
+ .owner = THIS_MODULE,
+};
+
+static int __init pppopns_init(void)
+{
+ int error;
+
+ error = proto_register(&pppopns_proto, 0);
+ if (error)
+ return error;
+
+ error = register_pppox_proto(PX_PROTO_OPNS, &pppopns_pppox_proto);
+ if (error)
+ proto_unregister(&pppopns_proto);
+ else
+ skb_queue_head_init(&delivery_queue);
+ return error;
+}
+
+static void __exit pppopns_exit(void)
+{
+ unregister_pppox_proto(PX_PROTO_OPNS);
+ proto_unregister(&pppopns_proto);
+}
+
+module_init(pppopns_init);
+module_exit(pppopns_exit);
+
+MODULE_DESCRIPTION("PPP on PPTP Network Server (PPPoPNS)");
+MODULE_AUTHOR("Chia-chi Yeh <chiachi@android.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 7a0d5e928bec..7f47b88d050e 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2021,6 +2021,12 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
int le;
int ret;
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+ if (cmd != TUNGETIFF && !capable(CAP_NET_ADMIN)) {
+ return -EPERM;
+ }
+#endif
+
if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == 0x89) {
if (copy_from_user(&ifr, argp, ifreq_len))
return -EFAULT;
diff --git a/drivers/net/wireless/Kconfig b/drivers/net/wireless/Kconfig
index 8c8edaf1bba6..c7b29abf190d 100644
--- a/drivers/net/wireless/Kconfig
+++ b/drivers/net/wireless/Kconfig
@@ -100,4 +100,11 @@ config USB_NET_RNDIS_WLAN
If you choose to build a module, it'll be called rndis_wlan.
+config VIRT_WIFI
+ tristate "Wifi wrapper for ethernet drivers"
+ depends on CFG80211
+ ---help---
+ This option adds support for ethernet connections to appear as if they
+ are wifi connections through a special rtnetlink device.
+
endif # WLAN
diff --git a/drivers/net/wireless/Makefile b/drivers/net/wireless/Makefile
index f00d42953fb8..1056115c7995 100644
--- a/drivers/net/wireless/Makefile
+++ b/drivers/net/wireless/Makefile
@@ -25,3 +25,5 @@ obj-$(CONFIG_PCMCIA_WL3501) += wl3501_cs.o
obj-$(CONFIG_USB_NET_RNDIS_WLAN) += rndis_wlan.o
obj-$(CONFIG_MAC80211_HWSIM) += mac80211_hwsim.o
+
+obj-$(CONFIG_VIRT_WIFI) += virt_wifi.o
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index 530f52120972..78c0adc338fe 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -2774,7 +2774,6 @@ static s32 brcmf_inform_single_bss(struct brcmf_cfg80211_info *cfg,
struct brcmf_bss_info_le *bi)
{
struct wiphy *wiphy = cfg_to_wiphy(cfg);
- struct ieee80211_channel *notify_channel;
struct cfg80211_bss *bss;
struct ieee80211_supported_band *band;
struct brcmu_chan ch;
@@ -2784,7 +2783,7 @@ static s32 brcmf_inform_single_bss(struct brcmf_cfg80211_info *cfg,
u16 notify_interval;
u8 *notify_ie;
size_t notify_ielen;
- s32 notify_signal;
+ struct cfg80211_inform_bss bss_data = {};
if (le32_to_cpu(bi->length) > WL_BSS_INFO_MAX) {
brcmf_err("Bss info is larger than buffer. Discarding\n");
@@ -2804,27 +2803,28 @@ static s32 brcmf_inform_single_bss(struct brcmf_cfg80211_info *cfg,
band = wiphy->bands[NL80211_BAND_5GHZ];
freq = ieee80211_channel_to_frequency(channel, band->band);
- notify_channel = ieee80211_get_channel(wiphy, freq);
+ bss_data.chan = ieee80211_get_channel(wiphy, freq);
+ bss_data.scan_width = NL80211_BSS_CHAN_WIDTH_20;
+ bss_data.boottime_ns = ktime_to_ns(ktime_get_boottime());
notify_capability = le16_to_cpu(bi->capability);
notify_interval = le16_to_cpu(bi->beacon_period);
notify_ie = (u8 *)bi + le16_to_cpu(bi->ie_offset);
notify_ielen = le32_to_cpu(bi->ie_length);
- notify_signal = (s16)le16_to_cpu(bi->RSSI) * 100;
+ bss_data.signal = (s16)le16_to_cpu(bi->RSSI) * 100;
brcmf_dbg(CONN, "bssid: %pM\n", bi->BSSID);
brcmf_dbg(CONN, "Channel: %d(%d)\n", channel, freq);
brcmf_dbg(CONN, "Capability: %X\n", notify_capability);
brcmf_dbg(CONN, "Beacon interval: %d\n", notify_interval);
- brcmf_dbg(CONN, "Signal: %d\n", notify_signal);
-
- bss = cfg80211_inform_bss(wiphy, notify_channel,
- CFG80211_BSS_FTYPE_UNKNOWN,
- (const u8 *)bi->BSSID,
- 0, notify_capability,
- notify_interval, notify_ie,
- notify_ielen, notify_signal,
- GFP_KERNEL);
+ brcmf_dbg(CONN, "Signal: %d\n", bss_data.signal);
+
+ bss = cfg80211_inform_bss_data(wiphy, &bss_data,
+ CFG80211_BSS_FTYPE_UNKNOWN,
+ (const u8 *)bi->BSSID,
+ 0, notify_capability,
+ notify_interval, notify_ie,
+ notify_ielen, GFP_KERNEL);
if (!bss)
return -ENOMEM;
diff --git a/drivers/net/wireless/ti/wlcore/init.c b/drivers/net/wireless/ti/wlcore/init.c
index d0b7734030ef..b7974b4dbb34 100644
--- a/drivers/net/wireless/ti/wlcore/init.c
+++ b/drivers/net/wireless/ti/wlcore/init.c
@@ -549,6 +549,11 @@ static int wl12xx_init_ap_role(struct wl1271 *wl, struct wl12xx_vif *wlvif)
{
int ret;
+ /* Disable filtering */
+ ret = wl1271_acx_group_address_tbl(wl, wlvif, false, NULL, 0);
+ if (ret < 0)
+ return ret;
+
ret = wl1271_acx_ap_max_tx_retry(wl, wlvif);
if (ret < 0)
return ret;
diff --git a/drivers/net/wireless/virt_wifi.c b/drivers/net/wireless/virt_wifi.c
new file mode 100644
index 000000000000..17dbc364b8a5
--- /dev/null
+++ b/drivers/net/wireless/virt_wifi.c
@@ -0,0 +1,631 @@
+// SPDX-License-Identifier: GPL-2.0
+/* drivers/net/wireless/virt_wifi.c
+ *
+ * A fake implementation of cfg80211_ops that can be tacked on to an ethernet
+ * net_device to make it appear as a wireless connection.
+ *
+ * Copyright (C) 2018 Google, Inc.
+ *
+ * Author: schuffelen@google.com
+ */
+
+#include <net/cfg80211.h>
+#include <net/rtnetlink.h>
+#include <linux/etherdevice.h>
+#include <linux/module.h>
+
+#include <net/cfg80211.h>
+#include <net/rtnetlink.h>
+#include <linux/etherdevice.h>
+#include <linux/module.h>
+
+static struct wiphy *common_wiphy;
+
+struct virt_wifi_wiphy_priv {
+ struct delayed_work scan_result;
+ struct cfg80211_scan_request *scan_request;
+ bool being_deleted;
+};
+
+static struct ieee80211_channel channel_2ghz = {
+ .band = NL80211_BAND_2GHZ,
+ .center_freq = 2432,
+ .hw_value = 2432,
+ .max_power = 20,
+};
+
+static struct ieee80211_rate bitrates_2ghz[] = {
+ { .bitrate = 10 },
+ { .bitrate = 20 },
+ { .bitrate = 55 },
+ { .bitrate = 110 },
+ { .bitrate = 60 },
+ { .bitrate = 120 },
+ { .bitrate = 240 },
+};
+
+static struct ieee80211_supported_band band_2ghz = {
+ .channels = &channel_2ghz,
+ .bitrates = bitrates_2ghz,
+ .band = NL80211_BAND_2GHZ,
+ .n_channels = 1,
+ .n_bitrates = ARRAY_SIZE(bitrates_2ghz),
+ .ht_cap = {
+ .ht_supported = true,
+ .cap = IEEE80211_HT_CAP_SUP_WIDTH_20_40 |
+ IEEE80211_HT_CAP_GRN_FLD |
+ IEEE80211_HT_CAP_SGI_20 |
+ IEEE80211_HT_CAP_SGI_40 |
+ IEEE80211_HT_CAP_DSSSCCK40,
+ .ampdu_factor = 0x3,
+ .ampdu_density = 0x6,
+ .mcs = {
+ .rx_mask = {0xff, 0xff},
+ .tx_params = IEEE80211_HT_MCS_TX_DEFINED,
+ },
+ },
+};
+
+static struct ieee80211_channel channel_5ghz = {
+ .band = NL80211_BAND_5GHZ,
+ .center_freq = 5240,
+ .hw_value = 5240,
+ .max_power = 20,
+};
+
+static struct ieee80211_rate bitrates_5ghz[] = {
+ { .bitrate = 60 },
+ { .bitrate = 120 },
+ { .bitrate = 240 },
+};
+
+#define RX_MCS_MAP (IEEE80211_VHT_MCS_SUPPORT_0_9 << 0 | \
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 2 | \
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 4 | \
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 6 | \
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 8 | \
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 10 | \
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 12 | \
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 14)
+
+#define TX_MCS_MAP (IEEE80211_VHT_MCS_SUPPORT_0_9 << 0 | \
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 2 | \
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 4 | \
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 6 | \
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 8 | \
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 10 | \
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 12 | \
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 14)
+
+static struct ieee80211_supported_band band_5ghz = {
+ .channels = &channel_5ghz,
+ .bitrates = bitrates_5ghz,
+ .band = NL80211_BAND_5GHZ,
+ .n_channels = 1,
+ .n_bitrates = ARRAY_SIZE(bitrates_5ghz),
+ .ht_cap = {
+ .ht_supported = true,
+ .cap = IEEE80211_HT_CAP_SUP_WIDTH_20_40 |
+ IEEE80211_HT_CAP_GRN_FLD |
+ IEEE80211_HT_CAP_SGI_20 |
+ IEEE80211_HT_CAP_SGI_40 |
+ IEEE80211_HT_CAP_DSSSCCK40,
+ .ampdu_factor = 0x3,
+ .ampdu_density = 0x6,
+ .mcs = {
+ .rx_mask = {0xff, 0xff},
+ .tx_params = IEEE80211_HT_MCS_TX_DEFINED,
+ },
+ },
+ .vht_cap = {
+ .vht_supported = true,
+ .cap = IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454 |
+ IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ |
+ IEEE80211_VHT_CAP_RXLDPC |
+ IEEE80211_VHT_CAP_SHORT_GI_80 |
+ IEEE80211_VHT_CAP_SHORT_GI_160 |
+ IEEE80211_VHT_CAP_TXSTBC |
+ IEEE80211_VHT_CAP_RXSTBC_1 |
+ IEEE80211_VHT_CAP_RXSTBC_2 |
+ IEEE80211_VHT_CAP_RXSTBC_3 |
+ IEEE80211_VHT_CAP_RXSTBC_4 |
+ IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK,
+ .vht_mcs = {
+ .rx_mcs_map = cpu_to_le16(RX_MCS_MAP),
+ .tx_mcs_map = cpu_to_le16(TX_MCS_MAP),
+ }
+ },
+};
+
+/* Assigned at module init. Guaranteed locally-administered and unicast. */
+static u8 fake_router_bssid[ETH_ALEN] __ro_after_init = {};
+
+/* Called with the rtnl lock held. */
+static int virt_wifi_scan(struct wiphy *wiphy,
+ struct cfg80211_scan_request *request)
+{
+ struct virt_wifi_wiphy_priv *priv = wiphy_priv(wiphy);
+
+ wiphy_debug(wiphy, "scan\n");
+
+ if (priv->scan_request || priv->being_deleted)
+ return -EBUSY;
+
+ priv->scan_request = request;
+ schedule_delayed_work(&priv->scan_result, HZ * 2);
+
+ return 0;
+}
+
+/* Acquires and releases the rdev BSS lock. */
+static void virt_wifi_scan_result(struct work_struct *work)
+{
+ struct {
+ u8 tag;
+ u8 len;
+ u8 ssid[8];
+ } __packed ssid = {
+ .tag = WLAN_EID_SSID, .len = 8, .ssid = "VirtWifi",
+ };
+ struct cfg80211_bss *informed_bss;
+ struct virt_wifi_wiphy_priv *priv =
+ container_of(work, struct virt_wifi_wiphy_priv,
+ scan_result.work);
+ struct wiphy *wiphy = priv_to_wiphy(priv);
+ struct cfg80211_scan_info scan_info = { .aborted = false };
+
+ informed_bss = cfg80211_inform_bss(wiphy, &channel_5ghz,
+ CFG80211_BSS_FTYPE_PRESP,
+ fake_router_bssid,
+ ktime_get_boot_ns(),
+ WLAN_CAPABILITY_ESS, 0,
+ (void *)&ssid, sizeof(ssid),
+ DBM_TO_MBM(-50), GFP_KERNEL);
+ cfg80211_put_bss(wiphy, informed_bss);
+
+ /* Schedules work which acquires and releases the rtnl lock. */
+ cfg80211_scan_done(priv->scan_request, &scan_info);
+ priv->scan_request = NULL;
+}
+
+/* May acquire and release the rdev BSS lock. */
+static void virt_wifi_cancel_scan(struct wiphy *wiphy)
+{
+ struct virt_wifi_wiphy_priv *priv = wiphy_priv(wiphy);
+
+ cancel_delayed_work_sync(&priv->scan_result);
+ /* Clean up dangling callbacks if necessary. */
+ if (priv->scan_request) {
+ struct cfg80211_scan_info scan_info = { .aborted = true };
+ /* Schedules work which acquires and releases the rtnl lock. */
+ cfg80211_scan_done(priv->scan_request, &scan_info);
+ priv->scan_request = NULL;
+ }
+}
+
+struct virt_wifi_netdev_priv {
+ struct delayed_work connect;
+ struct net_device *lowerdev;
+ struct net_device *upperdev;
+ u32 tx_packets;
+ u32 tx_failed;
+ u8 connect_requested_bss[ETH_ALEN];
+ bool is_up;
+ bool is_connected;
+ bool being_deleted;
+};
+
+/* Called with the rtnl lock held. */
+static int virt_wifi_connect(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_connect_params *sme)
+{
+ struct virt_wifi_netdev_priv *priv = netdev_priv(netdev);
+ bool could_schedule;
+
+ if (priv->being_deleted || !priv->is_up)
+ return -EBUSY;
+
+ could_schedule = schedule_delayed_work(&priv->connect, HZ * 2);
+ if (!could_schedule)
+ return -EBUSY;
+
+ if (sme->bssid)
+ ether_addr_copy(priv->connect_requested_bss, sme->bssid);
+ else
+ eth_zero_addr(priv->connect_requested_bss);
+
+ wiphy_debug(wiphy, "connect\n");
+
+ return 0;
+}
+
+/* Acquires and releases the rdev event lock. */
+static void virt_wifi_connect_complete(struct work_struct *work)
+{
+ struct virt_wifi_netdev_priv *priv =
+ container_of(work, struct virt_wifi_netdev_priv, connect.work);
+ u8 *requested_bss = priv->connect_requested_bss;
+ bool has_addr = !is_zero_ether_addr(requested_bss);
+ bool right_addr = ether_addr_equal(requested_bss, fake_router_bssid);
+ u16 status = WLAN_STATUS_SUCCESS;
+
+ if (!priv->is_up || (has_addr && !right_addr))
+ status = WLAN_STATUS_UNSPECIFIED_FAILURE;
+ else
+ priv->is_connected = true;
+
+ /* Schedules an event that acquires the rtnl lock. */
+ cfg80211_connect_result(priv->upperdev, requested_bss, NULL, 0, NULL, 0,
+ status, GFP_KERNEL);
+ netif_carrier_on(priv->upperdev);
+}
+
+/* May acquire and release the rdev event lock. */
+static void virt_wifi_cancel_connect(struct net_device *netdev)
+{
+ struct virt_wifi_netdev_priv *priv = netdev_priv(netdev);
+
+ /* If there is work pending, clean up dangling callbacks. */
+ if (cancel_delayed_work_sync(&priv->connect)) {
+ /* Schedules an event that acquires the rtnl lock. */
+ cfg80211_connect_result(priv->upperdev,
+ priv->connect_requested_bss, NULL, 0,
+ NULL, 0,
+ WLAN_STATUS_UNSPECIFIED_FAILURE,
+ GFP_KERNEL);
+ }
+}
+
+/* Called with the rtnl lock held. Acquires the rdev event lock. */
+static int virt_wifi_disconnect(struct wiphy *wiphy, struct net_device *netdev,
+ u16 reason_code)
+{
+ struct virt_wifi_netdev_priv *priv = netdev_priv(netdev);
+
+ if (priv->being_deleted)
+ return -EBUSY;
+
+ wiphy_debug(wiphy, "disconnect\n");
+ virt_wifi_cancel_connect(netdev);
+
+ cfg80211_disconnected(netdev, reason_code, NULL, 0, true, GFP_KERNEL);
+ priv->is_connected = false;
+ netif_carrier_off(netdev);
+
+ return 0;
+}
+
+/* Called with the rtnl lock held. */
+static int virt_wifi_get_station(struct wiphy *wiphy, struct net_device *dev,
+ const u8 *mac, struct station_info *sinfo)
+{
+ struct virt_wifi_netdev_priv *priv = netdev_priv(dev);
+
+ wiphy_debug(wiphy, "get_station\n");
+
+ if (!priv->is_connected || !ether_addr_equal(mac, fake_router_bssid))
+ return -ENOENT;
+
+ sinfo->filled = BIT_ULL(NL80211_STA_INFO_TX_PACKETS) |
+ BIT_ULL(NL80211_STA_INFO_TX_FAILED) |
+ BIT_ULL(NL80211_STA_INFO_SIGNAL) |
+ BIT_ULL(NL80211_STA_INFO_TX_BITRATE);
+ sinfo->tx_packets = priv->tx_packets;
+ sinfo->tx_failed = priv->tx_failed;
+ /* For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_ */
+ sinfo->signal = -50;
+ sinfo->txrate = (struct rate_info) {
+ .legacy = 10, /* units are 100kbit/s */
+ };
+ return 0;
+}
+
+/* Called with the rtnl lock held. */
+static int virt_wifi_dump_station(struct wiphy *wiphy, struct net_device *dev,
+ int idx, u8 *mac, struct station_info *sinfo)
+{
+ struct virt_wifi_netdev_priv *priv = netdev_priv(dev);
+
+ wiphy_debug(wiphy, "dump_station\n");
+
+ if (idx != 0 || !priv->is_connected)
+ return -ENOENT;
+
+ ether_addr_copy(mac, fake_router_bssid);
+ return virt_wifi_get_station(wiphy, dev, fake_router_bssid, sinfo);
+}
+
+static const struct cfg80211_ops virt_wifi_cfg80211_ops = {
+ .scan = virt_wifi_scan,
+
+ .connect = virt_wifi_connect,
+ .disconnect = virt_wifi_disconnect,
+
+ .get_station = virt_wifi_get_station,
+ .dump_station = virt_wifi_dump_station,
+};
+
+/* Acquires and releases the rtnl lock. */
+static struct wiphy *virt_wifi_make_wiphy(void)
+{
+ struct wiphy *wiphy;
+ struct virt_wifi_wiphy_priv *priv;
+ int err;
+
+ wiphy = wiphy_new(&virt_wifi_cfg80211_ops, sizeof(*priv));
+
+ if (!wiphy)
+ return NULL;
+
+ wiphy->max_scan_ssids = 4;
+ wiphy->max_scan_ie_len = 1000;
+ wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM;
+
+ wiphy->bands[NL80211_BAND_2GHZ] = &band_2ghz;
+ wiphy->bands[NL80211_BAND_5GHZ] = &band_5ghz;
+ wiphy->bands[NL80211_BAND_60GHZ] = NULL;
+
+ wiphy->regulatory_flags = REGULATORY_WIPHY_SELF_MANAGED;
+ wiphy->interface_modes = BIT(NL80211_IFTYPE_STATION);
+
+ priv = wiphy_priv(wiphy);
+ priv->being_deleted = false;
+ priv->scan_request = NULL;
+ INIT_DELAYED_WORK(&priv->scan_result, virt_wifi_scan_result);
+
+ err = wiphy_register(wiphy);
+ if (err < 0) {
+ wiphy_free(wiphy);
+ return NULL;
+ }
+
+ return wiphy;
+}
+
+/* Acquires and releases the rtnl lock. */
+static void virt_wifi_destroy_wiphy(struct wiphy *wiphy)
+{
+ struct virt_wifi_wiphy_priv *priv;
+
+ WARN(!wiphy, "%s called with null wiphy", __func__);
+ if (!wiphy)
+ return;
+
+ priv = wiphy_priv(wiphy);
+ priv->being_deleted = true;
+ virt_wifi_cancel_scan(wiphy);
+
+ if (wiphy->registered)
+ wiphy_unregister(wiphy);
+ wiphy_free(wiphy);
+}
+
+/* Enters and exits a RCU-bh critical section. */
+static netdev_tx_t virt_wifi_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct virt_wifi_netdev_priv *priv = netdev_priv(dev);
+
+ priv->tx_packets++;
+ if (!priv->is_connected) {
+ priv->tx_failed++;
+ return NET_XMIT_DROP;
+ }
+
+ skb->dev = priv->lowerdev;
+ return dev_queue_xmit(skb);
+}
+
+/* Called with rtnl lock held. */
+static int virt_wifi_net_device_open(struct net_device *dev)
+{
+ struct virt_wifi_netdev_priv *priv = netdev_priv(dev);
+
+ priv->is_up = true;
+ return 0;
+}
+
+/* Called with rtnl lock held. */
+static int virt_wifi_net_device_stop(struct net_device *dev)
+{
+ struct virt_wifi_netdev_priv *n_priv = netdev_priv(dev);
+ struct virt_wifi_wiphy_priv *w_priv;
+
+ n_priv->is_up = false;
+
+ if (!dev->ieee80211_ptr)
+ return 0;
+ w_priv = wiphy_priv(dev->ieee80211_ptr->wiphy);
+
+ virt_wifi_cancel_scan(dev->ieee80211_ptr->wiphy);
+ virt_wifi_cancel_connect(dev);
+ netif_carrier_off(dev);
+
+ return 0;
+}
+
+static const struct net_device_ops virt_wifi_ops = {
+ .ndo_start_xmit = virt_wifi_start_xmit,
+ .ndo_open = virt_wifi_net_device_open,
+ .ndo_stop = virt_wifi_net_device_stop,
+};
+
+/* Invoked as part of rtnl lock release. */
+static void virt_wifi_net_device_destructor(struct net_device *dev)
+{
+ /* Delayed past dellink to allow nl80211 to react to the device being
+ * deleted.
+ */
+ kfree(dev->ieee80211_ptr);
+ dev->ieee80211_ptr = NULL;
+ free_netdev(dev);
+}
+
+/* No lock interaction. */
+static void virt_wifi_setup(struct net_device *dev)
+{
+ ether_setup(dev);
+ dev->netdev_ops = &virt_wifi_ops;
+ dev->destructor = virt_wifi_net_device_destructor;
+}
+
+/* Called in a RCU read critical section from netif_receive_skb */
+static rx_handler_result_t virt_wifi_rx_handler(struct sk_buff **pskb)
+{
+ struct sk_buff *skb = *pskb;
+ struct virt_wifi_netdev_priv *priv =
+ rcu_dereference(skb->dev->rx_handler_data);
+
+ if (!priv->is_connected)
+ return RX_HANDLER_PASS;
+
+ /* GFP_ATOMIC because this is a packet interrupt handler. */
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb) {
+ dev_err(&priv->upperdev->dev, "can't skb_share_check\n");
+ return RX_HANDLER_CONSUMED;
+ }
+
+ *pskb = skb;
+ skb->dev = priv->upperdev;
+ skb->pkt_type = PACKET_HOST;
+ return RX_HANDLER_ANOTHER;
+}
+
+/* Called with rtnl lock held. */
+static int virt_wifi_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct virt_wifi_netdev_priv *priv = netdev_priv(dev);
+ int err;
+
+ if (!tb[IFLA_LINK])
+ return -EINVAL;
+
+ netif_carrier_off(dev);
+
+ priv->upperdev = dev;
+ priv->lowerdev = __dev_get_by_index(src_net,
+ nla_get_u32(tb[IFLA_LINK]));
+
+ if (!priv->lowerdev)
+ return -ENODEV;
+ if (!tb[IFLA_MTU])
+ dev->mtu = priv->lowerdev->mtu;
+ else if (dev->mtu > priv->lowerdev->mtu)
+ return -EINVAL;
+
+ err = netdev_rx_handler_register(priv->lowerdev, virt_wifi_rx_handler,
+ priv);
+ if (err) {
+ dev_err(&priv->lowerdev->dev,
+ "can't netdev_rx_handler_register: %d\n", err);
+ return err;
+ }
+
+ eth_hw_addr_inherit(dev, priv->lowerdev);
+ netif_stacked_transfer_operstate(priv->lowerdev, dev);
+
+ SET_NETDEV_DEV(dev, &priv->lowerdev->dev);
+ dev->ieee80211_ptr = kzalloc(sizeof(*dev->ieee80211_ptr), GFP_KERNEL);
+
+ if (!dev->ieee80211_ptr)
+ goto remove_handler;
+
+ dev->ieee80211_ptr->iftype = NL80211_IFTYPE_STATION;
+ dev->ieee80211_ptr->wiphy = common_wiphy;
+
+ err = register_netdevice(dev);
+ if (err) {
+ dev_err(&priv->lowerdev->dev, "can't register_netdevice: %d\n",
+ err);
+ goto free_wireless_dev;
+ }
+
+ err = netdev_upper_dev_link(priv->lowerdev, dev);
+ if (err) {
+ dev_err(&priv->lowerdev->dev, "can't netdev_upper_dev_link: %d\n",
+ err);
+ goto unregister_netdev;
+ }
+
+ priv->being_deleted = false;
+ priv->is_connected = false;
+ priv->is_up = false;
+ INIT_DELAYED_WORK(&priv->connect, virt_wifi_connect_complete);
+
+ return 0;
+unregister_netdev:
+ unregister_netdevice(dev);
+free_wireless_dev:
+ kfree(dev->ieee80211_ptr);
+ dev->ieee80211_ptr = NULL;
+remove_handler:
+ netdev_rx_handler_unregister(priv->lowerdev);
+
+ return err;
+}
+
+/* Called with rtnl lock held. */
+static void virt_wifi_dellink(struct net_device *dev,
+ struct list_head *head)
+{
+ struct virt_wifi_netdev_priv *priv = netdev_priv(dev);
+
+ if (dev->ieee80211_ptr)
+ virt_wifi_cancel_scan(dev->ieee80211_ptr->wiphy);
+
+ priv->being_deleted = true;
+ virt_wifi_cancel_connect(dev);
+ netif_carrier_off(dev);
+
+ netdev_rx_handler_unregister(priv->lowerdev);
+ netdev_upper_dev_unlink(priv->lowerdev, dev);
+
+ unregister_netdevice_queue(dev, head);
+
+ /* Deleting the wiphy is handled in the module destructor. */
+}
+
+static struct rtnl_link_ops virt_wifi_link_ops = {
+ .kind = "virt_wifi",
+ .setup = virt_wifi_setup,
+ .newlink = virt_wifi_newlink,
+ .dellink = virt_wifi_dellink,
+ .priv_size = sizeof(struct virt_wifi_netdev_priv),
+};
+
+/* Acquires and releases the rtnl lock. */
+static int __init virt_wifi_init_module(void)
+{
+ int err;
+
+ /* Guaranteed to be locallly-administered and not multicast. */
+ eth_random_addr(fake_router_bssid);
+
+ common_wiphy = virt_wifi_make_wiphy();
+ if (!common_wiphy)
+ return -ENOMEM;
+
+ err = rtnl_link_register(&virt_wifi_link_ops);
+ if (err)
+ virt_wifi_destroy_wiphy(common_wiphy);
+
+ return err;
+}
+
+/* Acquires and releases the rtnl lock. */
+static void __exit virt_wifi_cleanup_module(void)
+{
+ /* Will delete any devices that depend on the wiphy. */
+ rtnl_link_unregister(&virt_wifi_link_ops);
+ virt_wifi_destroy_wiphy(common_wiphy);
+}
+
+module_init(virt_wifi_init_module);
+module_exit(virt_wifi_cleanup_module);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Cody Schuffelen <schuffelen@google.com>");
+MODULE_DESCRIPTION("Driver for a wireless wrapper of ethernet devices");
+MODULE_ALIAS_RTNL_LINK("virt_wifi");
diff --git a/drivers/nfc/fdp/i2c.c b/drivers/nfc/fdp/i2c.c
index 712936f5d2d6..fbd26ecbf4a4 100644
--- a/drivers/nfc/fdp/i2c.c
+++ b/drivers/nfc/fdp/i2c.c
@@ -177,6 +177,16 @@ static int fdp_nci_i2c_read(struct fdp_i2c_phy *phy, struct sk_buff **skb)
/* Packet that contains a length */
if (tmp[0] == 0 && tmp[1] == 0) {
phy->next_read_size = (tmp[2] << 8) + tmp[3] + 3;
+ /*
+ * Ensure next_read_size does not exceed sizeof(tmp)
+ * for reading that many bytes during next iteration
+ */
+ if (phy->next_read_size > FDP_NCI_I2C_MAX_PAYLOAD) {
+ dev_dbg(&client->dev, "%s: corrupted packet\n",
+ __func__);
+ phy->next_read_size = 5;
+ goto flush;
+ }
} else {
phy->next_read_size = FDP_NCI_I2C_MIN_PAYLOAD;
diff --git a/drivers/nfc/st21nfca/dep.c b/drivers/nfc/st21nfca/dep.c
index 798a32bbac5d..206285210ab5 100644
--- a/drivers/nfc/st21nfca/dep.c
+++ b/drivers/nfc/st21nfca/dep.c
@@ -217,7 +217,8 @@ static int st21nfca_tm_recv_atr_req(struct nfc_hci_dev *hdev,
atr_req = (struct st21nfca_atr_req *)skb->data;
- if (atr_req->length < sizeof(struct st21nfca_atr_req)) {
+ if (atr_req->length < sizeof(struct st21nfca_atr_req) ||
+ atr_req->length > skb->len) {
r = -EPROTO;
goto exit;
}
diff --git a/drivers/nfc/st21nfca/se.c b/drivers/nfc/st21nfca/se.c
index 3a98563d4a12..6e84e120150d 100644
--- a/drivers/nfc/st21nfca/se.c
+++ b/drivers/nfc/st21nfca/se.c
@@ -320,23 +320,33 @@ int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host,
* AID 81 5 to 16
* PARAMETERS 82 0 to 255
*/
- if (skb->len < NFC_MIN_AID_LENGTH + 2 &&
+ if (skb->len < NFC_MIN_AID_LENGTH + 2 ||
skb->data[0] != NFC_EVT_TRANSACTION_AID_TAG)
return -EPROTO;
+ /*
+ * Buffer should have enough space for at least
+ * two tag fields + two length fields + aid_len (skb->data[1])
+ */
+ if (skb->len < skb->data[1] + 4)
+ return -EPROTO;
+
transaction = (struct nfc_evt_transaction *)devm_kzalloc(dev,
skb->len - 2, GFP_KERNEL);
transaction->aid_len = skb->data[1];
memcpy(transaction->aid, &skb->data[2],
transaction->aid_len);
+ transaction->params_len = skb->data[transaction->aid_len + 3];
- /* Check next byte is PARAMETERS tag (82) */
+ /* Check next byte is PARAMETERS tag (82) and the length field */
if (skb->data[transaction->aid_len + 2] !=
- NFC_EVT_TRANSACTION_PARAMS_TAG)
+ NFC_EVT_TRANSACTION_PARAMS_TAG ||
+ skb->len < transaction->aid_len + transaction->params_len + 4) {
+ devm_kfree(dev, transaction);
return -EPROTO;
+ }
- transaction->params_len = skb->data[transaction->aid_len + 3];
memcpy(transaction->params, skb->data +
transaction->aid_len + 4, transaction->params_len);
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index e9360d5cbcba..e37a2a597b2d 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -1063,42 +1063,66 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname,
return 0;
}
+/*
+ * Convert configs to something easy to use in C code
+ */
+#if defined(CONFIG_CMDLINE_FORCE)
+static const int overwrite_incoming_cmdline = 1;
+static const int read_dt_cmdline;
+static const int concat_cmdline;
+#elif defined(CONFIG_CMDLINE_EXTEND)
+static const int overwrite_incoming_cmdline;
+static const int read_dt_cmdline = 1;
+static const int concat_cmdline = 1;
+#else /* CMDLINE_FROM_BOOTLOADER */
+static const int overwrite_incoming_cmdline;
+static const int read_dt_cmdline = 1;
+static const int concat_cmdline;
+#endif
+
+#ifdef CONFIG_CMDLINE
+static const char *config_cmdline = CONFIG_CMDLINE;
+#else
+static const char *config_cmdline = "";
+#endif
+
int __init early_init_dt_scan_chosen(unsigned long node, const char *uname,
int depth, void *data)
{
- int l;
- const char *p;
+ int l = 0;
+ const char *p = NULL;
+ char *cmdline = data;
pr_debug("search \"chosen\", depth: %d, uname: %s\n", depth, uname);
- if (depth != 1 || !data ||
+ if (depth != 1 || !cmdline ||
(strcmp(uname, "chosen") != 0 && strcmp(uname, "chosen@0") != 0))
return 0;
early_init_dt_check_for_initrd(node);
- /* Retrieve command line */
- p = of_get_flat_dt_prop(node, "bootargs", &l);
- if (p != NULL && l > 0)
- strlcpy(data, p, min((int)l, COMMAND_LINE_SIZE));
-
- /*
- * CONFIG_CMDLINE is meant to be a default in case nothing else
- * managed to set the command line, unless CONFIG_CMDLINE_FORCE
- * is set in which case we override whatever was found earlier.
- */
-#ifdef CONFIG_CMDLINE
-#if defined(CONFIG_CMDLINE_EXTEND)
- strlcat(data, " ", COMMAND_LINE_SIZE);
- strlcat(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
-#elif defined(CONFIG_CMDLINE_FORCE)
- strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
-#else
- /* No arguments from boot loader, use kernel's cmdl*/
- if (!((char *)data)[0])
- strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
-#endif
-#endif /* CONFIG_CMDLINE */
+ /* Put CONFIG_CMDLINE in if forced or if data had nothing in it to start */
+ if (overwrite_incoming_cmdline || !cmdline[0])
+ strlcpy(cmdline, config_cmdline, COMMAND_LINE_SIZE);
+
+ /* Retrieve command line unless forcing */
+ if (read_dt_cmdline)
+ p = of_get_flat_dt_prop(node, "bootargs", &l);
+
+ if (p != NULL && l > 0) {
+ if (concat_cmdline) {
+ int cmdline_len;
+ int copy_len;
+ strlcat(cmdline, " ", COMMAND_LINE_SIZE);
+ cmdline_len = strlen(cmdline);
+ copy_len = COMMAND_LINE_SIZE - cmdline_len - 1;
+ copy_len = min((int)l, copy_len);
+ strncpy(cmdline + cmdline_len, p, copy_len);
+ cmdline[cmdline_len + copy_len] = '\0';
+ } else {
+ strlcpy(cmdline, p, min((int)l, COMMAND_LINE_SIZE));
+ }
+ }
pr_debug("Command line is: %s\n", (char*)data);
diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index 6643a7bc381c..e38bb48d3245 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -786,7 +786,8 @@ void pci_disable_link_state(struct pci_dev *pdev, int state)
}
EXPORT_SYMBOL(pci_disable_link_state);
-static int pcie_aspm_set_policy(const char *val, struct kernel_param *kp)
+static int pcie_aspm_set_policy(const char *val,
+ const struct kernel_param *kp)
{
int i;
struct pcie_link_state *link;
@@ -813,7 +814,7 @@ static int pcie_aspm_set_policy(const char *val, struct kernel_param *kp)
return 0;
}
-static int pcie_aspm_get_policy(char *buffer, struct kernel_param *kp)
+static int pcie_aspm_get_policy(char *buffer, const struct kernel_param *kp)
{
int i, cnt = 0;
for (i = 0; i < ARRAY_SIZE(policy_str); i++)
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index af82edc7fa5c..f51759da76f2 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -1018,7 +1018,7 @@ int arm_pmu_device_probe(struct platform_device *pdev,
const struct pmu_probe_info *probe_table)
{
const struct of_device_id *of_id;
- const int (*init_fn)(struct arm_pmu *);
+ int (*init_fn)(struct arm_pmu *);
struct device_node *node = pdev->dev.of_node;
struct arm_pmu *pmu;
int ret = -ENODEV;
diff --git a/drivers/platform/goldfish/Makefile b/drivers/platform/goldfish/Makefile
index d3487125838c..277a820ee4e1 100644
--- a/drivers/platform/goldfish/Makefile
+++ b/drivers/platform/goldfish/Makefile
@@ -2,4 +2,5 @@
# Makefile for Goldfish platform specific drivers
#
obj-$(CONFIG_GOLDFISH_BUS) += pdev_bus.o
-obj-$(CONFIG_GOLDFISH_PIPE) += goldfish_pipe.o
+obj-$(CONFIG_GOLDFISH_PIPE) += goldfish_pipe_all.o
+goldfish_pipe_all-objs := goldfish_pipe.o goldfish_pipe_v2.o
diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c
index 1aba2c74160e..91e0a5645799 100644
--- a/drivers/platform/goldfish/goldfish_pipe.c
+++ b/drivers/platform/goldfish/goldfish_pipe.c
@@ -15,52 +15,11 @@
*
*/
-/* This source file contains the implementation of a special device driver
- * that intends to provide a *very* fast communication channel between the
- * guest system and the QEMU emulator.
- *
- * Usage from the guest is simply the following (error handling simplified):
- *
- * int fd = open("/dev/qemu_pipe",O_RDWR);
- * .... write() or read() through the pipe.
- *
- * This driver doesn't deal with the exact protocol used during the session.
- * It is intended to be as simple as something like:
- *
- * // do this _just_ after opening the fd to connect to a specific
- * // emulator service.
- * const char* msg = "<pipename>";
- * if (write(fd, msg, strlen(msg)+1) < 0) {
- * ... could not connect to <pipename> service
- * close(fd);
- * }
- *
- * // after this, simply read() and write() to communicate with the
- * // service. Exact protocol details left as an exercise to the reader.
- *
- * This driver is very fast because it doesn't copy any data through
- * intermediate buffers, since the emulator is capable of translating
- * guest user addresses into host ones.
- *
- * Note that we must however ensure that each user page involved in the
- * exchange is properly mapped during a transfer.
+/* This source file contains the implementation of the legacy version of
+ * a goldfish pipe device driver. See goldfish_pipe_v2.c for the current
+ * version.
*/
-
-#include <linux/module.h>
-#include <linux/interrupt.h>
-#include <linux/kernel.h>
-#include <linux/spinlock.h>
-#include <linux/miscdevice.h>
-#include <linux/platform_device.h>
-#include <linux/poll.h>
-#include <linux/sched.h>
-#include <linux/bitops.h>
-#include <linux/slab.h>
-#include <linux/io.h>
-#include <linux/goldfish.h>
-#include <linux/dma-mapping.h>
-#include <linux/mm.h>
-#include <linux/acpi.h>
+#include "goldfish_pipe.h"
/*
* IMPORTANT: The following constants must match the ones used and defined
@@ -110,29 +69,15 @@
#define PIPE_WAKE_READ (1 << 1) /* pipe can now be read from */
#define PIPE_WAKE_WRITE (1 << 2) /* pipe can now be written to */
-struct access_params {
- unsigned long channel;
- u32 size;
- unsigned long address;
- u32 cmd;
- u32 result;
- /* reserved for future extension */
- u32 flags;
-};
+#define MAX_PAGES_TO_GRAB 32
-/* The global driver data. Holds a reference to the i/o page used to
- * communicate with the emulator, and a wake queue for blocked tasks
- * waiting to be awoken.
- */
-struct goldfish_pipe_dev {
- spinlock_t lock;
- unsigned char __iomem *base;
- struct access_params *aps;
- int irq;
- u32 version;
-};
+#define DEBUG 0
-static struct goldfish_pipe_dev pipe_dev[1];
+#if DEBUG
+#define DPRINT(...) { printk(KERN_ERR __VA_ARGS__); }
+#else
+#define DPRINT(...)
+#endif
/* This data type models a given pipe instance */
struct goldfish_pipe {
@@ -142,6 +87,15 @@ struct goldfish_pipe {
wait_queue_head_t wake_queue;
};
+struct access_params {
+ unsigned long channel;
+ u32 size;
+ unsigned long address;
+ u32 cmd;
+ u32 result;
+ /* reserved for future extension */
+ u32 flags;
+};
/* Bit flags for the 'flags' field */
enum {
@@ -231,8 +185,10 @@ static int setup_access_params_addr(struct platform_device *pdev,
if (valid_batchbuffer_addr(dev, aps)) {
dev->aps = aps;
return 0;
- } else
+ } else {
+ devm_kfree(&pdev->dev, aps);
return -1;
+ }
}
/* A value that will not be set by qemu emulator */
@@ -269,6 +225,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer,
struct goldfish_pipe *pipe = filp->private_data;
struct goldfish_pipe_dev *dev = pipe->dev;
unsigned long address, address_end;
+ struct page* pages[MAX_PAGES_TO_GRAB] = {};
int count = 0, ret = -EINVAL;
/* If the emulator already closed the pipe, no need to go further */
@@ -293,45 +250,61 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer,
while (address < address_end) {
unsigned long page_end = (address & PAGE_MASK) + PAGE_SIZE;
- unsigned long next = page_end < address_end ? page_end
- : address_end;
- unsigned long avail = next - address;
- int status, wakeBit;
- struct page *page;
-
- /* Either vaddr or paddr depending on the device version */
- unsigned long xaddr;
+ unsigned long next, avail;
+ int status, wakeBit, page_i, num_contiguous_pages;
+ long first_page, last_page, requested_pages;
+ unsigned long xaddr, xaddr_prev, xaddr_i;
/*
- * We grab the pages on a page-by-page basis in case user
- * space gives us a potentially huge buffer but the read only
- * returns a small amount, then there's no need to pin that
- * much memory to the process.
+ * Attempt to grab multiple physically contiguous pages.
*/
- down_read(&current->mm->mmap_sem);
- ret = get_user_pages(address, 1, is_write ? 0 : FOLL_WRITE,
- &page, NULL);
- up_read(&current->mm->mmap_sem);
- if (ret < 0)
- break;
+ first_page = address & PAGE_MASK;
+ last_page = (address_end - 1) & PAGE_MASK;
+ requested_pages = ((last_page - first_page) >> PAGE_SHIFT) + 1;
+ if (requested_pages > MAX_PAGES_TO_GRAB) {
+ requested_pages = MAX_PAGES_TO_GRAB;
+ }
+ ret = get_user_pages_fast(first_page, requested_pages,
+ !is_write, pages);
+
+ DPRINT("%s: requested pages: %d %d %p\n", __FUNCTION__,
+ ret, requested_pages, first_page);
+ if (ret == 0) {
+ DPRINT("%s: error: (requested pages == 0) (wanted %d)\n",
+ __FUNCTION__, requested_pages);
+ mutex_unlock(&pipe->lock);
+ return ret;
+ }
+ if (ret < 0) {
+ DPRINT("%s: (requested pages < 0) %d \n",
+ __FUNCTION__, requested_pages);
+ mutex_unlock(&pipe->lock);
+ return ret;
+ }
- if (dev->version) {
- /* Device version 1 or newer (qemu-android) expects the
- * physical address.
- */
- xaddr = page_to_phys(page) | (address & ~PAGE_MASK);
- } else {
- /* Device version 0 (classic emulator) expects the
- * virtual address.
- */
- xaddr = address;
+ xaddr = page_to_phys(pages[0]) | (address & ~PAGE_MASK);
+ xaddr_prev = xaddr;
+ num_contiguous_pages = ret == 0 ? 0 : 1;
+ for (page_i = 1; page_i < ret; page_i++) {
+ xaddr_i = page_to_phys(pages[page_i]) | (address & ~PAGE_MASK);
+ if (xaddr_i == xaddr_prev + PAGE_SIZE) {
+ page_end += PAGE_SIZE;
+ xaddr_prev = xaddr_i;
+ num_contiguous_pages++;
+ } else {
+ DPRINT("%s: discontinuous page boundary: %d pages instead\n",
+ __FUNCTION__, page_i);
+ break;
+ }
}
+ next = page_end < address_end ? page_end : address_end;
+ avail = next - address;
/* Now, try to transfer the bytes in the current page */
spin_lock_irqsave(&dev->lock, irq_flags);
if (access_with_param(dev,
- is_write ? CMD_WRITE_BUFFER : CMD_READ_BUFFER,
- xaddr, avail, pipe, &status)) {
+ is_write ? CMD_WRITE_BUFFER : CMD_READ_BUFFER,
+ xaddr, avail, pipe, &status)) {
gf_write_ptr(pipe, dev->base + PIPE_REG_CHANNEL,
dev->base + PIPE_REG_CHANNEL_HIGH);
writel(avail, dev->base + PIPE_REG_SIZE);
@@ -344,9 +317,13 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer,
}
spin_unlock_irqrestore(&dev->lock, irq_flags);
- if (status > 0 && !is_write)
- set_page_dirty(page);
- put_page(page);
+ for (page_i = 0; page_i < ret; page_i++) {
+ if (status > 0 && !is_write &&
+ page_i < num_contiguous_pages) {
+ set_page_dirty(pages[page_i]);
+ }
+ put_page(pages[page_i]);
+ }
if (status > 0) { /* Correct transfer */
count += status;
@@ -368,7 +345,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer,
*/
if (status != PIPE_ERROR_AGAIN)
pr_info_ratelimited("goldfish_pipe: backend returned error %d on %s\n",
- status, is_write ? "write" : "read");
+ status, is_write ? "write" : "read");
ret = 0;
break;
}
@@ -378,7 +355,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer,
* non-blocking mode, just return the error code.
*/
if (status != PIPE_ERROR_AGAIN ||
- (filp->f_flags & O_NONBLOCK) != 0) {
+ (filp->f_flags & O_NONBLOCK) != 0) {
ret = goldfish_pipe_error_convert(status);
break;
}
@@ -392,7 +369,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer,
/* Tell the emulator we're going to wait for a wake event */
goldfish_cmd(pipe,
- is_write ? CMD_WAKE_ON_WRITE : CMD_WAKE_ON_READ);
+ is_write ? CMD_WAKE_ON_WRITE : CMD_WAKE_ON_READ);
/* Unlock the pipe, then wait for the wake signal */
mutex_unlock(&pipe->lock);
@@ -538,6 +515,8 @@ static int goldfish_pipe_open(struct inode *inode, struct file *file)
pipe->dev = dev;
mutex_init(&pipe->lock);
+ DPRINT("%s: call. pipe_dev pipe_dev=0x%lx new_pipe_addr=0x%lx file=0x%lx\n", __FUNCTION__, pipe_dev, pipe, file);
+ // spin lock init, write head of list, i guess
init_waitqueue_head(&pipe->wake_queue);
/*
@@ -560,6 +539,7 @@ static int goldfish_pipe_release(struct inode *inode, struct file *filp)
{
struct goldfish_pipe *pipe = filp->private_data;
+ DPRINT("%s: call. pipe=0x%lx file=0x%lx\n", __FUNCTION__, pipe, filp);
/* The guest is closing the channel, so tell the emulator right now */
goldfish_cmd(pipe, CMD_CLOSE);
kfree(pipe);
@@ -576,98 +556,33 @@ static const struct file_operations goldfish_pipe_fops = {
.release = goldfish_pipe_release,
};
-static struct miscdevice goldfish_pipe_device = {
+static struct miscdevice goldfish_pipe_dev = {
.minor = MISC_DYNAMIC_MINOR,
.name = "goldfish_pipe",
.fops = &goldfish_pipe_fops,
};
-static int goldfish_pipe_probe(struct platform_device *pdev)
+int goldfish_pipe_device_init_v1(struct platform_device *pdev)
{
- int err;
- struct resource *r;
struct goldfish_pipe_dev *dev = pipe_dev;
-
- /* not thread safe, but this should not happen */
- WARN_ON(dev->base != NULL);
-
- spin_lock_init(&dev->lock);
-
- r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- if (r == NULL || resource_size(r) < PAGE_SIZE) {
- dev_err(&pdev->dev, "can't allocate i/o page\n");
- return -EINVAL;
- }
- dev->base = devm_ioremap(&pdev->dev, r->start, PAGE_SIZE);
- if (dev->base == NULL) {
- dev_err(&pdev->dev, "ioremap failed\n");
- return -EINVAL;
- }
-
- r = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
- if (r == NULL) {
- err = -EINVAL;
- goto error;
- }
- dev->irq = r->start;
-
- err = devm_request_irq(&pdev->dev, dev->irq, goldfish_pipe_interrupt,
+ int err = devm_request_irq(&pdev->dev, dev->irq, goldfish_pipe_interrupt,
IRQF_SHARED, "goldfish_pipe", dev);
if (err) {
- dev_err(&pdev->dev, "unable to allocate IRQ\n");
- goto error;
+ dev_err(&pdev->dev, "unable to allocate IRQ for v1\n");
+ return err;
}
- err = misc_register(&goldfish_pipe_device);
+ err = misc_register(&goldfish_pipe_dev);
if (err) {
- dev_err(&pdev->dev, "unable to register device\n");
- goto error;
+ dev_err(&pdev->dev, "unable to register v1 device\n");
+ return err;
}
- setup_access_params_addr(pdev, dev);
- /* Although the pipe device in the classic Android emulator does not
- * recognize the 'version' register, it won't treat this as an error
- * either and will simply return 0, which is fine.
- */
- dev->version = readl(dev->base + PIPE_REG_VERSION);
+ setup_access_params_addr(pdev, dev);
return 0;
-
-error:
- dev->base = NULL;
- return err;
}
-static int goldfish_pipe_remove(struct platform_device *pdev)
+void goldfish_pipe_device_deinit_v1(struct platform_device *pdev)
{
- struct goldfish_pipe_dev *dev = pipe_dev;
- misc_deregister(&goldfish_pipe_device);
- dev->base = NULL;
- return 0;
+ misc_deregister(&goldfish_pipe_dev);
}
-
-static const struct acpi_device_id goldfish_pipe_acpi_match[] = {
- { "GFSH0003", 0 },
- { },
-};
-MODULE_DEVICE_TABLE(acpi, goldfish_pipe_acpi_match);
-
-static const struct of_device_id goldfish_pipe_of_match[] = {
- { .compatible = "google,android-pipe", },
- {},
-};
-MODULE_DEVICE_TABLE(of, goldfish_pipe_of_match);
-
-static struct platform_driver goldfish_pipe = {
- .probe = goldfish_pipe_probe,
- .remove = goldfish_pipe_remove,
- .driver = {
- .name = "goldfish_pipe",
- .owner = THIS_MODULE,
- .of_match_table = goldfish_pipe_of_match,
- .acpi_match_table = ACPI_PTR(goldfish_pipe_acpi_match),
- }
-};
-
-module_platform_driver(goldfish_pipe);
-MODULE_AUTHOR("David Turner <digit@google.com>");
-MODULE_LICENSE("GPL");
diff --git a/drivers/platform/goldfish/goldfish_pipe.h b/drivers/platform/goldfish/goldfish_pipe.h
new file mode 100644
index 000000000000..6cd1b63be8c9
--- /dev/null
+++ b/drivers/platform/goldfish/goldfish_pipe.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#ifndef GOLDFISH_PIPE_H
+#define GOLDFISH_PIPE_H
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/miscdevice.h>
+#include <linux/platform_device.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/goldfish.h>
+#include <linux/dma-mapping.h>
+#include <linux/mm.h>
+#include <linux/acpi.h>
+
+
+/* Initialize the legacy version of the pipe device driver */
+int goldfish_pipe_device_init_v1(struct platform_device *pdev);
+
+/* Deinitialize the legacy version of the pipe device driver */
+void goldfish_pipe_device_deinit_v1(struct platform_device *pdev);
+
+/* Forward declarations for the device struct */
+struct goldfish_pipe;
+struct goldfish_pipe_device_buffers;
+
+/* The global driver data. Holds a reference to the i/o page used to
+ * communicate with the emulator, and a wake queue for blocked tasks
+ * waiting to be awoken.
+ */
+struct goldfish_pipe_dev {
+ /*
+ * Global device spinlock. Protects the following members:
+ * - pipes, pipes_capacity
+ * - [*pipes, *pipes + pipes_capacity) - array data
+ * - first_signalled_pipe,
+ * goldfish_pipe::prev_signalled,
+ * goldfish_pipe::next_signalled,
+ * goldfish_pipe::signalled_flags - all singnalled-related fields,
+ * in all allocated pipes
+ * - open_command_params - PIPE_CMD_OPEN-related buffers
+ *
+ * It looks like a lot of different fields, but the trick is that the only
+ * operation that happens often is the signalled pipes array manipulation.
+ * That's why it's OK for now to keep the rest of the fields under the same
+ * lock. If we notice too much contention because of PIPE_CMD_OPEN,
+ * then we should add a separate lock there.
+ */
+ spinlock_t lock;
+
+ /*
+ * Array of the pipes of |pipes_capacity| elements,
+ * indexed by goldfish_pipe::id
+ */
+ struct goldfish_pipe **pipes;
+ u32 pipes_capacity;
+
+ /* Pointers to the buffers host uses for interaction with this driver */
+ struct goldfish_pipe_dev_buffers *buffers;
+
+ /* Head of a doubly linked list of signalled pipes */
+ struct goldfish_pipe *first_signalled_pipe;
+
+ /* Some device-specific data */
+ int irq;
+ int version;
+ unsigned char __iomem *base;
+
+ /* v1-specific access parameters */
+ struct access_params *aps;
+};
+
+extern struct goldfish_pipe_dev pipe_dev[1];
+
+#endif /* GOLDFISH_PIPE_H */
diff --git a/drivers/platform/goldfish/goldfish_pipe_v2.c b/drivers/platform/goldfish/goldfish_pipe_v2.c
new file mode 100644
index 000000000000..ad373ed36555
--- /dev/null
+++ b/drivers/platform/goldfish/goldfish_pipe_v2.c
@@ -0,0 +1,889 @@
+/*
+ * Copyright (C) 2012 Intel, Inc.
+ * Copyright (C) 2013 Intel, Inc.
+ * Copyright (C) 2014 Linaro Limited
+ * Copyright (C) 2011-2016 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/* This source file contains the implementation of a special device driver
+ * that intends to provide a *very* fast communication channel between the
+ * guest system and the QEMU emulator.
+ *
+ * Usage from the guest is simply the following (error handling simplified):
+ *
+ * int fd = open("/dev/qemu_pipe",O_RDWR);
+ * .... write() or read() through the pipe.
+ *
+ * This driver doesn't deal with the exact protocol used during the session.
+ * It is intended to be as simple as something like:
+ *
+ * // do this _just_ after opening the fd to connect to a specific
+ * // emulator service.
+ * const char* msg = "<pipename>";
+ * if (write(fd, msg, strlen(msg)+1) < 0) {
+ * ... could not connect to <pipename> service
+ * close(fd);
+ * }
+ *
+ * // after this, simply read() and write() to communicate with the
+ * // service. Exact protocol details left as an exercise to the reader.
+ *
+ * This driver is very fast because it doesn't copy any data through
+ * intermediate buffers, since the emulator is capable of translating
+ * guest user addresses into host ones.
+ *
+ * Note that we must however ensure that each user page involved in the
+ * exchange is properly mapped during a transfer.
+ */
+
+#include "goldfish_pipe.h"
+
+
+/*
+ * Update this when something changes in the driver's behavior so the host
+ * can benefit from knowing it
+ */
+enum {
+ PIPE_DRIVER_VERSION = 2,
+ PIPE_CURRENT_DEVICE_VERSION = 2
+};
+
+/*
+ * IMPORTANT: The following constants must match the ones used and defined
+ * in external/qemu/hw/goldfish_pipe.c in the Android source tree.
+ */
+
+/* List of bitflags returned in status of CMD_POLL command */
+enum PipePollFlags {
+ PIPE_POLL_IN = 1 << 0,
+ PIPE_POLL_OUT = 1 << 1,
+ PIPE_POLL_HUP = 1 << 2
+};
+
+/* Possible status values used to signal errors - see goldfish_pipe_error_convert */
+enum PipeErrors {
+ PIPE_ERROR_INVAL = -1,
+ PIPE_ERROR_AGAIN = -2,
+ PIPE_ERROR_NOMEM = -3,
+ PIPE_ERROR_IO = -4
+};
+
+/* Bit-flags used to signal events from the emulator */
+enum PipeWakeFlags {
+ PIPE_WAKE_CLOSED = 1 << 0, /* emulator closed pipe */
+ PIPE_WAKE_READ = 1 << 1, /* pipe can now be read from */
+ PIPE_WAKE_WRITE = 1 << 2 /* pipe can now be written to */
+};
+
+/* Bit flags for the 'flags' field */
+enum PipeFlagsBits {
+ BIT_CLOSED_ON_HOST = 0, /* pipe closed by host */
+ BIT_WAKE_ON_WRITE = 1, /* want to be woken on writes */
+ BIT_WAKE_ON_READ = 2, /* want to be woken on reads */
+};
+
+enum PipeRegs {
+ PIPE_REG_CMD = 0,
+
+ PIPE_REG_SIGNAL_BUFFER_HIGH = 4,
+ PIPE_REG_SIGNAL_BUFFER = 8,
+ PIPE_REG_SIGNAL_BUFFER_COUNT = 12,
+
+ PIPE_REG_OPEN_BUFFER_HIGH = 20,
+ PIPE_REG_OPEN_BUFFER = 24,
+
+ PIPE_REG_VERSION = 36,
+
+ PIPE_REG_GET_SIGNALLED = 48,
+};
+
+enum PipeCmdCode {
+ PIPE_CMD_OPEN = 1, /* to be used by the pipe device itself */
+ PIPE_CMD_CLOSE,
+ PIPE_CMD_POLL,
+ PIPE_CMD_WRITE,
+ PIPE_CMD_WAKE_ON_WRITE,
+ PIPE_CMD_READ,
+ PIPE_CMD_WAKE_ON_READ,
+
+ /*
+ * TODO(zyy): implement a deferred read/write execution to allow parallel
+ * processing of pipe operations on the host.
+ */
+ PIPE_CMD_WAKE_ON_DONE_IO,
+};
+
+enum {
+ MAX_BUFFERS_PER_COMMAND = 336,
+ MAX_SIGNALLED_PIPES = 64,
+ INITIAL_PIPES_CAPACITY = 64
+};
+
+struct goldfish_pipe_dev;
+struct goldfish_pipe;
+struct goldfish_pipe_command;
+
+/* A per-pipe command structure, shared with the host */
+struct goldfish_pipe_command {
+ s32 cmd; /* PipeCmdCode, guest -> host */
+ s32 id; /* pipe id, guest -> host */
+ s32 status; /* command execution status, host -> guest */
+ s32 reserved; /* to pad to 64-bit boundary */
+ union {
+ /* Parameters for PIPE_CMD_{READ,WRITE} */
+ struct {
+ u32 buffers_count; /* number of buffers, guest -> host */
+ s32 consumed_size; /* number of consumed bytes, host -> guest */
+ u64 ptrs[MAX_BUFFERS_PER_COMMAND]; /* buffer pointers, guest -> host */
+ u32 sizes[MAX_BUFFERS_PER_COMMAND]; /* buffer sizes, guest -> host */
+ } rw_params;
+ };
+};
+
+/* A single signalled pipe information */
+struct signalled_pipe_buffer {
+ u32 id;
+ u32 flags;
+};
+
+/* Parameters for the PIPE_CMD_OPEN command */
+struct open_command_param {
+ u64 command_buffer_ptr;
+ u32 rw_params_max_count;
+};
+
+/* Device-level set of buffers shared with the host */
+struct goldfish_pipe_dev_buffers {
+ struct open_command_param open_command_params;
+ struct signalled_pipe_buffer signalled_pipe_buffers[MAX_SIGNALLED_PIPES];
+};
+
+/* This data type models a given pipe instance */
+struct goldfish_pipe {
+ u32 id; /* pipe ID - index into goldfish_pipe_dev::pipes array */
+ unsigned long flags; /* The wake flags pipe is waiting for
+ * Note: not protected with any lock, uses atomic operations
+ * and barriers to make it thread-safe.
+ */
+ unsigned long signalled_flags; /* wake flags host have signalled,
+ * - protected by goldfish_pipe_dev::lock */
+
+ struct goldfish_pipe_command *command_buffer; /* A pointer to command buffer */
+
+ /* doubly linked list of signalled pipes, protected by goldfish_pipe_dev::lock */
+ struct goldfish_pipe *prev_signalled;
+ struct goldfish_pipe *next_signalled;
+
+ /*
+ * A pipe's own lock. Protects the following:
+ * - *command_buffer - makes sure a command can safely write its parameters
+ * to the host and read the results back.
+ */
+ struct mutex lock;
+
+ wait_queue_head_t wake_queue; /* A wake queue for sleeping until host signals an event */
+ struct goldfish_pipe_dev *dev; /* Pointer to the parent goldfish_pipe_dev instance */
+};
+
+struct goldfish_pipe_dev pipe_dev[1] = {};
+
+static int goldfish_cmd_locked(struct goldfish_pipe *pipe, enum PipeCmdCode cmd)
+{
+ pipe->command_buffer->cmd = cmd;
+ pipe->command_buffer->status = PIPE_ERROR_INVAL; /* failure by default */
+ writel(pipe->id, pipe->dev->base + PIPE_REG_CMD);
+ return pipe->command_buffer->status;
+}
+
+static int goldfish_cmd(struct goldfish_pipe *pipe, enum PipeCmdCode cmd)
+{
+ int status;
+ if (mutex_lock_interruptible(&pipe->lock))
+ return PIPE_ERROR_IO;
+ status = goldfish_cmd_locked(pipe, cmd);
+ mutex_unlock(&pipe->lock);
+ return status;
+}
+
+/*
+ * This function converts an error code returned by the emulator through
+ * the PIPE_REG_STATUS i/o register into a valid negative errno value.
+ */
+static int goldfish_pipe_error_convert(int status)
+{
+ switch (status) {
+ case PIPE_ERROR_AGAIN:
+ return -EAGAIN;
+ case PIPE_ERROR_NOMEM:
+ return -ENOMEM;
+ case PIPE_ERROR_IO:
+ return -EIO;
+ default:
+ return -EINVAL;
+ }
+}
+
+static int pin_user_pages(unsigned long first_page, unsigned long last_page,
+ unsigned last_page_size, int is_write,
+ struct page *pages[MAX_BUFFERS_PER_COMMAND], unsigned *iter_last_page_size)
+{
+ int ret;
+ int requested_pages = ((last_page - first_page) >> PAGE_SHIFT) + 1;
+ if (requested_pages > MAX_BUFFERS_PER_COMMAND) {
+ requested_pages = MAX_BUFFERS_PER_COMMAND;
+ *iter_last_page_size = PAGE_SIZE;
+ } else {
+ *iter_last_page_size = last_page_size;
+ }
+
+ ret = get_user_pages_fast(
+ first_page, requested_pages, !is_write, pages);
+ if (ret <= 0)
+ return -EFAULT;
+ if (ret < requested_pages)
+ *iter_last_page_size = PAGE_SIZE;
+ return ret;
+
+}
+
+static void release_user_pages(struct page **pages, int pages_count,
+ int is_write, s32 consumed_size)
+{
+ int i;
+ for (i = 0; i < pages_count; i++) {
+ if (!is_write && consumed_size > 0) {
+ set_page_dirty(pages[i]);
+ }
+ put_page(pages[i]);
+ }
+}
+
+/* Populate the call parameters, merging adjacent pages together */
+static void populate_rw_params(
+ struct page **pages, int pages_count,
+ unsigned long address, unsigned long address_end,
+ unsigned long first_page, unsigned long last_page,
+ unsigned iter_last_page_size, int is_write,
+ struct goldfish_pipe_command *command)
+{
+ /*
+ * Process the first page separately - it's the only page that
+ * needs special handling for its start address.
+ */
+ unsigned long xaddr = page_to_phys(pages[0]);
+ unsigned long xaddr_prev = xaddr;
+ int buffer_idx = 0;
+ int i = 1;
+ int size_on_page = first_page == last_page
+ ? (int)(address_end - address)
+ : (PAGE_SIZE - (address & ~PAGE_MASK));
+ command->rw_params.ptrs[0] = (u64)(xaddr | (address & ~PAGE_MASK));
+ command->rw_params.sizes[0] = size_on_page;
+ for (; i < pages_count; ++i) {
+ xaddr = page_to_phys(pages[i]);
+ size_on_page = (i == pages_count - 1) ? iter_last_page_size : PAGE_SIZE;
+ if (xaddr == xaddr_prev + PAGE_SIZE) {
+ command->rw_params.sizes[buffer_idx] += size_on_page;
+ } else {
+ ++buffer_idx;
+ command->rw_params.ptrs[buffer_idx] = (u64)xaddr;
+ command->rw_params.sizes[buffer_idx] = size_on_page;
+ }
+ xaddr_prev = xaddr;
+ }
+ command->rw_params.buffers_count = buffer_idx + 1;
+}
+
+static int transfer_max_buffers(struct goldfish_pipe* pipe,
+ unsigned long address, unsigned long address_end, int is_write,
+ unsigned long last_page, unsigned int last_page_size,
+ s32* consumed_size, int* status)
+{
+ struct page *pages[MAX_BUFFERS_PER_COMMAND];
+ unsigned long first_page = address & PAGE_MASK;
+ unsigned int iter_last_page_size;
+ int pages_count = pin_user_pages(first_page, last_page,
+ last_page_size, is_write,
+ pages, &iter_last_page_size);
+ if (pages_count < 0)
+ return pages_count;
+
+ /* Serialize access to the pipe command buffers */
+ if (mutex_lock_interruptible(&pipe->lock))
+ return -ERESTARTSYS;
+
+ populate_rw_params(pages, pages_count, address, address_end,
+ first_page, last_page, iter_last_page_size, is_write,
+ pipe->command_buffer);
+
+ /* Transfer the data */
+ *status = goldfish_cmd_locked(pipe,
+ is_write ? PIPE_CMD_WRITE : PIPE_CMD_READ);
+
+ *consumed_size = pipe->command_buffer->rw_params.consumed_size;
+
+ mutex_unlock(&pipe->lock);
+
+ release_user_pages(pages, pages_count, is_write, *consumed_size);
+
+ return 0;
+}
+
+static int wait_for_host_signal(struct goldfish_pipe *pipe, int is_write)
+{
+ u32 wakeBit = is_write ? BIT_WAKE_ON_WRITE : BIT_WAKE_ON_READ;
+ set_bit(wakeBit, &pipe->flags);
+
+ /* Tell the emulator we're going to wait for a wake event */
+ (void)goldfish_cmd(pipe,
+ is_write ? PIPE_CMD_WAKE_ON_WRITE : PIPE_CMD_WAKE_ON_READ);
+
+ while (test_bit(wakeBit, &pipe->flags)) {
+ if (wait_event_interruptible(
+ pipe->wake_queue,
+ !test_bit(wakeBit, &pipe->flags)))
+ return -ERESTARTSYS;
+
+ if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags))
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static ssize_t goldfish_pipe_read_write(struct file *filp,
+ char __user *buffer, size_t bufflen, int is_write)
+{
+ struct goldfish_pipe *pipe = filp->private_data;
+ int count = 0, ret = -EINVAL;
+ unsigned long address, address_end, last_page;
+ unsigned int last_page_size;
+
+ /* If the emulator already closed the pipe, no need to go further */
+ if (unlikely(test_bit(BIT_CLOSED_ON_HOST, &pipe->flags)))
+ return -EIO;
+ /* Null reads or writes succeeds */
+ if (unlikely(bufflen == 0))
+ return 0;
+ /* Check the buffer range for access */
+ if (unlikely(!access_ok(is_write ? VERIFY_WRITE : VERIFY_READ,
+ buffer, bufflen)))
+ return -EFAULT;
+
+ address = (unsigned long)buffer;
+ address_end = address + bufflen;
+ last_page = (address_end - 1) & PAGE_MASK;
+ last_page_size = ((address_end - 1) & ~PAGE_MASK) + 1;
+
+ while (address < address_end) {
+ s32 consumed_size;
+ int status;
+ ret = transfer_max_buffers(pipe, address, address_end, is_write,
+ last_page, last_page_size, &consumed_size, &status);
+ if (ret < 0)
+ break;
+
+ if (consumed_size > 0) {
+ /* No matter what's the status, we've transfered something */
+ count += consumed_size;
+ address += consumed_size;
+ }
+ if (status > 0)
+ continue;
+ if (status == 0) {
+ /* EOF */
+ ret = 0;
+ break;
+ }
+ if (count > 0) {
+ /*
+ * An error occured, but we already transfered
+ * something on one of the previous iterations.
+ * Just return what we already copied and log this
+ * err.
+ */
+ if (status != PIPE_ERROR_AGAIN)
+ pr_info_ratelimited("goldfish_pipe: backend error %d on %s\n",
+ status, is_write ? "write" : "read");
+ break;
+ }
+
+ /*
+ * If the error is not PIPE_ERROR_AGAIN, or if we are in
+ * non-blocking mode, just return the error code.
+ */
+ if (status != PIPE_ERROR_AGAIN || (filp->f_flags & O_NONBLOCK) != 0) {
+ ret = goldfish_pipe_error_convert(status);
+ break;
+ }
+
+ status = wait_for_host_signal(pipe, is_write);
+ if (status < 0)
+ return status;
+ }
+
+ if (count > 0)
+ return count;
+ return ret;
+}
+
+static ssize_t goldfish_pipe_read(struct file *filp, char __user *buffer,
+ size_t bufflen, loff_t *ppos)
+{
+ return goldfish_pipe_read_write(filp, buffer, bufflen, /* is_write */ 0);
+}
+
+static ssize_t goldfish_pipe_write(struct file *filp,
+ const char __user *buffer, size_t bufflen,
+ loff_t *ppos)
+{
+ return goldfish_pipe_read_write(filp,
+ /* cast away the const */(char __user *)buffer, bufflen,
+ /* is_write */ 1);
+}
+
+static unsigned int goldfish_pipe_poll(struct file *filp, poll_table *wait)
+{
+ struct goldfish_pipe *pipe = filp->private_data;
+ unsigned int mask = 0;
+ int status;
+
+ poll_wait(filp, &pipe->wake_queue, wait);
+
+ status = goldfish_cmd(pipe, PIPE_CMD_POLL);
+ if (status < 0) {
+ return -ERESTARTSYS;
+ }
+
+ if (status & PIPE_POLL_IN)
+ mask |= POLLIN | POLLRDNORM;
+ if (status & PIPE_POLL_OUT)
+ mask |= POLLOUT | POLLWRNORM;
+ if (status & PIPE_POLL_HUP)
+ mask |= POLLHUP;
+ if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags))
+ mask |= POLLERR;
+
+ return mask;
+}
+
+static void signalled_pipes_add_locked(struct goldfish_pipe_dev *dev,
+ u32 id, u32 flags)
+{
+ struct goldfish_pipe *pipe;
+
+ BUG_ON(id >= dev->pipes_capacity);
+
+ pipe = dev->pipes[id];
+ if (!pipe)
+ return;
+ pipe->signalled_flags |= flags;
+
+ if (pipe->prev_signalled || pipe->next_signalled
+ || dev->first_signalled_pipe == pipe)
+ return; /* already in the list */
+ pipe->next_signalled = dev->first_signalled_pipe;
+ if (dev->first_signalled_pipe) {
+ dev->first_signalled_pipe->prev_signalled = pipe;
+ }
+ dev->first_signalled_pipe = pipe;
+}
+
+static void signalled_pipes_remove_locked(struct goldfish_pipe_dev *dev,
+ struct goldfish_pipe *pipe) {
+ if (pipe->prev_signalled)
+ pipe->prev_signalled->next_signalled = pipe->next_signalled;
+ if (pipe->next_signalled)
+ pipe->next_signalled->prev_signalled = pipe->prev_signalled;
+ if (pipe == dev->first_signalled_pipe)
+ dev->first_signalled_pipe = pipe->next_signalled;
+ pipe->prev_signalled = NULL;
+ pipe->next_signalled = NULL;
+}
+
+static struct goldfish_pipe *signalled_pipes_pop_front(struct goldfish_pipe_dev *dev,
+ int *wakes)
+{
+ struct goldfish_pipe *pipe;
+ unsigned long flags;
+ spin_lock_irqsave(&dev->lock, flags);
+
+ pipe = dev->first_signalled_pipe;
+ if (pipe) {
+ *wakes = pipe->signalled_flags;
+ pipe->signalled_flags = 0;
+ /*
+ * This is an optimized version of signalled_pipes_remove_locked() -
+ * we want to make it as fast as possible to wake the sleeping pipe
+ * operations faster
+ */
+ dev->first_signalled_pipe = pipe->next_signalled;
+ if (dev->first_signalled_pipe)
+ dev->first_signalled_pipe->prev_signalled = NULL;
+ pipe->next_signalled = NULL;
+ }
+
+ spin_unlock_irqrestore(&dev->lock, flags);
+ return pipe;
+}
+
+static void goldfish_interrupt_task(unsigned long unused)
+{
+ struct goldfish_pipe_dev *dev = pipe_dev;
+ /* Iterate over the signalled pipes and wake them one by one */
+ struct goldfish_pipe *pipe;
+ int wakes;
+ while ((pipe = signalled_pipes_pop_front(dev, &wakes)) != NULL) {
+ if (wakes & PIPE_WAKE_CLOSED) {
+ pipe->flags = 1 << BIT_CLOSED_ON_HOST;
+ } else {
+ if (wakes & PIPE_WAKE_READ)
+ clear_bit(BIT_WAKE_ON_READ, &pipe->flags);
+ if (wakes & PIPE_WAKE_WRITE)
+ clear_bit(BIT_WAKE_ON_WRITE, &pipe->flags);
+ }
+ /*
+ * wake_up_interruptible() implies a write barrier, so don't explicitly
+ * add another one here.
+ */
+ wake_up_interruptible(&pipe->wake_queue);
+ }
+}
+DECLARE_TASKLET(goldfish_interrupt_tasklet, goldfish_interrupt_task, 0);
+
+/*
+ * The general idea of the interrupt handling:
+ *
+ * 1. device raises an interrupt if there's at least one signalled pipe
+ * 2. IRQ handler reads the signalled pipes and their count from the device
+ * 3. device writes them into a shared buffer and returns the count
+ * it only resets the IRQ if it has returned all signalled pipes,
+ * otherwise it leaves it raised, so IRQ handler will be called
+ * again for the next chunk
+ * 4. IRQ handler adds all returned pipes to the device's signalled pipes list
+ * 5. IRQ handler launches a tasklet to process the signalled pipes from the
+ * list in a separate context
+ */
+static irqreturn_t goldfish_pipe_interrupt(int irq, void *dev_id)
+{
+ u32 count;
+ u32 i;
+ unsigned long flags;
+ struct goldfish_pipe_dev *dev = dev_id;
+ if (dev != pipe_dev)
+ return IRQ_NONE;
+
+ /* Request the signalled pipes from the device */
+ spin_lock_irqsave(&dev->lock, flags);
+
+ count = readl(dev->base + PIPE_REG_GET_SIGNALLED);
+ if (count == 0) {
+ spin_unlock_irqrestore(&dev->lock, flags);
+ return IRQ_NONE;
+ }
+ if (count > MAX_SIGNALLED_PIPES)
+ count = MAX_SIGNALLED_PIPES;
+
+ for (i = 0; i < count; ++i)
+ signalled_pipes_add_locked(dev,
+ dev->buffers->signalled_pipe_buffers[i].id,
+ dev->buffers->signalled_pipe_buffers[i].flags);
+
+ spin_unlock_irqrestore(&dev->lock, flags);
+
+ tasklet_schedule(&goldfish_interrupt_tasklet);
+ return IRQ_HANDLED;
+}
+
+static int get_free_pipe_id_locked(struct goldfish_pipe_dev *dev)
+{
+ int id;
+ for (id = 0; id < dev->pipes_capacity; ++id)
+ if (!dev->pipes[id])
+ return id;
+
+ {
+ /* Reallocate the array */
+ u32 new_capacity = 2 * dev->pipes_capacity;
+ struct goldfish_pipe **pipes =
+ kcalloc(new_capacity, sizeof(*pipes),
+ GFP_ATOMIC);
+ if (!pipes)
+ return -ENOMEM;
+ memcpy(pipes, dev->pipes, sizeof(*pipes) * dev->pipes_capacity);
+ kfree(dev->pipes);
+ dev->pipes = pipes;
+ id = dev->pipes_capacity;
+ dev->pipes_capacity = new_capacity;
+ }
+ return id;
+}
+
+/**
+ * goldfish_pipe_open - open a channel to the AVD
+ * @inode: inode of device
+ * @file: file struct of opener
+ *
+ * Create a new pipe link between the emulator and the use application.
+ * Each new request produces a new pipe.
+ *
+ * Note: we use the pipe ID as a mux. All goldfish emulations are 32bit
+ * right now so this is fine. A move to 64bit will need this addressing
+ */
+static int goldfish_pipe_open(struct inode *inode, struct file *file)
+{
+ struct goldfish_pipe_dev *dev = pipe_dev;
+ unsigned long flags;
+ int id;
+ int status;
+
+ /* Allocate new pipe kernel object */
+ struct goldfish_pipe *pipe = kzalloc(sizeof(*pipe), GFP_KERNEL);
+ if (pipe == NULL)
+ return -ENOMEM;
+
+ pipe->dev = dev;
+ mutex_init(&pipe->lock);
+ init_waitqueue_head(&pipe->wake_queue);
+
+ /*
+ * Command buffer needs to be allocated on its own page to make sure it is
+ * physically contiguous in host's address space.
+ */
+ pipe->command_buffer =
+ (struct goldfish_pipe_command*)__get_free_page(GFP_KERNEL);
+ if (!pipe->command_buffer) {
+ status = -ENOMEM;
+ goto err_pipe;
+ }
+
+ spin_lock_irqsave(&dev->lock, flags);
+
+ id = get_free_pipe_id_locked(dev);
+ if (id < 0) {
+ status = id;
+ goto err_id_locked;
+ }
+
+ dev->pipes[id] = pipe;
+ pipe->id = id;
+ pipe->command_buffer->id = id;
+
+ /* Now tell the emulator we're opening a new pipe. */
+ dev->buffers->open_command_params.rw_params_max_count =
+ MAX_BUFFERS_PER_COMMAND;
+ dev->buffers->open_command_params.command_buffer_ptr =
+ (u64)(unsigned long)__pa(pipe->command_buffer);
+ status = goldfish_cmd_locked(pipe, PIPE_CMD_OPEN);
+ spin_unlock_irqrestore(&dev->lock, flags);
+ if (status < 0)
+ goto err_cmd;
+ /* All is done, save the pipe into the file's private data field */
+ file->private_data = pipe;
+ return 0;
+
+err_cmd:
+ spin_lock_irqsave(&dev->lock, flags);
+ dev->pipes[id] = NULL;
+err_id_locked:
+ spin_unlock_irqrestore(&dev->lock, flags);
+ free_page((unsigned long)pipe->command_buffer);
+err_pipe:
+ kfree(pipe);
+ return status;
+}
+
+static int goldfish_pipe_release(struct inode *inode, struct file *filp)
+{
+ unsigned long flags;
+ struct goldfish_pipe *pipe = filp->private_data;
+ struct goldfish_pipe_dev *dev = pipe->dev;
+
+ /* The guest is closing the channel, so tell the emulator right now */
+ (void)goldfish_cmd(pipe, PIPE_CMD_CLOSE);
+
+ spin_lock_irqsave(&dev->lock, flags);
+ dev->pipes[pipe->id] = NULL;
+ signalled_pipes_remove_locked(dev, pipe);
+ spin_unlock_irqrestore(&dev->lock, flags);
+
+ filp->private_data = NULL;
+ free_page((unsigned long)pipe->command_buffer);
+ kfree(pipe);
+ return 0;
+}
+
+static const struct file_operations goldfish_pipe_fops = {
+ .owner = THIS_MODULE,
+ .read = goldfish_pipe_read,
+ .write = goldfish_pipe_write,
+ .poll = goldfish_pipe_poll,
+ .open = goldfish_pipe_open,
+ .release = goldfish_pipe_release,
+};
+
+static struct miscdevice goldfish_pipe_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "goldfish_pipe",
+ .fops = &goldfish_pipe_fops,
+};
+
+static int goldfish_pipe_device_init_v2(struct platform_device *pdev)
+{
+ char *page;
+ struct goldfish_pipe_dev *dev = pipe_dev;
+ int err = devm_request_irq(&pdev->dev, dev->irq, goldfish_pipe_interrupt,
+ IRQF_SHARED, "goldfish_pipe", dev);
+ if (err) {
+ dev_err(&pdev->dev, "unable to allocate IRQ for v2\n");
+ return err;
+ }
+
+ err = misc_register(&goldfish_pipe_dev);
+ if (err) {
+ dev_err(&pdev->dev, "unable to register v2 device\n");
+ return err;
+ }
+
+ dev->first_signalled_pipe = NULL;
+ dev->pipes_capacity = INITIAL_PIPES_CAPACITY;
+ dev->pipes = kcalloc(dev->pipes_capacity, sizeof(*dev->pipes), GFP_KERNEL);
+ if (!dev->pipes)
+ return -ENOMEM;
+
+ /*
+ * We're going to pass two buffers, open_command_params and
+ * signalled_pipe_buffers, to the host. This means each of those buffers
+ * needs to be contained in a single physical page. The easiest choice is
+ * to just allocate a page and place the buffers in it.
+ */
+ BUG_ON(sizeof(*dev->buffers) > PAGE_SIZE);
+ page = (char*)__get_free_page(GFP_KERNEL);
+ if (!page) {
+ kfree(dev->pipes);
+ return -ENOMEM;
+ }
+ dev->buffers = (struct goldfish_pipe_dev_buffers*)page;
+
+ /* Send the buffer addresses to the host */
+ {
+ u64 paddr = __pa(&dev->buffers->signalled_pipe_buffers);
+ writel((u32)(unsigned long)(paddr >> 32), dev->base + PIPE_REG_SIGNAL_BUFFER_HIGH);
+ writel((u32)(unsigned long)paddr, dev->base + PIPE_REG_SIGNAL_BUFFER);
+ writel((u32)MAX_SIGNALLED_PIPES, dev->base + PIPE_REG_SIGNAL_BUFFER_COUNT);
+
+ paddr = __pa(&dev->buffers->open_command_params);
+ writel((u32)(unsigned long)(paddr >> 32), dev->base + PIPE_REG_OPEN_BUFFER_HIGH);
+ writel((u32)(unsigned long)paddr, dev->base + PIPE_REG_OPEN_BUFFER);
+ }
+ return 0;
+}
+
+static void goldfish_pipe_device_deinit_v2(struct platform_device *pdev) {
+ struct goldfish_pipe_dev *dev = pipe_dev;
+ misc_deregister(&goldfish_pipe_dev);
+ kfree(dev->pipes);
+ free_page((unsigned long)dev->buffers);
+}
+
+static int goldfish_pipe_probe(struct platform_device *pdev)
+{
+ int err;
+ struct resource *r;
+ struct goldfish_pipe_dev *dev = pipe_dev;
+
+ BUG_ON(sizeof(struct goldfish_pipe_command) > PAGE_SIZE);
+
+ /* not thread safe, but this should not happen */
+ WARN_ON(dev->base != NULL);
+
+ spin_lock_init(&dev->lock);
+
+ r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (r == NULL || resource_size(r) < PAGE_SIZE) {
+ dev_err(&pdev->dev, "can't allocate i/o page\n");
+ return -EINVAL;
+ }
+ dev->base = devm_ioremap(&pdev->dev, r->start, PAGE_SIZE);
+ if (dev->base == NULL) {
+ dev_err(&pdev->dev, "ioremap failed\n");
+ return -EINVAL;
+ }
+
+ r = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+ if (r == NULL) {
+ err = -EINVAL;
+ goto error;
+ }
+ dev->irq = r->start;
+
+ /*
+ * Exchange the versions with the host device
+ *
+ * Note: v1 driver used to not report its version, so we write it before
+ * reading device version back: this allows the host implementation to
+ * detect the old driver (if there was no version write before read).
+ */
+ writel((u32)PIPE_DRIVER_VERSION, dev->base + PIPE_REG_VERSION);
+ dev->version = readl(dev->base + PIPE_REG_VERSION);
+ if (dev->version < PIPE_CURRENT_DEVICE_VERSION) {
+ /* initialize the old device version */
+ err = goldfish_pipe_device_init_v1(pdev);
+ } else {
+ /* Host device supports the new interface */
+ err = goldfish_pipe_device_init_v2(pdev);
+ }
+ if (!err)
+ return 0;
+
+error:
+ dev->base = NULL;
+ return err;
+}
+
+static int goldfish_pipe_remove(struct platform_device *pdev)
+{
+ struct goldfish_pipe_dev *dev = pipe_dev;
+ if (dev->version < PIPE_CURRENT_DEVICE_VERSION)
+ goldfish_pipe_device_deinit_v1(pdev);
+ else
+ goldfish_pipe_device_deinit_v2(pdev);
+ dev->base = NULL;
+ return 0;
+}
+
+static const struct acpi_device_id goldfish_pipe_acpi_match[] = {
+ { "GFSH0003", 0 },
+ { },
+};
+MODULE_DEVICE_TABLE(acpi, goldfish_pipe_acpi_match);
+
+static const struct of_device_id goldfish_pipe_of_match[] = {
+ { .compatible = "google,android-pipe", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, goldfish_pipe_of_match);
+
+static struct platform_driver goldfish_pipe_driver = {
+ .probe = goldfish_pipe_probe,
+ .remove = goldfish_pipe_remove,
+ .driver = {
+ .name = "goldfish_pipe",
+ .of_match_table = goldfish_pipe_of_match,
+ .acpi_match_table = ACPI_PTR(goldfish_pipe_acpi_match),
+ }
+};
+
+module_platform_driver(goldfish_pipe_driver);
+MODULE_AUTHOR("David Turner <digit@google.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c
index b65ce7519411..60ee94e57242 100644
--- a/drivers/platform/x86/thinkpad_acpi.c
+++ b/drivers/platform/x86/thinkpad_acpi.c
@@ -9526,7 +9526,7 @@ static struct ibm_init_struct ibms_init[] __initdata = {
},
};
-static int __init set_ibm_param(const char *val, struct kernel_param *kp)
+static int __init set_ibm_param(const char *val, const struct kernel_param *kp)
{
unsigned int i;
struct ibm_struct *ibm;
diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c
index bcde8d13476a..fdb824fdf6c1 100644
--- a/drivers/power/supply/power_supply_sysfs.c
+++ b/drivers/power/supply/power_supply_sysfs.c
@@ -107,7 +107,10 @@ static ssize_t power_supply_show_property(struct device *dev,
else if (off >= POWER_SUPPLY_PROP_MODEL_NAME)
return sprintf(buf, "%s\n", value.strval);
- return sprintf(buf, "%d\n", value.intval);
+ if (off == POWER_SUPPLY_PROP_CHARGE_COUNTER_EXT)
+ return sprintf(buf, "%lld\n", value.int64val);
+ else
+ return sprintf(buf, "%d\n", value.intval);
}
static ssize_t power_supply_store_property(struct device *dev,
@@ -198,6 +201,12 @@ static struct device_attribute power_supply_attrs[] = {
POWER_SUPPLY_ATTR(scope),
POWER_SUPPLY_ATTR(charge_term_current),
POWER_SUPPLY_ATTR(calibrate),
+ /* Local extensions */
+ POWER_SUPPLY_ATTR(usb_hc),
+ POWER_SUPPLY_ATTR(usb_otg),
+ POWER_SUPPLY_ATTR(charge_enabled),
+ /* Local extensions of type int64_t */
+ POWER_SUPPLY_ATTR(charge_counter_ext),
/* Properties of type `const char *' */
POWER_SUPPLY_ATTR(model_name),
POWER_SUPPLY_ATTR(manufacturer),
diff --git a/drivers/rtc/rtc-palmas.c b/drivers/rtc/rtc-palmas.c
index 4bcfb88674d3..34aea38ebfa6 100644
--- a/drivers/rtc/rtc-palmas.c
+++ b/drivers/rtc/rtc-palmas.c
@@ -45,6 +45,42 @@ struct palmas_rtc {
/* Total number of RTC registers needed to set time*/
#define PALMAS_NUM_TIME_REGS (PALMAS_YEARS_REG - PALMAS_SECONDS_REG + 1)
+/*
+ * Special bin2bcd mapping to deal with bcd storage of year.
+ *
+ * 0-69 -> 0xD0
+ * 70-99 (1970 - 1999) -> 0xD0 - 0xF9 (correctly rolls to 0x00)
+ * 100-199 (2000 - 2099) -> 0x00 - 0x99 (does not roll to 0xA0 :-( )
+ * 200-229 (2100 - 2129) -> 0xA0 - 0xC9 (really for completeness)
+ * 230- -> 0xC9
+ *
+ * Confirmed: the only transition that does not work correctly for this rtc
+ * clock is the transition from 2099 to 2100, it proceeds to 2000. We will
+ * accept this issue since the clock retains and transitions the year correctly
+ * in all other conditions.
+ */
+static unsigned char year_bin2bcd(int val)
+{
+ if (val < 70)
+ return 0xD0;
+ if (val < 100)
+ return bin2bcd(val - 20) | 0x80; /* KISS leverage of bin2bcd */
+ if (val >= 230)
+ return 0xC9;
+ if (val >= 200)
+ return bin2bcd(val - 180) | 0x80;
+ return bin2bcd(val - 100);
+}
+
+static int year_bcd2bin(unsigned char val)
+{
+ if (val >= 0xD0)
+ return bcd2bin(val & 0x7F) + 20;
+ if (val >= 0xA0)
+ return bcd2bin(val & 0x7F) + 180;
+ return bcd2bin(val) + 100;
+}
+
static int palmas_rtc_read_time(struct device *dev, struct rtc_time *tm)
{
unsigned char rtc_data[PALMAS_NUM_TIME_REGS];
@@ -71,7 +107,7 @@ static int palmas_rtc_read_time(struct device *dev, struct rtc_time *tm)
tm->tm_hour = bcd2bin(rtc_data[2]);
tm->tm_mday = bcd2bin(rtc_data[3]);
tm->tm_mon = bcd2bin(rtc_data[4]) - 1;
- tm->tm_year = bcd2bin(rtc_data[5]) + 100;
+ tm->tm_year = year_bcd2bin(rtc_data[5]);
return ret;
}
@@ -87,7 +123,7 @@ static int palmas_rtc_set_time(struct device *dev, struct rtc_time *tm)
rtc_data[2] = bin2bcd(tm->tm_hour);
rtc_data[3] = bin2bcd(tm->tm_mday);
rtc_data[4] = bin2bcd(tm->tm_mon + 1);
- rtc_data[5] = bin2bcd(tm->tm_year - 100);
+ rtc_data[5] = year_bin2bcd(tm->tm_year);
/* Stop RTC while updating the RTC time registers */
ret = palmas_update_bits(palmas, PALMAS_RTC_BASE, PALMAS_RTC_CTRL_REG,
@@ -142,7 +178,7 @@ static int palmas_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
alm->time.tm_hour = bcd2bin(alarm_data[2]);
alm->time.tm_mday = bcd2bin(alarm_data[3]);
alm->time.tm_mon = bcd2bin(alarm_data[4]) - 1;
- alm->time.tm_year = bcd2bin(alarm_data[5]) + 100;
+ alm->time.tm_year = year_bcd2bin(alarm_data[5]);
ret = palmas_read(palmas, PALMAS_RTC_BASE, PALMAS_RTC_INTERRUPTS_REG,
&int_val);
@@ -173,7 +209,7 @@ static int palmas_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
alarm_data[2] = bin2bcd(alm->time.tm_hour);
alarm_data[3] = bin2bcd(alm->time.tm_mday);
alarm_data[4] = bin2bcd(alm->time.tm_mon + 1);
- alarm_data[5] = bin2bcd(alm->time.tm_year - 100);
+ alarm_data[5] = year_bin2bcd(alm->time.tm_year);
ret = palmas_bulk_write(palmas, PALMAS_RTC_BASE,
PALMAS_ALARM_SECONDS_REG, alarm_data, PALMAS_NUM_TIME_REGS);
diff --git a/drivers/scsi/fcoe/fcoe_transport.c b/drivers/scsi/fcoe/fcoe_transport.c
index 375c536cbc68..c5eb0c468f0b 100644
--- a/drivers/scsi/fcoe/fcoe_transport.c
+++ b/drivers/scsi/fcoe/fcoe_transport.c
@@ -32,13 +32,13 @@ MODULE_AUTHOR("Open-FCoE.org");
MODULE_DESCRIPTION("FIP discovery protocol and FCoE transport for FCoE HBAs");
MODULE_LICENSE("GPL v2");
-static int fcoe_transport_create(const char *, struct kernel_param *);
-static int fcoe_transport_destroy(const char *, struct kernel_param *);
+static int fcoe_transport_create(const char *, const struct kernel_param *);
+static int fcoe_transport_destroy(const char *, const struct kernel_param *);
static int fcoe_transport_show(char *buffer, const struct kernel_param *kp);
static struct fcoe_transport *fcoe_transport_lookup(struct net_device *device);
static struct fcoe_transport *fcoe_netdev_map_lookup(struct net_device *device);
-static int fcoe_transport_enable(const char *, struct kernel_param *);
-static int fcoe_transport_disable(const char *, struct kernel_param *);
+static int fcoe_transport_enable(const char *, const struct kernel_param *);
+static int fcoe_transport_disable(const char *, const struct kernel_param *);
static int libfcoe_device_notification(struct notifier_block *notifier,
ulong event, void *ptr);
@@ -865,7 +865,8 @@ EXPORT_SYMBOL(fcoe_ctlr_destroy_store);
*
* Returns: 0 for success
*/
-static int fcoe_transport_create(const char *buffer, struct kernel_param *kp)
+static int fcoe_transport_create(const char *buffer,
+ const struct kernel_param *kp)
{
int rc = -ENODEV;
struct net_device *netdev = NULL;
@@ -930,7 +931,8 @@ out_nodev:
*
* Returns: 0 for success
*/
-static int fcoe_transport_destroy(const char *buffer, struct kernel_param *kp)
+static int fcoe_transport_destroy(const char *buffer,
+ const struct kernel_param *kp)
{
int rc = -ENODEV;
struct net_device *netdev = NULL;
@@ -974,7 +976,8 @@ out_nodev:
*
* Returns: 0 for success
*/
-static int fcoe_transport_disable(const char *buffer, struct kernel_param *kp)
+static int fcoe_transport_disable(const char *buffer,
+ const struct kernel_param *kp)
{
int rc = -ENODEV;
struct net_device *netdev = NULL;
@@ -1008,7 +1011,8 @@ out_nodev:
*
* Returns: 0 for success
*/
-static int fcoe_transport_enable(const char *buffer, struct kernel_param *kp)
+static int fcoe_transport_enable(const char *buffer,
+ const struct kernel_param *kp)
{
int rc = -ENODEV;
struct net_device *netdev = NULL;
diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c
index a1a5ceb42ce6..8e83e346e0b3 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_base.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_base.c
@@ -105,7 +105,7 @@ _base_get_ioc_facts(struct MPT3SAS_ADAPTER *ioc);
*
*/
static int
-_scsih_set_fwfault_debug(const char *val, struct kernel_param *kp)
+_scsih_set_fwfault_debug(const char *val, const struct kernel_param *kp)
{
int ret = param_set_int(val, kp);
struct MPT3SAS_ADAPTER *ioc;
diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
index ec48c010a3ba..caa00451d451 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
@@ -281,7 +281,7 @@ struct _scsi_io_transfer {
* Note: The logging levels are defined in mpt3sas_debug.h.
*/
static int
-_scsih_set_debug_level(const char *val, struct kernel_param *kp)
+_scsih_set_debug_level(const char *val, const struct kernel_param *kp)
{
int ret = param_set_int(val, kp);
struct MPT3SAS_ADAPTER *ioc;
diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 5cfd56f08ffb..0e41d4c541a3 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -41,6 +41,7 @@
#include <linux/devfreq.h>
#include <linux/nls.h>
#include <linux/of.h>
+#include <linux/blkdev.h>
#include "ufshcd.h"
#include "ufs_quirks.h"
#include "unipro.h"
@@ -1488,6 +1489,17 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
clear_bit_unlock(tag, &hba->lrb_in_use);
goto out;
}
+
+ /* IO svc time latency histogram */
+ if (hba != NULL && cmd->request != NULL) {
+ if (hba->latency_hist_enabled &&
+ (cmd->request->cmd_type == REQ_TYPE_FS)) {
+ cmd->request->lat_hist_io_start = ktime_get();
+ cmd->request->lat_hist_enabled = 1;
+ } else
+ cmd->request->lat_hist_enabled = 0;
+ }
+
WARN_ON(hba->clk_gating.state != CLKS_ON);
lrbp = &hba->lrb[tag];
@@ -3696,6 +3708,7 @@ static void __ufshcd_transfer_req_compl(struct ufs_hba *hba,
struct scsi_cmnd *cmd;
int result;
int index;
+ struct request *req;
for_each_set_bit(index, &completed_reqs, hba->nutrs) {
lrbp = &hba->lrb[index];
@@ -3707,6 +3720,22 @@ static void __ufshcd_transfer_req_compl(struct ufs_hba *hba,
/* Mark completed command as NULL in LRB */
lrbp->cmd = NULL;
clear_bit_unlock(index, &hba->lrb_in_use);
+ req = cmd->request;
+ if (req) {
+ /* Update IO svc time latency histogram */
+ if (req->lat_hist_enabled) {
+ ktime_t completion;
+ u_int64_t delta_us;
+
+ completion = ktime_get();
+ delta_us = ktime_us_delta(completion,
+ req->lat_hist_io_start);
+ blk_update_latency_hist(
+ (rq_data_dir(req) == READ) ?
+ &hba->io_lat_read :
+ &hba->io_lat_write, delta_us);
+ }
+ }
/* Do not touch lrbp after scsi done */
cmd->scsi_done(cmd);
__ufshcd_release(hba);
@@ -6495,6 +6524,61 @@ out:
}
EXPORT_SYMBOL(ufshcd_shutdown);
+/*
+ * Values permitted 0, 1, 2.
+ * 0 -> Disable IO latency histograms (default)
+ * 1 -> Enable IO latency histograms
+ * 2 -> Zero out IO latency histograms
+ */
+static ssize_t
+latency_hist_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ufs_hba *hba = dev_get_drvdata(dev);
+ long value;
+
+ if (kstrtol(buf, 0, &value))
+ return -EINVAL;
+ if (value == BLK_IO_LAT_HIST_ZERO) {
+ memset(&hba->io_lat_read, 0, sizeof(hba->io_lat_read));
+ memset(&hba->io_lat_write, 0, sizeof(hba->io_lat_write));
+ } else if (value == BLK_IO_LAT_HIST_ENABLE ||
+ value == BLK_IO_LAT_HIST_DISABLE)
+ hba->latency_hist_enabled = value;
+ return count;
+}
+
+ssize_t
+latency_hist_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ufs_hba *hba = dev_get_drvdata(dev);
+ size_t written_bytes;
+
+ written_bytes = blk_latency_hist_show("Read", &hba->io_lat_read,
+ buf, PAGE_SIZE);
+ written_bytes += blk_latency_hist_show("Write", &hba->io_lat_write,
+ buf + written_bytes, PAGE_SIZE - written_bytes);
+
+ return written_bytes;
+}
+
+static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR,
+ latency_hist_show, latency_hist_store);
+
+static void
+ufshcd_init_latency_hist(struct ufs_hba *hba)
+{
+ if (device_create_file(hba->dev, &dev_attr_latency_hist))
+ dev_err(hba->dev, "Failed to create latency_hist sysfs entry\n");
+}
+
+static void
+ufshcd_exit_latency_hist(struct ufs_hba *hba)
+{
+ device_create_file(hba->dev, &dev_attr_latency_hist);
+}
+
/**
* ufshcd_remove - de-allocate SCSI host and host memory space
* data structure memory
@@ -6508,6 +6592,7 @@ void ufshcd_remove(struct ufs_hba *hba)
ufshcd_hba_stop(hba, true);
ufshcd_exit_clk_gating(hba);
+ ufshcd_exit_latency_hist(hba);
if (ufshcd_is_clkscaling_enabled(hba))
devfreq_remove_device(hba->devfreq);
ufshcd_hba_exit(hba);
@@ -6856,6 +6941,8 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq)
/* Hold auto suspend until async scan completes */
pm_runtime_get_sync(dev);
+ ufshcd_init_latency_hist(hba);
+
/*
* We are assuming that device wasn't put in sleep/power-down
* state exclusively during the boot stage before kernel.
@@ -6872,6 +6959,7 @@ out_remove_scsi_host:
scsi_remove_host(hba->host);
exit_gating:
ufshcd_exit_clk_gating(hba);
+ ufshcd_exit_latency_hist(hba);
out_disable:
hba->is_irq_enabled = false;
ufshcd_hba_exit(hba);
diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index 6dbd2e176333..096e6673f29b 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -575,6 +575,10 @@ struct ufs_hba {
bool is_urgent_bkops_lvl_checked;
struct ufs_desc_size desc_size;
+
+ int latency_hist_enabled;
+ struct io_latency_state io_lat_read;
+ struct io_latency_state io_lat_write;
};
/* Returns true if clocks can be gated. Otherwise false */
diff --git a/drivers/staging/android/Kconfig b/drivers/staging/android/Kconfig
index 6c00d6f765c6..a17c483f906e 100644
--- a/drivers/staging/android/Kconfig
+++ b/drivers/staging/android/Kconfig
@@ -24,8 +24,28 @@ config ANDROID_LOW_MEMORY_KILLER
scripts (/init.rc), and it defines priority values with minimum free memory size
for each priority.
+config ANDROID_LOW_MEMORY_KILLER_AUTODETECT_OOM_ADJ_VALUES
+ bool "Android Low Memory Killer: detect oom_adj values"
+ depends on ANDROID_LOW_MEMORY_KILLER
+ default y
+ ---help---
+ Detect oom_adj values written to
+ /sys/module/lowmemorykiller/parameters/adj and convert them
+ to oom_score_adj values.
+
+config ANDROID_VSOC
+ tristate "Android Virtual SoC support"
+ default n
+ depends on PCI_MSI
+ ---help---
+ This option adds support for the Virtual SoC driver needed to boot
+ a 'cuttlefish' Android image inside QEmu. The driver interacts with
+ a QEmu ivshmem device. If built as a module, it will be called vsoc.
+
source "drivers/staging/android/ion/Kconfig"
+source "drivers/staging/android/fiq_debugger/Kconfig"
+
endif # if ANDROID
endmenu
diff --git a/drivers/staging/android/Makefile b/drivers/staging/android/Makefile
index 7ed1be798909..93c5f5a7390a 100644
--- a/drivers/staging/android/Makefile
+++ b/drivers/staging/android/Makefile
@@ -1,6 +1,8 @@
ccflags-y += -I$(src) # needed for trace events
obj-y += ion/
+obj-$(CONFIG_FIQ_DEBUGGER) += fiq_debugger/
obj-$(CONFIG_ASHMEM) += ashmem.o
obj-$(CONFIG_ANDROID_LOW_MEMORY_KILLER) += lowmemorykiller.o
+obj-$(CONFIG_ANDROID_VSOC) += vsoc.o
diff --git a/drivers/staging/android/TODO b/drivers/staging/android/TODO
index 64d8c8720960..edfb6809eb9e 100644
--- a/drivers/staging/android/TODO
+++ b/drivers/staging/android/TODO
@@ -33,5 +33,14 @@ sync framework:
- clean up and ABI check for security issues
- move it to drivers/base/dma-buf
+vsoc.c, uapi/vsoc_shm.h
+ - The current driver uses the same wait queue for all of the futexes in a
+ region. This will cause false wakeups in regions with a large number of
+ waiting threads. We should eventually use multiple queues and select the
+ queue based on the region.
+ - Add debugfs support for examining the permissions of regions.
+ - Remove VSOC_WAIT_FOR_INCOMING_INTERRUPT ioctl. This functionality has been
+ superseded by the futex and is there for legacy reasons.
+
Please send patches to Greg Kroah-Hartman <greg@kroah.com> and Cc:
Arve Hjønnevåg <arve@android.com> and Riley Andrews <riandrews@android.com>
diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c
index c6314d1552ea..5af176b707a3 100644
--- a/drivers/staging/android/ashmem.c
+++ b/drivers/staging/android/ashmem.c
@@ -415,22 +415,14 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
}
get_file(asma->file);
- /*
- * XXX - Reworked to use shmem_zero_setup() instead of
- * shmem_set_file while we're in staging. -jstultz
- */
- if (vma->vm_flags & VM_SHARED) {
- ret = shmem_zero_setup(vma);
- if (ret) {
- fput(asma->file);
- goto out;
- }
+ if (vma->vm_flags & VM_SHARED)
+ shmem_set_file(vma, asma->file);
+ else {
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ vma->vm_file = asma->file;
}
- if (vma->vm_file)
- fput(vma->vm_file);
- vma->vm_file = asma->file;
-
out:
mutex_unlock(&ashmem_mutex);
return ret;
@@ -467,9 +459,9 @@ ashmem_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
loff_t start = range->pgstart * PAGE_SIZE;
loff_t end = (range->pgend + 1) * PAGE_SIZE;
- vfs_fallocate(range->asma->file,
- FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
- start, end - start);
+ range->asma->file->f_op->fallocate(range->asma->file,
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ start, end - start);
range->purged = ASHMEM_WAS_PURGED;
lru_del(range);
diff --git a/drivers/staging/android/fiq_debugger/Kconfig b/drivers/staging/android/fiq_debugger/Kconfig
new file mode 100644
index 000000000000..60fc224d4efc
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/Kconfig
@@ -0,0 +1,58 @@
+config FIQ_DEBUGGER
+ bool "FIQ Mode Serial Debugger"
+ default n
+ depends on ARM || ARM64
+ help
+ The FIQ serial debugger can accept commands even when the
+ kernel is unresponsive due to being stuck with interrupts
+ disabled.
+
+config FIQ_DEBUGGER_NO_SLEEP
+ bool "Keep serial debugger active"
+ depends on FIQ_DEBUGGER
+ default n
+ help
+ Enables the serial debugger at boot. Passing
+ fiq_debugger.no_sleep on the kernel commandline will
+ override this config option.
+
+config FIQ_DEBUGGER_WAKEUP_IRQ_ALWAYS_ON
+ bool "Don't disable wakeup IRQ when debugger is active"
+ depends on FIQ_DEBUGGER
+ default n
+ help
+ Don't disable the wakeup irq when enabling the uart clock. This will
+ cause extra interrupts, but it makes the serial debugger usable with
+ on some MSM radio builds that ignore the uart clock request in power
+ collapse.
+
+config FIQ_DEBUGGER_CONSOLE
+ bool "Console on FIQ Serial Debugger port"
+ depends on FIQ_DEBUGGER
+ default n
+ help
+ Enables a console so that printk messages are displayed on
+ the debugger serial port as the occur.
+
+config FIQ_DEBUGGER_CONSOLE_DEFAULT_ENABLE
+ bool "Put the FIQ debugger into console mode by default"
+ depends on FIQ_DEBUGGER_CONSOLE
+ default n
+ help
+ If enabled, this puts the fiq debugger into console mode by default.
+ Otherwise, the fiq debugger will start out in debug mode.
+
+config FIQ_DEBUGGER_UART_OVERLAY
+ bool "Install uart DT overlay"
+ depends on FIQ_DEBUGGER
+ select OF_OVERLAY
+ default n
+ help
+ If enabled, fiq debugger is calling fiq_debugger_uart_overlay()
+ that will apply overlay uart_overlay@0 to disable proper uart.
+
+config FIQ_WATCHDOG
+ bool
+ select FIQ_DEBUGGER
+ select PSTORE_RAM
+ default n
diff --git a/drivers/staging/android/fiq_debugger/Makefile b/drivers/staging/android/fiq_debugger/Makefile
new file mode 100644
index 000000000000..a7ca4871cad3
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/Makefile
@@ -0,0 +1,4 @@
+obj-y += fiq_debugger.o
+obj-$(CONFIG_ARM) += fiq_debugger_arm.o
+obj-$(CONFIG_ARM64) += fiq_debugger_arm64.o
+obj-$(CONFIG_FIQ_WATCHDOG) += fiq_watchdog.o
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger.c b/drivers/staging/android/fiq_debugger/fiq_debugger.c
new file mode 100644
index 000000000000..d9bc125308bd
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_debugger.c
@@ -0,0 +1,1246 @@
+/*
+ * drivers/staging/android/fiq_debugger.c
+ *
+ * Serial Debugger Interface accessed through an FIQ interrupt.
+ *
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdarg.h>
+#include <linux/module.h>
+#include <linux/io.h>
+#include <linux/console.h>
+#include <linux/interrupt.h>
+#include <linux/clk.h>
+#include <linux/platform_device.h>
+#include <linux/kernel_stat.h>
+#include <linux/kmsg_dump.h>
+#include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/timer.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+
+#ifdef CONFIG_FIQ_GLUE
+#include <asm/fiq_glue.h>
+#endif
+
+#ifdef CONFIG_FIQ_DEBUGGER_UART_OVERLAY
+#include <linux/of.h>
+#endif
+
+#include <linux/uaccess.h>
+
+#include "fiq_debugger.h"
+#include "fiq_debugger_priv.h"
+#include "fiq_debugger_ringbuf.h"
+
+#define DEBUG_MAX 64
+#define MAX_UNHANDLED_FIQ_COUNT 1000000
+
+#define MAX_FIQ_DEBUGGER_PORTS 4
+
+struct fiq_debugger_state {
+#ifdef CONFIG_FIQ_GLUE
+ struct fiq_glue_handler handler;
+#endif
+ struct fiq_debugger_output output;
+
+ int fiq;
+ int uart_irq;
+ int signal_irq;
+ int wakeup_irq;
+ bool wakeup_irq_no_set_wake;
+ struct clk *clk;
+ struct fiq_debugger_pdata *pdata;
+ struct platform_device *pdev;
+
+ char debug_cmd[DEBUG_MAX];
+ int debug_busy;
+ int debug_abort;
+
+ char debug_buf[DEBUG_MAX];
+ int debug_count;
+
+ bool no_sleep;
+ bool debug_enable;
+ bool ignore_next_wakeup_irq;
+ struct timer_list sleep_timer;
+ spinlock_t sleep_timer_lock;
+ bool uart_enabled;
+ struct wakeup_source debugger_wake_src;
+ bool console_enable;
+ int current_cpu;
+ atomic_t unhandled_fiq_count;
+ bool in_fiq;
+
+ struct work_struct work;
+ spinlock_t work_lock;
+ char work_cmd[DEBUG_MAX];
+
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE
+ spinlock_t console_lock;
+ struct console console;
+ struct tty_port tty_port;
+ struct fiq_debugger_ringbuf *tty_rbuf;
+ bool syslog_dumping;
+#endif
+
+ unsigned int last_irqs[NR_IRQS];
+ unsigned int last_local_timer_irqs[NR_CPUS];
+};
+
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE
+struct tty_driver *fiq_tty_driver;
+#endif
+
+#ifdef CONFIG_FIQ_DEBUGGER_NO_SLEEP
+static bool initial_no_sleep = true;
+#else
+static bool initial_no_sleep;
+#endif
+
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE_DEFAULT_ENABLE
+static bool initial_debug_enable = true;
+static bool initial_console_enable = true;
+#else
+static bool initial_debug_enable;
+static bool initial_console_enable;
+#endif
+
+static bool fiq_kgdb_enable;
+static bool fiq_debugger_disable;
+
+module_param_named(no_sleep, initial_no_sleep, bool, 0644);
+module_param_named(debug_enable, initial_debug_enable, bool, 0644);
+module_param_named(console_enable, initial_console_enable, bool, 0644);
+module_param_named(kgdb_enable, fiq_kgdb_enable, bool, 0644);
+module_param_named(disable, fiq_debugger_disable, bool, 0644);
+
+#ifdef CONFIG_FIQ_DEBUGGER_WAKEUP_IRQ_ALWAYS_ON
+static inline
+void fiq_debugger_enable_wakeup_irq(struct fiq_debugger_state *state) {}
+static inline
+void fiq_debugger_disable_wakeup_irq(struct fiq_debugger_state *state) {}
+#else
+static inline
+void fiq_debugger_enable_wakeup_irq(struct fiq_debugger_state *state)
+{
+ if (state->wakeup_irq < 0)
+ return;
+ enable_irq(state->wakeup_irq);
+ if (!state->wakeup_irq_no_set_wake)
+ enable_irq_wake(state->wakeup_irq);
+}
+static inline
+void fiq_debugger_disable_wakeup_irq(struct fiq_debugger_state *state)
+{
+ if (state->wakeup_irq < 0)
+ return;
+ disable_irq_nosync(state->wakeup_irq);
+ if (!state->wakeup_irq_no_set_wake)
+ disable_irq_wake(state->wakeup_irq);
+}
+#endif
+
+static inline bool fiq_debugger_have_fiq(struct fiq_debugger_state *state)
+{
+ return (state->fiq >= 0);
+}
+
+#ifdef CONFIG_FIQ_GLUE
+static void fiq_debugger_force_irq(struct fiq_debugger_state *state)
+{
+ unsigned int irq = state->signal_irq;
+
+ if (WARN_ON(!fiq_debugger_have_fiq(state)))
+ return;
+ if (state->pdata->force_irq) {
+ state->pdata->force_irq(state->pdev, irq);
+ } else {
+ struct irq_chip *chip = irq_get_chip(irq);
+ if (chip && chip->irq_retrigger)
+ chip->irq_retrigger(irq_get_irq_data(irq));
+ }
+}
+#endif
+
+static void fiq_debugger_uart_enable(struct fiq_debugger_state *state)
+{
+ if (state->clk)
+ clk_enable(state->clk);
+ if (state->pdata->uart_enable)
+ state->pdata->uart_enable(state->pdev);
+}
+
+static void fiq_debugger_uart_disable(struct fiq_debugger_state *state)
+{
+ if (state->pdata->uart_disable)
+ state->pdata->uart_disable(state->pdev);
+ if (state->clk)
+ clk_disable(state->clk);
+}
+
+static void fiq_debugger_uart_flush(struct fiq_debugger_state *state)
+{
+ if (state->pdata->uart_flush)
+ state->pdata->uart_flush(state->pdev);
+}
+
+static void fiq_debugger_putc(struct fiq_debugger_state *state, char c)
+{
+ state->pdata->uart_putc(state->pdev, c);
+}
+
+static void fiq_debugger_puts(struct fiq_debugger_state *state, char *s)
+{
+ unsigned c;
+ while ((c = *s++)) {
+ if (c == '\n')
+ fiq_debugger_putc(state, '\r');
+ fiq_debugger_putc(state, c);
+ }
+}
+
+static void fiq_debugger_prompt(struct fiq_debugger_state *state)
+{
+ fiq_debugger_puts(state, "debug> ");
+}
+
+static void fiq_debugger_dump_kernel_log(struct fiq_debugger_state *state)
+{
+ char buf[512];
+ size_t len;
+ struct kmsg_dumper dumper = { .active = true };
+
+
+ kmsg_dump_rewind_nolock(&dumper);
+ while (kmsg_dump_get_line_nolock(&dumper, true, buf,
+ sizeof(buf) - 1, &len)) {
+ buf[len] = 0;
+ fiq_debugger_puts(state, buf);
+ }
+}
+
+static void fiq_debugger_printf(struct fiq_debugger_output *output,
+ const char *fmt, ...)
+{
+ struct fiq_debugger_state *state;
+ char buf[256];
+ va_list ap;
+
+ state = container_of(output, struct fiq_debugger_state, output);
+ va_start(ap, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, ap);
+ va_end(ap);
+
+ fiq_debugger_puts(state, buf);
+}
+
+/* Safe outside fiq context */
+static int fiq_debugger_printf_nfiq(void *cookie, const char *fmt, ...)
+{
+ struct fiq_debugger_state *state = cookie;
+ char buf[256];
+ va_list ap;
+ unsigned long irq_flags;
+
+ va_start(ap, fmt);
+ vsnprintf(buf, 128, fmt, ap);
+ va_end(ap);
+
+ local_irq_save(irq_flags);
+ fiq_debugger_puts(state, buf);
+ fiq_debugger_uart_flush(state);
+ local_irq_restore(irq_flags);
+ return state->debug_abort;
+}
+
+static void fiq_debugger_dump_irqs(struct fiq_debugger_state *state)
+{
+ int n;
+ struct irq_desc *desc;
+
+ fiq_debugger_printf(&state->output,
+ "irqnr total since-last status name\n");
+ for_each_irq_desc(n, desc) {
+ struct irqaction *act = desc->action;
+ if (!act && !kstat_irqs(n))
+ continue;
+ fiq_debugger_printf(&state->output, "%5d: %10u %11u %8x %s\n", n,
+ kstat_irqs(n),
+ kstat_irqs(n) - state->last_irqs[n],
+ desc->status_use_accessors,
+ (act && act->name) ? act->name : "???");
+ state->last_irqs[n] = kstat_irqs(n);
+ }
+}
+
+static void fiq_debugger_do_ps(struct fiq_debugger_state *state)
+{
+ struct task_struct *g;
+ struct task_struct *p;
+ unsigned task_state;
+ static const char stat_nam[] = "RSDTtZX";
+
+ fiq_debugger_printf(&state->output, "pid ppid prio task pc\n");
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ task_state = p->state ? __ffs(p->state) + 1 : 0;
+ fiq_debugger_printf(&state->output,
+ "%5d %5d %4d ", p->pid, p->parent->pid, p->prio);
+ fiq_debugger_printf(&state->output, "%-13.13s %c", p->comm,
+ task_state >= sizeof(stat_nam) ? '?' : stat_nam[task_state]);
+ if (task_state == TASK_RUNNING)
+ fiq_debugger_printf(&state->output, " running\n");
+ else
+ fiq_debugger_printf(&state->output, " %08lx\n",
+ thread_saved_pc(p));
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+}
+
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE
+static void fiq_debugger_begin_syslog_dump(struct fiq_debugger_state *state)
+{
+ state->syslog_dumping = true;
+}
+
+static void fiq_debugger_end_syslog_dump(struct fiq_debugger_state *state)
+{
+ state->syslog_dumping = false;
+}
+#else
+extern int do_syslog(int type, char __user *bug, int count);
+static void fiq_debugger_begin_syslog_dump(struct fiq_debugger_state *state)
+{
+ do_syslog(5 /* clear */, NULL, 0);
+}
+
+static void fiq_debugger_end_syslog_dump(struct fiq_debugger_state *state)
+{
+ fiq_debugger_dump_kernel_log(state);
+}
+#endif
+
+static void fiq_debugger_do_sysrq(struct fiq_debugger_state *state, char rq)
+{
+ if ((rq == 'g' || rq == 'G') && !fiq_kgdb_enable) {
+ fiq_debugger_printf(&state->output, "sysrq-g blocked\n");
+ return;
+ }
+ fiq_debugger_begin_syslog_dump(state);
+ handle_sysrq(rq);
+ fiq_debugger_end_syslog_dump(state);
+}
+
+#ifdef CONFIG_KGDB
+static void fiq_debugger_do_kgdb(struct fiq_debugger_state *state)
+{
+ if (!fiq_kgdb_enable) {
+ fiq_debugger_printf(&state->output, "kgdb through fiq debugger not enabled\n");
+ return;
+ }
+
+ fiq_debugger_printf(&state->output, "enabling console and triggering kgdb\n");
+ state->console_enable = true;
+ handle_sysrq('g');
+}
+#endif
+
+static void fiq_debugger_schedule_work(struct fiq_debugger_state *state,
+ char *cmd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&state->work_lock, flags);
+ if (state->work_cmd[0] != '\0') {
+ fiq_debugger_printf(&state->output, "work command processor busy\n");
+ spin_unlock_irqrestore(&state->work_lock, flags);
+ return;
+ }
+
+ strlcpy(state->work_cmd, cmd, sizeof(state->work_cmd));
+ spin_unlock_irqrestore(&state->work_lock, flags);
+
+ schedule_work(&state->work);
+}
+
+static void fiq_debugger_work(struct work_struct *work)
+{
+ struct fiq_debugger_state *state;
+ char work_cmd[DEBUG_MAX];
+ char *cmd;
+ unsigned long flags;
+
+ state = container_of(work, struct fiq_debugger_state, work);
+
+ spin_lock_irqsave(&state->work_lock, flags);
+
+ strlcpy(work_cmd, state->work_cmd, sizeof(work_cmd));
+ state->work_cmd[0] = '\0';
+
+ spin_unlock_irqrestore(&state->work_lock, flags);
+
+ cmd = work_cmd;
+ if (!strncmp(cmd, "reboot", 6)) {
+ cmd += 6;
+ while (*cmd == ' ')
+ cmd++;
+ if (*cmd != '\0')
+ kernel_restart(cmd);
+ else
+ kernel_restart(NULL);
+ } else {
+ fiq_debugger_printf(&state->output, "unknown work command '%s'\n",
+ work_cmd);
+ }
+}
+
+/* This function CANNOT be called in FIQ context */
+static void fiq_debugger_irq_exec(struct fiq_debugger_state *state, char *cmd)
+{
+ if (!strcmp(cmd, "ps"))
+ fiq_debugger_do_ps(state);
+ if (!strcmp(cmd, "sysrq"))
+ fiq_debugger_do_sysrq(state, 'h');
+ if (!strncmp(cmd, "sysrq ", 6))
+ fiq_debugger_do_sysrq(state, cmd[6]);
+#ifdef CONFIG_KGDB
+ if (!strcmp(cmd, "kgdb"))
+ fiq_debugger_do_kgdb(state);
+#endif
+ if (!strncmp(cmd, "reboot", 6))
+ fiq_debugger_schedule_work(state, cmd);
+}
+
+static void fiq_debugger_help(struct fiq_debugger_state *state)
+{
+ fiq_debugger_printf(&state->output,
+ "FIQ Debugger commands:\n"
+ " pc PC status\n"
+ " regs Register dump\n"
+ " allregs Extended Register dump\n"
+ " bt Stack trace\n"
+ " reboot [<c>] Reboot with command <c>\n"
+ " reset [<c>] Hard reset with command <c>\n"
+ " irqs Interupt status\n"
+ " kmsg Kernel log\n"
+ " version Kernel version\n");
+ fiq_debugger_printf(&state->output,
+ " sleep Allow sleep while in FIQ\n"
+ " nosleep Disable sleep while in FIQ\n"
+ " console Switch terminal to console\n"
+ " cpu Current CPU\n"
+ " cpu <number> Switch to CPU<number>\n");
+ fiq_debugger_printf(&state->output,
+ " ps Process list\n"
+ " sysrq sysrq options\n"
+ " sysrq <param> Execute sysrq with <param>\n");
+#ifdef CONFIG_KGDB
+ fiq_debugger_printf(&state->output,
+ " kgdb Enter kernel debugger\n");
+#endif
+}
+
+static void fiq_debugger_take_affinity(void *info)
+{
+ struct fiq_debugger_state *state = info;
+ struct cpumask cpumask;
+
+ cpumask_clear(&cpumask);
+ cpumask_set_cpu(get_cpu(), &cpumask);
+
+ irq_set_affinity(state->uart_irq, &cpumask);
+}
+
+static void fiq_debugger_switch_cpu(struct fiq_debugger_state *state, int cpu)
+{
+ if (!fiq_debugger_have_fiq(state))
+ smp_call_function_single(cpu, fiq_debugger_take_affinity, state,
+ false);
+ state->current_cpu = cpu;
+}
+
+static bool fiq_debugger_fiq_exec(struct fiq_debugger_state *state,
+ const char *cmd, const struct pt_regs *regs,
+ void *svc_sp)
+{
+ bool signal_helper = false;
+
+ if (!strcmp(cmd, "help") || !strcmp(cmd, "?")) {
+ fiq_debugger_help(state);
+ } else if (!strcmp(cmd, "pc")) {
+ fiq_debugger_dump_pc(&state->output, regs);
+ } else if (!strcmp(cmd, "regs")) {
+ fiq_debugger_dump_regs(&state->output, regs);
+ } else if (!strcmp(cmd, "allregs")) {
+ fiq_debugger_dump_allregs(&state->output, regs);
+ } else if (!strcmp(cmd, "bt")) {
+ fiq_debugger_dump_stacktrace(&state->output, regs, 100, svc_sp);
+ } else if (!strncmp(cmd, "reset", 5)) {
+ cmd += 5;
+ while (*cmd == ' ')
+ cmd++;
+ if (*cmd) {
+ char tmp_cmd[32];
+ strlcpy(tmp_cmd, cmd, sizeof(tmp_cmd));
+ machine_restart(tmp_cmd);
+ } else {
+ machine_restart(NULL);
+ }
+ } else if (!strcmp(cmd, "irqs")) {
+ fiq_debugger_dump_irqs(state);
+ } else if (!strcmp(cmd, "kmsg")) {
+ fiq_debugger_dump_kernel_log(state);
+ } else if (!strcmp(cmd, "version")) {
+ fiq_debugger_printf(&state->output, "%s\n", linux_banner);
+ } else if (!strcmp(cmd, "sleep")) {
+ state->no_sleep = false;
+ fiq_debugger_printf(&state->output, "enabling sleep\n");
+ } else if (!strcmp(cmd, "nosleep")) {
+ state->no_sleep = true;
+ fiq_debugger_printf(&state->output, "disabling sleep\n");
+ } else if (!strcmp(cmd, "console")) {
+ fiq_debugger_printf(&state->output, "console mode\n");
+ fiq_debugger_uart_flush(state);
+ state->console_enable = true;
+ } else if (!strcmp(cmd, "cpu")) {
+ fiq_debugger_printf(&state->output, "cpu %d\n", state->current_cpu);
+ } else if (!strncmp(cmd, "cpu ", 4)) {
+ unsigned long cpu = 0;
+ if (kstrtoul(cmd + 4, 10, &cpu) == 0)
+ fiq_debugger_switch_cpu(state, cpu);
+ else
+ fiq_debugger_printf(&state->output, "invalid cpu\n");
+ fiq_debugger_printf(&state->output, "cpu %d\n", state->current_cpu);
+ } else {
+ if (state->debug_busy) {
+ fiq_debugger_printf(&state->output,
+ "command processor busy. trying to abort.\n");
+ state->debug_abort = -1;
+ } else {
+ strcpy(state->debug_cmd, cmd);
+ state->debug_busy = 1;
+ }
+
+ return true;
+ }
+ if (!state->console_enable)
+ fiq_debugger_prompt(state);
+
+ return signal_helper;
+}
+
+static void fiq_debugger_sleep_timer_expired(unsigned long data)
+{
+ struct fiq_debugger_state *state = (struct fiq_debugger_state *)data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&state->sleep_timer_lock, flags);
+ if (state->uart_enabled && !state->no_sleep) {
+ if (state->debug_enable && !state->console_enable) {
+ state->debug_enable = false;
+ fiq_debugger_printf_nfiq(state,
+ "suspending fiq debugger\n");
+ }
+ state->ignore_next_wakeup_irq = true;
+ fiq_debugger_uart_disable(state);
+ state->uart_enabled = false;
+ fiq_debugger_enable_wakeup_irq(state);
+ }
+ __pm_relax(&state->debugger_wake_src);
+ spin_unlock_irqrestore(&state->sleep_timer_lock, flags);
+}
+
+static void fiq_debugger_handle_wakeup(struct fiq_debugger_state *state)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&state->sleep_timer_lock, flags);
+ if (state->wakeup_irq >= 0 && state->ignore_next_wakeup_irq) {
+ state->ignore_next_wakeup_irq = false;
+ } else if (!state->uart_enabled) {
+ __pm_stay_awake(&state->debugger_wake_src);
+ fiq_debugger_uart_enable(state);
+ state->uart_enabled = true;
+ fiq_debugger_disable_wakeup_irq(state);
+ mod_timer(&state->sleep_timer, jiffies + HZ / 2);
+ }
+ spin_unlock_irqrestore(&state->sleep_timer_lock, flags);
+}
+
+static irqreturn_t fiq_debugger_wakeup_irq_handler(int irq, void *dev)
+{
+ struct fiq_debugger_state *state = dev;
+
+ if (!state->no_sleep)
+ fiq_debugger_puts(state, "WAKEUP\n");
+ fiq_debugger_handle_wakeup(state);
+
+ return IRQ_HANDLED;
+}
+
+static
+void fiq_debugger_handle_console_irq_context(struct fiq_debugger_state *state)
+{
+#if defined(CONFIG_FIQ_DEBUGGER_CONSOLE)
+ if (state->tty_port.ops) {
+ int i;
+ int count = fiq_debugger_ringbuf_level(state->tty_rbuf);
+ for (i = 0; i < count; i++) {
+ int c = fiq_debugger_ringbuf_peek(state->tty_rbuf, 0);
+ tty_insert_flip_char(&state->tty_port, c, TTY_NORMAL);
+ if (!fiq_debugger_ringbuf_consume(state->tty_rbuf, 1))
+ pr_warn("fiq tty failed to consume byte\n");
+ }
+ tty_flip_buffer_push(&state->tty_port);
+ }
+#endif
+}
+
+static void fiq_debugger_handle_irq_context(struct fiq_debugger_state *state)
+{
+ if (!state->no_sleep) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&state->sleep_timer_lock, flags);
+ __pm_stay_awake(&state->debugger_wake_src);
+ mod_timer(&state->sleep_timer, jiffies + HZ * 5);
+ spin_unlock_irqrestore(&state->sleep_timer_lock, flags);
+ }
+ fiq_debugger_handle_console_irq_context(state);
+ if (state->debug_busy) {
+ fiq_debugger_irq_exec(state, state->debug_cmd);
+ if (!state->console_enable)
+ fiq_debugger_prompt(state);
+ state->debug_busy = 0;
+ }
+}
+
+static int fiq_debugger_getc(struct fiq_debugger_state *state)
+{
+ return state->pdata->uart_getc(state->pdev);
+}
+
+static bool fiq_debugger_handle_uart_interrupt(struct fiq_debugger_state *state,
+ int this_cpu, const struct pt_regs *regs, void *svc_sp)
+{
+ int c;
+ static int last_c;
+ int count = 0;
+ bool signal_helper = false;
+
+ if (this_cpu != state->current_cpu) {
+ if (state->in_fiq)
+ return false;
+
+ if (atomic_inc_return(&state->unhandled_fiq_count) !=
+ MAX_UNHANDLED_FIQ_COUNT)
+ return false;
+
+ fiq_debugger_printf(&state->output,
+ "fiq_debugger: cpu %d not responding, "
+ "reverting to cpu %d\n", state->current_cpu,
+ this_cpu);
+
+ atomic_set(&state->unhandled_fiq_count, 0);
+ fiq_debugger_switch_cpu(state, this_cpu);
+ return false;
+ }
+
+ state->in_fiq = true;
+
+ while ((c = fiq_debugger_getc(state)) != FIQ_DEBUGGER_NO_CHAR) {
+ count++;
+ if (!state->debug_enable) {
+ if ((c == 13) || (c == 10)) {
+ state->debug_enable = true;
+ state->debug_count = 0;
+ fiq_debugger_prompt(state);
+ }
+ } else if (c == FIQ_DEBUGGER_BREAK) {
+ state->console_enable = false;
+ fiq_debugger_puts(state, "fiq debugger mode\n");
+ state->debug_count = 0;
+ fiq_debugger_prompt(state);
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE
+ } else if (state->console_enable && state->tty_rbuf) {
+ fiq_debugger_ringbuf_push(state->tty_rbuf, c);
+ signal_helper = true;
+#endif
+ } else if ((c >= ' ') && (c < 127)) {
+ if (state->debug_count < (DEBUG_MAX - 1)) {
+ state->debug_buf[state->debug_count++] = c;
+ fiq_debugger_putc(state, c);
+ }
+ } else if ((c == 8) || (c == 127)) {
+ if (state->debug_count > 0) {
+ state->debug_count--;
+ fiq_debugger_putc(state, 8);
+ fiq_debugger_putc(state, ' ');
+ fiq_debugger_putc(state, 8);
+ }
+ } else if ((c == 13) || (c == 10)) {
+ if (c == '\r' || (c == '\n' && last_c != '\r')) {
+ fiq_debugger_putc(state, '\r');
+ fiq_debugger_putc(state, '\n');
+ }
+ if (state->debug_count) {
+ state->debug_buf[state->debug_count] = 0;
+ state->debug_count = 0;
+ signal_helper |=
+ fiq_debugger_fiq_exec(state,
+ state->debug_buf,
+ regs, svc_sp);
+ } else {
+ fiq_debugger_prompt(state);
+ }
+ }
+ last_c = c;
+ }
+ if (!state->console_enable)
+ fiq_debugger_uart_flush(state);
+ if (state->pdata->fiq_ack)
+ state->pdata->fiq_ack(state->pdev, state->fiq);
+
+ /* poke sleep timer if necessary */
+ if (state->debug_enable && !state->no_sleep)
+ signal_helper = true;
+
+ atomic_set(&state->unhandled_fiq_count, 0);
+ state->in_fiq = false;
+
+ return signal_helper;
+}
+
+#ifdef CONFIG_FIQ_GLUE
+static void fiq_debugger_fiq(struct fiq_glue_handler *h,
+ const struct pt_regs *regs, void *svc_sp)
+{
+ struct fiq_debugger_state *state =
+ container_of(h, struct fiq_debugger_state, handler);
+ unsigned int this_cpu = THREAD_INFO(svc_sp)->cpu;
+ bool need_irq;
+
+ need_irq = fiq_debugger_handle_uart_interrupt(state, this_cpu, regs,
+ svc_sp);
+ if (need_irq)
+ fiq_debugger_force_irq(state);
+}
+#endif
+
+/*
+ * When not using FIQs, we only use this single interrupt as an entry point.
+ * This just effectively takes over the UART interrupt and does all the work
+ * in this context.
+ */
+static irqreturn_t fiq_debugger_uart_irq(int irq, void *dev)
+{
+ struct fiq_debugger_state *state = dev;
+ bool not_done;
+
+ fiq_debugger_handle_wakeup(state);
+
+ /* handle the debugger irq in regular context */
+ not_done = fiq_debugger_handle_uart_interrupt(state, smp_processor_id(),
+ get_irq_regs(),
+ current_thread_info());
+ if (not_done)
+ fiq_debugger_handle_irq_context(state);
+
+ return IRQ_HANDLED;
+}
+
+/*
+ * If FIQs are used, not everything can happen in fiq context.
+ * FIQ handler does what it can and then signals this interrupt to finish the
+ * job in irq context.
+ */
+static irqreturn_t fiq_debugger_signal_irq(int irq, void *dev)
+{
+ struct fiq_debugger_state *state = dev;
+
+ if (state->pdata->force_irq_ack)
+ state->pdata->force_irq_ack(state->pdev, state->signal_irq);
+
+ fiq_debugger_handle_irq_context(state);
+
+ return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_FIQ_GLUE
+static void fiq_debugger_resume(struct fiq_glue_handler *h)
+{
+ struct fiq_debugger_state *state =
+ container_of(h, struct fiq_debugger_state, handler);
+ if (state->pdata->uart_resume)
+ state->pdata->uart_resume(state->pdev);
+}
+#endif
+
+#if defined(CONFIG_FIQ_DEBUGGER_CONSOLE)
+struct tty_driver *fiq_debugger_console_device(struct console *co, int *index)
+{
+ *index = co->index;
+ return fiq_tty_driver;
+}
+
+static void fiq_debugger_console_write(struct console *co,
+ const char *s, unsigned int count)
+{
+ struct fiq_debugger_state *state;
+ unsigned long flags;
+
+ state = container_of(co, struct fiq_debugger_state, console);
+
+ if (!state->console_enable && !state->syslog_dumping)
+ return;
+
+ fiq_debugger_uart_enable(state);
+ spin_lock_irqsave(&state->console_lock, flags);
+ while (count--) {
+ if (*s == '\n')
+ fiq_debugger_putc(state, '\r');
+ fiq_debugger_putc(state, *s++);
+ }
+ fiq_debugger_uart_flush(state);
+ spin_unlock_irqrestore(&state->console_lock, flags);
+ fiq_debugger_uart_disable(state);
+}
+
+static struct console fiq_debugger_console = {
+ .name = "ttyFIQ",
+ .device = fiq_debugger_console_device,
+ .write = fiq_debugger_console_write,
+ .flags = CON_PRINTBUFFER | CON_ANYTIME | CON_ENABLED,
+};
+
+int fiq_tty_open(struct tty_struct *tty, struct file *filp)
+{
+ int line = tty->index;
+ struct fiq_debugger_state **states = tty->driver->driver_state;
+ struct fiq_debugger_state *state = states[line];
+
+ return tty_port_open(&state->tty_port, tty, filp);
+}
+
+void fiq_tty_close(struct tty_struct *tty, struct file *filp)
+{
+ tty_port_close(tty->port, tty, filp);
+}
+
+int fiq_tty_write(struct tty_struct *tty, const unsigned char *buf, int count)
+{
+ int i;
+ int line = tty->index;
+ struct fiq_debugger_state **states = tty->driver->driver_state;
+ struct fiq_debugger_state *state = states[line];
+
+ if (!state->console_enable)
+ return count;
+
+ fiq_debugger_uart_enable(state);
+ spin_lock_irq(&state->console_lock);
+ for (i = 0; i < count; i++)
+ fiq_debugger_putc(state, *buf++);
+ spin_unlock_irq(&state->console_lock);
+ fiq_debugger_uart_disable(state);
+
+ return count;
+}
+
+int fiq_tty_write_room(struct tty_struct *tty)
+{
+ return 16;
+}
+
+#ifdef CONFIG_CONSOLE_POLL
+static int fiq_tty_poll_init(struct tty_driver *driver, int line, char *options)
+{
+ return 0;
+}
+
+static int fiq_tty_poll_get_char(struct tty_driver *driver, int line)
+{
+ struct fiq_debugger_state **states = driver->driver_state;
+ struct fiq_debugger_state *state = states[line];
+ int c = NO_POLL_CHAR;
+
+ fiq_debugger_uart_enable(state);
+ if (fiq_debugger_have_fiq(state)) {
+ int count = fiq_debugger_ringbuf_level(state->tty_rbuf);
+ if (count > 0) {
+ c = fiq_debugger_ringbuf_peek(state->tty_rbuf, 0);
+ fiq_debugger_ringbuf_consume(state->tty_rbuf, 1);
+ }
+ } else {
+ c = fiq_debugger_getc(state);
+ if (c == FIQ_DEBUGGER_NO_CHAR)
+ c = NO_POLL_CHAR;
+ }
+ fiq_debugger_uart_disable(state);
+
+ return c;
+}
+
+static void fiq_tty_poll_put_char(struct tty_driver *driver, int line, char ch)
+{
+ struct fiq_debugger_state **states = driver->driver_state;
+ struct fiq_debugger_state *state = states[line];
+ fiq_debugger_uart_enable(state);
+ fiq_debugger_putc(state, ch);
+ fiq_debugger_uart_disable(state);
+}
+#endif
+
+static const struct tty_port_operations fiq_tty_port_ops;
+
+static const struct tty_operations fiq_tty_driver_ops = {
+ .write = fiq_tty_write,
+ .write_room = fiq_tty_write_room,
+ .open = fiq_tty_open,
+ .close = fiq_tty_close,
+#ifdef CONFIG_CONSOLE_POLL
+ .poll_init = fiq_tty_poll_init,
+ .poll_get_char = fiq_tty_poll_get_char,
+ .poll_put_char = fiq_tty_poll_put_char,
+#endif
+};
+
+static int fiq_debugger_tty_init(void)
+{
+ int ret;
+ struct fiq_debugger_state **states = NULL;
+
+ states = kzalloc(sizeof(*states) * MAX_FIQ_DEBUGGER_PORTS, GFP_KERNEL);
+ if (!states) {
+ pr_err("Failed to allocate fiq debugger state structres\n");
+ return -ENOMEM;
+ }
+
+ fiq_tty_driver = alloc_tty_driver(MAX_FIQ_DEBUGGER_PORTS);
+ if (!fiq_tty_driver) {
+ pr_err("Failed to allocate fiq debugger tty\n");
+ ret = -ENOMEM;
+ goto err_free_state;
+ }
+
+ fiq_tty_driver->owner = THIS_MODULE;
+ fiq_tty_driver->driver_name = "fiq-debugger";
+ fiq_tty_driver->name = "ttyFIQ";
+ fiq_tty_driver->type = TTY_DRIVER_TYPE_SERIAL;
+ fiq_tty_driver->subtype = SERIAL_TYPE_NORMAL;
+ fiq_tty_driver->init_termios = tty_std_termios;
+ fiq_tty_driver->flags = TTY_DRIVER_REAL_RAW |
+ TTY_DRIVER_DYNAMIC_DEV;
+ fiq_tty_driver->driver_state = states;
+
+ fiq_tty_driver->init_termios.c_cflag =
+ B115200 | CS8 | CREAD | HUPCL | CLOCAL;
+ fiq_tty_driver->init_termios.c_ispeed = 115200;
+ fiq_tty_driver->init_termios.c_ospeed = 115200;
+
+ tty_set_operations(fiq_tty_driver, &fiq_tty_driver_ops);
+
+ ret = tty_register_driver(fiq_tty_driver);
+ if (ret) {
+ pr_err("Failed to register fiq tty: %d\n", ret);
+ goto err_free_tty;
+ }
+
+ pr_info("Registered FIQ tty driver\n");
+ return 0;
+
+err_free_tty:
+ put_tty_driver(fiq_tty_driver);
+ fiq_tty_driver = NULL;
+err_free_state:
+ kfree(states);
+ return ret;
+}
+
+static int fiq_debugger_tty_init_one(struct fiq_debugger_state *state)
+{
+ int ret;
+ struct device *tty_dev;
+ struct fiq_debugger_state **states = fiq_tty_driver->driver_state;
+
+ states[state->pdev->id] = state;
+
+ state->tty_rbuf = fiq_debugger_ringbuf_alloc(1024);
+ if (!state->tty_rbuf) {
+ pr_err("Failed to allocate fiq debugger ringbuf\n");
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ tty_port_init(&state->tty_port);
+ state->tty_port.ops = &fiq_tty_port_ops;
+
+ tty_dev = tty_port_register_device(&state->tty_port, fiq_tty_driver,
+ state->pdev->id, &state->pdev->dev);
+ if (IS_ERR(tty_dev)) {
+ pr_err("Failed to register fiq debugger tty device\n");
+ ret = PTR_ERR(tty_dev);
+ goto err;
+ }
+
+ device_set_wakeup_capable(tty_dev, 1);
+
+ pr_info("Registered fiq debugger ttyFIQ%d\n", state->pdev->id);
+
+ return 0;
+
+err:
+ fiq_debugger_ringbuf_free(state->tty_rbuf);
+ state->tty_rbuf = NULL;
+ return ret;
+}
+#endif
+
+static int fiq_debugger_dev_suspend(struct device *dev)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+ struct fiq_debugger_state *state = platform_get_drvdata(pdev);
+
+ if (state->pdata->uart_dev_suspend)
+ return state->pdata->uart_dev_suspend(pdev);
+ return 0;
+}
+
+static int fiq_debugger_dev_resume(struct device *dev)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+ struct fiq_debugger_state *state = platform_get_drvdata(pdev);
+
+ if (state->pdata->uart_dev_resume)
+ return state->pdata->uart_dev_resume(pdev);
+ return 0;
+}
+
+static int fiq_debugger_probe(struct platform_device *pdev)
+{
+ int ret;
+ struct fiq_debugger_pdata *pdata = dev_get_platdata(&pdev->dev);
+ struct fiq_debugger_state *state;
+ int fiq;
+ int uart_irq;
+
+ if (pdev->id >= MAX_FIQ_DEBUGGER_PORTS)
+ return -EINVAL;
+
+ if (!pdata->uart_getc || !pdata->uart_putc)
+ return -EINVAL;
+ if ((pdata->uart_enable && !pdata->uart_disable) ||
+ (!pdata->uart_enable && pdata->uart_disable))
+ return -EINVAL;
+
+ fiq = platform_get_irq_byname(pdev, "fiq");
+ uart_irq = platform_get_irq_byname(pdev, "uart_irq");
+
+ /* uart_irq mode and fiq mode are mutually exclusive, but one of them
+ * is required */
+ if ((uart_irq < 0 && fiq < 0) || (uart_irq >= 0 && fiq >= 0))
+ return -EINVAL;
+ if (fiq >= 0 && !pdata->fiq_enable)
+ return -EINVAL;
+
+ state = kzalloc(sizeof(*state), GFP_KERNEL);
+ state->output.printf = fiq_debugger_printf;
+ setup_timer(&state->sleep_timer, fiq_debugger_sleep_timer_expired,
+ (unsigned long)state);
+ state->pdata = pdata;
+ state->pdev = pdev;
+ state->no_sleep = initial_no_sleep;
+ state->debug_enable = initial_debug_enable;
+ state->console_enable = initial_console_enable;
+
+ state->fiq = fiq;
+ state->uart_irq = uart_irq;
+ state->signal_irq = platform_get_irq_byname(pdev, "signal");
+ state->wakeup_irq = platform_get_irq_byname(pdev, "wakeup");
+
+ INIT_WORK(&state->work, fiq_debugger_work);
+ spin_lock_init(&state->work_lock);
+
+ platform_set_drvdata(pdev, state);
+
+ spin_lock_init(&state->sleep_timer_lock);
+
+ if (state->wakeup_irq < 0 && fiq_debugger_have_fiq(state))
+ state->no_sleep = true;
+ state->ignore_next_wakeup_irq = !state->no_sleep;
+
+ wakeup_source_init(&state->debugger_wake_src, "serial-debug");
+
+ state->clk = clk_get(&pdev->dev, NULL);
+ if (IS_ERR(state->clk))
+ state->clk = NULL;
+
+ /* do not call pdata->uart_enable here since uart_init may still
+ * need to do some initialization before uart_enable can work.
+ * So, only try to manage the clock during init.
+ */
+ if (state->clk)
+ clk_enable(state->clk);
+
+ if (pdata->uart_init) {
+ ret = pdata->uart_init(pdev);
+ if (ret)
+ goto err_uart_init;
+ }
+
+ fiq_debugger_printf_nfiq(state,
+ "<hit enter %sto activate fiq debugger>\n",
+ state->no_sleep ? "" : "twice ");
+
+#ifdef CONFIG_FIQ_GLUE
+ if (fiq_debugger_have_fiq(state)) {
+ state->handler.fiq = fiq_debugger_fiq;
+ state->handler.resume = fiq_debugger_resume;
+ ret = fiq_glue_register_handler(&state->handler);
+ if (ret) {
+ pr_err("%s: could not install fiq handler\n", __func__);
+ goto err_register_irq;
+ }
+
+ pdata->fiq_enable(pdev, state->fiq, 1);
+ } else
+#endif
+ {
+ ret = request_irq(state->uart_irq, fiq_debugger_uart_irq,
+ IRQF_NO_SUSPEND, "debug", state);
+ if (ret) {
+ pr_err("%s: could not install irq handler\n", __func__);
+ goto err_register_irq;
+ }
+
+ /* for irq-only mode, we want this irq to wake us up, if it
+ * can.
+ */
+ enable_irq_wake(state->uart_irq);
+ }
+
+ if (state->clk)
+ clk_disable(state->clk);
+
+ if (state->signal_irq >= 0) {
+ ret = request_irq(state->signal_irq, fiq_debugger_signal_irq,
+ IRQF_TRIGGER_RISING, "debug-signal", state);
+ if (ret)
+ pr_err("serial_debugger: could not install signal_irq");
+ }
+
+ if (state->wakeup_irq >= 0) {
+ ret = request_irq(state->wakeup_irq,
+ fiq_debugger_wakeup_irq_handler,
+ IRQF_TRIGGER_FALLING,
+ "debug-wakeup", state);
+ if (ret) {
+ pr_err("serial_debugger: "
+ "could not install wakeup irq\n");
+ state->wakeup_irq = -1;
+ } else {
+ ret = enable_irq_wake(state->wakeup_irq);
+ if (ret) {
+ pr_err("serial_debugger: "
+ "could not enable wakeup\n");
+ state->wakeup_irq_no_set_wake = true;
+ }
+ }
+ }
+ if (state->no_sleep)
+ fiq_debugger_handle_wakeup(state);
+
+#if defined(CONFIG_FIQ_DEBUGGER_CONSOLE)
+ spin_lock_init(&state->console_lock);
+ state->console = fiq_debugger_console;
+ state->console.index = pdev->id;
+ if (!console_set_on_cmdline)
+ add_preferred_console(state->console.name,
+ state->console.index, NULL);
+ register_console(&state->console);
+ fiq_debugger_tty_init_one(state);
+#endif
+ return 0;
+
+err_register_irq:
+ if (pdata->uart_free)
+ pdata->uart_free(pdev);
+err_uart_init:
+ if (state->clk)
+ clk_disable(state->clk);
+ if (state->clk)
+ clk_put(state->clk);
+ wakeup_source_trash(&state->debugger_wake_src);
+ platform_set_drvdata(pdev, NULL);
+ kfree(state);
+ return ret;
+}
+
+static const struct dev_pm_ops fiq_debugger_dev_pm_ops = {
+ .suspend = fiq_debugger_dev_suspend,
+ .resume = fiq_debugger_dev_resume,
+};
+
+static struct platform_driver fiq_debugger_driver = {
+ .probe = fiq_debugger_probe,
+ .driver = {
+ .name = "fiq_debugger",
+ .pm = &fiq_debugger_dev_pm_ops,
+ },
+};
+
+#if defined(CONFIG_FIQ_DEBUGGER_UART_OVERLAY)
+int fiq_debugger_uart_overlay(void)
+{
+ struct device_node *onp = of_find_node_by_path("/uart_overlay@0");
+ int ret;
+
+ if (!onp) {
+ pr_err("serial_debugger: uart overlay not found\n");
+ return -ENODEV;
+ }
+
+ ret = of_overlay_create(onp);
+ if (ret < 0) {
+ pr_err("serial_debugger: fail to create overlay: %d\n", ret);
+ of_node_put(onp);
+ return ret;
+ }
+
+ pr_info("serial_debugger: uart overlay applied\n");
+ return 0;
+}
+#endif
+
+static int __init fiq_debugger_init(void)
+{
+ if (fiq_debugger_disable) {
+ pr_err("serial_debugger: disabled\n");
+ return -ENODEV;
+ }
+#if defined(CONFIG_FIQ_DEBUGGER_CONSOLE)
+ fiq_debugger_tty_init();
+#endif
+#if defined(CONFIG_FIQ_DEBUGGER_UART_OVERLAY)
+ fiq_debugger_uart_overlay();
+#endif
+ return platform_driver_register(&fiq_debugger_driver);
+}
+
+postcore_initcall(fiq_debugger_init);
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger.h b/drivers/staging/android/fiq_debugger/fiq_debugger.h
new file mode 100644
index 000000000000..c9ec4f8db086
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_debugger.h
@@ -0,0 +1,64 @@
+/*
+ * drivers/staging/android/fiq_debugger/fiq_debugger.h
+ *
+ * Copyright (C) 2010 Google, Inc.
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _ARCH_ARM_MACH_TEGRA_FIQ_DEBUGGER_H_
+#define _ARCH_ARM_MACH_TEGRA_FIQ_DEBUGGER_H_
+
+#include <linux/serial_core.h>
+
+#define FIQ_DEBUGGER_NO_CHAR NO_POLL_CHAR
+#define FIQ_DEBUGGER_BREAK 0x00ff0100
+
+#define FIQ_DEBUGGER_FIQ_IRQ_NAME "fiq"
+#define FIQ_DEBUGGER_SIGNAL_IRQ_NAME "signal"
+#define FIQ_DEBUGGER_WAKEUP_IRQ_NAME "wakeup"
+
+/**
+ * struct fiq_debugger_pdata - fiq debugger platform data
+ * @uart_resume: used to restore uart state right before enabling
+ * the fiq.
+ * @uart_enable: Do the work necessary to communicate with the uart
+ * hw (enable clocks, etc.). This must be ref-counted.
+ * @uart_disable: Do the work necessary to disable the uart hw
+ * (disable clocks, etc.). This must be ref-counted.
+ * @uart_dev_suspend: called during PM suspend, generally not needed
+ * for real fiq mode debugger.
+ * @uart_dev_resume: called during PM resume, generally not needed
+ * for real fiq mode debugger.
+ */
+struct fiq_debugger_pdata {
+ int (*uart_init)(struct platform_device *pdev);
+ void (*uart_free)(struct platform_device *pdev);
+ int (*uart_resume)(struct platform_device *pdev);
+ int (*uart_getc)(struct platform_device *pdev);
+ void (*uart_putc)(struct platform_device *pdev, unsigned int c);
+ void (*uart_flush)(struct platform_device *pdev);
+ void (*uart_enable)(struct platform_device *pdev);
+ void (*uart_disable)(struct platform_device *pdev);
+
+ int (*uart_dev_suspend)(struct platform_device *pdev);
+ int (*uart_dev_resume)(struct platform_device *pdev);
+
+ void (*fiq_enable)(struct platform_device *pdev, unsigned int fiq,
+ bool enable);
+ void (*fiq_ack)(struct platform_device *pdev, unsigned int fiq);
+
+ void (*force_irq)(struct platform_device *pdev, unsigned int irq);
+ void (*force_irq_ack)(struct platform_device *pdev, unsigned int irq);
+};
+
+#endif
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger_arm.c b/drivers/staging/android/fiq_debugger/fiq_debugger_arm.c
new file mode 100644
index 000000000000..8b3e0137be1a
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_debugger_arm.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/ptrace.h>
+#include <linux/uaccess.h>
+
+#include <asm/stacktrace.h>
+
+#include "fiq_debugger_priv.h"
+
+static char *mode_name(unsigned cpsr)
+{
+ switch (cpsr & MODE_MASK) {
+ case USR_MODE: return "USR";
+ case FIQ_MODE: return "FIQ";
+ case IRQ_MODE: return "IRQ";
+ case SVC_MODE: return "SVC";
+ case ABT_MODE: return "ABT";
+ case UND_MODE: return "UND";
+ case SYSTEM_MODE: return "SYS";
+ default: return "???";
+ }
+}
+
+void fiq_debugger_dump_pc(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+ output->printf(output, " pc %08x cpsr %08x mode %s\n",
+ regs->ARM_pc, regs->ARM_cpsr, mode_name(regs->ARM_cpsr));
+}
+
+void fiq_debugger_dump_regs(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+ output->printf(output,
+ " r0 %08x r1 %08x r2 %08x r3 %08x\n",
+ regs->ARM_r0, regs->ARM_r1, regs->ARM_r2, regs->ARM_r3);
+ output->printf(output,
+ " r4 %08x r5 %08x r6 %08x r7 %08x\n",
+ regs->ARM_r4, regs->ARM_r5, regs->ARM_r6, regs->ARM_r7);
+ output->printf(output,
+ " r8 %08x r9 %08x r10 %08x r11 %08x mode %s\n",
+ regs->ARM_r8, regs->ARM_r9, regs->ARM_r10, regs->ARM_fp,
+ mode_name(regs->ARM_cpsr));
+ output->printf(output,
+ " ip %08x sp %08x lr %08x pc %08x cpsr %08x\n",
+ regs->ARM_ip, regs->ARM_sp, regs->ARM_lr, regs->ARM_pc,
+ regs->ARM_cpsr);
+}
+
+struct mode_regs {
+ unsigned long sp_svc;
+ unsigned long lr_svc;
+ unsigned long spsr_svc;
+
+ unsigned long sp_abt;
+ unsigned long lr_abt;
+ unsigned long spsr_abt;
+
+ unsigned long sp_und;
+ unsigned long lr_und;
+ unsigned long spsr_und;
+
+ unsigned long sp_irq;
+ unsigned long lr_irq;
+ unsigned long spsr_irq;
+
+ unsigned long r8_fiq;
+ unsigned long r9_fiq;
+ unsigned long r10_fiq;
+ unsigned long r11_fiq;
+ unsigned long r12_fiq;
+ unsigned long sp_fiq;
+ unsigned long lr_fiq;
+ unsigned long spsr_fiq;
+};
+
+static void __naked get_mode_regs(struct mode_regs *regs)
+{
+ asm volatile (
+ "mrs r1, cpsr\n"
+ "msr cpsr_c, #0xd3 @(SVC_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+ "stmia r0!, {r13 - r14}\n"
+ "mrs r2, spsr\n"
+ "msr cpsr_c, #0xd7 @(ABT_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+ "stmia r0!, {r2, r13 - r14}\n"
+ "mrs r2, spsr\n"
+ "msr cpsr_c, #0xdb @(UND_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+ "stmia r0!, {r2, r13 - r14}\n"
+ "mrs r2, spsr\n"
+ "msr cpsr_c, #0xd2 @(IRQ_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+ "stmia r0!, {r2, r13 - r14}\n"
+ "mrs r2, spsr\n"
+ "msr cpsr_c, #0xd1 @(FIQ_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+ "stmia r0!, {r2, r8 - r14}\n"
+ "mrs r2, spsr\n"
+ "stmia r0!, {r2}\n"
+ "msr cpsr_c, r1\n"
+ "bx lr\n");
+}
+
+
+void fiq_debugger_dump_allregs(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+ struct mode_regs mode_regs;
+ unsigned long mode = regs->ARM_cpsr & MODE_MASK;
+
+ fiq_debugger_dump_regs(output, regs);
+ get_mode_regs(&mode_regs);
+
+ output->printf(output,
+ "%csvc: sp %08x lr %08x spsr %08x\n",
+ mode == SVC_MODE ? '*' : ' ',
+ mode_regs.sp_svc, mode_regs.lr_svc, mode_regs.spsr_svc);
+ output->printf(output,
+ "%cabt: sp %08x lr %08x spsr %08x\n",
+ mode == ABT_MODE ? '*' : ' ',
+ mode_regs.sp_abt, mode_regs.lr_abt, mode_regs.spsr_abt);
+ output->printf(output,
+ "%cund: sp %08x lr %08x spsr %08x\n",
+ mode == UND_MODE ? '*' : ' ',
+ mode_regs.sp_und, mode_regs.lr_und, mode_regs.spsr_und);
+ output->printf(output,
+ "%cirq: sp %08x lr %08x spsr %08x\n",
+ mode == IRQ_MODE ? '*' : ' ',
+ mode_regs.sp_irq, mode_regs.lr_irq, mode_regs.spsr_irq);
+ output->printf(output,
+ "%cfiq: r8 %08x r9 %08x r10 %08x r11 %08x r12 %08x\n",
+ mode == FIQ_MODE ? '*' : ' ',
+ mode_regs.r8_fiq, mode_regs.r9_fiq, mode_regs.r10_fiq,
+ mode_regs.r11_fiq, mode_regs.r12_fiq);
+ output->printf(output,
+ " fiq: sp %08x lr %08x spsr %08x\n",
+ mode_regs.sp_fiq, mode_regs.lr_fiq, mode_regs.spsr_fiq);
+}
+
+struct stacktrace_state {
+ struct fiq_debugger_output *output;
+ unsigned int depth;
+};
+
+static int report_trace(struct stackframe *frame, void *d)
+{
+ struct stacktrace_state *sts = d;
+
+ if (sts->depth) {
+ sts->output->printf(sts->output,
+ " pc: %p (%pF), lr %p (%pF), sp %p, fp %p\n",
+ frame->pc, frame->pc, frame->lr, frame->lr,
+ frame->sp, frame->fp);
+ sts->depth--;
+ return 0;
+ }
+ sts->output->printf(sts->output, " ...\n");
+
+ return sts->depth == 0;
+}
+
+struct frame_tail {
+ struct frame_tail *fp;
+ unsigned long sp;
+ unsigned long lr;
+} __attribute__((packed));
+
+static struct frame_tail *user_backtrace(struct fiq_debugger_output *output,
+ struct frame_tail *tail)
+{
+ struct frame_tail buftail[2];
+
+ /* Also check accessibility of one struct frame_tail beyond */
+ if (!access_ok(VERIFY_READ, tail, sizeof(buftail))) {
+ output->printf(output, " invalid frame pointer %p\n",
+ tail);
+ return NULL;
+ }
+ if (__copy_from_user_inatomic(buftail, tail, sizeof(buftail))) {
+ output->printf(output,
+ " failed to copy frame pointer %p\n", tail);
+ return NULL;
+ }
+
+ output->printf(output, " %p\n", buftail[0].lr);
+
+ /* frame pointers should strictly progress back up the stack
+ * (towards higher addresses) */
+ if (tail >= buftail[0].fp)
+ return NULL;
+
+ return buftail[0].fp-1;
+}
+
+void fiq_debugger_dump_stacktrace(struct fiq_debugger_output *output,
+ const struct pt_regs *regs, unsigned int depth, void *ssp)
+{
+ struct frame_tail *tail;
+ struct thread_info *real_thread_info = THREAD_INFO(ssp);
+ struct stacktrace_state sts;
+
+ sts.depth = depth;
+ sts.output = output;
+ *current_thread_info() = *real_thread_info;
+
+ if (!current)
+ output->printf(output, "current NULL\n");
+ else
+ output->printf(output, "pid: %d comm: %s\n",
+ current->pid, current->comm);
+ fiq_debugger_dump_regs(output, regs);
+
+ if (!user_mode(regs)) {
+ struct stackframe frame;
+ frame.fp = regs->ARM_fp;
+ frame.sp = regs->ARM_sp;
+ frame.lr = regs->ARM_lr;
+ frame.pc = regs->ARM_pc;
+ output->printf(output,
+ " pc: %p (%pF), lr %p (%pF), sp %p, fp %p\n",
+ regs->ARM_pc, regs->ARM_pc, regs->ARM_lr, regs->ARM_lr,
+ regs->ARM_sp, regs->ARM_fp);
+ walk_stackframe(&frame, report_trace, &sts);
+ return;
+ }
+
+ tail = ((struct frame_tail *) regs->ARM_fp) - 1;
+ while (depth-- && tail && !((unsigned long) tail & 3))
+ tail = user_backtrace(output, tail);
+}
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c b/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c
new file mode 100644
index 000000000000..97246bcbcd62
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/ptrace.h>
+#include <asm/stacktrace.h>
+
+#include "fiq_debugger_priv.h"
+
+static char *mode_name(const struct pt_regs *regs)
+{
+ if (compat_user_mode(regs)) {
+ return "USR";
+ } else {
+ switch (processor_mode(regs)) {
+ case PSR_MODE_EL0t: return "EL0t";
+ case PSR_MODE_EL1t: return "EL1t";
+ case PSR_MODE_EL1h: return "EL1h";
+ case PSR_MODE_EL2t: return "EL2t";
+ case PSR_MODE_EL2h: return "EL2h";
+ default: return "???";
+ }
+ }
+}
+
+void fiq_debugger_dump_pc(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+ output->printf(output, " pc %016lx cpsr %08lx mode %s\n",
+ regs->pc, regs->pstate, mode_name(regs));
+}
+
+void fiq_debugger_dump_regs_aarch32(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+ output->printf(output, " r0 %08x r1 %08x r2 %08x r3 %08x\n",
+ regs->compat_usr(0), regs->compat_usr(1),
+ regs->compat_usr(2), regs->compat_usr(3));
+ output->printf(output, " r4 %08x r5 %08x r6 %08x r7 %08x\n",
+ regs->compat_usr(4), regs->compat_usr(5),
+ regs->compat_usr(6), regs->compat_usr(7));
+ output->printf(output, " r8 %08x r9 %08x r10 %08x r11 %08x\n",
+ regs->compat_usr(8), regs->compat_usr(9),
+ regs->compat_usr(10), regs->compat_usr(11));
+ output->printf(output, " ip %08x sp %08x lr %08x pc %08x\n",
+ regs->compat_usr(12), regs->compat_sp,
+ regs->compat_lr, regs->pc);
+ output->printf(output, " cpsr %08x (%s)\n",
+ regs->pstate, mode_name(regs));
+}
+
+void fiq_debugger_dump_regs_aarch64(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+
+ output->printf(output, " x0 %016lx x1 %016lx\n",
+ regs->regs[0], regs->regs[1]);
+ output->printf(output, " x2 %016lx x3 %016lx\n",
+ regs->regs[2], regs->regs[3]);
+ output->printf(output, " x4 %016lx x5 %016lx\n",
+ regs->regs[4], regs->regs[5]);
+ output->printf(output, " x6 %016lx x7 %016lx\n",
+ regs->regs[6], regs->regs[7]);
+ output->printf(output, " x8 %016lx x9 %016lx\n",
+ regs->regs[8], regs->regs[9]);
+ output->printf(output, " x10 %016lx x11 %016lx\n",
+ regs->regs[10], regs->regs[11]);
+ output->printf(output, " x12 %016lx x13 %016lx\n",
+ regs->regs[12], regs->regs[13]);
+ output->printf(output, " x14 %016lx x15 %016lx\n",
+ regs->regs[14], regs->regs[15]);
+ output->printf(output, " x16 %016lx x17 %016lx\n",
+ regs->regs[16], regs->regs[17]);
+ output->printf(output, " x18 %016lx x19 %016lx\n",
+ regs->regs[18], regs->regs[19]);
+ output->printf(output, " x20 %016lx x21 %016lx\n",
+ regs->regs[20], regs->regs[21]);
+ output->printf(output, " x22 %016lx x23 %016lx\n",
+ regs->regs[22], regs->regs[23]);
+ output->printf(output, " x24 %016lx x25 %016lx\n",
+ regs->regs[24], regs->regs[25]);
+ output->printf(output, " x26 %016lx x27 %016lx\n",
+ regs->regs[26], regs->regs[27]);
+ output->printf(output, " x28 %016lx x29 %016lx\n",
+ regs->regs[28], regs->regs[29]);
+ output->printf(output, " x30 %016lx sp %016lx\n",
+ regs->regs[30], regs->sp);
+ output->printf(output, " pc %016lx cpsr %08x (%s)\n",
+ regs->pc, regs->pstate, mode_name(regs));
+}
+
+void fiq_debugger_dump_regs(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+ if (compat_user_mode(regs))
+ fiq_debugger_dump_regs_aarch32(output, regs);
+ else
+ fiq_debugger_dump_regs_aarch64(output, regs);
+}
+
+#define READ_SPECIAL_REG(x) ({ \
+ u64 val; \
+ asm volatile ("mrs %0, " # x : "=r"(val)); \
+ val; \
+})
+
+void fiq_debugger_dump_allregs(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+ u32 pstate = READ_SPECIAL_REG(CurrentEl);
+ bool in_el2 = (pstate & PSR_MODE_MASK) >= PSR_MODE_EL2t;
+
+ fiq_debugger_dump_regs(output, regs);
+
+ output->printf(output, " sp_el0 %016lx\n",
+ READ_SPECIAL_REG(sp_el0));
+
+ if (in_el2)
+ output->printf(output, " sp_el1 %016lx\n",
+ READ_SPECIAL_REG(sp_el1));
+
+ output->printf(output, " elr_el1 %016lx\n",
+ READ_SPECIAL_REG(elr_el1));
+
+ output->printf(output, " spsr_el1 %08lx\n",
+ READ_SPECIAL_REG(spsr_el1));
+
+ if (in_el2) {
+ output->printf(output, " spsr_irq %08lx\n",
+ READ_SPECIAL_REG(spsr_irq));
+ output->printf(output, " spsr_abt %08lx\n",
+ READ_SPECIAL_REG(spsr_abt));
+ output->printf(output, " spsr_und %08lx\n",
+ READ_SPECIAL_REG(spsr_und));
+ output->printf(output, " spsr_fiq %08lx\n",
+ READ_SPECIAL_REG(spsr_fiq));
+ output->printf(output, " spsr_el2 %08lx\n",
+ READ_SPECIAL_REG(elr_el2));
+ output->printf(output, " spsr_el2 %08lx\n",
+ READ_SPECIAL_REG(spsr_el2));
+ }
+}
+
+struct stacktrace_state {
+ struct fiq_debugger_output *output;
+ unsigned int depth;
+};
+
+static int report_trace(struct stackframe *frame, void *d)
+{
+ struct stacktrace_state *sts = d;
+
+ if (sts->depth) {
+ sts->output->printf(sts->output, "%pF:\n", frame->pc);
+ sts->output->printf(sts->output,
+ " pc %016lx sp %016lx fp %016lx\n",
+ frame->pc, frame->sp, frame->fp);
+ sts->depth--;
+ return 0;
+ }
+ sts->output->printf(sts->output, " ...\n");
+
+ return sts->depth == 0;
+}
+
+void fiq_debugger_dump_stacktrace(struct fiq_debugger_output *output,
+ const struct pt_regs *regs, unsigned int depth, void *ssp)
+{
+ struct thread_info *real_thread_info = THREAD_INFO(ssp);
+ struct stacktrace_state sts;
+
+ sts.depth = depth;
+ sts.output = output;
+ *current_thread_info() = *real_thread_info;
+
+ if (!current)
+ output->printf(output, "current NULL\n");
+ else
+ output->printf(output, "pid: %d comm: %s\n",
+ current->pid, current->comm);
+ fiq_debugger_dump_regs(output, regs);
+
+ if (!user_mode(regs)) {
+ struct stackframe frame;
+ frame.fp = regs->regs[29];
+ frame.sp = regs->sp;
+ frame.pc = regs->pc;
+ output->printf(output, "\n");
+ walk_stackframe(current, &frame, report_trace, &sts);
+ }
+}
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger_priv.h b/drivers/staging/android/fiq_debugger/fiq_debugger_priv.h
new file mode 100644
index 000000000000..d5d051f727a8
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_debugger_priv.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _FIQ_DEBUGGER_PRIV_H_
+#define _FIQ_DEBUGGER_PRIV_H_
+
+#define THREAD_INFO(sp) ((struct thread_info *) \
+ ((unsigned long)(sp) & ~(THREAD_SIZE - 1)))
+
+struct fiq_debugger_output {
+ void (*printf)(struct fiq_debugger_output *output, const char *fmt, ...);
+};
+
+struct pt_regs;
+
+void fiq_debugger_dump_pc(struct fiq_debugger_output *output,
+ const struct pt_regs *regs);
+void fiq_debugger_dump_regs(struct fiq_debugger_output *output,
+ const struct pt_regs *regs);
+void fiq_debugger_dump_allregs(struct fiq_debugger_output *output,
+ const struct pt_regs *regs);
+void fiq_debugger_dump_stacktrace(struct fiq_debugger_output *output,
+ const struct pt_regs *regs, unsigned int depth, void *ssp);
+
+#endif
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h b/drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h
new file mode 100644
index 000000000000..10c3c5d09098
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h
@@ -0,0 +1,94 @@
+/*
+ * drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h
+ *
+ * simple lockless ringbuffer
+ *
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+struct fiq_debugger_ringbuf {
+ int len;
+ int head;
+ int tail;
+ u8 buf[];
+};
+
+
+static inline struct fiq_debugger_ringbuf *fiq_debugger_ringbuf_alloc(int len)
+{
+ struct fiq_debugger_ringbuf *rbuf;
+
+ rbuf = kzalloc(sizeof(*rbuf) + len, GFP_KERNEL);
+ if (rbuf == NULL)
+ return NULL;
+
+ rbuf->len = len;
+ rbuf->head = 0;
+ rbuf->tail = 0;
+ smp_mb();
+
+ return rbuf;
+}
+
+static inline void fiq_debugger_ringbuf_free(struct fiq_debugger_ringbuf *rbuf)
+{
+ kfree(rbuf);
+}
+
+static inline int fiq_debugger_ringbuf_level(struct fiq_debugger_ringbuf *rbuf)
+{
+ int level = rbuf->head - rbuf->tail;
+
+ if (level < 0)
+ level = rbuf->len + level;
+
+ return level;
+}
+
+static inline int fiq_debugger_ringbuf_room(struct fiq_debugger_ringbuf *rbuf)
+{
+ return rbuf->len - fiq_debugger_ringbuf_level(rbuf) - 1;
+}
+
+static inline u8
+fiq_debugger_ringbuf_peek(struct fiq_debugger_ringbuf *rbuf, int i)
+{
+ return rbuf->buf[(rbuf->tail + i) % rbuf->len];
+}
+
+static inline int
+fiq_debugger_ringbuf_consume(struct fiq_debugger_ringbuf *rbuf, int count)
+{
+ count = min(count, fiq_debugger_ringbuf_level(rbuf));
+
+ rbuf->tail = (rbuf->tail + count) % rbuf->len;
+ smp_mb();
+
+ return count;
+}
+
+static inline int
+fiq_debugger_ringbuf_push(struct fiq_debugger_ringbuf *rbuf, u8 datum)
+{
+ if (fiq_debugger_ringbuf_room(rbuf) == 0)
+ return 0;
+
+ rbuf->buf[rbuf->head] = datum;
+ smp_mb();
+ rbuf->head = (rbuf->head + 1) % rbuf->len;
+ smp_mb();
+
+ return 1;
+}
diff --git a/drivers/staging/android/fiq_debugger/fiq_watchdog.c b/drivers/staging/android/fiq_debugger/fiq_watchdog.c
new file mode 100644
index 000000000000..194b54138417
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_watchdog.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/pstore_ram.h>
+
+#include "fiq_watchdog.h"
+#include "fiq_debugger_priv.h"
+
+static DEFINE_RAW_SPINLOCK(fiq_watchdog_lock);
+
+static void fiq_watchdog_printf(struct fiq_debugger_output *output,
+ const char *fmt, ...)
+{
+ char buf[256];
+ va_list ap;
+ int len;
+
+ va_start(ap, fmt);
+ len = vscnprintf(buf, sizeof(buf), fmt, ap);
+ va_end(ap);
+
+ ramoops_console_write_buf(buf, len);
+}
+
+struct fiq_debugger_output fiq_watchdog_output = {
+ .printf = fiq_watchdog_printf,
+};
+
+void fiq_watchdog_triggered(const struct pt_regs *regs, void *svc_sp)
+{
+ char msg[24];
+ int len;
+
+ raw_spin_lock(&fiq_watchdog_lock);
+
+ len = scnprintf(msg, sizeof(msg), "watchdog fiq cpu %d\n",
+ THREAD_INFO(svc_sp)->cpu);
+ ramoops_console_write_buf(msg, len);
+
+ fiq_debugger_dump_stacktrace(&fiq_watchdog_output, regs, 100, svc_sp);
+
+ raw_spin_unlock(&fiq_watchdog_lock);
+}
diff --git a/drivers/staging/android/fiq_debugger/fiq_watchdog.h b/drivers/staging/android/fiq_debugger/fiq_watchdog.h
new file mode 100644
index 000000000000..c6b507f8d976
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_watchdog.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _FIQ_WATCHDOG_H_
+#define _FIQ_WATCHDOG_H_
+
+void fiq_watchdog_triggered(const struct pt_regs *regs, void *svc_sp);
+
+#endif
diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c
index 806e9b30b9dc..3af862c56d76 100644
--- a/drivers/staging/android/ion/ion.c
+++ b/drivers/staging/android/ion/ion.c
@@ -622,7 +622,7 @@ static int ion_debug_client_show(struct seq_file *s, void *unused)
mutex_lock(&debugfs_mutex);
if (!is_client_alive(client)) {
- seq_printf(s, "ion_client 0x%p dead, can't dump its buffers\n",
+ seq_printf(s, "ion_client 0x%pK dead, can't dump its buffers\n",
client);
mutex_unlock(&debugfs_mutex);
return 0;
@@ -775,7 +775,6 @@ void ion_client_destroy(struct ion_client *client)
struct ion_device *dev = client->dev;
struct rb_node *n;
- pr_debug("%s: %d\n", __func__, __LINE__);
mutex_lock(&debugfs_mutex);
while ((n = rb_first(&client->handles))) {
struct ion_handle *handle = rb_entry(n, struct ion_handle,
@@ -848,9 +847,6 @@ static void ion_buffer_sync_for_device(struct ion_buffer *buffer,
int pages = PAGE_ALIGN(buffer->size) / PAGE_SIZE;
int i;
- pr_debug("%s: syncing for device %s\n", __func__,
- dev ? dev_name(dev) : "null");
-
if (!ion_buffer_fault_user_mappings(buffer))
return;
@@ -904,7 +900,6 @@ static void ion_vm_open(struct vm_area_struct *vma)
mutex_lock(&buffer->lock);
list_add(&vma_list->list, &buffer->vmas);
mutex_unlock(&buffer->lock);
- pr_debug("%s: adding %p\n", __func__, vma);
}
static void ion_vm_close(struct vm_area_struct *vma)
@@ -912,14 +907,12 @@ static void ion_vm_close(struct vm_area_struct *vma)
struct ion_buffer *buffer = vma->vm_private_data;
struct ion_vma_list *vma_list, *tmp;
- pr_debug("%s\n", __func__);
mutex_lock(&buffer->lock);
list_for_each_entry_safe(vma_list, tmp, &buffer->vmas, list) {
if (vma_list->vma != vma)
continue;
list_del(&vma_list->list);
kfree(vma_list);
- pr_debug("%s: deleting %p\n", __func__, vma);
break;
}
mutex_unlock(&buffer->lock);
@@ -1231,7 +1224,6 @@ static int ion_release(struct inode *inode, struct file *file)
{
struct ion_client *client = file->private_data;
- pr_debug("%s: %d\n", __func__, __LINE__);
ion_client_destroy(client);
return 0;
}
@@ -1243,7 +1235,6 @@ static int ion_open(struct inode *inode, struct file *file)
struct ion_client *client;
char debug_name[64];
- pr_debug("%s: %d\n", __func__, __LINE__);
snprintf(debug_name, 64, "%u", task_pid_nr(current->group_leader));
client = ion_client_create(dev, debug_name);
if (IS_ERR(client))
diff --git a/drivers/staging/android/ion/ion_cma_heap.c b/drivers/staging/android/ion/ion_cma_heap.c
index 6c7de74bc7ab..0ca90bac4e61 100644
--- a/drivers/staging/android/ion/ion_cma_heap.c
+++ b/drivers/staging/android/ion/ion_cma_heap.c
@@ -49,8 +49,6 @@ static int ion_cma_allocate(struct ion_heap *heap, struct ion_buffer *buffer,
struct device *dev = cma_heap->dev;
struct ion_cma_buffer_info *info;
- dev_dbg(dev, "Request buffer allocation len %ld\n", len);
-
if (buffer->flags & ION_FLAG_CACHED)
return -EINVAL;
@@ -79,7 +77,6 @@ static int ion_cma_allocate(struct ion_heap *heap, struct ion_buffer *buffer,
/* keep this for memory release */
buffer->priv_virt = info;
buffer->sg_table = info->table;
- dev_dbg(dev, "Allocate buffer %p\n", buffer);
return 0;
free_table:
@@ -97,7 +94,6 @@ static void ion_cma_free(struct ion_buffer *buffer)
struct device *dev = cma_heap->dev;
struct ion_cma_buffer_info *info = buffer->priv_virt;
- dev_dbg(dev, "Release buffer %p\n", buffer);
/* release memory */
dma_free_coherent(dev, buffer->size, info->cpu_addr, info->handle);
/* release sg table */
diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c
index ec3b66561412..687be3615ff7 100644
--- a/drivers/staging/android/lowmemorykiller.c
+++ b/drivers/staging/android/lowmemorykiller.c
@@ -43,6 +43,9 @@
#include <linux/profile.h>
#include <linux/notifier.h>
+#define CREATE_TRACE_POINTS
+#include "trace/lowmemorykiller.h"
+
static u32 lowmem_debug_level = 1;
static short lowmem_adj[6] = {
0,
@@ -93,6 +96,7 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
int other_free = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
int other_file = global_node_page_state(NR_FILE_PAGES) -
global_node_page_state(NR_SHMEM) -
+ global_node_page_state(NR_UNEVICTABLE) -
total_swapcache_pages();
if (lowmem_adj_size < array_size)
@@ -160,23 +164,27 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
p->comm, p->pid, oom_score_adj, tasksize);
}
if (selected) {
+ long cache_size = other_file * (long)(PAGE_SIZE / 1024);
+ long cache_limit = minfree * (long)(PAGE_SIZE / 1024);
+ long free = other_free * (long)(PAGE_SIZE / 1024);
+
task_lock(selected);
send_sig(SIGKILL, selected, 0);
if (selected->mm)
task_set_lmk_waiting(selected);
task_unlock(selected);
- lowmem_print(1, "Killing '%s' (%d), adj %hd,\n"
+ trace_lowmemory_kill(selected, cache_size, cache_limit, free);
+ lowmem_print(1, "Killing '%s' (%d) (tgid %d), adj %hd,\n"
" to free %ldkB on behalf of '%s' (%d) because\n"
" cache %ldkB is below limit %ldkB for oom_score_adj %hd\n"
" Free memory is %ldkB above reserved\n",
- selected->comm, selected->pid,
+ selected->comm, selected->pid, selected->tgid,
selected_oom_score_adj,
selected_tasksize * (long)(PAGE_SIZE / 1024),
current->comm, current->pid,
- other_file * (long)(PAGE_SIZE / 1024),
- minfree * (long)(PAGE_SIZE / 1024),
+ cache_size, cache_limit,
min_score_adj,
- other_free * (long)(PAGE_SIZE / 1024));
+ free);
lowmem_deathpending_timeout = jiffies + HZ;
rem += selected_tasksize;
}
@@ -200,12 +208,96 @@ static int __init lowmem_init(void)
}
device_initcall(lowmem_init);
+#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_AUTODETECT_OOM_ADJ_VALUES
+static short lowmem_oom_adj_to_oom_score_adj(short oom_adj)
+{
+ if (oom_adj == OOM_ADJUST_MAX)
+ return OOM_SCORE_ADJ_MAX;
+ else
+ return (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
+}
+
+static void lowmem_autodetect_oom_adj_values(void)
+{
+ int i;
+ short oom_adj;
+ short oom_score_adj;
+ int array_size = ARRAY_SIZE(lowmem_adj);
+
+ if (lowmem_adj_size < array_size)
+ array_size = lowmem_adj_size;
+
+ if (array_size <= 0)
+ return;
+
+ oom_adj = lowmem_adj[array_size - 1];
+ if (oom_adj > OOM_ADJUST_MAX)
+ return;
+
+ oom_score_adj = lowmem_oom_adj_to_oom_score_adj(oom_adj);
+ if (oom_score_adj <= OOM_ADJUST_MAX)
+ return;
+
+ lowmem_print(1, "lowmem_shrink: convert oom_adj to oom_score_adj:\n");
+ for (i = 0; i < array_size; i++) {
+ oom_adj = lowmem_adj[i];
+ oom_score_adj = lowmem_oom_adj_to_oom_score_adj(oom_adj);
+ lowmem_adj[i] = oom_score_adj;
+ lowmem_print(1, "oom_adj %d => oom_score_adj %d\n",
+ oom_adj, oom_score_adj);
+ }
+}
+
+static int lowmem_adj_array_set(const char *val, const struct kernel_param *kp)
+{
+ int ret;
+
+ ret = param_array_ops.set(val, kp);
+
+ /* HACK: Autodetect oom_adj values in lowmem_adj array */
+ lowmem_autodetect_oom_adj_values();
+
+ return ret;
+}
+
+static int lowmem_adj_array_get(char *buffer, const struct kernel_param *kp)
+{
+ return param_array_ops.get(buffer, kp);
+}
+
+static void lowmem_adj_array_free(void *arg)
+{
+ param_array_ops.free(arg);
+}
+
+static struct kernel_param_ops lowmem_adj_array_ops = {
+ .set = lowmem_adj_array_set,
+ .get = lowmem_adj_array_get,
+ .free = lowmem_adj_array_free,
+};
+
+static const struct kparam_array __param_arr_adj = {
+ .max = ARRAY_SIZE(lowmem_adj),
+ .num = &lowmem_adj_size,
+ .ops = &param_ops_short,
+ .elemsize = sizeof(lowmem_adj[0]),
+ .elem = lowmem_adj,
+};
+#endif
+
/*
* not really modular, but the easiest way to keep compat with existing
* bootargs behaviour is to continue using module_param here.
*/
module_param_named(cost, lowmem_shrinker.seeks, int, 0644);
+#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_AUTODETECT_OOM_ADJ_VALUES
+module_param_cb(adj, &lowmem_adj_array_ops,
+ .arr = &__param_arr_adj,
+ 0644);
+__MODULE_PARM_TYPE(adj, "array of short");
+#else
module_param_array_named(adj, lowmem_adj, short, &lowmem_adj_size, 0644);
+#endif
module_param_array_named(minfree, lowmem_minfree, uint, &lowmem_minfree_size,
0644);
module_param_named(debug_level, lowmem_debug_level, uint, 0644);
diff --git a/drivers/staging/android/trace/lowmemorykiller.h b/drivers/staging/android/trace/lowmemorykiller.h
new file mode 100644
index 000000000000..f43d3fae75ee
--- /dev/null
+++ b/drivers/staging/android/trace/lowmemorykiller.h
@@ -0,0 +1,41 @@
+#undef TRACE_SYSTEM
+#define TRACE_INCLUDE_PATH ../../drivers/staging/android/trace
+#define TRACE_SYSTEM lowmemorykiller
+
+#if !defined(_TRACE_LOWMEMORYKILLER_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_LOWMEMORYKILLER_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(lowmemory_kill,
+ TP_PROTO(struct task_struct *killed_task, long cache_size, \
+ long cache_limit, long free),
+
+ TP_ARGS(killed_task, cache_size, cache_limit, free),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(long, pagecache_size)
+ __field(long, pagecache_limit)
+ __field(long, free)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, killed_task->comm, TASK_COMM_LEN);
+ __entry->pid = killed_task->pid;
+ __entry->pagecache_size = cache_size;
+ __entry->pagecache_limit = cache_limit;
+ __entry->free = free;
+ ),
+
+ TP_printk("%s (%d), page cache %ldkB (limit %ldkB), free %ldKb",
+ __entry->comm, __entry->pid, __entry->pagecache_size,
+ __entry->pagecache_limit, __entry->free)
+);
+
+
+#endif /* if !defined(_TRACE_LOWMEMORYKILLER_H) || defined(TRACE_HEADER_MULTI_READ) */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/drivers/staging/android/uapi/vsoc_shm.h b/drivers/staging/android/uapi/vsoc_shm.h
new file mode 100644
index 000000000000..741b1387c25b
--- /dev/null
+++ b/drivers/staging/android/uapi/vsoc_shm.h
@@ -0,0 +1,303 @@
+/*
+ * Copyright (C) 2017 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _UAPI_LINUX_VSOC_SHM_H
+#define _UAPI_LINUX_VSOC_SHM_H
+
+#include <linux/types.h>
+
+/**
+ * A permission is a token that permits a receiver to read and/or write an area
+ * of memory within a Vsoc region.
+ *
+ * An fd_scoped permission grants both read and write access, and can be
+ * attached to a file description (see open(2)).
+ * Ownership of the area can then be shared by passing a file descriptor
+ * among processes.
+ *
+ * begin_offset and end_offset define the area of memory that is controlled by
+ * the permission. owner_offset points to a word, also in shared memory, that
+ * controls ownership of the area.
+ *
+ * ownership of the region expires when the associated file description is
+ * released.
+ *
+ * At most one permission can be attached to each file description.
+ *
+ * This is useful when implementing HALs like gralloc that scope and pass
+ * ownership of shared resources via file descriptors.
+ *
+ * The caller is responsibe for doing any fencing.
+ *
+ * The calling process will normally identify a currently free area of
+ * memory. It will construct a proposed fd_scoped_permission_arg structure:
+ *
+ * begin_offset and end_offset describe the area being claimed
+ *
+ * owner_offset points to the location in shared memory that indicates the
+ * owner of the area.
+ *
+ * owned_value is the value that will be stored in owner_offset iff the
+ * permission can be granted. It must be different than VSOC_REGION_FREE.
+ *
+ * Two fd_scoped_permission structures are compatible if they vary only by
+ * their owned_value fields.
+ *
+ * The driver ensures that, for any group of simultaneous callers proposing
+ * compatible fd_scoped_permissions, it will accept exactly one of the
+ * propopsals. The other callers will get a failure with errno of EAGAIN.
+ *
+ * A process receiving a file descriptor can identify the region being
+ * granted using the VSOC_GET_FD_SCOPED_PERMISSION ioctl.
+ */
+struct fd_scoped_permission {
+ __u32 begin_offset;
+ __u32 end_offset;
+ __u32 owner_offset;
+ __u32 owned_value;
+};
+
+/*
+ * This value represents a free area of memory. The driver expects to see this
+ * value at owner_offset when creating a permission otherwise it will not do it,
+ * and will write this value back once the permission is no longer needed.
+ */
+#define VSOC_REGION_FREE ((__u32)0)
+
+/**
+ * ioctl argument for VSOC_CREATE_FD_SCOPE_PERMISSION
+ */
+struct fd_scoped_permission_arg {
+ struct fd_scoped_permission perm;
+ __s32 managed_region_fd;
+};
+
+#define VSOC_NODE_FREE ((__u32)0)
+
+/*
+ * Describes a signal table in shared memory. Each non-zero entry in the
+ * table indicates that the receiver should signal the futex at the given
+ * offset. Offsets are relative to the region, not the shared memory window.
+ *
+ * interrupt_signalled_offset is used to reliably signal interrupts across the
+ * vmm boundary. There are two roles: transmitter and receiver. For example,
+ * in the host_to_guest_signal_table the host is the transmitter and the
+ * guest is the receiver. The protocol is as follows:
+ *
+ * 1. The transmitter should convert the offset of the futex to an offset
+ * in the signal table [0, (1 << num_nodes_lg2))
+ * The transmitter can choose any appropriate hashing algorithm, including
+ * hash = futex_offset & ((1 << num_nodes_lg2) - 1)
+ *
+ * 3. The transmitter should atomically compare and swap futex_offset with 0
+ * at hash. There are 3 possible outcomes
+ * a. The swap fails because the futex_offset is already in the table.
+ * The transmitter should stop.
+ * b. Some other offset is in the table. This is a hash collision. The
+ * transmitter should move to another table slot and try again. One
+ * possible algorithm:
+ * hash = (hash + 1) & ((1 << num_nodes_lg2) - 1)
+ * c. The swap worked. Continue below.
+ *
+ * 3. The transmitter atomically swaps 1 with the value at the
+ * interrupt_signalled_offset. There are two outcomes:
+ * a. The prior value was 1. In this case an interrupt has already been
+ * posted. The transmitter is done.
+ * b. The prior value was 0, indicating that the receiver may be sleeping.
+ * The transmitter will issue an interrupt.
+ *
+ * 4. On waking the receiver immediately exchanges a 0 with the
+ * interrupt_signalled_offset. If it receives a 0 then this a spurious
+ * interrupt. That may occasionally happen in the current protocol, but
+ * should be rare.
+ *
+ * 5. The receiver scans the signal table by atomicaly exchanging 0 at each
+ * location. If a non-zero offset is returned from the exchange the
+ * receiver wakes all sleepers at the given offset:
+ * futex((int*)(region_base + old_value), FUTEX_WAKE, MAX_INT);
+ *
+ * 6. The receiver thread then does a conditional wait, waking immediately
+ * if the value at interrupt_signalled_offset is non-zero. This catches cases
+ * here additional signals were posted while the table was being scanned.
+ * On the guest the wait is handled via the VSOC_WAIT_FOR_INCOMING_INTERRUPT
+ * ioctl.
+ */
+struct vsoc_signal_table_layout {
+ /* log_2(Number of signal table entries) */
+ __u32 num_nodes_lg2;
+ /*
+ * Offset to the first signal table entry relative to the start of the
+ * region
+ */
+ __u32 futex_uaddr_table_offset;
+ /*
+ * Offset to an atomic_t / atomic uint32_t. A non-zero value indicates
+ * that one or more offsets are currently posted in the table.
+ * semi-unique access to an entry in the table
+ */
+ __u32 interrupt_signalled_offset;
+};
+
+#define VSOC_REGION_WHOLE ((__s32)0)
+#define VSOC_DEVICE_NAME_SZ 16
+
+/**
+ * Each HAL would (usually) talk to a single device region
+ * Mulitple entities care about these regions:
+ * - The ivshmem_server will populate the regions in shared memory
+ * - The guest kernel will read the region, create minor device nodes, and
+ * allow interested parties to register for FUTEX_WAKE events in the region
+ * - HALs will access via the minor device nodes published by the guest kernel
+ * - Host side processes will access the region via the ivshmem_server:
+ * 1. Pass name to ivshmem_server at a UNIX socket
+ * 2. ivshmemserver will reply with 2 fds:
+ * - host->guest doorbell fd
+ * - guest->host doorbell fd
+ * - fd for the shared memory region
+ * - region offset
+ * 3. Start a futex receiver thread on the doorbell fd pointed at the
+ * signal_nodes
+ */
+struct vsoc_device_region {
+ __u16 current_version;
+ __u16 min_compatible_version;
+ __u32 region_begin_offset;
+ __u32 region_end_offset;
+ __u32 offset_of_region_data;
+ struct vsoc_signal_table_layout guest_to_host_signal_table;
+ struct vsoc_signal_table_layout host_to_guest_signal_table;
+ /* Name of the device. Must always be terminated with a '\0', so
+ * the longest supported device name is 15 characters.
+ */
+ char device_name[VSOC_DEVICE_NAME_SZ];
+ /* There are two ways that permissions to access regions are handled:
+ * - When subdivided_by is VSOC_REGION_WHOLE, any process that can
+ * open the device node for the region gains complete access to it.
+ * - When subdivided is set processes that open the region cannot
+ * access it. Access to a sub-region must be established by invoking
+ * the VSOC_CREATE_FD_SCOPE_PERMISSION ioctl on the region
+ * referenced in subdivided_by, providing a fileinstance
+ * (represented by a fd) opened on this region.
+ */
+ __u32 managed_by;
+};
+
+/*
+ * The vsoc layout descriptor.
+ * The first 4K should be reserved for the shm header and region descriptors.
+ * The regions should be page aligned.
+ */
+
+struct vsoc_shm_layout_descriptor {
+ __u16 major_version;
+ __u16 minor_version;
+
+ /* size of the shm. This may be redundant but nice to have */
+ __u32 size;
+
+ /* number of shared memory regions */
+ __u32 region_count;
+
+ /* The offset to the start of region descriptors */
+ __u32 vsoc_region_desc_offset;
+};
+
+/*
+ * This specifies the current version that should be stored in
+ * vsoc_shm_layout_descriptor.major_version and
+ * vsoc_shm_layout_descriptor.minor_version.
+ * It should be updated only if the vsoc_device_region and
+ * vsoc_shm_layout_descriptor structures have changed.
+ * Versioning within each region is transferred
+ * via the min_compatible_version and current_version fields in
+ * vsoc_device_region. The driver does not consult these fields: they are left
+ * for the HALs and host processes and will change independently of the layout
+ * version.
+ */
+#define CURRENT_VSOC_LAYOUT_MAJOR_VERSION 2
+#define CURRENT_VSOC_LAYOUT_MINOR_VERSION 0
+
+#define VSOC_CREATE_FD_SCOPED_PERMISSION \
+ _IOW(0xF5, 0, struct fd_scoped_permission)
+#define VSOC_GET_FD_SCOPED_PERMISSION _IOR(0xF5, 1, struct fd_scoped_permission)
+
+/*
+ * This is used to signal the host to scan the guest_to_host_signal_table
+ * for new futexes to wake. This sends an interrupt if one is not already
+ * in flight.
+ */
+#define VSOC_MAYBE_SEND_INTERRUPT_TO_HOST _IO(0xF5, 2)
+
+/*
+ * When this returns the guest will scan host_to_guest_signal_table to
+ * check for new futexes to wake.
+ */
+/* TODO(ghartman): Consider moving this to the bottom half */
+#define VSOC_WAIT_FOR_INCOMING_INTERRUPT _IO(0xF5, 3)
+
+/*
+ * Guest HALs will use this to retrieve the region description after
+ * opening their device node.
+ */
+#define VSOC_DESCRIBE_REGION _IOR(0xF5, 4, struct vsoc_device_region)
+
+/*
+ * Wake any threads that may be waiting for a host interrupt on this region.
+ * This is mostly used during shutdown.
+ */
+#define VSOC_SELF_INTERRUPT _IO(0xF5, 5)
+
+/*
+ * This is used to signal the host to scan the guest_to_host_signal_table
+ * for new futexes to wake. This sends an interrupt unconditionally.
+ */
+#define VSOC_SEND_INTERRUPT_TO_HOST _IO(0xF5, 6)
+
+enum wait_types {
+ VSOC_WAIT_UNDEFINED = 0,
+ VSOC_WAIT_IF_EQUAL = 1,
+ VSOC_WAIT_IF_EQUAL_TIMEOUT = 2
+};
+
+/*
+ * Wait for a condition to be true
+ *
+ * Note, this is sized and aligned so the 32 bit and 64 bit layouts are
+ * identical.
+ */
+struct vsoc_cond_wait {
+ /* Input: Offset of the 32 bit word to check */
+ __u32 offset;
+ /* Input: Value that will be compared with the offset */
+ __u32 value;
+ /* Monotonic time to wake at in seconds */
+ __u64 wake_time_sec;
+ /* Input: Monotonic time to wait in nanoseconds */
+ __u32 wake_time_nsec;
+ /* Input: Type of wait */
+ __u32 wait_type;
+ /* Output: Number of times the thread woke before returning. */
+ __u32 wakes;
+ /* Ensure that we're 8-byte aligned and 8 byte length for 32/64 bit
+ * compatibility.
+ */
+ __u32 reserved_1;
+};
+
+#define VSOC_COND_WAIT _IOWR(0xF5, 7, struct vsoc_cond_wait)
+
+/* Wake any local threads waiting at the offset given in arg */
+#define VSOC_COND_WAKE _IO(0xF5, 8)
+
+#endif /* _UAPI_LINUX_VSOC_SHM_H */
diff --git a/drivers/staging/android/vsoc.c b/drivers/staging/android/vsoc.c
new file mode 100644
index 000000000000..954ed2c5d807
--- /dev/null
+++ b/drivers/staging/android/vsoc.c
@@ -0,0 +1,1165 @@
+/*
+ * drivers/android/staging/vsoc.c
+ *
+ * Android Virtual System on a Chip (VSoC) driver
+ *
+ * Copyright (C) 2017 Google, Inc.
+ *
+ * Author: ghartman@google.com
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ *
+ * Based on drivers/char/kvm_ivshmem.c - driver for KVM Inter-VM shared memory
+ * Copyright 2009 Cam Macdonell <cam@cs.ualberta.ca>
+ *
+ * Based on cirrusfb.c and 8139cp.c:
+ * Copyright 1999-2001 Jeff Garzik
+ * Copyright 2001-2004 Jeff Garzik
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/freezer.h>
+#include <linux/futex.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
+#include <linux/cdev.h>
+#include <linux/file.h>
+#include "uapi/vsoc_shm.h"
+
+#define VSOC_DEV_NAME "vsoc"
+
+/*
+ * Description of the ivshmem-doorbell PCI device used by QEmu. These
+ * constants follow docs/specs/ivshmem-spec.txt, which can be found in
+ * the QEmu repository. This was last reconciled with the version that
+ * came out with 2.8
+ */
+
+/*
+ * These constants are determined KVM Inter-VM shared memory device
+ * register offsets
+ */
+enum {
+ INTR_MASK = 0x00, /* Interrupt Mask */
+ INTR_STATUS = 0x04, /* Interrupt Status */
+ IV_POSITION = 0x08, /* VM ID */
+ DOORBELL = 0x0c, /* Doorbell */
+};
+
+static const int REGISTER_BAR; /* Equal to 0 */
+static const int MAX_REGISTER_BAR_LEN = 0x100;
+/*
+ * The MSI-x BAR is not used directly.
+ *
+ * static const int MSI_X_BAR = 1;
+ */
+static const int SHARED_MEMORY_BAR = 2;
+
+struct vsoc_region_data {
+ char name[VSOC_DEVICE_NAME_SZ + 1];
+ wait_queue_head_t interrupt_wait_queue;
+ /* TODO(b/73664181): Use multiple futex wait queues */
+ wait_queue_head_t futex_wait_queue;
+ /* Flag indicating that an interrupt has been signalled by the host. */
+ atomic_t *incoming_signalled;
+ /* Flag indicating the guest has signalled the host. */
+ atomic_t *outgoing_signalled;
+ bool irq_requested;
+ bool device_created;
+};
+
+struct vsoc_device {
+ /* Kernel virtual address of REGISTER_BAR. */
+ void __iomem *regs;
+ /* Physical address of SHARED_MEMORY_BAR. */
+ phys_addr_t shm_phys_start;
+ /* Kernel virtual address of SHARED_MEMORY_BAR. */
+ void __iomem *kernel_mapped_shm;
+ /* Size of the entire shared memory window in bytes. */
+ size_t shm_size;
+ /*
+ * Pointer to the virtual address of the shared memory layout structure.
+ * This is probably identical to kernel_mapped_shm, but saving this
+ * here saves a lot of annoying casts.
+ */
+ struct vsoc_shm_layout_descriptor *layout;
+ /*
+ * Points to a table of region descriptors in the kernel's virtual
+ * address space. Calculated from
+ * vsoc_shm_layout_descriptor.vsoc_region_desc_offset
+ */
+ struct vsoc_device_region *regions;
+ /* Head of a list of permissions that have been granted. */
+ struct list_head permissions;
+ struct pci_dev *dev;
+ /* Per-region (and therefore per-interrupt) information. */
+ struct vsoc_region_data *regions_data;
+ /*
+ * Table of msi-x entries. This has to be separated from struct
+ * vsoc_region_data because the kernel deals with them as an array.
+ */
+ struct msix_entry *msix_entries;
+ /* Mutex that protectes the permission list */
+ struct mutex mtx;
+ /* Major number assigned by the kernel */
+ int major;
+ /* Character device assigned by the kernel */
+ struct cdev cdev;
+ /* Device class assigned by the kernel */
+ struct class *class;
+ /*
+ * Flags that indicate what we've initialized. These are used to do an
+ * orderly cleanup of the device.
+ */
+ bool enabled_device;
+ bool requested_regions;
+ bool cdev_added;
+ bool class_added;
+ bool msix_enabled;
+};
+
+static struct vsoc_device vsoc_dev;
+
+/*
+ * TODO(ghartman): Add a /sys filesystem entry that summarizes the permissions.
+ */
+
+struct fd_scoped_permission_node {
+ struct fd_scoped_permission permission;
+ struct list_head list;
+};
+
+struct vsoc_private_data {
+ struct fd_scoped_permission_node *fd_scoped_permission_node;
+};
+
+static long vsoc_ioctl(struct file *, unsigned int, unsigned long);
+static int vsoc_mmap(struct file *, struct vm_area_struct *);
+static int vsoc_open(struct inode *, struct file *);
+static int vsoc_release(struct inode *, struct file *);
+static ssize_t vsoc_read(struct file *, char __user *, size_t, loff_t *);
+static ssize_t vsoc_write(struct file *, const char __user *, size_t, loff_t *);
+static loff_t vsoc_lseek(struct file *filp, loff_t offset, int origin);
+static int do_create_fd_scoped_permission(
+ struct vsoc_device_region *region_p,
+ struct fd_scoped_permission_node *np,
+ struct fd_scoped_permission_arg __user *arg);
+static void do_destroy_fd_scoped_permission(
+ struct vsoc_device_region *owner_region_p,
+ struct fd_scoped_permission *perm);
+static long do_vsoc_describe_region(struct file *,
+ struct vsoc_device_region __user *);
+static ssize_t vsoc_get_area(struct file *filp, __u32 *perm_off);
+
+/**
+ * Validate arguments on entry points to the driver.
+ */
+inline int vsoc_validate_inode(struct inode *inode)
+{
+ if (iminor(inode) >= vsoc_dev.layout->region_count) {
+ dev_err(&vsoc_dev.dev->dev,
+ "describe_region: invalid region %d\n", iminor(inode));
+ return -ENODEV;
+ }
+ return 0;
+}
+
+inline int vsoc_validate_filep(struct file *filp)
+{
+ int ret = vsoc_validate_inode(file_inode(filp));
+
+ if (ret)
+ return ret;
+ if (!filp->private_data) {
+ dev_err(&vsoc_dev.dev->dev,
+ "No private data on fd, region %d\n",
+ iminor(file_inode(filp)));
+ return -EBADFD;
+ }
+ return 0;
+}
+
+/* Converts from shared memory offset to virtual address */
+static inline void *shm_off_to_virtual_addr(__u32 offset)
+{
+ return (void __force *)vsoc_dev.kernel_mapped_shm + offset;
+}
+
+/* Converts from shared memory offset to physical address */
+static inline phys_addr_t shm_off_to_phys_addr(__u32 offset)
+{
+ return vsoc_dev.shm_phys_start + offset;
+}
+
+/**
+ * Convenience functions to obtain the region from the inode or file.
+ * Dangerous to call before validating the inode/file.
+ */
+static inline struct vsoc_device_region *vsoc_region_from_inode(
+ struct inode *inode)
+{
+ return &vsoc_dev.regions[iminor(inode)];
+}
+
+static inline struct vsoc_device_region *vsoc_region_from_filep(
+ struct file *inode)
+{
+ return vsoc_region_from_inode(file_inode(inode));
+}
+
+static inline uint32_t vsoc_device_region_size(struct vsoc_device_region *r)
+{
+ return r->region_end_offset - r->region_begin_offset;
+}
+
+static const struct file_operations vsoc_ops = {
+ .owner = THIS_MODULE,
+ .open = vsoc_open,
+ .mmap = vsoc_mmap,
+ .read = vsoc_read,
+ .unlocked_ioctl = vsoc_ioctl,
+ .compat_ioctl = vsoc_ioctl,
+ .write = vsoc_write,
+ .llseek = vsoc_lseek,
+ .release = vsoc_release,
+};
+
+static struct pci_device_id vsoc_id_table[] = {
+ {0x1af4, 0x1110, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+ {0},
+};
+
+MODULE_DEVICE_TABLE(pci, vsoc_id_table);
+
+static void vsoc_remove_device(struct pci_dev *pdev);
+static int vsoc_probe_device(struct pci_dev *pdev,
+ const struct pci_device_id *ent);
+
+static struct pci_driver vsoc_pci_driver = {
+ .name = "vsoc",
+ .id_table = vsoc_id_table,
+ .probe = vsoc_probe_device,
+ .remove = vsoc_remove_device,
+};
+
+static int do_create_fd_scoped_permission(
+ struct vsoc_device_region *region_p,
+ struct fd_scoped_permission_node *np,
+ struct fd_scoped_permission_arg __user *arg)
+{
+ struct file *managed_filp;
+ s32 managed_fd;
+ atomic_t *owner_ptr = NULL;
+ struct vsoc_device_region *managed_region_p;
+
+ if (copy_from_user(&np->permission, &arg->perm, sizeof(*np)) ||
+ copy_from_user(&managed_fd,
+ &arg->managed_region_fd, sizeof(managed_fd))) {
+ return -EFAULT;
+ }
+ managed_filp = fdget(managed_fd).file;
+ /* Check that it's a valid fd, */
+ if (!managed_filp || vsoc_validate_filep(managed_filp))
+ return -EPERM;
+ /* EEXIST if the given fd already has a permission. */
+ if (((struct vsoc_private_data *)managed_filp->private_data)->
+ fd_scoped_permission_node)
+ return -EEXIST;
+ managed_region_p = vsoc_region_from_filep(managed_filp);
+ /* Check that the provided region is managed by this one */
+ if (&vsoc_dev.regions[managed_region_p->managed_by] != region_p)
+ return -EPERM;
+ /* The area must be well formed and have non-zero size */
+ if (np->permission.begin_offset >= np->permission.end_offset)
+ return -EINVAL;
+ /* The area must fit in the memory window */
+ if (np->permission.end_offset >
+ vsoc_device_region_size(managed_region_p))
+ return -ERANGE;
+ /* The area must be in the region data section */
+ if (np->permission.begin_offset <
+ managed_region_p->offset_of_region_data)
+ return -ERANGE;
+ /* The area must be page aligned */
+ if (!PAGE_ALIGNED(np->permission.begin_offset) ||
+ !PAGE_ALIGNED(np->permission.end_offset))
+ return -EINVAL;
+ /* Owner offset must be naturally aligned in the window */
+ if (np->permission.owner_offset &
+ (sizeof(np->permission.owner_offset) - 1))
+ return -EINVAL;
+ /* The owner flag must reside in the owner memory */
+ if (np->permission.owner_offset + sizeof(np->permission.owner_offset) >
+ vsoc_device_region_size(region_p))
+ return -ERANGE;
+ /* The owner flag must reside in the data section */
+ if (np->permission.owner_offset < region_p->offset_of_region_data)
+ return -EINVAL;
+ /* The owner value must change to claim the memory */
+ if (np->permission.owned_value == VSOC_REGION_FREE)
+ return -EINVAL;
+ owner_ptr =
+ (atomic_t *)shm_off_to_virtual_addr(region_p->region_begin_offset +
+ np->permission.owner_offset);
+ /* We've already verified that this is in the shared memory window, so
+ * it should be safe to write to this address.
+ */
+ if (atomic_cmpxchg(owner_ptr,
+ VSOC_REGION_FREE,
+ np->permission.owned_value) != VSOC_REGION_FREE) {
+ return -EBUSY;
+ }
+ ((struct vsoc_private_data *)managed_filp->private_data)->
+ fd_scoped_permission_node = np;
+ /* The file offset needs to be adjusted if the calling
+ * process did any read/write operations on the fd
+ * before creating the permission.
+ */
+ if (managed_filp->f_pos) {
+ if (managed_filp->f_pos > np->permission.end_offset) {
+ /* If the offset is beyond the permission end, set it
+ * to the end.
+ */
+ managed_filp->f_pos = np->permission.end_offset;
+ } else {
+ /* If the offset is within the permission interval
+ * keep it there otherwise reset it to zero.
+ */
+ if (managed_filp->f_pos < np->permission.begin_offset) {
+ managed_filp->f_pos = 0;
+ } else {
+ managed_filp->f_pos -=
+ np->permission.begin_offset;
+ }
+ }
+ }
+ return 0;
+}
+
+static void do_destroy_fd_scoped_permission_node(
+ struct vsoc_device_region *owner_region_p,
+ struct fd_scoped_permission_node *node)
+{
+ if (node) {
+ do_destroy_fd_scoped_permission(owner_region_p,
+ &node->permission);
+ mutex_lock(&vsoc_dev.mtx);
+ list_del(&node->list);
+ mutex_unlock(&vsoc_dev.mtx);
+ kfree(node);
+ }
+}
+
+static void do_destroy_fd_scoped_permission(
+ struct vsoc_device_region *owner_region_p,
+ struct fd_scoped_permission *perm)
+{
+ atomic_t *owner_ptr = NULL;
+ int prev = 0;
+
+ if (!perm)
+ return;
+ owner_ptr = (atomic_t *)shm_off_to_virtual_addr(
+ owner_region_p->region_begin_offset + perm->owner_offset);
+ prev = atomic_xchg(owner_ptr, VSOC_REGION_FREE);
+ if (prev != perm->owned_value)
+ dev_err(&vsoc_dev.dev->dev,
+ "%x-%x: owner (%s) %x: expected to be %x was %x",
+ perm->begin_offset, perm->end_offset,
+ owner_region_p->device_name, perm->owner_offset,
+ perm->owned_value, prev);
+}
+
+static long do_vsoc_describe_region(struct file *filp,
+ struct vsoc_device_region __user *dest)
+{
+ struct vsoc_device_region *region_p;
+ int retval = vsoc_validate_filep(filp);
+
+ if (retval)
+ return retval;
+ region_p = vsoc_region_from_filep(filp);
+ if (copy_to_user(dest, region_p, sizeof(*region_p)))
+ return -EFAULT;
+ return 0;
+}
+
+/**
+ * Implements the inner logic of cond_wait. Copies to and from userspace are
+ * done in the helper function below.
+ */
+static int handle_vsoc_cond_wait(struct file *filp, struct vsoc_cond_wait *arg)
+{
+ DEFINE_WAIT(wait);
+ u32 region_number = iminor(file_inode(filp));
+ struct vsoc_region_data *data = vsoc_dev.regions_data + region_number;
+ struct hrtimer_sleeper timeout, *to = NULL;
+ int ret = 0;
+ struct vsoc_device_region *region_p = vsoc_region_from_filep(filp);
+ atomic_t *address = NULL;
+ struct timespec ts;
+
+ /* Ensure that the offset is aligned */
+ if (arg->offset & (sizeof(uint32_t) - 1))
+ return -EADDRNOTAVAIL;
+ /* Ensure that the offset is within shared memory */
+ if (((uint64_t)arg->offset) + region_p->region_begin_offset +
+ sizeof(uint32_t) > region_p->region_end_offset)
+ return -E2BIG;
+ address = shm_off_to_virtual_addr(region_p->region_begin_offset +
+ arg->offset);
+
+ /* Ensure that the type of wait is valid */
+ switch (arg->wait_type) {
+ case VSOC_WAIT_IF_EQUAL:
+ break;
+ case VSOC_WAIT_IF_EQUAL_TIMEOUT:
+ to = &timeout;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (to) {
+ /* Copy the user-supplied timesec into the kernel structure.
+ * We do things this way to flatten differences between 32 bit
+ * and 64 bit timespecs.
+ */
+ ts.tv_sec = arg->wake_time_sec;
+ ts.tv_nsec = arg->wake_time_nsec;
+
+ if (!timespec_valid(&ts))
+ return -EINVAL;
+ hrtimer_init_on_stack(&to->timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
+ hrtimer_set_expires_range_ns(&to->timer, timespec_to_ktime(ts),
+ current->timer_slack_ns);
+
+ hrtimer_init_sleeper(to, current);
+ }
+
+ while (1) {
+ prepare_to_wait(&data->futex_wait_queue, &wait,
+ TASK_INTERRUPTIBLE);
+ /*
+ * Check the sentinel value after prepare_to_wait. If the value
+ * changes after this check the writer will call signal,
+ * changing the task state from INTERRUPTIBLE to RUNNING. That
+ * will ensure that schedule() will eventually schedule this
+ * task.
+ */
+ if (atomic_read(address) != arg->value) {
+ ret = 0;
+ break;
+ }
+ if (to) {
+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+ if (likely(to->task))
+ freezable_schedule();
+ hrtimer_cancel(&to->timer);
+ if (!to->task) {
+ ret = -ETIMEDOUT;
+ break;
+ }
+ } else {
+ freezable_schedule();
+ }
+ /* Count the number of times that we woke up. This is useful
+ * for unit testing.
+ */
+ ++arg->wakes;
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+ }
+ finish_wait(&data->futex_wait_queue, &wait);
+ if (to)
+ destroy_hrtimer_on_stack(&to->timer);
+ return ret;
+}
+
+/**
+ * Handles the details of copying from/to userspace to ensure that the copies
+ * happen on all of the return paths of cond_wait.
+ */
+static int do_vsoc_cond_wait(struct file *filp,
+ struct vsoc_cond_wait __user *untrusted_in)
+{
+ struct vsoc_cond_wait arg;
+ int rval = 0;
+
+ if (copy_from_user(&arg, untrusted_in, sizeof(arg)))
+ return -EFAULT;
+ /* wakes is an out parameter. Initialize it to something sensible. */
+ arg.wakes = 0;
+ rval = handle_vsoc_cond_wait(filp, &arg);
+ if (copy_to_user(untrusted_in, &arg, sizeof(arg)))
+ return -EFAULT;
+ return rval;
+}
+
+static int do_vsoc_cond_wake(struct file *filp, uint32_t offset)
+{
+ struct vsoc_device_region *region_p = vsoc_region_from_filep(filp);
+ u32 region_number = iminor(file_inode(filp));
+ struct vsoc_region_data *data = vsoc_dev.regions_data + region_number;
+ /* Ensure that the offset is aligned */
+ if (offset & (sizeof(uint32_t) - 1))
+ return -EADDRNOTAVAIL;
+ /* Ensure that the offset is within shared memory */
+ if (((uint64_t)offset) + region_p->region_begin_offset +
+ sizeof(uint32_t) > region_p->region_end_offset)
+ return -E2BIG;
+ /*
+ * TODO(b/73664181): Use multiple futex wait queues.
+ * We need to wake every sleeper when the condition changes. Typically
+ * only a single thread will be waiting on the condition, but there
+ * are exceptions. The worst case is about 10 threads.
+ */
+ wake_up_interruptible_all(&data->futex_wait_queue);
+ return 0;
+}
+
+static long vsoc_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ int rv = 0;
+ struct vsoc_device_region *region_p;
+ u32 reg_num;
+ struct vsoc_region_data *reg_data;
+ int retval = vsoc_validate_filep(filp);
+
+ if (retval)
+ return retval;
+ region_p = vsoc_region_from_filep(filp);
+ reg_num = iminor(file_inode(filp));
+ reg_data = vsoc_dev.regions_data + reg_num;
+ switch (cmd) {
+ case VSOC_CREATE_FD_SCOPED_PERMISSION:
+ {
+ struct fd_scoped_permission_node *node = NULL;
+
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ /* We can't allocate memory for the permission */
+ if (!node)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&node->list);
+ rv = do_create_fd_scoped_permission(
+ region_p,
+ node,
+ (struct fd_scoped_permission_arg __user *)arg);
+ if (!rv) {
+ mutex_lock(&vsoc_dev.mtx);
+ list_add(&node->list, &vsoc_dev.permissions);
+ mutex_unlock(&vsoc_dev.mtx);
+ } else {
+ kfree(node);
+ return rv;
+ }
+ }
+ break;
+
+ case VSOC_GET_FD_SCOPED_PERMISSION:
+ {
+ struct fd_scoped_permission_node *node =
+ ((struct vsoc_private_data *)filp->private_data)->
+ fd_scoped_permission_node;
+ if (!node)
+ return -ENOENT;
+ if (copy_to_user
+ ((struct fd_scoped_permission __user *)arg,
+ &node->permission, sizeof(node->permission)))
+ return -EFAULT;
+ }
+ break;
+
+ case VSOC_MAYBE_SEND_INTERRUPT_TO_HOST:
+ if (!atomic_xchg(
+ reg_data->outgoing_signalled,
+ 1)) {
+ writel(reg_num, vsoc_dev.regs + DOORBELL);
+ return 0;
+ } else {
+ return -EBUSY;
+ }
+ break;
+
+ case VSOC_SEND_INTERRUPT_TO_HOST:
+ writel(reg_num, vsoc_dev.regs + DOORBELL);
+ return 0;
+
+ case VSOC_WAIT_FOR_INCOMING_INTERRUPT:
+ wait_event_interruptible(
+ reg_data->interrupt_wait_queue,
+ (atomic_read(reg_data->incoming_signalled) != 0));
+ break;
+
+ case VSOC_DESCRIBE_REGION:
+ return do_vsoc_describe_region(
+ filp,
+ (struct vsoc_device_region __user *)arg);
+
+ case VSOC_SELF_INTERRUPT:
+ atomic_set(reg_data->incoming_signalled, 1);
+ wake_up_interruptible(&reg_data->interrupt_wait_queue);
+ break;
+
+ case VSOC_COND_WAIT:
+ return do_vsoc_cond_wait(filp,
+ (struct vsoc_cond_wait __user *)arg);
+ case VSOC_COND_WAKE:
+ return do_vsoc_cond_wake(filp, arg);
+
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static ssize_t vsoc_read(struct file *filp, char __user *buffer, size_t len,
+ loff_t *poffset)
+{
+ __u32 area_off;
+ const void *area_p;
+ ssize_t area_len;
+ int retval = vsoc_validate_filep(filp);
+
+ if (retval)
+ return retval;
+ area_len = vsoc_get_area(filp, &area_off);
+ area_p = shm_off_to_virtual_addr(area_off);
+ area_p += *poffset;
+ area_len -= *poffset;
+ if (area_len <= 0)
+ return 0;
+ if (area_len < len)
+ len = area_len;
+ if (copy_to_user(buffer, area_p, len))
+ return -EFAULT;
+ *poffset += len;
+ return len;
+}
+
+static loff_t vsoc_lseek(struct file *filp, loff_t offset, int origin)
+{
+ ssize_t area_len = 0;
+ int retval = vsoc_validate_filep(filp);
+
+ if (retval)
+ return retval;
+ area_len = vsoc_get_area(filp, NULL);
+ switch (origin) {
+ case SEEK_SET:
+ break;
+
+ case SEEK_CUR:
+ if (offset > 0 && offset + filp->f_pos < 0)
+ return -EOVERFLOW;
+ offset += filp->f_pos;
+ break;
+
+ case SEEK_END:
+ if (offset > 0 && offset + area_len < 0)
+ return -EOVERFLOW;
+ offset += area_len;
+ break;
+
+ case SEEK_DATA:
+ if (offset >= area_len)
+ return -EINVAL;
+ if (offset < 0)
+ offset = 0;
+ break;
+
+ case SEEK_HOLE:
+ /* Next hole is always the end of the region, unless offset is
+ * beyond that
+ */
+ if (offset < area_len)
+ offset = area_len;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ if (offset < 0 || offset > area_len)
+ return -EINVAL;
+ filp->f_pos = offset;
+
+ return offset;
+}
+
+static ssize_t vsoc_write(struct file *filp, const char __user *buffer,
+ size_t len, loff_t *poffset)
+{
+ __u32 area_off;
+ void *area_p;
+ ssize_t area_len;
+ int retval = vsoc_validate_filep(filp);
+
+ if (retval)
+ return retval;
+ area_len = vsoc_get_area(filp, &area_off);
+ area_p = shm_off_to_virtual_addr(area_off);
+ area_p += *poffset;
+ area_len -= *poffset;
+ if (area_len <= 0)
+ return 0;
+ if (area_len < len)
+ len = area_len;
+ if (copy_from_user(area_p, buffer, len))
+ return -EFAULT;
+ *poffset += len;
+ return len;
+}
+
+static irqreturn_t vsoc_interrupt(int irq, void *region_data_v)
+{
+ struct vsoc_region_data *region_data =
+ (struct vsoc_region_data *)region_data_v;
+ int reg_num = region_data - vsoc_dev.regions_data;
+
+ if (unlikely(!region_data))
+ return IRQ_NONE;
+
+ if (unlikely(reg_num < 0 ||
+ reg_num >= vsoc_dev.layout->region_count)) {
+ dev_err(&vsoc_dev.dev->dev,
+ "invalid irq @%p reg_num=0x%04x\n",
+ region_data, reg_num);
+ return IRQ_NONE;
+ }
+ if (unlikely(vsoc_dev.regions_data + reg_num != region_data)) {
+ dev_err(&vsoc_dev.dev->dev,
+ "irq not aligned @%p reg_num=0x%04x\n",
+ region_data, reg_num);
+ return IRQ_NONE;
+ }
+ wake_up_interruptible(&region_data->interrupt_wait_queue);
+ return IRQ_HANDLED;
+}
+
+static int vsoc_probe_device(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ int result;
+ int i;
+ resource_size_t reg_size;
+ dev_t devt;
+
+ vsoc_dev.dev = pdev;
+ result = pci_enable_device(pdev);
+ if (result) {
+ dev_err(&pdev->dev,
+ "pci_enable_device failed %s: error %d\n",
+ pci_name(pdev), result);
+ return result;
+ }
+ vsoc_dev.enabled_device = true;
+ result = pci_request_regions(pdev, "vsoc");
+ if (result < 0) {
+ dev_err(&pdev->dev, "pci_request_regions failed\n");
+ vsoc_remove_device(pdev);
+ return -EBUSY;
+ }
+ vsoc_dev.requested_regions = true;
+ /* Set up the control registers in BAR 0 */
+ reg_size = pci_resource_len(pdev, REGISTER_BAR);
+ if (reg_size > MAX_REGISTER_BAR_LEN)
+ vsoc_dev.regs =
+ pci_iomap(pdev, REGISTER_BAR, MAX_REGISTER_BAR_LEN);
+ else
+ vsoc_dev.regs = pci_iomap(pdev, REGISTER_BAR, reg_size);
+
+ if (!vsoc_dev.regs) {
+ dev_err(&pdev->dev,
+ "cannot map registers of size %zu\n",
+ (size_t)reg_size);
+ vsoc_remove_device(pdev);
+ return -EBUSY;
+ }
+
+ /* Map the shared memory in BAR 2 */
+ vsoc_dev.shm_phys_start = pci_resource_start(pdev, SHARED_MEMORY_BAR);
+ vsoc_dev.shm_size = pci_resource_len(pdev, SHARED_MEMORY_BAR);
+
+ dev_info(&pdev->dev, "shared memory @ DMA %pa size=0x%zx\n",
+ &vsoc_dev.shm_phys_start, vsoc_dev.shm_size);
+ vsoc_dev.kernel_mapped_shm = pci_iomap_wc(pdev, SHARED_MEMORY_BAR, 0);
+ if (!vsoc_dev.kernel_mapped_shm) {
+ dev_err(&vsoc_dev.dev->dev, "cannot iomap region\n");
+ vsoc_remove_device(pdev);
+ return -EBUSY;
+ }
+
+ vsoc_dev.layout = (struct vsoc_shm_layout_descriptor __force *)
+ vsoc_dev.kernel_mapped_shm;
+ dev_info(&pdev->dev, "major_version: %d\n",
+ vsoc_dev.layout->major_version);
+ dev_info(&pdev->dev, "minor_version: %d\n",
+ vsoc_dev.layout->minor_version);
+ dev_info(&pdev->dev, "size: 0x%x\n", vsoc_dev.layout->size);
+ dev_info(&pdev->dev, "regions: %d\n", vsoc_dev.layout->region_count);
+ if (vsoc_dev.layout->major_version !=
+ CURRENT_VSOC_LAYOUT_MAJOR_VERSION) {
+ dev_err(&vsoc_dev.dev->dev,
+ "driver supports only major_version %d\n",
+ CURRENT_VSOC_LAYOUT_MAJOR_VERSION);
+ vsoc_remove_device(pdev);
+ return -EBUSY;
+ }
+ result = alloc_chrdev_region(&devt, 0, vsoc_dev.layout->region_count,
+ VSOC_DEV_NAME);
+ if (result) {
+ dev_err(&vsoc_dev.dev->dev, "alloc_chrdev_region failed\n");
+ vsoc_remove_device(pdev);
+ return -EBUSY;
+ }
+ vsoc_dev.major = MAJOR(devt);
+ cdev_init(&vsoc_dev.cdev, &vsoc_ops);
+ vsoc_dev.cdev.owner = THIS_MODULE;
+ result = cdev_add(&vsoc_dev.cdev, devt, vsoc_dev.layout->region_count);
+ if (result) {
+ dev_err(&vsoc_dev.dev->dev, "cdev_add error\n");
+ vsoc_remove_device(pdev);
+ return -EBUSY;
+ }
+ vsoc_dev.cdev_added = true;
+ vsoc_dev.class = class_create(THIS_MODULE, VSOC_DEV_NAME);
+ if (IS_ERR(vsoc_dev.class)) {
+ dev_err(&vsoc_dev.dev->dev, "class_create failed\n");
+ vsoc_remove_device(pdev);
+ return PTR_ERR(vsoc_dev.class);
+ }
+ vsoc_dev.class_added = true;
+ vsoc_dev.regions = (struct vsoc_device_region __force *)
+ ((void *)vsoc_dev.layout +
+ vsoc_dev.layout->vsoc_region_desc_offset);
+ vsoc_dev.msix_entries = kcalloc(
+ vsoc_dev.layout->region_count,
+ sizeof(vsoc_dev.msix_entries[0]), GFP_KERNEL);
+ if (!vsoc_dev.msix_entries) {
+ dev_err(&vsoc_dev.dev->dev,
+ "unable to allocate msix_entries\n");
+ vsoc_remove_device(pdev);
+ return -ENOSPC;
+ }
+ vsoc_dev.regions_data = kcalloc(
+ vsoc_dev.layout->region_count,
+ sizeof(vsoc_dev.regions_data[0]), GFP_KERNEL);
+ if (!vsoc_dev.regions_data) {
+ dev_err(&vsoc_dev.dev->dev,
+ "unable to allocate regions' data\n");
+ vsoc_remove_device(pdev);
+ return -ENOSPC;
+ }
+ for (i = 0; i < vsoc_dev.layout->region_count; ++i)
+ vsoc_dev.msix_entries[i].entry = i;
+
+ result = pci_enable_msix_exact(vsoc_dev.dev, vsoc_dev.msix_entries,
+ vsoc_dev.layout->region_count);
+ if (result) {
+ dev_info(&pdev->dev, "pci_enable_msix failed: %d\n", result);
+ vsoc_remove_device(pdev);
+ return -ENOSPC;
+ }
+ /* Check that all regions are well formed */
+ for (i = 0; i < vsoc_dev.layout->region_count; ++i) {
+ const struct vsoc_device_region *region = vsoc_dev.regions + i;
+
+ if (!PAGE_ALIGNED(region->region_begin_offset) ||
+ !PAGE_ALIGNED(region->region_end_offset)) {
+ dev_err(&vsoc_dev.dev->dev,
+ "region %d not aligned (%x:%x)", i,
+ region->region_begin_offset,
+ region->region_end_offset);
+ vsoc_remove_device(pdev);
+ return -EFAULT;
+ }
+ if (region->region_begin_offset >= region->region_end_offset ||
+ region->region_end_offset > vsoc_dev.shm_size) {
+ dev_err(&vsoc_dev.dev->dev,
+ "region %d offsets are wrong: %x %x %zx",
+ i, region->region_begin_offset,
+ region->region_end_offset, vsoc_dev.shm_size);
+ vsoc_remove_device(pdev);
+ return -EFAULT;
+ }
+ if (region->managed_by >= vsoc_dev.layout->region_count) {
+ dev_err(&vsoc_dev.dev->dev,
+ "region %d has invalid owner: %u",
+ i, region->managed_by);
+ vsoc_remove_device(pdev);
+ return -EFAULT;
+ }
+ }
+ vsoc_dev.msix_enabled = true;
+ for (i = 0; i < vsoc_dev.layout->region_count; ++i) {
+ const struct vsoc_device_region *region = vsoc_dev.regions + i;
+ size_t name_sz = sizeof(vsoc_dev.regions_data[i].name) - 1;
+ const struct vsoc_signal_table_layout *h_to_g_signal_table =
+ &region->host_to_guest_signal_table;
+ const struct vsoc_signal_table_layout *g_to_h_signal_table =
+ &region->guest_to_host_signal_table;
+
+ vsoc_dev.regions_data[i].name[name_sz] = '\0';
+ memcpy(vsoc_dev.regions_data[i].name, region->device_name,
+ name_sz);
+ dev_info(&pdev->dev, "region %d name=%s\n",
+ i, vsoc_dev.regions_data[i].name);
+ init_waitqueue_head(
+ &vsoc_dev.regions_data[i].interrupt_wait_queue);
+ init_waitqueue_head(&vsoc_dev.regions_data[i].futex_wait_queue);
+ vsoc_dev.regions_data[i].incoming_signalled =
+ shm_off_to_virtual_addr(region->region_begin_offset) +
+ h_to_g_signal_table->interrupt_signalled_offset;
+ vsoc_dev.regions_data[i].outgoing_signalled =
+ shm_off_to_virtual_addr(region->region_begin_offset) +
+ g_to_h_signal_table->interrupt_signalled_offset;
+ result = request_irq(
+ vsoc_dev.msix_entries[i].vector,
+ vsoc_interrupt, 0,
+ vsoc_dev.regions_data[i].name,
+ vsoc_dev.regions_data + i);
+ if (result) {
+ dev_info(&pdev->dev,
+ "request_irq failed irq=%d vector=%d\n",
+ i, vsoc_dev.msix_entries[i].vector);
+ vsoc_remove_device(pdev);
+ return -ENOSPC;
+ }
+ vsoc_dev.regions_data[i].irq_requested = true;
+ if (!device_create(vsoc_dev.class, NULL,
+ MKDEV(vsoc_dev.major, i),
+ NULL, vsoc_dev.regions_data[i].name)) {
+ dev_err(&vsoc_dev.dev->dev, "device_create failed\n");
+ vsoc_remove_device(pdev);
+ return -EBUSY;
+ }
+ vsoc_dev.regions_data[i].device_created = true;
+ }
+ return 0;
+}
+
+/*
+ * This should undo all of the allocations in the probe function in reverse
+ * order.
+ *
+ * Notes:
+ *
+ * The device may have been partially initialized, so double check
+ * that the allocations happened.
+ *
+ * This function may be called multiple times, so mark resources as freed
+ * as they are deallocated.
+ */
+static void vsoc_remove_device(struct pci_dev *pdev)
+{
+ int i;
+ /*
+ * pdev is the first thing to be set on probe and the last thing
+ * to be cleared here. If it's NULL then there is no cleanup.
+ */
+ if (!pdev || !vsoc_dev.dev)
+ return;
+ dev_info(&pdev->dev, "remove_device\n");
+ if (vsoc_dev.regions_data) {
+ for (i = 0; i < vsoc_dev.layout->region_count; ++i) {
+ if (vsoc_dev.regions_data[i].device_created) {
+ device_destroy(vsoc_dev.class,
+ MKDEV(vsoc_dev.major, i));
+ vsoc_dev.regions_data[i].device_created = false;
+ }
+ if (vsoc_dev.regions_data[i].irq_requested)
+ free_irq(vsoc_dev.msix_entries[i].vector, NULL);
+ vsoc_dev.regions_data[i].irq_requested = false;
+ }
+ kfree(vsoc_dev.regions_data);
+ vsoc_dev.regions_data = NULL;
+ }
+ if (vsoc_dev.msix_enabled) {
+ pci_disable_msix(pdev);
+ vsoc_dev.msix_enabled = false;
+ }
+ kfree(vsoc_dev.msix_entries);
+ vsoc_dev.msix_entries = NULL;
+ vsoc_dev.regions = NULL;
+ if (vsoc_dev.class_added) {
+ class_destroy(vsoc_dev.class);
+ vsoc_dev.class_added = false;
+ }
+ if (vsoc_dev.cdev_added) {
+ cdev_del(&vsoc_dev.cdev);
+ vsoc_dev.cdev_added = false;
+ }
+ if (vsoc_dev.major && vsoc_dev.layout) {
+ unregister_chrdev_region(MKDEV(vsoc_dev.major, 0),
+ vsoc_dev.layout->region_count);
+ vsoc_dev.major = 0;
+ }
+ vsoc_dev.layout = NULL;
+ if (vsoc_dev.kernel_mapped_shm) {
+ pci_iounmap(pdev, vsoc_dev.kernel_mapped_shm);
+ vsoc_dev.kernel_mapped_shm = NULL;
+ }
+ if (vsoc_dev.regs) {
+ pci_iounmap(pdev, vsoc_dev.regs);
+ vsoc_dev.regs = NULL;
+ }
+ if (vsoc_dev.requested_regions) {
+ pci_release_regions(pdev);
+ vsoc_dev.requested_regions = false;
+ }
+ if (vsoc_dev.enabled_device) {
+ pci_disable_device(pdev);
+ vsoc_dev.enabled_device = false;
+ }
+ /* Do this last: it indicates that the device is not initialized. */
+ vsoc_dev.dev = NULL;
+}
+
+static void __exit vsoc_cleanup_module(void)
+{
+ vsoc_remove_device(vsoc_dev.dev);
+ pci_unregister_driver(&vsoc_pci_driver);
+}
+
+static int __init vsoc_init_module(void)
+{
+ int err = -ENOMEM;
+
+ INIT_LIST_HEAD(&vsoc_dev.permissions);
+ mutex_init(&vsoc_dev.mtx);
+
+ err = pci_register_driver(&vsoc_pci_driver);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+static int vsoc_open(struct inode *inode, struct file *filp)
+{
+ /* Can't use vsoc_validate_filep because filp is still incomplete */
+ int ret = vsoc_validate_inode(inode);
+
+ if (ret)
+ return ret;
+ filp->private_data =
+ kzalloc(sizeof(struct vsoc_private_data), GFP_KERNEL);
+ if (!filp->private_data)
+ return -ENOMEM;
+ return 0;
+}
+
+static int vsoc_release(struct inode *inode, struct file *filp)
+{
+ struct vsoc_private_data *private_data = NULL;
+ struct fd_scoped_permission_node *node = NULL;
+ struct vsoc_device_region *owner_region_p = NULL;
+ int retval = vsoc_validate_filep(filp);
+
+ if (retval)
+ return retval;
+ private_data = (struct vsoc_private_data *)filp->private_data;
+ if (!private_data)
+ return 0;
+
+ node = private_data->fd_scoped_permission_node;
+ if (node) {
+ owner_region_p = vsoc_region_from_inode(inode);
+ if (owner_region_p->managed_by != VSOC_REGION_WHOLE) {
+ owner_region_p =
+ &vsoc_dev.regions[owner_region_p->managed_by];
+ }
+ do_destroy_fd_scoped_permission_node(owner_region_p, node);
+ private_data->fd_scoped_permission_node = NULL;
+ }
+ kfree(private_data);
+ filp->private_data = NULL;
+
+ return 0;
+}
+
+/*
+ * Returns the device relative offset and length of the area specified by the
+ * fd scoped permission. If there is no fd scoped permission set, a default
+ * permission covering the entire region is assumed, unless the region is owned
+ * by another one, in which case the default is a permission with zero size.
+ */
+static ssize_t vsoc_get_area(struct file *filp, __u32 *area_offset)
+{
+ __u32 off = 0;
+ ssize_t length = 0;
+ struct vsoc_device_region *region_p;
+ struct fd_scoped_permission *perm;
+
+ region_p = vsoc_region_from_filep(filp);
+ off = region_p->region_begin_offset;
+ perm = &((struct vsoc_private_data *)filp->private_data)->
+ fd_scoped_permission_node->permission;
+ if (perm) {
+ off += perm->begin_offset;
+ length = perm->end_offset - perm->begin_offset;
+ } else if (region_p->managed_by == VSOC_REGION_WHOLE) {
+ /* No permission set and the regions is not owned by another,
+ * default to full region access.
+ */
+ length = vsoc_device_region_size(region_p);
+ } else {
+ /* return zero length, access is denied. */
+ length = 0;
+ }
+ if (area_offset)
+ *area_offset = off;
+ return length;
+}
+
+static int vsoc_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ unsigned long len = vma->vm_end - vma->vm_start;
+ __u32 area_off;
+ phys_addr_t mem_off;
+ ssize_t area_len;
+ int retval = vsoc_validate_filep(filp);
+
+ if (retval)
+ return retval;
+ area_len = vsoc_get_area(filp, &area_off);
+ /* Add the requested offset */
+ area_off += (vma->vm_pgoff << PAGE_SHIFT);
+ area_len -= (vma->vm_pgoff << PAGE_SHIFT);
+ if (area_len < len)
+ return -EINVAL;
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ mem_off = shm_off_to_phys_addr(area_off);
+ if (io_remap_pfn_range(vma, vma->vm_start, mem_off >> PAGE_SHIFT,
+ len, vma->vm_page_prot))
+ return -EAGAIN;
+ return 0;
+}
+
+module_init(vsoc_init_module);
+module_exit(vsoc_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Greg Hartman <ghartman@google.com>");
+MODULE_DESCRIPTION("VSoC interpretation of QEmu's ivshmem device");
+MODULE_VERSION("1.0");
diff --git a/drivers/staging/goldfish/Kconfig b/drivers/staging/goldfish/Kconfig
index 4e094602437c..d293bbc22c79 100644
--- a/drivers/staging/goldfish/Kconfig
+++ b/drivers/staging/goldfish/Kconfig
@@ -4,6 +4,14 @@ config GOLDFISH_AUDIO
---help---
Emulated audio channel for the Goldfish Android Virtual Device
+config GOLDFISH_SYNC
+ tristate "Goldfish AVD Sync Driver"
+ depends on GOLDFISH
+ depends on SW_SYNC
+ depends on SYNC_FILE
+ ---help---
+ Emulated sync fences for the Goldfish Android Virtual Device
+
config MTD_GOLDFISH_NAND
tristate "Goldfish NAND device"
depends on GOLDFISH
diff --git a/drivers/staging/goldfish/Makefile b/drivers/staging/goldfish/Makefile
index dec34ad58162..3313fce4e940 100644
--- a/drivers/staging/goldfish/Makefile
+++ b/drivers/staging/goldfish/Makefile
@@ -4,3 +4,9 @@
obj-$(CONFIG_GOLDFISH_AUDIO) += goldfish_audio.o
obj-$(CONFIG_MTD_GOLDFISH_NAND) += goldfish_nand.o
+
+# and sync
+
+ccflags-y := -Idrivers/staging/android
+goldfish_sync-objs := goldfish_sync_timeline_fence.o goldfish_sync_timeline.o
+obj-$(CONFIG_GOLDFISH_SYNC) += goldfish_sync.o
diff --git a/drivers/staging/goldfish/goldfish_audio.c b/drivers/staging/goldfish/goldfish_audio.c
index bd559956f199..0bb0ee2e691f 100644
--- a/drivers/staging/goldfish/goldfish_audio.c
+++ b/drivers/staging/goldfish/goldfish_audio.c
@@ -28,6 +28,7 @@
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/goldfish.h>
+#include <linux/acpi.h>
MODULE_AUTHOR("Google, Inc.");
MODULE_DESCRIPTION("Android QEMU Audio Driver");
@@ -116,6 +117,7 @@ static ssize_t goldfish_audio_read(struct file *fp, char __user *buf,
size_t count, loff_t *pos)
{
struct goldfish_audio *data = fp->private_data;
+ unsigned long irq_flags;
int length;
int result = 0;
@@ -129,6 +131,10 @@ static ssize_t goldfish_audio_read(struct file *fp, char __user *buf,
wait_event_interruptible(data->wait, data->buffer_status &
AUDIO_INT_READ_BUFFER_FULL);
+ spin_lock_irqsave(&data->lock, irq_flags);
+ data->buffer_status &= ~AUDIO_INT_READ_BUFFER_FULL;
+ spin_unlock_irqrestore(&data->lock, irq_flags);
+
length = AUDIO_READ(data, AUDIO_READ_BUFFER_AVAILABLE);
/* copy data to user space */
@@ -351,12 +357,19 @@ static const struct of_device_id goldfish_audio_of_match[] = {
};
MODULE_DEVICE_TABLE(of, goldfish_audio_of_match);
+static const struct acpi_device_id goldfish_audio_acpi_match[] = {
+ { "GFSH0005", 0 },
+ { },
+};
+MODULE_DEVICE_TABLE(acpi, goldfish_audio_acpi_match);
+
static struct platform_driver goldfish_audio_driver = {
.probe = goldfish_audio_probe,
.remove = goldfish_audio_remove,
.driver = {
.name = "goldfish_audio",
.of_match_table = goldfish_audio_of_match,
+ .acpi_match_table = ACPI_PTR(goldfish_audio_acpi_match),
}
};
diff --git a/drivers/staging/goldfish/goldfish_sync_timeline.c b/drivers/staging/goldfish/goldfish_sync_timeline.c
new file mode 100644
index 000000000000..5bef4c6c0283
--- /dev/null
+++ b/drivers/staging/goldfish/goldfish_sync_timeline.c
@@ -0,0 +1,962 @@
+/*
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/fdtable.h>
+#include <linux/file.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+
+#include <linux/interrupt.h>
+#include <linux/kref.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/acpi.h>
+
+#include <linux/string.h>
+
+#include <linux/fs.h>
+#include <linux/syscalls.h>
+#include <linux/sync_file.h>
+#include <linux/fence.h>
+
+#include "goldfish_sync_timeline_fence.h"
+
+#define ERR(...) printk(KERN_ERR __VA_ARGS__);
+
+#define INFO(...) printk(KERN_INFO __VA_ARGS__);
+
+#define DPRINT(...) pr_debug(__VA_ARGS__);
+
+#define DTRACE() DPRINT("%s: enter", __func__)
+
+/* The Goldfish sync driver is designed to provide a interface
+ * between the underlying host's sync device and the kernel's
+ * fence sync framework..
+ * The purpose of the device/driver is to enable lightweight
+ * creation and signaling of timelines and fences
+ * in order to synchronize the guest with host-side graphics events.
+ *
+ * Each time the interrupt trips, the driver
+ * may perform a sync operation.
+ */
+
+/* The operations are: */
+
+/* Ready signal - used to mark when irq should lower */
+#define CMD_SYNC_READY 0
+
+/* Create a new timeline. writes timeline handle */
+#define CMD_CREATE_SYNC_TIMELINE 1
+
+/* Create a fence object. reads timeline handle and time argument.
+ * Writes fence fd to the SYNC_REG_HANDLE register. */
+#define CMD_CREATE_SYNC_FENCE 2
+
+/* Increments timeline. reads timeline handle and time argument */
+#define CMD_SYNC_TIMELINE_INC 3
+
+/* Destroys a timeline. reads timeline handle */
+#define CMD_DESTROY_SYNC_TIMELINE 4
+
+/* Starts a wait on the host with
+ * the given glsync object and sync thread handle. */
+#define CMD_TRIGGER_HOST_WAIT 5
+
+/* The register layout is: */
+
+#define SYNC_REG_BATCH_COMMAND 0x00 /* host->guest batch commands */
+#define SYNC_REG_BATCH_GUESTCOMMAND 0x04 /* guest->host batch commands */
+#define SYNC_REG_BATCH_COMMAND_ADDR 0x08 /* communicate physical address of host->guest batch commands */
+#define SYNC_REG_BATCH_COMMAND_ADDR_HIGH 0x0c /* 64-bit part */
+#define SYNC_REG_BATCH_GUESTCOMMAND_ADDR 0x10 /* communicate physical address of guest->host commands */
+#define SYNC_REG_BATCH_GUESTCOMMAND_ADDR_HIGH 0x14 /* 64-bit part */
+#define SYNC_REG_INIT 0x18 /* signals that the device has been probed */
+
+/* There is an ioctl associated with goldfish sync driver.
+ * Make it conflict with ioctls that are not likely to be used
+ * in the emulator.
+ *
+ * '@' 00-0F linux/radeonfb.h conflict!
+ * '@' 00-0F drivers/video/aty/aty128fb.c conflict!
+ */
+#define GOLDFISH_SYNC_IOC_MAGIC '@'
+
+#define GOLDFISH_SYNC_IOC_QUEUE_WORK _IOWR(GOLDFISH_SYNC_IOC_MAGIC, 0, struct goldfish_sync_ioctl_info)
+
+/* The above definitions (command codes, register layout, ioctl definitions)
+ * need to be in sync with the following files:
+ *
+ * Host-side (emulator):
+ * external/qemu/android/emulation/goldfish_sync.h
+ * external/qemu-android/hw/misc/goldfish_sync.c
+ *
+ * Guest-side (system image):
+ * device/generic/goldfish-opengl/system/egl/goldfish_sync.h
+ * device/generic/goldfish/ueventd.ranchu.rc
+ * platform/build/target/board/generic/sepolicy/file_contexts
+ */
+struct goldfish_sync_hostcmd {
+ /* sorted for alignment */
+ uint64_t handle;
+ uint64_t hostcmd_handle;
+ uint32_t cmd;
+ uint32_t time_arg;
+};
+
+struct goldfish_sync_guestcmd {
+ uint64_t host_command; /* uint64_t for alignment */
+ uint64_t glsync_handle;
+ uint64_t thread_handle;
+ uint64_t guest_timeline_handle;
+};
+
+#define GOLDFISH_SYNC_MAX_CMDS 32
+
+struct goldfish_sync_state {
+ char __iomem *reg_base;
+ int irq;
+
+ /* Spinlock protects |to_do| / |to_do_end|. */
+ spinlock_t lock;
+ /* |mutex_lock| protects all concurrent access
+ * to timelines for both kernel and user space. */
+ struct mutex mutex_lock;
+
+ /* Buffer holding commands issued from host. */
+ struct goldfish_sync_hostcmd to_do[GOLDFISH_SYNC_MAX_CMDS];
+ uint32_t to_do_end;
+
+ /* Addresses for the reading or writing
+ * of individual commands. The host can directly write
+ * to |batch_hostcmd| (and then this driver immediately
+ * copies contents to |to_do|). This driver either replies
+ * through |batch_hostcmd| or simply issues a
+ * guest->host command through |batch_guestcmd|.
+ */
+ struct goldfish_sync_hostcmd *batch_hostcmd;
+ struct goldfish_sync_guestcmd *batch_guestcmd;
+
+ /* Used to give this struct itself to a work queue
+ * function for executing actual sync commands. */
+ struct work_struct work_item;
+};
+
+static struct goldfish_sync_state global_sync_state[1];
+
+struct goldfish_sync_timeline_obj {
+ struct goldfish_sync_timeline *sync_tl;
+ uint32_t current_time;
+ /* We need to be careful about when we deallocate
+ * this |goldfish_sync_timeline_obj| struct.
+ * In order to ensure proper cleanup, we need to
+ * consider the triggered host-side wait that may
+ * still be in flight when the guest close()'s a
+ * goldfish_sync device's sync context fd (and
+ * destroys the |sync_tl| field above).
+ * The host-side wait may raise IRQ
+ * and tell the kernel to increment the timeline _after_
+ * the |sync_tl| has already been set to null.
+ *
+ * From observations on OpenGL apps and CTS tests, this
+ * happens at some very low probability upon context
+ * destruction or process close, but it does happen
+ * and it needs to be handled properly. Otherwise,
+ * if we clean up the surrounding |goldfish_sync_timeline_obj|
+ * too early, any |handle| field of any host->guest command
+ * might not even point to a null |sync_tl| field,
+ * but to garbage memory or even a reclaimed |sync_tl|.
+ * If we do not count such "pending waits" and kfree the object
+ * immediately upon |goldfish_sync_timeline_destroy|,
+ * we might get mysterous RCU stalls after running a long
+ * time because the garbage memory that is being read
+ * happens to be interpretable as a |spinlock_t| struct
+ * that is currently in the locked state.
+ *
+ * To track when to free the |goldfish_sync_timeline_obj|
+ * itself, we maintain a kref.
+ * The kref essentially counts the timeline itself plus
+ * the number of waits in flight. kref_init/kref_put
+ * are issued on
+ * |goldfish_sync_timeline_create|/|goldfish_sync_timeline_destroy|
+ * and kref_get/kref_put are issued on
+ * |goldfish_sync_fence_create|/|goldfish_sync_timeline_inc|.
+ *
+ * The timeline is destroyed after reference count
+ * reaches zero, which would happen after
+ * |goldfish_sync_timeline_destroy| and all pending
+ * |goldfish_sync_timeline_inc|'s are fulfilled.
+ *
+ * NOTE (1): We assume that |fence_create| and
+ * |timeline_inc| calls are 1:1, otherwise the kref scheme
+ * will not work. This is a valid assumption as long
+ * as the host-side virtual device implementation
+ * does not insert any timeline increments
+ * that we did not trigger from here.
+ *
+ * NOTE (2): The use of kref by itself requires no locks,
+ * but this does not mean everything works without locks.
+ * Related timeline operations do require a lock of some sort,
+ * or at least are not proven to work without it.
+ * In particualr, we assume that all the operations
+ * done on the |kref| field above are done in contexts where
+ * |global_sync_state->mutex_lock| is held. Do not
+ * remove that lock until everything is proven to work
+ * without it!!! */
+ struct kref kref;
+};
+
+/* We will call |delete_timeline_obj| when the last reference count
+ * of the kref is decremented. This deletes the sync
+ * timeline object along with the wrapper itself. */
+static void delete_timeline_obj(struct kref* kref) {
+ struct goldfish_sync_timeline_obj* obj =
+ container_of(kref, struct goldfish_sync_timeline_obj, kref);
+
+ goldfish_sync_timeline_put_internal(obj->sync_tl);
+ obj->sync_tl = NULL;
+ kfree(obj);
+}
+
+static uint64_t gensym_ctr;
+static void gensym(char *dst)
+{
+ sprintf(dst, "goldfish_sync:gensym:%llu", gensym_ctr);
+ gensym_ctr++;
+}
+
+/* |goldfish_sync_timeline_create| assumes that |global_sync_state->mutex_lock|
+ * is held. */
+static struct goldfish_sync_timeline_obj*
+goldfish_sync_timeline_create(void)
+{
+
+ char timeline_name[256];
+ struct goldfish_sync_timeline *res_sync_tl = NULL;
+ struct goldfish_sync_timeline_obj *res;
+
+ DTRACE();
+
+ gensym(timeline_name);
+
+ res_sync_tl = goldfish_sync_timeline_create_internal(timeline_name);
+ if (!res_sync_tl) {
+ ERR("Failed to create goldfish_sw_sync timeline.");
+ return NULL;
+ }
+
+ res = kzalloc(sizeof(struct goldfish_sync_timeline_obj), GFP_KERNEL);
+ res->sync_tl = res_sync_tl;
+ res->current_time = 0;
+ kref_init(&res->kref);
+
+ DPRINT("new timeline_obj=0x%p", res);
+ return res;
+}
+
+/* |goldfish_sync_fence_create| assumes that |global_sync_state->mutex_lock|
+ * is held. */
+static int
+goldfish_sync_fence_create(struct goldfish_sync_timeline_obj *obj,
+ uint32_t val)
+{
+
+ int fd;
+ char fence_name[256];
+ struct sync_pt *syncpt = NULL;
+ struct sync_file *sync_file_obj = NULL;
+ struct goldfish_sync_timeline *tl;
+
+ DTRACE();
+
+ if (!obj) return -1;
+
+ tl = obj->sync_tl;
+
+ syncpt = goldfish_sync_pt_create_internal(
+ tl, sizeof(struct sync_pt) + 4, val);
+ if (!syncpt) {
+ ERR("could not create sync point! "
+ "goldfish_sync_timeline=0x%p val=%d",
+ tl, val);
+ return -1;
+ }
+
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0) {
+ ERR("could not get unused fd for sync fence. "
+ "errno=%d", fd);
+ goto err_cleanup_pt;
+ }
+
+ gensym(fence_name);
+
+ sync_file_obj = sync_file_create(&syncpt->base);
+ if (!sync_file_obj) {
+ ERR("could not create sync fence! "
+ "goldfish_sync_timeline=0x%p val=%d sync_pt=0x%p",
+ tl, val, syncpt);
+ goto err_cleanup_fd_pt;
+ }
+
+ DPRINT("installing sync fence into fd %d sync_file_obj=0x%p",
+ fd, sync_file_obj);
+ fd_install(fd, sync_file_obj->file);
+ kref_get(&obj->kref);
+
+ return fd;
+
+err_cleanup_fd_pt:
+ put_unused_fd(fd);
+err_cleanup_pt:
+ fence_put(&syncpt->base);
+ return -1;
+}
+
+/* |goldfish_sync_timeline_inc| assumes that |global_sync_state->mutex_lock|
+ * is held. */
+static void
+goldfish_sync_timeline_inc(struct goldfish_sync_timeline_obj *obj, uint32_t inc)
+{
+ DTRACE();
+ /* Just give up if someone else nuked the timeline.
+ * Whoever it was won't care that it doesn't get signaled. */
+ if (!obj) return;
+
+ DPRINT("timeline_obj=0x%p", obj);
+ goldfish_sync_timeline_signal_internal(obj->sync_tl, inc);
+ DPRINT("incremented timeline. increment max_time");
+ obj->current_time += inc;
+
+ /* Here, we will end up deleting the timeline object if it
+ * turns out that this call was a pending increment after
+ * |goldfish_sync_timeline_destroy| was called. */
+ kref_put(&obj->kref, delete_timeline_obj);
+ DPRINT("done");
+}
+
+/* |goldfish_sync_timeline_destroy| assumes
+ * that |global_sync_state->mutex_lock| is held. */
+static void
+goldfish_sync_timeline_destroy(struct goldfish_sync_timeline_obj *obj)
+{
+ DTRACE();
+ /* See description of |goldfish_sync_timeline_obj| for why we
+ * should not immediately destroy |obj| */
+ kref_put(&obj->kref, delete_timeline_obj);
+}
+
+static inline void
+goldfish_sync_cmd_queue(struct goldfish_sync_state *sync_state,
+ uint32_t cmd,
+ uint64_t handle,
+ uint32_t time_arg,
+ uint64_t hostcmd_handle)
+{
+ struct goldfish_sync_hostcmd *to_add;
+
+ DTRACE();
+
+ BUG_ON(sync_state->to_do_end == GOLDFISH_SYNC_MAX_CMDS);
+
+ to_add = &sync_state->to_do[sync_state->to_do_end];
+
+ to_add->cmd = cmd;
+ to_add->handle = handle;
+ to_add->time_arg = time_arg;
+ to_add->hostcmd_handle = hostcmd_handle;
+
+ sync_state->to_do_end += 1;
+}
+
+static inline void
+goldfish_sync_hostcmd_reply(struct goldfish_sync_state *sync_state,
+ uint32_t cmd,
+ uint64_t handle,
+ uint32_t time_arg,
+ uint64_t hostcmd_handle)
+{
+ unsigned long irq_flags;
+ struct goldfish_sync_hostcmd *batch_hostcmd =
+ sync_state->batch_hostcmd;
+
+ DTRACE();
+
+ spin_lock_irqsave(&sync_state->lock, irq_flags);
+
+ batch_hostcmd->cmd = cmd;
+ batch_hostcmd->handle = handle;
+ batch_hostcmd->time_arg = time_arg;
+ batch_hostcmd->hostcmd_handle = hostcmd_handle;
+ writel(0, sync_state->reg_base + SYNC_REG_BATCH_COMMAND);
+
+ spin_unlock_irqrestore(&sync_state->lock, irq_flags);
+}
+
+static inline void
+goldfish_sync_send_guestcmd(struct goldfish_sync_state *sync_state,
+ uint32_t cmd,
+ uint64_t glsync_handle,
+ uint64_t thread_handle,
+ uint64_t timeline_handle)
+{
+ unsigned long irq_flags;
+ struct goldfish_sync_guestcmd *batch_guestcmd =
+ sync_state->batch_guestcmd;
+
+ DTRACE();
+
+ spin_lock_irqsave(&sync_state->lock, irq_flags);
+
+ batch_guestcmd->host_command = (uint64_t)cmd;
+ batch_guestcmd->glsync_handle = (uint64_t)glsync_handle;
+ batch_guestcmd->thread_handle = (uint64_t)thread_handle;
+ batch_guestcmd->guest_timeline_handle = (uint64_t)timeline_handle;
+ writel(0, sync_state->reg_base + SYNC_REG_BATCH_GUESTCOMMAND);
+
+ spin_unlock_irqrestore(&sync_state->lock, irq_flags);
+}
+
+/* |goldfish_sync_interrupt| handles IRQ raises from the virtual device.
+ * In the context of OpenGL, this interrupt will fire whenever we need
+ * to signal a fence fd in the guest, with the command
+ * |CMD_SYNC_TIMELINE_INC|.
+ * However, because this function will be called in an interrupt context,
+ * it is necessary to do the actual work of signaling off of interrupt context.
+ * The shared work queue is used for this purpose. At the end when
+ * all pending commands are intercepted by the interrupt handler,
+ * we call |schedule_work|, which will later run the actual
+ * desired sync command in |goldfish_sync_work_item_fn|.
+ */
+static irqreturn_t goldfish_sync_interrupt(int irq, void *dev_id)
+{
+
+ struct goldfish_sync_state *sync_state = dev_id;
+
+ uint32_t nextcmd;
+ uint32_t command_r;
+ uint64_t handle_rw;
+ uint32_t time_r;
+ uint64_t hostcmd_handle_rw;
+
+ int count = 0;
+
+ DTRACE();
+
+ sync_state = dev_id;
+
+ spin_lock(&sync_state->lock);
+
+ for (;;) {
+
+ readl(sync_state->reg_base + SYNC_REG_BATCH_COMMAND);
+ nextcmd = sync_state->batch_hostcmd->cmd;
+
+ if (nextcmd == 0)
+ break;
+
+ command_r = nextcmd;
+ handle_rw = sync_state->batch_hostcmd->handle;
+ time_r = sync_state->batch_hostcmd->time_arg;
+ hostcmd_handle_rw = sync_state->batch_hostcmd->hostcmd_handle;
+
+ goldfish_sync_cmd_queue(
+ sync_state,
+ command_r,
+ handle_rw,
+ time_r,
+ hostcmd_handle_rw);
+
+ count++;
+ }
+
+ spin_unlock(&sync_state->lock);
+
+ schedule_work(&sync_state->work_item);
+
+ return (count == 0) ? IRQ_NONE : IRQ_HANDLED;
+}
+
+/* |goldfish_sync_work_item_fn| does the actual work of servicing
+ * host->guest sync commands. This function is triggered whenever
+ * the IRQ for the goldfish sync device is raised. Once it starts
+ * running, it grabs the contents of the buffer containing the
+ * commands it needs to execute (there may be multiple, because
+ * our IRQ is active high and not edge triggered), and then
+ * runs all of them one after the other.
+ */
+static void goldfish_sync_work_item_fn(struct work_struct *input)
+{
+
+ struct goldfish_sync_state *sync_state;
+ int sync_fence_fd;
+
+ struct goldfish_sync_timeline_obj *timeline;
+ uint64_t timeline_ptr;
+
+ uint64_t hostcmd_handle;
+
+ uint32_t cmd;
+ uint64_t handle;
+ uint32_t time_arg;
+
+ struct goldfish_sync_hostcmd *todo;
+ uint32_t todo_end;
+
+ unsigned long irq_flags;
+
+ struct goldfish_sync_hostcmd to_run[GOLDFISH_SYNC_MAX_CMDS];
+ uint32_t i = 0;
+
+ sync_state = container_of(input, struct goldfish_sync_state, work_item);
+
+ mutex_lock(&sync_state->mutex_lock);
+
+ spin_lock_irqsave(&sync_state->lock, irq_flags); {
+
+ todo_end = sync_state->to_do_end;
+
+ DPRINT("num sync todos: %u", sync_state->to_do_end);
+
+ for (i = 0; i < todo_end; i++)
+ to_run[i] = sync_state->to_do[i];
+
+ /* We expect that commands will come in at a slow enough rate
+ * so that incoming items will not be more than
+ * GOLDFISH_SYNC_MAX_CMDS.
+ *
+ * This is because the way the sync device is used,
+ * it's only for managing buffer data transfers per frame,
+ * with a sequential dependency between putting things in
+ * to_do and taking them out. Once a set of commands is
+ * queued up in to_do, the user of the device waits for
+ * them to be processed before queuing additional commands,
+ * which limits the rate at which commands come in
+ * to the rate at which we take them out here.
+ *
+ * We also don't expect more than MAX_CMDS to be issued
+ * at once; there is a correspondence between
+ * which buffers need swapping to the (display / buffer queue)
+ * to particular commands, and we don't expect there to be
+ * enough display or buffer queues in operation at once
+ * to overrun GOLDFISH_SYNC_MAX_CMDS.
+ */
+ sync_state->to_do_end = 0;
+
+ } spin_unlock_irqrestore(&sync_state->lock, irq_flags);
+
+ for (i = 0; i < todo_end; i++) {
+ DPRINT("todo index: %u", i);
+
+ todo = &to_run[i];
+
+ cmd = todo->cmd;
+
+ handle = (uint64_t)todo->handle;
+ time_arg = todo->time_arg;
+ hostcmd_handle = (uint64_t)todo->hostcmd_handle;
+
+ DTRACE();
+
+ timeline = (struct goldfish_sync_timeline_obj *)(uintptr_t)handle;
+
+ switch (cmd) {
+ case CMD_SYNC_READY:
+ break;
+ case CMD_CREATE_SYNC_TIMELINE:
+ DPRINT("exec CMD_CREATE_SYNC_TIMELINE: "
+ "handle=0x%llx time_arg=%d",
+ handle, time_arg);
+ timeline = goldfish_sync_timeline_create();
+ timeline_ptr = (uintptr_t)timeline;
+ goldfish_sync_hostcmd_reply(sync_state, CMD_CREATE_SYNC_TIMELINE,
+ timeline_ptr,
+ 0,
+ hostcmd_handle);
+ DPRINT("sync timeline created: %p", timeline);
+ break;
+ case CMD_CREATE_SYNC_FENCE:
+ DPRINT("exec CMD_CREATE_SYNC_FENCE: "
+ "handle=0x%llx time_arg=%d",
+ handle, time_arg);
+ sync_fence_fd = goldfish_sync_fence_create(timeline, time_arg);
+ goldfish_sync_hostcmd_reply(sync_state, CMD_CREATE_SYNC_FENCE,
+ sync_fence_fd,
+ 0,
+ hostcmd_handle);
+ break;
+ case CMD_SYNC_TIMELINE_INC:
+ DPRINT("exec CMD_SYNC_TIMELINE_INC: "
+ "handle=0x%llx time_arg=%d",
+ handle, time_arg);
+ goldfish_sync_timeline_inc(timeline, time_arg);
+ break;
+ case CMD_DESTROY_SYNC_TIMELINE:
+ DPRINT("exec CMD_DESTROY_SYNC_TIMELINE: "
+ "handle=0x%llx time_arg=%d",
+ handle, time_arg);
+ goldfish_sync_timeline_destroy(timeline);
+ break;
+ }
+ DPRINT("Done executing sync command");
+ }
+ mutex_unlock(&sync_state->mutex_lock);
+}
+
+/* Guest-side interface: file operations */
+
+/* Goldfish sync context and ioctl info.
+ *
+ * When a sync context is created by open()-ing the goldfish sync device, we
+ * create a sync context (|goldfish_sync_context|).
+ *
+ * Currently, the only data required to track is the sync timeline itself
+ * along with the current time, which are all packed up in the
+ * |goldfish_sync_timeline_obj| field. We use a |goldfish_sync_context|
+ * as the filp->private_data.
+ *
+ * Next, when a sync context user requests that work be queued and a fence
+ * fd provided, we use the |goldfish_sync_ioctl_info| struct, which holds
+ * information about which host handles to touch for this particular
+ * queue-work operation. We need to know about the host-side sync thread
+ * and the particular host-side GLsync object. We also possibly write out
+ * a file descriptor.
+ */
+struct goldfish_sync_context {
+ struct goldfish_sync_timeline_obj *timeline;
+};
+
+struct goldfish_sync_ioctl_info {
+ uint64_t host_glsync_handle_in;
+ uint64_t host_syncthread_handle_in;
+ int fence_fd_out;
+};
+
+static int goldfish_sync_open(struct inode *inode, struct file *file)
+{
+
+ struct goldfish_sync_context *sync_context;
+
+ DTRACE();
+
+ mutex_lock(&global_sync_state->mutex_lock);
+
+ sync_context = kzalloc(sizeof(struct goldfish_sync_context), GFP_KERNEL);
+
+ if (sync_context == NULL) {
+ ERR("Creation of goldfish sync context failed!");
+ mutex_unlock(&global_sync_state->mutex_lock);
+ return -ENOMEM;
+ }
+
+ sync_context->timeline = NULL;
+
+ file->private_data = sync_context;
+
+ DPRINT("successfully create a sync context @0x%p", sync_context);
+
+ mutex_unlock(&global_sync_state->mutex_lock);
+
+ return 0;
+}
+
+static int goldfish_sync_release(struct inode *inode, struct file *file)
+{
+
+ struct goldfish_sync_context *sync_context;
+
+ DTRACE();
+
+ mutex_lock(&global_sync_state->mutex_lock);
+
+ sync_context = file->private_data;
+
+ if (sync_context->timeline)
+ goldfish_sync_timeline_destroy(sync_context->timeline);
+
+ sync_context->timeline = NULL;
+
+ kfree(sync_context);
+
+ mutex_unlock(&global_sync_state->mutex_lock);
+
+ return 0;
+}
+
+/* |goldfish_sync_ioctl| is the guest-facing interface of goldfish sync
+ * and is used in conjunction with eglCreateSyncKHR to queue up the
+ * actual work of waiting for the EGL sync command to complete,
+ * possibly returning a fence fd to the guest.
+ */
+static long goldfish_sync_ioctl(struct file *file,
+ unsigned int cmd,
+ unsigned long arg)
+{
+ struct goldfish_sync_context *sync_context_data;
+ struct goldfish_sync_timeline_obj *timeline;
+ int fd_out;
+ struct goldfish_sync_ioctl_info ioctl_data;
+
+ DTRACE();
+
+ sync_context_data = file->private_data;
+ fd_out = -1;
+
+ switch (cmd) {
+ case GOLDFISH_SYNC_IOC_QUEUE_WORK:
+
+ DPRINT("exec GOLDFISH_SYNC_IOC_QUEUE_WORK");
+
+ mutex_lock(&global_sync_state->mutex_lock);
+
+ if (copy_from_user(&ioctl_data,
+ (void __user *)arg,
+ sizeof(ioctl_data))) {
+ ERR("Failed to copy memory for ioctl_data from user.");
+ mutex_unlock(&global_sync_state->mutex_lock);
+ return -EFAULT;
+ }
+
+ if (ioctl_data.host_syncthread_handle_in == 0) {
+ DPRINT("Error: zero host syncthread handle!!!");
+ mutex_unlock(&global_sync_state->mutex_lock);
+ return -EFAULT;
+ }
+
+ if (!sync_context_data->timeline) {
+ DPRINT("no timeline yet, create one.");
+ sync_context_data->timeline = goldfish_sync_timeline_create();
+ DPRINT("timeline: 0x%p", &sync_context_data->timeline);
+ }
+
+ timeline = sync_context_data->timeline;
+ fd_out = goldfish_sync_fence_create(timeline,
+ timeline->current_time + 1);
+ DPRINT("Created fence with fd %d and current time %u (timeline: 0x%p)",
+ fd_out,
+ sync_context_data->timeline->current_time + 1,
+ sync_context_data->timeline);
+
+ ioctl_data.fence_fd_out = fd_out;
+
+ if (copy_to_user((void __user *)arg,
+ &ioctl_data,
+ sizeof(ioctl_data))) {
+ DPRINT("Error, could not copy to user!!!");
+
+ sys_close(fd_out);
+ /* We won't be doing an increment, kref_put immediately. */
+ kref_put(&timeline->kref, delete_timeline_obj);
+ mutex_unlock(&global_sync_state->mutex_lock);
+ return -EFAULT;
+ }
+
+ /* We are now about to trigger a host-side wait;
+ * accumulate on |pending_waits|. */
+ goldfish_sync_send_guestcmd(global_sync_state,
+ CMD_TRIGGER_HOST_WAIT,
+ ioctl_data.host_glsync_handle_in,
+ ioctl_data.host_syncthread_handle_in,
+ (uint64_t)(uintptr_t)(sync_context_data->timeline));
+
+ mutex_unlock(&global_sync_state->mutex_lock);
+ return 0;
+ default:
+ return -ENOTTY;
+ }
+}
+
+static const struct file_operations goldfish_sync_fops = {
+ .owner = THIS_MODULE,
+ .open = goldfish_sync_open,
+ .release = goldfish_sync_release,
+ .unlocked_ioctl = goldfish_sync_ioctl,
+ .compat_ioctl = goldfish_sync_ioctl,
+};
+
+static struct miscdevice goldfish_sync_device = {
+ .name = "goldfish_sync",
+ .fops = &goldfish_sync_fops,
+};
+
+
+static bool setup_verify_batch_cmd_addr(struct goldfish_sync_state *sync_state,
+ void *batch_addr,
+ uint32_t addr_offset,
+ uint32_t addr_offset_high)
+{
+ uint64_t batch_addr_phys;
+ uint32_t batch_addr_phys_test_lo;
+ uint32_t batch_addr_phys_test_hi;
+
+ if (!batch_addr) {
+ ERR("Could not use batch command address!");
+ return false;
+ }
+
+ batch_addr_phys = virt_to_phys(batch_addr);
+ writel((uint32_t)(batch_addr_phys),
+ sync_state->reg_base + addr_offset);
+ writel((uint32_t)(batch_addr_phys >> 32),
+ sync_state->reg_base + addr_offset_high);
+
+ batch_addr_phys_test_lo =
+ readl(sync_state->reg_base + addr_offset);
+ batch_addr_phys_test_hi =
+ readl(sync_state->reg_base + addr_offset_high);
+
+ if (virt_to_phys(batch_addr) !=
+ (((uint64_t)batch_addr_phys_test_hi << 32) |
+ batch_addr_phys_test_lo)) {
+ ERR("Invalid batch command address!");
+ return false;
+ }
+
+ return true;
+}
+
+int goldfish_sync_probe(struct platform_device *pdev)
+{
+ struct resource *ioresource;
+ struct goldfish_sync_state *sync_state = global_sync_state;
+ int status;
+
+ DTRACE();
+
+ sync_state->to_do_end = 0;
+
+ spin_lock_init(&sync_state->lock);
+ mutex_init(&sync_state->mutex_lock);
+
+ platform_set_drvdata(pdev, sync_state);
+
+ ioresource = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (ioresource == NULL) {
+ ERR("platform_get_resource failed");
+ return -ENODEV;
+ }
+
+ sync_state->reg_base =
+ devm_ioremap(&pdev->dev, ioresource->start, PAGE_SIZE);
+ if (sync_state->reg_base == NULL) {
+ ERR("Could not ioremap");
+ return -ENOMEM;
+ }
+
+ sync_state->irq = platform_get_irq(pdev, 0);
+ if (sync_state->irq < 0) {
+ ERR("Could not platform_get_irq");
+ return -ENODEV;
+ }
+
+ status = devm_request_irq(&pdev->dev,
+ sync_state->irq,
+ goldfish_sync_interrupt,
+ IRQF_SHARED,
+ pdev->name,
+ sync_state);
+ if (status) {
+ ERR("request_irq failed");
+ return -ENODEV;
+ }
+
+ INIT_WORK(&sync_state->work_item,
+ goldfish_sync_work_item_fn);
+
+ misc_register(&goldfish_sync_device);
+
+ /* Obtain addresses for batch send/recv of commands. */
+ {
+ struct goldfish_sync_hostcmd *batch_addr_hostcmd;
+ struct goldfish_sync_guestcmd *batch_addr_guestcmd;
+
+ batch_addr_hostcmd =
+ devm_kzalloc(&pdev->dev, sizeof(struct goldfish_sync_hostcmd),
+ GFP_KERNEL);
+ batch_addr_guestcmd =
+ devm_kzalloc(&pdev->dev, sizeof(struct goldfish_sync_guestcmd),
+ GFP_KERNEL);
+
+ if (!setup_verify_batch_cmd_addr(sync_state,
+ batch_addr_hostcmd,
+ SYNC_REG_BATCH_COMMAND_ADDR,
+ SYNC_REG_BATCH_COMMAND_ADDR_HIGH)) {
+ ERR("goldfish_sync: Could not setup batch command address");
+ return -ENODEV;
+ }
+
+ if (!setup_verify_batch_cmd_addr(sync_state,
+ batch_addr_guestcmd,
+ SYNC_REG_BATCH_GUESTCOMMAND_ADDR,
+ SYNC_REG_BATCH_GUESTCOMMAND_ADDR_HIGH)) {
+ ERR("goldfish_sync: Could not setup batch guest command address");
+ return -ENODEV;
+ }
+
+ sync_state->batch_hostcmd = batch_addr_hostcmd;
+ sync_state->batch_guestcmd = batch_addr_guestcmd;
+ }
+
+ INFO("goldfish_sync: Initialized goldfish sync device");
+
+ writel(0, sync_state->reg_base + SYNC_REG_INIT);
+
+ return 0;
+}
+
+static int goldfish_sync_remove(struct platform_device *pdev)
+{
+ struct goldfish_sync_state *sync_state = global_sync_state;
+
+ DTRACE();
+
+ misc_deregister(&goldfish_sync_device);
+ memset(sync_state, 0, sizeof(struct goldfish_sync_state));
+ return 0;
+}
+
+static const struct of_device_id goldfish_sync_of_match[] = {
+ { .compatible = "google,goldfish-sync", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, goldfish_sync_of_match);
+
+static const struct acpi_device_id goldfish_sync_acpi_match[] = {
+ { "GFSH0006", 0 },
+ { },
+};
+
+MODULE_DEVICE_TABLE(acpi, goldfish_sync_acpi_match);
+
+static struct platform_driver goldfish_sync = {
+ .probe = goldfish_sync_probe,
+ .remove = goldfish_sync_remove,
+ .driver = {
+ .name = "goldfish_sync",
+ .of_match_table = goldfish_sync_of_match,
+ .acpi_match_table = ACPI_PTR(goldfish_sync_acpi_match),
+ }
+};
+
+module_platform_driver(goldfish_sync);
+
+MODULE_AUTHOR("Google, Inc.");
+MODULE_DESCRIPTION("Android QEMU Sync Driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1.0");
diff --git a/drivers/staging/goldfish/goldfish_sync_timeline_fence.c b/drivers/staging/goldfish/goldfish_sync_timeline_fence.c
new file mode 100644
index 000000000000..e671618cf888
--- /dev/null
+++ b/drivers/staging/goldfish/goldfish_sync_timeline_fence.c
@@ -0,0 +1,254 @@
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/syscalls.h>
+#include <linux/sync_file.h>
+#include <linux/fence.h>
+
+#include "goldfish_sync_timeline_fence.h"
+
+/*
+ * Timeline-based sync for Goldfish Sync
+ * Based on "Sync File validation framework"
+ * (drivers/dma-buf/sw_sync.c)
+ *
+ * Copyright (C) 2017 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/**
+ * struct goldfish_sync_timeline - sync object
+ * @kref: reference count on fence.
+ * @name: name of the goldfish_sync_timeline. Useful for debugging
+ * @child_list_head: list of children sync_pts for this goldfish_sync_timeline
+ * @child_list_lock: lock protecting @child_list_head and fence.status
+ * @active_list_head: list of active (unsignaled/errored) sync_pts
+ */
+struct goldfish_sync_timeline {
+ struct kref kref;
+ char name[32];
+
+ /* protected by child_list_lock */
+ u64 context;
+ int value;
+
+ struct list_head child_list_head;
+ spinlock_t child_list_lock;
+
+ struct list_head active_list_head;
+};
+
+static inline struct goldfish_sync_timeline *fence_parent(struct fence *fence)
+{
+ return container_of(fence->lock, struct goldfish_sync_timeline,
+ child_list_lock);
+}
+
+static const struct fence_ops goldfish_sync_timeline_fence_ops;
+
+static inline struct sync_pt *goldfish_sync_fence_to_sync_pt(struct fence *fence)
+{
+ if (fence->ops != &goldfish_sync_timeline_fence_ops)
+ return NULL;
+ return container_of(fence, struct sync_pt, base);
+}
+
+/**
+ * goldfish_sync_timeline_create_internal() - creates a sync object
+ * @name: sync_timeline name
+ *
+ * Creates a new sync_timeline. Returns the sync_timeline object or NULL in
+ * case of error.
+ */
+struct goldfish_sync_timeline
+*goldfish_sync_timeline_create_internal(const char *name)
+{
+ struct goldfish_sync_timeline *obj;
+
+ obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+ if (!obj)
+ return NULL;
+
+ kref_init(&obj->kref);
+ obj->context = fence_context_alloc(1);
+ strlcpy(obj->name, name, sizeof(obj->name));
+
+ INIT_LIST_HEAD(&obj->child_list_head);
+ INIT_LIST_HEAD(&obj->active_list_head);
+ spin_lock_init(&obj->child_list_lock);
+
+ return obj;
+}
+
+static void goldfish_sync_timeline_free_internal(struct kref *kref)
+{
+ struct goldfish_sync_timeline *obj =
+ container_of(kref, struct goldfish_sync_timeline, kref);
+
+ kfree(obj);
+}
+
+static void goldfish_sync_timeline_get_internal(
+ struct goldfish_sync_timeline *obj)
+{
+ kref_get(&obj->kref);
+}
+
+void goldfish_sync_timeline_put_internal(struct goldfish_sync_timeline *obj)
+{
+ kref_put(&obj->kref, goldfish_sync_timeline_free_internal);
+}
+
+/**
+ * goldfish_sync_timeline_signal() -
+ * signal a status change on a goldfish_sync_timeline
+ * @obj: sync_timeline to signal
+ * @inc: num to increment on timeline->value
+ *
+ * A sync implementation should call this any time one of it's fences
+ * has signaled or has an error condition.
+ */
+void goldfish_sync_timeline_signal_internal(struct goldfish_sync_timeline *obj,
+ unsigned int inc)
+{
+ unsigned long flags;
+ struct sync_pt *pt, *next;
+
+ spin_lock_irqsave(&obj->child_list_lock, flags);
+
+ obj->value += inc;
+
+ list_for_each_entry_safe(pt, next, &obj->active_list_head,
+ active_list) {
+ if (fence_is_signaled_locked(&pt->base))
+ list_del_init(&pt->active_list);
+ }
+
+ spin_unlock_irqrestore(&obj->child_list_lock, flags);
+}
+
+/**
+ * goldfish_sync_pt_create_internal() - creates a sync pt
+ * @parent: fence's parent sync_timeline
+ * @size: size to allocate for this pt
+ * @inc: value of the fence
+ *
+ * Creates a new sync_pt as a child of @parent. @size bytes will be
+ * allocated allowing for implementation specific data to be kept after
+ * the generic sync_timeline struct. Returns the sync_pt object or
+ * NULL in case of error.
+ */
+struct sync_pt *goldfish_sync_pt_create_internal(
+ struct goldfish_sync_timeline *obj, int size,
+ unsigned int value)
+{
+ unsigned long flags;
+ struct sync_pt *pt;
+
+ if (size < sizeof(*pt))
+ return NULL;
+
+ pt = kzalloc(size, GFP_KERNEL);
+ if (!pt)
+ return NULL;
+
+ spin_lock_irqsave(&obj->child_list_lock, flags);
+ goldfish_sync_timeline_get_internal(obj);
+ fence_init(&pt->base, &goldfish_sync_timeline_fence_ops, &obj->child_list_lock,
+ obj->context, value);
+ list_add_tail(&pt->child_list, &obj->child_list_head);
+ INIT_LIST_HEAD(&pt->active_list);
+ spin_unlock_irqrestore(&obj->child_list_lock, flags);
+ return pt;
+}
+
+static const char *goldfish_sync_timeline_fence_get_driver_name(
+ struct fence *fence)
+{
+ return "sw_sync";
+}
+
+static const char *goldfish_sync_timeline_fence_get_timeline_name(
+ struct fence *fence)
+{
+ struct goldfish_sync_timeline *parent = fence_parent(fence);
+
+ return parent->name;
+}
+
+static void goldfish_sync_timeline_fence_release(struct fence *fence)
+{
+ struct sync_pt *pt = goldfish_sync_fence_to_sync_pt(fence);
+ struct goldfish_sync_timeline *parent = fence_parent(fence);
+ unsigned long flags;
+
+ spin_lock_irqsave(fence->lock, flags);
+ list_del(&pt->child_list);
+ if (!list_empty(&pt->active_list))
+ list_del(&pt->active_list);
+ spin_unlock_irqrestore(fence->lock, flags);
+
+ goldfish_sync_timeline_put_internal(parent);
+ fence_free(fence);
+}
+
+static bool goldfish_sync_timeline_fence_signaled(struct fence *fence)
+{
+ struct goldfish_sync_timeline *parent = fence_parent(fence);
+
+ return (fence->seqno > parent->value) ? false : true;
+}
+
+static bool goldfish_sync_timeline_fence_enable_signaling(struct fence *fence)
+{
+ struct sync_pt *pt = goldfish_sync_fence_to_sync_pt(fence);
+ struct goldfish_sync_timeline *parent = fence_parent(fence);
+
+ if (goldfish_sync_timeline_fence_signaled(fence))
+ return false;
+
+ list_add_tail(&pt->active_list, &parent->active_list_head);
+ return true;
+}
+
+static void goldfish_sync_timeline_fence_disable_signaling(struct fence *fence)
+{
+ struct sync_pt *pt = container_of(fence, struct sync_pt, base);
+
+ list_del_init(&pt->active_list);
+}
+
+static void goldfish_sync_timeline_fence_value_str(struct fence *fence,
+ char *str, int size)
+{
+ snprintf(str, size, "%d", fence->seqno);
+}
+
+static void goldfish_sync_timeline_fence_timeline_value_str(
+ struct fence *fence,
+ char *str, int size)
+{
+ struct goldfish_sync_timeline *parent = fence_parent(fence);
+
+ snprintf(str, size, "%d", parent->value);
+}
+
+static const struct fence_ops goldfish_sync_timeline_fence_ops = {
+ .get_driver_name = goldfish_sync_timeline_fence_get_driver_name,
+ .get_timeline_name = goldfish_sync_timeline_fence_get_timeline_name,
+ .enable_signaling = goldfish_sync_timeline_fence_enable_signaling,
+ .disable_signaling = goldfish_sync_timeline_fence_disable_signaling,
+ .signaled = goldfish_sync_timeline_fence_signaled,
+ .wait = fence_default_wait,
+ .release = goldfish_sync_timeline_fence_release,
+ .fence_value_str = goldfish_sync_timeline_fence_value_str,
+ .timeline_value_str = goldfish_sync_timeline_fence_timeline_value_str,
+};
diff --git a/drivers/staging/goldfish/goldfish_sync_timeline_fence.h b/drivers/staging/goldfish/goldfish_sync_timeline_fence.h
new file mode 100644
index 000000000000..fc25924652c1
--- /dev/null
+++ b/drivers/staging/goldfish/goldfish_sync_timeline_fence.h
@@ -0,0 +1,58 @@
+#include <linux/sync_file.h>
+#include <linux/fence.h>
+
+/**
+ * struct sync_pt - sync_pt object
+ * @base: base fence object
+ * @child_list: sync timeline child's list
+ * @active_list: sync timeline active child's list
+ */
+struct sync_pt {
+ struct fence base;
+ struct list_head child_list;
+ struct list_head active_list;
+};
+
+/**
+ * goldfish_sync_timeline_create_internal() - creates a sync object
+ * @name: goldfish_sync_timeline name
+ *
+ * Creates a new goldfish_sync_timeline.
+ * Returns the goldfish_sync_timeline object or NULL in case of error.
+ */
+struct goldfish_sync_timeline
+*goldfish_sync_timeline_create_internal(const char *name);
+
+/**
+ * goldfish_sync_pt_create_internal() - creates a sync pt
+ * @parent: fence's parent goldfish_sync_timeline
+ * @size: size to allocate for this pt
+ * @inc: value of the fence
+ *
+ * Creates a new sync_pt as a child of @parent. @size bytes will be
+ * allocated allowing for implementation specific data to be kept after
+ * the generic sync_timeline struct. Returns the sync_pt object or
+ * NULL in case of error.
+ */
+struct sync_pt
+*goldfish_sync_pt_create_internal(struct goldfish_sync_timeline *obj,
+ int size, unsigned int value);
+
+/**
+ * goldfish_sync_timeline_signal_internal() -
+ * signal a status change on a sync_timeline
+ * @obj: goldfish_sync_timeline to signal
+ * @inc: num to increment on timeline->value
+ *
+ * A sync implementation should call this any time one of it's fences
+ * has signaled or has an error condition.
+ */
+void goldfish_sync_timeline_signal_internal(struct goldfish_sync_timeline *obj,
+ unsigned int inc);
+
+/**
+ * goldfish_sync_timeline_put_internal() - dec refcount of a sync_timeline
+ * and clean up memory if it was the last ref.
+ * @obj: goldfish_sync_timeline to decref
+ */
+void goldfish_sync_timeline_put_internal(struct goldfish_sync_timeline *obj);
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c
index f56ea643f9bf..6a146f45bedb 100644
--- a/drivers/staging/lustre/lustre/mdc/mdc_request.c
+++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c
@@ -1219,9 +1219,9 @@ struct readpage_param {
* in PAGE_SIZE (if PAGE_SIZE greater than LU_PAGE_SIZE), and the
* lu_dirpage for this integrated page will be adjusted.
**/
-static int mdc_read_page_remote(void *data, struct page *page0)
+static int mdc_read_page_remote(struct file *data, struct page *page0)
{
- struct readpage_param *rp = data;
+ struct readpage_param *rp = (struct readpage_param *)data;
struct page **page_pool;
struct page *page;
struct lu_dirpage *dp;
diff --git a/drivers/tee/Kconfig b/drivers/tee/Kconfig
new file mode 100644
index 000000000000..a6df12d88f90
--- /dev/null
+++ b/drivers/tee/Kconfig
@@ -0,0 +1,19 @@
+# Generic Trusted Execution Environment Configuration
+config TEE
+ tristate "Trusted Execution Environment support"
+ depends on HAVE_ARM_SMCCC || COMPILE_TEST
+ select DMA_SHARED_BUFFER
+ select GENERIC_ALLOCATOR
+ help
+ This implements a generic interface towards a Trusted Execution
+ Environment (TEE).
+
+if TEE
+
+menu "TEE drivers"
+
+source "drivers/tee/optee/Kconfig"
+
+endmenu
+
+endif
diff --git a/drivers/tee/Makefile b/drivers/tee/Makefile
new file mode 100644
index 000000000000..7a4e4a1ac39c
--- /dev/null
+++ b/drivers/tee/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_TEE) += tee.o
+tee-objs += tee_core.o
+tee-objs += tee_shm.o
+tee-objs += tee_shm_pool.o
+obj-$(CONFIG_OPTEE) += optee/
diff --git a/drivers/tee/optee/Kconfig b/drivers/tee/optee/Kconfig
new file mode 100644
index 000000000000..0126de898036
--- /dev/null
+++ b/drivers/tee/optee/Kconfig
@@ -0,0 +1,7 @@
+# OP-TEE Trusted Execution Environment Configuration
+config OPTEE
+ tristate "OP-TEE"
+ depends on HAVE_ARM_SMCCC
+ help
+ This implements the OP-TEE Trusted Execution Environment (TEE)
+ driver.
diff --git a/drivers/tee/optee/Makefile b/drivers/tee/optee/Makefile
new file mode 100644
index 000000000000..220cf4298f0d
--- /dev/null
+++ b/drivers/tee/optee/Makefile
@@ -0,0 +1,6 @@
+obj-$(CONFIG_OPTEE) += optee.o
+optee-objs += core.o
+optee-objs += call.o
+optee-objs += rpc.o
+optee-objs += supp.o
+optee-objs += shm_pool.o
diff --git a/drivers/tee/optee/call.c b/drivers/tee/optee/call.c
new file mode 100644
index 000000000000..a5afbe6dee68
--- /dev/null
+++ b/drivers/tee/optee/call.c
@@ -0,0 +1,662 @@
+/*
+ * Copyright (c) 2015, Linaro Limited
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#include <linux/arm-smccc.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/tee_drv.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include "optee_private.h"
+#include "optee_smc.h"
+
+struct optee_call_waiter {
+ struct list_head list_node;
+ struct completion c;
+};
+
+static void optee_cq_wait_init(struct optee_call_queue *cq,
+ struct optee_call_waiter *w)
+{
+ /*
+ * We're preparing to make a call to secure world. In case we can't
+ * allocate a thread in secure world we'll end up waiting in
+ * optee_cq_wait_for_completion().
+ *
+ * Normally if there's no contention in secure world the call will
+ * complete and we can cleanup directly with optee_cq_wait_final().
+ */
+ mutex_lock(&cq->mutex);
+
+ /*
+ * We add ourselves to the queue, but we don't wait. This
+ * guarantees that we don't lose a completion if secure world
+ * returns busy and another thread just exited and try to complete
+ * someone.
+ */
+ init_completion(&w->c);
+ list_add_tail(&w->list_node, &cq->waiters);
+
+ mutex_unlock(&cq->mutex);
+}
+
+static void optee_cq_wait_for_completion(struct optee_call_queue *cq,
+ struct optee_call_waiter *w)
+{
+ wait_for_completion(&w->c);
+
+ mutex_lock(&cq->mutex);
+
+ /* Move to end of list to get out of the way for other waiters */
+ list_del(&w->list_node);
+ reinit_completion(&w->c);
+ list_add_tail(&w->list_node, &cq->waiters);
+
+ mutex_unlock(&cq->mutex);
+}
+
+static void optee_cq_complete_one(struct optee_call_queue *cq)
+{
+ struct optee_call_waiter *w;
+
+ list_for_each_entry(w, &cq->waiters, list_node) {
+ if (!completion_done(&w->c)) {
+ complete(&w->c);
+ break;
+ }
+ }
+}
+
+static void optee_cq_wait_final(struct optee_call_queue *cq,
+ struct optee_call_waiter *w)
+{
+ /*
+ * We're done with the call to secure world. The thread in secure
+ * world that was used for this call is now available for some
+ * other task to use.
+ */
+ mutex_lock(&cq->mutex);
+
+ /* Get out of the list */
+ list_del(&w->list_node);
+
+ /* Wake up one eventual waiting task */
+ optee_cq_complete_one(cq);
+
+ /*
+ * If we're completed we've got a completion from another task that
+ * was just done with its call to secure world. Since yet another
+ * thread now is available in secure world wake up another eventual
+ * waiting task.
+ */
+ if (completion_done(&w->c))
+ optee_cq_complete_one(cq);
+
+ mutex_unlock(&cq->mutex);
+}
+
+/* Requires the filpstate mutex to be held */
+static struct optee_session *find_session(struct optee_context_data *ctxdata,
+ u32 session_id)
+{
+ struct optee_session *sess;
+
+ list_for_each_entry(sess, &ctxdata->sess_list, list_node)
+ if (sess->session_id == session_id)
+ return sess;
+
+ return NULL;
+}
+
+/**
+ * optee_do_call_with_arg() - Do an SMC to OP-TEE in secure world
+ * @ctx: calling context
+ * @parg: physical address of message to pass to secure world
+ *
+ * Does and SMC to OP-TEE in secure world and handles eventual resulting
+ * Remote Procedure Calls (RPC) from OP-TEE.
+ *
+ * Returns return code from secure world, 0 is OK
+ */
+u32 optee_do_call_with_arg(struct tee_context *ctx, phys_addr_t parg)
+{
+ struct optee *optee = tee_get_drvdata(ctx->teedev);
+ struct optee_call_waiter w;
+ struct optee_rpc_param param = { };
+ struct optee_call_ctx call_ctx = { };
+ u32 ret;
+
+ param.a0 = OPTEE_SMC_CALL_WITH_ARG;
+ reg_pair_from_64(&param.a1, &param.a2, parg);
+ /* Initialize waiter */
+ optee_cq_wait_init(&optee->call_queue, &w);
+ while (true) {
+ struct arm_smccc_res res;
+
+ optee->invoke_fn(param.a0, param.a1, param.a2, param.a3,
+ param.a4, param.a5, param.a6, param.a7,
+ &res);
+
+ if (res.a0 == OPTEE_SMC_RETURN_ETHREAD_LIMIT) {
+ /*
+ * Out of threads in secure world, wait for a thread
+ * become available.
+ */
+ optee_cq_wait_for_completion(&optee->call_queue, &w);
+ } else if (OPTEE_SMC_RETURN_IS_RPC(res.a0)) {
+ param.a0 = res.a0;
+ param.a1 = res.a1;
+ param.a2 = res.a2;
+ param.a3 = res.a3;
+ optee_handle_rpc(ctx, &param, &call_ctx);
+ } else {
+ ret = res.a0;
+ break;
+ }
+ }
+
+ optee_rpc_finalize_call(&call_ctx);
+ /*
+ * We're done with our thread in secure world, if there's any
+ * thread waiters wake up one.
+ */
+ optee_cq_wait_final(&optee->call_queue, &w);
+
+ return ret;
+}
+
+static struct tee_shm *get_msg_arg(struct tee_context *ctx, size_t num_params,
+ struct optee_msg_arg **msg_arg,
+ phys_addr_t *msg_parg)
+{
+ int rc;
+ struct tee_shm *shm;
+ struct optee_msg_arg *ma;
+
+ shm = tee_shm_alloc(ctx, OPTEE_MSG_GET_ARG_SIZE(num_params),
+ TEE_SHM_MAPPED);
+ if (IS_ERR(shm))
+ return shm;
+
+ ma = tee_shm_get_va(shm, 0);
+ if (IS_ERR(ma)) {
+ rc = PTR_ERR(ma);
+ goto out;
+ }
+
+ rc = tee_shm_get_pa(shm, 0, msg_parg);
+ if (rc)
+ goto out;
+
+ memset(ma, 0, OPTEE_MSG_GET_ARG_SIZE(num_params));
+ ma->num_params = num_params;
+ *msg_arg = ma;
+out:
+ if (rc) {
+ tee_shm_free(shm);
+ return ERR_PTR(rc);
+ }
+
+ return shm;
+}
+
+int optee_open_session(struct tee_context *ctx,
+ struct tee_ioctl_open_session_arg *arg,
+ struct tee_param *param)
+{
+ struct optee_context_data *ctxdata = ctx->data;
+ int rc;
+ struct tee_shm *shm;
+ struct optee_msg_arg *msg_arg;
+ phys_addr_t msg_parg;
+ struct optee_session *sess = NULL;
+
+ /* +2 for the meta parameters added below */
+ shm = get_msg_arg(ctx, arg->num_params + 2, &msg_arg, &msg_parg);
+ if (IS_ERR(shm))
+ return PTR_ERR(shm);
+
+ msg_arg->cmd = OPTEE_MSG_CMD_OPEN_SESSION;
+ msg_arg->cancel_id = arg->cancel_id;
+
+ /*
+ * Initialize and add the meta parameters needed when opening a
+ * session.
+ */
+ msg_arg->params[0].attr = OPTEE_MSG_ATTR_TYPE_VALUE_INPUT |
+ OPTEE_MSG_ATTR_META;
+ msg_arg->params[1].attr = OPTEE_MSG_ATTR_TYPE_VALUE_INPUT |
+ OPTEE_MSG_ATTR_META;
+ memcpy(&msg_arg->params[0].u.value, arg->uuid, sizeof(arg->uuid));
+ memcpy(&msg_arg->params[1].u.value, arg->uuid, sizeof(arg->clnt_uuid));
+ msg_arg->params[1].u.value.c = arg->clnt_login;
+
+ rc = optee_to_msg_param(msg_arg->params + 2, arg->num_params, param);
+ if (rc)
+ goto out;
+
+ sess = kzalloc(sizeof(*sess), GFP_KERNEL);
+ if (!sess) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ if (optee_do_call_with_arg(ctx, msg_parg)) {
+ msg_arg->ret = TEEC_ERROR_COMMUNICATION;
+ msg_arg->ret_origin = TEEC_ORIGIN_COMMS;
+ }
+
+ if (msg_arg->ret == TEEC_SUCCESS) {
+ /* A new session has been created, add it to the list. */
+ sess->session_id = msg_arg->session;
+ mutex_lock(&ctxdata->mutex);
+ list_add(&sess->list_node, &ctxdata->sess_list);
+ mutex_unlock(&ctxdata->mutex);
+ } else {
+ kfree(sess);
+ }
+
+ if (optee_from_msg_param(param, arg->num_params, msg_arg->params + 2)) {
+ arg->ret = TEEC_ERROR_COMMUNICATION;
+ arg->ret_origin = TEEC_ORIGIN_COMMS;
+ /* Close session again to avoid leakage */
+ optee_close_session(ctx, msg_arg->session);
+ } else {
+ arg->session = msg_arg->session;
+ arg->ret = msg_arg->ret;
+ arg->ret_origin = msg_arg->ret_origin;
+ }
+out:
+ tee_shm_free(shm);
+
+ return rc;
+}
+
+int optee_close_session(struct tee_context *ctx, u32 session)
+{
+ struct optee_context_data *ctxdata = ctx->data;
+ struct tee_shm *shm;
+ struct optee_msg_arg *msg_arg;
+ phys_addr_t msg_parg;
+ struct optee_session *sess;
+
+ /* Check that the session is valid and remove it from the list */
+ mutex_lock(&ctxdata->mutex);
+ sess = find_session(ctxdata, session);
+ if (sess)
+ list_del(&sess->list_node);
+ mutex_unlock(&ctxdata->mutex);
+ if (!sess)
+ return -EINVAL;
+ kfree(sess);
+
+ shm = get_msg_arg(ctx, 0, &msg_arg, &msg_parg);
+ if (IS_ERR(shm))
+ return PTR_ERR(shm);
+
+ msg_arg->cmd = OPTEE_MSG_CMD_CLOSE_SESSION;
+ msg_arg->session = session;
+ optee_do_call_with_arg(ctx, msg_parg);
+
+ tee_shm_free(shm);
+ return 0;
+}
+
+int optee_invoke_func(struct tee_context *ctx, struct tee_ioctl_invoke_arg *arg,
+ struct tee_param *param)
+{
+ struct optee_context_data *ctxdata = ctx->data;
+ struct tee_shm *shm;
+ struct optee_msg_arg *msg_arg;
+ phys_addr_t msg_parg;
+ struct optee_session *sess;
+ int rc;
+
+ /* Check that the session is valid */
+ mutex_lock(&ctxdata->mutex);
+ sess = find_session(ctxdata, arg->session);
+ mutex_unlock(&ctxdata->mutex);
+ if (!sess)
+ return -EINVAL;
+
+ shm = get_msg_arg(ctx, arg->num_params, &msg_arg, &msg_parg);
+ if (IS_ERR(shm))
+ return PTR_ERR(shm);
+ msg_arg->cmd = OPTEE_MSG_CMD_INVOKE_COMMAND;
+ msg_arg->func = arg->func;
+ msg_arg->session = arg->session;
+ msg_arg->cancel_id = arg->cancel_id;
+
+ rc = optee_to_msg_param(msg_arg->params, arg->num_params, param);
+ if (rc)
+ goto out;
+
+ if (optee_do_call_with_arg(ctx, msg_parg)) {
+ msg_arg->ret = TEEC_ERROR_COMMUNICATION;
+ msg_arg->ret_origin = TEEC_ORIGIN_COMMS;
+ }
+
+ if (optee_from_msg_param(param, arg->num_params, msg_arg->params)) {
+ msg_arg->ret = TEEC_ERROR_COMMUNICATION;
+ msg_arg->ret_origin = TEEC_ORIGIN_COMMS;
+ }
+
+ arg->ret = msg_arg->ret;
+ arg->ret_origin = msg_arg->ret_origin;
+out:
+ tee_shm_free(shm);
+ return rc;
+}
+
+int optee_cancel_req(struct tee_context *ctx, u32 cancel_id, u32 session)
+{
+ struct optee_context_data *ctxdata = ctx->data;
+ struct tee_shm *shm;
+ struct optee_msg_arg *msg_arg;
+ phys_addr_t msg_parg;
+ struct optee_session *sess;
+
+ /* Check that the session is valid */
+ mutex_lock(&ctxdata->mutex);
+ sess = find_session(ctxdata, session);
+ mutex_unlock(&ctxdata->mutex);
+ if (!sess)
+ return -EINVAL;
+
+ shm = get_msg_arg(ctx, 0, &msg_arg, &msg_parg);
+ if (IS_ERR(shm))
+ return PTR_ERR(shm);
+
+ msg_arg->cmd = OPTEE_MSG_CMD_CANCEL;
+ msg_arg->session = session;
+ msg_arg->cancel_id = cancel_id;
+ optee_do_call_with_arg(ctx, msg_parg);
+
+ tee_shm_free(shm);
+ return 0;
+}
+
+/**
+ * optee_enable_shm_cache() - Enables caching of some shared memory allocation
+ * in OP-TEE
+ * @optee: main service struct
+ */
+void optee_enable_shm_cache(struct optee *optee)
+{
+ struct optee_call_waiter w;
+
+ /* We need to retry until secure world isn't busy. */
+ optee_cq_wait_init(&optee->call_queue, &w);
+ while (true) {
+ struct arm_smccc_res res;
+
+ optee->invoke_fn(OPTEE_SMC_ENABLE_SHM_CACHE, 0, 0, 0, 0, 0, 0,
+ 0, &res);
+ if (res.a0 == OPTEE_SMC_RETURN_OK)
+ break;
+ optee_cq_wait_for_completion(&optee->call_queue, &w);
+ }
+ optee_cq_wait_final(&optee->call_queue, &w);
+}
+
+/**
+ * optee_disable_shm_cache() - Disables caching of some shared memory allocation
+ * in OP-TEE
+ * @optee: main service struct
+ */
+void optee_disable_shm_cache(struct optee *optee)
+{
+ struct optee_call_waiter w;
+
+ /* We need to retry until secure world isn't busy. */
+ optee_cq_wait_init(&optee->call_queue, &w);
+ while (true) {
+ union {
+ struct arm_smccc_res smccc;
+ struct optee_smc_disable_shm_cache_result result;
+ } res;
+
+ optee->invoke_fn(OPTEE_SMC_DISABLE_SHM_CACHE, 0, 0, 0, 0, 0, 0,
+ 0, &res.smccc);
+ if (res.result.status == OPTEE_SMC_RETURN_ENOTAVAIL)
+ break; /* All shm's freed */
+ if (res.result.status == OPTEE_SMC_RETURN_OK) {
+ struct tee_shm *shm;
+
+ shm = reg_pair_to_ptr(res.result.shm_upper32,
+ res.result.shm_lower32);
+ tee_shm_free(shm);
+ } else {
+ optee_cq_wait_for_completion(&optee->call_queue, &w);
+ }
+ }
+ optee_cq_wait_final(&optee->call_queue, &w);
+}
+
+#define PAGELIST_ENTRIES_PER_PAGE \
+ ((OPTEE_MSG_NONCONTIG_PAGE_SIZE / sizeof(u64)) - 1)
+
+/**
+ * optee_fill_pages_list() - write list of user pages to given shared
+ * buffer.
+ *
+ * @dst: page-aligned buffer where list of pages will be stored
+ * @pages: array of pages that represents shared buffer
+ * @num_pages: number of entries in @pages
+ * @page_offset: offset of user buffer from page start
+ *
+ * @dst should be big enough to hold list of user page addresses and
+ * links to the next pages of buffer
+ */
+void optee_fill_pages_list(u64 *dst, struct page **pages, int num_pages,
+ size_t page_offset)
+{
+ int n = 0;
+ phys_addr_t optee_page;
+ /*
+ * Refer to OPTEE_MSG_ATTR_NONCONTIG description in optee_msg.h
+ * for details.
+ */
+ struct {
+ u64 pages_list[PAGELIST_ENTRIES_PER_PAGE];
+ u64 next_page_data;
+ } *pages_data;
+
+ /*
+ * Currently OP-TEE uses 4k page size and it does not looks
+ * like this will change in the future. On other hand, there are
+ * no know ARM architectures with page size < 4k.
+ * Thus the next built assert looks redundant. But the following
+ * code heavily relies on this assumption, so it is better be
+ * safe than sorry.
+ */
+ BUILD_BUG_ON(PAGE_SIZE < OPTEE_MSG_NONCONTIG_PAGE_SIZE);
+
+ pages_data = (void *)dst;
+ /*
+ * If linux page is bigger than 4k, and user buffer offset is
+ * larger than 4k/8k/12k/etc this will skip first 4k pages,
+ * because they bear no value data for OP-TEE.
+ */
+ optee_page = page_to_phys(*pages) +
+ round_down(page_offset, OPTEE_MSG_NONCONTIG_PAGE_SIZE);
+
+ while (true) {
+ pages_data->pages_list[n++] = optee_page;
+
+ if (n == PAGELIST_ENTRIES_PER_PAGE) {
+ pages_data->next_page_data =
+ virt_to_phys(pages_data + 1);
+ pages_data++;
+ n = 0;
+ }
+
+ optee_page += OPTEE_MSG_NONCONTIG_PAGE_SIZE;
+ if (!(optee_page & ~PAGE_MASK)) {
+ if (!--num_pages)
+ break;
+ pages++;
+ optee_page = page_to_phys(*pages);
+ }
+ }
+}
+
+/*
+ * The final entry in each pagelist page is a pointer to the next
+ * pagelist page.
+ */
+static size_t get_pages_list_size(size_t num_entries)
+{
+ int pages = DIV_ROUND_UP(num_entries, PAGELIST_ENTRIES_PER_PAGE);
+
+ return pages * OPTEE_MSG_NONCONTIG_PAGE_SIZE;
+}
+
+u64 *optee_allocate_pages_list(size_t num_entries)
+{
+ return alloc_pages_exact(get_pages_list_size(num_entries), GFP_KERNEL);
+}
+
+void optee_free_pages_list(void *list, size_t num_entries)
+{
+ free_pages_exact(list, get_pages_list_size(num_entries));
+}
+
+static bool is_normal_memory(pgprot_t p)
+{
+#if defined(CONFIG_ARM)
+ return (pgprot_val(p) & L_PTE_MT_MASK) == L_PTE_MT_WRITEALLOC;
+#elif defined(CONFIG_ARM64)
+ return (pgprot_val(p) & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL);
+#else
+#error "Unuspported architecture"
+#endif
+}
+
+static int __check_mem_type(struct vm_area_struct *vma, unsigned long end)
+{
+ while (vma && is_normal_memory(vma->vm_page_prot)) {
+ if (vma->vm_end >= end)
+ return 0;
+ vma = vma->vm_next;
+ }
+
+ return -EINVAL;
+}
+
+static int check_mem_type(unsigned long start, size_t num_pages)
+{
+ struct mm_struct *mm = current->mm;
+ int rc;
+
+ down_read(&mm->mmap_sem);
+ rc = __check_mem_type(find_vma(mm, start),
+ start + num_pages * PAGE_SIZE);
+ up_read(&mm->mmap_sem);
+
+ return rc;
+}
+
+int optee_shm_register(struct tee_context *ctx, struct tee_shm *shm,
+ struct page **pages, size_t num_pages,
+ unsigned long start)
+{
+ struct tee_shm *shm_arg = NULL;
+ struct optee_msg_arg *msg_arg;
+ u64 *pages_list;
+ phys_addr_t msg_parg;
+ int rc;
+
+ if (!num_pages)
+ return -EINVAL;
+
+ rc = check_mem_type(start, num_pages);
+ if (rc)
+ return rc;
+
+ pages_list = optee_allocate_pages_list(num_pages);
+ if (!pages_list)
+ return -ENOMEM;
+
+ shm_arg = get_msg_arg(ctx, 1, &msg_arg, &msg_parg);
+ if (IS_ERR(shm_arg)) {
+ rc = PTR_ERR(shm_arg);
+ goto out;
+ }
+
+ optee_fill_pages_list(pages_list, pages, num_pages,
+ tee_shm_get_page_offset(shm));
+
+ msg_arg->cmd = OPTEE_MSG_CMD_REGISTER_SHM;
+ msg_arg->params->attr = OPTEE_MSG_ATTR_TYPE_TMEM_OUTPUT |
+ OPTEE_MSG_ATTR_NONCONTIG;
+ msg_arg->params->u.tmem.shm_ref = (unsigned long)shm;
+ msg_arg->params->u.tmem.size = tee_shm_get_size(shm);
+ /*
+ * In the least bits of msg_arg->params->u.tmem.buf_ptr we
+ * store buffer offset from 4k page, as described in OP-TEE ABI.
+ */
+ msg_arg->params->u.tmem.buf_ptr = virt_to_phys(pages_list) |
+ (tee_shm_get_page_offset(shm) & (OPTEE_MSG_NONCONTIG_PAGE_SIZE - 1));
+
+ if (optee_do_call_with_arg(ctx, msg_parg) ||
+ msg_arg->ret != TEEC_SUCCESS)
+ rc = -EINVAL;
+
+ tee_shm_free(shm_arg);
+out:
+ optee_free_pages_list(pages_list, num_pages);
+ return rc;
+}
+
+int optee_shm_unregister(struct tee_context *ctx, struct tee_shm *shm)
+{
+ struct tee_shm *shm_arg;
+ struct optee_msg_arg *msg_arg;
+ phys_addr_t msg_parg;
+ int rc = 0;
+
+ shm_arg = get_msg_arg(ctx, 1, &msg_arg, &msg_parg);
+ if (IS_ERR(shm_arg))
+ return PTR_ERR(shm_arg);
+
+ msg_arg->cmd = OPTEE_MSG_CMD_UNREGISTER_SHM;
+
+ msg_arg->params[0].attr = OPTEE_MSG_ATTR_TYPE_RMEM_INPUT;
+ msg_arg->params[0].u.rmem.shm_ref = (unsigned long)shm;
+
+ if (optee_do_call_with_arg(ctx, msg_parg) ||
+ msg_arg->ret != TEEC_SUCCESS)
+ rc = -EINVAL;
+ tee_shm_free(shm_arg);
+ return rc;
+}
+
+int optee_shm_register_supp(struct tee_context *ctx, struct tee_shm *shm,
+ struct page **pages, size_t num_pages,
+ unsigned long start)
+{
+ /*
+ * We don't want to register supplicant memory in OP-TEE.
+ * Instead information about it will be passed in RPC code.
+ */
+ return check_mem_type(start, num_pages);
+}
+
+int optee_shm_unregister_supp(struct tee_context *ctx, struct tee_shm *shm)
+{
+ return 0;
+}
diff --git a/drivers/tee/optee/core.c b/drivers/tee/optee/core.c
new file mode 100644
index 000000000000..e9843c53fe31
--- /dev/null
+++ b/drivers/tee/optee/core.c
@@ -0,0 +1,705 @@
+/*
+ * Copyright (c) 2015, Linaro Limited
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/arm-smccc.h>
+#include <linux/errno.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/tee_drv.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include "optee_private.h"
+#include "optee_smc.h"
+#include "shm_pool.h"
+
+#define DRIVER_NAME "optee"
+
+#define OPTEE_SHM_NUM_PRIV_PAGES 1
+
+/**
+ * optee_from_msg_param() - convert from OPTEE_MSG parameters to
+ * struct tee_param
+ * @params: subsystem internal parameter representation
+ * @num_params: number of elements in the parameter arrays
+ * @msg_params: OPTEE_MSG parameters
+ * Returns 0 on success or <0 on failure
+ */
+int optee_from_msg_param(struct tee_param *params, size_t num_params,
+ const struct optee_msg_param *msg_params)
+{
+ int rc;
+ size_t n;
+ struct tee_shm *shm;
+ phys_addr_t pa;
+
+ for (n = 0; n < num_params; n++) {
+ struct tee_param *p = params + n;
+ const struct optee_msg_param *mp = msg_params + n;
+ u32 attr = mp->attr & OPTEE_MSG_ATTR_TYPE_MASK;
+
+ switch (attr) {
+ case OPTEE_MSG_ATTR_TYPE_NONE:
+ p->attr = TEE_IOCTL_PARAM_ATTR_TYPE_NONE;
+ memset(&p->u, 0, sizeof(p->u));
+ break;
+ case OPTEE_MSG_ATTR_TYPE_VALUE_INPUT:
+ case OPTEE_MSG_ATTR_TYPE_VALUE_OUTPUT:
+ case OPTEE_MSG_ATTR_TYPE_VALUE_INOUT:
+ p->attr = TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INPUT +
+ attr - OPTEE_MSG_ATTR_TYPE_VALUE_INPUT;
+ p->u.value.a = mp->u.value.a;
+ p->u.value.b = mp->u.value.b;
+ p->u.value.c = mp->u.value.c;
+ break;
+ case OPTEE_MSG_ATTR_TYPE_TMEM_INPUT:
+ case OPTEE_MSG_ATTR_TYPE_TMEM_OUTPUT:
+ case OPTEE_MSG_ATTR_TYPE_TMEM_INOUT:
+ p->attr = TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT +
+ attr - OPTEE_MSG_ATTR_TYPE_TMEM_INPUT;
+ p->u.memref.size = mp->u.tmem.size;
+ shm = (struct tee_shm *)(unsigned long)
+ mp->u.tmem.shm_ref;
+ if (!shm) {
+ p->u.memref.shm_offs = 0;
+ p->u.memref.shm = NULL;
+ break;
+ }
+ rc = tee_shm_get_pa(shm, 0, &pa);
+ if (rc)
+ return rc;
+ p->u.memref.shm_offs = mp->u.tmem.buf_ptr - pa;
+ p->u.memref.shm = shm;
+
+ /* Check that the memref is covered by the shm object */
+ if (p->u.memref.size) {
+ size_t o = p->u.memref.shm_offs +
+ p->u.memref.size - 1;
+
+ rc = tee_shm_get_pa(shm, o, NULL);
+ if (rc)
+ return rc;
+ }
+ break;
+ case OPTEE_MSG_ATTR_TYPE_RMEM_INPUT:
+ case OPTEE_MSG_ATTR_TYPE_RMEM_OUTPUT:
+ case OPTEE_MSG_ATTR_TYPE_RMEM_INOUT:
+ p->attr = TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT +
+ attr - OPTEE_MSG_ATTR_TYPE_RMEM_INPUT;
+ p->u.memref.size = mp->u.rmem.size;
+ shm = (struct tee_shm *)(unsigned long)
+ mp->u.rmem.shm_ref;
+
+ if (!shm) {
+ p->u.memref.shm_offs = 0;
+ p->u.memref.shm = NULL;
+ break;
+ }
+ p->u.memref.shm_offs = mp->u.rmem.offs;
+ p->u.memref.shm = shm;
+
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static int to_msg_param_tmp_mem(struct optee_msg_param *mp,
+ const struct tee_param *p)
+{
+ int rc;
+ phys_addr_t pa;
+
+ mp->attr = OPTEE_MSG_ATTR_TYPE_TMEM_INPUT + p->attr -
+ TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT;
+
+ mp->u.tmem.shm_ref = (unsigned long)p->u.memref.shm;
+ mp->u.tmem.size = p->u.memref.size;
+
+ if (!p->u.memref.shm) {
+ mp->u.tmem.buf_ptr = 0;
+ return 0;
+ }
+
+ rc = tee_shm_get_pa(p->u.memref.shm, p->u.memref.shm_offs, &pa);
+ if (rc)
+ return rc;
+
+ mp->u.tmem.buf_ptr = pa;
+ mp->attr |= OPTEE_MSG_ATTR_CACHE_PREDEFINED <<
+ OPTEE_MSG_ATTR_CACHE_SHIFT;
+
+ return 0;
+}
+
+static int to_msg_param_reg_mem(struct optee_msg_param *mp,
+ const struct tee_param *p)
+{
+ mp->attr = OPTEE_MSG_ATTR_TYPE_RMEM_INPUT + p->attr -
+ TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT;
+
+ mp->u.rmem.shm_ref = (unsigned long)p->u.memref.shm;
+ mp->u.rmem.size = p->u.memref.size;
+ mp->u.rmem.offs = p->u.memref.shm_offs;
+ return 0;
+}
+
+/**
+ * optee_to_msg_param() - convert from struct tee_params to OPTEE_MSG parameters
+ * @msg_params: OPTEE_MSG parameters
+ * @num_params: number of elements in the parameter arrays
+ * @params: subsystem itnernal parameter representation
+ * Returns 0 on success or <0 on failure
+ */
+int optee_to_msg_param(struct optee_msg_param *msg_params, size_t num_params,
+ const struct tee_param *params)
+{
+ int rc;
+ size_t n;
+
+ for (n = 0; n < num_params; n++) {
+ const struct tee_param *p = params + n;
+ struct optee_msg_param *mp = msg_params + n;
+
+ switch (p->attr) {
+ case TEE_IOCTL_PARAM_ATTR_TYPE_NONE:
+ mp->attr = TEE_IOCTL_PARAM_ATTR_TYPE_NONE;
+ memset(&mp->u, 0, sizeof(mp->u));
+ break;
+ case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INPUT:
+ case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT:
+ case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT:
+ mp->attr = OPTEE_MSG_ATTR_TYPE_VALUE_INPUT + p->attr -
+ TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INPUT;
+ mp->u.value.a = p->u.value.a;
+ mp->u.value.b = p->u.value.b;
+ mp->u.value.c = p->u.value.c;
+ break;
+ case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT:
+ case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT:
+ case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT:
+ if (tee_shm_is_registered(p->u.memref.shm))
+ rc = to_msg_param_reg_mem(mp, p);
+ else
+ rc = to_msg_param_tmp_mem(mp, p);
+ if (rc)
+ return rc;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static void optee_get_version(struct tee_device *teedev,
+ struct tee_ioctl_version_data *vers)
+{
+ struct tee_ioctl_version_data v = {
+ .impl_id = TEE_IMPL_ID_OPTEE,
+ .impl_caps = TEE_OPTEE_CAP_TZ,
+ .gen_caps = TEE_GEN_CAP_GP,
+ };
+ struct optee *optee = tee_get_drvdata(teedev);
+
+ if (optee->sec_caps & OPTEE_SMC_SEC_CAP_DYNAMIC_SHM)
+ v.gen_caps |= TEE_GEN_CAP_REG_MEM;
+ *vers = v;
+}
+
+static int optee_open(struct tee_context *ctx)
+{
+ struct optee_context_data *ctxdata;
+ struct tee_device *teedev = ctx->teedev;
+ struct optee *optee = tee_get_drvdata(teedev);
+
+ ctxdata = kzalloc(sizeof(*ctxdata), GFP_KERNEL);
+ if (!ctxdata)
+ return -ENOMEM;
+
+ if (teedev == optee->supp_teedev) {
+ bool busy = true;
+
+ mutex_lock(&optee->supp.mutex);
+ if (!optee->supp.ctx) {
+ busy = false;
+ optee->supp.ctx = ctx;
+ }
+ mutex_unlock(&optee->supp.mutex);
+ if (busy) {
+ kfree(ctxdata);
+ return -EBUSY;
+ }
+ }
+
+ mutex_init(&ctxdata->mutex);
+ INIT_LIST_HEAD(&ctxdata->sess_list);
+
+ ctx->data = ctxdata;
+ return 0;
+}
+
+static void optee_release(struct tee_context *ctx)
+{
+ struct optee_context_data *ctxdata = ctx->data;
+ struct tee_device *teedev = ctx->teedev;
+ struct optee *optee = tee_get_drvdata(teedev);
+ struct tee_shm *shm;
+ struct optee_msg_arg *arg = NULL;
+ phys_addr_t parg;
+ struct optee_session *sess;
+ struct optee_session *sess_tmp;
+
+ if (!ctxdata)
+ return;
+
+ shm = tee_shm_alloc(ctx, sizeof(struct optee_msg_arg), TEE_SHM_MAPPED);
+ if (!IS_ERR(shm)) {
+ arg = tee_shm_get_va(shm, 0);
+ /*
+ * If va2pa fails for some reason, we can't call into
+ * secure world, only free the memory. Secure OS will leak
+ * sessions and finally refuse more sessions, but we will
+ * at least let normal world reclaim its memory.
+ */
+ if (!IS_ERR(arg))
+ if (tee_shm_va2pa(shm, arg, &parg))
+ arg = NULL; /* prevent usage of parg below */
+ }
+
+ list_for_each_entry_safe(sess, sess_tmp, &ctxdata->sess_list,
+ list_node) {
+ list_del(&sess->list_node);
+ if (!IS_ERR_OR_NULL(arg)) {
+ memset(arg, 0, sizeof(*arg));
+ arg->cmd = OPTEE_MSG_CMD_CLOSE_SESSION;
+ arg->session = sess->session_id;
+ optee_do_call_with_arg(ctx, parg);
+ }
+ kfree(sess);
+ }
+ kfree(ctxdata);
+
+ if (!IS_ERR(shm))
+ tee_shm_free(shm);
+
+ ctx->data = NULL;
+
+ if (teedev == optee->supp_teedev)
+ optee_supp_release(&optee->supp);
+}
+
+static const struct tee_driver_ops optee_ops = {
+ .get_version = optee_get_version,
+ .open = optee_open,
+ .release = optee_release,
+ .open_session = optee_open_session,
+ .close_session = optee_close_session,
+ .invoke_func = optee_invoke_func,
+ .cancel_req = optee_cancel_req,
+ .shm_register = optee_shm_register,
+ .shm_unregister = optee_shm_unregister,
+};
+
+static const struct tee_desc optee_desc = {
+ .name = DRIVER_NAME "-clnt",
+ .ops = &optee_ops,
+ .owner = THIS_MODULE,
+};
+
+static const struct tee_driver_ops optee_supp_ops = {
+ .get_version = optee_get_version,
+ .open = optee_open,
+ .release = optee_release,
+ .supp_recv = optee_supp_recv,
+ .supp_send = optee_supp_send,
+ .shm_register = optee_shm_register_supp,
+ .shm_unregister = optee_shm_unregister_supp,
+};
+
+static const struct tee_desc optee_supp_desc = {
+ .name = DRIVER_NAME "-supp",
+ .ops = &optee_supp_ops,
+ .owner = THIS_MODULE,
+ .flags = TEE_DESC_PRIVILEGED,
+};
+
+static bool optee_msg_api_uid_is_optee_api(optee_invoke_fn *invoke_fn)
+{
+ struct arm_smccc_res res;
+
+ invoke_fn(OPTEE_SMC_CALLS_UID, 0, 0, 0, 0, 0, 0, 0, &res);
+
+ if (res.a0 == OPTEE_MSG_UID_0 && res.a1 == OPTEE_MSG_UID_1 &&
+ res.a2 == OPTEE_MSG_UID_2 && res.a3 == OPTEE_MSG_UID_3)
+ return true;
+ return false;
+}
+
+static bool optee_msg_api_revision_is_compatible(optee_invoke_fn *invoke_fn)
+{
+ union {
+ struct arm_smccc_res smccc;
+ struct optee_smc_calls_revision_result result;
+ } res;
+
+ invoke_fn(OPTEE_SMC_CALLS_REVISION, 0, 0, 0, 0, 0, 0, 0, &res.smccc);
+
+ if (res.result.major == OPTEE_MSG_REVISION_MAJOR &&
+ (int)res.result.minor >= OPTEE_MSG_REVISION_MINOR)
+ return true;
+ return false;
+}
+
+static bool optee_msg_exchange_capabilities(optee_invoke_fn *invoke_fn,
+ u32 *sec_caps)
+{
+ union {
+ struct arm_smccc_res smccc;
+ struct optee_smc_exchange_capabilities_result result;
+ } res;
+ u32 a1 = 0;
+
+ /*
+ * TODO This isn't enough to tell if it's UP system (from kernel
+ * point of view) or not, is_smp() returns the the information
+ * needed, but can't be called directly from here.
+ */
+ if (!IS_ENABLED(CONFIG_SMP) || nr_cpu_ids == 1)
+ a1 |= OPTEE_SMC_NSEC_CAP_UNIPROCESSOR;
+
+ invoke_fn(OPTEE_SMC_EXCHANGE_CAPABILITIES, a1, 0, 0, 0, 0, 0, 0,
+ &res.smccc);
+
+ if (res.result.status != OPTEE_SMC_RETURN_OK)
+ return false;
+
+ *sec_caps = res.result.capabilities;
+ return true;
+}
+
+static struct tee_shm_pool *
+optee_config_shm_memremap(optee_invoke_fn *invoke_fn, void **memremaped_shm,
+ u32 sec_caps)
+{
+ union {
+ struct arm_smccc_res smccc;
+ struct optee_smc_get_shm_config_result result;
+ } res;
+ unsigned long vaddr;
+ phys_addr_t paddr;
+ size_t size;
+ phys_addr_t begin;
+ phys_addr_t end;
+ void *va;
+ struct tee_shm_pool_mgr *priv_mgr;
+ struct tee_shm_pool_mgr *dmabuf_mgr;
+ void *rc;
+
+ invoke_fn(OPTEE_SMC_GET_SHM_CONFIG, 0, 0, 0, 0, 0, 0, 0, &res.smccc);
+ if (res.result.status != OPTEE_SMC_RETURN_OK) {
+ pr_info("shm service not available\n");
+ return ERR_PTR(-ENOENT);
+ }
+
+ if (res.result.settings != OPTEE_SMC_SHM_CACHED) {
+ pr_err("only normal cached shared memory supported\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ begin = roundup(res.result.start, PAGE_SIZE);
+ end = rounddown(res.result.start + res.result.size, PAGE_SIZE);
+ paddr = begin;
+ size = end - begin;
+
+ if (size < 2 * OPTEE_SHM_NUM_PRIV_PAGES * PAGE_SIZE) {
+ pr_err("too small shared memory area\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ va = memremap(paddr, size, MEMREMAP_WB);
+ if (!va) {
+ pr_err("shared memory ioremap failed\n");
+ return ERR_PTR(-EINVAL);
+ }
+ vaddr = (unsigned long)va;
+
+ /*
+ * If OP-TEE can work with unregistered SHM, we will use own pool
+ * for private shm
+ */
+ if (sec_caps & OPTEE_SMC_SEC_CAP_DYNAMIC_SHM) {
+ rc = optee_shm_pool_alloc_pages();
+ if (IS_ERR(rc))
+ goto err_memunmap;
+ priv_mgr = rc;
+ } else {
+ const size_t sz = OPTEE_SHM_NUM_PRIV_PAGES * PAGE_SIZE;
+
+ rc = tee_shm_pool_mgr_alloc_res_mem(vaddr, paddr, sz,
+ 3 /* 8 bytes aligned */);
+ if (IS_ERR(rc))
+ goto err_memunmap;
+ priv_mgr = rc;
+
+ vaddr += sz;
+ paddr += sz;
+ size -= sz;
+ }
+
+ rc = tee_shm_pool_mgr_alloc_res_mem(vaddr, paddr, size, PAGE_SHIFT);
+ if (IS_ERR(rc))
+ goto err_free_priv_mgr;
+ dmabuf_mgr = rc;
+
+ rc = tee_shm_pool_alloc(priv_mgr, dmabuf_mgr);
+ if (IS_ERR(rc))
+ goto err_free_dmabuf_mgr;
+
+ *memremaped_shm = va;
+
+ return rc;
+
+err_free_dmabuf_mgr:
+ tee_shm_pool_mgr_destroy(dmabuf_mgr);
+err_free_priv_mgr:
+ tee_shm_pool_mgr_destroy(priv_mgr);
+err_memunmap:
+ memunmap(va);
+ return rc;
+}
+
+/* Simple wrapper functions to be able to use a function pointer */
+static void optee_smccc_smc(unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3,
+ unsigned long a4, unsigned long a5,
+ unsigned long a6, unsigned long a7,
+ struct arm_smccc_res *res)
+{
+ arm_smccc_smc(a0, a1, a2, a3, a4, a5, a6, a7, res);
+}
+
+static void optee_smccc_hvc(unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3,
+ unsigned long a4, unsigned long a5,
+ unsigned long a6, unsigned long a7,
+ struct arm_smccc_res *res)
+{
+ arm_smccc_hvc(a0, a1, a2, a3, a4, a5, a6, a7, res);
+}
+
+static optee_invoke_fn *get_invoke_func(struct device_node *np)
+{
+ const char *method;
+
+ pr_info("probing for conduit method from DT.\n");
+
+ if (of_property_read_string(np, "method", &method)) {
+ pr_warn("missing \"method\" property\n");
+ return ERR_PTR(-ENXIO);
+ }
+
+ if (!strcmp("hvc", method))
+ return optee_smccc_hvc;
+ else if (!strcmp("smc", method))
+ return optee_smccc_smc;
+
+ pr_warn("invalid \"method\" property: %s\n", method);
+ return ERR_PTR(-EINVAL);
+}
+
+static struct optee *optee_probe(struct device_node *np)
+{
+ optee_invoke_fn *invoke_fn;
+ struct tee_shm_pool *pool;
+ struct optee *optee = NULL;
+ void *memremaped_shm = NULL;
+ struct tee_device *teedev;
+ u32 sec_caps;
+ int rc;
+
+ invoke_fn = get_invoke_func(np);
+ if (IS_ERR(invoke_fn))
+ return (void *)invoke_fn;
+
+ if (!optee_msg_api_uid_is_optee_api(invoke_fn)) {
+ pr_warn("api uid mismatch\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (!optee_msg_api_revision_is_compatible(invoke_fn)) {
+ pr_warn("api revision mismatch\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (!optee_msg_exchange_capabilities(invoke_fn, &sec_caps)) {
+ pr_warn("capabilities mismatch\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ /*
+ * We have no other option for shared memory, if secure world
+ * doesn't have any reserved memory we can use we can't continue.
+ */
+ if (!(sec_caps & OPTEE_SMC_SEC_CAP_HAVE_RESERVED_SHM))
+ return ERR_PTR(-EINVAL);
+
+ pool = optee_config_shm_memremap(invoke_fn, &memremaped_shm, sec_caps);
+ if (IS_ERR(pool))
+ return (void *)pool;
+
+ optee = kzalloc(sizeof(*optee), GFP_KERNEL);
+ if (!optee) {
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ optee->invoke_fn = invoke_fn;
+ optee->sec_caps = sec_caps;
+
+ teedev = tee_device_alloc(&optee_desc, NULL, pool, optee);
+ if (IS_ERR(teedev)) {
+ rc = PTR_ERR(teedev);
+ goto err;
+ }
+ optee->teedev = teedev;
+
+ teedev = tee_device_alloc(&optee_supp_desc, NULL, pool, optee);
+ if (IS_ERR(teedev)) {
+ rc = PTR_ERR(teedev);
+ goto err;
+ }
+ optee->supp_teedev = teedev;
+
+ rc = tee_device_register(optee->teedev);
+ if (rc)
+ goto err;
+
+ rc = tee_device_register(optee->supp_teedev);
+ if (rc)
+ goto err;
+
+ mutex_init(&optee->call_queue.mutex);
+ INIT_LIST_HEAD(&optee->call_queue.waiters);
+ optee_wait_queue_init(&optee->wait_queue);
+ optee_supp_init(&optee->supp);
+ optee->memremaped_shm = memremaped_shm;
+ optee->pool = pool;
+
+ optee_enable_shm_cache(optee);
+
+ pr_info("initialized driver\n");
+ return optee;
+err:
+ if (optee) {
+ /*
+ * tee_device_unregister() is safe to call even if the
+ * devices hasn't been registered with
+ * tee_device_register() yet.
+ */
+ tee_device_unregister(optee->supp_teedev);
+ tee_device_unregister(optee->teedev);
+ kfree(optee);
+ }
+ if (pool)
+ tee_shm_pool_free(pool);
+ if (memremaped_shm)
+ memunmap(memremaped_shm);
+ return ERR_PTR(rc);
+}
+
+static void optee_remove(struct optee *optee)
+{
+ /*
+ * Ask OP-TEE to free all cached shared memory objects to decrease
+ * reference counters and also avoid wild pointers in secure world
+ * into the old shared memory range.
+ */
+ optee_disable_shm_cache(optee);
+
+ /*
+ * The two devices has to be unregistered before we can free the
+ * other resources.
+ */
+ tee_device_unregister(optee->supp_teedev);
+ tee_device_unregister(optee->teedev);
+
+ tee_shm_pool_free(optee->pool);
+ if (optee->memremaped_shm)
+ memunmap(optee->memremaped_shm);
+ optee_wait_queue_exit(&optee->wait_queue);
+ optee_supp_uninit(&optee->supp);
+ mutex_destroy(&optee->call_queue.mutex);
+
+ kfree(optee);
+}
+
+static const struct of_device_id optee_match[] = {
+ { .compatible = "linaro,optee-tz" },
+ {},
+};
+
+static struct optee *optee_svc;
+
+static int __init optee_driver_init(void)
+{
+ struct device_node *fw_np;
+ struct device_node *np;
+ struct optee *optee;
+
+ /* Node is supposed to be below /firmware */
+ fw_np = of_find_node_by_name(NULL, "firmware");
+ if (!fw_np)
+ return -ENODEV;
+
+ np = of_find_matching_node(fw_np, optee_match);
+ if (!np)
+ return -ENODEV;
+
+ optee = optee_probe(np);
+ of_node_put(np);
+
+ if (IS_ERR(optee))
+ return PTR_ERR(optee);
+
+ optee_svc = optee;
+
+ return 0;
+}
+module_init(optee_driver_init);
+
+static void __exit optee_driver_exit(void)
+{
+ struct optee *optee = optee_svc;
+
+ optee_svc = NULL;
+ if (optee)
+ optee_remove(optee);
+}
+module_exit(optee_driver_exit);
+
+MODULE_AUTHOR("Linaro");
+MODULE_DESCRIPTION("OP-TEE driver");
+MODULE_SUPPORTED_DEVICE("");
+MODULE_VERSION("1.0");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/tee/optee/optee_msg.h b/drivers/tee/optee/optee_msg.h
new file mode 100644
index 000000000000..30504901be80
--- /dev/null
+++ b/drivers/tee/optee/optee_msg.h
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2015-2016, Linaro Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _OPTEE_MSG_H
+#define _OPTEE_MSG_H
+
+#include <linux/bitops.h>
+#include <linux/types.h>
+
+/*
+ * This file defines the OP-TEE message protocol used to communicate
+ * with an instance of OP-TEE running in secure world.
+ *
+ * This file is divided into three sections.
+ * 1. Formatting of messages.
+ * 2. Requests from normal world
+ * 3. Requests from secure world, Remote Procedure Call (RPC), handled by
+ * tee-supplicant.
+ */
+
+/*****************************************************************************
+ * Part 1 - formatting of messages
+ *****************************************************************************/
+
+#define OPTEE_MSG_ATTR_TYPE_NONE 0x0
+#define OPTEE_MSG_ATTR_TYPE_VALUE_INPUT 0x1
+#define OPTEE_MSG_ATTR_TYPE_VALUE_OUTPUT 0x2
+#define OPTEE_MSG_ATTR_TYPE_VALUE_INOUT 0x3
+#define OPTEE_MSG_ATTR_TYPE_RMEM_INPUT 0x5
+#define OPTEE_MSG_ATTR_TYPE_RMEM_OUTPUT 0x6
+#define OPTEE_MSG_ATTR_TYPE_RMEM_INOUT 0x7
+#define OPTEE_MSG_ATTR_TYPE_TMEM_INPUT 0x9
+#define OPTEE_MSG_ATTR_TYPE_TMEM_OUTPUT 0xa
+#define OPTEE_MSG_ATTR_TYPE_TMEM_INOUT 0xb
+
+#define OPTEE_MSG_ATTR_TYPE_MASK GENMASK(7, 0)
+
+/*
+ * Meta parameter to be absorbed by the Secure OS and not passed
+ * to the Trusted Application.
+ *
+ * Currently only used with OPTEE_MSG_CMD_OPEN_SESSION.
+ */
+#define OPTEE_MSG_ATTR_META BIT(8)
+
+/*
+ * Pointer to a list of pages used to register user-defined SHM buffer.
+ * Used with OPTEE_MSG_ATTR_TYPE_TMEM_*.
+ * buf_ptr should point to the beginning of the buffer. Buffer will contain
+ * list of page addresses. OP-TEE core can reconstruct contiguous buffer from
+ * that page addresses list. Page addresses are stored as 64 bit values.
+ * Last entry on a page should point to the next page of buffer.
+ * Every entry in buffer should point to a 4k page beginning (12 least
+ * significant bits must be equal to zero).
+ *
+ * 12 least significant bints of optee_msg_param.u.tmem.buf_ptr should hold page
+ * offset of the user buffer.
+ *
+ * So, entries should be placed like members of this structure:
+ *
+ * struct page_data {
+ * uint64_t pages_array[OPTEE_MSG_NONCONTIG_PAGE_SIZE/sizeof(uint64_t) - 1];
+ * uint64_t next_page_data;
+ * };
+ *
+ * Structure is designed to exactly fit into the page size
+ * OPTEE_MSG_NONCONTIG_PAGE_SIZE which is a standard 4KB page.
+ *
+ * The size of 4KB is chosen because this is the smallest page size for ARM
+ * architectures. If REE uses larger pages, it should divide them to 4KB ones.
+ */
+#define OPTEE_MSG_ATTR_NONCONTIG BIT(9)
+
+/*
+ * Memory attributes for caching passed with temp memrefs. The actual value
+ * used is defined outside the message protocol with the exception of
+ * OPTEE_MSG_ATTR_CACHE_PREDEFINED which means the attributes already
+ * defined for the memory range should be used. If optee_smc.h is used as
+ * bearer of this protocol OPTEE_SMC_SHM_* is used for values.
+ */
+#define OPTEE_MSG_ATTR_CACHE_SHIFT 16
+#define OPTEE_MSG_ATTR_CACHE_MASK GENMASK(2, 0)
+#define OPTEE_MSG_ATTR_CACHE_PREDEFINED 0
+
+/*
+ * Same values as TEE_LOGIN_* from TEE Internal API
+ */
+#define OPTEE_MSG_LOGIN_PUBLIC 0x00000000
+#define OPTEE_MSG_LOGIN_USER 0x00000001
+#define OPTEE_MSG_LOGIN_GROUP 0x00000002
+#define OPTEE_MSG_LOGIN_APPLICATION 0x00000004
+#define OPTEE_MSG_LOGIN_APPLICATION_USER 0x00000005
+#define OPTEE_MSG_LOGIN_APPLICATION_GROUP 0x00000006
+
+/*
+ * Page size used in non-contiguous buffer entries
+ */
+#define OPTEE_MSG_NONCONTIG_PAGE_SIZE 4096
+
+/**
+ * struct optee_msg_param_tmem - temporary memory reference parameter
+ * @buf_ptr: Address of the buffer
+ * @size: Size of the buffer
+ * @shm_ref: Temporary shared memory reference, pointer to a struct tee_shm
+ *
+ * Secure and normal world communicates pointers as physical address
+ * instead of the virtual address. This is because secure and normal world
+ * have completely independent memory mapping. Normal world can even have a
+ * hypervisor which need to translate the guest physical address (AKA IPA
+ * in ARM documentation) to a real physical address before passing the
+ * structure to secure world.
+ */
+struct optee_msg_param_tmem {
+ u64 buf_ptr;
+ u64 size;
+ u64 shm_ref;
+};
+
+/**
+ * struct optee_msg_param_rmem - registered memory reference parameter
+ * @offs: Offset into shared memory reference
+ * @size: Size of the buffer
+ * @shm_ref: Shared memory reference, pointer to a struct tee_shm
+ */
+struct optee_msg_param_rmem {
+ u64 offs;
+ u64 size;
+ u64 shm_ref;
+};
+
+/**
+ * struct optee_msg_param_value - opaque value parameter
+ *
+ * Value parameters are passed unchecked between normal and secure world.
+ */
+struct optee_msg_param_value {
+ u64 a;
+ u64 b;
+ u64 c;
+};
+
+/**
+ * struct optee_msg_param - parameter used together with struct optee_msg_arg
+ * @attr: attributes
+ * @tmem: parameter by temporary memory reference
+ * @rmem: parameter by registered memory reference
+ * @value: parameter by opaque value
+ *
+ * @attr & OPTEE_MSG_ATTR_TYPE_MASK indicates if tmem, rmem or value is used in
+ * the union. OPTEE_MSG_ATTR_TYPE_VALUE_* indicates value,
+ * OPTEE_MSG_ATTR_TYPE_TMEM_* indicates @tmem and
+ * OPTEE_MSG_ATTR_TYPE_RMEM_* indicates @rmem,
+ * OPTEE_MSG_ATTR_TYPE_NONE indicates that none of the members are used.
+ */
+struct optee_msg_param {
+ u64 attr;
+ union {
+ struct optee_msg_param_tmem tmem;
+ struct optee_msg_param_rmem rmem;
+ struct optee_msg_param_value value;
+ } u;
+};
+
+/**
+ * struct optee_msg_arg - call argument
+ * @cmd: Command, one of OPTEE_MSG_CMD_* or OPTEE_MSG_RPC_CMD_*
+ * @func: Trusted Application function, specific to the Trusted Application,
+ * used if cmd == OPTEE_MSG_CMD_INVOKE_COMMAND
+ * @session: In parameter for all OPTEE_MSG_CMD_* except
+ * OPTEE_MSG_CMD_OPEN_SESSION where it's an output parameter instead
+ * @cancel_id: Cancellation id, a unique value to identify this request
+ * @ret: return value
+ * @ret_origin: origin of the return value
+ * @num_params: number of parameters supplied to the OS Command
+ * @params: the parameters supplied to the OS Command
+ *
+ * All normal calls to Trusted OS uses this struct. If cmd requires further
+ * information than what these field holds it can be passed as a parameter
+ * tagged as meta (setting the OPTEE_MSG_ATTR_META bit in corresponding
+ * attrs field). All parameters tagged as meta has to come first.
+ *
+ * Temp memref parameters can be fragmented if supported by the Trusted OS
+ * (when optee_smc.h is bearer of this protocol this is indicated with
+ * OPTEE_SMC_SEC_CAP_UNREGISTERED_SHM). If a logical memref parameter is
+ * fragmented then has all but the last fragment the
+ * OPTEE_MSG_ATTR_FRAGMENT bit set in attrs. Even if a memref is fragmented
+ * it will still be presented as a single logical memref to the Trusted
+ * Application.
+ */
+struct optee_msg_arg {
+ u32 cmd;
+ u32 func;
+ u32 session;
+ u32 cancel_id;
+ u32 pad;
+ u32 ret;
+ u32 ret_origin;
+ u32 num_params;
+
+ /* num_params tells the actual number of element in params */
+ struct optee_msg_param params[0];
+};
+
+/**
+ * OPTEE_MSG_GET_ARG_SIZE - return size of struct optee_msg_arg
+ *
+ * @num_params: Number of parameters embedded in the struct optee_msg_arg
+ *
+ * Returns the size of the struct optee_msg_arg together with the number
+ * of embedded parameters.
+ */
+#define OPTEE_MSG_GET_ARG_SIZE(num_params) \
+ (sizeof(struct optee_msg_arg) + \
+ sizeof(struct optee_msg_param) * (num_params))
+
+/*****************************************************************************
+ * Part 2 - requests from normal world
+ *****************************************************************************/
+
+/*
+ * Return the following UID if using API specified in this file without
+ * further extensions:
+ * 384fb3e0-e7f8-11e3-af63-0002a5d5c51b.
+ * Represented in 4 32-bit words in OPTEE_MSG_UID_0, OPTEE_MSG_UID_1,
+ * OPTEE_MSG_UID_2, OPTEE_MSG_UID_3.
+ */
+#define OPTEE_MSG_UID_0 0x384fb3e0
+#define OPTEE_MSG_UID_1 0xe7f811e3
+#define OPTEE_MSG_UID_2 0xaf630002
+#define OPTEE_MSG_UID_3 0xa5d5c51b
+#define OPTEE_MSG_FUNCID_CALLS_UID 0xFF01
+
+/*
+ * Returns 2.0 if using API specified in this file without further
+ * extensions. Represented in 2 32-bit words in OPTEE_MSG_REVISION_MAJOR
+ * and OPTEE_MSG_REVISION_MINOR
+ */
+#define OPTEE_MSG_REVISION_MAJOR 2
+#define OPTEE_MSG_REVISION_MINOR 0
+#define OPTEE_MSG_FUNCID_CALLS_REVISION 0xFF03
+
+/*
+ * Get UUID of Trusted OS.
+ *
+ * Used by non-secure world to figure out which Trusted OS is installed.
+ * Note that returned UUID is the UUID of the Trusted OS, not of the API.
+ *
+ * Returns UUID in 4 32-bit words in the same way as
+ * OPTEE_MSG_FUNCID_CALLS_UID described above.
+ */
+#define OPTEE_MSG_OS_OPTEE_UUID_0 0x486178e0
+#define OPTEE_MSG_OS_OPTEE_UUID_1 0xe7f811e3
+#define OPTEE_MSG_OS_OPTEE_UUID_2 0xbc5e0002
+#define OPTEE_MSG_OS_OPTEE_UUID_3 0xa5d5c51b
+#define OPTEE_MSG_FUNCID_GET_OS_UUID 0x0000
+
+/*
+ * Get revision of Trusted OS.
+ *
+ * Used by non-secure world to figure out which version of the Trusted OS
+ * is installed. Note that the returned revision is the revision of the
+ * Trusted OS, not of the API.
+ *
+ * Returns revision in 2 32-bit words in the same way as
+ * OPTEE_MSG_CALLS_REVISION described above.
+ */
+#define OPTEE_MSG_FUNCID_GET_OS_REVISION 0x0001
+
+/*
+ * Do a secure call with struct optee_msg_arg as argument
+ * The OPTEE_MSG_CMD_* below defines what goes in struct optee_msg_arg::cmd
+ *
+ * OPTEE_MSG_CMD_OPEN_SESSION opens a session to a Trusted Application.
+ * The first two parameters are tagged as meta, holding two value
+ * parameters to pass the following information:
+ * param[0].u.value.a-b uuid of Trusted Application
+ * param[1].u.value.a-b uuid of Client
+ * param[1].u.value.c Login class of client OPTEE_MSG_LOGIN_*
+ *
+ * OPTEE_MSG_CMD_INVOKE_COMMAND invokes a command a previously opened
+ * session to a Trusted Application. struct optee_msg_arg::func is Trusted
+ * Application function, specific to the Trusted Application.
+ *
+ * OPTEE_MSG_CMD_CLOSE_SESSION closes a previously opened session to
+ * Trusted Application.
+ *
+ * OPTEE_MSG_CMD_CANCEL cancels a currently invoked command.
+ *
+ * OPTEE_MSG_CMD_REGISTER_SHM registers a shared memory reference. The
+ * information is passed as:
+ * [in] param[0].attr OPTEE_MSG_ATTR_TYPE_TMEM_INPUT
+ * [| OPTEE_MSG_ATTR_FRAGMENT]
+ * [in] param[0].u.tmem.buf_ptr physical address (of first fragment)
+ * [in] param[0].u.tmem.size size (of first fragment)
+ * [in] param[0].u.tmem.shm_ref holds shared memory reference
+ * ...
+ * The shared memory can optionally be fragmented, temp memrefs can follow
+ * each other with all but the last with the OPTEE_MSG_ATTR_FRAGMENT bit set.
+ *
+ * OPTEE_MSG_CMD_UNREGISTER_SHM unregisteres a previously registered shared
+ * memory reference. The information is passed as:
+ * [in] param[0].attr OPTEE_MSG_ATTR_TYPE_RMEM_INPUT
+ * [in] param[0].u.rmem.shm_ref holds shared memory reference
+ * [in] param[0].u.rmem.offs 0
+ * [in] param[0].u.rmem.size 0
+ */
+#define OPTEE_MSG_CMD_OPEN_SESSION 0
+#define OPTEE_MSG_CMD_INVOKE_COMMAND 1
+#define OPTEE_MSG_CMD_CLOSE_SESSION 2
+#define OPTEE_MSG_CMD_CANCEL 3
+#define OPTEE_MSG_CMD_REGISTER_SHM 4
+#define OPTEE_MSG_CMD_UNREGISTER_SHM 5
+#define OPTEE_MSG_FUNCID_CALL_WITH_ARG 0x0004
+
+/*****************************************************************************
+ * Part 3 - Requests from secure world, RPC
+ *****************************************************************************/
+
+/*
+ * All RPC is done with a struct optee_msg_arg as bearer of information,
+ * struct optee_msg_arg::arg holds values defined by OPTEE_MSG_RPC_CMD_* below
+ *
+ * RPC communication with tee-supplicant is reversed compared to normal
+ * client communication desribed above. The supplicant receives requests
+ * and sends responses.
+ */
+
+/*
+ * Load a TA into memory, defined in tee-supplicant
+ */
+#define OPTEE_MSG_RPC_CMD_LOAD_TA 0
+
+/*
+ * Reserved
+ */
+#define OPTEE_MSG_RPC_CMD_RPMB 1
+
+/*
+ * File system access, defined in tee-supplicant
+ */
+#define OPTEE_MSG_RPC_CMD_FS 2
+
+/*
+ * Get time
+ *
+ * Returns number of seconds and nano seconds since the Epoch,
+ * 1970-01-01 00:00:00 +0000 (UTC).
+ *
+ * [out] param[0].u.value.a Number of seconds
+ * [out] param[0].u.value.b Number of nano seconds.
+ */
+#define OPTEE_MSG_RPC_CMD_GET_TIME 3
+
+/*
+ * Wait queue primitive, helper for secure world to implement a wait queue.
+ *
+ * If secure world need to wait for a secure world mutex it issues a sleep
+ * request instead of spinning in secure world. Conversely is a wakeup
+ * request issued when a secure world mutex with a thread waiting thread is
+ * unlocked.
+ *
+ * Waiting on a key
+ * [in] param[0].u.value.a OPTEE_MSG_RPC_WAIT_QUEUE_SLEEP
+ * [in] param[0].u.value.b wait key
+ *
+ * Waking up a key
+ * [in] param[0].u.value.a OPTEE_MSG_RPC_WAIT_QUEUE_WAKEUP
+ * [in] param[0].u.value.b wakeup key
+ */
+#define OPTEE_MSG_RPC_CMD_WAIT_QUEUE 4
+#define OPTEE_MSG_RPC_WAIT_QUEUE_SLEEP 0
+#define OPTEE_MSG_RPC_WAIT_QUEUE_WAKEUP 1
+
+/*
+ * Suspend execution
+ *
+ * [in] param[0].value .a number of milliseconds to suspend
+ */
+#define OPTEE_MSG_RPC_CMD_SUSPEND 5
+
+/*
+ * Allocate a piece of shared memory
+ *
+ * Shared memory can optionally be fragmented, to support that additional
+ * spare param entries are allocated to make room for eventual fragments.
+ * The spare param entries has .attr = OPTEE_MSG_ATTR_TYPE_NONE when
+ * unused. All returned temp memrefs except the last should have the
+ * OPTEE_MSG_ATTR_FRAGMENT bit set in the attr field.
+ *
+ * [in] param[0].u.value.a type of memory one of
+ * OPTEE_MSG_RPC_SHM_TYPE_* below
+ * [in] param[0].u.value.b requested size
+ * [in] param[0].u.value.c required alignment
+ *
+ * [out] param[0].u.tmem.buf_ptr physical address (of first fragment)
+ * [out] param[0].u.tmem.size size (of first fragment)
+ * [out] param[0].u.tmem.shm_ref shared memory reference
+ * ...
+ * [out] param[n].u.tmem.buf_ptr physical address
+ * [out] param[n].u.tmem.size size
+ * [out] param[n].u.tmem.shm_ref shared memory reference (same value
+ * as in param[n-1].u.tmem.shm_ref)
+ */
+#define OPTEE_MSG_RPC_CMD_SHM_ALLOC 6
+/* Memory that can be shared with a non-secure user space application */
+#define OPTEE_MSG_RPC_SHM_TYPE_APPL 0
+/* Memory only shared with non-secure kernel */
+#define OPTEE_MSG_RPC_SHM_TYPE_KERNEL 1
+
+/*
+ * Free shared memory previously allocated with OPTEE_MSG_RPC_CMD_SHM_ALLOC
+ *
+ * [in] param[0].u.value.a type of memory one of
+ * OPTEE_MSG_RPC_SHM_TYPE_* above
+ * [in] param[0].u.value.b value of shared memory reference
+ * returned in param[0].u.tmem.shm_ref
+ * above
+ */
+#define OPTEE_MSG_RPC_CMD_SHM_FREE 7
+
+#endif /* _OPTEE_MSG_H */
diff --git a/drivers/tee/optee/optee_private.h b/drivers/tee/optee/optee_private.h
new file mode 100644
index 000000000000..35e79386c556
--- /dev/null
+++ b/drivers/tee/optee/optee_private.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2015, Linaro Limited
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef OPTEE_PRIVATE_H
+#define OPTEE_PRIVATE_H
+
+#include <linux/arm-smccc.h>
+#include <linux/semaphore.h>
+#include <linux/tee_drv.h>
+#include <linux/types.h>
+#include "optee_msg.h"
+
+#define OPTEE_MAX_ARG_SIZE 1024
+
+/* Some Global Platform error codes used in this driver */
+#define TEEC_SUCCESS 0x00000000
+#define TEEC_ERROR_BAD_PARAMETERS 0xFFFF0006
+#define TEEC_ERROR_COMMUNICATION 0xFFFF000E
+#define TEEC_ERROR_OUT_OF_MEMORY 0xFFFF000C
+
+#define TEEC_ORIGIN_COMMS 0x00000002
+
+typedef void (optee_invoke_fn)(unsigned long, unsigned long, unsigned long,
+ unsigned long, unsigned long, unsigned long,
+ unsigned long, unsigned long,
+ struct arm_smccc_res *);
+
+struct optee_call_queue {
+ /* Serializes access to this struct */
+ struct mutex mutex;
+ struct list_head waiters;
+};
+
+struct optee_wait_queue {
+ /* Serializes access to this struct */
+ struct mutex mu;
+ struct list_head db;
+};
+
+/**
+ * struct optee_supp - supplicant synchronization struct
+ * @ctx the context of current connected supplicant.
+ * if !NULL the supplicant device is available for use,
+ * else busy
+ * @mutex: held while accessing content of this struct
+ * @req_id: current request id if supplicant is doing synchronous
+ * communication, else -1
+ * @reqs: queued request not yet retrieved by supplicant
+ * @idr: IDR holding all requests currently being processed
+ * by supplicant
+ * @reqs_c: completion used by supplicant when waiting for a
+ * request to be queued.
+ */
+struct optee_supp {
+ /* Serializes access to this struct */
+ struct mutex mutex;
+ struct tee_context *ctx;
+
+ int req_id;
+ struct list_head reqs;
+ struct idr idr;
+ struct completion reqs_c;
+};
+
+/**
+ * struct optee - main service struct
+ * @supp_teedev: supplicant device
+ * @teedev: client device
+ * @invoke_fn: function to issue smc or hvc
+ * @call_queue: queue of threads waiting to call @invoke_fn
+ * @wait_queue: queue of threads from secure world waiting for a
+ * secure world sync object
+ * @supp: supplicant synchronization struct for RPC to supplicant
+ * @pool: shared memory pool
+ * @memremaped_shm virtual address of memory in shared memory pool
+ * @sec_caps: secure world capabilities defined by
+ * OPTEE_SMC_SEC_CAP_* in optee_smc.h
+ */
+struct optee {
+ struct tee_device *supp_teedev;
+ struct tee_device *teedev;
+ optee_invoke_fn *invoke_fn;
+ struct optee_call_queue call_queue;
+ struct optee_wait_queue wait_queue;
+ struct optee_supp supp;
+ struct tee_shm_pool *pool;
+ void *memremaped_shm;
+ u32 sec_caps;
+};
+
+struct optee_session {
+ struct list_head list_node;
+ u32 session_id;
+};
+
+struct optee_context_data {
+ /* Serializes access to this struct */
+ struct mutex mutex;
+ struct list_head sess_list;
+};
+
+struct optee_rpc_param {
+ u32 a0;
+ u32 a1;
+ u32 a2;
+ u32 a3;
+ u32 a4;
+ u32 a5;
+ u32 a6;
+ u32 a7;
+};
+
+/* Holds context that is preserved during one STD call */
+struct optee_call_ctx {
+ /* information about pages list used in last allocation */
+ void *pages_list;
+ size_t num_entries;
+};
+
+void optee_handle_rpc(struct tee_context *ctx, struct optee_rpc_param *param,
+ struct optee_call_ctx *call_ctx);
+void optee_rpc_finalize_call(struct optee_call_ctx *call_ctx);
+
+void optee_wait_queue_init(struct optee_wait_queue *wq);
+void optee_wait_queue_exit(struct optee_wait_queue *wq);
+
+u32 optee_supp_thrd_req(struct tee_context *ctx, u32 func, size_t num_params,
+ struct tee_param *param);
+
+int optee_supp_read(struct tee_context *ctx, void __user *buf, size_t len);
+int optee_supp_write(struct tee_context *ctx, void __user *buf, size_t len);
+void optee_supp_init(struct optee_supp *supp);
+void optee_supp_uninit(struct optee_supp *supp);
+void optee_supp_release(struct optee_supp *supp);
+
+int optee_supp_recv(struct tee_context *ctx, u32 *func, u32 *num_params,
+ struct tee_param *param);
+int optee_supp_send(struct tee_context *ctx, u32 ret, u32 num_params,
+ struct tee_param *param);
+
+u32 optee_do_call_with_arg(struct tee_context *ctx, phys_addr_t parg);
+int optee_open_session(struct tee_context *ctx,
+ struct tee_ioctl_open_session_arg *arg,
+ struct tee_param *param);
+int optee_close_session(struct tee_context *ctx, u32 session);
+int optee_invoke_func(struct tee_context *ctx, struct tee_ioctl_invoke_arg *arg,
+ struct tee_param *param);
+int optee_cancel_req(struct tee_context *ctx, u32 cancel_id, u32 session);
+
+void optee_enable_shm_cache(struct optee *optee);
+void optee_disable_shm_cache(struct optee *optee);
+
+int optee_shm_register(struct tee_context *ctx, struct tee_shm *shm,
+ struct page **pages, size_t num_pages,
+ unsigned long start);
+int optee_shm_unregister(struct tee_context *ctx, struct tee_shm *shm);
+
+int optee_shm_register_supp(struct tee_context *ctx, struct tee_shm *shm,
+ struct page **pages, size_t num_pages,
+ unsigned long start);
+int optee_shm_unregister_supp(struct tee_context *ctx, struct tee_shm *shm);
+
+int optee_from_msg_param(struct tee_param *params, size_t num_params,
+ const struct optee_msg_param *msg_params);
+int optee_to_msg_param(struct optee_msg_param *msg_params, size_t num_params,
+ const struct tee_param *params);
+
+u64 *optee_allocate_pages_list(size_t num_entries);
+void optee_free_pages_list(void *array, size_t num_entries);
+void optee_fill_pages_list(u64 *dst, struct page **pages, int num_pages,
+ size_t page_offset);
+
+/*
+ * Small helpers
+ */
+
+static inline void *reg_pair_to_ptr(u32 reg0, u32 reg1)
+{
+ return (void *)(unsigned long)(((u64)reg0 << 32) | reg1);
+}
+
+static inline void reg_pair_from_64(u32 *reg0, u32 *reg1, u64 val)
+{
+ *reg0 = val >> 32;
+ *reg1 = val;
+}
+
+#endif /*OPTEE_PRIVATE_H*/
diff --git a/drivers/tee/optee/optee_smc.h b/drivers/tee/optee/optee_smc.h
new file mode 100644
index 000000000000..7cd327243ada
--- /dev/null
+++ b/drivers/tee/optee/optee_smc.h
@@ -0,0 +1,457 @@
+/*
+ * Copyright (c) 2015-2016, Linaro Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef OPTEE_SMC_H
+#define OPTEE_SMC_H
+
+#include <linux/arm-smccc.h>
+#include <linux/bitops.h>
+
+#define OPTEE_SMC_STD_CALL_VAL(func_num) \
+ ARM_SMCCC_CALL_VAL(ARM_SMCCC_STD_CALL, ARM_SMCCC_SMC_32, \
+ ARM_SMCCC_OWNER_TRUSTED_OS, (func_num))
+#define OPTEE_SMC_FAST_CALL_VAL(func_num) \
+ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_32, \
+ ARM_SMCCC_OWNER_TRUSTED_OS, (func_num))
+
+/*
+ * Function specified by SMC Calling convention.
+ */
+#define OPTEE_SMC_FUNCID_CALLS_COUNT 0xFF00
+#define OPTEE_SMC_CALLS_COUNT \
+ ARM_SMCCC_CALL_VAL(OPTEE_SMC_FAST_CALL, SMCCC_SMC_32, \
+ SMCCC_OWNER_TRUSTED_OS_END, \
+ OPTEE_SMC_FUNCID_CALLS_COUNT)
+
+/*
+ * Normal cached memory (write-back), shareable for SMP systems and not
+ * shareable for UP systems.
+ */
+#define OPTEE_SMC_SHM_CACHED 1
+
+/*
+ * a0..a7 is used as register names in the descriptions below, on arm32
+ * that translates to r0..r7 and on arm64 to w0..w7. In both cases it's
+ * 32-bit registers.
+ */
+
+/*
+ * Function specified by SMC Calling convention
+ *
+ * Return one of the following UIDs if using API specified in this file
+ * without further extentions:
+ * 65cb6b93-af0c-4617-8ed6-644a8d1140f8
+ * see also OPTEE_SMC_UID_* in optee_msg.h
+ */
+#define OPTEE_SMC_FUNCID_CALLS_UID OPTEE_MSG_FUNCID_CALLS_UID
+#define OPTEE_SMC_CALLS_UID \
+ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_32, \
+ ARM_SMCCC_OWNER_TRUSTED_OS_END, \
+ OPTEE_SMC_FUNCID_CALLS_UID)
+
+/*
+ * Function specified by SMC Calling convention
+ *
+ * Returns 2.0 if using API specified in this file without further extentions.
+ * see also OPTEE_MSG_REVISION_* in optee_msg.h
+ */
+#define OPTEE_SMC_FUNCID_CALLS_REVISION OPTEE_MSG_FUNCID_CALLS_REVISION
+#define OPTEE_SMC_CALLS_REVISION \
+ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_32, \
+ ARM_SMCCC_OWNER_TRUSTED_OS_END, \
+ OPTEE_SMC_FUNCID_CALLS_REVISION)
+
+struct optee_smc_calls_revision_result {
+ unsigned long major;
+ unsigned long minor;
+ unsigned long reserved0;
+ unsigned long reserved1;
+};
+
+/*
+ * Get UUID of Trusted OS.
+ *
+ * Used by non-secure world to figure out which Trusted OS is installed.
+ * Note that returned UUID is the UUID of the Trusted OS, not of the API.
+ *
+ * Returns UUID in a0-4 in the same way as OPTEE_SMC_CALLS_UID
+ * described above.
+ */
+#define OPTEE_SMC_FUNCID_GET_OS_UUID OPTEE_MSG_FUNCID_GET_OS_UUID
+#define OPTEE_SMC_CALL_GET_OS_UUID \
+ OPTEE_SMC_FAST_CALL_VAL(OPTEE_SMC_FUNCID_GET_OS_UUID)
+
+/*
+ * Get revision of Trusted OS.
+ *
+ * Used by non-secure world to figure out which version of the Trusted OS
+ * is installed. Note that the returned revision is the revision of the
+ * Trusted OS, not of the API.
+ *
+ * Returns revision in a0-1 in the same way as OPTEE_SMC_CALLS_REVISION
+ * described above.
+ */
+#define OPTEE_SMC_FUNCID_GET_OS_REVISION OPTEE_MSG_FUNCID_GET_OS_REVISION
+#define OPTEE_SMC_CALL_GET_OS_REVISION \
+ OPTEE_SMC_FAST_CALL_VAL(OPTEE_SMC_FUNCID_GET_OS_REVISION)
+
+/*
+ * Call with struct optee_msg_arg as argument
+ *
+ * Call register usage:
+ * a0 SMC Function ID, OPTEE_SMC*CALL_WITH_ARG
+ * a1 Upper 32bit of a 64bit physical pointer to a struct optee_msg_arg
+ * a2 Lower 32bit of a 64bit physical pointer to a struct optee_msg_arg
+ * a3 Cache settings, not used if physical pointer is in a predefined shared
+ * memory area else per OPTEE_SMC_SHM_*
+ * a4-6 Not used
+ * a7 Hypervisor Client ID register
+ *
+ * Normal return register usage:
+ * a0 Return value, OPTEE_SMC_RETURN_*
+ * a1-3 Not used
+ * a4-7 Preserved
+ *
+ * OPTEE_SMC_RETURN_ETHREAD_LIMIT return register usage:
+ * a0 Return value, OPTEE_SMC_RETURN_ETHREAD_LIMIT
+ * a1-3 Preserved
+ * a4-7 Preserved
+ *
+ * RPC return register usage:
+ * a0 Return value, OPTEE_SMC_RETURN_IS_RPC(val)
+ * a1-2 RPC parameters
+ * a3-7 Resume information, must be preserved
+ *
+ * Possible return values:
+ * OPTEE_SMC_RETURN_UNKNOWN_FUNCTION Trusted OS does not recognize this
+ * function.
+ * OPTEE_SMC_RETURN_OK Call completed, result updated in
+ * the previously supplied struct
+ * optee_msg_arg.
+ * OPTEE_SMC_RETURN_ETHREAD_LIMIT Number of Trusted OS threads exceeded,
+ * try again later.
+ * OPTEE_SMC_RETURN_EBADADDR Bad physcial pointer to struct
+ * optee_msg_arg.
+ * OPTEE_SMC_RETURN_EBADCMD Bad/unknown cmd in struct optee_msg_arg
+ * OPTEE_SMC_RETURN_IS_RPC() Call suspended by RPC call to normal
+ * world.
+ */
+#define OPTEE_SMC_FUNCID_CALL_WITH_ARG OPTEE_MSG_FUNCID_CALL_WITH_ARG
+#define OPTEE_SMC_CALL_WITH_ARG \
+ OPTEE_SMC_STD_CALL_VAL(OPTEE_SMC_FUNCID_CALL_WITH_ARG)
+
+/*
+ * Get Shared Memory Config
+ *
+ * Returns the Secure/Non-secure shared memory config.
+ *
+ * Call register usage:
+ * a0 SMC Function ID, OPTEE_SMC_GET_SHM_CONFIG
+ * a1-6 Not used
+ * a7 Hypervisor Client ID register
+ *
+ * Have config return register usage:
+ * a0 OPTEE_SMC_RETURN_OK
+ * a1 Physical address of start of SHM
+ * a2 Size of of SHM
+ * a3 Cache settings of memory, as defined by the
+ * OPTEE_SMC_SHM_* values above
+ * a4-7 Preserved
+ *
+ * Not available register usage:
+ * a0 OPTEE_SMC_RETURN_ENOTAVAIL
+ * a1-3 Not used
+ * a4-7 Preserved
+ */
+#define OPTEE_SMC_FUNCID_GET_SHM_CONFIG 7
+#define OPTEE_SMC_GET_SHM_CONFIG \
+ OPTEE_SMC_FAST_CALL_VAL(OPTEE_SMC_FUNCID_GET_SHM_CONFIG)
+
+struct optee_smc_get_shm_config_result {
+ unsigned long status;
+ unsigned long start;
+ unsigned long size;
+ unsigned long settings;
+};
+
+/*
+ * Exchanges capabilities between normal world and secure world
+ *
+ * Call register usage:
+ * a0 SMC Function ID, OPTEE_SMC_EXCHANGE_CAPABILITIES
+ * a1 bitfield of normal world capabilities OPTEE_SMC_NSEC_CAP_*
+ * a2-6 Not used
+ * a7 Hypervisor Client ID register
+ *
+ * Normal return register usage:
+ * a0 OPTEE_SMC_RETURN_OK
+ * a1 bitfield of secure world capabilities OPTEE_SMC_SEC_CAP_*
+ * a2-7 Preserved
+ *
+ * Error return register usage:
+ * a0 OPTEE_SMC_RETURN_ENOTAVAIL, can't use the capabilities from normal world
+ * a1 bitfield of secure world capabilities OPTEE_SMC_SEC_CAP_*
+ * a2-7 Preserved
+ */
+/* Normal world works as a uniprocessor system */
+#define OPTEE_SMC_NSEC_CAP_UNIPROCESSOR BIT(0)
+/* Secure world has reserved shared memory for normal world to use */
+#define OPTEE_SMC_SEC_CAP_HAVE_RESERVED_SHM BIT(0)
+/* Secure world can communicate via previously unregistered shared memory */
+#define OPTEE_SMC_SEC_CAP_UNREGISTERED_SHM BIT(1)
+
+/*
+ * Secure world supports commands "register/unregister shared memory",
+ * secure world accepts command buffers located in any parts of non-secure RAM
+ */
+#define OPTEE_SMC_SEC_CAP_DYNAMIC_SHM BIT(2)
+
+#define OPTEE_SMC_FUNCID_EXCHANGE_CAPABILITIES 9
+#define OPTEE_SMC_EXCHANGE_CAPABILITIES \
+ OPTEE_SMC_FAST_CALL_VAL(OPTEE_SMC_FUNCID_EXCHANGE_CAPABILITIES)
+
+struct optee_smc_exchange_capabilities_result {
+ unsigned long status;
+ unsigned long capabilities;
+ unsigned long reserved0;
+ unsigned long reserved1;
+};
+
+/*
+ * Disable and empties cache of shared memory objects
+ *
+ * Secure world can cache frequently used shared memory objects, for
+ * example objects used as RPC arguments. When secure world is idle this
+ * function returns one shared memory reference to free. To disable the
+ * cache and free all cached objects this function has to be called until
+ * it returns OPTEE_SMC_RETURN_ENOTAVAIL.
+ *
+ * Call register usage:
+ * a0 SMC Function ID, OPTEE_SMC_DISABLE_SHM_CACHE
+ * a1-6 Not used
+ * a7 Hypervisor Client ID register
+ *
+ * Normal return register usage:
+ * a0 OPTEE_SMC_RETURN_OK
+ * a1 Upper 32bit of a 64bit Shared memory cookie
+ * a2 Lower 32bit of a 64bit Shared memory cookie
+ * a3-7 Preserved
+ *
+ * Cache empty return register usage:
+ * a0 OPTEE_SMC_RETURN_ENOTAVAIL
+ * a1-7 Preserved
+ *
+ * Not idle return register usage:
+ * a0 OPTEE_SMC_RETURN_EBUSY
+ * a1-7 Preserved
+ */
+#define OPTEE_SMC_FUNCID_DISABLE_SHM_CACHE 10
+#define OPTEE_SMC_DISABLE_SHM_CACHE \
+ OPTEE_SMC_FAST_CALL_VAL(OPTEE_SMC_FUNCID_DISABLE_SHM_CACHE)
+
+struct optee_smc_disable_shm_cache_result {
+ unsigned long status;
+ unsigned long shm_upper32;
+ unsigned long shm_lower32;
+ unsigned long reserved0;
+};
+
+/*
+ * Enable cache of shared memory objects
+ *
+ * Secure world can cache frequently used shared memory objects, for
+ * example objects used as RPC arguments. When secure world is idle this
+ * function returns OPTEE_SMC_RETURN_OK and the cache is enabled. If
+ * secure world isn't idle OPTEE_SMC_RETURN_EBUSY is returned.
+ *
+ * Call register usage:
+ * a0 SMC Function ID, OPTEE_SMC_ENABLE_SHM_CACHE
+ * a1-6 Not used
+ * a7 Hypervisor Client ID register
+ *
+ * Normal return register usage:
+ * a0 OPTEE_SMC_RETURN_OK
+ * a1-7 Preserved
+ *
+ * Not idle return register usage:
+ * a0 OPTEE_SMC_RETURN_EBUSY
+ * a1-7 Preserved
+ */
+#define OPTEE_SMC_FUNCID_ENABLE_SHM_CACHE 11
+#define OPTEE_SMC_ENABLE_SHM_CACHE \
+ OPTEE_SMC_FAST_CALL_VAL(OPTEE_SMC_FUNCID_ENABLE_SHM_CACHE)
+
+/*
+ * Resume from RPC (for example after processing a foreign interrupt)
+ *
+ * Call register usage:
+ * a0 SMC Function ID, OPTEE_SMC_CALL_RETURN_FROM_RPC
+ * a1-3 Value of a1-3 when OPTEE_SMC_CALL_WITH_ARG returned
+ * OPTEE_SMC_RETURN_RPC in a0
+ *
+ * Return register usage is the same as for OPTEE_SMC_*CALL_WITH_ARG above.
+ *
+ * Possible return values
+ * OPTEE_SMC_RETURN_UNKNOWN_FUNCTION Trusted OS does not recognize this
+ * function.
+ * OPTEE_SMC_RETURN_OK Original call completed, result
+ * updated in the previously supplied.
+ * struct optee_msg_arg
+ * OPTEE_SMC_RETURN_RPC Call suspended by RPC call to normal
+ * world.
+ * OPTEE_SMC_RETURN_ERESUME Resume failed, the opaque resume
+ * information was corrupt.
+ */
+#define OPTEE_SMC_FUNCID_RETURN_FROM_RPC 3
+#define OPTEE_SMC_CALL_RETURN_FROM_RPC \
+ OPTEE_SMC_STD_CALL_VAL(OPTEE_SMC_FUNCID_RETURN_FROM_RPC)
+
+#define OPTEE_SMC_RETURN_RPC_PREFIX_MASK 0xFFFF0000
+#define OPTEE_SMC_RETURN_RPC_PREFIX 0xFFFF0000
+#define OPTEE_SMC_RETURN_RPC_FUNC_MASK 0x0000FFFF
+
+#define OPTEE_SMC_RETURN_GET_RPC_FUNC(ret) \
+ ((ret) & OPTEE_SMC_RETURN_RPC_FUNC_MASK)
+
+#define OPTEE_SMC_RPC_VAL(func) ((func) | OPTEE_SMC_RETURN_RPC_PREFIX)
+
+/*
+ * Allocate memory for RPC parameter passing. The memory is used to hold a
+ * struct optee_msg_arg.
+ *
+ * "Call" register usage:
+ * a0 This value, OPTEE_SMC_RETURN_RPC_ALLOC
+ * a1 Size in bytes of required argument memory
+ * a2 Not used
+ * a3 Resume information, must be preserved
+ * a4-5 Not used
+ * a6-7 Resume information, must be preserved
+ *
+ * "Return" register usage:
+ * a0 SMC Function ID, OPTEE_SMC_CALL_RETURN_FROM_RPC.
+ * a1 Upper 32bits of 64bit physical pointer to allocated
+ * memory, (a1 == 0 && a2 == 0) if size was 0 or if memory can't
+ * be allocated.
+ * a2 Lower 32bits of 64bit physical pointer to allocated
+ * memory, (a1 == 0 && a2 == 0) if size was 0 or if memory can't
+ * be allocated
+ * a3 Preserved
+ * a4 Upper 32bits of 64bit Shared memory cookie used when freeing
+ * the memory or doing an RPC
+ * a5 Lower 32bits of 64bit Shared memory cookie used when freeing
+ * the memory or doing an RPC
+ * a6-7 Preserved
+ */
+#define OPTEE_SMC_RPC_FUNC_ALLOC 0
+#define OPTEE_SMC_RETURN_RPC_ALLOC \
+ OPTEE_SMC_RPC_VAL(OPTEE_SMC_RPC_FUNC_ALLOC)
+
+/*
+ * Free memory previously allocated by OPTEE_SMC_RETURN_RPC_ALLOC
+ *
+ * "Call" register usage:
+ * a0 This value, OPTEE_SMC_RETURN_RPC_FREE
+ * a1 Upper 32bits of 64bit shared memory cookie belonging to this
+ * argument memory
+ * a2 Lower 32bits of 64bit shared memory cookie belonging to this
+ * argument memory
+ * a3-7 Resume information, must be preserved
+ *
+ * "Return" register usage:
+ * a0 SMC Function ID, OPTEE_SMC_CALL_RETURN_FROM_RPC.
+ * a1-2 Not used
+ * a3-7 Preserved
+ */
+#define OPTEE_SMC_RPC_FUNC_FREE 2
+#define OPTEE_SMC_RETURN_RPC_FREE \
+ OPTEE_SMC_RPC_VAL(OPTEE_SMC_RPC_FUNC_FREE)
+
+/*
+ * Deliver foreign interrupt to normal world.
+ *
+ * "Call" register usage:
+ * a0 OPTEE_SMC_RETURN_RPC_FOREIGN_INTR
+ * a1-7 Resume information, must be preserved
+ *
+ * "Return" register usage:
+ * a0 SMC Function ID, OPTEE_SMC_CALL_RETURN_FROM_RPC.
+ * a1-7 Preserved
+ */
+#define OPTEE_SMC_RPC_FUNC_FOREIGN_INTR 4
+#define OPTEE_SMC_RETURN_RPC_FOREIGN_INTR \
+ OPTEE_SMC_RPC_VAL(OPTEE_SMC_RPC_FUNC_FOREIGN_INTR)
+
+/*
+ * Do an RPC request. The supplied struct optee_msg_arg tells which
+ * request to do and the parameters for the request. The following fields
+ * are used (the rest are unused):
+ * - cmd the Request ID
+ * - ret return value of the request, filled in by normal world
+ * - num_params number of parameters for the request
+ * - params the parameters
+ * - param_attrs attributes of the parameters
+ *
+ * "Call" register usage:
+ * a0 OPTEE_SMC_RETURN_RPC_CMD
+ * a1 Upper 32bit of a 64bit Shared memory cookie holding a
+ * struct optee_msg_arg, must be preserved, only the data should
+ * be updated
+ * a2 Lower 32bit of a 64bit Shared memory cookie holding a
+ * struct optee_msg_arg, must be preserved, only the data should
+ * be updated
+ * a3-7 Resume information, must be preserved
+ *
+ * "Return" register usage:
+ * a0 SMC Function ID, OPTEE_SMC_CALL_RETURN_FROM_RPC.
+ * a1-2 Not used
+ * a3-7 Preserved
+ */
+#define OPTEE_SMC_RPC_FUNC_CMD 5
+#define OPTEE_SMC_RETURN_RPC_CMD \
+ OPTEE_SMC_RPC_VAL(OPTEE_SMC_RPC_FUNC_CMD)
+
+/* Returned in a0 */
+#define OPTEE_SMC_RETURN_UNKNOWN_FUNCTION 0xFFFFFFFF
+
+/* Returned in a0 only from Trusted OS functions */
+#define OPTEE_SMC_RETURN_OK 0x0
+#define OPTEE_SMC_RETURN_ETHREAD_LIMIT 0x1
+#define OPTEE_SMC_RETURN_EBUSY 0x2
+#define OPTEE_SMC_RETURN_ERESUME 0x3
+#define OPTEE_SMC_RETURN_EBADADDR 0x4
+#define OPTEE_SMC_RETURN_EBADCMD 0x5
+#define OPTEE_SMC_RETURN_ENOMEM 0x6
+#define OPTEE_SMC_RETURN_ENOTAVAIL 0x7
+#define OPTEE_SMC_RETURN_IS_RPC(ret) __optee_smc_return_is_rpc((ret))
+
+static inline bool __optee_smc_return_is_rpc(u32 ret)
+{
+ return ret != OPTEE_SMC_RETURN_UNKNOWN_FUNCTION &&
+ (ret & OPTEE_SMC_RETURN_RPC_PREFIX_MASK) ==
+ OPTEE_SMC_RETURN_RPC_PREFIX;
+}
+
+#endif /* OPTEE_SMC_H */
diff --git a/drivers/tee/optee/rpc.c b/drivers/tee/optee/rpc.c
new file mode 100644
index 000000000000..41aea12e2bcc
--- /dev/null
+++ b/drivers/tee/optee/rpc.c
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2015-2016, Linaro Limited
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/tee_drv.h>
+#include "optee_private.h"
+#include "optee_smc.h"
+
+struct wq_entry {
+ struct list_head link;
+ struct completion c;
+ u32 key;
+};
+
+void optee_wait_queue_init(struct optee_wait_queue *priv)
+{
+ mutex_init(&priv->mu);
+ INIT_LIST_HEAD(&priv->db);
+}
+
+void optee_wait_queue_exit(struct optee_wait_queue *priv)
+{
+ mutex_destroy(&priv->mu);
+}
+
+static void handle_rpc_func_cmd_get_time(struct optee_msg_arg *arg)
+{
+ struct timespec64 ts;
+
+ if (arg->num_params != 1)
+ goto bad;
+ if ((arg->params[0].attr & OPTEE_MSG_ATTR_TYPE_MASK) !=
+ OPTEE_MSG_ATTR_TYPE_VALUE_OUTPUT)
+ goto bad;
+
+ getnstimeofday64(&ts);
+ arg->params[0].u.value.a = ts.tv_sec;
+ arg->params[0].u.value.b = ts.tv_nsec;
+
+ arg->ret = TEEC_SUCCESS;
+ return;
+bad:
+ arg->ret = TEEC_ERROR_BAD_PARAMETERS;
+}
+
+static struct wq_entry *wq_entry_get(struct optee_wait_queue *wq, u32 key)
+{
+ struct wq_entry *w;
+
+ mutex_lock(&wq->mu);
+
+ list_for_each_entry(w, &wq->db, link)
+ if (w->key == key)
+ goto out;
+
+ w = kmalloc(sizeof(*w), GFP_KERNEL);
+ if (w) {
+ init_completion(&w->c);
+ w->key = key;
+ list_add_tail(&w->link, &wq->db);
+ }
+out:
+ mutex_unlock(&wq->mu);
+ return w;
+}
+
+static void wq_sleep(struct optee_wait_queue *wq, u32 key)
+{
+ struct wq_entry *w = wq_entry_get(wq, key);
+
+ if (w) {
+ wait_for_completion(&w->c);
+ mutex_lock(&wq->mu);
+ list_del(&w->link);
+ mutex_unlock(&wq->mu);
+ kfree(w);
+ }
+}
+
+static void wq_wakeup(struct optee_wait_queue *wq, u32 key)
+{
+ struct wq_entry *w = wq_entry_get(wq, key);
+
+ if (w)
+ complete(&w->c);
+}
+
+static void handle_rpc_func_cmd_wq(struct optee *optee,
+ struct optee_msg_arg *arg)
+{
+ if (arg->num_params != 1)
+ goto bad;
+
+ if ((arg->params[0].attr & OPTEE_MSG_ATTR_TYPE_MASK) !=
+ OPTEE_MSG_ATTR_TYPE_VALUE_INPUT)
+ goto bad;
+
+ switch (arg->params[0].u.value.a) {
+ case OPTEE_MSG_RPC_WAIT_QUEUE_SLEEP:
+ wq_sleep(&optee->wait_queue, arg->params[0].u.value.b);
+ break;
+ case OPTEE_MSG_RPC_WAIT_QUEUE_WAKEUP:
+ wq_wakeup(&optee->wait_queue, arg->params[0].u.value.b);
+ break;
+ default:
+ goto bad;
+ }
+
+ arg->ret = TEEC_SUCCESS;
+ return;
+bad:
+ arg->ret = TEEC_ERROR_BAD_PARAMETERS;
+}
+
+static void handle_rpc_func_cmd_wait(struct optee_msg_arg *arg)
+{
+ u32 msec_to_wait;
+
+ if (arg->num_params != 1)
+ goto bad;
+
+ if ((arg->params[0].attr & OPTEE_MSG_ATTR_TYPE_MASK) !=
+ OPTEE_MSG_ATTR_TYPE_VALUE_INPUT)
+ goto bad;
+
+ msec_to_wait = arg->params[0].u.value.a;
+
+ /* Go to interruptible sleep */
+ msleep_interruptible(msec_to_wait);
+
+ arg->ret = TEEC_SUCCESS;
+ return;
+bad:
+ arg->ret = TEEC_ERROR_BAD_PARAMETERS;
+}
+
+static void handle_rpc_supp_cmd(struct tee_context *ctx,
+ struct optee_msg_arg *arg)
+{
+ struct tee_param *params;
+
+ arg->ret_origin = TEEC_ORIGIN_COMMS;
+
+ params = kmalloc_array(arg->num_params, sizeof(struct tee_param),
+ GFP_KERNEL);
+ if (!params) {
+ arg->ret = TEEC_ERROR_OUT_OF_MEMORY;
+ return;
+ }
+
+ if (optee_from_msg_param(params, arg->num_params, arg->params)) {
+ arg->ret = TEEC_ERROR_BAD_PARAMETERS;
+ goto out;
+ }
+
+ arg->ret = optee_supp_thrd_req(ctx, arg->cmd, arg->num_params, params);
+
+ if (optee_to_msg_param(arg->params, arg->num_params, params))
+ arg->ret = TEEC_ERROR_BAD_PARAMETERS;
+out:
+ kfree(params);
+}
+
+static struct tee_shm *cmd_alloc_suppl(struct tee_context *ctx, size_t sz)
+{
+ u32 ret;
+ struct tee_param param;
+ struct optee *optee = tee_get_drvdata(ctx->teedev);
+ struct tee_shm *shm;
+
+ param.attr = TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT;
+ param.u.value.a = OPTEE_MSG_RPC_SHM_TYPE_APPL;
+ param.u.value.b = sz;
+ param.u.value.c = 0;
+
+ ret = optee_supp_thrd_req(ctx, OPTEE_MSG_RPC_CMD_SHM_ALLOC, 1, &param);
+ if (ret)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_lock(&optee->supp.mutex);
+ /* Increases count as secure world doesn't have a reference */
+ shm = tee_shm_get_from_id(optee->supp.ctx, param.u.value.c);
+ mutex_unlock(&optee->supp.mutex);
+ return shm;
+}
+
+static void handle_rpc_func_cmd_shm_alloc(struct tee_context *ctx,
+ struct optee_msg_arg *arg,
+ struct optee_call_ctx *call_ctx)
+{
+ phys_addr_t pa;
+ struct tee_shm *shm;
+ size_t sz;
+ size_t n;
+
+ arg->ret_origin = TEEC_ORIGIN_COMMS;
+
+ if (!arg->num_params ||
+ arg->params[0].attr != OPTEE_MSG_ATTR_TYPE_VALUE_INPUT) {
+ arg->ret = TEEC_ERROR_BAD_PARAMETERS;
+ return;
+ }
+
+ for (n = 1; n < arg->num_params; n++) {
+ if (arg->params[n].attr != OPTEE_MSG_ATTR_TYPE_NONE) {
+ arg->ret = TEEC_ERROR_BAD_PARAMETERS;
+ return;
+ }
+ }
+
+ sz = arg->params[0].u.value.b;
+ switch (arg->params[0].u.value.a) {
+ case OPTEE_MSG_RPC_SHM_TYPE_APPL:
+ shm = cmd_alloc_suppl(ctx, sz);
+ break;
+ case OPTEE_MSG_RPC_SHM_TYPE_KERNEL:
+ shm = tee_shm_alloc(ctx, sz, TEE_SHM_MAPPED);
+ break;
+ default:
+ arg->ret = TEEC_ERROR_BAD_PARAMETERS;
+ return;
+ }
+
+ if (IS_ERR(shm)) {
+ arg->ret = TEEC_ERROR_OUT_OF_MEMORY;
+ return;
+ }
+
+ if (tee_shm_get_pa(shm, 0, &pa)) {
+ arg->ret = TEEC_ERROR_BAD_PARAMETERS;
+ goto bad;
+ }
+
+ sz = tee_shm_get_size(shm);
+
+ if (tee_shm_is_registered(shm)) {
+ struct page **pages;
+ u64 *pages_list;
+ size_t page_num;
+
+ pages = tee_shm_get_pages(shm, &page_num);
+ if (!pages || !page_num) {
+ arg->ret = TEEC_ERROR_OUT_OF_MEMORY;
+ goto bad;
+ }
+
+ pages_list = optee_allocate_pages_list(page_num);
+ if (!pages_list) {
+ arg->ret = TEEC_ERROR_OUT_OF_MEMORY;
+ goto bad;
+ }
+
+ call_ctx->pages_list = pages_list;
+ call_ctx->num_entries = page_num;
+
+ arg->params[0].attr = OPTEE_MSG_ATTR_TYPE_TMEM_OUTPUT |
+ OPTEE_MSG_ATTR_NONCONTIG;
+ /*
+ * In the least bits of u.tmem.buf_ptr we store buffer offset
+ * from 4k page, as described in OP-TEE ABI.
+ */
+ arg->params[0].u.tmem.buf_ptr = virt_to_phys(pages_list) |
+ (tee_shm_get_page_offset(shm) &
+ (OPTEE_MSG_NONCONTIG_PAGE_SIZE - 1));
+ arg->params[0].u.tmem.size = tee_shm_get_size(shm);
+ arg->params[0].u.tmem.shm_ref = (unsigned long)shm;
+
+ optee_fill_pages_list(pages_list, pages, page_num,
+ tee_shm_get_page_offset(shm));
+ } else {
+ arg->params[0].attr = OPTEE_MSG_ATTR_TYPE_TMEM_OUTPUT;
+ arg->params[0].u.tmem.buf_ptr = pa;
+ arg->params[0].u.tmem.size = sz;
+ arg->params[0].u.tmem.shm_ref = (unsigned long)shm;
+ }
+
+ arg->ret = TEEC_SUCCESS;
+ return;
+bad:
+ tee_shm_free(shm);
+}
+
+static void cmd_free_suppl(struct tee_context *ctx, struct tee_shm *shm)
+{
+ struct tee_param param;
+
+ param.attr = TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT;
+ param.u.value.a = OPTEE_MSG_RPC_SHM_TYPE_APPL;
+ param.u.value.b = tee_shm_get_id(shm);
+ param.u.value.c = 0;
+
+ /*
+ * Match the tee_shm_get_from_id() in cmd_alloc_suppl() as secure
+ * world has released its reference.
+ *
+ * It's better to do this before sending the request to supplicant
+ * as we'd like to let the process doing the initial allocation to
+ * do release the last reference too in order to avoid stacking
+ * many pending fput() on the client process. This could otherwise
+ * happen if secure world does many allocate and free in a single
+ * invoke.
+ */
+ tee_shm_put(shm);
+
+ optee_supp_thrd_req(ctx, OPTEE_MSG_RPC_CMD_SHM_FREE, 1, &param);
+}
+
+static void handle_rpc_func_cmd_shm_free(struct tee_context *ctx,
+ struct optee_msg_arg *arg)
+{
+ struct tee_shm *shm;
+
+ arg->ret_origin = TEEC_ORIGIN_COMMS;
+
+ if (arg->num_params != 1 ||
+ arg->params[0].attr != OPTEE_MSG_ATTR_TYPE_VALUE_INPUT) {
+ arg->ret = TEEC_ERROR_BAD_PARAMETERS;
+ return;
+ }
+
+ shm = (struct tee_shm *)(unsigned long)arg->params[0].u.value.b;
+ switch (arg->params[0].u.value.a) {
+ case OPTEE_MSG_RPC_SHM_TYPE_APPL:
+ cmd_free_suppl(ctx, shm);
+ break;
+ case OPTEE_MSG_RPC_SHM_TYPE_KERNEL:
+ tee_shm_free(shm);
+ break;
+ default:
+ arg->ret = TEEC_ERROR_BAD_PARAMETERS;
+ }
+ arg->ret = TEEC_SUCCESS;
+}
+
+static void free_pages_list(struct optee_call_ctx *call_ctx)
+{
+ if (call_ctx->pages_list) {
+ optee_free_pages_list(call_ctx->pages_list,
+ call_ctx->num_entries);
+ call_ctx->pages_list = NULL;
+ call_ctx->num_entries = 0;
+ }
+}
+
+void optee_rpc_finalize_call(struct optee_call_ctx *call_ctx)
+{
+ free_pages_list(call_ctx);
+}
+
+static void handle_rpc_func_cmd(struct tee_context *ctx, struct optee *optee,
+ struct tee_shm *shm,
+ struct optee_call_ctx *call_ctx)
+{
+ struct optee_msg_arg *arg;
+
+ arg = tee_shm_get_va(shm, 0);
+ if (IS_ERR(arg)) {
+ pr_err("%s: tee_shm_get_va %p failed\n", __func__, shm);
+ return;
+ }
+
+ switch (arg->cmd) {
+ case OPTEE_MSG_RPC_CMD_GET_TIME:
+ handle_rpc_func_cmd_get_time(arg);
+ break;
+ case OPTEE_MSG_RPC_CMD_WAIT_QUEUE:
+ handle_rpc_func_cmd_wq(optee, arg);
+ break;
+ case OPTEE_MSG_RPC_CMD_SUSPEND:
+ handle_rpc_func_cmd_wait(arg);
+ break;
+ case OPTEE_MSG_RPC_CMD_SHM_ALLOC:
+ free_pages_list(call_ctx);
+ handle_rpc_func_cmd_shm_alloc(ctx, arg, call_ctx);
+ break;
+ case OPTEE_MSG_RPC_CMD_SHM_FREE:
+ handle_rpc_func_cmd_shm_free(ctx, arg);
+ break;
+ default:
+ handle_rpc_supp_cmd(ctx, arg);
+ }
+}
+
+/**
+ * optee_handle_rpc() - handle RPC from secure world
+ * @ctx: context doing the RPC
+ * @param: value of registers for the RPC
+ * @call_ctx: call context. Preserved during one OP-TEE invocation
+ *
+ * Result of RPC is written back into @param.
+ */
+void optee_handle_rpc(struct tee_context *ctx, struct optee_rpc_param *param,
+ struct optee_call_ctx *call_ctx)
+{
+ struct tee_device *teedev = ctx->teedev;
+ struct optee *optee = tee_get_drvdata(teedev);
+ struct tee_shm *shm;
+ phys_addr_t pa;
+
+ switch (OPTEE_SMC_RETURN_GET_RPC_FUNC(param->a0)) {
+ case OPTEE_SMC_RPC_FUNC_ALLOC:
+ shm = tee_shm_alloc(ctx, param->a1, TEE_SHM_MAPPED);
+ if (!IS_ERR(shm) && !tee_shm_get_pa(shm, 0, &pa)) {
+ reg_pair_from_64(&param->a1, &param->a2, pa);
+ reg_pair_from_64(&param->a4, &param->a5,
+ (unsigned long)shm);
+ } else {
+ param->a1 = 0;
+ param->a2 = 0;
+ param->a4 = 0;
+ param->a5 = 0;
+ }
+ break;
+ case OPTEE_SMC_RPC_FUNC_FREE:
+ shm = reg_pair_to_ptr(param->a1, param->a2);
+ tee_shm_free(shm);
+ break;
+ case OPTEE_SMC_RPC_FUNC_FOREIGN_INTR:
+ /*
+ * A foreign interrupt was raised while secure world was
+ * executing, since they are handled in Linux a dummy RPC is
+ * performed to let Linux take the interrupt through the normal
+ * vector.
+ */
+ break;
+ case OPTEE_SMC_RPC_FUNC_CMD:
+ shm = reg_pair_to_ptr(param->a1, param->a2);
+ handle_rpc_func_cmd(ctx, optee, shm, call_ctx);
+ break;
+ default:
+ pr_warn("Unknown RPC func 0x%x\n",
+ (u32)OPTEE_SMC_RETURN_GET_RPC_FUNC(param->a0));
+ break;
+ }
+
+ param->a0 = OPTEE_SMC_CALL_RETURN_FROM_RPC;
+}
diff --git a/drivers/tee/optee/shm_pool.c b/drivers/tee/optee/shm_pool.c
new file mode 100644
index 000000000000..49397813fff1
--- /dev/null
+++ b/drivers/tee/optee/shm_pool.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2015, Linaro Limited
+ * Copyright (c) 2017, EPAM Systems
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#include <linux/device.h>
+#include <linux/dma-buf.h>
+#include <linux/genalloc.h>
+#include <linux/slab.h>
+#include <linux/tee_drv.h>
+#include "optee_private.h"
+#include "optee_smc.h"
+#include "shm_pool.h"
+
+static int pool_op_alloc(struct tee_shm_pool_mgr *poolm,
+ struct tee_shm *shm, size_t size)
+{
+ unsigned int order = get_order(size);
+ struct page *page;
+
+ page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
+ if (!page)
+ return -ENOMEM;
+
+ shm->kaddr = page_address(page);
+ shm->paddr = page_to_phys(page);
+ shm->size = PAGE_SIZE << order;
+
+ return 0;
+}
+
+static void pool_op_free(struct tee_shm_pool_mgr *poolm,
+ struct tee_shm *shm)
+{
+ free_pages((unsigned long)shm->kaddr, get_order(shm->size));
+ shm->kaddr = NULL;
+}
+
+static void pool_op_destroy_poolmgr(struct tee_shm_pool_mgr *poolm)
+{
+ kfree(poolm);
+}
+
+static const struct tee_shm_pool_mgr_ops pool_ops = {
+ .alloc = pool_op_alloc,
+ .free = pool_op_free,
+ .destroy_poolmgr = pool_op_destroy_poolmgr,
+};
+
+/**
+ * optee_shm_pool_alloc_pages() - create page-based allocator pool
+ *
+ * This pool is used when OP-TEE supports dymanic SHM. In this case
+ * command buffers and such are allocated from kernel's own memory.
+ */
+struct tee_shm_pool_mgr *optee_shm_pool_alloc_pages(void)
+{
+ struct tee_shm_pool_mgr *mgr = kzalloc(sizeof(*mgr), GFP_KERNEL);
+
+ if (!mgr)
+ return ERR_PTR(-ENOMEM);
+
+ mgr->ops = &pool_ops;
+
+ return mgr;
+}
diff --git a/drivers/tee/optee/shm_pool.h b/drivers/tee/optee/shm_pool.h
new file mode 100644
index 000000000000..4e753c3bf7ec
--- /dev/null
+++ b/drivers/tee/optee/shm_pool.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2015, Linaro Limited
+ * Copyright (c) 2016, EPAM Systems
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef SHM_POOL_H
+#define SHM_POOL_H
+
+#include <linux/tee_drv.h>
+
+struct tee_shm_pool_mgr *optee_shm_pool_alloc_pages(void);
+
+#endif
diff --git a/drivers/tee/optee/supp.c b/drivers/tee/optee/supp.c
new file mode 100644
index 000000000000..df35fc01fd3e
--- /dev/null
+++ b/drivers/tee/optee/supp.c
@@ -0,0 +1,382 @@
+/*
+ * Copyright (c) 2015, Linaro Limited
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include "optee_private.h"
+
+struct optee_supp_req {
+ struct list_head link;
+
+ bool busy;
+ u32 func;
+ u32 ret;
+ size_t num_params;
+ struct tee_param *param;
+
+ struct completion c;
+};
+
+void optee_supp_init(struct optee_supp *supp)
+{
+ memset(supp, 0, sizeof(*supp));
+ mutex_init(&supp->mutex);
+ init_completion(&supp->reqs_c);
+ idr_init(&supp->idr);
+ INIT_LIST_HEAD(&supp->reqs);
+ supp->req_id = -1;
+}
+
+void optee_supp_uninit(struct optee_supp *supp)
+{
+ mutex_destroy(&supp->mutex);
+ idr_destroy(&supp->idr);
+}
+
+void optee_supp_release(struct optee_supp *supp)
+{
+ int id;
+ struct optee_supp_req *req;
+ struct optee_supp_req *req_tmp;
+
+ mutex_lock(&supp->mutex);
+
+ /* Abort all request retrieved by supplicant */
+ idr_for_each_entry(&supp->idr, req, id) {
+ req->busy = false;
+ idr_remove(&supp->idr, id);
+ req->ret = TEEC_ERROR_COMMUNICATION;
+ complete(&req->c);
+ }
+
+ /* Abort all queued requests */
+ list_for_each_entry_safe(req, req_tmp, &supp->reqs, link) {
+ list_del(&req->link);
+ req->ret = TEEC_ERROR_COMMUNICATION;
+ complete(&req->c);
+ }
+
+ supp->ctx = NULL;
+ supp->req_id = -1;
+
+ mutex_unlock(&supp->mutex);
+}
+
+/**
+ * optee_supp_thrd_req() - request service from supplicant
+ * @ctx: context doing the request
+ * @func: function requested
+ * @num_params: number of elements in @param array
+ * @param: parameters for function
+ *
+ * Returns result of operation to be passed to secure world
+ */
+u32 optee_supp_thrd_req(struct tee_context *ctx, u32 func, size_t num_params,
+ struct tee_param *param)
+
+{
+ struct optee *optee = tee_get_drvdata(ctx->teedev);
+ struct optee_supp *supp = &optee->supp;
+ struct optee_supp_req *req = kzalloc(sizeof(*req), GFP_KERNEL);
+ bool interruptable;
+ u32 ret;
+
+ if (!req)
+ return TEEC_ERROR_OUT_OF_MEMORY;
+
+ init_completion(&req->c);
+ req->func = func;
+ req->num_params = num_params;
+ req->param = param;
+
+ /* Insert the request in the request list */
+ mutex_lock(&supp->mutex);
+ list_add_tail(&req->link, &supp->reqs);
+ mutex_unlock(&supp->mutex);
+
+ /* Tell an eventual waiter there's a new request */
+ complete(&supp->reqs_c);
+
+ /*
+ * Wait for supplicant to process and return result, once we've
+ * returned from wait_for_completion(&req->c) successfully we have
+ * exclusive access again.
+ */
+ while (wait_for_completion_interruptible(&req->c)) {
+ mutex_lock(&supp->mutex);
+ interruptable = !supp->ctx;
+ if (interruptable) {
+ /*
+ * There's no supplicant available and since the
+ * supp->mutex currently is held none can
+ * become available until the mutex released
+ * again.
+ *
+ * Interrupting an RPC to supplicant is only
+ * allowed as a way of slightly improving the user
+ * experience in case the supplicant hasn't been
+ * started yet. During normal operation the supplicant
+ * will serve all requests in a timely manner and
+ * interrupting then wouldn't make sense.
+ */
+ interruptable = !req->busy;
+ if (!req->busy)
+ list_del(&req->link);
+ }
+ mutex_unlock(&supp->mutex);
+
+ if (interruptable) {
+ req->ret = TEEC_ERROR_COMMUNICATION;
+ break;
+ }
+ }
+
+ ret = req->ret;
+ kfree(req);
+
+ return ret;
+}
+
+static struct optee_supp_req *supp_pop_entry(struct optee_supp *supp,
+ int num_params, int *id)
+{
+ struct optee_supp_req *req;
+
+ if (supp->req_id != -1) {
+ /*
+ * Supplicant should not mix synchronous and asnynchronous
+ * requests.
+ */
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (list_empty(&supp->reqs))
+ return NULL;
+
+ req = list_first_entry(&supp->reqs, struct optee_supp_req, link);
+
+ if (num_params < req->num_params) {
+ /* Not enough room for parameters */
+ return ERR_PTR(-EINVAL);
+ }
+
+ *id = idr_alloc(&supp->idr, req, 1, 0, GFP_KERNEL);
+ if (*id < 0)
+ return ERR_PTR(-ENOMEM);
+
+ list_del(&req->link);
+ req->busy = true;
+
+ return req;
+}
+
+static int supp_check_recv_params(size_t num_params, struct tee_param *params,
+ size_t *num_meta)
+{
+ size_t n;
+
+ if (!num_params)
+ return -EINVAL;
+
+ /*
+ * If there's memrefs we need to decrease those as they where
+ * increased earlier and we'll even refuse to accept any below.
+ */
+ for (n = 0; n < num_params; n++)
+ if (tee_param_is_memref(params + n) && params[n].u.memref.shm)
+ tee_shm_put(params[n].u.memref.shm);
+
+ /*
+ * We only expect parameters as TEE_IOCTL_PARAM_ATTR_TYPE_NONE with
+ * or without the TEE_IOCTL_PARAM_ATTR_META bit set.
+ */
+ for (n = 0; n < num_params; n++)
+ if (params[n].attr &&
+ params[n].attr != TEE_IOCTL_PARAM_ATTR_META)
+ return -EINVAL;
+
+ /* At most we'll need one meta parameter so no need to check for more */
+ if (params->attr == TEE_IOCTL_PARAM_ATTR_META)
+ *num_meta = 1;
+ else
+ *num_meta = 0;
+
+ return 0;
+}
+
+/**
+ * optee_supp_recv() - receive request for supplicant
+ * @ctx: context receiving the request
+ * @func: requested function in supplicant
+ * @num_params: number of elements allocated in @param, updated with number
+ * used elements
+ * @param: space for parameters for @func
+ *
+ * Returns 0 on success or <0 on failure
+ */
+int optee_supp_recv(struct tee_context *ctx, u32 *func, u32 *num_params,
+ struct tee_param *param)
+{
+ struct tee_device *teedev = ctx->teedev;
+ struct optee *optee = tee_get_drvdata(teedev);
+ struct optee_supp *supp = &optee->supp;
+ struct optee_supp_req *req = NULL;
+ int id;
+ size_t num_meta;
+ int rc;
+
+ rc = supp_check_recv_params(*num_params, param, &num_meta);
+ if (rc)
+ return rc;
+
+ while (true) {
+ mutex_lock(&supp->mutex);
+ req = supp_pop_entry(supp, *num_params - num_meta, &id);
+ mutex_unlock(&supp->mutex);
+
+ if (req) {
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ break;
+ }
+
+ /*
+ * If we didn't get a request we'll block in
+ * wait_for_completion() to avoid needless spinning.
+ *
+ * This is where supplicant will be hanging most of
+ * the time, let's make this interruptable so we
+ * can easily restart supplicant if needed.
+ */
+ if (wait_for_completion_interruptible(&supp->reqs_c))
+ return -ERESTARTSYS;
+ }
+
+ if (num_meta) {
+ /*
+ * tee-supplicant support meta parameters -> requsts can be
+ * processed asynchronously.
+ */
+ param->attr = TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT |
+ TEE_IOCTL_PARAM_ATTR_META;
+ param->u.value.a = id;
+ param->u.value.b = 0;
+ param->u.value.c = 0;
+ } else {
+ mutex_lock(&supp->mutex);
+ supp->req_id = id;
+ mutex_unlock(&supp->mutex);
+ }
+
+ *func = req->func;
+ *num_params = req->num_params + num_meta;
+ memcpy(param + num_meta, req->param,
+ sizeof(struct tee_param) * req->num_params);
+
+ return 0;
+}
+
+static struct optee_supp_req *supp_pop_req(struct optee_supp *supp,
+ size_t num_params,
+ struct tee_param *param,
+ size_t *num_meta)
+{
+ struct optee_supp_req *req;
+ int id;
+ size_t nm;
+ const u32 attr = TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT |
+ TEE_IOCTL_PARAM_ATTR_META;
+
+ if (!num_params)
+ return ERR_PTR(-EINVAL);
+
+ if (supp->req_id == -1) {
+ if (param->attr != attr)
+ return ERR_PTR(-EINVAL);
+ id = param->u.value.a;
+ nm = 1;
+ } else {
+ id = supp->req_id;
+ nm = 0;
+ }
+
+ req = idr_find(&supp->idr, id);
+ if (!req)
+ return ERR_PTR(-ENOENT);
+
+ if ((num_params - nm) != req->num_params)
+ return ERR_PTR(-EINVAL);
+
+ req->busy = false;
+ idr_remove(&supp->idr, id);
+ supp->req_id = -1;
+ *num_meta = nm;
+
+ return req;
+}
+
+/**
+ * optee_supp_send() - send result of request from supplicant
+ * @ctx: context sending result
+ * @ret: return value of request
+ * @num_params: number of parameters returned
+ * @param: returned parameters
+ *
+ * Returns 0 on success or <0 on failure.
+ */
+int optee_supp_send(struct tee_context *ctx, u32 ret, u32 num_params,
+ struct tee_param *param)
+{
+ struct tee_device *teedev = ctx->teedev;
+ struct optee *optee = tee_get_drvdata(teedev);
+ struct optee_supp *supp = &optee->supp;
+ struct optee_supp_req *req;
+ size_t n;
+ size_t num_meta;
+
+ mutex_lock(&supp->mutex);
+ req = supp_pop_req(supp, num_params, param, &num_meta);
+ mutex_unlock(&supp->mutex);
+
+ if (IS_ERR(req)) {
+ /* Something is wrong, let supplicant restart. */
+ return PTR_ERR(req);
+ }
+
+ /* Update out and in/out parameters */
+ for (n = 0; n < req->num_params; n++) {
+ struct tee_param *p = req->param + n;
+
+ switch (p->attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK) {
+ case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT:
+ case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT:
+ p->u.value.a = param[n + num_meta].u.value.a;
+ p->u.value.b = param[n + num_meta].u.value.b;
+ p->u.value.c = param[n + num_meta].u.value.c;
+ break;
+ case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT:
+ case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT:
+ p->u.memref.size = param[n + num_meta].u.memref.size;
+ break;
+ default:
+ break;
+ }
+ }
+ req->ret = ret;
+
+ /* Let the requesting thread continue */
+ complete(&req->c);
+
+ return 0;
+}
diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c
new file mode 100644
index 000000000000..6c4b200a4560
--- /dev/null
+++ b/drivers/tee/tee_core.c
@@ -0,0 +1,949 @@
+/*
+ * Copyright (c) 2015-2016, Linaro Limited
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#define pr_fmt(fmt) "%s: " fmt, __func__
+
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/idr.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/tee_drv.h>
+#include <linux/uaccess.h>
+#include "tee_private.h"
+
+#define TEE_NUM_DEVICES 32
+
+#define TEE_IOCTL_PARAM_SIZE(x) (sizeof(struct tee_param) * (x))
+
+/*
+ * Unprivileged devices in the lower half range and privileged devices in
+ * the upper half range.
+ */
+static DECLARE_BITMAP(dev_mask, TEE_NUM_DEVICES);
+static DEFINE_SPINLOCK(driver_lock);
+
+static struct class *tee_class;
+static dev_t tee_devt;
+
+static int tee_open(struct inode *inode, struct file *filp)
+{
+ int rc;
+ struct tee_device *teedev;
+ struct tee_context *ctx;
+
+ teedev = container_of(inode->i_cdev, struct tee_device, cdev);
+ if (!tee_device_get(teedev))
+ return -EINVAL;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx) {
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ kref_init(&ctx->refcount);
+ ctx->teedev = teedev;
+ INIT_LIST_HEAD(&ctx->list_shm);
+ filp->private_data = ctx;
+ rc = teedev->desc->ops->open(ctx);
+ if (rc)
+ goto err;
+
+ return 0;
+err:
+ kfree(ctx);
+ tee_device_put(teedev);
+ return rc;
+}
+
+void teedev_ctx_get(struct tee_context *ctx)
+{
+ if (ctx->releasing)
+ return;
+
+ kref_get(&ctx->refcount);
+}
+
+static void teedev_ctx_release(struct kref *ref)
+{
+ struct tee_context *ctx = container_of(ref, struct tee_context,
+ refcount);
+ ctx->releasing = true;
+ ctx->teedev->desc->ops->release(ctx);
+ kfree(ctx);
+}
+
+void teedev_ctx_put(struct tee_context *ctx)
+{
+ if (ctx->releasing)
+ return;
+
+ kref_put(&ctx->refcount, teedev_ctx_release);
+}
+
+static void teedev_close_context(struct tee_context *ctx)
+{
+ tee_device_put(ctx->teedev);
+ teedev_ctx_put(ctx);
+}
+
+static int tee_release(struct inode *inode, struct file *filp)
+{
+ teedev_close_context(filp->private_data);
+ return 0;
+}
+
+static int tee_ioctl_version(struct tee_context *ctx,
+ struct tee_ioctl_version_data __user *uvers)
+{
+ struct tee_ioctl_version_data vers;
+
+ ctx->teedev->desc->ops->get_version(ctx->teedev, &vers);
+
+ if (ctx->teedev->desc->flags & TEE_DESC_PRIVILEGED)
+ vers.gen_caps |= TEE_GEN_CAP_PRIVILEGED;
+
+ if (copy_to_user(uvers, &vers, sizeof(vers)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int tee_ioctl_shm_alloc(struct tee_context *ctx,
+ struct tee_ioctl_shm_alloc_data __user *udata)
+{
+ long ret;
+ struct tee_ioctl_shm_alloc_data data;
+ struct tee_shm *shm;
+
+ if (copy_from_user(&data, udata, sizeof(data)))
+ return -EFAULT;
+
+ /* Currently no input flags are supported */
+ if (data.flags)
+ return -EINVAL;
+
+ shm = tee_shm_alloc(ctx, data.size, TEE_SHM_MAPPED | TEE_SHM_DMA_BUF);
+ if (IS_ERR(shm))
+ return PTR_ERR(shm);
+
+ data.id = shm->id;
+ data.flags = shm->flags;
+ data.size = shm->size;
+
+ if (copy_to_user(udata, &data, sizeof(data)))
+ ret = -EFAULT;
+ else
+ ret = tee_shm_get_fd(shm);
+
+ /*
+ * When user space closes the file descriptor the shared memory
+ * should be freed or if tee_shm_get_fd() failed then it will
+ * be freed immediately.
+ */
+ tee_shm_put(shm);
+ return ret;
+}
+
+static int
+tee_ioctl_shm_register(struct tee_context *ctx,
+ struct tee_ioctl_shm_register_data __user *udata)
+{
+ long ret;
+ struct tee_ioctl_shm_register_data data;
+ struct tee_shm *shm;
+
+ if (copy_from_user(&data, udata, sizeof(data)))
+ return -EFAULT;
+
+ /* Currently no input flags are supported */
+ if (data.flags)
+ return -EINVAL;
+
+ shm = tee_shm_register(ctx, data.addr, data.length,
+ TEE_SHM_DMA_BUF | TEE_SHM_USER_MAPPED);
+ if (IS_ERR(shm))
+ return PTR_ERR(shm);
+
+ data.id = shm->id;
+ data.flags = shm->flags;
+ data.length = shm->size;
+
+ if (copy_to_user(udata, &data, sizeof(data)))
+ ret = -EFAULT;
+ else
+ ret = tee_shm_get_fd(shm);
+ /*
+ * When user space closes the file descriptor the shared memory
+ * should be freed or if tee_shm_get_fd() failed then it will
+ * be freed immediately.
+ */
+ tee_shm_put(shm);
+ return ret;
+}
+
+static int params_from_user(struct tee_context *ctx, struct tee_param *params,
+ size_t num_params,
+ struct tee_ioctl_param __user *uparams)
+{
+ size_t n;
+
+ for (n = 0; n < num_params; n++) {
+ struct tee_shm *shm;
+ struct tee_ioctl_param ip;
+
+ if (copy_from_user(&ip, uparams + n, sizeof(ip)))
+ return -EFAULT;
+
+ /* All unused attribute bits has to be zero */
+ if (ip.attr & ~TEE_IOCTL_PARAM_ATTR_MASK)
+ return -EINVAL;
+
+ params[n].attr = ip.attr;
+ switch (ip.attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK) {
+ case TEE_IOCTL_PARAM_ATTR_TYPE_NONE:
+ case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT:
+ break;
+ case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INPUT:
+ case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT:
+ params[n].u.value.a = ip.a;
+ params[n].u.value.b = ip.b;
+ params[n].u.value.c = ip.c;
+ break;
+ case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT:
+ case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT:
+ case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT:
+ /*
+ * If we fail to get a pointer to a shared memory
+ * object (and increase the ref count) from an
+ * identifier we return an error. All pointers that
+ * has been added in params have an increased ref
+ * count. It's the callers responibility to do
+ * tee_shm_put() on all resolved pointers.
+ */
+ shm = tee_shm_get_from_id(ctx, ip.c);
+ if (IS_ERR(shm))
+ return PTR_ERR(shm);
+
+ params[n].u.memref.shm_offs = ip.a;
+ params[n].u.memref.size = ip.b;
+ params[n].u.memref.shm = shm;
+ break;
+ default:
+ /* Unknown attribute */
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static int params_to_user(struct tee_ioctl_param __user *uparams,
+ size_t num_params, struct tee_param *params)
+{
+ size_t n;
+
+ for (n = 0; n < num_params; n++) {
+ struct tee_ioctl_param __user *up = uparams + n;
+ struct tee_param *p = params + n;
+
+ switch (p->attr) {
+ case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT:
+ case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT:
+ if (put_user(p->u.value.a, &up->a) ||
+ put_user(p->u.value.b, &up->b) ||
+ put_user(p->u.value.c, &up->c))
+ return -EFAULT;
+ break;
+ case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT:
+ case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT:
+ if (put_user((u64)p->u.memref.size, &up->b))
+ return -EFAULT;
+ default:
+ break;
+ }
+ }
+ return 0;
+}
+
+static int tee_ioctl_open_session(struct tee_context *ctx,
+ struct tee_ioctl_buf_data __user *ubuf)
+{
+ int rc;
+ size_t n;
+ struct tee_ioctl_buf_data buf;
+ struct tee_ioctl_open_session_arg __user *uarg;
+ struct tee_ioctl_open_session_arg arg;
+ struct tee_ioctl_param __user *uparams = NULL;
+ struct tee_param *params = NULL;
+ bool have_session = false;
+
+ if (!ctx->teedev->desc->ops->open_session)
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, sizeof(buf)))
+ return -EFAULT;
+
+ if (buf.buf_len > TEE_MAX_ARG_SIZE ||
+ buf.buf_len < sizeof(struct tee_ioctl_open_session_arg))
+ return -EINVAL;
+
+ uarg = u64_to_user_ptr(buf.buf_ptr);
+ if (copy_from_user(&arg, uarg, sizeof(arg)))
+ return -EFAULT;
+
+ if (sizeof(arg) + TEE_IOCTL_PARAM_SIZE(arg.num_params) != buf.buf_len)
+ return -EINVAL;
+
+ if (arg.num_params) {
+ params = kcalloc(arg.num_params, sizeof(struct tee_param),
+ GFP_KERNEL);
+ if (!params)
+ return -ENOMEM;
+ uparams = uarg->params;
+ rc = params_from_user(ctx, params, arg.num_params, uparams);
+ if (rc)
+ goto out;
+ }
+
+ rc = ctx->teedev->desc->ops->open_session(ctx, &arg, params);
+ if (rc)
+ goto out;
+ have_session = true;
+
+ if (put_user(arg.session, &uarg->session) ||
+ put_user(arg.ret, &uarg->ret) ||
+ put_user(arg.ret_origin, &uarg->ret_origin)) {
+ rc = -EFAULT;
+ goto out;
+ }
+ rc = params_to_user(uparams, arg.num_params, params);
+out:
+ /*
+ * If we've succeeded to open the session but failed to communicate
+ * it back to user space, close the session again to avoid leakage.
+ */
+ if (rc && have_session && ctx->teedev->desc->ops->close_session)
+ ctx->teedev->desc->ops->close_session(ctx, arg.session);
+
+ if (params) {
+ /* Decrease ref count for all valid shared memory pointers */
+ for (n = 0; n < arg.num_params; n++)
+ if (tee_param_is_memref(params + n) &&
+ params[n].u.memref.shm)
+ tee_shm_put(params[n].u.memref.shm);
+ kfree(params);
+ }
+
+ return rc;
+}
+
+static int tee_ioctl_invoke(struct tee_context *ctx,
+ struct tee_ioctl_buf_data __user *ubuf)
+{
+ int rc;
+ size_t n;
+ struct tee_ioctl_buf_data buf;
+ struct tee_ioctl_invoke_arg __user *uarg;
+ struct tee_ioctl_invoke_arg arg;
+ struct tee_ioctl_param __user *uparams = NULL;
+ struct tee_param *params = NULL;
+
+ if (!ctx->teedev->desc->ops->invoke_func)
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, sizeof(buf)))
+ return -EFAULT;
+
+ if (buf.buf_len > TEE_MAX_ARG_SIZE ||
+ buf.buf_len < sizeof(struct tee_ioctl_invoke_arg))
+ return -EINVAL;
+
+ uarg = u64_to_user_ptr(buf.buf_ptr);
+ if (copy_from_user(&arg, uarg, sizeof(arg)))
+ return -EFAULT;
+
+ if (sizeof(arg) + TEE_IOCTL_PARAM_SIZE(arg.num_params) != buf.buf_len)
+ return -EINVAL;
+
+ if (arg.num_params) {
+ params = kcalloc(arg.num_params, sizeof(struct tee_param),
+ GFP_KERNEL);
+ if (!params)
+ return -ENOMEM;
+ uparams = uarg->params;
+ rc = params_from_user(ctx, params, arg.num_params, uparams);
+ if (rc)
+ goto out;
+ }
+
+ rc = ctx->teedev->desc->ops->invoke_func(ctx, &arg, params);
+ if (rc)
+ goto out;
+
+ if (put_user(arg.ret, &uarg->ret) ||
+ put_user(arg.ret_origin, &uarg->ret_origin)) {
+ rc = -EFAULT;
+ goto out;
+ }
+ rc = params_to_user(uparams, arg.num_params, params);
+out:
+ if (params) {
+ /* Decrease ref count for all valid shared memory pointers */
+ for (n = 0; n < arg.num_params; n++)
+ if (tee_param_is_memref(params + n) &&
+ params[n].u.memref.shm)
+ tee_shm_put(params[n].u.memref.shm);
+ kfree(params);
+ }
+ return rc;
+}
+
+static int tee_ioctl_cancel(struct tee_context *ctx,
+ struct tee_ioctl_cancel_arg __user *uarg)
+{
+ struct tee_ioctl_cancel_arg arg;
+
+ if (!ctx->teedev->desc->ops->cancel_req)
+ return -EINVAL;
+
+ if (copy_from_user(&arg, uarg, sizeof(arg)))
+ return -EFAULT;
+
+ return ctx->teedev->desc->ops->cancel_req(ctx, arg.cancel_id,
+ arg.session);
+}
+
+static int
+tee_ioctl_close_session(struct tee_context *ctx,
+ struct tee_ioctl_close_session_arg __user *uarg)
+{
+ struct tee_ioctl_close_session_arg arg;
+
+ if (!ctx->teedev->desc->ops->close_session)
+ return -EINVAL;
+
+ if (copy_from_user(&arg, uarg, sizeof(arg)))
+ return -EFAULT;
+
+ return ctx->teedev->desc->ops->close_session(ctx, arg.session);
+}
+
+static int params_to_supp(struct tee_context *ctx,
+ struct tee_ioctl_param __user *uparams,
+ size_t num_params, struct tee_param *params)
+{
+ size_t n;
+
+ for (n = 0; n < num_params; n++) {
+ struct tee_ioctl_param ip;
+ struct tee_param *p = params + n;
+
+ ip.attr = p->attr;
+ switch (p->attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK) {
+ case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INPUT:
+ case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT:
+ ip.a = p->u.value.a;
+ ip.b = p->u.value.b;
+ ip.c = p->u.value.c;
+ break;
+ case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT:
+ case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT:
+ case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT:
+ ip.b = p->u.memref.size;
+ if (!p->u.memref.shm) {
+ ip.a = 0;
+ ip.c = (u64)-1; /* invalid shm id */
+ break;
+ }
+ ip.a = p->u.memref.shm_offs;
+ ip.c = p->u.memref.shm->id;
+ break;
+ default:
+ ip.a = 0;
+ ip.b = 0;
+ ip.c = 0;
+ break;
+ }
+
+ if (copy_to_user(uparams + n, &ip, sizeof(ip)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int tee_ioctl_supp_recv(struct tee_context *ctx,
+ struct tee_ioctl_buf_data __user *ubuf)
+{
+ int rc;
+ struct tee_ioctl_buf_data buf;
+ struct tee_iocl_supp_recv_arg __user *uarg;
+ struct tee_param *params;
+ u32 num_params;
+ u32 func;
+
+ if (!ctx->teedev->desc->ops->supp_recv)
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, sizeof(buf)))
+ return -EFAULT;
+
+ if (buf.buf_len > TEE_MAX_ARG_SIZE ||
+ buf.buf_len < sizeof(struct tee_iocl_supp_recv_arg))
+ return -EINVAL;
+
+ uarg = u64_to_user_ptr(buf.buf_ptr);
+ if (get_user(num_params, &uarg->num_params))
+ return -EFAULT;
+
+ if (sizeof(*uarg) + TEE_IOCTL_PARAM_SIZE(num_params) != buf.buf_len)
+ return -EINVAL;
+
+ params = kcalloc(num_params, sizeof(struct tee_param), GFP_KERNEL);
+ if (!params)
+ return -ENOMEM;
+
+ rc = params_from_user(ctx, params, num_params, uarg->params);
+ if (rc)
+ goto out;
+
+ rc = ctx->teedev->desc->ops->supp_recv(ctx, &func, &num_params, params);
+ if (rc)
+ goto out;
+
+ if (put_user(func, &uarg->func) ||
+ put_user(num_params, &uarg->num_params)) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ rc = params_to_supp(ctx, uarg->params, num_params, params);
+out:
+ kfree(params);
+ return rc;
+}
+
+static int params_from_supp(struct tee_param *params, size_t num_params,
+ struct tee_ioctl_param __user *uparams)
+{
+ size_t n;
+
+ for (n = 0; n < num_params; n++) {
+ struct tee_param *p = params + n;
+ struct tee_ioctl_param ip;
+
+ if (copy_from_user(&ip, uparams + n, sizeof(ip)))
+ return -EFAULT;
+
+ /* All unused attribute bits has to be zero */
+ if (ip.attr & ~TEE_IOCTL_PARAM_ATTR_MASK)
+ return -EINVAL;
+
+ p->attr = ip.attr;
+ switch (ip.attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK) {
+ case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT:
+ case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT:
+ /* Only out and in/out values can be updated */
+ p->u.value.a = ip.a;
+ p->u.value.b = ip.b;
+ p->u.value.c = ip.c;
+ break;
+ case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT:
+ case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT:
+ /*
+ * Only the size of the memref can be updated.
+ * Since we don't have access to the original
+ * parameters here, only store the supplied size.
+ * The driver will copy the updated size into the
+ * original parameters.
+ */
+ p->u.memref.shm = NULL;
+ p->u.memref.shm_offs = 0;
+ p->u.memref.size = ip.b;
+ break;
+ default:
+ memset(&p->u, 0, sizeof(p->u));
+ break;
+ }
+ }
+ return 0;
+}
+
+static int tee_ioctl_supp_send(struct tee_context *ctx,
+ struct tee_ioctl_buf_data __user *ubuf)
+{
+ long rc;
+ struct tee_ioctl_buf_data buf;
+ struct tee_iocl_supp_send_arg __user *uarg;
+ struct tee_param *params;
+ u32 num_params;
+ u32 ret;
+
+ /* Not valid for this driver */
+ if (!ctx->teedev->desc->ops->supp_send)
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, sizeof(buf)))
+ return -EFAULT;
+
+ if (buf.buf_len > TEE_MAX_ARG_SIZE ||
+ buf.buf_len < sizeof(struct tee_iocl_supp_send_arg))
+ return -EINVAL;
+
+ uarg = u64_to_user_ptr(buf.buf_ptr);
+ if (get_user(ret, &uarg->ret) ||
+ get_user(num_params, &uarg->num_params))
+ return -EFAULT;
+
+ if (sizeof(*uarg) + TEE_IOCTL_PARAM_SIZE(num_params) > buf.buf_len)
+ return -EINVAL;
+
+ params = kcalloc(num_params, sizeof(struct tee_param), GFP_KERNEL);
+ if (!params)
+ return -ENOMEM;
+
+ rc = params_from_supp(params, num_params, uarg->params);
+ if (rc)
+ goto out;
+
+ rc = ctx->teedev->desc->ops->supp_send(ctx, ret, num_params, params);
+out:
+ kfree(params);
+ return rc;
+}
+
+static long tee_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct tee_context *ctx = filp->private_data;
+ void __user *uarg = (void __user *)arg;
+
+ switch (cmd) {
+ case TEE_IOC_VERSION:
+ return tee_ioctl_version(ctx, uarg);
+ case TEE_IOC_SHM_ALLOC:
+ return tee_ioctl_shm_alloc(ctx, uarg);
+ case TEE_IOC_SHM_REGISTER:
+ return tee_ioctl_shm_register(ctx, uarg);
+ case TEE_IOC_OPEN_SESSION:
+ return tee_ioctl_open_session(ctx, uarg);
+ case TEE_IOC_INVOKE:
+ return tee_ioctl_invoke(ctx, uarg);
+ case TEE_IOC_CANCEL:
+ return tee_ioctl_cancel(ctx, uarg);
+ case TEE_IOC_CLOSE_SESSION:
+ return tee_ioctl_close_session(ctx, uarg);
+ case TEE_IOC_SUPPL_RECV:
+ return tee_ioctl_supp_recv(ctx, uarg);
+ case TEE_IOC_SUPPL_SEND:
+ return tee_ioctl_supp_send(ctx, uarg);
+ default:
+ return -EINVAL;
+ }
+}
+
+static const struct file_operations tee_fops = {
+ .owner = THIS_MODULE,
+ .open = tee_open,
+ .release = tee_release,
+ .unlocked_ioctl = tee_ioctl,
+ .compat_ioctl = tee_ioctl,
+};
+
+static void tee_release_device(struct device *dev)
+{
+ struct tee_device *teedev = container_of(dev, struct tee_device, dev);
+
+ spin_lock(&driver_lock);
+ clear_bit(teedev->id, dev_mask);
+ spin_unlock(&driver_lock);
+ mutex_destroy(&teedev->mutex);
+ idr_destroy(&teedev->idr);
+ kfree(teedev);
+}
+
+/**
+ * tee_device_alloc() - Allocate a new struct tee_device instance
+ * @teedesc: Descriptor for this driver
+ * @dev: Parent device for this device
+ * @pool: Shared memory pool, NULL if not used
+ * @driver_data: Private driver data for this device
+ *
+ * Allocates a new struct tee_device instance. The device is
+ * removed by tee_device_unregister().
+ *
+ * @returns a pointer to a 'struct tee_device' or an ERR_PTR on failure
+ */
+struct tee_device *tee_device_alloc(const struct tee_desc *teedesc,
+ struct device *dev,
+ struct tee_shm_pool *pool,
+ void *driver_data)
+{
+ struct tee_device *teedev;
+ void *ret;
+ int rc;
+ int offs = 0;
+
+ if (!teedesc || !teedesc->name || !teedesc->ops ||
+ !teedesc->ops->get_version || !teedesc->ops->open ||
+ !teedesc->ops->release || !pool)
+ return ERR_PTR(-EINVAL);
+
+ teedev = kzalloc(sizeof(*teedev), GFP_KERNEL);
+ if (!teedev) {
+ ret = ERR_PTR(-ENOMEM);
+ goto err;
+ }
+
+ if (teedesc->flags & TEE_DESC_PRIVILEGED)
+ offs = TEE_NUM_DEVICES / 2;
+
+ spin_lock(&driver_lock);
+ teedev->id = find_next_zero_bit(dev_mask, TEE_NUM_DEVICES, offs);
+ if (teedev->id < TEE_NUM_DEVICES)
+ set_bit(teedev->id, dev_mask);
+ spin_unlock(&driver_lock);
+
+ if (teedev->id >= TEE_NUM_DEVICES) {
+ ret = ERR_PTR(-ENOMEM);
+ goto err;
+ }
+
+ snprintf(teedev->name, sizeof(teedev->name), "tee%s%d",
+ teedesc->flags & TEE_DESC_PRIVILEGED ? "priv" : "",
+ teedev->id - offs);
+
+ teedev->dev.class = tee_class;
+ teedev->dev.release = tee_release_device;
+ teedev->dev.parent = dev;
+
+ teedev->dev.devt = MKDEV(MAJOR(tee_devt), teedev->id);
+
+ rc = dev_set_name(&teedev->dev, "%s", teedev->name);
+ if (rc) {
+ ret = ERR_PTR(rc);
+ goto err_devt;
+ }
+
+ cdev_init(&teedev->cdev, &tee_fops);
+ teedev->cdev.owner = teedesc->owner;
+ teedev->cdev.kobj.parent = &teedev->dev.kobj;
+
+ dev_set_drvdata(&teedev->dev, driver_data);
+ device_initialize(&teedev->dev);
+
+ /* 1 as tee_device_unregister() does one final tee_device_put() */
+ teedev->num_users = 1;
+ init_completion(&teedev->c_no_users);
+ mutex_init(&teedev->mutex);
+ idr_init(&teedev->idr);
+
+ teedev->desc = teedesc;
+ teedev->pool = pool;
+
+ return teedev;
+err_devt:
+ unregister_chrdev_region(teedev->dev.devt, 1);
+err:
+ pr_err("could not register %s driver\n",
+ teedesc->flags & TEE_DESC_PRIVILEGED ? "privileged" : "client");
+ if (teedev && teedev->id < TEE_NUM_DEVICES) {
+ spin_lock(&driver_lock);
+ clear_bit(teedev->id, dev_mask);
+ spin_unlock(&driver_lock);
+ }
+ kfree(teedev);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tee_device_alloc);
+
+static ssize_t implementation_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct tee_device *teedev = container_of(dev, struct tee_device, dev);
+ struct tee_ioctl_version_data vers;
+
+ teedev->desc->ops->get_version(teedev, &vers);
+ return scnprintf(buf, PAGE_SIZE, "%d\n", vers.impl_id);
+}
+static DEVICE_ATTR_RO(implementation_id);
+
+static struct attribute *tee_dev_attrs[] = {
+ &dev_attr_implementation_id.attr,
+ NULL
+};
+
+static const struct attribute_group tee_dev_group = {
+ .attrs = tee_dev_attrs,
+};
+
+/**
+ * tee_device_register() - Registers a TEE device
+ * @teedev: Device to register
+ *
+ * tee_device_unregister() need to be called to remove the @teedev if
+ * this function fails.
+ *
+ * @returns < 0 on failure
+ */
+int tee_device_register(struct tee_device *teedev)
+{
+ int rc;
+
+ if (teedev->flags & TEE_DEVICE_FLAG_REGISTERED) {
+ dev_err(&teedev->dev, "attempt to register twice\n");
+ return -EINVAL;
+ }
+
+ rc = cdev_add(&teedev->cdev, teedev->dev.devt, 1);
+ if (rc) {
+ dev_err(&teedev->dev,
+ "unable to cdev_add() %s, major %d, minor %d, err=%d\n",
+ teedev->name, MAJOR(teedev->dev.devt),
+ MINOR(teedev->dev.devt), rc);
+ return rc;
+ }
+
+ rc = device_add(&teedev->dev);
+ if (rc) {
+ dev_err(&teedev->dev,
+ "unable to device_add() %s, major %d, minor %d, err=%d\n",
+ teedev->name, MAJOR(teedev->dev.devt),
+ MINOR(teedev->dev.devt), rc);
+ goto err_device_add;
+ }
+
+ rc = sysfs_create_group(&teedev->dev.kobj, &tee_dev_group);
+ if (rc) {
+ dev_err(&teedev->dev,
+ "failed to create sysfs attributes, err=%d\n", rc);
+ goto err_sysfs_create_group;
+ }
+
+ teedev->flags |= TEE_DEVICE_FLAG_REGISTERED;
+ return 0;
+
+err_sysfs_create_group:
+ device_del(&teedev->dev);
+err_device_add:
+ cdev_del(&teedev->cdev);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(tee_device_register);
+
+void tee_device_put(struct tee_device *teedev)
+{
+ mutex_lock(&teedev->mutex);
+ /* Shouldn't put in this state */
+ if (!WARN_ON(!teedev->desc)) {
+ teedev->num_users--;
+ if (!teedev->num_users) {
+ teedev->desc = NULL;
+ complete(&teedev->c_no_users);
+ }
+ }
+ mutex_unlock(&teedev->mutex);
+}
+
+bool tee_device_get(struct tee_device *teedev)
+{
+ mutex_lock(&teedev->mutex);
+ if (!teedev->desc) {
+ mutex_unlock(&teedev->mutex);
+ return false;
+ }
+ teedev->num_users++;
+ mutex_unlock(&teedev->mutex);
+ return true;
+}
+
+/**
+ * tee_device_unregister() - Removes a TEE device
+ * @teedev: Device to unregister
+ *
+ * This function should be called to remove the @teedev even if
+ * tee_device_register() hasn't been called yet. Does nothing if
+ * @teedev is NULL.
+ */
+void tee_device_unregister(struct tee_device *teedev)
+{
+ if (!teedev)
+ return;
+
+ if (teedev->flags & TEE_DEVICE_FLAG_REGISTERED) {
+ sysfs_remove_group(&teedev->dev.kobj, &tee_dev_group);
+ cdev_del(&teedev->cdev);
+ device_del(&teedev->dev);
+ }
+
+ tee_device_put(teedev);
+ wait_for_completion(&teedev->c_no_users);
+
+ /*
+ * No need to take a mutex any longer now since teedev->desc was
+ * set to NULL before teedev->c_no_users was completed.
+ */
+
+ teedev->pool = NULL;
+
+ put_device(&teedev->dev);
+}
+EXPORT_SYMBOL_GPL(tee_device_unregister);
+
+/**
+ * tee_get_drvdata() - Return driver_data pointer
+ * @teedev: Device containing the driver_data pointer
+ * @returns the driver_data pointer supplied to tee_register().
+ */
+void *tee_get_drvdata(struct tee_device *teedev)
+{
+ return dev_get_drvdata(&teedev->dev);
+}
+EXPORT_SYMBOL_GPL(tee_get_drvdata);
+
+static int __init tee_init(void)
+{
+ int rc;
+
+ tee_class = class_create(THIS_MODULE, "tee");
+ if (IS_ERR(tee_class)) {
+ pr_err("couldn't create class\n");
+ return PTR_ERR(tee_class);
+ }
+
+ rc = alloc_chrdev_region(&tee_devt, 0, TEE_NUM_DEVICES, "tee");
+ if (rc) {
+ pr_err("failed to allocate char dev region\n");
+ class_destroy(tee_class);
+ tee_class = NULL;
+ }
+
+ return rc;
+}
+
+static void __exit tee_exit(void)
+{
+ class_destroy(tee_class);
+ tee_class = NULL;
+ unregister_chrdev_region(tee_devt, TEE_NUM_DEVICES);
+}
+
+subsys_initcall(tee_init);
+module_exit(tee_exit);
+
+MODULE_AUTHOR("Linaro");
+MODULE_DESCRIPTION("TEE Driver");
+MODULE_VERSION("1.0");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/tee/tee_private.h b/drivers/tee/tee_private.h
new file mode 100644
index 000000000000..85d99d621603
--- /dev/null
+++ b/drivers/tee/tee_private.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2015-2016, Linaro Limited
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#ifndef TEE_PRIVATE_H
+#define TEE_PRIVATE_H
+
+#include <linux/cdev.h>
+#include <linux/completion.h>
+#include <linux/device.h>
+#include <linux/kref.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+
+/**
+ * struct tee_shm_pool - shared memory pool
+ * @private_mgr: pool manager for shared memory only between kernel
+ * and secure world
+ * @dma_buf_mgr: pool manager for shared memory exported to user space
+ */
+struct tee_shm_pool {
+ struct tee_shm_pool_mgr *private_mgr;
+ struct tee_shm_pool_mgr *dma_buf_mgr;
+};
+
+#define TEE_DEVICE_FLAG_REGISTERED 0x1
+#define TEE_MAX_DEV_NAME_LEN 32
+
+/**
+ * struct tee_device - TEE Device representation
+ * @name: name of device
+ * @desc: description of device
+ * @id: unique id of device
+ * @flags: represented by TEE_DEVICE_FLAG_REGISTERED above
+ * @dev: embedded basic device structure
+ * @cdev: embedded cdev
+ * @num_users: number of active users of this device
+ * @c_no_user: completion used when unregistering the device
+ * @mutex: mutex protecting @num_users and @idr
+ * @idr: register of shared memory object allocated on this device
+ * @pool: shared memory pool
+ */
+struct tee_device {
+ char name[TEE_MAX_DEV_NAME_LEN];
+ const struct tee_desc *desc;
+ int id;
+ unsigned int flags;
+
+ struct device dev;
+ struct cdev cdev;
+
+ size_t num_users;
+ struct completion c_no_users;
+ struct mutex mutex; /* protects num_users and idr */
+
+ struct idr idr;
+ struct tee_shm_pool *pool;
+};
+
+int tee_shm_init(void);
+
+int tee_shm_get_fd(struct tee_shm *shm);
+
+bool tee_device_get(struct tee_device *teedev);
+void tee_device_put(struct tee_device *teedev);
+
+void teedev_ctx_get(struct tee_context *ctx);
+void teedev_ctx_put(struct tee_context *ctx);
+
+#endif /*TEE_PRIVATE_H*/
diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c
new file mode 100644
index 000000000000..ed2d71c3337d
--- /dev/null
+++ b/drivers/tee/tee_shm.c
@@ -0,0 +1,510 @@
+/*
+ * Copyright (c) 2015-2016, Linaro Limited
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#include <linux/device.h>
+#include <linux/dma-buf.h>
+#include <linux/fdtable.h>
+#include <linux/idr.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/tee_drv.h>
+#include "tee_private.h"
+
+static void tee_shm_release(struct tee_shm *shm)
+{
+ struct tee_device *teedev = shm->teedev;
+
+ mutex_lock(&teedev->mutex);
+ idr_remove(&teedev->idr, shm->id);
+ if (shm->ctx)
+ list_del(&shm->link);
+ mutex_unlock(&teedev->mutex);
+
+ if (shm->flags & TEE_SHM_POOL) {
+ struct tee_shm_pool_mgr *poolm;
+
+ if (shm->flags & TEE_SHM_DMA_BUF)
+ poolm = teedev->pool->dma_buf_mgr;
+ else
+ poolm = teedev->pool->private_mgr;
+
+ poolm->ops->free(poolm, shm);
+ } else if (shm->flags & TEE_SHM_REGISTER) {
+ size_t n;
+ int rc = teedev->desc->ops->shm_unregister(shm->ctx, shm);
+
+ if (rc)
+ dev_err(teedev->dev.parent,
+ "unregister shm %p failed: %d", shm, rc);
+
+ for (n = 0; n < shm->num_pages; n++)
+ put_page(shm->pages[n]);
+
+ kfree(shm->pages);
+ }
+
+ if (shm->ctx)
+ teedev_ctx_put(shm->ctx);
+
+ kfree(shm);
+
+ tee_device_put(teedev);
+}
+
+static struct sg_table *tee_shm_op_map_dma_buf(struct dma_buf_attachment
+ *attach, enum dma_data_direction dir)
+{
+ return NULL;
+}
+
+static void tee_shm_op_unmap_dma_buf(struct dma_buf_attachment *attach,
+ struct sg_table *table,
+ enum dma_data_direction dir)
+{
+}
+
+static void tee_shm_op_release(struct dma_buf *dmabuf)
+{
+ struct tee_shm *shm = dmabuf->priv;
+
+ tee_shm_release(shm);
+}
+
+static void *tee_shm_op_kmap_atomic(struct dma_buf *dmabuf, unsigned long pgnum)
+{
+ return NULL;
+}
+
+static void *tee_shm_op_kmap(struct dma_buf *dmabuf, unsigned long pgnum)
+{
+ return NULL;
+}
+
+static int tee_shm_op_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
+{
+ struct tee_shm *shm = dmabuf->priv;
+ size_t size = vma->vm_end - vma->vm_start;
+
+ /* Refuse sharing shared memory provided by application */
+ if (shm->flags & TEE_SHM_REGISTER)
+ return -EINVAL;
+
+ return remap_pfn_range(vma, vma->vm_start, shm->paddr >> PAGE_SHIFT,
+ size, vma->vm_page_prot);
+}
+
+static const struct dma_buf_ops tee_shm_dma_buf_ops = {
+ .map_dma_buf = tee_shm_op_map_dma_buf,
+ .unmap_dma_buf = tee_shm_op_unmap_dma_buf,
+ .release = tee_shm_op_release,
+ .kmap_atomic = tee_shm_op_kmap_atomic,
+ .kmap = tee_shm_op_kmap,
+ .mmap = tee_shm_op_mmap,
+};
+
+static struct tee_shm *__tee_shm_alloc(struct tee_context *ctx,
+ struct tee_device *teedev,
+ size_t size, u32 flags)
+{
+ struct tee_shm_pool_mgr *poolm = NULL;
+ struct tee_shm *shm;
+ void *ret;
+ int rc;
+
+ if (ctx && ctx->teedev != teedev) {
+ dev_err(teedev->dev.parent, "ctx and teedev mismatch\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (!(flags & TEE_SHM_MAPPED)) {
+ dev_err(teedev->dev.parent,
+ "only mapped allocations supported\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if ((flags & ~(TEE_SHM_MAPPED | TEE_SHM_DMA_BUF))) {
+ dev_err(teedev->dev.parent, "invalid shm flags 0x%x", flags);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (!tee_device_get(teedev))
+ return ERR_PTR(-EINVAL);
+
+ if (!teedev->pool) {
+ /* teedev has been detached from driver */
+ ret = ERR_PTR(-EINVAL);
+ goto err_dev_put;
+ }
+
+ shm = kzalloc(sizeof(*shm), GFP_KERNEL);
+ if (!shm) {
+ ret = ERR_PTR(-ENOMEM);
+ goto err_dev_put;
+ }
+
+ shm->flags = flags | TEE_SHM_POOL;
+ shm->teedev = teedev;
+ shm->ctx = ctx;
+ if (flags & TEE_SHM_DMA_BUF)
+ poolm = teedev->pool->dma_buf_mgr;
+ else
+ poolm = teedev->pool->private_mgr;
+
+ rc = poolm->ops->alloc(poolm, shm, size);
+ if (rc) {
+ ret = ERR_PTR(rc);
+ goto err_kfree;
+ }
+
+ mutex_lock(&teedev->mutex);
+ shm->id = idr_alloc(&teedev->idr, shm, 1, 0, GFP_KERNEL);
+ mutex_unlock(&teedev->mutex);
+ if (shm->id < 0) {
+ ret = ERR_PTR(shm->id);
+ goto err_pool_free;
+ }
+
+ if (flags & TEE_SHM_DMA_BUF) {
+ DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+
+ exp_info.ops = &tee_shm_dma_buf_ops;
+ exp_info.size = shm->size;
+ exp_info.flags = O_RDWR;
+ exp_info.priv = shm;
+
+ shm->dmabuf = dma_buf_export(&exp_info);
+ if (IS_ERR(shm->dmabuf)) {
+ ret = ERR_CAST(shm->dmabuf);
+ goto err_rem;
+ }
+ }
+
+ if (ctx) {
+ teedev_ctx_get(ctx);
+ mutex_lock(&teedev->mutex);
+ list_add_tail(&shm->link, &ctx->list_shm);
+ mutex_unlock(&teedev->mutex);
+ }
+
+ return shm;
+err_rem:
+ mutex_lock(&teedev->mutex);
+ idr_remove(&teedev->idr, shm->id);
+ mutex_unlock(&teedev->mutex);
+err_pool_free:
+ poolm->ops->free(poolm, shm);
+err_kfree:
+ kfree(shm);
+err_dev_put:
+ tee_device_put(teedev);
+ return ret;
+}
+
+/**
+ * tee_shm_alloc() - Allocate shared memory
+ * @ctx: Context that allocates the shared memory
+ * @size: Requested size of shared memory
+ * @flags: Flags setting properties for the requested shared memory.
+ *
+ * Memory allocated as global shared memory is automatically freed when the
+ * TEE file pointer is closed. The @flags field uses the bits defined by
+ * TEE_SHM_* in <linux/tee_drv.h>. TEE_SHM_MAPPED must currently always be
+ * set. If TEE_SHM_DMA_BUF global shared memory will be allocated and
+ * associated with a dma-buf handle, else driver private memory.
+ */
+struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags)
+{
+ return __tee_shm_alloc(ctx, ctx->teedev, size, flags);
+}
+EXPORT_SYMBOL_GPL(tee_shm_alloc);
+
+struct tee_shm *tee_shm_priv_alloc(struct tee_device *teedev, size_t size)
+{
+ return __tee_shm_alloc(NULL, teedev, size, TEE_SHM_MAPPED);
+}
+EXPORT_SYMBOL_GPL(tee_shm_priv_alloc);
+
+struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr,
+ size_t length, u32 flags)
+{
+ struct tee_device *teedev = ctx->teedev;
+ const u32 req_flags = TEE_SHM_DMA_BUF | TEE_SHM_USER_MAPPED;
+ struct tee_shm *shm;
+ void *ret;
+ int rc;
+ int num_pages;
+ unsigned long start;
+
+ if (flags != req_flags)
+ return ERR_PTR(-ENOTSUPP);
+
+ if (!tee_device_get(teedev))
+ return ERR_PTR(-EINVAL);
+
+ if (!teedev->desc->ops->shm_register ||
+ !teedev->desc->ops->shm_unregister) {
+ tee_device_put(teedev);
+ return ERR_PTR(-ENOTSUPP);
+ }
+
+ teedev_ctx_get(ctx);
+
+ shm = kzalloc(sizeof(*shm), GFP_KERNEL);
+ if (!shm) {
+ ret = ERR_PTR(-ENOMEM);
+ goto err;
+ }
+
+ shm->flags = flags | TEE_SHM_REGISTER;
+ shm->teedev = teedev;
+ shm->ctx = ctx;
+ shm->id = -1;
+ start = rounddown(addr, PAGE_SIZE);
+ shm->offset = addr - start;
+ shm->size = length;
+ num_pages = (roundup(addr + length, PAGE_SIZE) - start) / PAGE_SIZE;
+ shm->pages = kcalloc(num_pages, sizeof(*shm->pages), GFP_KERNEL);
+ if (!shm->pages) {
+ ret = ERR_PTR(-ENOMEM);
+ goto err;
+ }
+
+ rc = get_user_pages_fast(start, num_pages, 1, shm->pages);
+ if (rc > 0)
+ shm->num_pages = rc;
+ if (rc != num_pages) {
+ if (rc >= 0)
+ rc = -ENOMEM;
+ ret = ERR_PTR(rc);
+ goto err;
+ }
+
+ mutex_lock(&teedev->mutex);
+ shm->id = idr_alloc(&teedev->idr, shm, 1, 0, GFP_KERNEL);
+ mutex_unlock(&teedev->mutex);
+
+ if (shm->id < 0) {
+ ret = ERR_PTR(shm->id);
+ goto err;
+ }
+
+ rc = teedev->desc->ops->shm_register(ctx, shm, shm->pages,
+ shm->num_pages, start);
+ if (rc) {
+ ret = ERR_PTR(rc);
+ goto err;
+ }
+
+ if (flags & TEE_SHM_DMA_BUF) {
+ DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+
+ exp_info.ops = &tee_shm_dma_buf_ops;
+ exp_info.size = shm->size;
+ exp_info.flags = O_RDWR;
+ exp_info.priv = shm;
+
+ shm->dmabuf = dma_buf_export(&exp_info);
+ if (IS_ERR(shm->dmabuf)) {
+ ret = ERR_CAST(shm->dmabuf);
+ teedev->desc->ops->shm_unregister(ctx, shm);
+ goto err;
+ }
+ }
+
+ mutex_lock(&teedev->mutex);
+ list_add_tail(&shm->link, &ctx->list_shm);
+ mutex_unlock(&teedev->mutex);
+
+ return shm;
+err:
+ if (shm) {
+ size_t n;
+
+ if (shm->id >= 0) {
+ mutex_lock(&teedev->mutex);
+ idr_remove(&teedev->idr, shm->id);
+ mutex_unlock(&teedev->mutex);
+ }
+ if (shm->pages) {
+ for (n = 0; n < shm->num_pages; n++)
+ put_page(shm->pages[n]);
+ kfree(shm->pages);
+ }
+ }
+ kfree(shm);
+ teedev_ctx_put(ctx);
+ tee_device_put(teedev);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tee_shm_register);
+
+/**
+ * tee_shm_get_fd() - Increase reference count and return file descriptor
+ * @shm: Shared memory handle
+ * @returns user space file descriptor to shared memory
+ */
+int tee_shm_get_fd(struct tee_shm *shm)
+{
+ int fd;
+
+ if (!(shm->flags & TEE_SHM_DMA_BUF))
+ return -EINVAL;
+
+ fd = dma_buf_fd(shm->dmabuf, O_CLOEXEC);
+ if (fd >= 0)
+ get_dma_buf(shm->dmabuf);
+ return fd;
+}
+
+/**
+ * tee_shm_free() - Free shared memory
+ * @shm: Handle to shared memory to free
+ */
+void tee_shm_free(struct tee_shm *shm)
+{
+ /*
+ * dma_buf_put() decreases the dmabuf reference counter and will
+ * call tee_shm_release() when the last reference is gone.
+ *
+ * In the case of driver private memory we call tee_shm_release
+ * directly instead as it doesn't have a reference counter.
+ */
+ if (shm->flags & TEE_SHM_DMA_BUF)
+ dma_buf_put(shm->dmabuf);
+ else
+ tee_shm_release(shm);
+}
+EXPORT_SYMBOL_GPL(tee_shm_free);
+
+/**
+ * tee_shm_va2pa() - Get physical address of a virtual address
+ * @shm: Shared memory handle
+ * @va: Virtual address to tranlsate
+ * @pa: Returned physical address
+ * @returns 0 on success and < 0 on failure
+ */
+int tee_shm_va2pa(struct tee_shm *shm, void *va, phys_addr_t *pa)
+{
+ if (!(shm->flags & TEE_SHM_MAPPED))
+ return -EINVAL;
+ /* Check that we're in the range of the shm */
+ if ((char *)va < (char *)shm->kaddr)
+ return -EINVAL;
+ if ((char *)va >= ((char *)shm->kaddr + shm->size))
+ return -EINVAL;
+
+ return tee_shm_get_pa(
+ shm, (unsigned long)va - (unsigned long)shm->kaddr, pa);
+}
+EXPORT_SYMBOL_GPL(tee_shm_va2pa);
+
+/**
+ * tee_shm_pa2va() - Get virtual address of a physical address
+ * @shm: Shared memory handle
+ * @pa: Physical address to tranlsate
+ * @va: Returned virtual address
+ * @returns 0 on success and < 0 on failure
+ */
+int tee_shm_pa2va(struct tee_shm *shm, phys_addr_t pa, void **va)
+{
+ if (!(shm->flags & TEE_SHM_MAPPED))
+ return -EINVAL;
+ /* Check that we're in the range of the shm */
+ if (pa < shm->paddr)
+ return -EINVAL;
+ if (pa >= (shm->paddr + shm->size))
+ return -EINVAL;
+
+ if (va) {
+ void *v = tee_shm_get_va(shm, pa - shm->paddr);
+
+ if (IS_ERR(v))
+ return PTR_ERR(v);
+ *va = v;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tee_shm_pa2va);
+
+/**
+ * tee_shm_get_va() - Get virtual address of a shared memory plus an offset
+ * @shm: Shared memory handle
+ * @offs: Offset from start of this shared memory
+ * @returns virtual address of the shared memory + offs if offs is within
+ * the bounds of this shared memory, else an ERR_PTR
+ */
+void *tee_shm_get_va(struct tee_shm *shm, size_t offs)
+{
+ if (!(shm->flags & TEE_SHM_MAPPED))
+ return ERR_PTR(-EINVAL);
+ if (offs >= shm->size)
+ return ERR_PTR(-EINVAL);
+ return (char *)shm->kaddr + offs;
+}
+EXPORT_SYMBOL_GPL(tee_shm_get_va);
+
+/**
+ * tee_shm_get_pa() - Get physical address of a shared memory plus an offset
+ * @shm: Shared memory handle
+ * @offs: Offset from start of this shared memory
+ * @pa: Physical address to return
+ * @returns 0 if offs is within the bounds of this shared memory, else an
+ * error code.
+ */
+int tee_shm_get_pa(struct tee_shm *shm, size_t offs, phys_addr_t *pa)
+{
+ if (offs >= shm->size)
+ return -EINVAL;
+ if (pa)
+ *pa = shm->paddr + offs;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tee_shm_get_pa);
+
+/**
+ * tee_shm_get_from_id() - Find shared memory object and increase reference
+ * count
+ * @ctx: Context owning the shared memory
+ * @id: Id of shared memory object
+ * @returns a pointer to 'struct tee_shm' on success or an ERR_PTR on failure
+ */
+struct tee_shm *tee_shm_get_from_id(struct tee_context *ctx, int id)
+{
+ struct tee_device *teedev;
+ struct tee_shm *shm;
+
+ if (!ctx)
+ return ERR_PTR(-EINVAL);
+
+ teedev = ctx->teedev;
+ mutex_lock(&teedev->mutex);
+ shm = idr_find(&teedev->idr, id);
+ if (!shm || shm->ctx != ctx)
+ shm = ERR_PTR(-EINVAL);
+ else if (shm->flags & TEE_SHM_DMA_BUF)
+ get_dma_buf(shm->dmabuf);
+ mutex_unlock(&teedev->mutex);
+ return shm;
+}
+EXPORT_SYMBOL_GPL(tee_shm_get_from_id);
+
+/**
+ * tee_shm_put() - Decrease reference count on a shared memory handle
+ * @shm: Shared memory handle
+ */
+void tee_shm_put(struct tee_shm *shm)
+{
+ if (shm->flags & TEE_SHM_DMA_BUF)
+ dma_buf_put(shm->dmabuf);
+}
+EXPORT_SYMBOL_GPL(tee_shm_put);
diff --git a/drivers/tee/tee_shm_pool.c b/drivers/tee/tee_shm_pool.c
new file mode 100644
index 000000000000..e6d4b9e4a864
--- /dev/null
+++ b/drivers/tee/tee_shm_pool.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2015, Linaro Limited
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#include <linux/device.h>
+#include <linux/dma-buf.h>
+#include <linux/genalloc.h>
+#include <linux/slab.h>
+#include <linux/tee_drv.h>
+#include "tee_private.h"
+
+static int pool_op_gen_alloc(struct tee_shm_pool_mgr *poolm,
+ struct tee_shm *shm, size_t size)
+{
+ unsigned long va;
+ struct gen_pool *genpool = poolm->private_data;
+ size_t s = roundup(size, 1 << genpool->min_alloc_order);
+
+ va = gen_pool_alloc(genpool, s);
+ if (!va)
+ return -ENOMEM;
+
+ memset((void *)va, 0, s);
+ shm->kaddr = (void *)va;
+ shm->paddr = gen_pool_virt_to_phys(genpool, va);
+ shm->size = s;
+ return 0;
+}
+
+static void pool_op_gen_free(struct tee_shm_pool_mgr *poolm,
+ struct tee_shm *shm)
+{
+ gen_pool_free(poolm->private_data, (unsigned long)shm->kaddr,
+ shm->size);
+ shm->kaddr = NULL;
+}
+
+static void pool_op_gen_destroy_poolmgr(struct tee_shm_pool_mgr *poolm)
+{
+ gen_pool_destroy(poolm->private_data);
+ kfree(poolm);
+}
+
+static const struct tee_shm_pool_mgr_ops pool_ops_generic = {
+ .alloc = pool_op_gen_alloc,
+ .free = pool_op_gen_free,
+ .destroy_poolmgr = pool_op_gen_destroy_poolmgr,
+};
+
+/**
+ * tee_shm_pool_alloc_res_mem() - Create a shared memory pool from reserved
+ * memory range
+ * @priv_info: Information for driver private shared memory pool
+ * @dmabuf_info: Information for dma-buf shared memory pool
+ *
+ * Start and end of pools will must be page aligned.
+ *
+ * Allocation with the flag TEE_SHM_DMA_BUF set will use the range supplied
+ * in @dmabuf, others will use the range provided by @priv.
+ *
+ * @returns pointer to a 'struct tee_shm_pool' or an ERR_PTR on failure.
+ */
+struct tee_shm_pool *
+tee_shm_pool_alloc_res_mem(struct tee_shm_pool_mem_info *priv_info,
+ struct tee_shm_pool_mem_info *dmabuf_info)
+{
+ struct tee_shm_pool_mgr *priv_mgr;
+ struct tee_shm_pool_mgr *dmabuf_mgr;
+ void *rc;
+
+ /*
+ * Create the pool for driver private shared memory
+ */
+ rc = tee_shm_pool_mgr_alloc_res_mem(priv_info->vaddr, priv_info->paddr,
+ priv_info->size,
+ 3 /* 8 byte aligned */);
+ if (IS_ERR(rc))
+ return rc;
+ priv_mgr = rc;
+
+ /*
+ * Create the pool for dma_buf shared memory
+ */
+ rc = tee_shm_pool_mgr_alloc_res_mem(dmabuf_info->vaddr,
+ dmabuf_info->paddr,
+ dmabuf_info->size, PAGE_SHIFT);
+ if (IS_ERR(rc))
+ goto err_free_priv_mgr;
+ dmabuf_mgr = rc;
+
+ rc = tee_shm_pool_alloc(priv_mgr, dmabuf_mgr);
+ if (IS_ERR(rc))
+ goto err_free_dmabuf_mgr;
+
+ return rc;
+
+err_free_dmabuf_mgr:
+ tee_shm_pool_mgr_destroy(dmabuf_mgr);
+err_free_priv_mgr:
+ tee_shm_pool_mgr_destroy(priv_mgr);
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(tee_shm_pool_alloc_res_mem);
+
+struct tee_shm_pool_mgr *tee_shm_pool_mgr_alloc_res_mem(unsigned long vaddr,
+ phys_addr_t paddr,
+ size_t size,
+ int min_alloc_order)
+{
+ const size_t page_mask = PAGE_SIZE - 1;
+ struct tee_shm_pool_mgr *mgr;
+ int rc;
+
+ /* Start and end must be page aligned */
+ if (vaddr & page_mask || paddr & page_mask || size & page_mask)
+ return ERR_PTR(-EINVAL);
+
+ mgr = kzalloc(sizeof(*mgr), GFP_KERNEL);
+ if (!mgr)
+ return ERR_PTR(-ENOMEM);
+
+ mgr->private_data = gen_pool_create(min_alloc_order, -1);
+ if (!mgr->private_data) {
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ gen_pool_set_algo(mgr->private_data, gen_pool_best_fit, NULL);
+ rc = gen_pool_add_virt(mgr->private_data, vaddr, paddr, size, -1);
+ if (rc) {
+ gen_pool_destroy(mgr->private_data);
+ goto err;
+ }
+
+ mgr->ops = &pool_ops_generic;
+
+ return mgr;
+err:
+ kfree(mgr);
+
+ return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_GPL(tee_shm_pool_mgr_alloc_res_mem);
+
+static bool check_mgr_ops(struct tee_shm_pool_mgr *mgr)
+{
+ return mgr && mgr->ops && mgr->ops->alloc && mgr->ops->free &&
+ mgr->ops->destroy_poolmgr;
+}
+
+struct tee_shm_pool *tee_shm_pool_alloc(struct tee_shm_pool_mgr *priv_mgr,
+ struct tee_shm_pool_mgr *dmabuf_mgr)
+{
+ struct tee_shm_pool *pool;
+
+ if (!check_mgr_ops(priv_mgr) || !check_mgr_ops(dmabuf_mgr))
+ return ERR_PTR(-EINVAL);
+
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool)
+ return ERR_PTR(-ENOMEM);
+
+ pool->private_mgr = priv_mgr;
+ pool->dma_buf_mgr = dmabuf_mgr;
+
+ return pool;
+}
+EXPORT_SYMBOL_GPL(tee_shm_pool_alloc);
+
+/**
+ * tee_shm_pool_free() - Free a shared memory pool
+ * @pool: The shared memory pool to free
+ *
+ * There must be no remaining shared memory allocated from this pool when
+ * this function is called.
+ */
+void tee_shm_pool_free(struct tee_shm_pool *pool)
+{
+ if (pool->private_mgr)
+ tee_shm_pool_mgr_destroy(pool->private_mgr);
+ if (pool->dma_buf_mgr)
+ tee_shm_pool_mgr_destroy(pool->dma_buf_mgr);
+ kfree(pool);
+}
+EXPORT_SYMBOL_GPL(tee_shm_pool_free);
diff --git a/drivers/thermal/hisi_thermal.c b/drivers/thermal/hisi_thermal.c
index c5285ed34fdd..2d855a96cdd9 100644
--- a/drivers/thermal/hisi_thermal.c
+++ b/drivers/thermal/hisi_thermal.c
@@ -23,243 +23,450 @@
#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/io.h>
+#include <linux/of_device.h>
#include "thermal_core.h"
-#define TEMP0_TH (0x4)
-#define TEMP0_RST_TH (0x8)
-#define TEMP0_CFG (0xC)
-#define TEMP0_EN (0x10)
-#define TEMP0_INT_EN (0x14)
-#define TEMP0_INT_CLR (0x18)
-#define TEMP0_RST_MSK (0x1C)
-#define TEMP0_VALUE (0x28)
-
-#define HISI_TEMP_BASE (-60000)
-#define HISI_TEMP_RESET (100000)
-#define HISI_TEMP_STEP (784)
-
-#define HISI_MAX_SENSORS 4
+#define HI6220_TEMP0_LAG (0x0)
+#define HI6220_TEMP0_TH (0x4)
+#define HI6220_TEMP0_RST_TH (0x8)
+#define HI6220_TEMP0_CFG (0xC)
+#define HI6220_TEMP0_CFG_SS_MSK (0xF000)
+#define HI6220_TEMP0_CFG_HDAK_MSK (0x30)
+#define HI6220_TEMP0_EN (0x10)
+#define HI6220_TEMP0_INT_EN (0x14)
+#define HI6220_TEMP0_INT_CLR (0x18)
+#define HI6220_TEMP0_RST_MSK (0x1C)
+#define HI6220_TEMP0_VALUE (0x28)
+
+#define HI3660_OFFSET(chan) ((chan) * 0x40)
+#define HI3660_TEMP(chan) (HI3660_OFFSET(chan) + 0x1C)
+#define HI3660_TH(chan) (HI3660_OFFSET(chan) + 0x20)
+#define HI3660_LAG(chan) (HI3660_OFFSET(chan) + 0x28)
+#define HI3660_INT_EN(chan) (HI3660_OFFSET(chan) + 0x2C)
+#define HI3660_INT_CLR(chan) (HI3660_OFFSET(chan) + 0x30)
+
+#define HI6220_TEMP_BASE (-60000)
+#define HI6220_TEMP_RESET (100000)
+#define HI6220_TEMP_STEP (785)
+#define HI6220_TEMP_LAG (3500)
+
+#define HI3660_TEMP_BASE (-63780)
+#define HI3660_TEMP_STEP (205)
+#define HI3660_TEMP_LAG (4000)
+
+#define HI6220_DEFAULT_SENSOR 2
+#define HI3660_DEFAULT_SENSOR 1
struct hisi_thermal_sensor {
- struct hisi_thermal_data *thermal;
struct thermal_zone_device *tzd;
-
- long sensor_temp;
uint32_t id;
uint32_t thres_temp;
};
struct hisi_thermal_data {
- struct mutex thermal_lock; /* protects register data */
+ int (*get_temp)(struct hisi_thermal_data *data);
+ int (*enable_sensor)(struct hisi_thermal_data *data);
+ int (*disable_sensor)(struct hisi_thermal_data *data);
+ int (*irq_handler)(struct hisi_thermal_data *data);
struct platform_device *pdev;
struct clk *clk;
- struct hisi_thermal_sensor sensors[HISI_MAX_SENSORS];
-
- int irq, irq_bind_sensor;
- bool irq_enabled;
-
+ struct hisi_thermal_sensor sensor;
void __iomem *regs;
+ int irq;
};
/*
* The temperature computation on the tsensor is as follow:
* Unit: millidegree Celsius
- * Step: 255/200 (0.7843)
+ * Step: 200/255 (0.7843)
* Temperature base: -60°C
*
- * The register is programmed in temperature steps, every step is 784
+ * The register is programmed in temperature steps, every step is 785
* millidegree and begins at -60 000 m°C
*
* The temperature from the steps:
*
- * Temp = TempBase + (steps x 784)
+ * Temp = TempBase + (steps x 785)
*
* and the steps from the temperature:
*
- * steps = (Temp - TempBase) / 784
+ * steps = (Temp - TempBase) / 785
*
*/
-static inline int hisi_thermal_step_to_temp(int step)
+static inline int hi6220_thermal_step_to_temp(int step)
{
- return HISI_TEMP_BASE + (step * HISI_TEMP_STEP);
+ return HI6220_TEMP_BASE + (step * HI6220_TEMP_STEP);
}
-static inline long hisi_thermal_temp_to_step(long temp)
+static inline int hi6220_thermal_temp_to_step(int temp)
{
- return (temp - HISI_TEMP_BASE) / HISI_TEMP_STEP;
+ return DIV_ROUND_UP(temp - HI6220_TEMP_BASE, HI6220_TEMP_STEP);
}
-static inline long hisi_thermal_round_temp(int temp)
+/*
+ * for Hi3660,
+ * Step: 189/922 (0.205)
+ * Temperature base: -63.780°C
+ *
+ * The register is programmed in temperature steps, every step is 205
+ * millidegree and begins at -63 780 m°C
+ */
+static inline int hi3660_thermal_step_to_temp(int step)
{
- return hisi_thermal_step_to_temp(
- hisi_thermal_temp_to_step(temp));
+ return HI3660_TEMP_BASE + step * HI3660_TEMP_STEP;
}
-static long hisi_thermal_get_sensor_temp(struct hisi_thermal_data *data,
- struct hisi_thermal_sensor *sensor)
+static inline int hi3660_thermal_temp_to_step(int temp)
{
- long val;
+ return DIV_ROUND_UP(temp - HI3660_TEMP_BASE, HI3660_TEMP_STEP);
+}
- mutex_lock(&data->thermal_lock);
+/*
+ * The lag register contains 5 bits encoding the temperature in steps.
+ *
+ * Each time the temperature crosses the threshold boundary, an
+ * interrupt is raised. It could be when the temperature is going
+ * above the threshold or below. However, if the temperature is
+ * fluctuating around this value due to the load, we can receive
+ * several interrupts which may not desired.
+ *
+ * We can setup a temperature representing the delta between the
+ * threshold and the current temperature when the temperature is
+ * decreasing.
+ *
+ * For instance: the lag register is 5°C, the threshold is 65°C, when
+ * the temperature reaches 65°C an interrupt is raised and when the
+ * temperature decrease to 65°C - 5°C another interrupt is raised.
+ *
+ * A very short lag can lead to an interrupt storm, a long lag
+ * increase the latency to react to the temperature changes. In our
+ * case, that is not really a problem as we are polling the
+ * temperature.
+ *
+ * [0:4] : lag register
+ *
+ * The temperature is coded in steps, cf. HI6220_TEMP_STEP.
+ *
+ * Min : 0x00 : 0.0 °C
+ * Max : 0x1F : 24.3 °C
+ *
+ * The 'value' parameter is in milliCelsius.
+ */
+static inline void hi6220_thermal_set_lag(void __iomem *addr, int value)
+{
+ writel(DIV_ROUND_UP(value, HI6220_TEMP_STEP) & 0x1F,
+ addr + HI6220_TEMP0_LAG);
+}
- /* disable interrupt */
- writel(0x0, data->regs + TEMP0_INT_EN);
- writel(0x1, data->regs + TEMP0_INT_CLR);
+static inline void hi6220_thermal_alarm_clear(void __iomem *addr, int value)
+{
+ writel(value, addr + HI6220_TEMP0_INT_CLR);
+}
- /* disable module firstly */
- writel(0x0, data->regs + TEMP0_EN);
+static inline void hi6220_thermal_alarm_enable(void __iomem *addr, int value)
+{
+ writel(value, addr + HI6220_TEMP0_INT_EN);
+}
- /* select sensor id */
- writel((sensor->id << 12), data->regs + TEMP0_CFG);
+static inline void hi6220_thermal_alarm_set(void __iomem *addr, int temp)
+{
+ writel(hi6220_thermal_temp_to_step(temp) | 0x0FFFFFF00,
+ addr + HI6220_TEMP0_TH);
+}
- /* enable module */
- writel(0x1, data->regs + TEMP0_EN);
+static inline void hi6220_thermal_reset_set(void __iomem *addr, int temp)
+{
+ writel(hi6220_thermal_temp_to_step(temp), addr + HI6220_TEMP0_RST_TH);
+}
- usleep_range(3000, 5000);
+static inline void hi6220_thermal_reset_enable(void __iomem *addr, int value)
+{
+ writel(value, addr + HI6220_TEMP0_RST_MSK);
+}
- val = readl(data->regs + TEMP0_VALUE);
- val = hisi_thermal_step_to_temp(val);
+static inline void hi6220_thermal_enable(void __iomem *addr, int value)
+{
+ writel(value, addr + HI6220_TEMP0_EN);
+}
- mutex_unlock(&data->thermal_lock);
+static inline int hi6220_thermal_get_temperature(void __iomem *addr)
+{
+ return hi6220_thermal_step_to_temp(readl(addr + HI6220_TEMP0_VALUE));
+}
- return val;
+/*
+ * [0:6] lag register
+ *
+ * The temperature is coded in steps, cf. HI3660_TEMP_STEP.
+ *
+ * Min : 0x00 : 0.0 °C
+ * Max : 0x7F : 26.0 °C
+ *
+ */
+static inline void hi3660_thermal_set_lag(void __iomem *addr,
+ int id, int value)
+{
+ writel(DIV_ROUND_UP(value, HI3660_TEMP_STEP) & 0x7F,
+ addr + HI3660_LAG(id));
}
-static void hisi_thermal_enable_bind_irq_sensor
- (struct hisi_thermal_data *data)
+static inline void hi3660_thermal_alarm_clear(void __iomem *addr,
+ int id, int value)
{
- struct hisi_thermal_sensor *sensor;
+ writel(value, addr + HI3660_INT_CLR(id));
+}
- mutex_lock(&data->thermal_lock);
+static inline void hi3660_thermal_alarm_enable(void __iomem *addr,
+ int id, int value)
+{
+ writel(value, addr + HI3660_INT_EN(id));
+}
- sensor = &data->sensors[data->irq_bind_sensor];
+static inline void hi3660_thermal_alarm_set(void __iomem *addr,
+ int id, int value)
+{
+ writel(value, addr + HI3660_TH(id));
+}
- /* setting the hdak time */
- writel(0x0, data->regs + TEMP0_CFG);
+static inline int hi3660_thermal_get_temperature(void __iomem *addr, int id)
+{
+ return hi3660_thermal_step_to_temp(readl(addr + HI3660_TEMP(id)));
+}
+
+/*
+ * Temperature configuration register - Sensor selection
+ *
+ * Bits [19:12]
+ *
+ * 0x0: local sensor (default)
+ * 0x1: remote sensor 1 (ACPU cluster 1)
+ * 0x2: remote sensor 2 (ACPU cluster 0)
+ * 0x3: remote sensor 3 (G3D)
+ */
+static inline void hi6220_thermal_sensor_select(void __iomem *addr, int sensor)
+{
+ writel((readl(addr + HI6220_TEMP0_CFG) & ~HI6220_TEMP0_CFG_SS_MSK) |
+ (sensor << 12), addr + HI6220_TEMP0_CFG);
+}
+
+/*
+ * Temperature configuration register - Hdak conversion polling interval
+ *
+ * Bits [5:4]
+ *
+ * 0x0 : 0.768 ms
+ * 0x1 : 6.144 ms
+ * 0x2 : 49.152 ms
+ * 0x3 : 393.216 ms
+ */
+static inline void hi6220_thermal_hdak_set(void __iomem *addr, int value)
+{
+ writel((readl(addr + HI6220_TEMP0_CFG) & ~HI6220_TEMP0_CFG_HDAK_MSK) |
+ (value << 4), addr + HI6220_TEMP0_CFG);
+}
+
+static int hi6220_thermal_irq_handler(struct hisi_thermal_data *data)
+{
+ hi6220_thermal_alarm_clear(data->regs, 1);
+ return 0;
+}
+
+static int hi3660_thermal_irq_handler(struct hisi_thermal_data *data)
+{
+ hi3660_thermal_alarm_clear(data->regs, data->sensor.id, 1);
+ return 0;
+}
+
+static int hi6220_thermal_get_temp(struct hisi_thermal_data *data)
+{
+ return hi6220_thermal_get_temperature(data->regs);
+}
+
+static int hi3660_thermal_get_temp(struct hisi_thermal_data *data)
+{
+ return hi3660_thermal_get_temperature(data->regs, data->sensor.id);
+}
+
+static int hi6220_thermal_disable_sensor(struct hisi_thermal_data *data)
+{
+ /* disable sensor module */
+ hi6220_thermal_enable(data->regs, 0);
+ hi6220_thermal_alarm_enable(data->regs, 0);
+ hi6220_thermal_reset_enable(data->regs, 0);
+
+ clk_disable_unprepare(data->clk);
+
+ return 0;
+}
+
+static int hi3660_thermal_disable_sensor(struct hisi_thermal_data *data)
+{
+ /* disable sensor module */
+ hi3660_thermal_alarm_enable(data->regs, data->sensor.id, 0);
+ return 0;
+}
+
+static int hi6220_thermal_enable_sensor(struct hisi_thermal_data *data)
+{
+ struct hisi_thermal_sensor *sensor = &data->sensor;
+ int ret;
+
+ /* enable clock for tsensor */
+ ret = clk_prepare_enable(data->clk);
+ if (ret)
+ return ret;
/* disable module firstly */
- writel(0x0, data->regs + TEMP0_RST_MSK);
- writel(0x0, data->regs + TEMP0_EN);
+ hi6220_thermal_reset_enable(data->regs, 0);
+ hi6220_thermal_enable(data->regs, 0);
/* select sensor id */
- writel((sensor->id << 12), data->regs + TEMP0_CFG);
+ hi6220_thermal_sensor_select(data->regs, sensor->id);
+
+ /* setting the hdak time */
+ hi6220_thermal_hdak_set(data->regs, 0);
+
+ /* setting lag value between current temp and the threshold */
+ hi6220_thermal_set_lag(data->regs, HI6220_TEMP_LAG);
/* enable for interrupt */
- writel(hisi_thermal_temp_to_step(sensor->thres_temp) | 0x0FFFFFF00,
- data->regs + TEMP0_TH);
+ hi6220_thermal_alarm_set(data->regs, sensor->thres_temp);
- writel(hisi_thermal_temp_to_step(HISI_TEMP_RESET),
- data->regs + TEMP0_RST_TH);
+ hi6220_thermal_reset_set(data->regs, HI6220_TEMP_RESET);
/* enable module */
- writel(0x1, data->regs + TEMP0_RST_MSK);
- writel(0x1, data->regs + TEMP0_EN);
+ hi6220_thermal_reset_enable(data->regs, 1);
+ hi6220_thermal_enable(data->regs, 1);
- writel(0x0, data->regs + TEMP0_INT_CLR);
- writel(0x1, data->regs + TEMP0_INT_EN);
+ hi6220_thermal_alarm_clear(data->regs, 0);
+ hi6220_thermal_alarm_enable(data->regs, 1);
- usleep_range(3000, 5000);
-
- mutex_unlock(&data->thermal_lock);
+ return 0;
}
-static void hisi_thermal_disable_sensor(struct hisi_thermal_data *data)
+static int hi3660_thermal_enable_sensor(struct hisi_thermal_data *data)
{
- mutex_lock(&data->thermal_lock);
+ unsigned int value;
+ struct hisi_thermal_sensor *sensor = &data->sensor;
- /* disable sensor module */
- writel(0x0, data->regs + TEMP0_INT_EN);
- writel(0x0, data->regs + TEMP0_RST_MSK);
- writel(0x0, data->regs + TEMP0_EN);
+ /* disable interrupt */
+ hi3660_thermal_alarm_enable(data->regs, sensor->id, 0);
- mutex_unlock(&data->thermal_lock);
-}
+ /* setting lag value between current temp and the threshold */
+ hi3660_thermal_set_lag(data->regs, sensor->id, HI3660_TEMP_LAG);
-static int hisi_thermal_get_temp(void *_sensor, int *temp)
-{
- struct hisi_thermal_sensor *sensor = _sensor;
- struct hisi_thermal_data *data = sensor->thermal;
+ /* set interrupt threshold */
+ value = hi3660_thermal_temp_to_step(sensor->thres_temp);
+ hi3660_thermal_alarm_set(data->regs, sensor->id, value);
- int sensor_id = -1, i;
- long max_temp = 0;
+ /* enable interrupt */
+ hi3660_thermal_alarm_clear(data->regs, sensor->id, 1);
+ hi3660_thermal_alarm_enable(data->regs, sensor->id, 1);
- *temp = hisi_thermal_get_sensor_temp(data, sensor);
+ return 0;
+}
- sensor->sensor_temp = *temp;
+static int hi6220_thermal_probe(struct hisi_thermal_data *data)
+{
+ struct platform_device *pdev = data->pdev;
+ struct device *dev = &pdev->dev;
+ struct resource *res;
+ int ret;
- for (i = 0; i < HISI_MAX_SENSORS; i++) {
- if (!data->sensors[i].tzd)
- continue;
+ data->get_temp = hi6220_thermal_get_temp;
+ data->enable_sensor = hi6220_thermal_enable_sensor;
+ data->disable_sensor = hi6220_thermal_disable_sensor;
+ data->irq_handler = hi6220_thermal_irq_handler;
- if (data->sensors[i].sensor_temp >= max_temp) {
- max_temp = data->sensors[i].sensor_temp;
- sensor_id = i;
- }
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ data->regs = devm_ioremap_resource(dev, res);
+ if (IS_ERR(data->regs)) {
+ dev_err(dev, "failed to get io address\n");
+ return PTR_ERR(data->regs);
}
- /* If no sensor has been enabled, then skip to enable irq */
- if (sensor_id == -1)
- return 0;
-
- mutex_lock(&data->thermal_lock);
- data->irq_bind_sensor = sensor_id;
- mutex_unlock(&data->thermal_lock);
-
- dev_dbg(&data->pdev->dev, "id=%d, irq=%d, temp=%d, thres=%d\n",
- sensor->id, data->irq_enabled, *temp, sensor->thres_temp);
- /*
- * Bind irq to sensor for two cases:
- * Reenable alarm IRQ if temperature below threshold;
- * if irq has been enabled, always set it;
- */
- if (data->irq_enabled) {
- hisi_thermal_enable_bind_irq_sensor(data);
- return 0;
+ data->clk = devm_clk_get(dev, "thermal_clk");
+ if (IS_ERR(data->clk)) {
+ ret = PTR_ERR(data->clk);
+ if (ret != -EPROBE_DEFER)
+ dev_err(dev, "failed to get thermal clk: %d\n", ret);
+ return ret;
}
- if (max_temp < sensor->thres_temp) {
- data->irq_enabled = true;
- hisi_thermal_enable_bind_irq_sensor(data);
- enable_irq(data->irq);
- }
+ data->irq = platform_get_irq(pdev, 0);
+ if (data->irq < 0)
+ return data->irq;
+
+ data->sensor.id = HI6220_DEFAULT_SENSOR;
return 0;
}
-static struct thermal_zone_of_device_ops hisi_of_thermal_ops = {
- .get_temp = hisi_thermal_get_temp,
-};
+static int hi3660_thermal_probe(struct hisi_thermal_data *data)
+{
+ struct platform_device *pdev = data->pdev;
+ struct device *dev = &pdev->dev;
+ struct resource *res;
+
+ data->get_temp = hi3660_thermal_get_temp;
+ data->enable_sensor = hi3660_thermal_enable_sensor;
+ data->disable_sensor = hi3660_thermal_disable_sensor;
+ data->irq_handler = hi3660_thermal_irq_handler;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ data->regs = devm_ioremap_resource(dev, res);
+ if (IS_ERR(data->regs)) {
+ dev_err(dev, "failed to get io address\n");
+ return PTR_ERR(data->regs);
+ }
+
+ data->irq = platform_get_irq(pdev, 0);
+ if (data->irq < 0)
+ return data->irq;
+
+ data->sensor.id = HI3660_DEFAULT_SENSOR;
-static irqreturn_t hisi_thermal_alarm_irq(int irq, void *dev)
+ return 0;
+}
+
+static int hisi_thermal_get_temp(void *__data, int *temp)
{
- struct hisi_thermal_data *data = dev;
+ struct hisi_thermal_data *data = __data;
+ struct hisi_thermal_sensor *sensor = &data->sensor;
+
+ *temp = data->get_temp(data);
- disable_irq_nosync(irq);
- data->irq_enabled = false;
+ dev_dbg(&data->pdev->dev, "id=%d, temp=%d, thres=%d\n",
+ sensor->id, *temp, sensor->thres_temp);
- return IRQ_WAKE_THREAD;
+ return 0;
}
+static const struct thermal_zone_of_device_ops hisi_of_thermal_ops = {
+ .get_temp = hisi_thermal_get_temp,
+};
+
static irqreturn_t hisi_thermal_alarm_irq_thread(int irq, void *dev)
{
struct hisi_thermal_data *data = dev;
- struct hisi_thermal_sensor *sensor;
- int i;
+ struct hisi_thermal_sensor *sensor = &data->sensor;
+ int temp = 0;
- mutex_lock(&data->thermal_lock);
- sensor = &data->sensors[data->irq_bind_sensor];
+ data->irq_handler(data);
- dev_crit(&data->pdev->dev, "THERMAL ALARM: T > %d\n",
- sensor->thres_temp);
- mutex_unlock(&data->thermal_lock);
+ hisi_thermal_get_temp(data, &temp);
- for (i = 0; i < HISI_MAX_SENSORS; i++) {
- if (!data->sensors[i].tzd)
- continue;
+ if (temp >= sensor->thres_temp) {
+ dev_crit(&data->pdev->dev, "THERMAL ALARM: %d > %d\n",
+ temp, sensor->thres_temp);
- thermal_zone_device_update(data->sensors[i].tzd,
+ thermal_zone_device_update(data->sensor.tzd,
THERMAL_EVENT_UNSPECIFIED);
+
+ } else {
+ dev_crit(&data->pdev->dev, "THERMAL ALARM stopped: %d < %d\n",
+ temp, sensor->thres_temp);
}
return IRQ_HANDLED;
@@ -267,17 +474,14 @@ static irqreturn_t hisi_thermal_alarm_irq_thread(int irq, void *dev)
static int hisi_thermal_register_sensor(struct platform_device *pdev,
struct hisi_thermal_data *data,
- struct hisi_thermal_sensor *sensor,
- int index)
+ struct hisi_thermal_sensor *sensor)
{
int ret, i;
const struct thermal_trip *trip;
- sensor->id = index;
- sensor->thermal = data;
-
sensor->tzd = devm_thermal_zone_of_sensor_register(&pdev->dev,
- sensor->id, sensor, &hisi_of_thermal_ops);
+ sensor->id, data,
+ &hisi_of_thermal_ops);
if (IS_ERR(sensor->tzd)) {
ret = PTR_ERR(sensor->tzd);
sensor->tzd = NULL;
@@ -290,7 +494,7 @@ static int hisi_thermal_register_sensor(struct platform_device *pdev,
for (i = 0; i < of_thermal_get_ntrips(sensor->tzd); i++) {
if (trip[i].type == THERMAL_TRIP_PASSIVE) {
- sensor->thres_temp = hisi_thermal_round_temp(trip[i].temperature);
+ sensor->thres_temp = trip[i].temperature;
break;
}
}
@@ -299,7 +503,14 @@ static int hisi_thermal_register_sensor(struct platform_device *pdev,
}
static const struct of_device_id of_hisi_thermal_match[] = {
- { .compatible = "hisilicon,tsensor" },
+ {
+ .compatible = "hisilicon,tsensor",
+ .data = hi6220_thermal_probe
+ },
+ {
+ .compatible = "hisilicon,hi3660-tsensor",
+ .data = hi3660_thermal_probe
+ },
{ /* end */ }
};
MODULE_DEVICE_TABLE(of, of_hisi_thermal_match);
@@ -316,69 +527,51 @@ static void hisi_thermal_toggle_sensor(struct hisi_thermal_sensor *sensor,
static int hisi_thermal_probe(struct platform_device *pdev)
{
struct hisi_thermal_data *data;
- struct resource *res;
- int i;
+ int const (*platform_probe)(struct hisi_thermal_data *);
+ struct device *dev = &pdev->dev;
int ret;
- data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
+ data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
if (!data)
return -ENOMEM;
- mutex_init(&data->thermal_lock);
data->pdev = pdev;
+ platform_set_drvdata(pdev, data);
- res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- data->regs = devm_ioremap_resource(&pdev->dev, res);
- if (IS_ERR(data->regs)) {
- dev_err(&pdev->dev, "failed to get io address\n");
- return PTR_ERR(data->regs);
+ platform_probe = of_device_get_match_data(dev);
+ if (!platform_probe) {
+ dev_err(dev, "failed to get probe func\n");
+ return -EINVAL;
}
- data->irq = platform_get_irq(pdev, 0);
- if (data->irq < 0)
- return data->irq;
-
- platform_set_drvdata(pdev, data);
-
- data->clk = devm_clk_get(&pdev->dev, "thermal_clk");
- if (IS_ERR(data->clk)) {
- ret = PTR_ERR(data->clk);
- if (ret != -EPROBE_DEFER)
- dev_err(&pdev->dev,
- "failed to get thermal clk: %d\n", ret);
+ ret = platform_probe(data);
+ if (ret)
return ret;
- }
- /* enable clock for thermal */
- ret = clk_prepare_enable(data->clk);
+ ret = hisi_thermal_register_sensor(pdev, data,
+ &data->sensor);
if (ret) {
- dev_err(&pdev->dev, "failed to enable thermal clk: %d\n", ret);
+ dev_err(dev, "failed to register thermal sensor: %d\n", ret);
return ret;
}
- hisi_thermal_enable_bind_irq_sensor(data);
- data->irq_enabled = true;
-
- for (i = 0; i < HISI_MAX_SENSORS; ++i) {
- ret = hisi_thermal_register_sensor(pdev, data,
- &data->sensors[i], i);
- if (ret)
- dev_err(&pdev->dev,
- "failed to register thermal sensor: %d\n", ret);
- else
- hisi_thermal_toggle_sensor(&data->sensors[i], true);
+ ret = data->enable_sensor(data);
+ if (ret) {
+ dev_err(dev, "Failed to setup the sensor: %d\n", ret);
+ return ret;
}
- ret = devm_request_threaded_irq(&pdev->dev, data->irq,
- hisi_thermal_alarm_irq,
- hisi_thermal_alarm_irq_thread,
- 0, "hisi_thermal", data);
- if (ret < 0) {
- dev_err(&pdev->dev, "failed to request alarm irq: %d\n", ret);
- return ret;
+ if (data->irq) {
+ ret = devm_request_threaded_irq(dev, data->irq, NULL,
+ hisi_thermal_alarm_irq_thread,
+ IRQF_ONESHOT, "hisi_thermal", data);
+ if (ret < 0) {
+ dev_err(dev, "failed to request alarm irq: %d\n", ret);
+ return ret;
+ }
}
- enable_irq(data->irq);
+ hisi_thermal_toggle_sensor(&data->sensor, true);
return 0;
}
@@ -386,19 +579,11 @@ static int hisi_thermal_probe(struct platform_device *pdev)
static int hisi_thermal_remove(struct platform_device *pdev)
{
struct hisi_thermal_data *data = platform_get_drvdata(pdev);
- int i;
-
- for (i = 0; i < HISI_MAX_SENSORS; i++) {
- struct hisi_thermal_sensor *sensor = &data->sensors[i];
+ struct hisi_thermal_sensor *sensor = &data->sensor;
- if (!sensor->tzd)
- continue;
+ hisi_thermal_toggle_sensor(sensor, false);
- hisi_thermal_toggle_sensor(sensor, false);
- }
-
- hisi_thermal_disable_sensor(data);
- clk_disable_unprepare(data->clk);
+ data->disable_sensor(data);
return 0;
}
@@ -408,10 +593,7 @@ static int hisi_thermal_suspend(struct device *dev)
{
struct hisi_thermal_data *data = dev_get_drvdata(dev);
- hisi_thermal_disable_sensor(data);
- data->irq_enabled = false;
-
- clk_disable_unprepare(data->clk);
+ data->disable_sensor(data);
return 0;
}
@@ -419,16 +601,8 @@ static int hisi_thermal_suspend(struct device *dev)
static int hisi_thermal_resume(struct device *dev)
{
struct hisi_thermal_data *data = dev_get_drvdata(dev);
- int ret;
- ret = clk_prepare_enable(data->clk);
- if (ret)
- return ret;
-
- data->irq_enabled = true;
- hisi_thermal_enable_bind_irq_sensor(data);
-
- return 0;
+ return data->enable_sensor(data);
}
#endif
diff --git a/drivers/tty/serial/kgdboc.c b/drivers/tty/serial/kgdboc.c
index f2b0d8cee8ef..6c0c6c5d0aaa 100644
--- a/drivers/tty/serial/kgdboc.c
+++ b/drivers/tty/serial/kgdboc.c
@@ -230,7 +230,8 @@ static void kgdboc_put_char(u8 chr)
kgdb_tty_line, chr);
}
-static int param_set_kgdboc_var(const char *kmessage, struct kernel_param *kp)
+static int param_set_kgdboc_var(const char *kmessage,
+ const struct kernel_param *kp)
{
size_t len = strlen(kmessage);
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 53e6db8b0330..17c2ee2acb65 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -131,6 +131,9 @@ static void __uart_start(struct tty_struct *tty)
struct uart_state *state = tty->driver_data;
struct uart_port *port = state->uart_port;
+ if (port && port->ops->wake_peer)
+ port->ops->wake_peer(port);
+
if (port && !uart_tx_stopped(port))
port->ops->start_tx(port);
}
diff --git a/drivers/usb/gadget/Kconfig b/drivers/usb/gadget/Kconfig
index f3ee80ece682..f6cce5ac69c5 100644
--- a/drivers/usb/gadget/Kconfig
+++ b/drivers/usb/gadget/Kconfig
@@ -209,6 +209,18 @@ config USB_F_PRINTER
config USB_F_TCM
tristate
+config USB_F_MTP
+ tristate
+
+config USB_F_PTP
+ tristate
+
+config USB_F_AUDIO_SRC
+ tristate
+
+config USB_F_ACC
+ tristate
+
# this first set of drivers all depend on bulk-capable hardware.
config USB_CONFIGFS
@@ -362,6 +374,44 @@ config USB_CONFIGFS_F_FS
implemented in kernel space (for instance Ethernet, serial or
mass storage) and other are implemented in user space.
+config USB_CONFIGFS_F_MTP
+ boolean "MTP gadget"
+ depends on USB_CONFIGFS
+ select USB_F_MTP
+ help
+ USB gadget MTP support
+
+config USB_CONFIGFS_F_PTP
+ boolean "PTP gadget"
+ depends on USB_CONFIGFS && USB_CONFIGFS_F_MTP
+ select USB_F_PTP
+ help
+ USB gadget PTP support
+
+config USB_CONFIGFS_F_ACC
+ boolean "Accessory gadget"
+ depends on USB_CONFIGFS
+ select USB_F_ACC
+ help
+ USB gadget Accessory support
+
+config USB_CONFIGFS_F_AUDIO_SRC
+ boolean "Audio Source gadget"
+ depends on USB_CONFIGFS && USB_CONFIGFS_F_ACC
+ depends on SND
+ select SND_PCM
+ select USB_F_AUDIO_SRC
+ help
+ USB gadget Audio Source support
+
+config USB_CONFIGFS_UEVENT
+ boolean "Uevent notification of Gadget state"
+ depends on USB_CONFIGFS
+ help
+ Enable uevent notifications to userspace when the gadget
+ state changes. The gadget can be in any of the following
+ three states: "CONNECTED/DISCONNECTED/CONFIGURED"
+
config USB_CONFIGFS_F_UAC1
bool "Audio Class 1.0"
depends on USB_CONFIGFS
diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c
index 2c022a08f163..a8b4ca04cc50 100644
--- a/drivers/usb/gadget/composite.c
+++ b/drivers/usb/gadget/composite.c
@@ -1996,6 +1996,12 @@ void composite_disconnect(struct usb_gadget *gadget)
struct usb_composite_dev *cdev = get_gadget_data(gadget);
unsigned long flags;
+ if (cdev == NULL) {
+ WARN(1, "%s: Calling disconnect on a Gadget that is \
+ not connected\n", __func__);
+ return;
+ }
+
/* REVISIT: should we have config and device level
* disconnect callbacks?
*/
diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c
index a5ca409dc97e..b1d22d8c9f7e 100644
--- a/drivers/usb/gadget/configfs.c
+++ b/drivers/usb/gadget/configfs.c
@@ -9,6 +9,31 @@
#include "u_f.h"
#include "u_os_desc.h"
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+#include <linux/platform_device.h>
+#include <linux/kdev_t.h>
+#include <linux/usb/ch9.h>
+
+#ifdef CONFIG_USB_CONFIGFS_F_ACC
+extern int acc_ctrlrequest(struct usb_composite_dev *cdev,
+ const struct usb_ctrlrequest *ctrl);
+void acc_disconnect(void);
+#endif
+static struct class *android_class;
+static struct device *android_device;
+static int index;
+
+struct device *create_function_device(char *name)
+{
+ if (android_device && !IS_ERR(android_device))
+ return device_create(android_class, android_device,
+ MKDEV(0, index++), NULL, name);
+ else
+ return ERR_PTR(-EINVAL);
+}
+EXPORT_SYMBOL_GPL(create_function_device);
+#endif
+
int check_user_usb_string(const char *name,
struct usb_gadget_strings *stringtab_dev)
{
@@ -60,6 +85,12 @@ struct gadget_info {
bool use_os_desc;
char b_vendor_code;
char qw_sign[OS_STRING_QW_SIGN_LEN];
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+ bool connected;
+ bool sw_connected;
+ struct work_struct work;
+ struct device *dev;
+#endif
};
static inline struct gadget_info *to_gadget_info(struct config_item *item)
@@ -265,7 +296,7 @@ static ssize_t gadget_dev_desc_UDC_store(struct config_item *item,
mutex_lock(&gi->lock);
- if (!strlen(name)) {
+ if (!strlen(name) || strcmp(name, "none") == 0) {
ret = unregister_gadget(gi);
if (ret)
goto err;
@@ -1369,6 +1400,60 @@ err_comp_cleanup:
return ret;
}
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+static void android_work(struct work_struct *data)
+{
+ struct gadget_info *gi = container_of(data, struct gadget_info, work);
+ struct usb_composite_dev *cdev = &gi->cdev;
+ char *disconnected[2] = { "USB_STATE=DISCONNECTED", NULL };
+ char *connected[2] = { "USB_STATE=CONNECTED", NULL };
+ char *configured[2] = { "USB_STATE=CONFIGURED", NULL };
+ /* 0-connected 1-configured 2-disconnected*/
+ bool status[3] = { false, false, false };
+ unsigned long flags;
+ bool uevent_sent = false;
+
+ spin_lock_irqsave(&cdev->lock, flags);
+ if (cdev->config)
+ status[1] = true;
+
+ if (gi->connected != gi->sw_connected) {
+ if (gi->connected)
+ status[0] = true;
+ else
+ status[2] = true;
+ gi->sw_connected = gi->connected;
+ }
+ spin_unlock_irqrestore(&cdev->lock, flags);
+
+ if (status[0]) {
+ kobject_uevent_env(&android_device->kobj,
+ KOBJ_CHANGE, connected);
+ pr_info("%s: sent uevent %s\n", __func__, connected[0]);
+ uevent_sent = true;
+ }
+
+ if (status[1]) {
+ kobject_uevent_env(&android_device->kobj,
+ KOBJ_CHANGE, configured);
+ pr_info("%s: sent uevent %s\n", __func__, configured[0]);
+ uevent_sent = true;
+ }
+
+ if (status[2]) {
+ kobject_uevent_env(&android_device->kobj,
+ KOBJ_CHANGE, disconnected);
+ pr_info("%s: sent uevent %s\n", __func__, disconnected[0]);
+ uevent_sent = true;
+ }
+
+ if (!uevent_sent) {
+ pr_info("%s: did not send uevent (%d %d %p)\n", __func__,
+ gi->connected, gi->sw_connected, cdev->config);
+ }
+}
+#endif
+
static void configfs_composite_unbind(struct usb_gadget *gadget)
{
struct usb_composite_dev *cdev;
@@ -1388,14 +1473,91 @@ static void configfs_composite_unbind(struct usb_gadget *gadget)
set_gadget_data(gadget, NULL);
}
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+static int android_setup(struct usb_gadget *gadget,
+ const struct usb_ctrlrequest *c)
+{
+ struct usb_composite_dev *cdev = get_gadget_data(gadget);
+ unsigned long flags;
+ struct gadget_info *gi = container_of(cdev, struct gadget_info, cdev);
+ int value = -EOPNOTSUPP;
+ struct usb_function_instance *fi;
+
+ spin_lock_irqsave(&cdev->lock, flags);
+ if (!gi->connected) {
+ gi->connected = 1;
+ schedule_work(&gi->work);
+ }
+ spin_unlock_irqrestore(&cdev->lock, flags);
+ list_for_each_entry(fi, &gi->available_func, cfs_list) {
+ if (fi != NULL && fi->f != NULL && fi->f->setup != NULL) {
+ value = fi->f->setup(fi->f, c);
+ if (value >= 0)
+ break;
+ }
+ }
+
+#ifdef CONFIG_USB_CONFIGFS_F_ACC
+ if (value < 0)
+ value = acc_ctrlrequest(cdev, c);
+#endif
+
+ if (value < 0)
+ value = composite_setup(gadget, c);
+
+ spin_lock_irqsave(&cdev->lock, flags);
+ if (c->bRequest == USB_REQ_SET_CONFIGURATION &&
+ cdev->config) {
+ schedule_work(&gi->work);
+ }
+ spin_unlock_irqrestore(&cdev->lock, flags);
+
+ return value;
+}
+
+static void android_disconnect(struct usb_gadget *gadget)
+{
+ struct usb_composite_dev *cdev = get_gadget_data(gadget);
+ struct gadget_info *gi = container_of(cdev, struct gadget_info, cdev);
+
+ /* FIXME: There's a race between usb_gadget_udc_stop() which is likely
+ * to set the gadget driver to NULL in the udc driver and this drivers
+ * gadget disconnect fn which likely checks for the gadget driver to
+ * be a null ptr. It happens that unbind (doing set_gadget_data(NULL))
+ * is called before the gadget driver is set to NULL and the udc driver
+ * calls disconnect fn which results in cdev being a null ptr.
+ */
+ if (cdev == NULL) {
+ WARN(1, "%s: gadget driver already disconnected\n", __func__);
+ return;
+ }
+
+ /* accessory HID support can be active while the
+ accessory function is not actually enabled,
+ so we need to inform it when we are disconnected.
+ */
+
+#ifdef CONFIG_USB_CONFIGFS_F_ACC
+ acc_disconnect();
+#endif
+ gi->connected = 0;
+ schedule_work(&gi->work);
+ composite_disconnect(gadget);
+}
+#endif
+
static const struct usb_gadget_driver configfs_driver_template = {
.bind = configfs_composite_bind,
.unbind = configfs_composite_unbind,
-
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+ .setup = android_setup,
+ .reset = android_disconnect,
+ .disconnect = android_disconnect,
+#else
.setup = composite_setup,
.reset = composite_disconnect,
.disconnect = composite_disconnect,
-
+#endif
.suspend = composite_suspend,
.resume = composite_resume,
@@ -1407,6 +1569,89 @@ static const struct usb_gadget_driver configfs_driver_template = {
.match_existing_only = 1,
};
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+static ssize_t state_show(struct device *pdev, struct device_attribute *attr,
+ char *buf)
+{
+ struct gadget_info *dev = dev_get_drvdata(pdev);
+ struct usb_composite_dev *cdev;
+ char *state = "DISCONNECTED";
+ unsigned long flags;
+
+ if (!dev)
+ goto out;
+
+ cdev = &dev->cdev;
+
+ if (!cdev)
+ goto out;
+
+ spin_lock_irqsave(&cdev->lock, flags);
+ if (cdev->config)
+ state = "CONFIGURED";
+ else if (dev->connected)
+ state = "CONNECTED";
+ spin_unlock_irqrestore(&cdev->lock, flags);
+out:
+ return sprintf(buf, "%s\n", state);
+}
+
+static DEVICE_ATTR(state, S_IRUGO, state_show, NULL);
+
+static struct device_attribute *android_usb_attributes[] = {
+ &dev_attr_state,
+ NULL
+};
+
+static int android_device_create(struct gadget_info *gi)
+{
+ struct device_attribute **attrs;
+ struct device_attribute *attr;
+
+ INIT_WORK(&gi->work, android_work);
+ android_device = device_create(android_class, NULL,
+ MKDEV(0, 0), NULL, "android0");
+ if (IS_ERR(android_device))
+ return PTR_ERR(android_device);
+
+ dev_set_drvdata(android_device, gi);
+
+ attrs = android_usb_attributes;
+ while ((attr = *attrs++)) {
+ int err;
+
+ err = device_create_file(android_device, attr);
+ if (err) {
+ device_destroy(android_device->class,
+ android_device->devt);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static void android_device_destroy(void)
+{
+ struct device_attribute **attrs;
+ struct device_attribute *attr;
+
+ attrs = android_usb_attributes;
+ while ((attr = *attrs++))
+ device_remove_file(android_device, attr);
+ device_destroy(android_device->class, android_device->devt);
+}
+#else
+static inline int android_device_create(struct gadget_info *gi)
+{
+ return 0;
+}
+
+static inline void android_device_destroy(void)
+{
+}
+#endif
+
static struct config_group *gadgets_make(
struct config_group *group,
const char *name)
@@ -1458,7 +1703,11 @@ static struct config_group *gadgets_make(
if (!gi->composite.gadget_driver.function)
goto err;
+ if (android_device_create(gi) < 0)
+ goto err;
+
return &gi->group;
+
err:
kfree(gi);
return ERR_PTR(-ENOMEM);
@@ -1467,6 +1716,7 @@ err:
static void gadgets_drop(struct config_group *group, struct config_item *item)
{
config_item_put(item);
+ android_device_destroy();
}
static struct configfs_group_operations gadgets_ops = {
@@ -1506,6 +1756,13 @@ static int __init gadget_cfs_init(void)
config_group_init(&gadget_subsys.su_group);
ret = configfs_register_subsystem(&gadget_subsys);
+
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+ android_class = class_create(THIS_MODULE, "android_usb");
+ if (IS_ERR(android_class))
+ return PTR_ERR(android_class);
+#endif
+
return ret;
}
module_init(gadget_cfs_init);
@@ -1513,5 +1770,10 @@ module_init(gadget_cfs_init);
static void __exit gadget_cfs_exit(void)
{
configfs_unregister_subsystem(&gadget_subsys);
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+ if (!IS_ERR(android_class))
+ class_destroy(android_class);
+#endif
+
}
module_exit(gadget_cfs_exit);
diff --git a/drivers/usb/gadget/function/Makefile b/drivers/usb/gadget/function/Makefile
index cb8c225e8549..78682d5e4dc7 100644
--- a/drivers/usb/gadget/function/Makefile
+++ b/drivers/usb/gadget/function/Makefile
@@ -46,3 +46,11 @@ usb_f_printer-y := f_printer.o
obj-$(CONFIG_USB_F_PRINTER) += usb_f_printer.o
usb_f_tcm-y := f_tcm.o
obj-$(CONFIG_USB_F_TCM) += usb_f_tcm.o
+usb_f_mtp-y := f_mtp.o
+obj-$(CONFIG_USB_F_MTP) += usb_f_mtp.o
+usb_f_ptp-y := f_ptp.o
+obj-$(CONFIG_USB_F_PTP) += usb_f_ptp.o
+usb_f_audio_source-y := f_audio_source.o
+obj-$(CONFIG_USB_F_AUDIO_SRC) += usb_f_audio_source.o
+usb_f_accessory-y := f_accessory.o
+obj-$(CONFIG_USB_F_ACC) += usb_f_accessory.o
diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c
new file mode 100644
index 000000000000..7aa2656a2328
--- /dev/null
+++ b/drivers/usb/gadget/function/f_accessory.c
@@ -0,0 +1,1352 @@
+/*
+ * Gadget Function Driver for Android USB accessories
+ *
+ * Copyright (C) 2011 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/* #define DEBUG */
+/* #define VERBOSE_DEBUG */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+
+#include <linux/hid.h>
+#include <linux/hiddev.h>
+#include <linux/usb.h>
+#include <linux/usb/ch9.h>
+#include <linux/usb/f_accessory.h>
+
+#include <linux/configfs.h>
+#include <linux/usb/composite.h>
+
+#define MAX_INST_NAME_LEN 40
+#define BULK_BUFFER_SIZE 16384
+#define ACC_STRING_SIZE 256
+
+#define PROTOCOL_VERSION 2
+
+/* String IDs */
+#define INTERFACE_STRING_INDEX 0
+
+/* number of tx and rx requests to allocate */
+#define TX_REQ_MAX 4
+#define RX_REQ_MAX 2
+
+struct acc_hid_dev {
+ struct list_head list;
+ struct hid_device *hid;
+ struct acc_dev *dev;
+ /* accessory defined ID */
+ int id;
+ /* HID report descriptor */
+ u8 *report_desc;
+ /* length of HID report descriptor */
+ int report_desc_len;
+ /* number of bytes of report_desc we have received so far */
+ int report_desc_offset;
+};
+
+struct acc_dev {
+ struct usb_function function;
+ struct usb_composite_dev *cdev;
+ spinlock_t lock;
+
+ struct usb_ep *ep_in;
+ struct usb_ep *ep_out;
+
+ /* online indicates state of function_set_alt & function_unbind
+ * set to 1 when we connect
+ */
+ int online:1;
+
+ /* disconnected indicates state of open & release
+ * Set to 1 when we disconnect.
+ * Not cleared until our file is closed.
+ */
+ int disconnected:1;
+
+ /* strings sent by the host */
+ char manufacturer[ACC_STRING_SIZE];
+ char model[ACC_STRING_SIZE];
+ char description[ACC_STRING_SIZE];
+ char version[ACC_STRING_SIZE];
+ char uri[ACC_STRING_SIZE];
+ char serial[ACC_STRING_SIZE];
+
+ /* for acc_complete_set_string */
+ int string_index;
+
+ /* set to 1 if we have a pending start request */
+ int start_requested;
+
+ int audio_mode;
+
+ /* synchronize access to our device file */
+ atomic_t open_excl;
+
+ struct list_head tx_idle;
+
+ wait_queue_head_t read_wq;
+ wait_queue_head_t write_wq;
+ struct usb_request *rx_req[RX_REQ_MAX];
+ int rx_done;
+
+ /* delayed work for handling ACCESSORY_START */
+ struct delayed_work start_work;
+
+ /* worker for registering and unregistering hid devices */
+ struct work_struct hid_work;
+
+ /* list of active HID devices */
+ struct list_head hid_list;
+
+ /* list of new HID devices to register */
+ struct list_head new_hid_list;
+
+ /* list of dead HID devices to unregister */
+ struct list_head dead_hid_list;
+};
+
+static struct usb_interface_descriptor acc_interface_desc = {
+ .bLength = USB_DT_INTERFACE_SIZE,
+ .bDescriptorType = USB_DT_INTERFACE,
+ .bInterfaceNumber = 0,
+ .bNumEndpoints = 2,
+ .bInterfaceClass = USB_CLASS_VENDOR_SPEC,
+ .bInterfaceSubClass = USB_SUBCLASS_VENDOR_SPEC,
+ .bInterfaceProtocol = 0,
+};
+
+static struct usb_endpoint_descriptor acc_highspeed_in_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = __constant_cpu_to_le16(512),
+};
+
+static struct usb_endpoint_descriptor acc_highspeed_out_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_OUT,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = __constant_cpu_to_le16(512),
+};
+
+static struct usb_endpoint_descriptor acc_fullspeed_in_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+};
+
+static struct usb_endpoint_descriptor acc_fullspeed_out_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_OUT,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+};
+
+static struct usb_descriptor_header *fs_acc_descs[] = {
+ (struct usb_descriptor_header *) &acc_interface_desc,
+ (struct usb_descriptor_header *) &acc_fullspeed_in_desc,
+ (struct usb_descriptor_header *) &acc_fullspeed_out_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *hs_acc_descs[] = {
+ (struct usb_descriptor_header *) &acc_interface_desc,
+ (struct usb_descriptor_header *) &acc_highspeed_in_desc,
+ (struct usb_descriptor_header *) &acc_highspeed_out_desc,
+ NULL,
+};
+
+static struct usb_string acc_string_defs[] = {
+ [INTERFACE_STRING_INDEX].s = "Android Accessory Interface",
+ { }, /* end of list */
+};
+
+static struct usb_gadget_strings acc_string_table = {
+ .language = 0x0409, /* en-US */
+ .strings = acc_string_defs,
+};
+
+static struct usb_gadget_strings *acc_strings[] = {
+ &acc_string_table,
+ NULL,
+};
+
+/* temporary variable used between acc_open() and acc_gadget_bind() */
+static struct acc_dev *_acc_dev;
+
+struct acc_instance {
+ struct usb_function_instance func_inst;
+ const char *name;
+};
+
+static inline struct acc_dev *func_to_dev(struct usb_function *f)
+{
+ return container_of(f, struct acc_dev, function);
+}
+
+static struct usb_request *acc_request_new(struct usb_ep *ep, int buffer_size)
+{
+ struct usb_request *req = usb_ep_alloc_request(ep, GFP_KERNEL);
+
+ if (!req)
+ return NULL;
+
+ /* now allocate buffers for the requests */
+ req->buf = kmalloc(buffer_size, GFP_KERNEL);
+ if (!req->buf) {
+ usb_ep_free_request(ep, req);
+ return NULL;
+ }
+
+ return req;
+}
+
+static void acc_request_free(struct usb_request *req, struct usb_ep *ep)
+{
+ if (req) {
+ kfree(req->buf);
+ usb_ep_free_request(ep, req);
+ }
+}
+
+/* add a request to the tail of a list */
+static void req_put(struct acc_dev *dev, struct list_head *head,
+ struct usb_request *req)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ list_add_tail(&req->list, head);
+ spin_unlock_irqrestore(&dev->lock, flags);
+}
+
+/* remove a request from the head of a list */
+static struct usb_request *req_get(struct acc_dev *dev, struct list_head *head)
+{
+ unsigned long flags;
+ struct usb_request *req;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ if (list_empty(head)) {
+ req = 0;
+ } else {
+ req = list_first_entry(head, struct usb_request, list);
+ list_del(&req->list);
+ }
+ spin_unlock_irqrestore(&dev->lock, flags);
+ return req;
+}
+
+static void acc_set_disconnected(struct acc_dev *dev)
+{
+ dev->disconnected = 1;
+}
+
+static void acc_complete_in(struct usb_ep *ep, struct usb_request *req)
+{
+ struct acc_dev *dev = _acc_dev;
+
+ if (req->status == -ESHUTDOWN) {
+ pr_debug("acc_complete_in set disconnected");
+ acc_set_disconnected(dev);
+ }
+
+ req_put(dev, &dev->tx_idle, req);
+
+ wake_up(&dev->write_wq);
+}
+
+static void acc_complete_out(struct usb_ep *ep, struct usb_request *req)
+{
+ struct acc_dev *dev = _acc_dev;
+
+ dev->rx_done = 1;
+ if (req->status == -ESHUTDOWN) {
+ pr_debug("acc_complete_out set disconnected");
+ acc_set_disconnected(dev);
+ }
+
+ wake_up(&dev->read_wq);
+}
+
+static void acc_complete_set_string(struct usb_ep *ep, struct usb_request *req)
+{
+ struct acc_dev *dev = ep->driver_data;
+ char *string_dest = NULL;
+ int length = req->actual;
+
+ if (req->status != 0) {
+ pr_err("acc_complete_set_string, err %d\n", req->status);
+ return;
+ }
+
+ switch (dev->string_index) {
+ case ACCESSORY_STRING_MANUFACTURER:
+ string_dest = dev->manufacturer;
+ break;
+ case ACCESSORY_STRING_MODEL:
+ string_dest = dev->model;
+ break;
+ case ACCESSORY_STRING_DESCRIPTION:
+ string_dest = dev->description;
+ break;
+ case ACCESSORY_STRING_VERSION:
+ string_dest = dev->version;
+ break;
+ case ACCESSORY_STRING_URI:
+ string_dest = dev->uri;
+ break;
+ case ACCESSORY_STRING_SERIAL:
+ string_dest = dev->serial;
+ break;
+ }
+ if (string_dest) {
+ unsigned long flags;
+
+ if (length >= ACC_STRING_SIZE)
+ length = ACC_STRING_SIZE - 1;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ memcpy(string_dest, req->buf, length);
+ /* ensure zero termination */
+ string_dest[length] = 0;
+ spin_unlock_irqrestore(&dev->lock, flags);
+ } else {
+ pr_err("unknown accessory string index %d\n",
+ dev->string_index);
+ }
+}
+
+static void acc_complete_set_hid_report_desc(struct usb_ep *ep,
+ struct usb_request *req)
+{
+ struct acc_hid_dev *hid = req->context;
+ struct acc_dev *dev = hid->dev;
+ int length = req->actual;
+
+ if (req->status != 0) {
+ pr_err("acc_complete_set_hid_report_desc, err %d\n",
+ req->status);
+ return;
+ }
+
+ memcpy(hid->report_desc + hid->report_desc_offset, req->buf, length);
+ hid->report_desc_offset += length;
+ if (hid->report_desc_offset == hid->report_desc_len) {
+ /* After we have received the entire report descriptor
+ * we schedule work to initialize the HID device
+ */
+ schedule_work(&dev->hid_work);
+ }
+}
+
+static void acc_complete_send_hid_event(struct usb_ep *ep,
+ struct usb_request *req)
+{
+ struct acc_hid_dev *hid = req->context;
+ int length = req->actual;
+
+ if (req->status != 0) {
+ pr_err("acc_complete_send_hid_event, err %d\n", req->status);
+ return;
+ }
+
+ hid_report_raw_event(hid->hid, HID_INPUT_REPORT, req->buf, length, 1);
+}
+
+static int acc_hid_parse(struct hid_device *hid)
+{
+ struct acc_hid_dev *hdev = hid->driver_data;
+
+ hid_parse_report(hid, hdev->report_desc, hdev->report_desc_len);
+ return 0;
+}
+
+static int acc_hid_start(struct hid_device *hid)
+{
+ return 0;
+}
+
+static void acc_hid_stop(struct hid_device *hid)
+{
+}
+
+static int acc_hid_open(struct hid_device *hid)
+{
+ return 0;
+}
+
+static void acc_hid_close(struct hid_device *hid)
+{
+}
+
+static int acc_hid_raw_request(struct hid_device *hid, unsigned char reportnum,
+ __u8 *buf, size_t len, unsigned char rtype, int reqtype)
+{
+ return 0;
+}
+
+static struct hid_ll_driver acc_hid_ll_driver = {
+ .parse = acc_hid_parse,
+ .start = acc_hid_start,
+ .stop = acc_hid_stop,
+ .open = acc_hid_open,
+ .close = acc_hid_close,
+ .raw_request = acc_hid_raw_request,
+};
+
+static struct acc_hid_dev *acc_hid_new(struct acc_dev *dev,
+ int id, int desc_len)
+{
+ struct acc_hid_dev *hdev;
+
+ hdev = kzalloc(sizeof(*hdev), GFP_ATOMIC);
+ if (!hdev)
+ return NULL;
+ hdev->report_desc = kzalloc(desc_len, GFP_ATOMIC);
+ if (!hdev->report_desc) {
+ kfree(hdev);
+ return NULL;
+ }
+ hdev->dev = dev;
+ hdev->id = id;
+ hdev->report_desc_len = desc_len;
+
+ return hdev;
+}
+
+static struct acc_hid_dev *acc_hid_get(struct list_head *list, int id)
+{
+ struct acc_hid_dev *hid;
+
+ list_for_each_entry(hid, list, list) {
+ if (hid->id == id)
+ return hid;
+ }
+ return NULL;
+}
+
+static int acc_register_hid(struct acc_dev *dev, int id, int desc_length)
+{
+ struct acc_hid_dev *hid;
+ unsigned long flags;
+
+ /* report descriptor length must be > 0 */
+ if (desc_length <= 0)
+ return -EINVAL;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ /* replace HID if one already exists with this ID */
+ hid = acc_hid_get(&dev->hid_list, id);
+ if (!hid)
+ hid = acc_hid_get(&dev->new_hid_list, id);
+ if (hid)
+ list_move(&hid->list, &dev->dead_hid_list);
+
+ hid = acc_hid_new(dev, id, desc_length);
+ if (!hid) {
+ spin_unlock_irqrestore(&dev->lock, flags);
+ return -ENOMEM;
+ }
+
+ list_add(&hid->list, &dev->new_hid_list);
+ spin_unlock_irqrestore(&dev->lock, flags);
+
+ /* schedule work to register the HID device */
+ schedule_work(&dev->hid_work);
+ return 0;
+}
+
+static int acc_unregister_hid(struct acc_dev *dev, int id)
+{
+ struct acc_hid_dev *hid;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ hid = acc_hid_get(&dev->hid_list, id);
+ if (!hid)
+ hid = acc_hid_get(&dev->new_hid_list, id);
+ if (!hid) {
+ spin_unlock_irqrestore(&dev->lock, flags);
+ return -EINVAL;
+ }
+
+ list_move(&hid->list, &dev->dead_hid_list);
+ spin_unlock_irqrestore(&dev->lock, flags);
+
+ schedule_work(&dev->hid_work);
+ return 0;
+}
+
+static int create_bulk_endpoints(struct acc_dev *dev,
+ struct usb_endpoint_descriptor *in_desc,
+ struct usb_endpoint_descriptor *out_desc)
+{
+ struct usb_composite_dev *cdev = dev->cdev;
+ struct usb_request *req;
+ struct usb_ep *ep;
+ int i;
+
+ DBG(cdev, "create_bulk_endpoints dev: %p\n", dev);
+
+ ep = usb_ep_autoconfig(cdev->gadget, in_desc);
+ if (!ep) {
+ DBG(cdev, "usb_ep_autoconfig for ep_in failed\n");
+ return -ENODEV;
+ }
+ DBG(cdev, "usb_ep_autoconfig for ep_in got %s\n", ep->name);
+ ep->driver_data = dev; /* claim the endpoint */
+ dev->ep_in = ep;
+
+ ep = usb_ep_autoconfig(cdev->gadget, out_desc);
+ if (!ep) {
+ DBG(cdev, "usb_ep_autoconfig for ep_out failed\n");
+ return -ENODEV;
+ }
+ DBG(cdev, "usb_ep_autoconfig for ep_out got %s\n", ep->name);
+ ep->driver_data = dev; /* claim the endpoint */
+ dev->ep_out = ep;
+
+ /* now allocate requests for our endpoints */
+ for (i = 0; i < TX_REQ_MAX; i++) {
+ req = acc_request_new(dev->ep_in, BULK_BUFFER_SIZE);
+ if (!req)
+ goto fail;
+ req->complete = acc_complete_in;
+ req_put(dev, &dev->tx_idle, req);
+ }
+ for (i = 0; i < RX_REQ_MAX; i++) {
+ req = acc_request_new(dev->ep_out, BULK_BUFFER_SIZE);
+ if (!req)
+ goto fail;
+ req->complete = acc_complete_out;
+ dev->rx_req[i] = req;
+ }
+
+ return 0;
+
+fail:
+ pr_err("acc_bind() could not allocate requests\n");
+ while ((req = req_get(dev, &dev->tx_idle)))
+ acc_request_free(req, dev->ep_in);
+ for (i = 0; i < RX_REQ_MAX; i++)
+ acc_request_free(dev->rx_req[i], dev->ep_out);
+ return -1;
+}
+
+static ssize_t acc_read(struct file *fp, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct acc_dev *dev = fp->private_data;
+ struct usb_request *req;
+ ssize_t r = count;
+ unsigned xfer;
+ int ret = 0;
+
+ pr_debug("acc_read(%zu)\n", count);
+
+ if (dev->disconnected) {
+ pr_debug("acc_read disconnected");
+ return -ENODEV;
+ }
+
+ if (count > BULK_BUFFER_SIZE)
+ count = BULK_BUFFER_SIZE;
+
+ /* we will block until we're online */
+ pr_debug("acc_read: waiting for online\n");
+ ret = wait_event_interruptible(dev->read_wq, dev->online);
+ if (ret < 0) {
+ r = ret;
+ goto done;
+ }
+
+ if (dev->rx_done) {
+ // last req cancelled. try to get it.
+ req = dev->rx_req[0];
+ goto copy_data;
+ }
+
+requeue_req:
+ /* queue a request */
+ req = dev->rx_req[0];
+ req->length = count;
+ dev->rx_done = 0;
+ ret = usb_ep_queue(dev->ep_out, req, GFP_KERNEL);
+ if (ret < 0) {
+ r = -EIO;
+ goto done;
+ } else {
+ pr_debug("rx %p queue\n", req);
+ }
+
+ /* wait for a request to complete */
+ ret = wait_event_interruptible(dev->read_wq, dev->rx_done);
+ if (ret < 0) {
+ r = ret;
+ ret = usb_ep_dequeue(dev->ep_out, req);
+ if (ret != 0) {
+ // cancel failed. There can be a data already received.
+ // it will be retrieved in the next read.
+ pr_debug("acc_read: cancelling failed %d", ret);
+ }
+ goto done;
+ }
+
+copy_data:
+ dev->rx_done = 0;
+ if (dev->online) {
+ /* If we got a 0-len packet, throw it back and try again. */
+ if (req->actual == 0)
+ goto requeue_req;
+
+ pr_debug("rx %p %u\n", req, req->actual);
+ xfer = (req->actual < count) ? req->actual : count;
+ r = xfer;
+ if (copy_to_user(buf, req->buf, xfer))
+ r = -EFAULT;
+ } else
+ r = -EIO;
+
+done:
+ pr_debug("acc_read returning %zd\n", r);
+ return r;
+}
+
+static ssize_t acc_write(struct file *fp, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct acc_dev *dev = fp->private_data;
+ struct usb_request *req = 0;
+ ssize_t r = count;
+ unsigned xfer;
+ int ret;
+
+ pr_debug("acc_write(%zu)\n", count);
+
+ if (!dev->online || dev->disconnected) {
+ pr_debug("acc_write disconnected or not online");
+ return -ENODEV;
+ }
+
+ while (count > 0) {
+ if (!dev->online) {
+ pr_debug("acc_write dev->error\n");
+ r = -EIO;
+ break;
+ }
+
+ /* get an idle tx request to use */
+ req = 0;
+ ret = wait_event_interruptible(dev->write_wq,
+ ((req = req_get(dev, &dev->tx_idle)) || !dev->online));
+ if (!req) {
+ r = ret;
+ break;
+ }
+
+ if (count > BULK_BUFFER_SIZE) {
+ xfer = BULK_BUFFER_SIZE;
+ /* ZLP, They will be more TX requests so not yet. */
+ req->zero = 0;
+ } else {
+ xfer = count;
+ /* If the data length is a multple of the
+ * maxpacket size then send a zero length packet(ZLP).
+ */
+ req->zero = ((xfer % dev->ep_in->maxpacket) == 0);
+ }
+ if (copy_from_user(req->buf, buf, xfer)) {
+ r = -EFAULT;
+ break;
+ }
+
+ req->length = xfer;
+ ret = usb_ep_queue(dev->ep_in, req, GFP_KERNEL);
+ if (ret < 0) {
+ pr_debug("acc_write: xfer error %d\n", ret);
+ r = -EIO;
+ break;
+ }
+
+ buf += xfer;
+ count -= xfer;
+
+ /* zero this so we don't try to free it on error exit */
+ req = 0;
+ }
+
+ if (req)
+ req_put(dev, &dev->tx_idle, req);
+
+ pr_debug("acc_write returning %zd\n", r);
+ return r;
+}
+
+static long acc_ioctl(struct file *fp, unsigned code, unsigned long value)
+{
+ struct acc_dev *dev = fp->private_data;
+ char *src = NULL;
+ int ret;
+
+ switch (code) {
+ case ACCESSORY_GET_STRING_MANUFACTURER:
+ src = dev->manufacturer;
+ break;
+ case ACCESSORY_GET_STRING_MODEL:
+ src = dev->model;
+ break;
+ case ACCESSORY_GET_STRING_DESCRIPTION:
+ src = dev->description;
+ break;
+ case ACCESSORY_GET_STRING_VERSION:
+ src = dev->version;
+ break;
+ case ACCESSORY_GET_STRING_URI:
+ src = dev->uri;
+ break;
+ case ACCESSORY_GET_STRING_SERIAL:
+ src = dev->serial;
+ break;
+ case ACCESSORY_IS_START_REQUESTED:
+ return dev->start_requested;
+ case ACCESSORY_GET_AUDIO_MODE:
+ return dev->audio_mode;
+ }
+ if (!src)
+ return -EINVAL;
+
+ ret = strlen(src) + 1;
+ if (copy_to_user((void __user *)value, src, ret))
+ ret = -EFAULT;
+ return ret;
+}
+
+static int acc_open(struct inode *ip, struct file *fp)
+{
+ printk(KERN_INFO "acc_open\n");
+ if (atomic_xchg(&_acc_dev->open_excl, 1))
+ return -EBUSY;
+
+ _acc_dev->disconnected = 0;
+ fp->private_data = _acc_dev;
+ return 0;
+}
+
+static int acc_release(struct inode *ip, struct file *fp)
+{
+ printk(KERN_INFO "acc_release\n");
+
+ WARN_ON(!atomic_xchg(&_acc_dev->open_excl, 0));
+ /* indicate that we are disconnected
+ * still could be online so don't touch online flag
+ */
+ _acc_dev->disconnected = 1;
+ return 0;
+}
+
+/* file operations for /dev/usb_accessory */
+static const struct file_operations acc_fops = {
+ .owner = THIS_MODULE,
+ .read = acc_read,
+ .write = acc_write,
+ .unlocked_ioctl = acc_ioctl,
+ .open = acc_open,
+ .release = acc_release,
+};
+
+static int acc_hid_probe(struct hid_device *hdev,
+ const struct hid_device_id *id)
+{
+ int ret;
+
+ ret = hid_parse(hdev);
+ if (ret)
+ return ret;
+ return hid_hw_start(hdev, HID_CONNECT_DEFAULT);
+}
+
+static struct miscdevice acc_device = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "usb_accessory",
+ .fops = &acc_fops,
+};
+
+static const struct hid_device_id acc_hid_table[] = {
+ { HID_USB_DEVICE(HID_ANY_ID, HID_ANY_ID) },
+ { }
+};
+
+static struct hid_driver acc_hid_driver = {
+ .name = "USB accessory",
+ .id_table = acc_hid_table,
+ .probe = acc_hid_probe,
+};
+
+static void acc_complete_setup_noop(struct usb_ep *ep, struct usb_request *req)
+{
+ /*
+ * Default no-op function when nothing needs to be done for the
+ * setup request
+ */
+}
+
+int acc_ctrlrequest(struct usb_composite_dev *cdev,
+ const struct usb_ctrlrequest *ctrl)
+{
+ struct acc_dev *dev = _acc_dev;
+ int value = -EOPNOTSUPP;
+ struct acc_hid_dev *hid;
+ int offset;
+ u8 b_requestType = ctrl->bRequestType;
+ u8 b_request = ctrl->bRequest;
+ u16 w_index = le16_to_cpu(ctrl->wIndex);
+ u16 w_value = le16_to_cpu(ctrl->wValue);
+ u16 w_length = le16_to_cpu(ctrl->wLength);
+ unsigned long flags;
+
+/*
+ printk(KERN_INFO "acc_ctrlrequest "
+ "%02x.%02x v%04x i%04x l%u\n",
+ b_requestType, b_request,
+ w_value, w_index, w_length);
+*/
+
+ if (b_requestType == (USB_DIR_OUT | USB_TYPE_VENDOR)) {
+ if (b_request == ACCESSORY_START) {
+ dev->start_requested = 1;
+ schedule_delayed_work(
+ &dev->start_work, msecs_to_jiffies(10));
+ value = 0;
+ cdev->req->complete = acc_complete_setup_noop;
+ } else if (b_request == ACCESSORY_SEND_STRING) {
+ dev->string_index = w_index;
+ cdev->gadget->ep0->driver_data = dev;
+ cdev->req->complete = acc_complete_set_string;
+ value = w_length;
+ } else if (b_request == ACCESSORY_SET_AUDIO_MODE &&
+ w_index == 0 && w_length == 0) {
+ dev->audio_mode = w_value;
+ cdev->req->complete = acc_complete_setup_noop;
+ value = 0;
+ } else if (b_request == ACCESSORY_REGISTER_HID) {
+ cdev->req->complete = acc_complete_setup_noop;
+ value = acc_register_hid(dev, w_value, w_index);
+ } else if (b_request == ACCESSORY_UNREGISTER_HID) {
+ cdev->req->complete = acc_complete_setup_noop;
+ value = acc_unregister_hid(dev, w_value);
+ } else if (b_request == ACCESSORY_SET_HID_REPORT_DESC) {
+ spin_lock_irqsave(&dev->lock, flags);
+ hid = acc_hid_get(&dev->new_hid_list, w_value);
+ spin_unlock_irqrestore(&dev->lock, flags);
+ if (!hid) {
+ value = -EINVAL;
+ goto err;
+ }
+ offset = w_index;
+ if (offset != hid->report_desc_offset
+ || offset + w_length > hid->report_desc_len) {
+ value = -EINVAL;
+ goto err;
+ }
+ cdev->req->context = hid;
+ cdev->req->complete = acc_complete_set_hid_report_desc;
+ value = w_length;
+ } else if (b_request == ACCESSORY_SEND_HID_EVENT) {
+ spin_lock_irqsave(&dev->lock, flags);
+ hid = acc_hid_get(&dev->hid_list, w_value);
+ spin_unlock_irqrestore(&dev->lock, flags);
+ if (!hid) {
+ value = -EINVAL;
+ goto err;
+ }
+ cdev->req->context = hid;
+ cdev->req->complete = acc_complete_send_hid_event;
+ value = w_length;
+ }
+ } else if (b_requestType == (USB_DIR_IN | USB_TYPE_VENDOR)) {
+ if (b_request == ACCESSORY_GET_PROTOCOL) {
+ *((u16 *)cdev->req->buf) = PROTOCOL_VERSION;
+ value = sizeof(u16);
+ cdev->req->complete = acc_complete_setup_noop;
+ /* clear any string left over from a previous session */
+ memset(dev->manufacturer, 0, sizeof(dev->manufacturer));
+ memset(dev->model, 0, sizeof(dev->model));
+ memset(dev->description, 0, sizeof(dev->description));
+ memset(dev->version, 0, sizeof(dev->version));
+ memset(dev->uri, 0, sizeof(dev->uri));
+ memset(dev->serial, 0, sizeof(dev->serial));
+ dev->start_requested = 0;
+ dev->audio_mode = 0;
+ }
+ }
+
+ if (value >= 0) {
+ cdev->req->zero = 0;
+ cdev->req->length = value;
+ value = usb_ep_queue(cdev->gadget->ep0, cdev->req, GFP_ATOMIC);
+ if (value < 0)
+ ERROR(cdev, "%s setup response queue error\n",
+ __func__);
+ }
+
+err:
+ if (value == -EOPNOTSUPP)
+ VDBG(cdev,
+ "unknown class-specific control req "
+ "%02x.%02x v%04x i%04x l%u\n",
+ ctrl->bRequestType, ctrl->bRequest,
+ w_value, w_index, w_length);
+ return value;
+}
+EXPORT_SYMBOL_GPL(acc_ctrlrequest);
+
+static int
+__acc_function_bind(struct usb_configuration *c,
+ struct usb_function *f, bool configfs)
+{
+ struct usb_composite_dev *cdev = c->cdev;
+ struct acc_dev *dev = func_to_dev(f);
+ int id;
+ int ret;
+
+ DBG(cdev, "acc_function_bind dev: %p\n", dev);
+
+ if (configfs) {
+ if (acc_string_defs[INTERFACE_STRING_INDEX].id == 0) {
+ ret = usb_string_id(c->cdev);
+ if (ret < 0)
+ return ret;
+ acc_string_defs[INTERFACE_STRING_INDEX].id = ret;
+ acc_interface_desc.iInterface = ret;
+ }
+ dev->cdev = c->cdev;
+ }
+ ret = hid_register_driver(&acc_hid_driver);
+ if (ret)
+ return ret;
+
+ dev->start_requested = 0;
+
+ /* allocate interface ID(s) */
+ id = usb_interface_id(c, f);
+ if (id < 0)
+ return id;
+ acc_interface_desc.bInterfaceNumber = id;
+
+ /* allocate endpoints */
+ ret = create_bulk_endpoints(dev, &acc_fullspeed_in_desc,
+ &acc_fullspeed_out_desc);
+ if (ret)
+ return ret;
+
+ /* support high speed hardware */
+ if (gadget_is_dualspeed(c->cdev->gadget)) {
+ acc_highspeed_in_desc.bEndpointAddress =
+ acc_fullspeed_in_desc.bEndpointAddress;
+ acc_highspeed_out_desc.bEndpointAddress =
+ acc_fullspeed_out_desc.bEndpointAddress;
+ }
+
+ DBG(cdev, "%s speed %s: IN/%s, OUT/%s\n",
+ gadget_is_dualspeed(c->cdev->gadget) ? "dual" : "full",
+ f->name, dev->ep_in->name, dev->ep_out->name);
+ return 0;
+}
+
+static int
+acc_function_bind_configfs(struct usb_configuration *c,
+ struct usb_function *f) {
+ return __acc_function_bind(c, f, true);
+}
+
+static void
+kill_all_hid_devices(struct acc_dev *dev)
+{
+ struct acc_hid_dev *hid;
+ struct list_head *entry, *temp;
+ unsigned long flags;
+
+ /* do nothing if usb accessory device doesn't exist */
+ if (!dev)
+ return;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ list_for_each_safe(entry, temp, &dev->hid_list) {
+ hid = list_entry(entry, struct acc_hid_dev, list);
+ list_del(&hid->list);
+ list_add(&hid->list, &dev->dead_hid_list);
+ }
+ list_for_each_safe(entry, temp, &dev->new_hid_list) {
+ hid = list_entry(entry, struct acc_hid_dev, list);
+ list_del(&hid->list);
+ list_add(&hid->list, &dev->dead_hid_list);
+ }
+ spin_unlock_irqrestore(&dev->lock, flags);
+
+ schedule_work(&dev->hid_work);
+}
+
+static void
+acc_hid_unbind(struct acc_dev *dev)
+{
+ hid_unregister_driver(&acc_hid_driver);
+ kill_all_hid_devices(dev);
+}
+
+static void
+acc_function_unbind(struct usb_configuration *c, struct usb_function *f)
+{
+ struct acc_dev *dev = func_to_dev(f);
+ struct usb_request *req;
+ int i;
+
+ dev->online = 0; /* clear online flag */
+ wake_up(&dev->read_wq); /* unblock reads on closure */
+ wake_up(&dev->write_wq); /* likewise for writes */
+
+ while ((req = req_get(dev, &dev->tx_idle)))
+ acc_request_free(req, dev->ep_in);
+ for (i = 0; i < RX_REQ_MAX; i++)
+ acc_request_free(dev->rx_req[i], dev->ep_out);
+
+ acc_hid_unbind(dev);
+}
+
+static void acc_start_work(struct work_struct *data)
+{
+ char *envp[2] = { "ACCESSORY=START", NULL };
+
+ kobject_uevent_env(&acc_device.this_device->kobj, KOBJ_CHANGE, envp);
+}
+
+static int acc_hid_init(struct acc_hid_dev *hdev)
+{
+ struct hid_device *hid;
+ int ret;
+
+ hid = hid_allocate_device();
+ if (IS_ERR(hid))
+ return PTR_ERR(hid);
+
+ hid->ll_driver = &acc_hid_ll_driver;
+ hid->dev.parent = acc_device.this_device;
+
+ hid->bus = BUS_USB;
+ hid->vendor = HID_ANY_ID;
+ hid->product = HID_ANY_ID;
+ hid->driver_data = hdev;
+ ret = hid_add_device(hid);
+ if (ret) {
+ pr_err("can't add hid device: %d\n", ret);
+ hid_destroy_device(hid);
+ return ret;
+ }
+
+ hdev->hid = hid;
+ return 0;
+}
+
+static void acc_hid_delete(struct acc_hid_dev *hid)
+{
+ kfree(hid->report_desc);
+ kfree(hid);
+}
+
+static void acc_hid_work(struct work_struct *data)
+{
+ struct acc_dev *dev = _acc_dev;
+ struct list_head *entry, *temp;
+ struct acc_hid_dev *hid;
+ struct list_head new_list, dead_list;
+ unsigned long flags;
+
+ INIT_LIST_HEAD(&new_list);
+
+ spin_lock_irqsave(&dev->lock, flags);
+
+ /* copy hids that are ready for initialization to new_list */
+ list_for_each_safe(entry, temp, &dev->new_hid_list) {
+ hid = list_entry(entry, struct acc_hid_dev, list);
+ if (hid->report_desc_offset == hid->report_desc_len)
+ list_move(&hid->list, &new_list);
+ }
+
+ if (list_empty(&dev->dead_hid_list)) {
+ INIT_LIST_HEAD(&dead_list);
+ } else {
+ /* move all of dev->dead_hid_list to dead_list */
+ dead_list.prev = dev->dead_hid_list.prev;
+ dead_list.next = dev->dead_hid_list.next;
+ dead_list.next->prev = &dead_list;
+ dead_list.prev->next = &dead_list;
+ INIT_LIST_HEAD(&dev->dead_hid_list);
+ }
+
+ spin_unlock_irqrestore(&dev->lock, flags);
+
+ /* register new HID devices */
+ list_for_each_safe(entry, temp, &new_list) {
+ hid = list_entry(entry, struct acc_hid_dev, list);
+ if (acc_hid_init(hid)) {
+ pr_err("can't add HID device %p\n", hid);
+ acc_hid_delete(hid);
+ } else {
+ spin_lock_irqsave(&dev->lock, flags);
+ list_move(&hid->list, &dev->hid_list);
+ spin_unlock_irqrestore(&dev->lock, flags);
+ }
+ }
+
+ /* remove dead HID devices */
+ list_for_each_safe(entry, temp, &dead_list) {
+ hid = list_entry(entry, struct acc_hid_dev, list);
+ list_del(&hid->list);
+ if (hid->hid)
+ hid_destroy_device(hid->hid);
+ acc_hid_delete(hid);
+ }
+}
+
+static int acc_function_set_alt(struct usb_function *f,
+ unsigned intf, unsigned alt)
+{
+ struct acc_dev *dev = func_to_dev(f);
+ struct usb_composite_dev *cdev = f->config->cdev;
+ int ret;
+
+ DBG(cdev, "acc_function_set_alt intf: %d alt: %d\n", intf, alt);
+
+ ret = config_ep_by_speed(cdev->gadget, f, dev->ep_in);
+ if (ret)
+ return ret;
+
+ ret = usb_ep_enable(dev->ep_in);
+ if (ret)
+ return ret;
+
+ ret = config_ep_by_speed(cdev->gadget, f, dev->ep_out);
+ if (ret)
+ return ret;
+
+ ret = usb_ep_enable(dev->ep_out);
+ if (ret) {
+ usb_ep_disable(dev->ep_in);
+ return ret;
+ }
+
+ dev->online = 1;
+ dev->disconnected = 0; /* if online then not disconnected */
+
+ /* readers may be blocked waiting for us to go online */
+ wake_up(&dev->read_wq);
+ return 0;
+}
+
+static void acc_function_disable(struct usb_function *f)
+{
+ struct acc_dev *dev = func_to_dev(f);
+ struct usb_composite_dev *cdev = dev->cdev;
+
+ DBG(cdev, "acc_function_disable\n");
+ acc_set_disconnected(dev); /* this now only sets disconnected */
+ dev->online = 0; /* so now need to clear online flag here too */
+ usb_ep_disable(dev->ep_in);
+ usb_ep_disable(dev->ep_out);
+
+ /* readers may be blocked waiting for us to go online */
+ wake_up(&dev->read_wq);
+
+ VDBG(cdev, "%s disabled\n", dev->function.name);
+}
+
+static int acc_setup(void)
+{
+ struct acc_dev *dev;
+ int ret;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+
+ spin_lock_init(&dev->lock);
+ init_waitqueue_head(&dev->read_wq);
+ init_waitqueue_head(&dev->write_wq);
+ atomic_set(&dev->open_excl, 0);
+ INIT_LIST_HEAD(&dev->tx_idle);
+ INIT_LIST_HEAD(&dev->hid_list);
+ INIT_LIST_HEAD(&dev->new_hid_list);
+ INIT_LIST_HEAD(&dev->dead_hid_list);
+ INIT_DELAYED_WORK(&dev->start_work, acc_start_work);
+ INIT_WORK(&dev->hid_work, acc_hid_work);
+
+ /* _acc_dev must be set before calling usb_gadget_register_driver */
+ _acc_dev = dev;
+
+ ret = misc_register(&acc_device);
+ if (ret)
+ goto err;
+
+ return 0;
+
+err:
+ kfree(dev);
+ pr_err("USB accessory gadget driver failed to initialize\n");
+ return ret;
+}
+
+void acc_disconnect(void)
+{
+ /* unregister all HID devices if USB is disconnected */
+ kill_all_hid_devices(_acc_dev);
+}
+EXPORT_SYMBOL_GPL(acc_disconnect);
+
+static void acc_cleanup(void)
+{
+ misc_deregister(&acc_device);
+ kfree(_acc_dev);
+ _acc_dev = NULL;
+}
+static struct acc_instance *to_acc_instance(struct config_item *item)
+{
+ return container_of(to_config_group(item), struct acc_instance,
+ func_inst.group);
+}
+
+static void acc_attr_release(struct config_item *item)
+{
+ struct acc_instance *fi_acc = to_acc_instance(item);
+
+ usb_put_function_instance(&fi_acc->func_inst);
+}
+
+static struct configfs_item_operations acc_item_ops = {
+ .release = acc_attr_release,
+};
+
+static struct config_item_type acc_func_type = {
+ .ct_item_ops = &acc_item_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct acc_instance *to_fi_acc(struct usb_function_instance *fi)
+{
+ return container_of(fi, struct acc_instance, func_inst);
+}
+
+static int acc_set_inst_name(struct usb_function_instance *fi, const char *name)
+{
+ struct acc_instance *fi_acc;
+ char *ptr;
+ int name_len;
+
+ name_len = strlen(name) + 1;
+ if (name_len > MAX_INST_NAME_LEN)
+ return -ENAMETOOLONG;
+
+ ptr = kstrndup(name, name_len, GFP_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
+
+ fi_acc = to_fi_acc(fi);
+ fi_acc->name = ptr;
+ return 0;
+}
+
+static void acc_free_inst(struct usb_function_instance *fi)
+{
+ struct acc_instance *fi_acc;
+
+ fi_acc = to_fi_acc(fi);
+ kfree(fi_acc->name);
+ acc_cleanup();
+}
+
+static struct usb_function_instance *acc_alloc_inst(void)
+{
+ struct acc_instance *fi_acc;
+ struct acc_dev *dev;
+ int err;
+
+ fi_acc = kzalloc(sizeof(*fi_acc), GFP_KERNEL);
+ if (!fi_acc)
+ return ERR_PTR(-ENOMEM);
+ fi_acc->func_inst.set_inst_name = acc_set_inst_name;
+ fi_acc->func_inst.free_func_inst = acc_free_inst;
+
+ err = acc_setup();
+ if (err) {
+ kfree(fi_acc);
+ pr_err("Error setting ACCESSORY\n");
+ return ERR_PTR(err);
+ }
+
+ config_group_init_type_name(&fi_acc->func_inst.group,
+ "", &acc_func_type);
+ dev = _acc_dev;
+ return &fi_acc->func_inst;
+}
+
+static void acc_free(struct usb_function *f)
+{
+/*NO-OP: no function specific resource allocation in mtp_alloc*/
+}
+
+int acc_ctrlrequest_configfs(struct usb_function *f,
+ const struct usb_ctrlrequest *ctrl) {
+ if (f->config != NULL && f->config->cdev != NULL)
+ return acc_ctrlrequest(f->config->cdev, ctrl);
+ else
+ return -1;
+}
+
+static struct usb_function *acc_alloc(struct usb_function_instance *fi)
+{
+ struct acc_dev *dev = _acc_dev;
+
+ pr_info("acc_alloc\n");
+
+ dev->function.name = "accessory";
+ dev->function.strings = acc_strings,
+ dev->function.fs_descriptors = fs_acc_descs;
+ dev->function.hs_descriptors = hs_acc_descs;
+ dev->function.bind = acc_function_bind_configfs;
+ dev->function.unbind = acc_function_unbind;
+ dev->function.set_alt = acc_function_set_alt;
+ dev->function.disable = acc_function_disable;
+ dev->function.free_func = acc_free;
+ dev->function.setup = acc_ctrlrequest_configfs;
+
+ return &dev->function;
+}
+DECLARE_USB_FUNCTION_INIT(accessory, acc_alloc_inst, acc_alloc);
+MODULE_LICENSE("GPL");
diff --git a/drivers/usb/gadget/function/f_audio_source.c b/drivers/usb/gadget/function/f_audio_source.c
new file mode 100644
index 000000000000..8124af33b738
--- /dev/null
+++ b/drivers/usb/gadget/function/f_audio_source.c
@@ -0,0 +1,1071 @@
+/*
+ * Gadget Function Driver for USB audio source device
+ *
+ * Copyright (C) 2012 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/usb/audio.h>
+#include <linux/wait.h>
+#include <linux/pm_qos.h>
+#include <sound/core.h>
+#include <sound/initval.h>
+#include <sound/pcm.h>
+
+#include <linux/usb.h>
+#include <linux/usb_usual.h>
+#include <linux/usb/ch9.h>
+#include <linux/configfs.h>
+#include <linux/usb/composite.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#define SAMPLE_RATE 44100
+#define FRAMES_PER_MSEC (SAMPLE_RATE / 1000)
+
+#define IN_EP_MAX_PACKET_SIZE 256
+
+/* Number of requests to allocate */
+#define IN_EP_REQ_COUNT 4
+
+#define AUDIO_AC_INTERFACE 0
+#define AUDIO_AS_INTERFACE 1
+#define AUDIO_NUM_INTERFACES 2
+#define MAX_INST_NAME_LEN 40
+
+/* B.3.1 Standard AC Interface Descriptor */
+static struct usb_interface_descriptor ac_interface_desc = {
+ .bLength = USB_DT_INTERFACE_SIZE,
+ .bDescriptorType = USB_DT_INTERFACE,
+ .bNumEndpoints = 0,
+ .bInterfaceClass = USB_CLASS_AUDIO,
+ .bInterfaceSubClass = USB_SUBCLASS_AUDIOCONTROL,
+};
+
+DECLARE_UAC_AC_HEADER_DESCRIPTOR(2);
+
+#define UAC_DT_AC_HEADER_LENGTH UAC_DT_AC_HEADER_SIZE(AUDIO_NUM_INTERFACES)
+/* 1 input terminal, 1 output terminal and 1 feature unit */
+#define UAC_DT_TOTAL_LENGTH (UAC_DT_AC_HEADER_LENGTH \
+ + UAC_DT_INPUT_TERMINAL_SIZE + UAC_DT_OUTPUT_TERMINAL_SIZE \
+ + UAC_DT_FEATURE_UNIT_SIZE(0))
+/* B.3.2 Class-Specific AC Interface Descriptor */
+static struct uac1_ac_header_descriptor_2 ac_header_desc = {
+ .bLength = UAC_DT_AC_HEADER_LENGTH,
+ .bDescriptorType = USB_DT_CS_INTERFACE,
+ .bDescriptorSubtype = UAC_HEADER,
+ .bcdADC = __constant_cpu_to_le16(0x0100),
+ .wTotalLength = __constant_cpu_to_le16(UAC_DT_TOTAL_LENGTH),
+ .bInCollection = AUDIO_NUM_INTERFACES,
+ .baInterfaceNr = {
+ [0] = AUDIO_AC_INTERFACE,
+ [1] = AUDIO_AS_INTERFACE,
+ }
+};
+
+#define INPUT_TERMINAL_ID 1
+static struct uac_input_terminal_descriptor input_terminal_desc = {
+ .bLength = UAC_DT_INPUT_TERMINAL_SIZE,
+ .bDescriptorType = USB_DT_CS_INTERFACE,
+ .bDescriptorSubtype = UAC_INPUT_TERMINAL,
+ .bTerminalID = INPUT_TERMINAL_ID,
+ .wTerminalType = UAC_INPUT_TERMINAL_MICROPHONE,
+ .bAssocTerminal = 0,
+ .wChannelConfig = 0x3,
+};
+
+DECLARE_UAC_FEATURE_UNIT_DESCRIPTOR(0);
+
+#define FEATURE_UNIT_ID 2
+static struct uac_feature_unit_descriptor_0 feature_unit_desc = {
+ .bLength = UAC_DT_FEATURE_UNIT_SIZE(0),
+ .bDescriptorType = USB_DT_CS_INTERFACE,
+ .bDescriptorSubtype = UAC_FEATURE_UNIT,
+ .bUnitID = FEATURE_UNIT_ID,
+ .bSourceID = INPUT_TERMINAL_ID,
+ .bControlSize = 2,
+};
+
+#define OUTPUT_TERMINAL_ID 3
+static struct uac1_output_terminal_descriptor output_terminal_desc = {
+ .bLength = UAC_DT_OUTPUT_TERMINAL_SIZE,
+ .bDescriptorType = USB_DT_CS_INTERFACE,
+ .bDescriptorSubtype = UAC_OUTPUT_TERMINAL,
+ .bTerminalID = OUTPUT_TERMINAL_ID,
+ .wTerminalType = UAC_TERMINAL_STREAMING,
+ .bAssocTerminal = FEATURE_UNIT_ID,
+ .bSourceID = FEATURE_UNIT_ID,
+};
+
+/* B.4.1 Standard AS Interface Descriptor */
+static struct usb_interface_descriptor as_interface_alt_0_desc = {
+ .bLength = USB_DT_INTERFACE_SIZE,
+ .bDescriptorType = USB_DT_INTERFACE,
+ .bAlternateSetting = 0,
+ .bNumEndpoints = 0,
+ .bInterfaceClass = USB_CLASS_AUDIO,
+ .bInterfaceSubClass = USB_SUBCLASS_AUDIOSTREAMING,
+};
+
+static struct usb_interface_descriptor as_interface_alt_1_desc = {
+ .bLength = USB_DT_INTERFACE_SIZE,
+ .bDescriptorType = USB_DT_INTERFACE,
+ .bAlternateSetting = 1,
+ .bNumEndpoints = 1,
+ .bInterfaceClass = USB_CLASS_AUDIO,
+ .bInterfaceSubClass = USB_SUBCLASS_AUDIOSTREAMING,
+};
+
+/* B.4.2 Class-Specific AS Interface Descriptor */
+static struct uac1_as_header_descriptor as_header_desc = {
+ .bLength = UAC_DT_AS_HEADER_SIZE,
+ .bDescriptorType = USB_DT_CS_INTERFACE,
+ .bDescriptorSubtype = UAC_AS_GENERAL,
+ .bTerminalLink = INPUT_TERMINAL_ID,
+ .bDelay = 1,
+ .wFormatTag = UAC_FORMAT_TYPE_I_PCM,
+};
+
+DECLARE_UAC_FORMAT_TYPE_I_DISCRETE_DESC(1);
+
+static struct uac_format_type_i_discrete_descriptor_1 as_type_i_desc = {
+ .bLength = UAC_FORMAT_TYPE_I_DISCRETE_DESC_SIZE(1),
+ .bDescriptorType = USB_DT_CS_INTERFACE,
+ .bDescriptorSubtype = UAC_FORMAT_TYPE,
+ .bFormatType = UAC_FORMAT_TYPE_I,
+ .bSubframeSize = 2,
+ .bBitResolution = 16,
+ .bSamFreqType = 1,
+};
+
+/* Standard ISO IN Endpoint Descriptor for highspeed */
+static struct usb_endpoint_descriptor hs_as_in_ep_desc = {
+ .bLength = USB_DT_ENDPOINT_AUDIO_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_SYNC_SYNC
+ | USB_ENDPOINT_XFER_ISOC,
+ .wMaxPacketSize = __constant_cpu_to_le16(IN_EP_MAX_PACKET_SIZE),
+ .bInterval = 4, /* poll 1 per millisecond */
+};
+
+/* Standard ISO IN Endpoint Descriptor for highspeed */
+static struct usb_endpoint_descriptor fs_as_in_ep_desc = {
+ .bLength = USB_DT_ENDPOINT_AUDIO_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_SYNC_SYNC
+ | USB_ENDPOINT_XFER_ISOC,
+ .wMaxPacketSize = __constant_cpu_to_le16(IN_EP_MAX_PACKET_SIZE),
+ .bInterval = 1, /* poll 1 per millisecond */
+};
+
+/* Class-specific AS ISO OUT Endpoint Descriptor */
+static struct uac_iso_endpoint_descriptor as_iso_in_desc = {
+ .bLength = UAC_ISO_ENDPOINT_DESC_SIZE,
+ .bDescriptorType = USB_DT_CS_ENDPOINT,
+ .bDescriptorSubtype = UAC_EP_GENERAL,
+ .bmAttributes = 1,
+ .bLockDelayUnits = 1,
+ .wLockDelay = __constant_cpu_to_le16(1),
+};
+
+static struct usb_descriptor_header *hs_audio_desc[] = {
+ (struct usb_descriptor_header *)&ac_interface_desc,
+ (struct usb_descriptor_header *)&ac_header_desc,
+
+ (struct usb_descriptor_header *)&input_terminal_desc,
+ (struct usb_descriptor_header *)&output_terminal_desc,
+ (struct usb_descriptor_header *)&feature_unit_desc,
+
+ (struct usb_descriptor_header *)&as_interface_alt_0_desc,
+ (struct usb_descriptor_header *)&as_interface_alt_1_desc,
+ (struct usb_descriptor_header *)&as_header_desc,
+
+ (struct usb_descriptor_header *)&as_type_i_desc,
+
+ (struct usb_descriptor_header *)&hs_as_in_ep_desc,
+ (struct usb_descriptor_header *)&as_iso_in_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *fs_audio_desc[] = {
+ (struct usb_descriptor_header *)&ac_interface_desc,
+ (struct usb_descriptor_header *)&ac_header_desc,
+
+ (struct usb_descriptor_header *)&input_terminal_desc,
+ (struct usb_descriptor_header *)&output_terminal_desc,
+ (struct usb_descriptor_header *)&feature_unit_desc,
+
+ (struct usb_descriptor_header *)&as_interface_alt_0_desc,
+ (struct usb_descriptor_header *)&as_interface_alt_1_desc,
+ (struct usb_descriptor_header *)&as_header_desc,
+
+ (struct usb_descriptor_header *)&as_type_i_desc,
+
+ (struct usb_descriptor_header *)&fs_as_in_ep_desc,
+ (struct usb_descriptor_header *)&as_iso_in_desc,
+ NULL,
+};
+
+static struct snd_pcm_hardware audio_hw_info = {
+ .info = SNDRV_PCM_INFO_MMAP |
+ SNDRV_PCM_INFO_MMAP_VALID |
+ SNDRV_PCM_INFO_BATCH |
+ SNDRV_PCM_INFO_INTERLEAVED |
+ SNDRV_PCM_INFO_BLOCK_TRANSFER,
+
+ .formats = SNDRV_PCM_FMTBIT_S16_LE,
+ .channels_min = 2,
+ .channels_max = 2,
+ .rate_min = SAMPLE_RATE,
+ .rate_max = SAMPLE_RATE,
+
+ .buffer_bytes_max = 1024 * 1024,
+ .period_bytes_min = 64,
+ .period_bytes_max = 512 * 1024,
+ .periods_min = 2,
+ .periods_max = 1024,
+};
+
+/*-------------------------------------------------------------------------*/
+
+struct audio_source_config {
+ int card;
+ int device;
+};
+
+struct audio_dev {
+ struct usb_function func;
+ struct snd_card *card;
+ struct snd_pcm *pcm;
+ struct snd_pcm_substream *substream;
+
+ struct list_head idle_reqs;
+ struct usb_ep *in_ep;
+
+ spinlock_t lock;
+
+ /* beginning, end and current position in our buffer */
+ void *buffer_start;
+ void *buffer_end;
+ void *buffer_pos;
+
+ /* byte size of a "period" */
+ unsigned int period;
+ /* bytes sent since last call to snd_pcm_period_elapsed */
+ unsigned int period_offset;
+ /* time we started playing */
+ ktime_t start_time;
+ /* number of frames sent since start_time */
+ s64 frames_sent;
+ struct audio_source_config *config;
+ /* for creating and issuing QoS requests */
+ struct pm_qos_request pm_qos;
+};
+
+static inline struct audio_dev *func_to_audio(struct usb_function *f)
+{
+ return container_of(f, struct audio_dev, func);
+}
+
+/*-------------------------------------------------------------------------*/
+
+struct audio_source_instance {
+ struct usb_function_instance func_inst;
+ const char *name;
+ struct audio_source_config *config;
+ struct device *audio_device;
+};
+
+static void audio_source_attr_release(struct config_item *item);
+
+static struct configfs_item_operations audio_source_item_ops = {
+ .release = audio_source_attr_release,
+};
+
+static struct config_item_type audio_source_func_type = {
+ .ct_item_ops = &audio_source_item_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static ssize_t audio_source_pcm_show(struct device *dev,
+ struct device_attribute *attr, char *buf);
+
+static DEVICE_ATTR(pcm, S_IRUGO, audio_source_pcm_show, NULL);
+
+static struct device_attribute *audio_source_function_attributes[] = {
+ &dev_attr_pcm,
+ NULL
+};
+
+/*--------------------------------------------------------------------------*/
+
+static struct usb_request *audio_request_new(struct usb_ep *ep, int buffer_size)
+{
+ struct usb_request *req = usb_ep_alloc_request(ep, GFP_KERNEL);
+
+ if (!req)
+ return NULL;
+
+ req->buf = kmalloc(buffer_size, GFP_KERNEL);
+ if (!req->buf) {
+ usb_ep_free_request(ep, req);
+ return NULL;
+ }
+ req->length = buffer_size;
+ return req;
+}
+
+static void audio_request_free(struct usb_request *req, struct usb_ep *ep)
+{
+ if (req) {
+ kfree(req->buf);
+ usb_ep_free_request(ep, req);
+ }
+}
+
+static void audio_req_put(struct audio_dev *audio, struct usb_request *req)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&audio->lock, flags);
+ list_add_tail(&req->list, &audio->idle_reqs);
+ spin_unlock_irqrestore(&audio->lock, flags);
+}
+
+static struct usb_request *audio_req_get(struct audio_dev *audio)
+{
+ unsigned long flags;
+ struct usb_request *req;
+
+ spin_lock_irqsave(&audio->lock, flags);
+ if (list_empty(&audio->idle_reqs)) {
+ req = 0;
+ } else {
+ req = list_first_entry(&audio->idle_reqs, struct usb_request,
+ list);
+ list_del(&req->list);
+ }
+ spin_unlock_irqrestore(&audio->lock, flags);
+ return req;
+}
+
+/* send the appropriate number of packets to match our bitrate */
+static void audio_send(struct audio_dev *audio)
+{
+ struct snd_pcm_runtime *runtime;
+ struct usb_request *req;
+ int length, length1, length2, ret;
+ s64 msecs;
+ s64 frames;
+ ktime_t now;
+
+ /* audio->substream will be null if we have been closed */
+ if (!audio->substream)
+ return;
+ /* audio->buffer_pos will be null if we have been stopped */
+ if (!audio->buffer_pos)
+ return;
+
+ runtime = audio->substream->runtime;
+
+ /* compute number of frames to send */
+ now = ktime_get();
+ msecs = div_s64((ktime_to_ns(now) - ktime_to_ns(audio->start_time)),
+ 1000000);
+ frames = div_s64((msecs * SAMPLE_RATE), 1000);
+
+ /* Readjust our frames_sent if we fall too far behind.
+ * If we get too far behind it is better to drop some frames than
+ * to keep sending data too fast in an attempt to catch up.
+ */
+ if (frames - audio->frames_sent > 10 * FRAMES_PER_MSEC)
+ audio->frames_sent = frames - FRAMES_PER_MSEC;
+
+ frames -= audio->frames_sent;
+
+ /* We need to send something to keep the pipeline going */
+ if (frames <= 0)
+ frames = FRAMES_PER_MSEC;
+
+ while (frames > 0) {
+ req = audio_req_get(audio);
+ if (!req)
+ break;
+
+ length = frames_to_bytes(runtime, frames);
+ if (length > IN_EP_MAX_PACKET_SIZE)
+ length = IN_EP_MAX_PACKET_SIZE;
+
+ if (audio->buffer_pos + length > audio->buffer_end)
+ length1 = audio->buffer_end - audio->buffer_pos;
+ else
+ length1 = length;
+ memcpy(req->buf, audio->buffer_pos, length1);
+ if (length1 < length) {
+ /* Wrap around and copy remaining length
+ * at beginning of buffer.
+ */
+ length2 = length - length1;
+ memcpy(req->buf + length1, audio->buffer_start,
+ length2);
+ audio->buffer_pos = audio->buffer_start + length2;
+ } else {
+ audio->buffer_pos += length1;
+ if (audio->buffer_pos >= audio->buffer_end)
+ audio->buffer_pos = audio->buffer_start;
+ }
+
+ req->length = length;
+ ret = usb_ep_queue(audio->in_ep, req, GFP_ATOMIC);
+ if (ret < 0) {
+ pr_err("usb_ep_queue failed ret: %d\n", ret);
+ audio_req_put(audio, req);
+ break;
+ }
+
+ frames -= bytes_to_frames(runtime, length);
+ audio->frames_sent += bytes_to_frames(runtime, length);
+ }
+}
+
+static void audio_control_complete(struct usb_ep *ep, struct usb_request *req)
+{
+ /* nothing to do here */
+}
+
+static void audio_data_complete(struct usb_ep *ep, struct usb_request *req)
+{
+ struct audio_dev *audio = req->context;
+
+ pr_debug("audio_data_complete req->status %d req->actual %d\n",
+ req->status, req->actual);
+
+ audio_req_put(audio, req);
+
+ if (!audio->buffer_start || req->status)
+ return;
+
+ audio->period_offset += req->actual;
+ if (audio->period_offset >= audio->period) {
+ snd_pcm_period_elapsed(audio->substream);
+ audio->period_offset = 0;
+ }
+ audio_send(audio);
+}
+
+static int audio_set_endpoint_req(struct usb_function *f,
+ const struct usb_ctrlrequest *ctrl)
+{
+ int value = -EOPNOTSUPP;
+ u16 ep = le16_to_cpu(ctrl->wIndex);
+ u16 len = le16_to_cpu(ctrl->wLength);
+ u16 w_value = le16_to_cpu(ctrl->wValue);
+
+ pr_debug("bRequest 0x%x, w_value 0x%04x, len %d, endpoint %d\n",
+ ctrl->bRequest, w_value, len, ep);
+
+ switch (ctrl->bRequest) {
+ case UAC_SET_CUR:
+ case UAC_SET_MIN:
+ case UAC_SET_MAX:
+ case UAC_SET_RES:
+ value = len;
+ break;
+ default:
+ break;
+ }
+
+ return value;
+}
+
+static int audio_get_endpoint_req(struct usb_function *f,
+ const struct usb_ctrlrequest *ctrl)
+{
+ struct usb_composite_dev *cdev = f->config->cdev;
+ int value = -EOPNOTSUPP;
+ u8 ep = ((le16_to_cpu(ctrl->wIndex) >> 8) & 0xFF);
+ u16 len = le16_to_cpu(ctrl->wLength);
+ u16 w_value = le16_to_cpu(ctrl->wValue);
+ u8 *buf = cdev->req->buf;
+
+ pr_debug("bRequest 0x%x, w_value 0x%04x, len %d, endpoint %d\n",
+ ctrl->bRequest, w_value, len, ep);
+
+ if (w_value == UAC_EP_CS_ATTR_SAMPLE_RATE << 8) {
+ switch (ctrl->bRequest) {
+ case UAC_GET_CUR:
+ case UAC_GET_MIN:
+ case UAC_GET_MAX:
+ case UAC_GET_RES:
+ /* return our sample rate */
+ buf[0] = (u8)SAMPLE_RATE;
+ buf[1] = (u8)(SAMPLE_RATE >> 8);
+ buf[2] = (u8)(SAMPLE_RATE >> 16);
+ value = 3;
+ break;
+ default:
+ break;
+ }
+ }
+
+ return value;
+}
+
+static int
+audio_setup(struct usb_function *f, const struct usb_ctrlrequest *ctrl)
+{
+ struct usb_composite_dev *cdev = f->config->cdev;
+ struct usb_request *req = cdev->req;
+ int value = -EOPNOTSUPP;
+ u16 w_index = le16_to_cpu(ctrl->wIndex);
+ u16 w_value = le16_to_cpu(ctrl->wValue);
+ u16 w_length = le16_to_cpu(ctrl->wLength);
+
+ /* composite driver infrastructure handles everything; interface
+ * activation uses set_alt().
+ */
+ switch (ctrl->bRequestType) {
+ case USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_ENDPOINT:
+ value = audio_set_endpoint_req(f, ctrl);
+ break;
+
+ case USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_ENDPOINT:
+ value = audio_get_endpoint_req(f, ctrl);
+ break;
+ }
+
+ /* respond with data transfer or status phase? */
+ if (value >= 0) {
+ pr_debug("audio req%02x.%02x v%04x i%04x l%d\n",
+ ctrl->bRequestType, ctrl->bRequest,
+ w_value, w_index, w_length);
+ req->zero = 0;
+ req->length = value;
+ req->complete = audio_control_complete;
+ value = usb_ep_queue(cdev->gadget->ep0, req, GFP_ATOMIC);
+ if (value < 0)
+ pr_err("audio response on err %d\n", value);
+ }
+
+ /* device either stalls (value < 0) or reports success */
+ return value;
+}
+
+static int audio_set_alt(struct usb_function *f, unsigned intf, unsigned alt)
+{
+ struct audio_dev *audio = func_to_audio(f);
+ struct usb_composite_dev *cdev = f->config->cdev;
+ int ret;
+
+ pr_debug("audio_set_alt intf %d, alt %d\n", intf, alt);
+
+ ret = config_ep_by_speed(cdev->gadget, f, audio->in_ep);
+ if (ret)
+ return ret;
+
+ usb_ep_enable(audio->in_ep);
+ return 0;
+}
+
+static void audio_disable(struct usb_function *f)
+{
+ struct audio_dev *audio = func_to_audio(f);
+
+ pr_debug("audio_disable\n");
+ usb_ep_disable(audio->in_ep);
+}
+
+static void audio_free_func(struct usb_function *f)
+{
+ /* no-op */
+}
+
+/*-------------------------------------------------------------------------*/
+
+static void audio_build_desc(struct audio_dev *audio)
+{
+ u8 *sam_freq;
+ int rate;
+
+ /* Set channel numbers */
+ input_terminal_desc.bNrChannels = 2;
+ as_type_i_desc.bNrChannels = 2;
+
+ /* Set sample rates */
+ rate = SAMPLE_RATE;
+ sam_freq = as_type_i_desc.tSamFreq[0];
+ memcpy(sam_freq, &rate, 3);
+}
+
+
+static int snd_card_setup(struct usb_configuration *c,
+ struct audio_source_config *config);
+static struct audio_source_instance *to_fi_audio_source(
+ const struct usb_function_instance *fi);
+
+
+/* audio function driver setup/binding */
+static int
+audio_bind(struct usb_configuration *c, struct usb_function *f)
+{
+ struct usb_composite_dev *cdev = c->cdev;
+ struct audio_dev *audio = func_to_audio(f);
+ int status;
+ struct usb_ep *ep;
+ struct usb_request *req;
+ int i;
+ int err;
+
+ if (IS_ENABLED(CONFIG_USB_CONFIGFS)) {
+ struct audio_source_instance *fi_audio =
+ to_fi_audio_source(f->fi);
+ struct audio_source_config *config =
+ fi_audio->config;
+
+ err = snd_card_setup(c, config);
+ if (err)
+ return err;
+ }
+
+ audio_build_desc(audio);
+
+ /* allocate instance-specific interface IDs, and patch descriptors */
+ status = usb_interface_id(c, f);
+ if (status < 0)
+ goto fail;
+ ac_interface_desc.bInterfaceNumber = status;
+
+ /* AUDIO_AC_INTERFACE */
+ ac_header_desc.baInterfaceNr[0] = status;
+
+ status = usb_interface_id(c, f);
+ if (status < 0)
+ goto fail;
+ as_interface_alt_0_desc.bInterfaceNumber = status;
+ as_interface_alt_1_desc.bInterfaceNumber = status;
+
+ /* AUDIO_AS_INTERFACE */
+ ac_header_desc.baInterfaceNr[1] = status;
+
+ status = -ENODEV;
+
+ /* allocate our endpoint */
+ ep = usb_ep_autoconfig(cdev->gadget, &fs_as_in_ep_desc);
+ if (!ep)
+ goto fail;
+ audio->in_ep = ep;
+ ep->driver_data = audio; /* claim */
+
+ if (gadget_is_dualspeed(c->cdev->gadget))
+ hs_as_in_ep_desc.bEndpointAddress =
+ fs_as_in_ep_desc.bEndpointAddress;
+
+ f->fs_descriptors = fs_audio_desc;
+ f->hs_descriptors = hs_audio_desc;
+
+ for (i = 0, status = 0; i < IN_EP_REQ_COUNT && status == 0; i++) {
+ req = audio_request_new(ep, IN_EP_MAX_PACKET_SIZE);
+ if (req) {
+ req->context = audio;
+ req->complete = audio_data_complete;
+ audio_req_put(audio, req);
+ } else
+ status = -ENOMEM;
+ }
+
+fail:
+ return status;
+}
+
+static void
+audio_unbind(struct usb_configuration *c, struct usb_function *f)
+{
+ struct audio_dev *audio = func_to_audio(f);
+ struct usb_request *req;
+
+ while ((req = audio_req_get(audio)))
+ audio_request_free(req, audio->in_ep);
+
+ snd_card_free_when_closed(audio->card);
+ audio->card = NULL;
+ audio->pcm = NULL;
+ audio->substream = NULL;
+ audio->in_ep = NULL;
+
+ if (IS_ENABLED(CONFIG_USB_CONFIGFS)) {
+ struct audio_source_instance *fi_audio =
+ to_fi_audio_source(f->fi);
+ struct audio_source_config *config =
+ fi_audio->config;
+
+ config->card = -1;
+ config->device = -1;
+ }
+}
+
+static void audio_pcm_playback_start(struct audio_dev *audio)
+{
+ audio->start_time = ktime_get();
+ audio->frames_sent = 0;
+ audio_send(audio);
+}
+
+static void audio_pcm_playback_stop(struct audio_dev *audio)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&audio->lock, flags);
+ audio->buffer_start = 0;
+ audio->buffer_end = 0;
+ audio->buffer_pos = 0;
+ spin_unlock_irqrestore(&audio->lock, flags);
+}
+
+static int audio_pcm_open(struct snd_pcm_substream *substream)
+{
+ struct snd_pcm_runtime *runtime = substream->runtime;
+ struct audio_dev *audio = substream->private_data;
+
+ runtime->private_data = audio;
+ runtime->hw = audio_hw_info;
+ snd_pcm_limit_hw_rates(runtime);
+ runtime->hw.channels_max = 2;
+
+ audio->substream = substream;
+
+ /* Add the QoS request and set the latency to 0 */
+ pm_qos_add_request(&audio->pm_qos, PM_QOS_CPU_DMA_LATENCY, 0);
+
+ return 0;
+}
+
+static int audio_pcm_close(struct snd_pcm_substream *substream)
+{
+ struct audio_dev *audio = substream->private_data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&audio->lock, flags);
+
+ /* Remove the QoS request */
+ pm_qos_remove_request(&audio->pm_qos);
+
+ audio->substream = NULL;
+ spin_unlock_irqrestore(&audio->lock, flags);
+
+ return 0;
+}
+
+static int audio_pcm_hw_params(struct snd_pcm_substream *substream,
+ struct snd_pcm_hw_params *params)
+{
+ unsigned int channels = params_channels(params);
+ unsigned int rate = params_rate(params);
+
+ if (rate != SAMPLE_RATE)
+ return -EINVAL;
+ if (channels != 2)
+ return -EINVAL;
+
+ return snd_pcm_lib_alloc_vmalloc_buffer(substream,
+ params_buffer_bytes(params));
+}
+
+static int audio_pcm_hw_free(struct snd_pcm_substream *substream)
+{
+ return snd_pcm_lib_free_vmalloc_buffer(substream);
+}
+
+static int audio_pcm_prepare(struct snd_pcm_substream *substream)
+{
+ struct snd_pcm_runtime *runtime = substream->runtime;
+ struct audio_dev *audio = runtime->private_data;
+
+ audio->period = snd_pcm_lib_period_bytes(substream);
+ audio->period_offset = 0;
+ audio->buffer_start = runtime->dma_area;
+ audio->buffer_end = audio->buffer_start
+ + snd_pcm_lib_buffer_bytes(substream);
+ audio->buffer_pos = audio->buffer_start;
+
+ return 0;
+}
+
+static snd_pcm_uframes_t audio_pcm_pointer(struct snd_pcm_substream *substream)
+{
+ struct snd_pcm_runtime *runtime = substream->runtime;
+ struct audio_dev *audio = runtime->private_data;
+ ssize_t bytes = audio->buffer_pos - audio->buffer_start;
+
+ /* return offset of next frame to fill in our buffer */
+ return bytes_to_frames(runtime, bytes);
+}
+
+static int audio_pcm_playback_trigger(struct snd_pcm_substream *substream,
+ int cmd)
+{
+ struct audio_dev *audio = substream->runtime->private_data;
+ int ret = 0;
+
+ switch (cmd) {
+ case SNDRV_PCM_TRIGGER_START:
+ case SNDRV_PCM_TRIGGER_RESUME:
+ audio_pcm_playback_start(audio);
+ break;
+
+ case SNDRV_PCM_TRIGGER_STOP:
+ case SNDRV_PCM_TRIGGER_SUSPEND:
+ audio_pcm_playback_stop(audio);
+ break;
+
+ default:
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static struct audio_dev _audio_dev = {
+ .func = {
+ .name = "audio_source",
+ .bind = audio_bind,
+ .unbind = audio_unbind,
+ .set_alt = audio_set_alt,
+ .setup = audio_setup,
+ .disable = audio_disable,
+ .free_func = audio_free_func,
+ },
+ .lock = __SPIN_LOCK_UNLOCKED(_audio_dev.lock),
+ .idle_reqs = LIST_HEAD_INIT(_audio_dev.idle_reqs),
+};
+
+static struct snd_pcm_ops audio_playback_ops = {
+ .open = audio_pcm_open,
+ .close = audio_pcm_close,
+ .ioctl = snd_pcm_lib_ioctl,
+ .hw_params = audio_pcm_hw_params,
+ .hw_free = audio_pcm_hw_free,
+ .prepare = audio_pcm_prepare,
+ .trigger = audio_pcm_playback_trigger,
+ .pointer = audio_pcm_pointer,
+};
+
+int audio_source_bind_config(struct usb_configuration *c,
+ struct audio_source_config *config)
+{
+ struct audio_dev *audio;
+ int err;
+
+ config->card = -1;
+ config->device = -1;
+
+ audio = &_audio_dev;
+
+ err = snd_card_setup(c, config);
+ if (err)
+ return err;
+
+ err = usb_add_function(c, &audio->func);
+ if (err)
+ goto add_fail;
+
+ return 0;
+
+add_fail:
+ snd_card_free(audio->card);
+ return err;
+}
+
+static int snd_card_setup(struct usb_configuration *c,
+ struct audio_source_config *config)
+{
+ struct audio_dev *audio;
+ struct snd_card *card;
+ struct snd_pcm *pcm;
+ int err;
+
+ audio = &_audio_dev;
+
+ err = snd_card_new(&c->cdev->gadget->dev,
+ SNDRV_DEFAULT_IDX1, SNDRV_DEFAULT_STR1,
+ THIS_MODULE, 0, &card);
+ if (err)
+ return err;
+
+ err = snd_pcm_new(card, "USB audio source", 0, 1, 0, &pcm);
+ if (err)
+ goto pcm_fail;
+
+ pcm->private_data = audio;
+ pcm->info_flags = 0;
+ audio->pcm = pcm;
+
+ strlcpy(pcm->name, "USB gadget audio", sizeof(pcm->name));
+
+ snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK, &audio_playback_ops);
+ snd_pcm_lib_preallocate_pages_for_all(pcm, SNDRV_DMA_TYPE_DEV,
+ NULL, 0, 64 * 1024);
+
+ strlcpy(card->driver, "audio_source", sizeof(card->driver));
+ strlcpy(card->shortname, card->driver, sizeof(card->shortname));
+ strlcpy(card->longname, "USB accessory audio source",
+ sizeof(card->longname));
+
+ err = snd_card_register(card);
+ if (err)
+ goto register_fail;
+
+ config->card = pcm->card->number;
+ config->device = pcm->device;
+ audio->card = card;
+ return 0;
+
+register_fail:
+pcm_fail:
+ snd_card_free(audio->card);
+ return err;
+}
+
+static struct audio_source_instance *to_audio_source_instance(
+ struct config_item *item)
+{
+ return container_of(to_config_group(item), struct audio_source_instance,
+ func_inst.group);
+}
+
+static struct audio_source_instance *to_fi_audio_source(
+ const struct usb_function_instance *fi)
+{
+ return container_of(fi, struct audio_source_instance, func_inst);
+}
+
+static void audio_source_attr_release(struct config_item *item)
+{
+ struct audio_source_instance *fi_audio = to_audio_source_instance(item);
+
+ usb_put_function_instance(&fi_audio->func_inst);
+}
+
+static int audio_source_set_inst_name(struct usb_function_instance *fi,
+ const char *name)
+{
+ struct audio_source_instance *fi_audio;
+ char *ptr;
+ int name_len;
+
+ name_len = strlen(name) + 1;
+ if (name_len > MAX_INST_NAME_LEN)
+ return -ENAMETOOLONG;
+
+ ptr = kstrndup(name, name_len, GFP_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
+
+ fi_audio = to_fi_audio_source(fi);
+ fi_audio->name = ptr;
+
+ return 0;
+}
+
+static void audio_source_free_inst(struct usb_function_instance *fi)
+{
+ struct audio_source_instance *fi_audio;
+
+ fi_audio = to_fi_audio_source(fi);
+ device_destroy(fi_audio->audio_device->class,
+ fi_audio->audio_device->devt);
+ kfree(fi_audio->name);
+ kfree(fi_audio->config);
+}
+
+static ssize_t audio_source_pcm_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct audio_source_instance *fi_audio = dev_get_drvdata(dev);
+ struct audio_source_config *config = fi_audio->config;
+
+ /* print PCM card and device numbers */
+ return sprintf(buf, "%d %d\n", config->card, config->device);
+}
+
+struct device *create_function_device(char *name);
+
+static struct usb_function_instance *audio_source_alloc_inst(void)
+{
+ struct audio_source_instance *fi_audio;
+ struct device_attribute **attrs;
+ struct device_attribute *attr;
+ struct device *dev;
+ void *err_ptr;
+ int err = 0;
+
+ fi_audio = kzalloc(sizeof(*fi_audio), GFP_KERNEL);
+ if (!fi_audio)
+ return ERR_PTR(-ENOMEM);
+
+ fi_audio->func_inst.set_inst_name = audio_source_set_inst_name;
+ fi_audio->func_inst.free_func_inst = audio_source_free_inst;
+
+ fi_audio->config = kzalloc(sizeof(struct audio_source_config),
+ GFP_KERNEL);
+ if (!fi_audio->config) {
+ err_ptr = ERR_PTR(-ENOMEM);
+ goto fail_audio;
+ }
+
+ config_group_init_type_name(&fi_audio->func_inst.group, "",
+ &audio_source_func_type);
+ dev = create_function_device("f_audio_source");
+
+ if (IS_ERR(dev)) {
+ err_ptr = dev;
+ goto fail_audio_config;
+ }
+
+ fi_audio->config->card = -1;
+ fi_audio->config->device = -1;
+ fi_audio->audio_device = dev;
+
+ attrs = audio_source_function_attributes;
+ if (attrs) {
+ while ((attr = *attrs++) && !err)
+ err = device_create_file(dev, attr);
+ if (err) {
+ err_ptr = ERR_PTR(-EINVAL);
+ goto fail_device;
+ }
+ }
+
+ dev_set_drvdata(dev, fi_audio);
+ _audio_dev.config = fi_audio->config;
+
+ return &fi_audio->func_inst;
+
+fail_device:
+ device_destroy(dev->class, dev->devt);
+fail_audio_config:
+ kfree(fi_audio->config);
+fail_audio:
+ kfree(fi_audio);
+ return err_ptr;
+
+}
+
+static struct usb_function *audio_source_alloc(struct usb_function_instance *fi)
+{
+ return &_audio_dev.func;
+}
+
+DECLARE_USB_FUNCTION_INIT(audio_source, audio_source_alloc_inst,
+ audio_source_alloc);
+MODULE_LICENSE("GPL");
diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c
index 70ac1963b598..7d6a48c5ca40 100644
--- a/drivers/usb/gadget/function/f_midi.c
+++ b/drivers/usb/gadget/function/f_midi.c
@@ -1168,6 +1168,65 @@ static void f_midi_free_inst(struct usb_function_instance *f)
kfree(opts);
}
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+extern struct device *create_function_device(char *name);
+static ssize_t alsa_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct usb_function_instance *fi_midi = dev_get_drvdata(dev);
+ struct f_midi *midi;
+
+ if (!fi_midi->f)
+ dev_warn(dev, "f_midi: function not set\n");
+
+ if (fi_midi && fi_midi->f) {
+ midi = func_to_midi(fi_midi->f);
+ if (midi->rmidi && midi->rmidi->card)
+ return sprintf(buf, "%d %d\n",
+ midi->rmidi->card->number, midi->rmidi->device);
+ }
+
+ /* print PCM card and device numbers */
+ return sprintf(buf, "%d %d\n", -1, -1);
+}
+
+static DEVICE_ATTR(alsa, S_IRUGO, alsa_show, NULL);
+
+static struct device_attribute *alsa_function_attributes[] = {
+ &dev_attr_alsa,
+ NULL
+};
+
+static int create_alsa_device(struct usb_function_instance *fi)
+{
+ struct device *dev;
+ struct device_attribute **attrs;
+ struct device_attribute *attr;
+ int err = 0;
+
+ dev = create_function_device("f_midi");
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
+
+ attrs = alsa_function_attributes;
+ if (attrs) {
+ while ((attr = *attrs++) && !err)
+ err = device_create_file(dev, attr);
+ if (err) {
+ device_destroy(dev->class, dev->devt);
+ return -EINVAL;
+ }
+ }
+ dev_set_drvdata(dev, fi);
+ return 0;
+}
+#else
+static int create_alsa_device(struct usb_function_instance *fi)
+{
+ return 0;
+}
+#endif
+
static struct usb_function_instance *f_midi_alloc_inst(void)
{
struct f_midi_opts *opts;
@@ -1185,6 +1244,11 @@ static struct usb_function_instance *f_midi_alloc_inst(void)
opts->in_ports = 1;
opts->out_ports = 1;
+ if (create_alsa_device(&opts->func_inst)) {
+ kfree(opts);
+ return ERR_PTR(-ENODEV);
+ }
+
config_group_init_type_name(&opts->func_inst.group, "",
&midi_func_type);
@@ -1202,6 +1266,7 @@ static void f_midi_free(struct usb_function *f)
mutex_lock(&opts->lock);
kfifo_free(&midi->in_req_fifo);
kfree(midi);
+ opts->func_inst.f = NULL;
--opts->refcnt;
mutex_unlock(&opts->lock);
}
@@ -1281,6 +1346,7 @@ static struct usb_function *f_midi_alloc(struct usb_function_instance *fi)
midi->func.disable = f_midi_disable;
midi->func.free_func = f_midi_free;
+ fi->f = &midi->func;
return &midi->func;
setup_fail:
diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c
new file mode 100644
index 000000000000..77fe8f54a4b8
--- /dev/null
+++ b/drivers/usb/gadget/function/f_mtp.c
@@ -0,0 +1,1564 @@
+/*
+ * Gadget Function Driver for MTP
+ *
+ * Copyright (C) 2010 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/* #define DEBUG */
+/* #define VERBOSE_DEBUG */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+
+#include <linux/usb.h>
+#include <linux/usb_usual.h>
+#include <linux/usb/ch9.h>
+#include <linux/usb/f_mtp.h>
+#include <linux/configfs.h>
+#include <linux/usb/composite.h>
+
+#include "configfs.h"
+
+#define MTP_BULK_BUFFER_SIZE 16384
+#define INTR_BUFFER_SIZE 28
+#define MAX_INST_NAME_LEN 40
+#define MTP_MAX_FILE_SIZE 0xFFFFFFFFL
+
+/* String IDs */
+#define INTERFACE_STRING_INDEX 0
+
+/* values for mtp_dev.state */
+#define STATE_OFFLINE 0 /* initial state, disconnected */
+#define STATE_READY 1 /* ready for userspace calls */
+#define STATE_BUSY 2 /* processing userspace calls */
+#define STATE_CANCELED 3 /* transaction canceled by host */
+#define STATE_ERROR 4 /* error from completion routine */
+
+/* number of tx and rx requests to allocate */
+#define TX_REQ_MAX 4
+#define RX_REQ_MAX 2
+#define INTR_REQ_MAX 5
+
+/* ID for Microsoft MTP OS String */
+#define MTP_OS_STRING_ID 0xEE
+
+/* MTP class reqeusts */
+#define MTP_REQ_CANCEL 0x64
+#define MTP_REQ_GET_EXT_EVENT_DATA 0x65
+#define MTP_REQ_RESET 0x66
+#define MTP_REQ_GET_DEVICE_STATUS 0x67
+
+/* constants for device status */
+#define MTP_RESPONSE_OK 0x2001
+#define MTP_RESPONSE_DEVICE_BUSY 0x2019
+#define DRIVER_NAME "mtp"
+
+static const char mtp_shortname[] = DRIVER_NAME "_usb";
+
+struct mtp_dev {
+ struct usb_function function;
+ struct usb_composite_dev *cdev;
+ spinlock_t lock;
+
+ struct usb_ep *ep_in;
+ struct usb_ep *ep_out;
+ struct usb_ep *ep_intr;
+
+ int state;
+
+ /* synchronize access to our device file */
+ atomic_t open_excl;
+ /* to enforce only one ioctl at a time */
+ atomic_t ioctl_excl;
+
+ struct list_head tx_idle;
+ struct list_head intr_idle;
+
+ wait_queue_head_t read_wq;
+ wait_queue_head_t write_wq;
+ wait_queue_head_t intr_wq;
+ struct usb_request *rx_req[RX_REQ_MAX];
+ int rx_done;
+
+ /* for processing MTP_SEND_FILE, MTP_RECEIVE_FILE and
+ * MTP_SEND_FILE_WITH_HEADER ioctls on a work queue
+ */
+ struct workqueue_struct *wq;
+ struct work_struct send_file_work;
+ struct work_struct receive_file_work;
+ struct file *xfer_file;
+ loff_t xfer_file_offset;
+ int64_t xfer_file_length;
+ unsigned xfer_send_header;
+ uint16_t xfer_command;
+ uint32_t xfer_transaction_id;
+ int xfer_result;
+};
+
+static struct usb_interface_descriptor mtp_interface_desc = {
+ .bLength = USB_DT_INTERFACE_SIZE,
+ .bDescriptorType = USB_DT_INTERFACE,
+ .bInterfaceNumber = 0,
+ .bNumEndpoints = 3,
+ .bInterfaceClass = USB_CLASS_VENDOR_SPEC,
+ .bInterfaceSubClass = USB_SUBCLASS_VENDOR_SPEC,
+ .bInterfaceProtocol = 0,
+};
+
+static struct usb_interface_descriptor ptp_interface_desc = {
+ .bLength = USB_DT_INTERFACE_SIZE,
+ .bDescriptorType = USB_DT_INTERFACE,
+ .bInterfaceNumber = 0,
+ .bNumEndpoints = 3,
+ .bInterfaceClass = USB_CLASS_STILL_IMAGE,
+ .bInterfaceSubClass = 1,
+ .bInterfaceProtocol = 1,
+};
+
+static struct usb_endpoint_descriptor mtp_ss_in_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = __constant_cpu_to_le16(1024),
+};
+
+static struct usb_ss_ep_comp_descriptor mtp_ss_in_comp_desc = {
+ .bLength = sizeof(mtp_ss_in_comp_desc),
+ .bDescriptorType = USB_DT_SS_ENDPOINT_COMP,
+ /* .bMaxBurst = DYNAMIC, */
+};
+
+static struct usb_endpoint_descriptor mtp_ss_out_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_OUT,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = __constant_cpu_to_le16(1024),
+};
+
+static struct usb_ss_ep_comp_descriptor mtp_ss_out_comp_desc = {
+ .bLength = sizeof(mtp_ss_out_comp_desc),
+ .bDescriptorType = USB_DT_SS_ENDPOINT_COMP,
+ /* .bMaxBurst = DYNAMIC, */
+};
+
+static struct usb_endpoint_descriptor mtp_highspeed_in_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = __constant_cpu_to_le16(512),
+};
+
+static struct usb_endpoint_descriptor mtp_highspeed_out_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_OUT,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = __constant_cpu_to_le16(512),
+};
+
+static struct usb_endpoint_descriptor mtp_fullspeed_in_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+};
+
+static struct usb_endpoint_descriptor mtp_fullspeed_out_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_OUT,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+};
+
+static struct usb_endpoint_descriptor mtp_intr_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_INT,
+ .wMaxPacketSize = __constant_cpu_to_le16(INTR_BUFFER_SIZE),
+ .bInterval = 6,
+};
+
+static struct usb_ss_ep_comp_descriptor mtp_intr_ss_comp_desc = {
+ .bLength = sizeof(mtp_intr_ss_comp_desc),
+ .bDescriptorType = USB_DT_SS_ENDPOINT_COMP,
+ .wBytesPerInterval = cpu_to_le16(INTR_BUFFER_SIZE),
+};
+
+static struct usb_descriptor_header *fs_mtp_descs[] = {
+ (struct usb_descriptor_header *) &mtp_interface_desc,
+ (struct usb_descriptor_header *) &mtp_fullspeed_in_desc,
+ (struct usb_descriptor_header *) &mtp_fullspeed_out_desc,
+ (struct usb_descriptor_header *) &mtp_intr_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *hs_mtp_descs[] = {
+ (struct usb_descriptor_header *) &mtp_interface_desc,
+ (struct usb_descriptor_header *) &mtp_highspeed_in_desc,
+ (struct usb_descriptor_header *) &mtp_highspeed_out_desc,
+ (struct usb_descriptor_header *) &mtp_intr_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *ss_mtp_descs[] = {
+ (struct usb_descriptor_header *) &mtp_interface_desc,
+ (struct usb_descriptor_header *) &mtp_ss_in_desc,
+ (struct usb_descriptor_header *) &mtp_ss_in_comp_desc,
+ (struct usb_descriptor_header *) &mtp_ss_out_desc,
+ (struct usb_descriptor_header *) &mtp_ss_out_comp_desc,
+ (struct usb_descriptor_header *) &mtp_intr_desc,
+ (struct usb_descriptor_header *) &mtp_intr_ss_comp_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *fs_ptp_descs[] = {
+ (struct usb_descriptor_header *) &ptp_interface_desc,
+ (struct usb_descriptor_header *) &mtp_fullspeed_in_desc,
+ (struct usb_descriptor_header *) &mtp_fullspeed_out_desc,
+ (struct usb_descriptor_header *) &mtp_intr_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *hs_ptp_descs[] = {
+ (struct usb_descriptor_header *) &ptp_interface_desc,
+ (struct usb_descriptor_header *) &mtp_highspeed_in_desc,
+ (struct usb_descriptor_header *) &mtp_highspeed_out_desc,
+ (struct usb_descriptor_header *) &mtp_intr_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *ss_ptp_descs[] = {
+ (struct usb_descriptor_header *) &ptp_interface_desc,
+ (struct usb_descriptor_header *) &mtp_ss_in_desc,
+ (struct usb_descriptor_header *) &mtp_ss_in_comp_desc,
+ (struct usb_descriptor_header *) &mtp_ss_out_desc,
+ (struct usb_descriptor_header *) &mtp_ss_out_comp_desc,
+ (struct usb_descriptor_header *) &mtp_intr_desc,
+ (struct usb_descriptor_header *) &mtp_intr_ss_comp_desc,
+ NULL,
+};
+
+static struct usb_string mtp_string_defs[] = {
+ /* Naming interface "MTP" so libmtp will recognize us */
+ [INTERFACE_STRING_INDEX].s = "MTP",
+ { }, /* end of list */
+};
+
+static struct usb_gadget_strings mtp_string_table = {
+ .language = 0x0409, /* en-US */
+ .strings = mtp_string_defs,
+};
+
+static struct usb_gadget_strings *mtp_strings[] = {
+ &mtp_string_table,
+ NULL,
+};
+
+/* Microsoft MTP OS String */
+static u8 mtp_os_string[] = {
+ 18, /* sizeof(mtp_os_string) */
+ USB_DT_STRING,
+ /* Signature field: "MSFT100" */
+ 'M', 0, 'S', 0, 'F', 0, 'T', 0, '1', 0, '0', 0, '0', 0,
+ /* vendor code */
+ 1,
+ /* padding */
+ 0
+};
+
+/* Microsoft Extended Configuration Descriptor Header Section */
+struct mtp_ext_config_desc_header {
+ __le32 dwLength;
+ __u16 bcdVersion;
+ __le16 wIndex;
+ __u8 bCount;
+ __u8 reserved[7];
+};
+
+/* Microsoft Extended Configuration Descriptor Function Section */
+struct mtp_ext_config_desc_function {
+ __u8 bFirstInterfaceNumber;
+ __u8 bInterfaceCount;
+ __u8 compatibleID[8];
+ __u8 subCompatibleID[8];
+ __u8 reserved[6];
+};
+
+/* MTP Extended Configuration Descriptor */
+struct {
+ struct mtp_ext_config_desc_header header;
+ struct mtp_ext_config_desc_function function;
+} mtp_ext_config_desc = {
+ .header = {
+ .dwLength = __constant_cpu_to_le32(sizeof(mtp_ext_config_desc)),
+ .bcdVersion = __constant_cpu_to_le16(0x0100),
+ .wIndex = __constant_cpu_to_le16(4),
+ .bCount = 1,
+ },
+ .function = {
+ .bFirstInterfaceNumber = 0,
+ .bInterfaceCount = 1,
+ .compatibleID = { 'M', 'T', 'P' },
+ },
+};
+
+struct mtp_device_status {
+ __le16 wLength;
+ __le16 wCode;
+};
+
+struct mtp_data_header {
+ /* length of packet, including this header */
+ __le32 length;
+ /* container type (2 for data packet) */
+ __le16 type;
+ /* MTP command code */
+ __le16 command;
+ /* MTP transaction ID */
+ __le32 transaction_id;
+};
+
+struct mtp_instance {
+ struct usb_function_instance func_inst;
+ const char *name;
+ struct mtp_dev *dev;
+ char mtp_ext_compat_id[16];
+ struct usb_os_desc mtp_os_desc;
+};
+
+/* temporary variable used between mtp_open() and mtp_gadget_bind() */
+static struct mtp_dev *_mtp_dev;
+
+static inline struct mtp_dev *func_to_mtp(struct usb_function *f)
+{
+ return container_of(f, struct mtp_dev, function);
+}
+
+static struct usb_request *mtp_request_new(struct usb_ep *ep, int buffer_size)
+{
+ struct usb_request *req = usb_ep_alloc_request(ep, GFP_KERNEL);
+
+ if (!req)
+ return NULL;
+
+ /* now allocate buffers for the requests */
+ req->buf = kmalloc(buffer_size, GFP_KERNEL);
+ if (!req->buf) {
+ usb_ep_free_request(ep, req);
+ return NULL;
+ }
+
+ return req;
+}
+
+static void mtp_request_free(struct usb_request *req, struct usb_ep *ep)
+{
+ if (req) {
+ kfree(req->buf);
+ usb_ep_free_request(ep, req);
+ }
+}
+
+static inline int mtp_lock(atomic_t *excl)
+{
+ if (atomic_inc_return(excl) == 1) {
+ return 0;
+ } else {
+ atomic_dec(excl);
+ return -1;
+ }
+}
+
+static inline void mtp_unlock(atomic_t *excl)
+{
+ atomic_dec(excl);
+}
+
+/* add a request to the tail of a list */
+static void mtp_req_put(struct mtp_dev *dev, struct list_head *head,
+ struct usb_request *req)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ list_add_tail(&req->list, head);
+ spin_unlock_irqrestore(&dev->lock, flags);
+}
+
+/* remove a request from the head of a list */
+static struct usb_request
+*mtp_req_get(struct mtp_dev *dev, struct list_head *head)
+{
+ unsigned long flags;
+ struct usb_request *req;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ if (list_empty(head)) {
+ req = 0;
+ } else {
+ req = list_first_entry(head, struct usb_request, list);
+ list_del(&req->list);
+ }
+ spin_unlock_irqrestore(&dev->lock, flags);
+ return req;
+}
+
+static void mtp_complete_in(struct usb_ep *ep, struct usb_request *req)
+{
+ struct mtp_dev *dev = _mtp_dev;
+
+ if (req->status != 0)
+ dev->state = STATE_ERROR;
+
+ mtp_req_put(dev, &dev->tx_idle, req);
+
+ wake_up(&dev->write_wq);
+}
+
+static void mtp_complete_out(struct usb_ep *ep, struct usb_request *req)
+{
+ struct mtp_dev *dev = _mtp_dev;
+
+ dev->rx_done = 1;
+ if (req->status != 0)
+ dev->state = STATE_ERROR;
+
+ wake_up(&dev->read_wq);
+}
+
+static void mtp_complete_intr(struct usb_ep *ep, struct usb_request *req)
+{
+ struct mtp_dev *dev = _mtp_dev;
+
+ if (req->status != 0)
+ dev->state = STATE_ERROR;
+
+ mtp_req_put(dev, &dev->intr_idle, req);
+
+ wake_up(&dev->intr_wq);
+}
+
+static int mtp_create_bulk_endpoints(struct mtp_dev *dev,
+ struct usb_endpoint_descriptor *in_desc,
+ struct usb_endpoint_descriptor *out_desc,
+ struct usb_endpoint_descriptor *intr_desc)
+{
+ struct usb_composite_dev *cdev = dev->cdev;
+ struct usb_request *req;
+ struct usb_ep *ep;
+ int i;
+
+ DBG(cdev, "create_bulk_endpoints dev: %p\n", dev);
+
+ ep = usb_ep_autoconfig(cdev->gadget, in_desc);
+ if (!ep) {
+ DBG(cdev, "usb_ep_autoconfig for ep_in failed\n");
+ return -ENODEV;
+ }
+ DBG(cdev, "usb_ep_autoconfig for ep_in got %s\n", ep->name);
+ ep->driver_data = dev; /* claim the endpoint */
+ dev->ep_in = ep;
+
+ ep = usb_ep_autoconfig(cdev->gadget, out_desc);
+ if (!ep) {
+ DBG(cdev, "usb_ep_autoconfig for ep_out failed\n");
+ return -ENODEV;
+ }
+ DBG(cdev, "usb_ep_autoconfig for mtp ep_out got %s\n", ep->name);
+ ep->driver_data = dev; /* claim the endpoint */
+ dev->ep_out = ep;
+
+ ep = usb_ep_autoconfig(cdev->gadget, intr_desc);
+ if (!ep) {
+ DBG(cdev, "usb_ep_autoconfig for ep_intr failed\n");
+ return -ENODEV;
+ }
+ DBG(cdev, "usb_ep_autoconfig for mtp ep_intr got %s\n", ep->name);
+ ep->driver_data = dev; /* claim the endpoint */
+ dev->ep_intr = ep;
+
+ /* now allocate requests for our endpoints */
+ for (i = 0; i < TX_REQ_MAX; i++) {
+ req = mtp_request_new(dev->ep_in, MTP_BULK_BUFFER_SIZE);
+ if (!req)
+ goto fail;
+ req->complete = mtp_complete_in;
+ mtp_req_put(dev, &dev->tx_idle, req);
+ }
+ for (i = 0; i < RX_REQ_MAX; i++) {
+ req = mtp_request_new(dev->ep_out, MTP_BULK_BUFFER_SIZE);
+ if (!req)
+ goto fail;
+ req->complete = mtp_complete_out;
+ dev->rx_req[i] = req;
+ }
+ for (i = 0; i < INTR_REQ_MAX; i++) {
+ req = mtp_request_new(dev->ep_intr, INTR_BUFFER_SIZE);
+ if (!req)
+ goto fail;
+ req->complete = mtp_complete_intr;
+ mtp_req_put(dev, &dev->intr_idle, req);
+ }
+
+ return 0;
+
+fail:
+ pr_err("mtp_bind() could not allocate requests\n");
+ return -1;
+}
+
+static ssize_t mtp_read(struct file *fp, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct mtp_dev *dev = fp->private_data;
+ struct usb_composite_dev *cdev = dev->cdev;
+ struct usb_request *req;
+ ssize_t r = count;
+ unsigned xfer;
+ int ret = 0;
+ size_t len = 0;
+
+ DBG(cdev, "mtp_read(%zu)\n", count);
+
+ /* we will block until we're online */
+ DBG(cdev, "mtp_read: waiting for online state\n");
+ ret = wait_event_interruptible(dev->read_wq,
+ dev->state != STATE_OFFLINE);
+ if (ret < 0) {
+ r = ret;
+ goto done;
+ }
+ spin_lock_irq(&dev->lock);
+ if (dev->ep_out->desc) {
+ len = usb_ep_align_maybe(cdev->gadget, dev->ep_out, count);
+ if (len > MTP_BULK_BUFFER_SIZE) {
+ spin_unlock_irq(&dev->lock);
+ return -EINVAL;
+ }
+ }
+
+ if (dev->state == STATE_CANCELED) {
+ /* report cancelation to userspace */
+ dev->state = STATE_READY;
+ spin_unlock_irq(&dev->lock);
+ return -ECANCELED;
+ }
+ dev->state = STATE_BUSY;
+ spin_unlock_irq(&dev->lock);
+
+requeue_req:
+ /* queue a request */
+ req = dev->rx_req[0];
+ req->length = len;
+ dev->rx_done = 0;
+ ret = usb_ep_queue(dev->ep_out, req, GFP_KERNEL);
+ if (ret < 0) {
+ r = -EIO;
+ goto done;
+ } else {
+ DBG(cdev, "rx %p queue\n", req);
+ }
+
+ /* wait for a request to complete */
+ ret = wait_event_interruptible(dev->read_wq, dev->rx_done);
+ if (ret < 0) {
+ r = ret;
+ usb_ep_dequeue(dev->ep_out, req);
+ goto done;
+ }
+ if (dev->state == STATE_BUSY) {
+ /* If we got a 0-len packet, throw it back and try again. */
+ if (req->actual == 0)
+ goto requeue_req;
+
+ DBG(cdev, "rx %p %d\n", req, req->actual);
+ xfer = (req->actual < count) ? req->actual : count;
+ r = xfer;
+ if (copy_to_user(buf, req->buf, xfer))
+ r = -EFAULT;
+ } else
+ r = -EIO;
+
+done:
+ spin_lock_irq(&dev->lock);
+ if (dev->state == STATE_CANCELED)
+ r = -ECANCELED;
+ else if (dev->state != STATE_OFFLINE)
+ dev->state = STATE_READY;
+ spin_unlock_irq(&dev->lock);
+
+ DBG(cdev, "mtp_read returning %zd\n", r);
+ return r;
+}
+
+static ssize_t mtp_write(struct file *fp, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct mtp_dev *dev = fp->private_data;
+ struct usb_composite_dev *cdev = dev->cdev;
+ struct usb_request *req = 0;
+ ssize_t r = count;
+ unsigned xfer;
+ int sendZLP = 0;
+ int ret;
+
+ DBG(cdev, "mtp_write(%zu)\n", count);
+
+ spin_lock_irq(&dev->lock);
+ if (dev->state == STATE_CANCELED) {
+ /* report cancelation to userspace */
+ dev->state = STATE_READY;
+ spin_unlock_irq(&dev->lock);
+ return -ECANCELED;
+ }
+ if (dev->state == STATE_OFFLINE) {
+ spin_unlock_irq(&dev->lock);
+ return -ENODEV;
+ }
+ dev->state = STATE_BUSY;
+ spin_unlock_irq(&dev->lock);
+
+ /* we need to send a zero length packet to signal the end of transfer
+ * if the transfer size is aligned to a packet boundary.
+ */
+ if ((count & (dev->ep_in->maxpacket - 1)) == 0)
+ sendZLP = 1;
+
+ while (count > 0 || sendZLP) {
+ /* so we exit after sending ZLP */
+ if (count == 0)
+ sendZLP = 0;
+
+ if (dev->state != STATE_BUSY) {
+ DBG(cdev, "mtp_write dev->error\n");
+ r = -EIO;
+ break;
+ }
+
+ /* get an idle tx request to use */
+ req = 0;
+ ret = wait_event_interruptible(dev->write_wq,
+ ((req = mtp_req_get(dev, &dev->tx_idle))
+ || dev->state != STATE_BUSY));
+ if (!req) {
+ r = ret;
+ break;
+ }
+
+ if (count > MTP_BULK_BUFFER_SIZE)
+ xfer = MTP_BULK_BUFFER_SIZE;
+ else
+ xfer = count;
+ if (xfer && copy_from_user(req->buf, buf, xfer)) {
+ r = -EFAULT;
+ break;
+ }
+
+ req->length = xfer;
+ ret = usb_ep_queue(dev->ep_in, req, GFP_KERNEL);
+ if (ret < 0) {
+ DBG(cdev, "mtp_write: xfer error %d\n", ret);
+ r = -EIO;
+ break;
+ }
+
+ buf += xfer;
+ count -= xfer;
+
+ /* zero this so we don't try to free it on error exit */
+ req = 0;
+ }
+
+ if (req)
+ mtp_req_put(dev, &dev->tx_idle, req);
+
+ spin_lock_irq(&dev->lock);
+ if (dev->state == STATE_CANCELED)
+ r = -ECANCELED;
+ else if (dev->state != STATE_OFFLINE)
+ dev->state = STATE_READY;
+ spin_unlock_irq(&dev->lock);
+
+ DBG(cdev, "mtp_write returning %zd\n", r);
+ return r;
+}
+
+/* read from a local file and write to USB */
+static void send_file_work(struct work_struct *data)
+{
+ struct mtp_dev *dev = container_of(data, struct mtp_dev,
+ send_file_work);
+ struct usb_composite_dev *cdev = dev->cdev;
+ struct usb_request *req = 0;
+ struct mtp_data_header *header;
+ struct file *filp;
+ loff_t offset;
+ int64_t count;
+ int xfer, ret, hdr_size;
+ int r = 0;
+ int sendZLP = 0;
+
+ /* read our parameters */
+ smp_rmb();
+ filp = dev->xfer_file;
+ offset = dev->xfer_file_offset;
+ count = dev->xfer_file_length;
+
+ if (count < 0) {
+ dev->xfer_result = -EINVAL;
+ return;
+ }
+
+ DBG(cdev, "send_file_work(%lld %lld)\n", offset, count);
+
+ if (dev->xfer_send_header) {
+ hdr_size = sizeof(struct mtp_data_header);
+ count += hdr_size;
+ } else {
+ hdr_size = 0;
+ }
+
+ /* we need to send a zero length packet to signal the end of transfer
+ * if the transfer size is aligned to a packet boundary.
+ */
+ if ((count & (dev->ep_in->maxpacket - 1)) == 0)
+ sendZLP = 1;
+
+ while (count > 0 || sendZLP) {
+ /* so we exit after sending ZLP */
+ if (count == 0)
+ sendZLP = 0;
+
+ /* get an idle tx request to use */
+ req = 0;
+ ret = wait_event_interruptible(dev->write_wq,
+ (req = mtp_req_get(dev, &dev->tx_idle))
+ || dev->state != STATE_BUSY);
+ if (dev->state == STATE_CANCELED) {
+ r = -ECANCELED;
+ break;
+ }
+ if (!req) {
+ r = ret;
+ break;
+ }
+
+ if (count > MTP_BULK_BUFFER_SIZE)
+ xfer = MTP_BULK_BUFFER_SIZE;
+ else
+ xfer = count;
+
+ if (hdr_size) {
+ /* prepend MTP data header */
+ header = (struct mtp_data_header *)req->buf;
+ /*
+ * set file size with header according to
+ * MTP Specification v1.0
+ */
+ header->length = (count > MTP_MAX_FILE_SIZE) ?
+ MTP_MAX_FILE_SIZE : __cpu_to_le32(count);
+ header->type = __cpu_to_le16(2); /* data packet */
+ header->command = __cpu_to_le16(dev->xfer_command);
+ header->transaction_id =
+ __cpu_to_le32(dev->xfer_transaction_id);
+ }
+
+ ret = vfs_read(filp, req->buf + hdr_size, xfer - hdr_size,
+ &offset);
+ if (ret < 0) {
+ r = ret;
+ break;
+ }
+ xfer = ret + hdr_size;
+ hdr_size = 0;
+
+ req->length = xfer;
+ ret = usb_ep_queue(dev->ep_in, req, GFP_KERNEL);
+ if (ret < 0) {
+ DBG(cdev, "send_file_work: xfer error %d\n", ret);
+ dev->state = STATE_ERROR;
+ r = -EIO;
+ break;
+ }
+
+ count -= xfer;
+
+ /* zero this so we don't try to free it on error exit */
+ req = 0;
+ }
+
+ if (req)
+ mtp_req_put(dev, &dev->tx_idle, req);
+
+ DBG(cdev, "send_file_work returning %d\n", r);
+ /* write the result */
+ dev->xfer_result = r;
+ smp_wmb();
+}
+
+/* read from USB and write to a local file */
+static void receive_file_work(struct work_struct *data)
+{
+ struct mtp_dev *dev = container_of(data, struct mtp_dev,
+ receive_file_work);
+ struct usb_composite_dev *cdev = dev->cdev;
+ struct usb_request *read_req = NULL, *write_req = NULL;
+ struct file *filp;
+ loff_t offset;
+ int64_t count, len;
+ int ret, cur_buf = 0;
+ int r = 0;
+
+ /* read our parameters */
+ smp_rmb();
+ filp = dev->xfer_file;
+ offset = dev->xfer_file_offset;
+ count = dev->xfer_file_length;
+
+ if (count < 0) {
+ dev->xfer_result = -EINVAL;
+ return;
+ }
+
+ DBG(cdev, "receive_file_work(%lld)\n", count);
+
+ while (count > 0 || write_req) {
+ if (count > 0) {
+ /* queue a request */
+ read_req = dev->rx_req[cur_buf];
+ cur_buf = (cur_buf + 1) % RX_REQ_MAX;
+
+ len = usb_ep_align_maybe(cdev->gadget, dev->ep_out, count);
+ if (len > MTP_BULK_BUFFER_SIZE)
+ len = MTP_BULK_BUFFER_SIZE;
+ read_req->length = len;
+ dev->rx_done = 0;
+ ret = usb_ep_queue(dev->ep_out, read_req, GFP_KERNEL);
+ if (ret < 0) {
+ r = -EIO;
+ dev->state = STATE_ERROR;
+ break;
+ }
+ }
+
+ if (write_req) {
+ DBG(cdev, "rx %p %d\n", write_req, write_req->actual);
+ ret = vfs_write(filp, write_req->buf, write_req->actual,
+ &offset);
+ DBG(cdev, "vfs_write %d\n", ret);
+ if (ret != write_req->actual) {
+ r = -EIO;
+ dev->state = STATE_ERROR;
+ break;
+ }
+ write_req = NULL;
+ }
+
+ if (read_req) {
+ /* wait for our last read to complete */
+ ret = wait_event_interruptible(dev->read_wq,
+ dev->rx_done || dev->state != STATE_BUSY);
+ if (dev->state == STATE_CANCELED) {
+ r = -ECANCELED;
+ if (!dev->rx_done)
+ usb_ep_dequeue(dev->ep_out, read_req);
+ break;
+ }
+ if (read_req->status) {
+ r = read_req->status;
+ break;
+ }
+ /* if xfer_file_length is 0xFFFFFFFF, then we read until
+ * we get a zero length packet
+ */
+ if (count != 0xFFFFFFFF)
+ count -= read_req->actual;
+ if (read_req->actual < read_req->length) {
+ /*
+ * short packet is used to signal EOF for
+ * sizes > 4 gig
+ */
+ DBG(cdev, "got short packet\n");
+ count = 0;
+ }
+
+ write_req = read_req;
+ read_req = NULL;
+ }
+ }
+
+ DBG(cdev, "receive_file_work returning %d\n", r);
+ /* write the result */
+ dev->xfer_result = r;
+ smp_wmb();
+}
+
+static int mtp_send_event(struct mtp_dev *dev, struct mtp_event *event)
+{
+ struct usb_request *req = NULL;
+ int ret;
+ int length = event->length;
+
+ DBG(dev->cdev, "mtp_send_event(%zu)\n", event->length);
+
+ if (length < 0 || length > INTR_BUFFER_SIZE)
+ return -EINVAL;
+ if (dev->state == STATE_OFFLINE)
+ return -ENODEV;
+
+ ret = wait_event_interruptible_timeout(dev->intr_wq,
+ (req = mtp_req_get(dev, &dev->intr_idle)),
+ msecs_to_jiffies(1000));
+ if (!req)
+ return -ETIME;
+
+ if (copy_from_user(req->buf, (void __user *)event->data, length)) {
+ mtp_req_put(dev, &dev->intr_idle, req);
+ return -EFAULT;
+ }
+ req->length = length;
+ ret = usb_ep_queue(dev->ep_intr, req, GFP_KERNEL);
+ if (ret)
+ mtp_req_put(dev, &dev->intr_idle, req);
+
+ return ret;
+}
+
+static long mtp_ioctl(struct file *fp, unsigned code, unsigned long value)
+{
+ struct mtp_dev *dev = fp->private_data;
+ struct file *filp = NULL;
+ int ret = -EINVAL;
+
+ if (mtp_lock(&dev->ioctl_excl))
+ return -EBUSY;
+
+ switch (code) {
+ case MTP_SEND_FILE:
+ case MTP_RECEIVE_FILE:
+ case MTP_SEND_FILE_WITH_HEADER:
+ {
+ struct mtp_file_range mfr;
+ struct work_struct *work;
+
+ spin_lock_irq(&dev->lock);
+ if (dev->state == STATE_CANCELED) {
+ /* report cancelation to userspace */
+ dev->state = STATE_READY;
+ spin_unlock_irq(&dev->lock);
+ ret = -ECANCELED;
+ goto out;
+ }
+ if (dev->state == STATE_OFFLINE) {
+ spin_unlock_irq(&dev->lock);
+ ret = -ENODEV;
+ goto out;
+ }
+ dev->state = STATE_BUSY;
+ spin_unlock_irq(&dev->lock);
+
+ if (copy_from_user(&mfr, (void __user *)value, sizeof(mfr))) {
+ ret = -EFAULT;
+ goto fail;
+ }
+ /* hold a reference to the file while we are working with it */
+ filp = fget(mfr.fd);
+ if (!filp) {
+ ret = -EBADF;
+ goto fail;
+ }
+
+ /* write the parameters */
+ dev->xfer_file = filp;
+ dev->xfer_file_offset = mfr.offset;
+ dev->xfer_file_length = mfr.length;
+ smp_wmb();
+
+ if (code == MTP_SEND_FILE_WITH_HEADER) {
+ work = &dev->send_file_work;
+ dev->xfer_send_header = 1;
+ dev->xfer_command = mfr.command;
+ dev->xfer_transaction_id = mfr.transaction_id;
+ } else if (code == MTP_SEND_FILE) {
+ work = &dev->send_file_work;
+ dev->xfer_send_header = 0;
+ } else {
+ work = &dev->receive_file_work;
+ }
+
+ /* We do the file transfer on a work queue so it will run
+ * in kernel context, which is necessary for vfs_read and
+ * vfs_write to use our buffers in the kernel address space.
+ */
+ queue_work(dev->wq, work);
+ /* wait for operation to complete */
+ flush_workqueue(dev->wq);
+ fput(filp);
+
+ /* read the result */
+ smp_rmb();
+ ret = dev->xfer_result;
+ break;
+ }
+ case MTP_SEND_EVENT:
+ {
+ struct mtp_event event;
+ /* return here so we don't change dev->state below,
+ * which would interfere with bulk transfer state.
+ */
+ if (copy_from_user(&event, (void __user *)value, sizeof(event)))
+ ret = -EFAULT;
+ else
+ ret = mtp_send_event(dev, &event);
+ goto out;
+ }
+ }
+
+fail:
+ spin_lock_irq(&dev->lock);
+ if (dev->state == STATE_CANCELED)
+ ret = -ECANCELED;
+ else if (dev->state != STATE_OFFLINE)
+ dev->state = STATE_READY;
+ spin_unlock_irq(&dev->lock);
+out:
+ mtp_unlock(&dev->ioctl_excl);
+ DBG(dev->cdev, "ioctl returning %d\n", ret);
+ return ret;
+}
+
+static int mtp_open(struct inode *ip, struct file *fp)
+{
+ printk(KERN_INFO "mtp_open\n");
+ if (mtp_lock(&_mtp_dev->open_excl))
+ return -EBUSY;
+
+ /* clear any error condition */
+ if (_mtp_dev->state != STATE_OFFLINE)
+ _mtp_dev->state = STATE_READY;
+
+ fp->private_data = _mtp_dev;
+ return 0;
+}
+
+static int mtp_release(struct inode *ip, struct file *fp)
+{
+ printk(KERN_INFO "mtp_release\n");
+
+ mtp_unlock(&_mtp_dev->open_excl);
+ return 0;
+}
+
+/* file operations for /dev/mtp_usb */
+static const struct file_operations mtp_fops = {
+ .owner = THIS_MODULE,
+ .read = mtp_read,
+ .write = mtp_write,
+ .unlocked_ioctl = mtp_ioctl,
+ .open = mtp_open,
+ .release = mtp_release,
+};
+
+static struct miscdevice mtp_device = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = mtp_shortname,
+ .fops = &mtp_fops,
+};
+
+static int mtp_ctrlrequest(struct usb_composite_dev *cdev,
+ const struct usb_ctrlrequest *ctrl)
+{
+ struct mtp_dev *dev = _mtp_dev;
+ int value = -EOPNOTSUPP;
+ u16 w_index = le16_to_cpu(ctrl->wIndex);
+ u16 w_value = le16_to_cpu(ctrl->wValue);
+ u16 w_length = le16_to_cpu(ctrl->wLength);
+ unsigned long flags;
+
+ VDBG(cdev, "mtp_ctrlrequest "
+ "%02x.%02x v%04x i%04x l%u\n",
+ ctrl->bRequestType, ctrl->bRequest,
+ w_value, w_index, w_length);
+
+ /* Handle MTP OS string */
+ if (ctrl->bRequestType ==
+ (USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE)
+ && ctrl->bRequest == USB_REQ_GET_DESCRIPTOR
+ && (w_value >> 8) == USB_DT_STRING
+ && (w_value & 0xFF) == MTP_OS_STRING_ID) {
+ value = (w_length < sizeof(mtp_os_string)
+ ? w_length : sizeof(mtp_os_string));
+ memcpy(cdev->req->buf, mtp_os_string, value);
+ } else if ((ctrl->bRequestType & USB_TYPE_MASK) == USB_TYPE_VENDOR) {
+ /* Handle MTP OS descriptor */
+ DBG(cdev, "vendor request: %d index: %d value: %d length: %d\n",
+ ctrl->bRequest, w_index, w_value, w_length);
+
+ if (ctrl->bRequest == 1
+ && (ctrl->bRequestType & USB_DIR_IN)
+ && (w_index == 4 || w_index == 5)) {
+ value = (w_length < sizeof(mtp_ext_config_desc) ?
+ w_length : sizeof(mtp_ext_config_desc));
+ memcpy(cdev->req->buf, &mtp_ext_config_desc, value);
+ }
+ } else if ((ctrl->bRequestType & USB_TYPE_MASK) == USB_TYPE_CLASS) {
+ DBG(cdev, "class request: %d index: %d value: %d length: %d\n",
+ ctrl->bRequest, w_index, w_value, w_length);
+
+ if (ctrl->bRequest == MTP_REQ_CANCEL && w_index == 0
+ && w_value == 0) {
+ DBG(cdev, "MTP_REQ_CANCEL\n");
+
+ spin_lock_irqsave(&dev->lock, flags);
+ if (dev->state == STATE_BUSY) {
+ dev->state = STATE_CANCELED;
+ wake_up(&dev->read_wq);
+ wake_up(&dev->write_wq);
+ }
+ spin_unlock_irqrestore(&dev->lock, flags);
+
+ /* We need to queue a request to read the remaining
+ * bytes, but we don't actually need to look at
+ * the contents.
+ */
+ value = w_length;
+ } else if (ctrl->bRequest == MTP_REQ_GET_DEVICE_STATUS
+ && w_index == 0 && w_value == 0) {
+ struct mtp_device_status *status = cdev->req->buf;
+
+ status->wLength =
+ __constant_cpu_to_le16(sizeof(*status));
+
+ DBG(cdev, "MTP_REQ_GET_DEVICE_STATUS\n");
+ spin_lock_irqsave(&dev->lock, flags);
+ /* device status is "busy" until we report
+ * the cancelation to userspace
+ */
+ if (dev->state == STATE_CANCELED)
+ status->wCode =
+ __cpu_to_le16(MTP_RESPONSE_DEVICE_BUSY);
+ else
+ status->wCode =
+ __cpu_to_le16(MTP_RESPONSE_OK);
+ spin_unlock_irqrestore(&dev->lock, flags);
+ value = sizeof(*status);
+ }
+ }
+
+ /* respond with data transfer or status phase? */
+ if (value >= 0) {
+ int rc;
+
+ cdev->req->zero = value < w_length;
+ cdev->req->length = value;
+ rc = usb_ep_queue(cdev->gadget->ep0, cdev->req, GFP_ATOMIC);
+ if (rc < 0)
+ ERROR(cdev, "%s: response queue error\n", __func__);
+ }
+ return value;
+}
+
+static int
+mtp_function_bind(struct usb_configuration *c, struct usb_function *f)
+{
+ struct usb_composite_dev *cdev = c->cdev;
+ struct mtp_dev *dev = func_to_mtp(f);
+ int id;
+ int ret;
+ struct mtp_instance *fi_mtp;
+
+ dev->cdev = cdev;
+ DBG(cdev, "mtp_function_bind dev: %p\n", dev);
+
+ /* allocate interface ID(s) */
+ id = usb_interface_id(c, f);
+ if (id < 0)
+ return id;
+ mtp_interface_desc.bInterfaceNumber = id;
+
+ if (mtp_string_defs[INTERFACE_STRING_INDEX].id == 0) {
+ ret = usb_string_id(c->cdev);
+ if (ret < 0)
+ return ret;
+ mtp_string_defs[INTERFACE_STRING_INDEX].id = ret;
+ mtp_interface_desc.iInterface = ret;
+ }
+
+ fi_mtp = container_of(f->fi, struct mtp_instance, func_inst);
+
+ if (cdev->use_os_string) {
+ f->os_desc_table = kzalloc(sizeof(*f->os_desc_table),
+ GFP_KERNEL);
+ if (!f->os_desc_table)
+ return -ENOMEM;
+ f->os_desc_n = 1;
+ f->os_desc_table[0].os_desc = &fi_mtp->mtp_os_desc;
+ }
+
+ /* allocate endpoints */
+ ret = mtp_create_bulk_endpoints(dev, &mtp_fullspeed_in_desc,
+ &mtp_fullspeed_out_desc, &mtp_intr_desc);
+ if (ret)
+ return ret;
+
+ /* support high speed hardware */
+ if (gadget_is_dualspeed(c->cdev->gadget)) {
+ mtp_highspeed_in_desc.bEndpointAddress =
+ mtp_fullspeed_in_desc.bEndpointAddress;
+ mtp_highspeed_out_desc.bEndpointAddress =
+ mtp_fullspeed_out_desc.bEndpointAddress;
+ }
+ /* support super speed hardware */
+ if (gadget_is_superspeed(c->cdev->gadget)) {
+ unsigned max_burst;
+
+ /* Calculate bMaxBurst, we know packet size is 1024 */
+ max_burst = min_t(unsigned, MTP_BULK_BUFFER_SIZE / 1024, 15);
+ mtp_ss_in_desc.bEndpointAddress =
+ mtp_fullspeed_in_desc.bEndpointAddress;
+ mtp_ss_in_comp_desc.bMaxBurst = max_burst;
+ mtp_ss_out_desc.bEndpointAddress =
+ mtp_fullspeed_out_desc.bEndpointAddress;
+ mtp_ss_out_comp_desc.bMaxBurst = max_burst;
+ }
+
+ DBG(cdev, "%s speed %s: IN/%s, OUT/%s\n",
+ gadget_is_superspeed(c->cdev->gadget) ? "super" :
+ (gadget_is_dualspeed(c->cdev->gadget) ? "dual" : "full"),
+ f->name, dev->ep_in->name, dev->ep_out->name);
+ return 0;
+}
+
+static void
+mtp_function_unbind(struct usb_configuration *c, struct usb_function *f)
+{
+ struct mtp_dev *dev = func_to_mtp(f);
+ struct usb_request *req;
+ int i;
+
+ mtp_string_defs[INTERFACE_STRING_INDEX].id = 0;
+ while ((req = mtp_req_get(dev, &dev->tx_idle)))
+ mtp_request_free(req, dev->ep_in);
+ for (i = 0; i < RX_REQ_MAX; i++)
+ mtp_request_free(dev->rx_req[i], dev->ep_out);
+ while ((req = mtp_req_get(dev, &dev->intr_idle)))
+ mtp_request_free(req, dev->ep_intr);
+ dev->state = STATE_OFFLINE;
+ kfree(f->os_desc_table);
+ f->os_desc_n = 0;
+}
+
+static int mtp_function_set_alt(struct usb_function *f,
+ unsigned intf, unsigned alt)
+{
+ struct mtp_dev *dev = func_to_mtp(f);
+ struct usb_composite_dev *cdev = f->config->cdev;
+ int ret;
+
+ DBG(cdev, "mtp_function_set_alt intf: %d alt: %d\n", intf, alt);
+
+ ret = config_ep_by_speed(cdev->gadget, f, dev->ep_in);
+ if (ret)
+ return ret;
+
+ ret = usb_ep_enable(dev->ep_in);
+ if (ret)
+ return ret;
+
+ ret = config_ep_by_speed(cdev->gadget, f, dev->ep_out);
+ if (ret)
+ return ret;
+
+ ret = usb_ep_enable(dev->ep_out);
+ if (ret) {
+ usb_ep_disable(dev->ep_in);
+ return ret;
+ }
+
+ ret = config_ep_by_speed(cdev->gadget, f, dev->ep_intr);
+ if (ret)
+ return ret;
+
+ ret = usb_ep_enable(dev->ep_intr);
+ if (ret) {
+ usb_ep_disable(dev->ep_out);
+ usb_ep_disable(dev->ep_in);
+ return ret;
+ }
+ dev->state = STATE_READY;
+
+ /* readers may be blocked waiting for us to go online */
+ wake_up(&dev->read_wq);
+ return 0;
+}
+
+static void mtp_function_disable(struct usb_function *f)
+{
+ struct mtp_dev *dev = func_to_mtp(f);
+ struct usb_composite_dev *cdev = dev->cdev;
+
+ DBG(cdev, "mtp_function_disable\n");
+ dev->state = STATE_OFFLINE;
+ usb_ep_disable(dev->ep_in);
+ usb_ep_disable(dev->ep_out);
+ usb_ep_disable(dev->ep_intr);
+
+ /* readers may be blocked waiting for us to go online */
+ wake_up(&dev->read_wq);
+
+ VDBG(cdev, "%s disabled\n", dev->function.name);
+}
+
+static int __mtp_setup(struct mtp_instance *fi_mtp)
+{
+ struct mtp_dev *dev;
+ int ret;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+
+ if (fi_mtp != NULL)
+ fi_mtp->dev = dev;
+
+ if (!dev)
+ return -ENOMEM;
+
+ spin_lock_init(&dev->lock);
+ init_waitqueue_head(&dev->read_wq);
+ init_waitqueue_head(&dev->write_wq);
+ init_waitqueue_head(&dev->intr_wq);
+ atomic_set(&dev->open_excl, 0);
+ atomic_set(&dev->ioctl_excl, 0);
+ INIT_LIST_HEAD(&dev->tx_idle);
+ INIT_LIST_HEAD(&dev->intr_idle);
+
+ dev->wq = create_singlethread_workqueue("f_mtp");
+ if (!dev->wq) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+ INIT_WORK(&dev->send_file_work, send_file_work);
+ INIT_WORK(&dev->receive_file_work, receive_file_work);
+
+ _mtp_dev = dev;
+
+ ret = misc_register(&mtp_device);
+ if (ret)
+ goto err2;
+
+ return 0;
+
+err2:
+ destroy_workqueue(dev->wq);
+err1:
+ _mtp_dev = NULL;
+ kfree(dev);
+ printk(KERN_ERR "mtp gadget driver failed to initialize\n");
+ return ret;
+}
+
+static int mtp_setup_configfs(struct mtp_instance *fi_mtp)
+{
+ return __mtp_setup(fi_mtp);
+}
+
+
+static void mtp_cleanup(void)
+{
+ struct mtp_dev *dev = _mtp_dev;
+
+ if (!dev)
+ return;
+
+ misc_deregister(&mtp_device);
+ destroy_workqueue(dev->wq);
+ _mtp_dev = NULL;
+ kfree(dev);
+}
+
+static struct mtp_instance *to_mtp_instance(struct config_item *item)
+{
+ return container_of(to_config_group(item), struct mtp_instance,
+ func_inst.group);
+}
+
+static void mtp_attr_release(struct config_item *item)
+{
+ struct mtp_instance *fi_mtp = to_mtp_instance(item);
+
+ usb_put_function_instance(&fi_mtp->func_inst);
+}
+
+static struct configfs_item_operations mtp_item_ops = {
+ .release = mtp_attr_release,
+};
+
+static struct config_item_type mtp_func_type = {
+ .ct_item_ops = &mtp_item_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+
+static struct mtp_instance *to_fi_mtp(struct usb_function_instance *fi)
+{
+ return container_of(fi, struct mtp_instance, func_inst);
+}
+
+static int mtp_set_inst_name(struct usb_function_instance *fi, const char *name)
+{
+ struct mtp_instance *fi_mtp;
+ char *ptr;
+ int name_len;
+
+ name_len = strlen(name) + 1;
+ if (name_len > MAX_INST_NAME_LEN)
+ return -ENAMETOOLONG;
+
+ ptr = kstrndup(name, name_len, GFP_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
+
+ fi_mtp = to_fi_mtp(fi);
+ fi_mtp->name = ptr;
+
+ return 0;
+}
+
+static void mtp_free_inst(struct usb_function_instance *fi)
+{
+ struct mtp_instance *fi_mtp;
+
+ fi_mtp = to_fi_mtp(fi);
+ kfree(fi_mtp->name);
+ mtp_cleanup();
+ kfree(fi_mtp);
+}
+
+struct usb_function_instance *alloc_inst_mtp_ptp(bool mtp_config)
+{
+ struct mtp_instance *fi_mtp;
+ int ret = 0;
+ struct usb_os_desc *descs[1];
+ char *names[1];
+
+ fi_mtp = kzalloc(sizeof(*fi_mtp), GFP_KERNEL);
+ if (!fi_mtp)
+ return ERR_PTR(-ENOMEM);
+ fi_mtp->func_inst.set_inst_name = mtp_set_inst_name;
+ fi_mtp->func_inst.free_func_inst = mtp_free_inst;
+
+ fi_mtp->mtp_os_desc.ext_compat_id = fi_mtp->mtp_ext_compat_id;
+ INIT_LIST_HEAD(&fi_mtp->mtp_os_desc.ext_prop);
+ descs[0] = &fi_mtp->mtp_os_desc;
+ names[0] = "MTP";
+
+ if (mtp_config) {
+ ret = mtp_setup_configfs(fi_mtp);
+ if (ret) {
+ kfree(fi_mtp);
+ pr_err("Error setting MTP\n");
+ return ERR_PTR(ret);
+ }
+ } else
+ fi_mtp->dev = _mtp_dev;
+
+ config_group_init_type_name(&fi_mtp->func_inst.group,
+ "", &mtp_func_type);
+ usb_os_desc_prepare_interf_dir(&fi_mtp->func_inst.group, 1,
+ descs, names, THIS_MODULE);
+
+ return &fi_mtp->func_inst;
+}
+EXPORT_SYMBOL_GPL(alloc_inst_mtp_ptp);
+
+static struct usb_function_instance *mtp_alloc_inst(void)
+{
+ return alloc_inst_mtp_ptp(true);
+}
+
+static int mtp_ctrlreq_configfs(struct usb_function *f,
+ const struct usb_ctrlrequest *ctrl)
+{
+ return mtp_ctrlrequest(f->config->cdev, ctrl);
+}
+
+static void mtp_free(struct usb_function *f)
+{
+ /*NO-OP: no function specific resource allocation in mtp_alloc*/
+}
+
+struct usb_function *function_alloc_mtp_ptp(struct usb_function_instance *fi,
+ bool mtp_config)
+{
+ struct mtp_instance *fi_mtp = to_fi_mtp(fi);
+ struct mtp_dev *dev;
+
+ /*
+ * PTP piggybacks on MTP function so make sure we have
+ * created MTP function before we associate this PTP
+ * function with a gadget configuration.
+ */
+ if (fi_mtp->dev == NULL) {
+ pr_err("Error: Create MTP function before linking"
+ " PTP function with a gadget configuration\n");
+ pr_err("\t1: Delete existing PTP function if any\n");
+ pr_err("\t2: Create MTP function\n");
+ pr_err("\t3: Create and symlink PTP function"
+ " with a gadget configuration\n");
+ return ERR_PTR(-EINVAL); /* Invalid Configuration */
+ }
+
+ dev = fi_mtp->dev;
+ dev->function.name = DRIVER_NAME;
+ dev->function.strings = mtp_strings;
+ if (mtp_config) {
+ dev->function.fs_descriptors = fs_mtp_descs;
+ dev->function.hs_descriptors = hs_mtp_descs;
+ dev->function.ss_descriptors = ss_mtp_descs;
+ } else {
+ dev->function.fs_descriptors = fs_ptp_descs;
+ dev->function.hs_descriptors = hs_ptp_descs;
+ dev->function.ss_descriptors = ss_ptp_descs;
+ }
+ dev->function.bind = mtp_function_bind;
+ dev->function.unbind = mtp_function_unbind;
+ dev->function.set_alt = mtp_function_set_alt;
+ dev->function.disable = mtp_function_disable;
+ dev->function.setup = mtp_ctrlreq_configfs;
+ dev->function.free_func = mtp_free;
+
+ return &dev->function;
+}
+EXPORT_SYMBOL_GPL(function_alloc_mtp_ptp);
+
+static struct usb_function *mtp_alloc(struct usb_function_instance *fi)
+{
+ return function_alloc_mtp_ptp(fi, true);
+}
+
+DECLARE_USB_FUNCTION_INIT(mtp, mtp_alloc_inst, mtp_alloc);
+MODULE_LICENSE("GPL");
diff --git a/drivers/usb/gadget/function/f_mtp.h b/drivers/usb/gadget/function/f_mtp.h
new file mode 100644
index 000000000000..7adb1ff08eff
--- /dev/null
+++ b/drivers/usb/gadget/function/f_mtp.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ * Author: Badhri Jagan Sridharan <badhri@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+extern struct usb_function_instance *alloc_inst_mtp_ptp(bool mtp_config);
+extern struct usb_function *function_alloc_mtp_ptp(
+ struct usb_function_instance *fi, bool mtp_config);
diff --git a/drivers/usb/gadget/function/f_ptp.c b/drivers/usb/gadget/function/f_ptp.c
new file mode 100644
index 000000000000..da3e4d53e085
--- /dev/null
+++ b/drivers/usb/gadget/function/f_ptp.c
@@ -0,0 +1,38 @@
+/*
+ * Gadget Function Driver for PTP
+ *
+ * Copyright (C) 2014 Google, Inc.
+ * Author: Badhri Jagan Sridharan <badhri@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include <linux/configfs.h>
+#include <linux/usb/composite.h>
+
+#include "f_mtp.h"
+
+static struct usb_function_instance *ptp_alloc_inst(void)
+{
+ return alloc_inst_mtp_ptp(false);
+}
+
+static struct usb_function *ptp_alloc(struct usb_function_instance *fi)
+{
+ return function_alloc_mtp_ptp(fi, false);
+}
+
+DECLARE_USB_FUNCTION_INIT(ptp, ptp_alloc_inst, ptp_alloc);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Badhri Jagan Sridharan");
diff --git a/drivers/usb/phy/Kconfig b/drivers/usb/phy/Kconfig
index 125cea1c3c8d..e47bb366877d 100644
--- a/drivers/usb/phy/Kconfig
+++ b/drivers/usb/phy/Kconfig
@@ -6,6 +6,14 @@ menu "USB Physical Layer drivers"
config USB_PHY
def_bool n
+config USB_OTG_WAKELOCK
+ bool "Hold a wakelock when USB connected"
+ depends on PM_WAKELOCKS
+ select USB_OTG_UTILS
+ help
+ Select this to automatically hold a wakelock when USB is
+ connected, preventing suspend.
+
#
# USB Transceiver Drivers
#
@@ -209,4 +217,13 @@ config USB_ULPI_VIEWPORT
Provides read/write operations to the ULPI phy register set for
controllers with a viewport register (e.g. Chipidea/ARC controllers).
+config DUAL_ROLE_USB_INTF
+ bool "Generic DUAL ROLE sysfs interface"
+ depends on SYSFS && USB_PHY
+ help
+ A generic sysfs interface to track and change the state of
+ dual role usb phys. The usb phy drivers can register to
+ this interface to expose it capabilities to the userspace
+ and thereby allowing userspace to change the port mode.
+
endmenu
diff --git a/drivers/usb/phy/Makefile b/drivers/usb/phy/Makefile
index b433e5d89be4..f65ac3e1fc07 100644
--- a/drivers/usb/phy/Makefile
+++ b/drivers/usb/phy/Makefile
@@ -3,6 +3,8 @@
#
obj-$(CONFIG_USB_PHY) += phy.o
obj-$(CONFIG_OF) += of.o
+obj-$(CONFIG_USB_OTG_WAKELOCK) += otg-wakelock.o
+obj-$(CONFIG_DUAL_ROLE_USB_INTF) += class-dual-role.o
# transceiver drivers, keep the list sorted
diff --git a/drivers/usb/phy/class-dual-role.c b/drivers/usb/phy/class-dual-role.c
new file mode 100644
index 000000000000..51fcb545a9d5
--- /dev/null
+++ b/drivers/usb/phy/class-dual-role.c
@@ -0,0 +1,529 @@
+/*
+ * class-dual-role.c
+ *
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/usb/class-dual-role.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/types.h>
+
+#define DUAL_ROLE_NOTIFICATION_TIMEOUT 2000
+
+static ssize_t dual_role_store_property(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count);
+static ssize_t dual_role_show_property(struct device *dev,
+ struct device_attribute *attr,
+ char *buf);
+
+#define DUAL_ROLE_ATTR(_name) \
+{ \
+ .attr = { .name = #_name }, \
+ .show = dual_role_show_property, \
+ .store = dual_role_store_property, \
+}
+
+static struct device_attribute dual_role_attrs[] = {
+ DUAL_ROLE_ATTR(supported_modes),
+ DUAL_ROLE_ATTR(mode),
+ DUAL_ROLE_ATTR(power_role),
+ DUAL_ROLE_ATTR(data_role),
+ DUAL_ROLE_ATTR(powers_vconn),
+};
+
+struct class *dual_role_class;
+EXPORT_SYMBOL_GPL(dual_role_class);
+
+static struct device_type dual_role_dev_type;
+
+static char *kstrdupcase(const char *str, gfp_t gfp, bool to_upper)
+{
+ char *ret, *ustr;
+
+ ustr = ret = kmalloc(strlen(str) + 1, gfp);
+
+ if (!ret)
+ return NULL;
+
+ while (*str)
+ *ustr++ = to_upper ? toupper(*str++) : tolower(*str++);
+
+ *ustr = 0;
+
+ return ret;
+}
+
+static void dual_role_changed_work(struct work_struct *work)
+{
+ struct dual_role_phy_instance *dual_role =
+ container_of(work, struct dual_role_phy_instance,
+ changed_work);
+
+ dev_dbg(&dual_role->dev, "%s\n", __func__);
+ kobject_uevent(&dual_role->dev.kobj, KOBJ_CHANGE);
+}
+
+void dual_role_instance_changed(struct dual_role_phy_instance *dual_role)
+{
+ dev_dbg(&dual_role->dev, "%s\n", __func__);
+ pm_wakeup_event(&dual_role->dev, DUAL_ROLE_NOTIFICATION_TIMEOUT);
+ schedule_work(&dual_role->changed_work);
+}
+EXPORT_SYMBOL_GPL(dual_role_instance_changed);
+
+int dual_role_get_property(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop,
+ unsigned int *val)
+{
+ return dual_role->desc->get_property(dual_role, prop, val);
+}
+EXPORT_SYMBOL_GPL(dual_role_get_property);
+
+int dual_role_set_property(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop,
+ const unsigned int *val)
+{
+ if (!dual_role->desc->set_property)
+ return -ENODEV;
+
+ return dual_role->desc->set_property(dual_role, prop, val);
+}
+EXPORT_SYMBOL_GPL(dual_role_set_property);
+
+int dual_role_property_is_writeable(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop)
+{
+ if (!dual_role->desc->property_is_writeable)
+ return -ENODEV;
+
+ return dual_role->desc->property_is_writeable(dual_role, prop);
+}
+EXPORT_SYMBOL_GPL(dual_role_property_is_writeable);
+
+static void dual_role_dev_release(struct device *dev)
+{
+ struct dual_role_phy_instance *dual_role =
+ container_of(dev, struct dual_role_phy_instance, dev);
+ pr_debug("device: '%s': %s\n", dev_name(dev), __func__);
+ kfree(dual_role);
+}
+
+static struct dual_role_phy_instance *__must_check
+__dual_role_register(struct device *parent,
+ const struct dual_role_phy_desc *desc)
+{
+ struct device *dev;
+ struct dual_role_phy_instance *dual_role;
+ int rc;
+
+ dual_role = kzalloc(sizeof(*dual_role), GFP_KERNEL);
+ if (!dual_role)
+ return ERR_PTR(-ENOMEM);
+
+ dev = &dual_role->dev;
+
+ device_initialize(dev);
+
+ dev->class = dual_role_class;
+ dev->type = &dual_role_dev_type;
+ dev->parent = parent;
+ dev->release = dual_role_dev_release;
+ dev_set_drvdata(dev, dual_role);
+ dual_role->desc = desc;
+
+ rc = dev_set_name(dev, "%s", desc->name);
+ if (rc)
+ goto dev_set_name_failed;
+
+ INIT_WORK(&dual_role->changed_work, dual_role_changed_work);
+
+ rc = device_init_wakeup(dev, true);
+ if (rc)
+ goto wakeup_init_failed;
+
+ rc = device_add(dev);
+ if (rc)
+ goto device_add_failed;
+
+ dual_role_instance_changed(dual_role);
+
+ return dual_role;
+
+device_add_failed:
+ device_init_wakeup(dev, false);
+wakeup_init_failed:
+dev_set_name_failed:
+ put_device(dev);
+ kfree(dual_role);
+
+ return ERR_PTR(rc);
+}
+
+static void dual_role_instance_unregister(struct dual_role_phy_instance
+ *dual_role)
+{
+ cancel_work_sync(&dual_role->changed_work);
+ device_init_wakeup(&dual_role->dev, false);
+ device_unregister(&dual_role->dev);
+}
+
+static void devm_dual_role_release(struct device *dev, void *res)
+{
+ struct dual_role_phy_instance **dual_role = res;
+
+ dual_role_instance_unregister(*dual_role);
+}
+
+struct dual_role_phy_instance *__must_check
+devm_dual_role_instance_register(struct device *parent,
+ const struct dual_role_phy_desc *desc)
+{
+ struct dual_role_phy_instance **ptr, *dual_role;
+
+ ptr = devres_alloc(devm_dual_role_release, sizeof(*ptr), GFP_KERNEL);
+
+ if (!ptr)
+ return ERR_PTR(-ENOMEM);
+ dual_role = __dual_role_register(parent, desc);
+ if (IS_ERR(dual_role)) {
+ devres_free(ptr);
+ } else {
+ *ptr = dual_role;
+ devres_add(parent, ptr);
+ }
+ return dual_role;
+}
+EXPORT_SYMBOL_GPL(devm_dual_role_instance_register);
+
+static int devm_dual_role_match(struct device *dev, void *res, void *data)
+{
+ struct dual_role_phy_instance **r = res;
+
+ if (WARN_ON(!r || !*r))
+ return 0;
+
+ return *r == data;
+}
+
+void devm_dual_role_instance_unregister(struct device *dev,
+ struct dual_role_phy_instance
+ *dual_role)
+{
+ int rc;
+
+ rc = devres_release(dev, devm_dual_role_release,
+ devm_dual_role_match, dual_role);
+ WARN_ON(rc);
+}
+EXPORT_SYMBOL_GPL(devm_dual_role_instance_unregister);
+
+void *dual_role_get_drvdata(struct dual_role_phy_instance *dual_role)
+{
+ return dual_role->drv_data;
+}
+EXPORT_SYMBOL_GPL(dual_role_get_drvdata);
+
+/***************** Device attribute functions **************************/
+
+/* port type */
+static char *supported_modes_text[] = {
+ "ufp dfp", "dfp", "ufp"
+};
+
+/* current mode */
+static char *mode_text[] = {
+ "ufp", "dfp", "none"
+};
+
+/* Power role */
+static char *pr_text[] = {
+ "source", "sink", "none"
+};
+
+/* Data role */
+static char *dr_text[] = {
+ "host", "device", "none"
+};
+
+/* Vconn supply */
+static char *vconn_supply_text[] = {
+ "n", "y"
+};
+
+static ssize_t dual_role_show_property(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ ssize_t ret = 0;
+ struct dual_role_phy_instance *dual_role = dev_get_drvdata(dev);
+ const ptrdiff_t off = attr - dual_role_attrs;
+ unsigned int value;
+
+ if (off == DUAL_ROLE_PROP_SUPPORTED_MODES) {
+ value = dual_role->desc->supported_modes;
+ } else {
+ ret = dual_role_get_property(dual_role, off, &value);
+
+ if (ret < 0) {
+ if (ret == -ENODATA)
+ dev_dbg(dev,
+ "driver has no data for `%s' property\n",
+ attr->attr.name);
+ else if (ret != -ENODEV)
+ dev_err(dev,
+ "driver failed to report `%s' property: %zd\n",
+ attr->attr.name, ret);
+ return ret;
+ }
+ }
+
+ if (off == DUAL_ROLE_PROP_SUPPORTED_MODES) {
+ BUILD_BUG_ON(DUAL_ROLE_PROP_SUPPORTED_MODES_TOTAL !=
+ ARRAY_SIZE(supported_modes_text));
+ if (value < DUAL_ROLE_PROP_SUPPORTED_MODES_TOTAL)
+ return snprintf(buf, PAGE_SIZE, "%s\n",
+ supported_modes_text[value]);
+ else
+ return -EIO;
+ } else if (off == DUAL_ROLE_PROP_MODE) {
+ BUILD_BUG_ON(DUAL_ROLE_PROP_MODE_TOTAL !=
+ ARRAY_SIZE(mode_text));
+ if (value < DUAL_ROLE_PROP_MODE_TOTAL)
+ return snprintf(buf, PAGE_SIZE, "%s\n",
+ mode_text[value]);
+ else
+ return -EIO;
+ } else if (off == DUAL_ROLE_PROP_PR) {
+ BUILD_BUG_ON(DUAL_ROLE_PROP_PR_TOTAL != ARRAY_SIZE(pr_text));
+ if (value < DUAL_ROLE_PROP_PR_TOTAL)
+ return snprintf(buf, PAGE_SIZE, "%s\n",
+ pr_text[value]);
+ else
+ return -EIO;
+ } else if (off == DUAL_ROLE_PROP_DR) {
+ BUILD_BUG_ON(DUAL_ROLE_PROP_DR_TOTAL != ARRAY_SIZE(dr_text));
+ if (value < DUAL_ROLE_PROP_DR_TOTAL)
+ return snprintf(buf, PAGE_SIZE, "%s\n",
+ dr_text[value]);
+ else
+ return -EIO;
+ } else if (off == DUAL_ROLE_PROP_VCONN_SUPPLY) {
+ BUILD_BUG_ON(DUAL_ROLE_PROP_VCONN_SUPPLY_TOTAL !=
+ ARRAY_SIZE(vconn_supply_text));
+ if (value < DUAL_ROLE_PROP_VCONN_SUPPLY_TOTAL)
+ return snprintf(buf, PAGE_SIZE, "%s\n",
+ vconn_supply_text[value]);
+ else
+ return -EIO;
+ } else
+ return -EIO;
+}
+
+static ssize_t dual_role_store_property(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ ssize_t ret;
+ struct dual_role_phy_instance *dual_role = dev_get_drvdata(dev);
+ const ptrdiff_t off = attr - dual_role_attrs;
+ unsigned int value;
+ int total, i;
+ char *dup_buf, **text_array;
+ bool result = false;
+
+ dup_buf = kstrdupcase(buf, GFP_KERNEL, false);
+ switch (off) {
+ case DUAL_ROLE_PROP_MODE:
+ total = DUAL_ROLE_PROP_MODE_TOTAL;
+ text_array = mode_text;
+ break;
+ case DUAL_ROLE_PROP_PR:
+ total = DUAL_ROLE_PROP_PR_TOTAL;
+ text_array = pr_text;
+ break;
+ case DUAL_ROLE_PROP_DR:
+ total = DUAL_ROLE_PROP_DR_TOTAL;
+ text_array = dr_text;
+ break;
+ case DUAL_ROLE_PROP_VCONN_SUPPLY:
+ ret = strtobool(dup_buf, &result);
+ value = result;
+ if (!ret)
+ goto setprop;
+ default:
+ ret = -EINVAL;
+ goto error;
+ }
+
+ for (i = 0; i <= total; i++) {
+ if (i == total) {
+ ret = -ENOTSUPP;
+ goto error;
+ }
+ if (!strncmp(*(text_array + i), dup_buf,
+ strlen(*(text_array + i)))) {
+ value = i;
+ break;
+ }
+ }
+
+setprop:
+ ret = dual_role->desc->set_property(dual_role, off, &value);
+
+error:
+ kfree(dup_buf);
+
+ if (ret < 0)
+ return ret;
+
+ return count;
+}
+
+static umode_t dual_role_attr_is_visible(struct kobject *kobj,
+ struct attribute *attr, int attrno)
+{
+ struct device *dev = container_of(kobj, struct device, kobj);
+ struct dual_role_phy_instance *dual_role = dev_get_drvdata(dev);
+ umode_t mode = S_IRUSR | S_IRGRP | S_IROTH;
+ int i;
+
+ if (attrno == DUAL_ROLE_PROP_SUPPORTED_MODES)
+ return mode;
+
+ for (i = 0; i < dual_role->desc->num_properties; i++) {
+ int property = dual_role->desc->properties[i];
+
+ if (property == attrno) {
+ if (dual_role->desc->property_is_writeable &&
+ dual_role_property_is_writeable(dual_role, property)
+ > 0)
+ mode |= S_IWUSR;
+
+ return mode;
+ }
+ }
+
+ return 0;
+}
+
+static struct attribute *__dual_role_attrs[ARRAY_SIZE(dual_role_attrs) + 1];
+
+static struct attribute_group dual_role_attr_group = {
+ .attrs = __dual_role_attrs,
+ .is_visible = dual_role_attr_is_visible,
+};
+
+static const struct attribute_group *dual_role_attr_groups[] = {
+ &dual_role_attr_group,
+ NULL,
+};
+
+void dual_role_init_attrs(struct device_type *dev_type)
+{
+ int i;
+
+ dev_type->groups = dual_role_attr_groups;
+
+ for (i = 0; i < ARRAY_SIZE(dual_role_attrs); i++)
+ __dual_role_attrs[i] = &dual_role_attrs[i].attr;
+}
+
+int dual_role_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+ struct dual_role_phy_instance *dual_role = dev_get_drvdata(dev);
+ int ret = 0, j;
+ char *prop_buf;
+ char *attrname;
+
+ dev_dbg(dev, "uevent\n");
+
+ if (!dual_role || !dual_role->desc) {
+ dev_dbg(dev, "No dual_role phy yet\n");
+ return ret;
+ }
+
+ dev_dbg(dev, "DUAL_ROLE_NAME=%s\n", dual_role->desc->name);
+
+ ret = add_uevent_var(env, "DUAL_ROLE_NAME=%s", dual_role->desc->name);
+ if (ret)
+ return ret;
+
+ prop_buf = (char *)get_zeroed_page(GFP_KERNEL);
+ if (!prop_buf)
+ return -ENOMEM;
+
+ for (j = 0; j < dual_role->desc->num_properties; j++) {
+ struct device_attribute *attr;
+ char *line;
+
+ attr = &dual_role_attrs[dual_role->desc->properties[j]];
+
+ ret = dual_role_show_property(dev, attr, prop_buf);
+ if (ret == -ENODEV || ret == -ENODATA) {
+ ret = 0;
+ continue;
+ }
+
+ if (ret < 0)
+ goto out;
+ line = strnchr(prop_buf, PAGE_SIZE, '\n');
+ if (line)
+ *line = 0;
+
+ attrname = kstrdupcase(attr->attr.name, GFP_KERNEL, true);
+ if (!attrname)
+ ret = -ENOMEM;
+
+ dev_dbg(dev, "prop %s=%s\n", attrname, prop_buf);
+
+ ret = add_uevent_var(env, "DUAL_ROLE_%s=%s", attrname,
+ prop_buf);
+ kfree(attrname);
+ if (ret)
+ goto out;
+ }
+
+out:
+ free_page((unsigned long)prop_buf);
+
+ return ret;
+}
+
+/******************* Module Init ***********************************/
+
+static int __init dual_role_class_init(void)
+{
+ dual_role_class = class_create(THIS_MODULE, "dual_role_usb");
+
+ if (IS_ERR(dual_role_class))
+ return PTR_ERR(dual_role_class);
+
+ dual_role_class->dev_uevent = dual_role_uevent;
+ dual_role_init_attrs(&dual_role_dev_type);
+
+ return 0;
+}
+
+static void __exit dual_role_class_exit(void)
+{
+ class_destroy(dual_role_class);
+}
+
+subsys_initcall(dual_role_class_init);
+module_exit(dual_role_class_exit);
diff --git a/drivers/usb/phy/otg-wakelock.c b/drivers/usb/phy/otg-wakelock.c
new file mode 100644
index 000000000000..ecd741027f53
--- /dev/null
+++ b/drivers/usb/phy/otg-wakelock.c
@@ -0,0 +1,170 @@
+/*
+ * otg-wakelock.c
+ *
+ * Copyright (C) 2011 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/spinlock.h>
+#include <linux/usb/otg.h>
+
+#define TEMPORARY_HOLD_TIME 2000
+
+static bool enabled = true;
+static struct usb_phy *otgwl_xceiv;
+static struct notifier_block otgwl_nb;
+
+/*
+ * otgwl_spinlock is held while the VBUS lock is grabbed or dropped and the
+ * held field is updated to match.
+ */
+
+static DEFINE_SPINLOCK(otgwl_spinlock);
+
+/*
+ * Only one lock, but since these 3 fields are associated with each other...
+ */
+
+struct otgwl_lock {
+ char name[40];
+ struct wakeup_source wakesrc;
+ bool held;
+};
+
+/*
+ * VBUS present lock. Also used as a timed lock on charger
+ * connect/disconnect and USB host disconnect, to allow the system
+ * to react to the change in power.
+ */
+
+static struct otgwl_lock vbus_lock;
+
+static void otgwl_hold(struct otgwl_lock *lock)
+{
+ if (!lock->held) {
+ __pm_stay_awake(&lock->wakesrc);
+ lock->held = true;
+ }
+}
+
+static void otgwl_temporary_hold(struct otgwl_lock *lock)
+{
+ __pm_wakeup_event(&lock->wakesrc, TEMPORARY_HOLD_TIME);
+ lock->held = false;
+}
+
+static void otgwl_drop(struct otgwl_lock *lock)
+{
+ if (lock->held) {
+ __pm_relax(&lock->wakesrc);
+ lock->held = false;
+ }
+}
+
+static void otgwl_handle_event(unsigned long event)
+{
+ unsigned long irqflags;
+
+ spin_lock_irqsave(&otgwl_spinlock, irqflags);
+
+ if (!enabled) {
+ otgwl_drop(&vbus_lock);
+ spin_unlock_irqrestore(&otgwl_spinlock, irqflags);
+ return;
+ }
+
+ switch (event) {
+ case USB_EVENT_VBUS:
+ case USB_EVENT_ENUMERATED:
+ otgwl_hold(&vbus_lock);
+ break;
+
+ case USB_EVENT_NONE:
+ case USB_EVENT_ID:
+ case USB_EVENT_CHARGER:
+ otgwl_temporary_hold(&vbus_lock);
+ break;
+
+ default:
+ break;
+ }
+
+ spin_unlock_irqrestore(&otgwl_spinlock, irqflags);
+}
+
+static int otgwl_otg_notifications(struct notifier_block *nb,
+ unsigned long event, void *unused)
+{
+ otgwl_handle_event(event);
+ return NOTIFY_OK;
+}
+
+static int set_enabled(const char *val, const struct kernel_param *kp)
+{
+ int rv = param_set_bool(val, kp);
+
+ if (rv)
+ return rv;
+
+ if (otgwl_xceiv)
+ otgwl_handle_event(otgwl_xceiv->last_event);
+
+ return 0;
+}
+
+static struct kernel_param_ops enabled_param_ops = {
+ .set = set_enabled,
+ .get = param_get_bool,
+};
+
+module_param_cb(enabled, &enabled_param_ops, &enabled, 0644);
+MODULE_PARM_DESC(enabled, "enable wakelock when VBUS present");
+
+static int __init otg_wakelock_init(void)
+{
+ int ret;
+ struct usb_phy *phy;
+
+ phy = usb_get_phy(USB_PHY_TYPE_USB2);
+
+ if (IS_ERR(phy)) {
+ pr_err("%s: No USB transceiver found\n", __func__);
+ return PTR_ERR(phy);
+ }
+ otgwl_xceiv = phy;
+
+ snprintf(vbus_lock.name, sizeof(vbus_lock.name), "vbus-%s",
+ dev_name(otgwl_xceiv->dev));
+ wakeup_source_init(&vbus_lock.wakesrc, vbus_lock.name);
+
+ otgwl_nb.notifier_call = otgwl_otg_notifications;
+ ret = usb_register_notifier(otgwl_xceiv, &otgwl_nb);
+
+ if (ret) {
+ pr_err("%s: usb_register_notifier on transceiver %s"
+ " failed\n", __func__,
+ dev_name(otgwl_xceiv->dev));
+ otgwl_xceiv = NULL;
+ wakeup_source_trash(&vbus_lock.wakesrc);
+ return ret;
+ }
+
+ otgwl_handle_event(otgwl_xceiv->last_event);
+ return ret;
+}
+
+late_initcall(otg_wakelock_init);
diff --git a/drivers/video/console/dummycon.c b/drivers/video/console/dummycon.c
index b90ef96e43d6..25d5bc3dcdb5 100644
--- a/drivers/video/console/dummycon.c
+++ b/drivers/video/console/dummycon.c
@@ -41,12 +41,55 @@ static void dummycon_init(struct vc_data *vc, int init)
vc_resize(vc, DUMMY_COLUMNS, DUMMY_ROWS);
}
-static int dummycon_dummy(void)
+static void dummycon_deinit(struct vc_data *vc)
+{
+}
+
+static void dummycon_clear(struct vc_data *vc, int a, int b, int c, int d)
+{
+}
+
+static void dummycon_putc(struct vc_data *vc, int a, int b, int c)
+{
+}
+
+static void dummycon_putcs(struct vc_data *vc, const unsigned short *s, int a, int b, int c)
+{
+}
+
+static void dummycon_cursor(struct vc_data *vc, int a)
+{
+}
+
+static int dummycon_scroll(struct vc_data *vc, int a, int b, int c, int d)
+{
+ return 0;
+}
+
+static int dummycon_switch(struct vc_data *vc)
{
return 0;
}
-#define DUMMY (void *)dummycon_dummy
+static int dummycon_blank(struct vc_data *vc, int a, int b)
+{
+ return 0;
+}
+
+static int dummycon_font_set(struct vc_data *vc, struct console_font *f, unsigned u)
+{
+ return 0;
+}
+
+static int dummycon_font_default(struct vc_data *vc, struct console_font *f, char *c)
+{
+ return 0;
+}
+
+static int dummycon_font_copy(struct vc_data *vc, int a)
+{
+ return 0;
+}
/*
* The console `switch' structure for the dummy console
@@ -58,16 +101,16 @@ const struct consw dummy_con = {
.owner = THIS_MODULE,
.con_startup = dummycon_startup,
.con_init = dummycon_init,
- .con_deinit = DUMMY,
- .con_clear = DUMMY,
- .con_putc = DUMMY,
- .con_putcs = DUMMY,
- .con_cursor = DUMMY,
- .con_scroll = DUMMY,
- .con_switch = DUMMY,
- .con_blank = DUMMY,
- .con_font_set = DUMMY,
- .con_font_default = DUMMY,
- .con_font_copy = DUMMY,
+ .con_deinit = dummycon_deinit,
+ .con_clear = dummycon_clear,
+ .con_putc = dummycon_putc,
+ .con_putcs = dummycon_putcs,
+ .con_cursor = dummycon_cursor,
+ .con_scroll = dummycon_scroll,
+ .con_switch = dummycon_switch,
+ .con_blank = dummycon_blank,
+ .con_font_set = dummycon_font_set,
+ .con_font_default = dummycon_font_default,
+ .con_font_copy = dummycon_font_copy,
};
EXPORT_SYMBOL_GPL(dummy_con);
diff --git a/drivers/video/fbdev/goldfishfb.c b/drivers/video/fbdev/goldfishfb.c
index 14a93cb21310..8c93ad1dd9cc 100644
--- a/drivers/video/fbdev/goldfishfb.c
+++ b/drivers/video/fbdev/goldfishfb.c
@@ -26,6 +26,7 @@
#include <linux/interrupt.h>
#include <linux/ioport.h>
#include <linux/platform_device.h>
+#include <linux/acpi.h>
enum {
FB_GET_WIDTH = 0x00,
@@ -234,7 +235,7 @@ static int goldfish_fb_probe(struct platform_device *pdev)
fb->fb.var.activate = FB_ACTIVATE_NOW;
fb->fb.var.height = readl(fb->reg_base + FB_GET_PHYS_HEIGHT);
fb->fb.var.width = readl(fb->reg_base + FB_GET_PHYS_WIDTH);
- fb->fb.var.pixclock = 10000;
+ fb->fb.var.pixclock = 0;
fb->fb.var.red.offset = 11;
fb->fb.var.red.length = 5;
@@ -305,12 +306,25 @@ static int goldfish_fb_remove(struct platform_device *pdev)
return 0;
}
+static const struct of_device_id goldfish_fb_of_match[] = {
+ { .compatible = "google,goldfish-fb", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, goldfish_fb_of_match);
+
+static const struct acpi_device_id goldfish_fb_acpi_match[] = {
+ { "GFSH0004", 0 },
+ { },
+};
+MODULE_DEVICE_TABLE(acpi, goldfish_fb_acpi_match);
static struct platform_driver goldfish_fb_driver = {
.probe = goldfish_fb_probe,
.remove = goldfish_fb_remove,
.driver = {
- .name = "goldfish_fb"
+ .name = "goldfish_fb",
+ .of_match_table = goldfish_fb_of_match,
+ .acpi_match_table = ACPI_PTR(goldfish_fb_acpi_match),
}
};
diff --git a/drivers/w1/masters/ds2482.c b/drivers/w1/masters/ds2482.c
index 2e30db1b1a43..fa13fa8c81af 100644
--- a/drivers/w1/masters/ds2482.c
+++ b/drivers/w1/masters/ds2482.c
@@ -18,6 +18,8 @@
#include <linux/slab.h>
#include <linux/i2c.h>
#include <linux/delay.h>
+#include <linux/gpio.h>
+#include <linux/platform_data/ds2482.h>
#include <asm/delay.h>
#include "../w1.h"
@@ -97,7 +99,8 @@ static const u8 ds2482_chan_rd[8] =
static int ds2482_probe(struct i2c_client *client,
const struct i2c_device_id *id);
static int ds2482_remove(struct i2c_client *client);
-
+static int ds2482_suspend(struct device *dev);
+static int ds2482_resume(struct device *dev);
/**
* Driver data (common to all clients)
@@ -108,9 +111,15 @@ static const struct i2c_device_id ds2482_id[] = {
};
MODULE_DEVICE_TABLE(i2c, ds2482_id);
+static const struct dev_pm_ops ds2482_pm_ops = {
+ .suspend = ds2482_suspend,
+ .resume = ds2482_resume,
+};
+
static struct i2c_driver ds2482_driver = {
.driver = {
.name = "ds2482",
+ .pm = &ds2482_pm_ops,
},
.probe = ds2482_probe,
.remove = ds2482_remove,
@@ -132,6 +141,7 @@ struct ds2482_w1_chan {
struct ds2482_data {
struct i2c_client *client;
struct mutex access_lock;
+ int slpz_gpio;
/* 1-wire interface(s) */
int w1_count; /* 1 or 8 */
@@ -460,11 +470,31 @@ static u8 ds2482_w1_set_pullup(void *data, int delay)
return retval;
}
+static int ds2482_suspend(struct device *dev)
+{
+ struct i2c_client *client = to_i2c_client(dev);
+ struct ds2482_data *data = i2c_get_clientdata(client);
+
+ if (data->slpz_gpio >= 0)
+ gpio_set_value(data->slpz_gpio, 0);
+ return 0;
+}
+
+static int ds2482_resume(struct device *dev)
+{
+ struct i2c_client *client = to_i2c_client(dev);
+ struct ds2482_data *data = i2c_get_clientdata(client);
+
+ if (data->slpz_gpio >= 0)
+ gpio_set_value(data->slpz_gpio, 1);
+ return 0;
+}
static int ds2482_probe(struct i2c_client *client,
const struct i2c_device_id *id)
{
struct ds2482_data *data;
+ struct ds2482_platform_data *pdata;
int err = -ENODEV;
int temp1;
int idx;
@@ -531,6 +561,16 @@ static int ds2482_probe(struct i2c_client *client,
}
}
+ pdata = client->dev.platform_data;
+ data->slpz_gpio = pdata ? pdata->slpz_gpio : -1;
+
+ if (data->slpz_gpio >= 0) {
+ err = gpio_request_one(data->slpz_gpio, GPIOF_OUT_INIT_HIGH,
+ "ds2482.slpz");
+ if (err < 0)
+ goto exit_w1_remove;
+ }
+
return 0;
exit_w1_remove:
@@ -555,6 +595,11 @@ static int ds2482_remove(struct i2c_client *client)
w1_remove_master_device(&data->w1_ch[idx].w1_bm);
}
+ if (data->slpz_gpio >= 0) {
+ gpio_set_value(data->slpz_gpio, 0);
+ gpio_free(data->slpz_gpio);
+ }
+
/* Free the memory */
kfree(data);
return 0;
diff --git a/fs/Kconfig b/fs/Kconfig
index 4bd03a2b0518..20a8d95b8b3d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -227,6 +227,7 @@ source "fs/orangefs/Kconfig"
source "fs/adfs/Kconfig"
source "fs/affs/Kconfig"
source "fs/ecryptfs/Kconfig"
+source "fs/sdcardfs/Kconfig"
source "fs/hfs/Kconfig"
source "fs/hfsplus/Kconfig"
source "fs/befs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index ed2b63257ba9..f207d4303052 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -3,7 +3,7 @@
#
# 14 Sep 2000, Christoph Hellwig <hch@infradead.org>
# Rewritten to use lists instead of if-statements.
-#
+#
obj-y := open.o read_write.o file_table.o super.o \
char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
@@ -61,7 +61,7 @@ obj-y += devpts/
obj-$(CONFIG_PROFILING) += dcookies.o
obj-$(CONFIG_DLM) += dlm/
-
+
# Do not add any filesystems before this line
obj-$(CONFIG_FSCACHE) += fscache/
obj-$(CONFIG_REISERFS_FS) += reiserfs/
@@ -83,6 +83,7 @@ obj-$(CONFIG_ISO9660_FS) += isofs/
obj-$(CONFIG_HFSPLUS_FS) += hfsplus/ # Before hfs to find wrapped HFS+
obj-$(CONFIG_HFS_FS) += hfs/
obj-$(CONFIG_ECRYPT_FS) += ecryptfs/
+obj-$(CONFIG_SDCARD_FS) += sdcardfs/
obj-$(CONFIG_VXFS_FS) += freevxfs/
obj-$(CONFIG_NFS_FS) += nfs/
obj-$(CONFIG_EXPORTFS) += exportfs/
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 72372970725b..37f97bab78ab 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -123,11 +123,10 @@ static void afs_file_readpage_read_complete(struct page *page,
/*
* read page from file, directory or symlink, given a key to use
*/
-int afs_page_filler(void *data, struct page *page)
+static int __afs_page_filler(struct key *key, struct page *page)
{
struct inode *inode = page->mapping->host;
struct afs_vnode *vnode = AFS_FS_I(inode);
- struct key *key = data;
size_t len;
off_t offset;
int ret;
@@ -209,6 +208,13 @@ error:
return ret;
}
+int afs_page_filler(struct file *data, struct page *page)
+{
+ struct key *key = (struct key *)data;
+
+ return __afs_page_filler(key, page);
+}
+
/*
* read page from file, directory or symlink, given a file to nominate the key
* to be used
@@ -221,14 +227,14 @@ static int afs_readpage(struct file *file, struct page *page)
if (file) {
key = file->private_data;
ASSERT(key != NULL);
- ret = afs_page_filler(key, page);
+ ret = __afs_page_filler(key, page);
} else {
struct inode *inode = page->mapping->host;
key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
if (IS_ERR(key)) {
ret = PTR_ERR(key);
} else {
- ret = afs_page_filler(key, page);
+ ret = __afs_page_filler(key, page);
key_put(key);
}
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index dd98dcda6a3f..b4165cce985d 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -497,7 +497,7 @@ extern const struct file_operations afs_file_operations;
extern int afs_open(struct inode *, struct file *);
extern int afs_release(struct inode *, struct file *);
-extern int afs_page_filler(void *, struct page *);
+extern int afs_page_filler(struct file *, struct page *);
/*
* flock.c
diff --git a/fs/attr.c b/fs/attr.c
index c902b3d53508..c4093c5196be 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -200,7 +200,7 @@ EXPORT_SYMBOL(setattr_copy);
* the file open for write, as there can be no conflicting delegation in
* that case.
*/
-int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode)
+int notify_change2(struct vfsmount *mnt, struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode)
{
struct inode *inode = dentry->d_inode;
umode_t mode = inode->i_mode;
@@ -224,7 +224,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
return -EPERM;
if (!inode_owner_or_capable(inode)) {
- error = inode_permission(inode, MAY_WRITE);
+ error = inode_permission2(mnt, inode, MAY_WRITE);
if (error)
return error;
}
@@ -307,7 +307,9 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
if (error)
return error;
- if (inode->i_op->setattr)
+ if (mnt && inode->i_op->setattr2)
+ error = inode->i_op->setattr2(mnt, dentry, attr);
+ else if (inode->i_op->setattr)
error = inode->i_op->setattr(dentry, attr);
else
error = simple_setattr(dentry, attr);
@@ -320,4 +322,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
return error;
}
+EXPORT_SYMBOL(notify_change2);
+
+int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode)
+{
+ return notify_change2(NULL, dentry, attr, delegated_inode);
+}
EXPORT_SYMBOL(notify_change);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 793d4d571d8d..1a45df7ef9a6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3830,8 +3830,8 @@ retry:
if (wbc->sync_mode == WB_SYNC_ALL)
tag_pages_for_writeback(mapping, index, end);
while (!done && !nr_to_write_done && (index <= end) &&
- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+ (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
+ tag))) {
unsigned i;
scanned = 1;
@@ -3841,11 +3841,6 @@ retry:
if (!PagePrivate(page))
continue;
- if (!wbc->range_cyclic && page->index > end) {
- done = 1;
- break;
- }
-
spin_lock(&mapping->private_lock);
if (!PagePrivate(page)) {
spin_unlock(&mapping->private_lock);
@@ -3978,8 +3973,8 @@ retry:
tag_pages_for_writeback(mapping, index, end);
done_index = index;
while (!done && !nr_to_write_done && (index <= end) &&
- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+ (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
+ &index, end, tag))) {
unsigned i;
scanned = 1;
@@ -4004,12 +3999,6 @@ retry:
continue;
}
- if (!wbc->range_cyclic && page->index > end) {
- done = 1;
- unlock_page(page);
- continue;
- }
-
if (wbc->sync_mode != WB_SYNC_NONE) {
if (PageWriteback(page))
flush_fn(data);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 7b79a54a2789..546d643b09d4 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -838,21 +838,15 @@ retry:
struct page **pages = NULL, **data_pages;
mempool_t *pool = NULL; /* Becomes non-null if mempool used */
struct page *page;
- int want;
u64 offset = 0, len = 0;
max_pages = max_pages_ever;
get_more_pages:
first = -1;
- want = min(end - index,
- min((pgoff_t)PAGEVEC_SIZE,
- max_pages - (pgoff_t)locked_pages) - 1)
- + 1;
- pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
- want);
- dout("pagevec_lookup_tag got %d\n", pvec_pages);
+ pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
+ end, PAGECACHE_TAG_DIRTY);
+ dout("pagevec_lookup_range_tag got %d\n", pvec_pages);
if (!pvec_pages && !locked_pages)
break;
for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
@@ -870,12 +864,6 @@ get_more_pages:
unlock_page(page);
break;
}
- if (!wbc->range_cyclic && page->index > end) {
- dout("end of range %p\n", page);
- done = 1;
- unlock_page(page);
- break;
- }
if (strip_unit_end && (page->index > strip_unit_end)) {
dout("end of strip unit %p\n", page);
unlock_page(page);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 441d434a48c1..c4a27f6b6fa8 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2439,7 +2439,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
}
down_read(&key->sem);
- upayload = user_key_payload(key);
+ upayload = user_key_payload_locked(key);
if (IS_ERR_OR_NULL(upayload)) {
rc = upayload ? PTR_ERR(upayload) : -EINVAL;
goto out_key_put;
diff --git a/fs/coredump.c b/fs/coredump.c
index 4407e27beca9..00a900a51a8b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -744,7 +744,7 @@ void do_coredump(const siginfo_t *siginfo)
goto close_fail;
if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
goto close_fail;
- if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+ if (do_truncate2(cprm.file->f_path.mnt, cprm.file->f_path.dentry, 0, 0, cprm.file))
goto close_fail;
}
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index 92348faf9865..08b46e6e3995 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -1,6 +1,5 @@
config FS_ENCRYPTION
tristate "FS Encryption (Per-file encryption)"
- depends on BLOCK
select CRYPTO
select CRYPTO_AES
select CRYPTO_CBC
@@ -8,9 +7,7 @@ config FS_ENCRYPTION
select CRYPTO_XTS
select CRYPTO_CTS
select CRYPTO_CTR
- select CRYPTO_SHA256
select KEYS
- select ENCRYPTED_KEYS
help
Enable encryption of files and directories. This
feature is similar to ecryptfs, but it is more memory
diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile
index f17684c48739..cb496989a6b6 100644
--- a/fs/crypto/Makefile
+++ b/fs/crypto/Makefile
@@ -1,3 +1,4 @@
obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o
-fscrypto-y := crypto.o fname.o policy.o keyinfo.o
+fscrypto-y := crypto.o fname.o hooks.o keyinfo.o policy.o
+fscrypto-$(CONFIG_BLOCK) += bio.o
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
new file mode 100644
index 000000000000..d7b4c480e39e
--- /dev/null
+++ b/fs/crypto/bio.c
@@ -0,0 +1,154 @@
+/*
+ * This contains encryption functions for per-file encryption.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility
+ *
+ * Written by Michael Halcrow, 2014.
+ *
+ * Filename encryption additions
+ * Uday Savagaonkar, 2014
+ * Encryption policy handling additions
+ * Ildar Muslukhov, 2014
+ * Add fscrypt_pullback_bio_page()
+ * Jaegeuk Kim, 2015.
+ *
+ * This has not yet undergone a rigorous security audit.
+ *
+ * The usage of AES-XTS should conform to recommendations in NIST
+ * Special Publication 800-38E and IEEE P1619/D16.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/module.h>
+#include <linux/bio.h>
+#include <linux/namei.h>
+#include "fscrypt_private.h"
+
+static void __fscrypt_decrypt_bio(struct bio *bio, bool done)
+{
+ struct bio_vec *bv;
+ int i;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ struct page *page = bv->bv_page;
+ int ret = fscrypt_decrypt_page(page->mapping->host, page,
+ PAGE_SIZE, 0, page->index);
+
+ if (ret) {
+ WARN_ON_ONCE(1);
+ SetPageError(page);
+ } else if (done) {
+ SetPageUptodate(page);
+ }
+ if (done)
+ unlock_page(page);
+ }
+}
+
+void fscrypt_decrypt_bio(struct bio *bio)
+{
+ __fscrypt_decrypt_bio(bio, false);
+}
+EXPORT_SYMBOL(fscrypt_decrypt_bio);
+
+static void completion_pages(struct work_struct *work)
+{
+ struct fscrypt_ctx *ctx =
+ container_of(work, struct fscrypt_ctx, r.work);
+ struct bio *bio = ctx->r.bio;
+
+ __fscrypt_decrypt_bio(bio, true);
+ fscrypt_release_ctx(ctx);
+ bio_put(bio);
+}
+
+void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx, struct bio *bio)
+{
+ INIT_WORK(&ctx->r.work, completion_pages);
+ ctx->r.bio = bio;
+ fscrypt_enqueue_decrypt_work(&ctx->r.work);
+}
+EXPORT_SYMBOL(fscrypt_enqueue_decrypt_bio);
+
+void fscrypt_pullback_bio_page(struct page **page, bool restore)
+{
+ struct fscrypt_ctx *ctx;
+ struct page *bounce_page;
+
+ /* The bounce data pages are unmapped. */
+ if ((*page)->mapping)
+ return;
+
+ /* The bounce data page is unmapped. */
+ bounce_page = *page;
+ ctx = (struct fscrypt_ctx *)page_private(bounce_page);
+
+ /* restore control page */
+ *page = ctx->w.control_page;
+
+ if (restore)
+ fscrypt_restore_control_page(bounce_page);
+}
+EXPORT_SYMBOL(fscrypt_pullback_bio_page);
+
+int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
+ sector_t pblk, unsigned int len)
+{
+ struct fscrypt_ctx *ctx;
+ struct page *ciphertext_page = NULL;
+ struct bio *bio;
+ int ret, err = 0;
+
+ BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE);
+
+ ctx = fscrypt_get_ctx(inode, GFP_NOFS);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ciphertext_page = fscrypt_alloc_bounce_page(ctx, GFP_NOWAIT);
+ if (IS_ERR(ciphertext_page)) {
+ err = PTR_ERR(ciphertext_page);
+ goto errout;
+ }
+
+ while (len--) {
+ err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk,
+ ZERO_PAGE(0), ciphertext_page,
+ PAGE_SIZE, 0, GFP_NOFS);
+ if (err)
+ goto errout;
+
+ bio = bio_alloc(GFP_NOWAIT, 1);
+ if (!bio) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ bio->bi_bdev = inode->i_sb->s_bdev;
+ bio->bi_iter.bi_sector =
+ pblk << (inode->i_sb->s_blocksize_bits - 9);
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ ret = bio_add_page(bio, ciphertext_page,
+ inode->i_sb->s_blocksize, 0);
+ if (ret != inode->i_sb->s_blocksize) {
+ /* should never happen! */
+ WARN_ON(1);
+ bio_put(bio);
+ err = -EIO;
+ goto errout;
+ }
+ err = submit_bio_wait(bio);
+ if (err == 0 && bio->bi_error)
+ err = -EIO;
+ bio_put(bio);
+ if (err)
+ goto errout;
+ lblk++;
+ pblk++;
+ }
+ err = 0;
+errout:
+ fscrypt_release_ctx(ctx);
+ return err;
+}
+EXPORT_SYMBOL(fscrypt_zeroout_range);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 1a8962569b5c..4dc788e3bc96 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -24,10 +24,11 @@
#include <linux/module.h>
#include <linux/scatterlist.h>
#include <linux/ratelimit.h>
-#include <linux/bio.h>
#include <linux/dcache.h>
#include <linux/namei.h>
-#include <linux/fscrypto.h>
+#include <crypto/aes.h>
+#include <crypto/skcipher.h>
+#include "fscrypt_private.h"
static unsigned int num_prealloc_crypto_pages = 32;
static unsigned int num_prealloc_crypto_ctxs = 128;
@@ -50,6 +51,12 @@ static DEFINE_MUTEX(fscrypt_init_mutex);
static struct kmem_cache *fscrypt_ctx_cachep;
struct kmem_cache *fscrypt_info_cachep;
+void fscrypt_enqueue_decrypt_work(struct work_struct *work)
+{
+ queue_work(fscrypt_read_workqueue, work);
+}
+EXPORT_SYMBOL(fscrypt_enqueue_decrypt_work);
+
/**
* fscrypt_release_ctx() - Releases an encryption context
* @ctx: The encryption context to release.
@@ -63,7 +70,7 @@ void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
{
unsigned long flags;
- if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) {
+ if (ctx->flags & FS_CTX_HAS_BOUNCE_BUFFER_FL && ctx->w.bounce_page) {
mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool);
ctx->w.bounce_page = NULL;
}
@@ -88,7 +95,7 @@ EXPORT_SYMBOL(fscrypt_release_ctx);
* Return: An allocated and initialized encryption context on success; error
* value or NULL otherwise.
*/
-struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags)
+struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags)
{
struct fscrypt_ctx *ctx = NULL;
struct fscrypt_info *ci = inode->i_crypt_info;
@@ -121,134 +128,149 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags)
} else {
ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
}
- ctx->flags &= ~FS_WRITE_PATH_FL;
+ ctx->flags &= ~FS_CTX_HAS_BOUNCE_BUFFER_FL;
return ctx;
}
EXPORT_SYMBOL(fscrypt_get_ctx);
-/**
- * page_crypt_complete() - completion callback for page crypto
- * @req: The asynchronous cipher request context
- * @res: The result of the cipher operation
- */
-static void page_crypt_complete(struct crypto_async_request *req, int res)
+void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
+ const struct fscrypt_info *ci)
{
- struct fscrypt_completion_result *ecr = req->data;
+ memset(iv, 0, ci->ci_mode->ivsize);
+ iv->lblk_num = cpu_to_le64(lblk_num);
- if (res == -EINPROGRESS)
- return;
- ecr->res = res;
- complete(&ecr->completion);
-}
+ if (ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY)
+ memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE);
-typedef enum {
- FS_DECRYPT = 0,
- FS_ENCRYPT,
-} fscrypt_direction_t;
+ if (ci->ci_essiv_tfm != NULL)
+ crypto_cipher_encrypt_one(ci->ci_essiv_tfm, iv->raw, iv->raw);
+}
-static int do_page_crypto(struct inode *inode,
- fscrypt_direction_t rw, pgoff_t index,
- struct page *src_page, struct page *dest_page,
- gfp_t gfp_flags)
+int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
+ u64 lblk_num, struct page *src_page,
+ struct page *dest_page, unsigned int len,
+ unsigned int offs, gfp_t gfp_flags)
{
- struct {
- __le64 index;
- u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)];
- } xts_tweak;
+ union fscrypt_iv iv;
struct skcipher_request *req = NULL;
- DECLARE_FS_COMPLETION_RESULT(ecr);
+ DECLARE_CRYPTO_WAIT(wait);
struct scatterlist dst, src;
struct fscrypt_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_ctfm;
int res = 0;
+ BUG_ON(len == 0);
+
+ fscrypt_generate_iv(&iv, lblk_num, ci);
+
req = skcipher_request_alloc(tfm, gfp_flags);
- if (!req) {
- printk_ratelimited(KERN_ERR
- "%s: crypto_request_alloc() failed\n",
- __func__);
+ if (!req)
return -ENOMEM;
- }
skcipher_request_set_callback(
req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- page_crypt_complete, &ecr);
-
- BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE);
- xts_tweak.index = cpu_to_le64(index);
- memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding));
+ crypto_req_done, &wait);
sg_init_table(&dst, 1);
- sg_set_page(&dst, dest_page, PAGE_SIZE, 0);
+ sg_set_page(&dst, dest_page, len, offs);
sg_init_table(&src, 1);
- sg_set_page(&src, src_page, PAGE_SIZE, 0);
- skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak);
+ sg_set_page(&src, src_page, len, offs);
+ skcipher_request_set_crypt(req, &src, &dst, len, &iv);
if (rw == FS_DECRYPT)
- res = crypto_skcipher_decrypt(req);
+ res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
else
- res = crypto_skcipher_encrypt(req);
- if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
- wait_for_completion(&ecr.completion);
- res = ecr.res;
- }
+ res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
skcipher_request_free(req);
if (res) {
- printk_ratelimited(KERN_ERR
- "%s: crypto_skcipher_encrypt() returned %d\n",
- __func__, res);
+ fscrypt_err(inode->i_sb,
+ "%scryption failed for inode %lu, block %llu: %d",
+ (rw == FS_DECRYPT ? "de" : "en"),
+ inode->i_ino, lblk_num, res);
return res;
}
return 0;
}
-static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags)
+struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx,
+ gfp_t gfp_flags)
{
ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags);
if (ctx->w.bounce_page == NULL)
return ERR_PTR(-ENOMEM);
- ctx->flags |= FS_WRITE_PATH_FL;
+ ctx->flags |= FS_CTX_HAS_BOUNCE_BUFFER_FL;
return ctx->w.bounce_page;
}
/**
* fscypt_encrypt_page() - Encrypts a page
- * @inode: The inode for which the encryption should take place
- * @plaintext_page: The page to encrypt. Must be locked.
- * @gfp_flags: The gfp flag for memory allocation
+ * @inode: The inode for which the encryption should take place
+ * @page: The page to encrypt. Must be locked for bounce-page
+ * encryption.
+ * @len: Length of data to encrypt in @page and encrypted
+ * data in returned page.
+ * @offs: Offset of data within @page and returned
+ * page holding encrypted data.
+ * @lblk_num: Logical block number. This must be unique for multiple
+ * calls with same inode, except when overwriting
+ * previously written data.
+ * @gfp_flags: The gfp flag for memory allocation
*
- * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
- * encryption context.
+ * Encrypts @page using the ctx encryption context. Performs encryption
+ * either in-place or into a newly allocated bounce page.
+ * Called on the page write path.
*
- * Called on the page write path. The caller must call
+ * Bounce page allocation is the default.
+ * In this case, the contents of @page are encrypted and stored in an
+ * allocated bounce page. @page has to be locked and the caller must call
* fscrypt_restore_control_page() on the returned ciphertext page to
* release the bounce buffer and the encryption context.
*
- * Return: An allocated page with the encrypted content on success. Else, an
+ * In-place encryption is used by setting the FS_CFLG_OWN_PAGES flag in
+ * fscrypt_operations. Here, the input-page is returned with its content
+ * encrypted.
+ *
+ * Return: A page with the encrypted content on success. Else, an
* error value or NULL.
*/
-struct page *fscrypt_encrypt_page(struct inode *inode,
- struct page *plaintext_page, gfp_t gfp_flags)
+struct page *fscrypt_encrypt_page(const struct inode *inode,
+ struct page *page,
+ unsigned int len,
+ unsigned int offs,
+ u64 lblk_num, gfp_t gfp_flags)
+
{
struct fscrypt_ctx *ctx;
- struct page *ciphertext_page = NULL;
+ struct page *ciphertext_page = page;
int err;
- BUG_ON(!PageLocked(plaintext_page));
+ BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0);
+
+ if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) {
+ /* with inplace-encryption we just encrypt the page */
+ err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, page,
+ ciphertext_page, len, offs,
+ gfp_flags);
+ if (err)
+ return ERR_PTR(err);
+
+ return ciphertext_page;
+ }
+
+ BUG_ON(!PageLocked(page));
ctx = fscrypt_get_ctx(inode, gfp_flags);
if (IS_ERR(ctx))
return (struct page *)ctx;
/* The encryption operation will require a bounce page. */
- ciphertext_page = alloc_bounce_page(ctx, gfp_flags);
+ ciphertext_page = fscrypt_alloc_bounce_page(ctx, gfp_flags);
if (IS_ERR(ciphertext_page))
goto errout;
- ctx->w.control_page = plaintext_page;
- err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index,
- plaintext_page, ciphertext_page,
- gfp_flags);
+ ctx->w.control_page = page;
+ err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num,
+ page, ciphertext_page, len, offs,
+ gfp_flags);
if (err) {
ciphertext_page = ERR_PTR(err);
goto errout;
@@ -265,8 +287,13 @@ errout:
EXPORT_SYMBOL(fscrypt_encrypt_page);
/**
- * f2crypt_decrypt_page() - Decrypts a page in-place
- * @page: The page to decrypt. Must be locked.
+ * fscrypt_decrypt_page() - Decrypts a page in-place
+ * @inode: The corresponding inode for the page to decrypt.
+ * @page: The page to decrypt. Must be locked in case
+ * it is a writeback page (FS_CFLG_OWN_PAGES unset).
+ * @len: Number of bytes in @page to be decrypted.
+ * @offs: Start of data in @page.
+ * @lblk_num: Logical block number.
*
* Decrypts page in-place using the ctx encryption context.
*
@@ -274,76 +301,17 @@ EXPORT_SYMBOL(fscrypt_encrypt_page);
*
* Return: Zero on success, non-zero otherwise.
*/
-int fscrypt_decrypt_page(struct page *page)
+int fscrypt_decrypt_page(const struct inode *inode, struct page *page,
+ unsigned int len, unsigned int offs, u64 lblk_num)
{
- BUG_ON(!PageLocked(page));
+ if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES))
+ BUG_ON(!PageLocked(page));
- return do_page_crypto(page->mapping->host,
- FS_DECRYPT, page->index, page, page, GFP_NOFS);
+ return fscrypt_do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page,
+ len, offs, GFP_NOFS);
}
EXPORT_SYMBOL(fscrypt_decrypt_page);
-int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk,
- sector_t pblk, unsigned int len)
-{
- struct fscrypt_ctx *ctx;
- struct page *ciphertext_page = NULL;
- struct bio *bio;
- int ret, err = 0;
-
- BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE);
-
- ctx = fscrypt_get_ctx(inode, GFP_NOFS);
- if (IS_ERR(ctx))
- return PTR_ERR(ctx);
-
- ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT);
- if (IS_ERR(ciphertext_page)) {
- err = PTR_ERR(ciphertext_page);
- goto errout;
- }
-
- while (len--) {
- err = do_page_crypto(inode, FS_ENCRYPT, lblk,
- ZERO_PAGE(0), ciphertext_page,
- GFP_NOFS);
- if (err)
- goto errout;
-
- bio = bio_alloc(GFP_NOWAIT, 1);
- if (!bio) {
- err = -ENOMEM;
- goto errout;
- }
- bio->bi_bdev = inode->i_sb->s_bdev;
- bio->bi_iter.bi_sector =
- pblk << (inode->i_sb->s_blocksize_bits - 9);
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
- ret = bio_add_page(bio, ciphertext_page,
- inode->i_sb->s_blocksize, 0);
- if (ret != inode->i_sb->s_blocksize) {
- /* should never happen! */
- WARN_ON(1);
- bio_put(bio);
- err = -EIO;
- goto errout;
- }
- err = submit_bio_wait(bio);
- if ((err == 0) && bio->bi_error)
- err = -EIO;
- bio_put(bio);
- if (err)
- goto errout;
- lblk++;
- pblk++;
- }
- err = 0;
-errout:
- fscrypt_release_ctx(ctx);
- return err;
-}
-EXPORT_SYMBOL(fscrypt_zeroout_range);
-
/*
* Validate dentries for encrypted directories to make sure we aren't
* potentially caching stale data after a key has been added or
@@ -358,12 +326,11 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
return -ECHILD;
dir = dget_parent(dentry);
- if (!d_inode(dir)->i_sb->s_cop->is_encrypted(d_inode(dir))) {
+ if (!IS_ENCRYPTED(d_inode(dir))) {
dput(dir);
return 0;
}
- /* this should eventually be an flag in d_flags */
spin_lock(&dentry->d_lock);
cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY;
spin_unlock(&dentry->d_lock);
@@ -390,64 +357,6 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
const struct dentry_operations fscrypt_d_ops = {
.d_revalidate = fscrypt_d_revalidate,
};
-EXPORT_SYMBOL(fscrypt_d_ops);
-
-/*
- * Call fscrypt_decrypt_page on every single page, reusing the encryption
- * context.
- */
-static void completion_pages(struct work_struct *work)
-{
- struct fscrypt_ctx *ctx =
- container_of(work, struct fscrypt_ctx, r.work);
- struct bio *bio = ctx->r.bio;
- struct bio_vec *bv;
- int i;
-
- bio_for_each_segment_all(bv, bio, i) {
- struct page *page = bv->bv_page;
- int ret = fscrypt_decrypt_page(page);
-
- if (ret) {
- WARN_ON_ONCE(1);
- SetPageError(page);
- } else {
- SetPageUptodate(page);
- }
- unlock_page(page);
- }
- fscrypt_release_ctx(ctx);
- bio_put(bio);
-}
-
-void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio)
-{
- INIT_WORK(&ctx->r.work, completion_pages);
- ctx->r.bio = bio;
- queue_work(fscrypt_read_workqueue, &ctx->r.work);
-}
-EXPORT_SYMBOL(fscrypt_decrypt_bio_pages);
-
-void fscrypt_pullback_bio_page(struct page **page, bool restore)
-{
- struct fscrypt_ctx *ctx;
- struct page *bounce_page;
-
- /* The bounce data pages are unmapped. */
- if ((*page)->mapping)
- return;
-
- /* The bounce data page is unmapped. */
- bounce_page = *page;
- ctx = (struct fscrypt_ctx *)page_private(bounce_page);
-
- /* restore control page */
- *page = ctx->w.control_page;
-
- if (restore)
- fscrypt_restore_control_page(bounce_page);
-}
-EXPORT_SYMBOL(fscrypt_pullback_bio_page);
void fscrypt_restore_control_page(struct page *page)
{
@@ -474,16 +383,21 @@ static void fscrypt_destroy(void)
/**
* fscrypt_initialize() - allocate major buffers for fs encryption.
+ * @cop_flags: fscrypt operations flags
*
* We only call this when we start accessing encrypted files, since it
* results in memory getting allocated that wouldn't otherwise be used.
*
* Return: Zero on success, non-zero otherwise.
*/
-int fscrypt_initialize(void)
+int fscrypt_initialize(unsigned int cop_flags)
{
int i, res = -ENOMEM;
+ /* No need to allocate a bounce page pool if this FS won't use it. */
+ if (cop_flags & FS_CFLG_OWN_PAGES)
+ return 0;
+
mutex_lock(&fscrypt_init_mutex);
if (fscrypt_bounce_page_pool)
goto already_initialized;
@@ -510,7 +424,27 @@ fail:
mutex_unlock(&fscrypt_init_mutex);
return res;
}
-EXPORT_SYMBOL(fscrypt_initialize);
+
+void fscrypt_msg(struct super_block *sb, const char *level,
+ const char *fmt, ...)
+{
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+ struct va_format vaf;
+ va_list args;
+
+ if (!__ratelimit(&rs))
+ return;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ if (sb)
+ printk("%sfscrypt (%s): %pV\n", level, sb->s_id, &vaf);
+ else
+ printk("%sfscrypt: %pV\n", level, &vaf);
+ va_end(args);
+}
/**
* fscrypt_init() - Set up for fs encryption.
@@ -561,6 +495,8 @@ static void __exit fscrypt_exit(void)
destroy_workqueue(fscrypt_read_workqueue);
kmem_cache_destroy(fscrypt_ctx_cachep);
kmem_cache_destroy(fscrypt_info_cachep);
+
+ fscrypt_essiv_cleanup();
}
module_exit(fscrypt_exit);
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index e14bb7b67e9c..217d4219b386 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -12,89 +12,71 @@
#include <linux/scatterlist.h>
#include <linux/ratelimit.h>
-#include <linux/fscrypto.h>
+#include <crypto/skcipher.h>
+#include "fscrypt_private.h"
-/**
- * fname_crypt_complete() - completion callback for filename crypto
- * @req: The asynchronous cipher request context
- * @res: The result of the cipher operation
- */
-static void fname_crypt_complete(struct crypto_async_request *req, int res)
+static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
{
- struct fscrypt_completion_result *ecr = req->data;
+ if (str->len == 1 && str->name[0] == '.')
+ return true;
- if (res == -EINPROGRESS)
- return;
- ecr->res = res;
- complete(&ecr->completion);
+ if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.')
+ return true;
+
+ return false;
}
/**
* fname_encrypt() - encrypt a filename
*
- * The caller must have allocated sufficient memory for the @oname string.
+ * The output buffer must be at least as large as the input buffer.
+ * Any extra space is filled with NUL padding before encryption.
*
* Return: 0 on success, -errno on failure
*/
-static int fname_encrypt(struct inode *inode,
- const struct qstr *iname, struct fscrypt_str *oname)
+int fname_encrypt(struct inode *inode, const struct qstr *iname,
+ u8 *out, unsigned int olen)
{
struct skcipher_request *req = NULL;
- DECLARE_FS_COMPLETION_RESULT(ecr);
+ DECLARE_CRYPTO_WAIT(wait);
struct fscrypt_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_ctfm;
- int res = 0;
- char iv[FS_CRYPTO_BLOCK_SIZE];
+ union fscrypt_iv iv;
struct scatterlist sg;
- int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK);
- unsigned int lim;
- unsigned int cryptlen;
-
- lim = inode->i_sb->s_cop->max_namelen(inode);
- if (iname->len <= 0 || iname->len > lim)
- return -EIO;
+ int res;
/*
* Copy the filename to the output buffer for encrypting in-place and
* pad it with the needed number of NUL bytes.
*/
- cryptlen = max_t(unsigned int, iname->len, FS_CRYPTO_BLOCK_SIZE);
- cryptlen = round_up(cryptlen, padding);
- cryptlen = min(cryptlen, lim);
- memcpy(oname->name, iname->name, iname->len);
- memset(oname->name + iname->len, 0, cryptlen - iname->len);
+ if (WARN_ON(olen < iname->len))
+ return -ENOBUFS;
+ memcpy(out, iname->name, iname->len);
+ memset(out + iname->len, 0, olen - iname->len);
/* Initialize the IV */
- memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
+ fscrypt_generate_iv(&iv, 0, ci);
/* Set up the encryption request */
req = skcipher_request_alloc(tfm, GFP_NOFS);
- if (!req) {
- printk_ratelimited(KERN_ERR
- "%s: skcipher_request_alloc() failed\n", __func__);
+ if (!req)
return -ENOMEM;
- }
skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- fname_crypt_complete, &ecr);
- sg_init_one(&sg, oname->name, cryptlen);
- skcipher_request_set_crypt(req, &sg, &sg, cryptlen, iv);
+ crypto_req_done, &wait);
+ sg_init_one(&sg, out, olen);
+ skcipher_request_set_crypt(req, &sg, &sg, olen, &iv);
/* Do the encryption */
- res = crypto_skcipher_encrypt(req);
- if (res == -EINPROGRESS || res == -EBUSY) {
- /* Request is being completed asynchronously; wait for it */
- wait_for_completion(&ecr.completion);
- res = ecr.res;
- }
+ res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
skcipher_request_free(req);
if (res < 0) {
- printk_ratelimited(KERN_ERR
- "%s: Error (error code %d)\n", __func__, res);
+ fscrypt_err(inode->i_sb,
+ "Filename encryption failed for inode %lu: %d",
+ inode->i_ino, res);
return res;
}
- oname->len = cryptlen;
return 0;
}
@@ -110,45 +92,34 @@ static int fname_decrypt(struct inode *inode,
struct fscrypt_str *oname)
{
struct skcipher_request *req = NULL;
- DECLARE_FS_COMPLETION_RESULT(ecr);
+ DECLARE_CRYPTO_WAIT(wait);
struct scatterlist src_sg, dst_sg;
struct fscrypt_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_ctfm;
- int res = 0;
- char iv[FS_CRYPTO_BLOCK_SIZE];
- unsigned lim;
-
- lim = inode->i_sb->s_cop->max_namelen(inode);
- if (iname->len <= 0 || iname->len > lim)
- return -EIO;
+ union fscrypt_iv iv;
+ int res;
/* Allocate request */
req = skcipher_request_alloc(tfm, GFP_NOFS);
- if (!req) {
- printk_ratelimited(KERN_ERR
- "%s: crypto_request_alloc() failed\n", __func__);
+ if (!req)
return -ENOMEM;
- }
skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- fname_crypt_complete, &ecr);
+ crypto_req_done, &wait);
/* Initialize IV */
- memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
+ fscrypt_generate_iv(&iv, 0, ci);
/* Create decryption request */
sg_init_one(&src_sg, iname->name, iname->len);
sg_init_one(&dst_sg, oname->name, oname->len);
- skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
- res = crypto_skcipher_decrypt(req);
- if (res == -EINPROGRESS || res == -EBUSY) {
- wait_for_completion(&ecr.completion);
- res = ecr.res;
- }
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, &iv);
+ res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
skcipher_request_free(req);
if (res < 0) {
- printk_ratelimited(KERN_ERR
- "%s: Error (error code %d)\n", __func__, res);
+ fscrypt_err(inode->i_sb,
+ "Filename decryption failed for inode %lu: %d",
+ inode->i_ino, res);
return res;
}
@@ -159,6 +130,8 @@ static int fname_decrypt(struct inode *inode,
static const char *lookup_table =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
+#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3)
+
/**
* digest_encode() -
*
@@ -209,47 +182,52 @@ static int digest_decode(const char *src, int len, char *dst)
return cp - dst;
}
-u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen)
+bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
+ u32 max_len, u32 *encrypted_len_ret)
{
- int padding = 32;
- struct fscrypt_info *ci = inode->i_crypt_info;
-
- if (ci)
- padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK);
- ilen = max(ilen, (u32)FS_CRYPTO_BLOCK_SIZE);
- return round_up(ilen, padding);
+ int padding = 4 << (inode->i_crypt_info->ci_flags &
+ FS_POLICY_FLAGS_PAD_MASK);
+ u32 encrypted_len;
+
+ if (orig_len > max_len)
+ return false;
+ encrypted_len = max(orig_len, (u32)FS_CRYPTO_BLOCK_SIZE);
+ encrypted_len = round_up(encrypted_len, padding);
+ *encrypted_len_ret = min(encrypted_len, max_len);
+ return true;
}
-EXPORT_SYMBOL(fscrypt_fname_encrypted_size);
/**
- * fscrypt_fname_crypto_alloc_obuff() -
+ * fscrypt_fname_alloc_buffer - allocate a buffer for presented filenames
*
- * Allocates an output buffer that is sufficient for the crypto operation
- * specified by the context and the direction.
+ * Allocate a buffer that is large enough to hold any decrypted or encoded
+ * filename (null-terminated), for the given maximum encrypted filename length.
+ *
+ * Return: 0 on success, -errno on failure
*/
-int fscrypt_fname_alloc_buffer(struct inode *inode,
- u32 ilen, struct fscrypt_str *crypto_str)
+int fscrypt_fname_alloc_buffer(const struct inode *inode,
+ u32 max_encrypted_len,
+ struct fscrypt_str *crypto_str)
{
- unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen);
+ const u32 max_encoded_len =
+ max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE),
+ 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name)));
+ u32 max_presented_len;
- crypto_str->len = olen;
- if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2)
- olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2;
- /*
- * Allocated buffer can hold one more character to null-terminate the
- * string
- */
- crypto_str->name = kmalloc(olen + 1, GFP_NOFS);
- if (!(crypto_str->name))
+ max_presented_len = max(max_encoded_len, max_encrypted_len);
+
+ crypto_str->name = kmalloc(max_presented_len + 1, GFP_NOFS);
+ if (!crypto_str->name)
return -ENOMEM;
+ crypto_str->len = max_presented_len;
return 0;
}
EXPORT_SYMBOL(fscrypt_fname_alloc_buffer);
/**
- * fscrypt_fname_crypto_free_buffer() -
+ * fscrypt_fname_free_buffer - free the buffer for presented filenames
*
- * Frees the buffer allocated for crypto operation.
+ * Free the buffer allocated by fscrypt_fname_alloc_buffer().
*/
void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
{
@@ -266,6 +244,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer);
*
* The caller must have allocated sufficient memory for the @oname string.
*
+ * If the key is available, we'll decrypt the disk name; otherwise, we'll encode
+ * it for presentation. Short names are directly base64-encoded, while long
+ * names are encoded in fscrypt_digested_name format.
+ *
* Return: 0 on success, -errno on failure
*/
int fscrypt_fname_disk_to_usr(struct inode *inode,
@@ -274,7 +256,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode,
struct fscrypt_str *oname)
{
const struct qstr qname = FSTR_TO_QSTR(iname);
- char buf[24];
+ struct fscrypt_digested_name digested_name;
if (fscrypt_is_dot_dotdot(&qname)) {
oname->name[0] = '.';
@@ -289,77 +271,82 @@ int fscrypt_fname_disk_to_usr(struct inode *inode,
if (inode->i_crypt_info)
return fname_decrypt(inode, iname, oname);
- if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) {
+ if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) {
oname->len = digest_encode(iname->name, iname->len,
oname->name);
return 0;
}
if (hash) {
- memcpy(buf, &hash, 4);
- memcpy(buf + 4, &minor_hash, 4);
+ digested_name.hash = hash;
+ digested_name.minor_hash = minor_hash;
} else {
- memset(buf, 0, 8);
+ digested_name.hash = 0;
+ digested_name.minor_hash = 0;
}
- memcpy(buf + 8, iname->name + ((iname->len - 17) & ~15), 16);
+ memcpy(digested_name.digest,
+ FSCRYPT_FNAME_DIGEST(iname->name, iname->len),
+ FSCRYPT_FNAME_DIGEST_SIZE);
oname->name[0] = '_';
- oname->len = 1 + digest_encode(buf, 24, oname->name + 1);
+ oname->len = 1 + digest_encode((const char *)&digested_name,
+ sizeof(digested_name), oname->name + 1);
return 0;
}
EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
/**
- * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk
- * space
+ * fscrypt_setup_filename() - prepare to search a possibly encrypted directory
+ * @dir: the directory that will be searched
+ * @iname: the user-provided filename being searched for
+ * @lookup: 1 if we're allowed to proceed without the key because it's
+ * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot
+ * proceed without the key because we're going to create the dir_entry.
+ * @fname: the filename information to be filled in
*
- * The caller must have allocated sufficient memory for the @oname string.
+ * Given a user-provided filename @iname, this function sets @fname->disk_name
+ * to the name that would be stored in the on-disk directory entry, if possible.
+ * If the directory is unencrypted this is simply @iname. Else, if we have the
+ * directory's encryption key, then @iname is the plaintext, so we encrypt it to
+ * get the disk_name.
+ *
+ * Else, for keyless @lookup operations, @iname is the presented ciphertext, so
+ * we decode it to get either the ciphertext disk_name (for short names) or the
+ * fscrypt_digested_name (for long names). Non-@lookup operations will be
+ * impossible in this case, so we fail them with ENOKEY.
+ *
+ * If successful, fscrypt_free_filename() must be called later to clean up.
*
* Return: 0 on success, -errno on failure
*/
-int fscrypt_fname_usr_to_disk(struct inode *inode,
- const struct qstr *iname,
- struct fscrypt_str *oname)
-{
- if (fscrypt_is_dot_dotdot(iname)) {
- oname->name[0] = '.';
- oname->name[iname->len - 1] = '.';
- oname->len = iname->len;
- return 0;
- }
- if (inode->i_crypt_info)
- return fname_encrypt(inode, iname, oname);
- /*
- * Without a proper key, a user is not allowed to modify the filenames
- * in a directory. Consequently, a user space name cannot be mapped to
- * a disk-space name
- */
- return -ENOKEY;
-}
-EXPORT_SYMBOL(fscrypt_fname_usr_to_disk);
-
int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
int lookup, struct fscrypt_name *fname)
{
- int ret = 0, bigname = 0;
+ int ret;
+ int digested;
memset(fname, 0, sizeof(struct fscrypt_name));
fname->usr_fname = iname;
- if (!dir->i_sb->s_cop->is_encrypted(dir) ||
- fscrypt_is_dot_dotdot(iname)) {
+ if (!IS_ENCRYPTED(dir) || fscrypt_is_dot_dotdot(iname)) {
fname->disk_name.name = (unsigned char *)iname->name;
fname->disk_name.len = iname->len;
return 0;
}
ret = fscrypt_get_encryption_info(dir);
- if (ret && ret != -EOPNOTSUPP)
+ if (ret)
return ret;
if (dir->i_crypt_info) {
- ret = fscrypt_fname_alloc_buffer(dir, iname->len,
- &fname->crypto_buf);
- if (ret)
- return ret;
- ret = fname_encrypt(dir, iname, &fname->crypto_buf);
+ if (!fscrypt_fname_encrypted_size(dir, iname->len,
+ dir->i_sb->s_cop->max_namelen,
+ &fname->crypto_buf.len))
+ return -ENAMETOOLONG;
+ fname->crypto_buf.name = kmalloc(fname->crypto_buf.len,
+ GFP_NOFS);
+ if (!fname->crypto_buf.name)
+ return -ENOMEM;
+
+ ret = fname_encrypt(dir, iname, fname->crypto_buf.name,
+ fname->crypto_buf.len);
if (ret)
goto errout;
fname->disk_name.name = fname->crypto_buf.name;
@@ -373,25 +360,37 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
* We don't have the key and we are doing a lookup; decode the
* user-supplied name
*/
- if (iname->name[0] == '_')
- bigname = 1;
- if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43)))
- return -ENOENT;
+ if (iname->name[0] == '_') {
+ if (iname->len !=
+ 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name)))
+ return -ENOENT;
+ digested = 1;
+ } else {
+ if (iname->len >
+ BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE))
+ return -ENOENT;
+ digested = 0;
+ }
- fname->crypto_buf.name = kmalloc(32, GFP_KERNEL);
+ fname->crypto_buf.name =
+ kmalloc(max_t(size_t, FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE,
+ sizeof(struct fscrypt_digested_name)),
+ GFP_KERNEL);
if (fname->crypto_buf.name == NULL)
return -ENOMEM;
- ret = digest_decode(iname->name + bigname, iname->len - bigname,
+ ret = digest_decode(iname->name + digested, iname->len - digested,
fname->crypto_buf.name);
if (ret < 0) {
ret = -ENOENT;
goto errout;
}
fname->crypto_buf.len = ret;
- if (bigname) {
- memcpy(&fname->hash, fname->crypto_buf.name, 4);
- memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4);
+ if (digested) {
+ const struct fscrypt_digested_name *n =
+ (const void *)fname->crypto_buf.name;
+ fname->hash = n->hash;
+ fname->minor_hash = n->minor_hash;
} else {
fname->disk_name.name = fname->crypto_buf.name;
fname->disk_name.len = fname->crypto_buf.len;
@@ -399,16 +398,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
return 0;
errout:
- fscrypt_fname_free_buffer(&fname->crypto_buf);
+ kfree(fname->crypto_buf.name);
return ret;
}
EXPORT_SYMBOL(fscrypt_setup_filename);
-
-void fscrypt_free_filename(struct fscrypt_name *fname)
-{
- kfree(fname->crypto_buf.name);
- fname->crypto_buf.name = NULL;
- fname->usr_fname = NULL;
- fname->disk_name.name = NULL;
-}
-EXPORT_SYMBOL(fscrypt_free_filename);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
new file mode 100644
index 000000000000..681963a235b8
--- /dev/null
+++ b/fs/crypto/fscrypt_private.h
@@ -0,0 +1,174 @@
+/*
+ * fscrypt_private.h
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption key functions.
+ *
+ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
+ */
+
+#ifndef _FSCRYPT_PRIVATE_H
+#define _FSCRYPT_PRIVATE_H
+
+#define __FS_HAS_ENCRYPTION 1
+#include <linux/fscrypt.h>
+#include <crypto/hash.h>
+
+/* Encryption parameters */
+#define FS_KEY_DERIVATION_NONCE_SIZE 16
+
+/**
+ * Encryption context for inode
+ *
+ * Protector format:
+ * 1 byte: Protector format (1 = this version)
+ * 1 byte: File contents encryption mode
+ * 1 byte: File names encryption mode
+ * 1 byte: Flags
+ * 8 bytes: Master Key descriptor
+ * 16 bytes: Encryption Key derivation nonce
+ */
+struct fscrypt_context {
+ u8 format;
+ u8 contents_encryption_mode;
+ u8 filenames_encryption_mode;
+ u8 flags;
+ u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+ u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+} __packed;
+
+#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1
+
+/**
+ * For encrypted symlinks, the ciphertext length is stored at the beginning
+ * of the string in little-endian format.
+ */
+struct fscrypt_symlink_data {
+ __le16 len;
+ char encrypted_path[1];
+} __packed;
+
+/*
+ * fscrypt_info - the "encryption key" for an inode
+ *
+ * When an encrypted file's key is made available, an instance of this struct is
+ * allocated and stored in ->i_crypt_info. Once created, it remains until the
+ * inode is evicted.
+ */
+struct fscrypt_info {
+
+ /* The actual crypto transform used for encryption and decryption */
+ struct crypto_skcipher *ci_ctfm;
+
+ /*
+ * Cipher for ESSIV IV generation. Only set for CBC contents
+ * encryption, otherwise is NULL.
+ */
+ struct crypto_cipher *ci_essiv_tfm;
+
+ /*
+ * Encryption mode used for this inode. It corresponds to either
+ * ci_data_mode or ci_filename_mode, depending on the inode type.
+ */
+ struct fscrypt_mode *ci_mode;
+
+ /*
+ * If non-NULL, then this inode uses a master key directly rather than a
+ * derived key, and ci_ctfm will equal ci_master_key->mk_ctfm.
+ * Otherwise, this inode uses a derived key.
+ */
+ struct fscrypt_master_key *ci_master_key;
+
+ /* fields from the fscrypt_context */
+ u8 ci_data_mode;
+ u8 ci_filename_mode;
+ u8 ci_flags;
+ u8 ci_master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+ u8 ci_nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+};
+
+typedef enum {
+ FS_DECRYPT = 0,
+ FS_ENCRYPT,
+} fscrypt_direction_t;
+
+#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001
+#define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002
+
+static inline bool fscrypt_valid_enc_modes(u32 contents_mode,
+ u32 filenames_mode)
+{
+ if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC &&
+ filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS)
+ return true;
+
+ if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS &&
+ filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS)
+ return true;
+
+ if (contents_mode == FS_ENCRYPTION_MODE_ADIANTUM &&
+ filenames_mode == FS_ENCRYPTION_MODE_ADIANTUM)
+ return true;
+
+ return false;
+}
+
+/* crypto.c */
+extern struct kmem_cache *fscrypt_info_cachep;
+extern int fscrypt_initialize(unsigned int cop_flags);
+extern int fscrypt_do_page_crypto(const struct inode *inode,
+ fscrypt_direction_t rw, u64 lblk_num,
+ struct page *src_page,
+ struct page *dest_page,
+ unsigned int len, unsigned int offs,
+ gfp_t gfp_flags);
+extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx,
+ gfp_t gfp_flags);
+extern const struct dentry_operations fscrypt_d_ops;
+
+extern void __printf(3, 4) __cold
+fscrypt_msg(struct super_block *sb, const char *level, const char *fmt, ...);
+
+#define fscrypt_warn(sb, fmt, ...) \
+ fscrypt_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__)
+#define fscrypt_err(sb, fmt, ...) \
+ fscrypt_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__)
+
+#define FSCRYPT_MAX_IV_SIZE 32
+
+union fscrypt_iv {
+ struct {
+ /* logical block number within the file */
+ __le64 lblk_num;
+
+ /* per-file nonce; only set in DIRECT_KEY mode */
+ u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+ };
+ u8 raw[FSCRYPT_MAX_IV_SIZE];
+};
+
+void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
+ const struct fscrypt_info *ci);
+
+/* fname.c */
+extern int fname_encrypt(struct inode *inode, const struct qstr *iname,
+ u8 *out, unsigned int olen);
+extern bool fscrypt_fname_encrypted_size(const struct inode *inode,
+ u32 orig_len, u32 max_len,
+ u32 *encrypted_len_ret);
+
+/* keyinfo.c */
+
+struct fscrypt_mode {
+ const char *friendly_name;
+ const char *cipher_str;
+ int keysize;
+ int ivsize;
+ bool logged_impl_name;
+ bool needs_essiv;
+};
+
+extern void __exit fscrypt_essiv_cleanup(void);
+
+#endif /* _FSCRYPT_PRIVATE_H */
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
new file mode 100644
index 000000000000..926e5df20ec3
--- /dev/null
+++ b/fs/crypto/hooks.c
@@ -0,0 +1,271 @@
+/*
+ * fs/crypto/hooks.c
+ *
+ * Encryption hooks for higher-level filesystem operations.
+ */
+
+#include <linux/ratelimit.h>
+#include "fscrypt_private.h"
+
+/**
+ * fscrypt_file_open - prepare to open a possibly-encrypted regular file
+ * @inode: the inode being opened
+ * @filp: the struct file being set up
+ *
+ * Currently, an encrypted regular file can only be opened if its encryption key
+ * is available; access to the raw encrypted contents is not supported.
+ * Therefore, we first set up the inode's encryption key (if not already done)
+ * and return an error if it's unavailable.
+ *
+ * We also verify that if the parent directory (from the path via which the file
+ * is being opened) is encrypted, then the inode being opened uses the same
+ * encryption policy. This is needed as part of the enforcement that all files
+ * in an encrypted directory tree use the same encryption policy, as a
+ * protection against certain types of offline attacks. Note that this check is
+ * needed even when opening an *unencrypted* file, since it's forbidden to have
+ * an unencrypted file in an encrypted directory.
+ *
+ * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code
+ */
+int fscrypt_file_open(struct inode *inode, struct file *filp)
+{
+ int err;
+ struct dentry *dir;
+
+ err = fscrypt_require_key(inode);
+ if (err)
+ return err;
+
+ dir = dget_parent(file_dentry(filp));
+ if (IS_ENCRYPTED(d_inode(dir)) &&
+ !fscrypt_has_permitted_context(d_inode(dir), inode)) {
+ fscrypt_warn(inode->i_sb,
+ "inconsistent encryption contexts: %lu/%lu",
+ d_inode(dir)->i_ino, inode->i_ino);
+ err = -EPERM;
+ }
+ dput(dir);
+ return err;
+}
+EXPORT_SYMBOL_GPL(fscrypt_file_open);
+
+int __fscrypt_prepare_link(struct inode *inode, struct inode *dir)
+{
+ int err;
+
+ err = fscrypt_require_key(dir);
+ if (err)
+ return err;
+
+ if (!fscrypt_has_permitted_context(dir, inode))
+ return -EPERM;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__fscrypt_prepare_link);
+
+int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ int err;
+
+ err = fscrypt_require_key(old_dir);
+ if (err)
+ return err;
+
+ err = fscrypt_require_key(new_dir);
+ if (err)
+ return err;
+
+ if (old_dir != new_dir) {
+ if (IS_ENCRYPTED(new_dir) &&
+ !fscrypt_has_permitted_context(new_dir,
+ d_inode(old_dentry)))
+ return -EPERM;
+
+ if ((flags & RENAME_EXCHANGE) &&
+ IS_ENCRYPTED(old_dir) &&
+ !fscrypt_has_permitted_context(old_dir,
+ d_inode(new_dentry)))
+ return -EPERM;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__fscrypt_prepare_rename);
+
+int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry)
+{
+ int err = fscrypt_get_encryption_info(dir);
+
+ if (err)
+ return err;
+
+ if (fscrypt_has_encryption_key(dir)) {
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY;
+ spin_unlock(&dentry->d_lock);
+ }
+
+ d_set_d_op(dentry, &fscrypt_d_ops);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup);
+
+int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len,
+ unsigned int max_len,
+ struct fscrypt_str *disk_link)
+{
+ int err;
+
+ /*
+ * To calculate the size of the encrypted symlink target we need to know
+ * the amount of NUL padding, which is determined by the flags set in
+ * the encryption policy which will be inherited from the directory.
+ * The easiest way to get access to this is to just load the directory's
+ * fscrypt_info, since we'll need it to create the dir_entry anyway.
+ *
+ * Note: in test_dummy_encryption mode, @dir may be unencrypted.
+ */
+ err = fscrypt_get_encryption_info(dir);
+ if (err)
+ return err;
+ if (!fscrypt_has_encryption_key(dir))
+ return -ENOKEY;
+
+ /*
+ * Calculate the size of the encrypted symlink and verify it won't
+ * exceed max_len. Note that for historical reasons, encrypted symlink
+ * targets are prefixed with the ciphertext length, despite this
+ * actually being redundant with i_size. This decreases by 2 bytes the
+ * longest symlink target we can accept.
+ *
+ * We could recover 1 byte by not counting a null terminator, but
+ * counting it (even though it is meaningless for ciphertext) is simpler
+ * for now since filesystems will assume it is there and subtract it.
+ */
+ if (!fscrypt_fname_encrypted_size(dir, len,
+ max_len - sizeof(struct fscrypt_symlink_data),
+ &disk_link->len))
+ return -ENAMETOOLONG;
+ disk_link->len += sizeof(struct fscrypt_symlink_data);
+
+ disk_link->name = NULL;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__fscrypt_prepare_symlink);
+
+int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
+ unsigned int len, struct fscrypt_str *disk_link)
+{
+ int err;
+ struct qstr iname = QSTR_INIT(target, len);
+ struct fscrypt_symlink_data *sd;
+ unsigned int ciphertext_len;
+
+ err = fscrypt_require_key(inode);
+ if (err)
+ return err;
+
+ if (disk_link->name) {
+ /* filesystem-provided buffer */
+ sd = (struct fscrypt_symlink_data *)disk_link->name;
+ } else {
+ sd = kmalloc(disk_link->len, GFP_NOFS);
+ if (!sd)
+ return -ENOMEM;
+ }
+ ciphertext_len = disk_link->len - sizeof(*sd);
+ sd->len = cpu_to_le16(ciphertext_len);
+
+ err = fname_encrypt(inode, &iname, sd->encrypted_path, ciphertext_len);
+ if (err) {
+ if (!disk_link->name)
+ kfree(sd);
+ return err;
+ }
+ /*
+ * Null-terminating the ciphertext doesn't make sense, but we still
+ * count the null terminator in the length, so we might as well
+ * initialize it just in case the filesystem writes it out.
+ */
+ sd->encrypted_path[ciphertext_len] = '\0';
+
+ if (!disk_link->name)
+ disk_link->name = (unsigned char *)sd;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink);
+
+/**
+ * fscrypt_get_symlink - get the target of an encrypted symlink
+ * @inode: the symlink inode
+ * @caddr: the on-disk contents of the symlink
+ * @max_size: size of @caddr buffer
+ * @done: if successful, will be set up to free the returned target
+ *
+ * If the symlink's encryption key is available, we decrypt its target.
+ * Otherwise, we encode its target for presentation.
+ *
+ * This may sleep, so the filesystem must have dropped out of RCU mode already.
+ *
+ * Return: the presentable symlink target or an ERR_PTR()
+ */
+const char *fscrypt_get_symlink(struct inode *inode, const void *caddr,
+ unsigned int max_size,
+ struct delayed_call *done)
+{
+ const struct fscrypt_symlink_data *sd;
+ struct fscrypt_str cstr, pstr;
+ int err;
+
+ /* This is for encrypted symlinks only */
+ if (WARN_ON(!IS_ENCRYPTED(inode)))
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * Try to set up the symlink's encryption key, but we can continue
+ * regardless of whether the key is available or not.
+ */
+ err = fscrypt_get_encryption_info(inode);
+ if (err)
+ return ERR_PTR(err);
+
+ /*
+ * For historical reasons, encrypted symlink targets are prefixed with
+ * the ciphertext length, even though this is redundant with i_size.
+ */
+
+ if (max_size < sizeof(*sd))
+ return ERR_PTR(-EUCLEAN);
+ sd = caddr;
+ cstr.name = (unsigned char *)sd->encrypted_path;
+ cstr.len = le16_to_cpu(sd->len);
+
+ if (cstr.len == 0)
+ return ERR_PTR(-EUCLEAN);
+
+ if (cstr.len + sizeof(*sd) - 1 > max_size)
+ return ERR_PTR(-EUCLEAN);
+
+ err = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr);
+ if (err)
+ return ERR_PTR(err);
+
+ err = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr);
+ if (err)
+ goto err_kfree;
+
+ err = -EUCLEAN;
+ if (pstr.name[0] == '\0')
+ goto err_kfree;
+
+ pstr.name[pstr.len] = '\0';
+ set_delayed_call(done, kfree_link, pstr.name);
+ return pstr.name;
+
+err_kfree:
+ kfree(pstr.name);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(fscrypt_get_symlink);
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index a755fa1a0017..98472a4670fb 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -9,35 +9,35 @@
*/
#include <keys/user-type.h>
+#include <linux/hashtable.h>
#include <linux/scatterlist.h>
-#include <linux/fscrypto.h>
+#include <linux/ratelimit.h>
+#include <crypto/aes.h>
+#include <crypto/algapi.h>
+#include <crypto/sha.h>
+#include <crypto/skcipher.h>
+#include "fscrypt_private.h"
-static void derive_crypt_complete(struct crypto_async_request *req, int rc)
-{
- struct fscrypt_completion_result *ecr = req->data;
-
- if (rc == -EINPROGRESS)
- return;
+static struct crypto_shash *essiv_hash_tfm;
- ecr->res = rc;
- complete(&ecr->completion);
-}
+/* Table of keys referenced by FS_POLICY_FLAG_DIRECT_KEY policies */
+static DEFINE_HASHTABLE(fscrypt_master_keys, 6); /* 6 bits = 64 buckets */
+static DEFINE_SPINLOCK(fscrypt_master_keys_lock);
-/**
- * derive_key_aes() - Derive a key using AES-128-ECB
- * @deriving_key: Encryption key used for derivation.
- * @source_key: Source key to which to apply derivation.
- * @derived_key: Derived key.
+/*
+ * Key derivation function. This generates the derived key by encrypting the
+ * master key with AES-128-ECB using the inode's nonce as the AES key.
*
- * Return: Zero on success; non-zero otherwise.
+ * The master key must be at least as long as the derived key. If the master
+ * key is longer, then only the first 'derived_keysize' bytes are used.
*/
-static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
- u8 source_key[FS_AES_256_XTS_KEY_SIZE],
- u8 derived_key[FS_AES_256_XTS_KEY_SIZE])
+static int derive_key_aes(const u8 *master_key,
+ const struct fscrypt_context *ctx,
+ u8 *derived_key, unsigned int derived_keysize)
{
int res = 0;
struct skcipher_request *req = NULL;
- DECLARE_FS_COMPLETION_RESULT(ecr);
+ DECLARE_CRYPTO_WAIT(wait);
struct scatterlist src_sg, dst_sg;
struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
@@ -54,116 +54,436 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
}
skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- derive_crypt_complete, &ecr);
- res = crypto_skcipher_setkey(tfm, deriving_key,
- FS_AES_128_ECB_KEY_SIZE);
+ crypto_req_done, &wait);
+ res = crypto_skcipher_setkey(tfm, ctx->nonce, sizeof(ctx->nonce));
if (res < 0)
goto out;
- sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE);
- sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE);
- skcipher_request_set_crypt(req, &src_sg, &dst_sg,
- FS_AES_256_XTS_KEY_SIZE, NULL);
- res = crypto_skcipher_encrypt(req);
- if (res == -EINPROGRESS || res == -EBUSY) {
- wait_for_completion(&ecr.completion);
- res = ecr.res;
- }
+ sg_init_one(&src_sg, master_key, derived_keysize);
+ sg_init_one(&dst_sg, derived_key, derived_keysize);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize,
+ NULL);
+ res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
out:
skcipher_request_free(req);
crypto_free_skcipher(tfm);
return res;
}
-static int validate_user_key(struct fscrypt_info *crypt_info,
- struct fscrypt_context *ctx, u8 *raw_key,
- u8 *prefix, int prefix_size)
+/*
+ * Search the current task's subscribed keyrings for a "logon" key with
+ * description prefix:descriptor, and if found acquire a read lock on it and
+ * return a pointer to its validated payload in *payload_ret.
+ */
+static struct key *
+find_and_lock_process_key(const char *prefix,
+ const u8 descriptor[FS_KEY_DESCRIPTOR_SIZE],
+ unsigned int min_keysize,
+ const struct fscrypt_key **payload_ret)
{
- u8 *full_key_descriptor;
- struct key *keyring_key;
- struct fscrypt_key *master_key;
+ char *description;
+ struct key *key;
const struct user_key_payload *ukp;
- int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1;
- int res;
+ const struct fscrypt_key *payload;
- full_key_descriptor = kmalloc(full_key_len, GFP_NOFS);
- if (!full_key_descriptor)
- return -ENOMEM;
+ description = kasprintf(GFP_NOFS, "%s%*phN", prefix,
+ FS_KEY_DESCRIPTOR_SIZE, descriptor);
+ if (!description)
+ return ERR_PTR(-ENOMEM);
- memcpy(full_key_descriptor, prefix, prefix_size);
- sprintf(full_key_descriptor + prefix_size,
- "%*phN", FS_KEY_DESCRIPTOR_SIZE,
- ctx->master_key_descriptor);
- full_key_descriptor[full_key_len - 1] = '\0';
- keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
- kfree(full_key_descriptor);
- if (IS_ERR(keyring_key))
- return PTR_ERR(keyring_key);
- down_read(&keyring_key->sem);
-
- if (keyring_key->type != &key_type_logon) {
- printk_once(KERN_WARNING
- "%s: key type must be logon\n", __func__);
- res = -ENOKEY;
- goto out;
- }
- ukp = user_key_payload(keyring_key);
- if (!ukp) {
- /* key was revoked before we acquired its semaphore */
- res = -EKEYREVOKED;
- goto out;
+ key = request_key(&key_type_logon, description, NULL);
+ kfree(description);
+ if (IS_ERR(key))
+ return key;
+
+ down_read(&key->sem);
+ ukp = user_key_payload_locked(key);
+
+ if (!ukp) /* was the key revoked before we acquired its semaphore? */
+ goto invalid;
+
+ payload = (const struct fscrypt_key *)ukp->data;
+
+ if (ukp->datalen != sizeof(struct fscrypt_key) ||
+ payload->size < 1 || payload->size > FS_MAX_KEY_SIZE) {
+ fscrypt_warn(NULL,
+ "key with description '%s' has invalid payload",
+ key->description);
+ goto invalid;
}
- if (ukp->datalen != sizeof(struct fscrypt_key)) {
- res = -EINVAL;
- goto out;
+
+ if (payload->size < min_keysize) {
+ fscrypt_warn(NULL,
+ "key with description '%s' is too short (got %u bytes, need %u+ bytes)",
+ key->description, payload->size, min_keysize);
+ goto invalid;
}
- master_key = (struct fscrypt_key *)ukp->data;
- BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE);
-
- if (master_key->size != FS_AES_256_XTS_KEY_SIZE) {
- printk_once(KERN_WARNING
- "%s: key size incorrect: %d\n",
- __func__, master_key->size);
- res = -ENOKEY;
- goto out;
+
+ *payload_ret = payload;
+ return key;
+
+invalid:
+ up_read(&key->sem);
+ key_put(key);
+ return ERR_PTR(-ENOKEY);
+}
+
+static struct fscrypt_mode available_modes[] = {
+ [FS_ENCRYPTION_MODE_AES_256_XTS] = {
+ .friendly_name = "AES-256-XTS",
+ .cipher_str = "xts(aes)",
+ .keysize = 64,
+ .ivsize = 16,
+ },
+ [FS_ENCRYPTION_MODE_AES_256_CTS] = {
+ .friendly_name = "AES-256-CTS-CBC",
+ .cipher_str = "cts(cbc(aes))",
+ .keysize = 32,
+ .ivsize = 16,
+ },
+ [FS_ENCRYPTION_MODE_AES_128_CBC] = {
+ .friendly_name = "AES-128-CBC",
+ .cipher_str = "cbc(aes)",
+ .keysize = 16,
+ .ivsize = 16,
+ .needs_essiv = true,
+ },
+ [FS_ENCRYPTION_MODE_AES_128_CTS] = {
+ .friendly_name = "AES-128-CTS-CBC",
+ .cipher_str = "cts(cbc(aes))",
+ .keysize = 16,
+ .ivsize = 16,
+ },
+ [FS_ENCRYPTION_MODE_ADIANTUM] = {
+ .friendly_name = "Adiantum",
+ .cipher_str = "adiantum(xchacha12,aes)",
+ .keysize = 32,
+ .ivsize = 32,
+ },
+};
+
+static struct fscrypt_mode *
+select_encryption_mode(const struct fscrypt_info *ci, const struct inode *inode)
+{
+ if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) {
+ fscrypt_warn(inode->i_sb,
+ "inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)",
+ inode->i_ino, ci->ci_data_mode,
+ ci->ci_filename_mode);
+ return ERR_PTR(-EINVAL);
}
- res = derive_key_aes(ctx->nonce, master_key->raw, raw_key);
-out:
- up_read(&keyring_key->sem);
- key_put(keyring_key);
- return res;
+
+ if (S_ISREG(inode->i_mode))
+ return &available_modes[ci->ci_data_mode];
+
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ return &available_modes[ci->ci_filename_mode];
+
+ WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n",
+ inode->i_ino, (inode->i_mode & S_IFMT));
+ return ERR_PTR(-EINVAL);
}
-static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode,
- const char **cipher_str_ret, int *keysize_ret)
+/* Find the master key, then derive the inode's actual encryption key */
+static int find_and_derive_key(const struct inode *inode,
+ const struct fscrypt_context *ctx,
+ u8 *derived_key, const struct fscrypt_mode *mode)
{
- if (S_ISREG(inode->i_mode)) {
- if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) {
- *cipher_str_ret = "xts(aes)";
- *keysize_ret = FS_AES_256_XTS_KEY_SIZE;
- return 0;
+ struct key *key;
+ const struct fscrypt_key *payload;
+ int err;
+
+ key = find_and_lock_process_key(FS_KEY_DESC_PREFIX,
+ ctx->master_key_descriptor,
+ mode->keysize, &payload);
+ if (key == ERR_PTR(-ENOKEY) && inode->i_sb->s_cop->key_prefix) {
+ key = find_and_lock_process_key(inode->i_sb->s_cop->key_prefix,
+ ctx->master_key_descriptor,
+ mode->keysize, &payload);
+ }
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+
+ if (ctx->flags & FS_POLICY_FLAG_DIRECT_KEY) {
+ if (mode->ivsize < offsetofend(union fscrypt_iv, nonce)) {
+ fscrypt_warn(inode->i_sb,
+ "direct key mode not allowed with %s",
+ mode->friendly_name);
+ err = -EINVAL;
+ } else if (ctx->contents_encryption_mode !=
+ ctx->filenames_encryption_mode) {
+ fscrypt_warn(inode->i_sb,
+ "direct key mode not allowed with different contents and filenames modes");
+ err = -EINVAL;
+ } else {
+ memcpy(derived_key, payload->raw, mode->keysize);
+ err = 0;
}
- pr_warn_once("fscrypto: unsupported contents encryption mode "
- "%d for inode %lu\n",
- ci->ci_data_mode, inode->i_ino);
- return -ENOKEY;
+ } else {
+ err = derive_key_aes(payload->raw, ctx, derived_key,
+ mode->keysize);
}
+ up_read(&key->sem);
+ key_put(key);
+ return err;
+}
+
+/* Allocate and key a symmetric cipher object for the given encryption mode */
+static struct crypto_skcipher *
+allocate_skcipher_for_mode(struct fscrypt_mode *mode, const u8 *raw_key,
+ const struct inode *inode)
+{
+ struct crypto_skcipher *tfm;
+ int err;
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) {
- if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) {
- *cipher_str_ret = "cts(cbc(aes))";
- *keysize_ret = FS_AES_256_CTS_KEY_SIZE;
- return 0;
+ tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0);
+ if (IS_ERR(tfm)) {
+ fscrypt_warn(inode->i_sb,
+ "error allocating '%s' transform for inode %lu: %ld",
+ mode->cipher_str, inode->i_ino, PTR_ERR(tfm));
+ return tfm;
+ }
+ if (unlikely(!mode->logged_impl_name)) {
+ /*
+ * fscrypt performance can vary greatly depending on which
+ * crypto algorithm implementation is used. Help people debug
+ * performance problems by logging the ->cra_driver_name the
+ * first time a mode is used. Note that multiple threads can
+ * race here, but it doesn't really matter.
+ */
+ mode->logged_impl_name = true;
+ pr_info("fscrypt: %s using implementation \"%s\"\n",
+ mode->friendly_name,
+ crypto_skcipher_alg(tfm)->base.cra_driver_name);
+ }
+ crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize);
+ if (err)
+ goto err_free_tfm;
+
+ return tfm;
+
+err_free_tfm:
+ crypto_free_skcipher(tfm);
+ return ERR_PTR(err);
+}
+
+/* Master key referenced by FS_POLICY_FLAG_DIRECT_KEY policy */
+struct fscrypt_master_key {
+ struct hlist_node mk_node;
+ atomic_t mk_refcount;
+ const struct fscrypt_mode *mk_mode;
+ struct crypto_skcipher *mk_ctfm;
+ u8 mk_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+ u8 mk_raw[FS_MAX_KEY_SIZE];
+};
+
+static void free_master_key(struct fscrypt_master_key *mk)
+{
+ if (mk) {
+ crypto_free_skcipher(mk->mk_ctfm);
+ kzfree(mk);
+ }
+}
+
+static void put_master_key(struct fscrypt_master_key *mk)
+{
+ if (!atomic_dec_and_lock(&mk->mk_refcount, &fscrypt_master_keys_lock))
+ return;
+ hash_del(&mk->mk_node);
+ spin_unlock(&fscrypt_master_keys_lock);
+
+ free_master_key(mk);
+}
+
+/*
+ * Find/insert the given master key into the fscrypt_master_keys table. If
+ * found, it is returned with elevated refcount, and 'to_insert' is freed if
+ * non-NULL. If not found, 'to_insert' is inserted and returned if it's
+ * non-NULL; otherwise NULL is returned.
+ */
+static struct fscrypt_master_key *
+find_or_insert_master_key(struct fscrypt_master_key *to_insert,
+ const u8 *raw_key, const struct fscrypt_mode *mode,
+ const struct fscrypt_info *ci)
+{
+ unsigned long hash_key;
+ struct fscrypt_master_key *mk;
+
+ /*
+ * Careful: to avoid potentially leaking secret key bytes via timing
+ * information, we must key the hash table by descriptor rather than by
+ * raw key, and use crypto_memneq() when comparing raw keys.
+ */
+
+ BUILD_BUG_ON(sizeof(hash_key) > FS_KEY_DESCRIPTOR_SIZE);
+ memcpy(&hash_key, ci->ci_master_key_descriptor, sizeof(hash_key));
+
+ spin_lock(&fscrypt_master_keys_lock);
+ hash_for_each_possible(fscrypt_master_keys, mk, mk_node, hash_key) {
+ if (memcmp(ci->ci_master_key_descriptor, mk->mk_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE) != 0)
+ continue;
+ if (mode != mk->mk_mode)
+ continue;
+ if (crypto_memneq(raw_key, mk->mk_raw, mode->keysize))
+ continue;
+ /* using existing tfm with same (descriptor, mode, raw_key) */
+ atomic_inc(&mk->mk_refcount);
+ spin_unlock(&fscrypt_master_keys_lock);
+ free_master_key(to_insert);
+ return mk;
+ }
+ if (to_insert)
+ hash_add(fscrypt_master_keys, &to_insert->mk_node, hash_key);
+ spin_unlock(&fscrypt_master_keys_lock);
+ return to_insert;
+}
+
+/* Prepare to encrypt directly using the master key in the given mode */
+static struct fscrypt_master_key *
+fscrypt_get_master_key(const struct fscrypt_info *ci, struct fscrypt_mode *mode,
+ const u8 *raw_key, const struct inode *inode)
+{
+ struct fscrypt_master_key *mk;
+ int err;
+
+ /* Is there already a tfm for this key? */
+ mk = find_or_insert_master_key(NULL, raw_key, mode, ci);
+ if (mk)
+ return mk;
+
+ /* Nope, allocate one. */
+ mk = kzalloc(sizeof(*mk), GFP_NOFS);
+ if (!mk)
+ return ERR_PTR(-ENOMEM);
+ atomic_set(&mk->mk_refcount, 1);
+ mk->mk_mode = mode;
+ mk->mk_ctfm = allocate_skcipher_for_mode(mode, raw_key, inode);
+ if (IS_ERR(mk->mk_ctfm)) {
+ err = PTR_ERR(mk->mk_ctfm);
+ mk->mk_ctfm = NULL;
+ goto err_free_mk;
+ }
+ memcpy(mk->mk_descriptor, ci->ci_master_key_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE);
+ memcpy(mk->mk_raw, raw_key, mode->keysize);
+
+ return find_or_insert_master_key(mk, raw_key, mode, ci);
+
+err_free_mk:
+ free_master_key(mk);
+ return ERR_PTR(err);
+}
+
+static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt)
+{
+ struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm);
+
+ /* init hash transform on demand */
+ if (unlikely(!tfm)) {
+ struct crypto_shash *prev_tfm;
+
+ tfm = crypto_alloc_shash("sha256", 0, 0);
+ if (IS_ERR(tfm)) {
+ fscrypt_warn(NULL,
+ "error allocating SHA-256 transform: %ld",
+ PTR_ERR(tfm));
+ return PTR_ERR(tfm);
+ }
+ prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm);
+ if (prev_tfm) {
+ crypto_free_shash(tfm);
+ tfm = prev_tfm;
}
- pr_warn_once("fscrypto: unsupported filenames encryption mode "
- "%d for inode %lu\n",
- ci->ci_filename_mode, inode->i_ino);
- return -ENOKEY;
}
- pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n",
- (inode->i_mode & S_IFMT), inode->i_ino);
- return -ENOKEY;
+ {
+ SHASH_DESC_ON_STACK(desc, tfm);
+ desc->tfm = tfm;
+ desc->flags = 0;
+
+ return crypto_shash_digest(desc, key, keysize, salt);
+ }
+}
+
+static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key,
+ int keysize)
+{
+ int err;
+ struct crypto_cipher *essiv_tfm;
+ u8 salt[SHA256_DIGEST_SIZE];
+
+ essiv_tfm = crypto_alloc_cipher("aes", 0, 0);
+ if (IS_ERR(essiv_tfm))
+ return PTR_ERR(essiv_tfm);
+
+ ci->ci_essiv_tfm = essiv_tfm;
+
+ err = derive_essiv_salt(raw_key, keysize, salt);
+ if (err)
+ goto out;
+
+ /*
+ * Using SHA256 to derive the salt/key will result in AES-256 being
+ * used for IV generation. File contents encryption will still use the
+ * configured keysize (AES-128) nevertheless.
+ */
+ err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt));
+ if (err)
+ goto out;
+
+out:
+ memzero_explicit(salt, sizeof(salt));
+ return err;
+}
+
+void __exit fscrypt_essiv_cleanup(void)
+{
+ crypto_free_shash(essiv_hash_tfm);
+}
+
+/*
+ * Given the encryption mode and key (normally the derived key, but for
+ * FS_POLICY_FLAG_DIRECT_KEY mode it's the master key), set up the inode's
+ * symmetric cipher transform object(s).
+ */
+static int setup_crypto_transform(struct fscrypt_info *ci,
+ struct fscrypt_mode *mode,
+ const u8 *raw_key, const struct inode *inode)
+{
+ struct fscrypt_master_key *mk;
+ struct crypto_skcipher *ctfm;
+ int err;
+
+ if (ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY) {
+ mk = fscrypt_get_master_key(ci, mode, raw_key, inode);
+ if (IS_ERR(mk))
+ return PTR_ERR(mk);
+ ctfm = mk->mk_ctfm;
+ } else {
+ mk = NULL;
+ ctfm = allocate_skcipher_for_mode(mode, raw_key, inode);
+ if (IS_ERR(ctfm))
+ return PTR_ERR(ctfm);
+ }
+ ci->ci_master_key = mk;
+ ci->ci_ctfm = ctfm;
+
+ if (mode->needs_essiv) {
+ /* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */
+ WARN_ON(mode->ivsize != AES_BLOCK_SIZE);
+ WARN_ON(ci->ci_flags & FS_POLICY_FLAG_DIRECT_KEY);
+
+ err = init_essiv_generator(ci, raw_key, mode->keysize);
+ if (err) {
+ fscrypt_warn(inode->i_sb,
+ "error initializing ESSIV generator for inode %lu: %d",
+ inode->i_ino, err);
+ return err;
+ }
+ }
+ return 0;
}
static void put_crypt_info(struct fscrypt_info *ci)
@@ -171,7 +491,12 @@ static void put_crypt_info(struct fscrypt_info *ci)
if (!ci)
return;
- crypto_free_skcipher(ci->ci_ctfm);
+ if (ci->ci_master_key) {
+ put_master_key(ci->ci_master_key);
+ } else {
+ crypto_free_skcipher(ci->ci_ctfm);
+ crypto_free_cipher(ci->ci_essiv_tfm);
+ }
kmem_cache_free(fscrypt_info_cachep, ci);
}
@@ -179,30 +504,28 @@ int fscrypt_get_encryption_info(struct inode *inode)
{
struct fscrypt_info *crypt_info;
struct fscrypt_context ctx;
- struct crypto_skcipher *ctfm;
- const char *cipher_str;
- int keysize;
+ struct fscrypt_mode *mode;
u8 *raw_key = NULL;
int res;
if (inode->i_crypt_info)
return 0;
- res = fscrypt_initialize();
+ res = fscrypt_initialize(inode->i_sb->s_cop->flags);
if (res)
return res;
- if (!inode->i_sb->s_cop->get_context)
- return -EOPNOTSUPP;
-
res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
if (res < 0) {
- if (!fscrypt_dummy_context_enabled(inode))
+ if (!fscrypt_dummy_context_enabled(inode) ||
+ IS_ENCRYPTED(inode))
return res;
+ /* Fake up a context for an unencrypted directory */
+ memset(&ctx, 0, sizeof(ctx));
ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
- ctx.flags = 0;
+ memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE);
} else if (res != sizeof(ctx)) {
return -EINVAL;
}
@@ -213,65 +536,39 @@ int fscrypt_get_encryption_info(struct inode *inode)
if (ctx.flags & ~FS_POLICY_FLAGS_VALID)
return -EINVAL;
- crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS);
+ crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS);
if (!crypt_info)
return -ENOMEM;
crypt_info->ci_flags = ctx.flags;
crypt_info->ci_data_mode = ctx.contents_encryption_mode;
crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
- crypt_info->ci_ctfm = NULL;
- memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
- sizeof(crypt_info->ci_master_key));
+ memcpy(crypt_info->ci_master_key_descriptor, ctx.master_key_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE);
+ memcpy(crypt_info->ci_nonce, ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
- res = determine_cipher_type(crypt_info, inode, &cipher_str, &keysize);
- if (res)
+ mode = select_encryption_mode(crypt_info, inode);
+ if (IS_ERR(mode)) {
+ res = PTR_ERR(mode);
goto out;
+ }
+ WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
+ crypt_info->ci_mode = mode;
/*
- * This cannot be a stack buffer because it is passed to the scatterlist
- * crypto API as part of key derivation.
+ * This cannot be a stack buffer because it may be passed to the
+ * scatterlist crypto API as part of key derivation.
*/
res = -ENOMEM;
- raw_key = kmalloc(FS_MAX_KEY_SIZE, GFP_NOFS);
+ raw_key = kmalloc(mode->keysize, GFP_NOFS);
if (!raw_key)
goto out;
- if (fscrypt_dummy_context_enabled(inode)) {
- memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE);
- goto got_key;
- }
-
- res = validate_user_key(crypt_info, &ctx, raw_key,
- FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE);
- if (res && inode->i_sb->s_cop->key_prefix) {
- u8 *prefix = NULL;
- int prefix_size, res2;
-
- prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix);
- res2 = validate_user_key(crypt_info, &ctx, raw_key,
- prefix, prefix_size);
- if (res2) {
- if (res2 == -ENOKEY)
- res = -ENOKEY;
- goto out;
- }
- } else if (res) {
- goto out;
- }
-got_key:
- ctfm = crypto_alloc_skcipher(cipher_str, 0, 0);
- if (!ctfm || IS_ERR(ctfm)) {
- res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
- printk(KERN_DEBUG
- "%s: error %d (inode %u) allocating crypto tfm\n",
- __func__, res, (unsigned) inode->i_ino);
+ res = find_and_derive_key(inode, &ctx, raw_key, mode);
+ if (res)
goto out;
- }
- crypt_info->ci_ctfm = ctfm;
- crypto_skcipher_clear_flags(ctfm, ~0);
- crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY);
- res = crypto_skcipher_setkey(ctfm, raw_key, keysize);
+
+ res = setup_crypto_transform(crypt_info, mode, raw_key, inode);
if (res)
goto out;
@@ -286,19 +583,9 @@ out:
}
EXPORT_SYMBOL(fscrypt_get_encryption_info);
-void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci)
+void fscrypt_put_encryption_info(struct inode *inode)
{
- struct fscrypt_info *prev;
-
- if (ci == NULL)
- ci = ACCESS_ONCE(inode->i_crypt_info);
- if (ci == NULL)
- return;
-
- prev = cmpxchg(&inode->i_crypt_info, ci, NULL);
- if (prev != ci)
- return;
-
- put_crypt_info(ci);
+ put_crypt_info(inode->i_crypt_info);
+ inode->i_crypt_info = NULL;
}
EXPORT_SYMBOL(fscrypt_put_encryption_info);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index c160d2d0e18d..6a9c476e3e67 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -10,76 +10,37 @@
#include <linux/random.h>
#include <linux/string.h>
-#include <linux/fscrypto.h>
#include <linux/mount.h>
-
-static int inode_has_encryption_context(struct inode *inode)
-{
- if (!inode->i_sb->s_cop->get_context)
- return 0;
- return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0);
-}
+#include "fscrypt_private.h"
/*
- * check whether the policy is consistent with the encryption context
- * for the inode
+ * check whether an encryption policy is consistent with an encryption context
*/
-static int is_encryption_context_consistent_with_policy(struct inode *inode,
+static bool is_encryption_context_consistent_with_policy(
+ const struct fscrypt_context *ctx,
const struct fscrypt_policy *policy)
{
- struct fscrypt_context ctx;
- int res;
-
- if (!inode->i_sb->s_cop->get_context)
- return 0;
-
- res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
- if (res != sizeof(ctx))
- return 0;
-
- return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
- FS_KEY_DESCRIPTOR_SIZE) == 0 &&
- (ctx.flags == policy->flags) &&
- (ctx.contents_encryption_mode ==
- policy->contents_encryption_mode) &&
- (ctx.filenames_encryption_mode ==
- policy->filenames_encryption_mode));
+ return memcmp(ctx->master_key_descriptor, policy->master_key_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+ (ctx->flags == policy->flags) &&
+ (ctx->contents_encryption_mode ==
+ policy->contents_encryption_mode) &&
+ (ctx->filenames_encryption_mode ==
+ policy->filenames_encryption_mode);
}
static int create_encryption_context_from_policy(struct inode *inode,
const struct fscrypt_policy *policy)
{
struct fscrypt_context ctx;
- int res;
-
- if (!inode->i_sb->s_cop->set_context)
- return -EOPNOTSUPP;
-
- if (inode->i_sb->s_cop->prepare_context) {
- res = inode->i_sb->s_cop->prepare_context(inode);
- if (res)
- return res;
- }
ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
FS_KEY_DESCRIPTOR_SIZE);
- if (!fscrypt_valid_contents_enc_mode(
- policy->contents_encryption_mode)) {
- printk(KERN_WARNING
- "%s: Invalid contents encryption mode %d\n", __func__,
- policy->contents_encryption_mode);
- return -EINVAL;
- }
-
- if (!fscrypt_valid_filenames_enc_mode(
- policy->filenames_encryption_mode)) {
- printk(KERN_WARNING
- "%s: Invalid filenames encryption mode %d\n", __func__,
- policy->filenames_encryption_mode);
+ if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
+ policy->filenames_encryption_mode))
return -EINVAL;
- }
if (policy->flags & ~FS_POLICY_FLAGS_VALID)
return -EINVAL;
@@ -93,16 +54,20 @@ static int create_encryption_context_from_policy(struct inode *inode,
return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL);
}
-int fscrypt_process_policy(struct file *filp,
- const struct fscrypt_policy *policy)
+int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
{
+ struct fscrypt_policy policy;
struct inode *inode = file_inode(filp);
int ret;
+ struct fscrypt_context ctx;
+
+ if (copy_from_user(&policy, arg, sizeof(policy)))
+ return -EFAULT;
if (!inode_owner_or_capable(inode))
return -EACCES;
- if (policy->version != 0)
+ if (policy.version != 0)
return -EINVAL;
ret = mnt_want_write_file(filp);
@@ -111,22 +76,23 @@ int fscrypt_process_policy(struct file *filp,
inode_lock(inode);
- if (!inode_has_encryption_context(inode)) {
+ ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ if (ret == -ENODATA) {
if (!S_ISDIR(inode->i_mode))
ret = -ENOTDIR;
- else if (!inode->i_sb->s_cop->empty_dir)
- ret = -EOPNOTSUPP;
else if (!inode->i_sb->s_cop->empty_dir(inode))
ret = -ENOTEMPTY;
else
ret = create_encryption_context_from_policy(inode,
- policy);
- } else if (!is_encryption_context_consistent_with_policy(inode,
- policy)) {
- printk(KERN_WARNING
- "%s: Policy inconsistent with encryption context\n",
- __func__);
- ret = -EINVAL;
+ &policy);
+ } else if (ret == sizeof(ctx) &&
+ is_encryption_context_consistent_with_policy(&ctx,
+ &policy)) {
+ /* The file already uses the same encryption policy. */
+ ret = 0;
+ } else if (ret >= 0 || ret == -ERANGE) {
+ /* The file already uses a different encryption policy. */
+ ret = -EEXIST;
}
inode_unlock(inode);
@@ -134,32 +100,38 @@ int fscrypt_process_policy(struct file *filp,
mnt_drop_write_file(filp);
return ret;
}
-EXPORT_SYMBOL(fscrypt_process_policy);
+EXPORT_SYMBOL(fscrypt_ioctl_set_policy);
-int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy)
+int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
{
+ struct inode *inode = file_inode(filp);
struct fscrypt_context ctx;
+ struct fscrypt_policy policy;
int res;
- if (!inode->i_sb->s_cop->get_context ||
- !inode->i_sb->s_cop->is_encrypted(inode))
+ if (!IS_ENCRYPTED(inode))
return -ENODATA;
res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ if (res < 0 && res != -ERANGE)
+ return res;
if (res != sizeof(ctx))
- return -ENODATA;
+ return -EINVAL;
if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1)
return -EINVAL;
- policy->version = 0;
- policy->contents_encryption_mode = ctx.contents_encryption_mode;
- policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
- policy->flags = ctx.flags;
- memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
+ policy.version = 0;
+ policy.contents_encryption_mode = ctx.contents_encryption_mode;
+ policy.filenames_encryption_mode = ctx.filenames_encryption_mode;
+ policy.flags = ctx.flags;
+ memcpy(policy.master_key_descriptor, ctx.master_key_descriptor,
FS_KEY_DESCRIPTOR_SIZE);
+
+ if (copy_to_user(arg, &policy, sizeof(policy)))
+ return -EFAULT;
return 0;
}
-EXPORT_SYMBOL(fscrypt_get_policy);
+EXPORT_SYMBOL(fscrypt_ioctl_get_policy);
/**
* fscrypt_has_permitted_context() - is a file's encryption policy permitted
@@ -194,11 +166,11 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
return 1;
/* No restrictions if the parent directory is unencrypted */
- if (!cops->is_encrypted(parent))
+ if (!IS_ENCRYPTED(parent))
return 1;
/* Encrypted directories must not contain unencrypted files */
- if (!cops->is_encrypted(child))
+ if (!IS_ENCRYPTED(child))
return 0;
/*
@@ -226,7 +198,8 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
child_ci = child->i_crypt_info;
if (parent_ci && child_ci) {
- return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key,
+ return memcmp(parent_ci->ci_master_key_descriptor,
+ child_ci->ci_master_key_descriptor,
FS_KEY_DESCRIPTOR_SIZE) == 0 &&
(parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
(parent_ci->ci_filename_mode ==
@@ -258,9 +231,9 @@ EXPORT_SYMBOL(fscrypt_has_permitted_context);
* @parent: Parent inode from which the context is inherited.
* @child: Child inode that inherits the context from @parent.
* @fs_data: private data given by FS.
- * @preload: preload child i_crypt_info
+ * @preload: preload child i_crypt_info if true
*
- * Return: Zero on success, non-zero otherwise
+ * Return: 0 on success, -errno on failure
*/
int fscrypt_inherit_context(struct inode *parent, struct inode *child,
void *fs_data, bool preload)
@@ -269,9 +242,6 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child,
struct fscrypt_info *ci;
int res;
- if (!parent->i_sb->s_cop->set_context)
- return -EOPNOTSUPP;
-
res = fscrypt_get_encryption_info(parent);
if (res < 0)
return res;
@@ -281,19 +251,11 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child,
return -ENOKEY;
ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
- if (fscrypt_dummy_context_enabled(parent)) {
- ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
- ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
- ctx.flags = 0;
- memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE);
- res = 0;
- } else {
- ctx.contents_encryption_mode = ci->ci_data_mode;
- ctx.filenames_encryption_mode = ci->ci_filename_mode;
- ctx.flags = ci->ci_flags;
- memcpy(ctx.master_key_descriptor, ci->ci_master_key,
- FS_KEY_DESCRIPTOR_SIZE);
- }
+ ctx.contents_encryption_mode = ci->ci_data_mode;
+ ctx.filenames_encryption_mode = ci->ci_filename_mode;
+ ctx.flags = ci->ci_flags;
+ memcpy(ctx.master_key_descriptor, ci->ci_master_key_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE);
get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
res = parent->i_sb->s_cop->set_context(child, &ctx,
sizeof(ctx), fs_data);
diff --git a/fs/dcache.c b/fs/dcache.c
index f903b86b06e5..183e0ae89c2b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3257,6 +3257,7 @@ char *d_absolute_path(const struct path *path,
return ERR_PTR(error);
return res;
}
+EXPORT_SYMBOL(d_absolute_path);
/*
* same as __d_path but appends "(deleted)" for unlinked files.
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index a896e46671ea..d4d8ad1ba42e 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -125,7 +125,7 @@ ecryptfs_get_key_payload_data(struct key *key)
if (auth_tok)
return auth_tok;
- ukp = user_key_payload(key);
+ ukp = user_key_payload_locked(key);
if (!ukp)
return ERR_PTR(-EKEYREVOKED);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3cbc30413add..5b96ba77e623 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -34,6 +34,7 @@
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
+#include <linux/freezer.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
@@ -1673,7 +1674,8 @@ fetch_events:
}
spin_unlock_irqrestore(&ep->lock, flags);
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+ if (!freezable_schedule_hrtimeout_range(to, slack,
+ HRTIMER_MODE_ABS))
timed_out = 1;
spin_lock_irqsave(&ep->lock, flags);
diff --git a/fs/exec.c b/fs/exec.c
index fcd8642ef2d2..df13a1945c70 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1303,7 +1303,7 @@ EXPORT_SYMBOL(flush_old_exec);
void would_dump(struct linux_binprm *bprm, struct file *file)
{
struct inode *inode = file_inode(file);
- if (inode_permission(inode, MAY_READ) < 0) {
+ if (inode_permission2(file->f_path.mnt, inode, MAY_READ) < 0) {
struct user_namespace *old, *user_ns;
bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index d8072bc074a4..7f5b73528663 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -377,9 +377,8 @@ err:
* and will start a new collection. Eventually caller must submit the last
* segment if present.
*/
-static int readpage_strip(void *data, struct page *page)
+static int __readpage_strip(struct page_collect *pcol, struct page *page)
{
- struct page_collect *pcol = data;
struct inode *inode = pcol->inode;
struct exofs_i_info *oi = exofs_i(inode);
loff_t i_size = i_size_read(inode);
@@ -470,6 +469,13 @@ fail:
return ret;
}
+static int readpage_strip(struct file *data, struct page *page)
+{
+ struct page_collect *pcol = (struct page_collect *)data;
+
+ return __readpage_strip(pcol, page);
+}
+
static int exofs_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
@@ -499,7 +505,7 @@ static int _readpage(struct page *page, bool read_4_write)
_pcol_init(&pcol, 1, page->mapping->host);
pcol.read_4_write = read_4_write;
- ret = readpage_strip(&pcol, page);
+ ret = __readpage_strip(&pcol, page);
if (ret) {
EXOFS_ERR("_readpage => %d\n", ret);
return ret;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 567a6c7af677..5f3d2e2bdbdf 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -32,13 +32,15 @@
#include <linux/percpu_counter.h>
#include <linux/ratelimit.h>
#include <crypto/hash.h>
-#include <linux/fscrypto.h>
#include <linux/falloc.h>
#include <linux/percpu-rwsem.h>
#ifdef __KERNEL__
#include <linux/compat.h>
#endif
+#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION)
+#include <linux/fscrypt.h>
+
/*
* The fourth extended filesystem constants/structures
*/
@@ -1342,11 +1344,6 @@ struct ext4_super_block {
/* Number of quota types we support */
#define EXT4_MAXQUOTAS 3
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-#define EXT4_KEY_DESC_PREFIX "ext4:"
-#define EXT4_KEY_DESC_PREFIX_SIZE 5
-#endif
-
/*
* fourth extended-fs super-block data in memory
*/
@@ -1516,12 +1513,6 @@ struct ext4_sb_info {
/* Barrier between changing inodes' journal flags and writepages ops. */
struct percpu_rw_semaphore s_journal_flag_rwsem;
-
- /* Encryption support */
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- u8 key_prefix[EXT4_KEY_DESC_PREFIX_SIZE];
- u8 key_prefix_size;
-#endif
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -2272,11 +2263,6 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
-static inline int ext4_sb_has_crypto(struct super_block *sb)
-{
- return ext4_has_feature_encrypt(sb);
-}
-
static inline bool ext4_encrypted_inode(struct inode *inode)
{
return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT);
@@ -2325,28 +2311,6 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
}
static inline void ext4_fname_free_filename(struct ext4_filename *fname) { }
-#define fscrypt_set_d_op(i)
-#define fscrypt_get_ctx fscrypt_notsupp_get_ctx
-#define fscrypt_release_ctx fscrypt_notsupp_release_ctx
-#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page
-#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page
-#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages
-#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page
-#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page
-#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range
-#define fscrypt_process_policy fscrypt_notsupp_process_policy
-#define fscrypt_get_policy fscrypt_notsupp_get_policy
-#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context
-#define fscrypt_inherit_context fscrypt_notsupp_inherit_context
-#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info
-#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info
-#define fscrypt_setup_filename fscrypt_notsupp_setup_filename
-#define fscrypt_free_filename fscrypt_notsupp_free_filename
-#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size
-#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer
-#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer
-#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr
-#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk
#endif
/* dir.c */
@@ -2368,17 +2332,16 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
void *buf, int buf_size,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de);
-int ext4_insert_dentry(struct inode *dir,
- struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int buf_size,
- struct ext4_filename *fname);
+void ext4_insert_dentry(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ struct ext4_filename *fname);
static inline void ext4_update_dx_flag(struct inode *inode)
{
if (!ext4_has_feature_dir_index(inode->i_sb))
ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
}
-static unsigned char ext4_filetype_table[] = {
+static const unsigned char ext4_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
@@ -2445,7 +2408,8 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count);
-extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
+extern int ext4_trim_fs(struct super_block *, struct fstrim_range *,
+ unsigned long blkdev_flags);
/* inode.c */
int ext4_inode_is_fast_symlink(struct inode *inode);
@@ -3062,7 +3026,7 @@ extern int ext4_handle_dirty_dirent_node(handle_t *handle,
struct inode *inode,
struct buffer_head *bh);
#define S_SHIFT 12
-static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
+static const unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
[S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 4f78e099de1d..004c088c2dd6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1072,6 +1072,17 @@ got:
if (err)
goto fail_drop;
+ /*
+ * Since the encryption xattr will always be unique, create it first so
+ * that it's less likely to end up in an external xattr block and
+ * prevent its deduplication.
+ */
+ if (encrypt) {
+ err = fscrypt_inherit_context(dir, inode, handle, true);
+ if (err)
+ goto fail_free_drop;
+ }
+
err = ext4_init_acl(handle, inode, dir);
if (err)
goto fail_free_drop;
@@ -1093,13 +1104,6 @@ got:
ei->i_datasync_tid = handle->h_transaction->t_tid;
}
- if (encrypt) {
- /* give pointer to avoid set_context with journal ops. */
- err = fscrypt_inherit_context(dir, inode, &encrypt, true);
- if (err)
- goto fail_free_drop;
- }
-
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
ext4_std_error(sb, err);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 9a13f86fed62..ddb3dcf65d08 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -18,6 +18,7 @@
#include "ext4.h"
#include "xattr.h"
#include "truncate.h"
+#include <trace/events/android_fs.h>
#define EXT4_XATTR_SYSTEM_DATA "data"
#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
@@ -502,6 +503,17 @@ int ext4_readpage_inline(struct inode *inode, struct page *page)
return -EAGAIN;
}
+ if (trace_android_fs_dataread_start_enabled()) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_dataread_start(inode, page_offset(page),
+ PAGE_SIZE, current->pid,
+ path, current->comm);
+ }
+
/*
* Current inline data can only exist in the 1st page,
* So for all the other pages, just set them uptodate.
@@ -513,6 +525,8 @@ int ext4_readpage_inline(struct inode *inode, struct page *page)
SetPageUptodate(page);
}
+ trace_android_fs_dataread_end(inode, page_offset(page), PAGE_SIZE);
+
up_read(&EXT4_I(inode)->xattr_sem);
unlock_page(page);
@@ -1028,7 +1042,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
err = ext4_journal_get_write_access(handle, iloc->bh);
if (err)
return err;
- ext4_insert_dentry(dir, inode, de, inline_size, fname);
+ ext4_insert_dentry(inode, de, inline_size, fname);
ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4815be26b15f..dfc19bf7394d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -44,6 +44,7 @@
#include "truncate.h"
#include <trace/events/ext4.h>
+#include <trace/events/android_fs.h>
#define MPAGE_DA_EXTENT_TAIL 0x01
@@ -1165,7 +1166,8 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
if (unlikely(err))
page_zero_new_buffers(page, from, to);
else if (decrypt)
- err = fscrypt_decrypt_page(page);
+ err = fscrypt_decrypt_page(page->mapping->host, page,
+ PAGE_SIZE, 0, page->index);
return err;
}
#endif
@@ -1182,6 +1184,16 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index;
unsigned from, to;
+ if (trace_android_fs_datawrite_start_enabled()) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_datawrite_start(inode, pos, len,
+ current->pid, path,
+ current->comm);
+ }
trace_ext4_write_begin(inode, pos, len, flags);
/*
* Reserve one block more for addition to orphan list in case
@@ -1320,6 +1332,7 @@ static int ext4_write_end(struct file *file,
int i_size_changed = 0;
int inline_data = ext4_has_inline_data(inode);
+ trace_android_fs_datawrite_end(inode, pos, len);
trace_ext4_write_end(inode, pos, len, copied);
if (inline_data) {
ret = ext4_write_inline_data_end(inode, pos, len,
@@ -1425,6 +1438,7 @@ static int ext4_journalled_write_end(struct file *file,
int size_changed = 0;
int inline_data = ext4_has_inline_data(inode);
+ trace_android_fs_datawrite_end(inode, pos, len);
trace_ext4_journalled_write_end(inode, pos, len, copied);
from = pos & (PAGE_SIZE - 1);
to = from + len;
@@ -2553,8 +2567,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
mpd->map.m_len = 0;
mpd->next_page = index;
while (index <= end) {
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
+ tag);
if (nr_pages == 0)
goto out;
@@ -2562,16 +2576,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
struct page *page = pvec.pages[i];
/*
- * At this point, the page may be truncated or
- * invalidated (changing page->mapping to NULL), or
- * even swizzled back from swapper_space to tmpfs file
- * mapping. However, page->index will not change
- * because we have a reference on the page.
- */
- if (page->index > end)
- goto out;
-
- /*
* Accumulated enough dirty pages? This doesn't apply
* to WB_SYNC_ALL mode. For integrity sync we have to
* keep going because someone may be concurrently
@@ -2922,6 +2926,16 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
len, flags, pagep, fsdata);
}
*fsdata = (void *)0;
+ if (trace_android_fs_datawrite_start_enabled()) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_datawrite_start(inode, pos, len,
+ current->pid,
+ path, current->comm);
+ }
trace_ext4_da_write_begin(inode, pos, len, flags);
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
@@ -3040,6 +3054,7 @@ static int ext4_da_write_end(struct file *file,
return ext4_write_end(file, mapping, pos,
len, copied, page, fsdata);
+ trace_android_fs_datawrite_end(inode, pos, len);
trace_ext4_da_write_end(inode, pos, len, copied);
start = pos & (PAGE_SIZE - 1);
end = start + copied - 1;
@@ -3603,6 +3618,7 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
size_t count = iov_iter_count(iter);
loff_t offset = iocb->ki_pos;
ssize_t ret;
+ int rw = iov_iter_rw(iter);
#ifdef CONFIG_EXT4_FS_ENCRYPTION
if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode))
@@ -3619,12 +3635,42 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (ext4_has_inline_data(inode))
return 0;
+ if (trace_android_fs_dataread_start_enabled() &&
+ (rw == READ)) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_dataread_start(inode, offset, count,
+ current->pid, path,
+ current->comm);
+ }
+ if (trace_android_fs_datawrite_start_enabled() &&
+ (rw == WRITE)) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_datawrite_start(inode, offset, count,
+ current->pid, path,
+ current->comm);
+ }
trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
if (iov_iter_rw(iter) == READ)
ret = ext4_direct_IO_read(iocb, iter);
else
ret = ext4_direct_IO_write(iocb, iter);
trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
+
+ if (trace_android_fs_dataread_start_enabled() &&
+ (rw == READ))
+ trace_android_fs_dataread_end(inode, offset, count);
+ if (trace_android_fs_datawrite_start_enabled() &&
+ (rw == WRITE))
+ trace_android_fs_datawrite_end(inode, offset, count);
+
return ret;
}
@@ -3775,7 +3821,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
/* We expect the key to be set. */
BUG_ON(!fscrypt_has_encryption_key(inode));
BUG_ON(blocksize != PAGE_SIZE);
- WARN_ON_ONCE(fscrypt_decrypt_page(page));
+ WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host,
+ page, PAGE_SIZE, 0, page->index));
}
}
if (ext4_should_journal_data(inode)) {
@@ -4393,8 +4440,11 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_DIRSYNC;
if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
new_fl |= S_DAX;
+ if (flags & EXT4_ENCRYPT_FL)
+ new_fl |= S_ENCRYPTED;
inode_set_flags(inode, new_fl,
- S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
+ S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX|
+ S_ENCRYPTED);
}
/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 2880e017cd0a..4bc35e6a5cc0 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -191,6 +191,7 @@ journal_err_out:
return err;
}
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
static int uuid_is_zero(__u8 u[16])
{
int i;
@@ -200,6 +201,7 @@ static int uuid_is_zero(__u8 u[16])
return 0;
return 1;
}
+#endif
static int ext4_ioctl_setflags(struct inode *inode,
unsigned int flags)
@@ -737,11 +739,13 @@ resizefs_out:
return err;
}
+ case FIDTRIM:
case FITRIM:
{
struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
int ret = 0;
+ int flags = cmd == FIDTRIM ? BLKDEV_DISCARD_SECURE : 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -749,13 +753,16 @@ resizefs_out:
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
+ if ((flags & BLKDEV_DISCARD_SECURE) && !blk_queue_secure_erase(q))
+ return -EOPNOTSUPP;
+
if (copy_from_user(&range, (struct fstrim_range __user *)arg,
sizeof(range)))
return -EFAULT;
range.minlen = max((unsigned int)range.minlen,
q->limits.discard_granularity);
- ret = ext4_trim_fs(sb, &range);
+ ret = ext4_trim_fs(sb, &range, flags);
if (ret < 0)
return ret;
@@ -767,28 +774,19 @@ resizefs_out:
}
case EXT4_IOC_PRECACHE_EXTENTS:
return ext4_ext_precache(inode);
- case EXT4_IOC_SET_ENCRYPTION_POLICY: {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- struct fscrypt_policy policy;
+ case EXT4_IOC_SET_ENCRYPTION_POLICY:
if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
+ return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
- if (copy_from_user(&policy,
- (struct fscrypt_policy __user *)arg,
- sizeof(policy)))
- return -EFAULT;
- return fscrypt_process_policy(filp, &policy);
-#else
- return -EOPNOTSUPP;
-#endif
- }
case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
int err, err2;
struct ext4_sb_info *sbi = EXT4_SB(sb);
handle_t *handle;
- if (!ext4_sb_has_crypto(sb))
+ if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
err = mnt_want_write_file(filp);
@@ -818,30 +816,18 @@ resizefs_out:
sbi->s_es->s_encrypt_pw_salt, 16))
return -EFAULT;
return 0;
- }
- case EXT4_IOC_GET_ENCRYPTION_POLICY: {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- struct fscrypt_policy policy;
- int err = 0;
-
- if (!ext4_encrypted_inode(inode))
- return -ENOENT;
- err = fscrypt_get_policy(inode, &policy);
- if (err)
- return err;
- if (copy_to_user((void __user *)arg, &policy, sizeof(policy)))
- return -EFAULT;
- return 0;
#else
return -EOPNOTSUPP;
#endif
}
+ case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
+
case EXT4_IOC_FSGETXATTR:
{
struct fsxattr fa;
memset(&fa, 0, sizeof(struct fsxattr));
- ext4_get_inode_flags(ei);
fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE);
if (ext4_has_feature_project(inode->i_sb)) {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a49d0e5d7baf..3d6f73e38873 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2775,7 +2775,8 @@ int ext4_mb_release(struct super_block *sb)
}
static inline int ext4_issue_discard(struct super_block *sb,
- ext4_group_t block_group, ext4_grpblk_t cluster, int count)
+ ext4_group_t block_group, ext4_grpblk_t cluster, int count,
+ unsigned long flags)
{
ext4_fsblk_t discard_block;
@@ -2784,7 +2785,7 @@ static inline int ext4_issue_discard(struct super_block *sb,
count = EXT4_C2B(EXT4_SB(sb), count);
trace_ext4_discard_blocks(sb,
(unsigned long long) discard_block, count);
- return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
+ return sb_issue_discard(sb, discard_block, count, GFP_NOFS, flags);
}
/*
@@ -2806,7 +2807,7 @@ static void ext4_free_data_callback(struct super_block *sb,
if (test_opt(sb, DISCARD)) {
err = ext4_issue_discard(sb, entry->efd_group,
entry->efd_start_cluster,
- entry->efd_count);
+ entry->efd_count, 0);
if (err && err != -EOPNOTSUPP)
ext4_msg(sb, KERN_WARNING, "discard request in"
" group:%d block:%d count:%d failed"
@@ -4865,7 +4866,8 @@ do_more:
* them with group lock_held
*/
if (test_opt(sb, DISCARD)) {
- err = ext4_issue_discard(sb, block_group, bit, count);
+ err = ext4_issue_discard(sb, block_group, bit, count,
+ 0);
if (err && err != -EOPNOTSUPP)
ext4_msg(sb, KERN_WARNING, "discard request in"
" group:%d block:%d count:%lu failed"
@@ -5061,13 +5063,15 @@ error_return:
* @count: number of blocks to TRIM
* @group: alloc. group we are working with
* @e4b: ext4 buddy for the group
+ * @blkdev_flags: flags for the block device
*
* Trim "count" blocks starting at "start" in the "group". To assure that no
* one will allocate those blocks, mark it as used in buddy bitmap. This must
* be called with under the group lock.
*/
static int ext4_trim_extent(struct super_block *sb, int start, int count,
- ext4_group_t group, struct ext4_buddy *e4b)
+ ext4_group_t group, struct ext4_buddy *e4b,
+ unsigned long blkdev_flags)
__releases(bitlock)
__acquires(bitlock)
{
@@ -5088,7 +5092,7 @@ __acquires(bitlock)
*/
mb_mark_used(e4b, &ex);
ext4_unlock_group(sb, group);
- ret = ext4_issue_discard(sb, group, start, count);
+ ret = ext4_issue_discard(sb, group, start, count, blkdev_flags);
ext4_lock_group(sb, group);
mb_free_blocks(NULL, e4b, start, ex.fe_len);
return ret;
@@ -5101,6 +5105,7 @@ __acquires(bitlock)
* @start: first group block to examine
* @max: last group block to examine
* @minblocks: minimum extent block count
+ * @blkdev_flags: flags for the block device
*
* ext4_trim_all_free walks through group's buddy bitmap searching for free
* extents. When the free block is found, ext4_trim_extent is called to TRIM
@@ -5115,7 +5120,7 @@ __acquires(bitlock)
static ext4_grpblk_t
ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
ext4_grpblk_t start, ext4_grpblk_t max,
- ext4_grpblk_t minblocks)
+ ext4_grpblk_t minblocks, unsigned long blkdev_flags)
{
void *bitmap;
ext4_grpblk_t next, count = 0, free_count = 0;
@@ -5148,7 +5153,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
if ((next - start) >= minblocks) {
ret = ext4_trim_extent(sb, start,
- next - start, group, &e4b);
+ next - start, group, &e4b,
+ blkdev_flags);
if (ret && ret != -EOPNOTSUPP)
break;
ret = 0;
@@ -5190,6 +5196,7 @@ out:
* ext4_trim_fs() -- trim ioctl handle function
* @sb: superblock for filesystem
* @range: fstrim_range structure
+ * @blkdev_flags: flags for the block device
*
* start: First Byte to trim
* len: number of Bytes to trim from start
@@ -5198,7 +5205,8 @@ out:
* start to start+len. For each such a group ext4_trim_all_free function
* is invoked to trim all free space.
*/
-int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
+int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range,
+ unsigned long blkdev_flags)
{
struct ext4_group_info *grp;
ext4_group_t group, first_group, last_group;
@@ -5254,7 +5262,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
if (grp->bb_free >= minlen) {
cnt = ext4_trim_all_free(sb, group, first_cluster,
- end, minlen);
+ end, minlen, blkdev_flags);
if (cnt < 0) {
ret = cnt;
break;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 3c3757ee11f0..b7b58b126242 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1238,37 +1238,24 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
}
/*
- * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
+ * Test whether a directory entry matches the filename being searched for.
*
- * `len <= EXT4_NAME_LEN' is guaranteed by caller.
- * `de != NULL' is guaranteed by caller.
+ * Return: %true if the directory entry matches, otherwise %false.
*/
-static inline int ext4_match(struct ext4_filename *fname,
- struct ext4_dir_entry_2 *de)
+static inline bool ext4_match(const struct ext4_filename *fname,
+ const struct ext4_dir_entry_2 *de)
{
- const void *name = fname_name(fname);
- u32 len = fname_len(fname);
+ struct fscrypt_name f;
if (!de->inode)
- return 0;
+ return false;
+ f.usr_fname = fname->usr_fname;
+ f.disk_name = fname->disk_name;
#ifdef CONFIG_EXT4_FS_ENCRYPTION
- if (unlikely(!name)) {
- if (fname->usr_fname->name[0] == '_') {
- int ret;
- if (de->name_len <= 32)
- return 0;
- ret = memcmp(de->name + ((de->name_len - 17) & ~15),
- fname->crypto_buf.name + 8, 16);
- return (ret == 0) ? 1 : 0;
- }
- name = fname->crypto_buf.name;
- len = fname->crypto_buf.len;
- }
+ f.crypto_buf = fname->crypto_buf;
#endif
- if (de->name_len != len)
- return 0;
- return (memcmp(de->name, name, len) == 0) ? 1 : 0;
+ return fscrypt_match_name(&f, de->name, de->name_len);
}
/*
@@ -1282,48 +1269,31 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
struct ext4_dir_entry_2 * de;
char * dlimit;
int de_len;
- int res;
de = (struct ext4_dir_entry_2 *)search_buf;
dlimit = search_buf + buf_size;
while ((char *) de < dlimit) {
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
- if ((char *) de + de->name_len <= dlimit) {
- res = ext4_match(fname, de);
- if (res < 0) {
- res = -1;
- goto return_result;
- }
- if (res > 0) {
- /* found a match - just to be sure, do
- * a full check */
- if (ext4_check_dir_entry(dir, NULL, de, bh,
- bh->b_data,
- bh->b_size, offset)) {
- res = -1;
- goto return_result;
- }
- *res_dir = de;
- res = 1;
- goto return_result;
- }
-
+ if ((char *) de + de->name_len <= dlimit &&
+ ext4_match(fname, de)) {
+ /* found a match - just to be sure, do
+ * a full check */
+ if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
+ bh->b_size, offset))
+ return -1;
+ *res_dir = de;
+ return 1;
}
/* prevent looping on a bad block */
de_len = ext4_rec_len_from_disk(de->rec_len,
dir->i_sb->s_blocksize);
- if (de_len <= 0) {
- res = -1;
- goto return_result;
- }
+ if (de_len <= 0)
+ return -1;
offset += de_len;
de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
}
-
- res = 0;
-return_result:
- return res;
+ return 0;
}
static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
@@ -1577,24 +1547,14 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
struct inode *inode;
struct ext4_dir_entry_2 *de;
struct buffer_head *bh;
+ int err;
- if (ext4_encrypted_inode(dir)) {
- int res = fscrypt_get_encryption_info(dir);
-
- /*
- * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is
- * created while the directory was encrypted and we
- * have access to the key.
- */
- if (fscrypt_has_encryption_key(dir))
- fscrypt_set_encrypted_dentry(dentry);
- fscrypt_set_d_op(dentry);
- if (res && res != -ENOKEY)
- return ERR_PTR(res);
- }
+ err = fscrypt_prepare_lookup(dir, dentry, flags);
+ if (err)
+ return ERR_PTR(err);
- if (dentry->d_name.len > EXT4_NAME_LEN)
- return ERR_PTR(-ENAMETOOLONG);
+ if (dentry->d_name.len > EXT4_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (IS_ERR(bh))
@@ -1622,16 +1582,9 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!fscrypt_has_permitted_context(dir, inode)) {
- int nokey = ext4_encrypted_inode(inode) &&
- !fscrypt_has_encryption_key(inode);
- if (nokey) {
- iput(inode);
- return ERR_PTR(-ENOKEY);
- }
ext4_warning(inode->i_sb,
"Inconsistent encryption contexts: %lu/%lu",
- (unsigned long) dir->i_ino,
- (unsigned long) inode->i_ino);
+ dir->i_ino, inode->i_ino);
iput(inode);
return ERR_PTR(-EPERM);
}
@@ -1839,24 +1792,15 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
int nlen, rlen;
unsigned int offset = 0;
char *top;
- int res;
de = (struct ext4_dir_entry_2 *)buf;
top = buf + buf_size - reclen;
while ((char *) de <= top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
- buf, buf_size, offset)) {
- res = -EFSCORRUPTED;
- goto return_result;
- }
- /* Provide crypto context and crypto buffer to ext4 match */
- res = ext4_match(fname, de);
- if (res < 0)
- goto return_result;
- if (res > 0) {
- res = -EEXIST;
- goto return_result;
- }
+ buf, buf_size, offset))
+ return -EFSCORRUPTED;
+ if (ext4_match(fname, de))
+ return -EEXIST;
nlen = EXT4_DIR_REC_LEN(de->name_len);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
if ((de->inode ? rlen - nlen : rlen) >= reclen)
@@ -1864,22 +1808,17 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
offset += rlen;
}
-
if ((char *) de > top)
- res = -ENOSPC;
- else {
- *dest_de = de;
- res = 0;
- }
-return_result:
- return res;
+ return -ENOSPC;
+
+ *dest_de = de;
+ return 0;
}
-int ext4_insert_dentry(struct inode *dir,
- struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int buf_size,
- struct ext4_filename *fname)
+void ext4_insert_dentry(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ struct ext4_filename *fname)
{
int nlen, rlen;
@@ -1898,7 +1837,6 @@ int ext4_insert_dentry(struct inode *dir,
ext4_set_de_type(inode->i_sb, de, inode->i_mode);
de->name_len = fname_len(fname);
memcpy(de->name, fname_name(fname), fname_len(fname));
- return 0;
}
/*
@@ -1934,11 +1872,8 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
return err;
}
- /* By now the buffer is marked for journaling. Due to crypto operations,
- * the following function call may fail */
- err = ext4_insert_dentry(dir, inode, de, blocksize, fname);
- if (err < 0)
- return err;
+ /* By now the buffer is marked for journaling */
+ ext4_insert_dentry(inode, de, blocksize, fname);
/*
* XXX shouldn't update any times until successful
@@ -3084,36 +3019,16 @@ static int ext4_symlink(struct inode *dir,
struct inode *inode;
int err, len = strlen(symname);
int credits;
- bool encryption_required;
struct fscrypt_str disk_link;
- struct fscrypt_symlink_data *sd = NULL;
-
- disk_link.len = len + 1;
- disk_link.name = (char *) symname;
- encryption_required = (ext4_encrypted_inode(dir) ||
- DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)));
- if (encryption_required) {
- err = fscrypt_get_encryption_info(dir);
- if (err)
- return err;
- if (!fscrypt_has_encryption_key(dir))
- return -ENOKEY;
- disk_link.len = (fscrypt_fname_encrypted_size(dir, len) +
- sizeof(struct fscrypt_symlink_data));
- sd = kzalloc(disk_link.len, GFP_KERNEL);
- if (!sd)
- return -ENOMEM;
- }
-
- if (disk_link.len > dir->i_sb->s_blocksize) {
- err = -ENAMETOOLONG;
- goto err_free_sd;
- }
+ err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize,
+ &disk_link);
+ if (err)
+ return err;
err = dquot_initialize(dir);
if (err)
- goto err_free_sd;
+ return err;
if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
/*
@@ -3142,27 +3057,18 @@ static int ext4_symlink(struct inode *dir,
if (IS_ERR(inode)) {
if (handle)
ext4_journal_stop(handle);
- err = PTR_ERR(inode);
- goto err_free_sd;
+ return PTR_ERR(inode);
}
- if (encryption_required) {
- struct qstr istr;
- struct fscrypt_str ostr =
- FSTR_INIT(sd->encrypted_path, disk_link.len);
-
- istr.name = (const unsigned char *) symname;
- istr.len = len;
- err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr);
+ if (IS_ENCRYPTED(inode)) {
+ err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link);
if (err)
goto err_drop_inode;
- sd->len = cpu_to_le16(ostr.len);
- disk_link.name = (char *) sd;
inode->i_op = &ext4_encrypted_symlink_inode_operations;
}
if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
- if (!encryption_required)
+ if (!IS_ENCRYPTED(inode))
inode->i_op = &ext4_symlink_inode_operations;
inode_nohighmem(inode);
ext4_set_aops(inode);
@@ -3204,7 +3110,7 @@ static int ext4_symlink(struct inode *dir,
} else {
/* clear the extent format for fast symlink */
ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
- if (!encryption_required) {
+ if (!IS_ENCRYPTED(inode)) {
inode->i_op = &ext4_fast_symlink_inode_operations;
inode->i_link = (char *)&EXT4_I(inode)->i_data;
}
@@ -3219,16 +3125,17 @@ static int ext4_symlink(struct inode *dir,
if (handle)
ext4_journal_stop(handle);
- kfree(sd);
- return err;
+ goto out_free_encrypted_link;
+
err_drop_inode:
if (handle)
ext4_journal_stop(handle);
clear_nlink(inode);
unlock_new_inode(inode);
iput(inode);
-err_free_sd:
- kfree(sd);
+out_free_encrypted_link:
+ if (disk_link.name != (unsigned char *)symname)
+ kfree(disk_link.name);
return err;
}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 0094923e5ebf..0718a8667f0c 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -24,7 +24,6 @@
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/backing-dev.h>
-#include <linux/fscrypto.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -470,7 +469,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
gfp_t gfp_flags = GFP_NOFS;
retry_encrypt:
- data_page = fscrypt_encrypt_page(inode, page, gfp_flags);
+ data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0,
+ page->index, gfp_flags);
if (IS_ERR(data_page)) {
ret = PTR_ERR(data_page);
if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index a81b829d56de..c39a12d6d2d6 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -45,6 +45,7 @@
#include <linux/cleancache.h>
#include "ext4.h"
+#include <trace/events/android_fs.h>
static inline bool ext4_bio_encrypted(struct bio *bio)
{
@@ -55,6 +56,17 @@ static inline bool ext4_bio_encrypted(struct bio *bio)
#endif
}
+static void
+ext4_trace_read_completion(struct bio *bio)
+{
+ struct page *first_page = bio->bi_io_vec[0].bv_page;
+
+ if (first_page != NULL)
+ trace_android_fs_dataread_end(first_page->mapping->host,
+ page_offset(first_page),
+ bio->bi_iter.bi_size);
+}
+
/*
* I/O completion handler for multipage BIOs.
*
@@ -72,11 +84,14 @@ static void mpage_end_io(struct bio *bio)
struct bio_vec *bv;
int i;
+ if (trace_android_fs_dataread_start_enabled())
+ ext4_trace_read_completion(bio);
+
if (ext4_bio_encrypted(bio)) {
if (bio->bi_error) {
fscrypt_release_ctx(bio->bi_private);
} else {
- fscrypt_decrypt_bio_pages(bio->bi_private, bio);
+ fscrypt_enqueue_decrypt_bio(bio->bi_private, bio);
return;
}
}
@@ -95,6 +110,30 @@ static void mpage_end_io(struct bio *bio)
bio_put(bio);
}
+static void
+ext4_submit_bio_read(struct bio *bio)
+{
+ if (trace_android_fs_dataread_start_enabled()) {
+ struct page *first_page = bio->bi_io_vec[0].bv_page;
+
+ if (first_page != NULL) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ first_page->mapping->host);
+ trace_android_fs_dataread_start(
+ first_page->mapping->host,
+ page_offset(first_page),
+ bio->bi_iter.bi_size,
+ current->pid,
+ path,
+ current->comm);
+ }
+ }
+ submit_bio(bio);
+}
+
int ext4_mpage_readpages(struct address_space *mapping,
struct list_head *pages, struct page *page,
unsigned nr_pages)
@@ -235,7 +274,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
*/
if (bio && (last_block_in_bio != blocks[0] - 1)) {
submit_and_realloc:
- submit_bio(bio);
+ ext4_submit_bio_read(bio);
bio = NULL;
}
if (bio == NULL) {
@@ -268,14 +307,14 @@ int ext4_mpage_readpages(struct address_space *mapping,
if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
(relative_block == map.m_len)) ||
(first_hole != blocks_per_page)) {
- submit_bio(bio);
+ ext4_submit_bio_read(bio);
bio = NULL;
} else
last_block_in_bio = blocks[blocks_per_page - 1];
goto next_page;
confused:
if (bio) {
- submit_bio(bio);
+ ext4_submit_bio_read(bio);
bio = NULL;
}
if (!PageUptodate(page))
@@ -288,6 +327,6 @@ int ext4_mpage_readpages(struct address_space *mapping,
}
BUG_ON(pages && !list_empty(pages));
if (bio)
- submit_bio(bio);
+ ext4_submit_bio_read(bio);
return 0;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a6c7ace9cfd1..dd851f9fe9c1 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1028,9 +1028,7 @@ void ext4_clear_inode(struct inode *inode)
jbd2_free_inode(EXT4_I(inode)->jinode);
EXT4_I(inode)->jinode = NULL;
}
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- fscrypt_put_encryption_info(inode, NULL);
-#endif
+ fscrypt_put_encryption_info(inode);
}
static struct inode *ext4_nfs_get_inode(struct super_block *sb,
@@ -1113,80 +1111,86 @@ static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
}
-static int ext4_key_prefix(struct inode *inode, u8 **key)
-{
- *key = EXT4_SB(inode->i_sb)->key_prefix;
- return EXT4_SB(inode->i_sb)->key_prefix_size;
-}
-
-static int ext4_prepare_context(struct inode *inode)
-{
- return ext4_convert_inline_data(inode);
-}
-
static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
void *fs_data)
{
- handle_t *handle;
- int res, res2;
+ handle_t *handle = fs_data;
+ int res, res2, retries = 0;
+
+ res = ext4_convert_inline_data(inode);
+ if (res)
+ return res;
- /* fs_data is null when internally used. */
- if (fs_data) {
- res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx,
- len, 0);
+ /*
+ * If a journal handle was specified, then the encryption context is
+ * being set on a new inode via inheritance and is part of a larger
+ * transaction to create the inode. Otherwise the encryption context is
+ * being set on an existing inode in its own transaction. Only in the
+ * latter case should the "retry on ENOSPC" logic be used.
+ */
+
+ if (handle) {
+ res = ext4_xattr_set_handle(handle, inode,
+ EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, 0);
if (!res) {
ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
ext4_clear_inode_state(inode,
EXT4_STATE_MAY_INLINE_DATA);
+ /*
+ * Update inode->i_flags - S_ENCRYPTED will be enabled,
+ * S_DAX may be disabled
+ */
+ ext4_set_inode_flags(inode);
}
return res;
}
+ res = dquot_initialize(inode);
+ if (res)
+ return res;
+retry:
handle = ext4_journal_start(inode, EXT4_HT_MISC,
ext4_jbd2_credits_xattr(inode));
if (IS_ERR(handle))
return PTR_ERR(handle);
- res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx,
- len, 0);
+ res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, 0);
if (!res) {
ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+ /*
+ * Update inode->i_flags - S_ENCRYPTED will be enabled,
+ * S_DAX may be disabled
+ */
+ ext4_set_inode_flags(inode);
res = ext4_mark_inode_dirty(handle, inode);
if (res)
EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
}
res2 = ext4_journal_stop(handle);
+
+ if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
if (!res)
res = res2;
return res;
}
-static int ext4_dummy_context(struct inode *inode)
+static bool ext4_dummy_context(struct inode *inode)
{
return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb));
}
-static unsigned ext4_max_namelen(struct inode *inode)
-{
- return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize :
- EXT4_NAME_LEN;
-}
-
-static struct fscrypt_operations ext4_cryptops = {
+static const struct fscrypt_operations ext4_cryptops = {
+ .key_prefix = "ext4:",
.get_context = ext4_get_context,
- .key_prefix = ext4_key_prefix,
- .prepare_context = ext4_prepare_context,
.set_context = ext4_set_context,
.dummy_context = ext4_dummy_context,
- .is_encrypted = ext4_encrypted_inode,
.empty_dir = ext4_empty_dir,
- .max_namelen = ext4_max_namelen,
-};
-#else
-static struct fscrypt_operations ext4_cryptops = {
- .is_encrypted = ext4_encrypted_inode,
+ .max_namelen = EXT4_NAME_LEN,
};
#endif
@@ -3974,7 +3978,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &ext4_sops;
sb->s_export_op = &ext4_export_ops;
sb->s_xattr = ext4_xattr_handlers;
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
sb->s_cop = &ext4_cryptops;
+#endif
#ifdef CONFIG_QUOTA
sb->dq_op = &ext4_quota_operations;
if (ext4_has_feature_quota(sb))
@@ -4290,11 +4296,6 @@ no_journal:
ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
kfree(orig_data);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- memcpy(sbi->key_prefix, EXT4_KEY_DESC_PREFIX,
- EXT4_KEY_DESC_PREFIX_SIZE);
- sbi->key_prefix_size = EXT4_KEY_DESC_PREFIX_SIZE;
-#endif
return 0;
cantfind_ext4:
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 557b3b0d668c..d4ce3af418a0 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -27,59 +27,28 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry,
struct delayed_call *done)
{
struct page *cpage = NULL;
- char *caddr, *paddr = NULL;
- struct fscrypt_str cstr, pstr;
- struct fscrypt_symlink_data *sd;
- int res;
- u32 max_size = inode->i_sb->s_blocksize;
+ const void *caddr;
+ unsigned int max_size;
+ const char *paddr;
if (!dentry)
return ERR_PTR(-ECHILD);
- res = fscrypt_get_encryption_info(inode);
- if (res)
- return ERR_PTR(res);
-
if (ext4_inode_is_fast_symlink(inode)) {
- caddr = (char *) EXT4_I(inode)->i_data;
+ caddr = EXT4_I(inode)->i_data;
max_size = sizeof(EXT4_I(inode)->i_data);
} else {
cpage = read_mapping_page(inode->i_mapping, 0, NULL);
if (IS_ERR(cpage))
return ERR_CAST(cpage);
caddr = page_address(cpage);
+ max_size = inode->i_sb->s_blocksize;
}
- /* Symlink is encrypted */
- sd = (struct fscrypt_symlink_data *)caddr;
- cstr.name = sd->encrypted_path;
- cstr.len = le16_to_cpu(sd->len);
- if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) {
- /* Symlink data on the disk is corrupted */
- res = -EFSCORRUPTED;
- goto errout;
- }
-
- res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr);
- if (res)
- goto errout;
- paddr = pstr.name;
-
- res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr);
- if (res)
- goto errout;
-
- /* Null-terminate the name */
- paddr[pstr.len] = '\0';
+ paddr = fscrypt_get_symlink(inode, caddr, max_size, done);
if (cpage)
put_page(cpage);
- set_delayed_call(done, kfree_link, paddr);
return paddr;
-errout:
- if (cpage)
- put_page(cpage);
- kfree(paddr);
- return ERR_PTR(res);
}
const struct inode_operations ext4_encrypted_symlink_inode_operations = {
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index ca949ea7c02f..a0dc559b1b47 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -2,7 +2,7 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o
f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o
f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o
-f2fs-y += shrinker.o extent_cache.o
+f2fs-y += shrinker.o extent_cache.o sysfs.o
f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 55aa29c0c78d..b32efb53c770 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/acl.c
*
@@ -7,10 +8,6 @@
* Portions of this code from linux/fs/ext2/acl.c
*
* Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/f2fs_fs.h>
#include "f2fs.h"
@@ -53,6 +50,9 @@ static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size)
struct f2fs_acl_entry *entry = (struct f2fs_acl_entry *)(hdr + 1);
const char *end = value + size;
+ if (size < sizeof(struct f2fs_acl_header))
+ return ERR_PTR(-EINVAL);
+
if (hdr->a_version != cpu_to_le32(F2FS_ACL_VERSION))
return ERR_PTR(-EINVAL);
@@ -160,7 +160,7 @@ static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi,
return (void *)f2fs_acl;
fail:
- kfree(f2fs_acl);
+ kvfree(f2fs_acl);
return ERR_PTR(-EINVAL);
}
@@ -190,7 +190,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type,
acl = NULL;
else
acl = ERR_PTR(retval);
- kfree(value);
+ kvfree(value);
return acl;
}
@@ -207,15 +207,16 @@ static int __f2fs_set_acl(struct inode *inode, int type,
void *value = NULL;
size_t size = 0;
int error;
+ umode_t mode = inode->i_mode;
switch (type) {
case ACL_TYPE_ACCESS:
name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl && !ipage) {
- error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+ error = posix_acl_update_mode(inode, &mode, &acl);
if (error)
return error;
- set_acl_inode(inode, inode->i_mode);
+ set_acl_inode(inode, mode);
}
break;
@@ -233,13 +234,13 @@ static int __f2fs_set_acl(struct inode *inode, int type,
value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size);
if (IS_ERR(value)) {
clear_inode_flag(inode, FI_ACL_MODE);
- return (int)PTR_ERR(value);
+ return PTR_ERR(value);
}
}
error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0);
- kfree(value);
+ kvfree(value);
if (!error)
set_cached_acl(inode, type, acl);
@@ -249,6 +250,9 @@ static int __f2fs_set_acl(struct inode *inode, int type,
int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
+ if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+ return -EIO;
+
return __f2fs_set_acl(inode, type, acl, NULL);
}
@@ -348,12 +352,14 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode,
return PTR_ERR(p);
clone = f2fs_acl_clone(p, GFP_NOFS);
- if (!clone)
- goto no_mem;
+ if (!clone) {
+ ret = -ENOMEM;
+ goto release_acl;
+ }
ret = f2fs_acl_create_masq(clone, mode);
if (ret < 0)
- goto no_mem_clone;
+ goto release_clone;
if (ret == 0)
posix_acl_release(clone);
@@ -367,11 +373,11 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode,
return 0;
-no_mem_clone:
+release_clone:
posix_acl_release(clone);
-no_mem:
+release_acl:
posix_acl_release(p);
- return -ENOMEM;
+ return ret;
}
int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
@@ -384,18 +390,22 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
if (error)
return error;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
if (default_acl) {
error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl,
ipage);
posix_acl_release(default_acl);
+ } else {
+ inode->i_default_acl = NULL;
}
if (acl) {
if (!error)
error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl,
ipage);
posix_acl_release(acl);
+ } else {
+ inode->i_acl = NULL;
}
return error;
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index 2c685185c24d..b96823c59b15 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/acl.h
*
@@ -7,10 +8,6 @@
* Portions of this code from linux/fs/ext2/acl.h
*
* Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#ifndef __F2FS_ACL_H__
#define __F2FS_ACL_H__
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 0b061bbf1639..d27d3c4458e4 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/checkpoint.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/bio.h>
@@ -24,20 +21,20 @@
#include <trace/events/f2fs.h>
static struct kmem_cache *ino_entry_slab;
-struct kmem_cache *inode_entry_slab;
+struct kmem_cache *f2fs_inode_entry_slab;
void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io)
{
+ f2fs_build_fault_attr(sbi, 0, 0);
set_ckpt_flags(sbi, CP_ERROR_FLAG);
- sbi->sb->s_flags |= MS_RDONLY;
if (!end_io)
- f2fs_flush_merged_bios(sbi);
+ f2fs_flush_merged_writes(sbi);
}
/*
* We guarantee no failure on the returned page.
*/
-struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
{
struct address_space *mapping = META_MAPPING(sbi);
struct page *page = NULL;
@@ -47,7 +44,7 @@ repeat:
cond_resched();
goto repeat;
}
- f2fs_wait_on_page_writeback(page, META, true);
+ f2fs_wait_on_page_writeback(page, META, true, true);
if (!PageUptodate(page))
SetPageUptodate(page);
return page;
@@ -65,12 +62,13 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
.sbi = sbi,
.type = META,
.op = REQ_OP_READ,
- .op_flags = READ_SYNC | REQ_META | REQ_PRIO,
+ .op_flags = REQ_META | REQ_PRIO,
.old_blkaddr = index,
.new_blkaddr = index,
.encrypted_page = NULL,
.is_meta = is_meta,
};
+ int err;
if (unlikely(!is_meta))
fio.op_flags &= ~REQ_META;
@@ -85,11 +83,10 @@ repeat:
fio.page = page;
- if (f2fs_submit_page_bio(&fio)) {
- memset(page_address(page), 0, PAGE_SIZE);
- f2fs_stop_checkpoint(sbi, false);
- f2fs_bug_on(sbi, 1);
- return page;
+ err = f2fs_submit_page_bio(&fio);
+ if (err) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(err);
}
lock_page(page);
@@ -98,24 +95,37 @@ repeat:
goto repeat;
}
- /*
- * if there is any IO error when accessing device, make our filesystem
- * readonly and make sure do not write checkpoint with non-uptodate
- * meta page.
- */
- if (unlikely(!PageUptodate(page)))
- f2fs_stop_checkpoint(sbi, false);
+ if (unlikely(!PageUptodate(page))) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(-EIO);
+ }
out:
return page;
}
-struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
{
return __get_meta_page(sbi, index, true);
}
+struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+ struct page *page;
+ int count = 0;
+
+retry:
+ page = __get_meta_page(sbi, index, true);
+ if (IS_ERR(page)) {
+ if (PTR_ERR(page) == -EIO &&
+ ++count <= DEFAULT_RETRY_IO_COUNT)
+ goto retry;
+ f2fs_stop_checkpoint(sbi, false);
+ }
+ return page;
+}
+
/* for POR only */
-struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index)
+struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index)
{
return __get_meta_page(sbi, index, false);
}
@@ -167,7 +177,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
/*
* Readahead CP/NAT/SIT/SSA pages
*/
-int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
+int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
int type, bool sync)
{
struct page *page;
@@ -176,8 +186,9 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
.sbi = sbi,
.type = META,
.op = REQ_OP_READ,
- .op_flags = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD,
+ .op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD,
.encrypted_page = NULL,
+ .in_list = false,
.is_meta = (type != META_POR),
};
struct blk_plug plug;
@@ -224,17 +235,15 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
}
fio.page = page;
- fio.old_blkaddr = fio.new_blkaddr;
- f2fs_submit_page_mbio(&fio);
+ f2fs_submit_page_bio(&fio);
f2fs_put_page(page, 0);
}
out:
- f2fs_submit_merged_bio(sbi, META, READ);
blk_finish_plug(&plug);
return blkno - start;
}
-void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
+void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
{
struct page *page;
bool readahead = false;
@@ -245,33 +254,34 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
f2fs_put_page(page, 0);
if (readahead)
- ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true);
+ f2fs_ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true);
}
-static int f2fs_write_meta_page(struct page *page,
- struct writeback_control *wbc)
+static int __f2fs_write_meta_page(struct page *page,
+ struct writeback_control *wbc,
+ enum iostat_type io_type)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
trace_f2fs_writepage(page, META);
+ if (unlikely(f2fs_cp_error(sbi)))
+ goto redirty_out;
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto redirty_out;
if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0))
goto redirty_out;
- if (unlikely(f2fs_cp_error(sbi)))
- goto redirty_out;
- write_meta_page(sbi, page);
+ f2fs_do_write_meta_page(sbi, page, io_type);
dec_page_count(sbi, F2FS_DIRTY_META);
if (wbc->for_reclaim)
- f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, META, WRITE);
+ f2fs_submit_merged_write_cond(sbi, NULL, page, 0, META);
unlock_page(page);
if (unlikely(f2fs_cp_error(sbi)))
- f2fs_submit_merged_bio(sbi, META, WRITE);
+ f2fs_submit_merged_write(sbi, META);
return 0;
@@ -280,23 +290,33 @@ redirty_out:
return AOP_WRITEPAGE_ACTIVATE;
}
+static int f2fs_write_meta_page(struct page *page,
+ struct writeback_control *wbc)
+{
+ return __f2fs_write_meta_page(page, wbc, FS_META_IO);
+}
+
static int f2fs_write_meta_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
long diff, written;
+ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+ goto skip_write;
+
/* collect a number of dirty meta pages and write together */
if (wbc->for_kupdate ||
get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
goto skip_write;
- trace_f2fs_writepages(mapping->host, wbc, META);
+ /* if locked failed, cp will flush dirty pages instead */
+ if (!mutex_trylock(&sbi->cp_mutex))
+ goto skip_write;
- /* if mounting is failed, skip writing node pages */
- mutex_lock(&sbi->cp_mutex);
+ trace_f2fs_writepages(mapping->host, wbc, META);
diff = nr_pages_to_write(sbi, META, wbc);
- written = sync_meta_pages(sbi, META, wbc->nr_to_write);
+ written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO);
mutex_unlock(&sbi->cp_mutex);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
return 0;
@@ -307,13 +327,14 @@ skip_write:
return 0;
}
-long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
- long nr_to_write)
+long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
+ long nr_to_write, enum iostat_type io_type)
{
struct address_space *mapping = META_MAPPING(sbi);
- pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX;
+ pgoff_t index = 0, prev = ULONG_MAX;
struct pagevec pvec;
long nwritten = 0;
+ int nr_pages;
struct writeback_control wbc = {
.for_reclaim = 0,
};
@@ -323,13 +344,9 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
blk_start_plug(&plug);
- while (index <= end) {
- int i, nr_pages;
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
- if (unlikely(nr_pages == 0))
- break;
+ while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY))) {
+ int i;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
@@ -353,13 +370,12 @@ continue_unlock:
goto continue_unlock;
}
- f2fs_wait_on_page_writeback(page, META, true);
+ f2fs_wait_on_page_writeback(page, META, true, true);
- BUG_ON(PageWriteback(page));
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
- if (mapping->a_ops->writepage(page, &wbc)) {
+ if (__f2fs_write_meta_page(page, &wbc, io_type)) {
unlock_page(page);
break;
}
@@ -373,7 +389,7 @@ continue_unlock:
}
stop:
if (nwritten)
- f2fs_submit_merged_bio(sbi, type, WRITE);
+ f2fs_submit_merged_write(sbi, type);
blk_finish_plug(&plug);
@@ -387,7 +403,7 @@ static int f2fs_set_meta_page_dirty(struct page *page)
if (!PageUptodate(page))
SetPageUptodate(page);
if (!PageDirty(page)) {
- f2fs_set_page_dirty_nobuffers(page);
+ __set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
SetPagePrivate(page);
f2fs_trace_pid(page);
@@ -407,24 +423,23 @@ const struct address_space_operations f2fs_meta_aops = {
#endif
};
-static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
+static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino,
+ unsigned int devidx, int type)
{
struct inode_management *im = &sbi->im[type];
struct ino_entry *e, *tmp;
tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS);
-retry:
+
radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
spin_lock(&im->ino_lock);
e = radix_tree_lookup(&im->ino_root, ino);
if (!e) {
e = tmp;
- if (radix_tree_insert(&im->ino_root, ino, e)) {
- spin_unlock(&im->ino_lock);
- radix_tree_preload_end();
- goto retry;
- }
+ if (unlikely(radix_tree_insert(&im->ino_root, ino, e)))
+ f2fs_bug_on(sbi, 1);
+
memset(e, 0, sizeof(struct ino_entry));
e->ino = ino;
@@ -432,6 +447,10 @@ retry:
if (type != ORPHAN_INO)
im->ino_num++;
}
+
+ if (type == FLUSH_INO)
+ f2fs_set_bit(devidx, (char *)&e->dirty_device);
+
spin_unlock(&im->ino_lock);
radix_tree_preload_end();
@@ -457,20 +476,20 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
spin_unlock(&im->ino_lock);
}
-void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
+void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
/* add new dirty ino entry into list */
- __add_ino_entry(sbi, ino, type);
+ __add_ino_entry(sbi, ino, 0, type);
}
-void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
+void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
/* remove dirty ino entry from list */
__remove_ino_entry(sbi, ino, type);
}
/* mode should be APPEND_INO or UPDATE_INO */
-bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
+bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
{
struct inode_management *im = &sbi->im[mode];
struct ino_entry *e;
@@ -481,12 +500,12 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
return e ? true : false;
}
-void release_ino_entry(struct f2fs_sb_info *sbi, bool all)
+void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all)
{
struct ino_entry *e, *tmp;
int i;
- for (i = all ? ORPHAN_INO: APPEND_INO; i <= UPDATE_INO; i++) {
+ for (i = all ? ORPHAN_INO : APPEND_INO; i < MAX_INO_ENTRY; i++) {
struct inode_management *im = &sbi->im[i];
spin_lock(&im->ino_lock);
@@ -500,19 +519,40 @@ void release_ino_entry(struct f2fs_sb_info *sbi, bool all)
}
}
-int acquire_orphan_inode(struct f2fs_sb_info *sbi)
+void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino,
+ unsigned int devidx, int type)
+{
+ __add_ino_entry(sbi, ino, devidx, type);
+}
+
+bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino,
+ unsigned int devidx, int type)
+{
+ struct inode_management *im = &sbi->im[type];
+ struct ino_entry *e;
+ bool is_dirty = false;
+
+ spin_lock(&im->ino_lock);
+ e = radix_tree_lookup(&im->ino_root, ino);
+ if (e && f2fs_test_bit(devidx, (char *)&e->dirty_device))
+ is_dirty = true;
+ spin_unlock(&im->ino_lock);
+ return is_dirty;
+}
+
+int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi)
{
struct inode_management *im = &sbi->im[ORPHAN_INO];
int err = 0;
spin_lock(&im->ino_lock);
-#ifdef CONFIG_F2FS_FAULT_INJECTION
if (time_to_inject(sbi, FAULT_ORPHAN)) {
spin_unlock(&im->ino_lock);
+ f2fs_show_injection_info(FAULT_ORPHAN);
return -ENOSPC;
}
-#endif
+
if (unlikely(im->ino_num >= sbi->max_orphans))
err = -ENOSPC;
else
@@ -522,7 +562,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi)
return err;
}
-void release_orphan_inode(struct f2fs_sb_info *sbi)
+void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi)
{
struct inode_management *im = &sbi->im[ORPHAN_INO];
@@ -532,14 +572,14 @@ void release_orphan_inode(struct f2fs_sb_info *sbi)
spin_unlock(&im->ino_lock);
}
-void add_orphan_inode(struct inode *inode)
+void f2fs_add_orphan_inode(struct inode *inode)
{
/* add new orphan ino entry into list */
- __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, ORPHAN_INO);
- update_inode_page(inode);
+ __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, 0, ORPHAN_INO);
+ f2fs_update_inode_page(inode);
}
-void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
/* remove orphan entry from orphan list */
__remove_ino_entry(sbi, ino, ORPHAN_INO);
@@ -549,17 +589,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
struct inode *inode;
struct node_info ni;
- int err = acquire_orphan_inode(sbi);
-
- if (err) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
- f2fs_msg(sbi->sb, KERN_WARNING,
- "%s: orphan failed (ino=%x), run fsck to fix.",
- __func__, ino);
- return err;
- }
-
- __add_ino_entry(sbi, ino, ORPHAN_INO);
+ int err;
inode = f2fs_iget_retry(sbi->sb, ino);
if (IS_ERR(inode)) {
@@ -571,56 +601,103 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
return PTR_ERR(inode);
}
+ err = dquot_initialize(inode);
+ if (err) {
+ iput(inode);
+ goto err_out;
+ }
+
clear_nlink(inode);
/* truncate all the data during iput */
iput(inode);
- get_node_info(sbi, ino, &ni);
+ err = f2fs_get_node_info(sbi, ino, &ni);
+ if (err)
+ goto err_out;
/* ENOMEM was fully retried in f2fs_evict_inode. */
if (ni.blk_addr != NULL_ADDR) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
- f2fs_msg(sbi->sb, KERN_WARNING,
- "%s: orphan failed (ino=%x), run fsck to fix.",
- __func__, ino);
- return -EIO;
+ err = -EIO;
+ goto err_out;
}
- __remove_ino_entry(sbi, ino, ORPHAN_INO);
return 0;
+
+err_out:
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: orphan failed (ino=%x), run fsck to fix.",
+ __func__, ino);
+ return err;
}
-int recover_orphan_inodes(struct f2fs_sb_info *sbi)
+int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi)
{
block_t start_blk, orphan_blocks, i, j;
- int err;
+ unsigned int s_flags = sbi->sb->s_flags;
+ int err = 0;
+#ifdef CONFIG_QUOTA
+ int quota_enabled;
+#endif
if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG))
return 0;
+ if (s_flags & MS_RDONLY) {
+ f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs");
+ sbi->sb->s_flags &= ~MS_RDONLY;
+ }
+
+#ifdef CONFIG_QUOTA
+ /* Needed for iput() to work correctly and not trash data */
+ sbi->sb->s_flags |= MS_ACTIVE;
+
+ /*
+ * Turn on quotas which were not enabled for read-only mounts if
+ * filesystem has quota feature, so that they are updated correctly.
+ */
+ quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY);
+#endif
+
start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi);
- ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true);
+ f2fs_ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true);
for (i = 0; i < orphan_blocks; i++) {
- struct page *page = get_meta_page(sbi, start_blk + i);
+ struct page *page;
struct f2fs_orphan_block *orphan_blk;
+ page = f2fs_get_meta_page(sbi, start_blk + i);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto out;
+ }
+
orphan_blk = (struct f2fs_orphan_block *)page_address(page);
for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
err = recover_orphan_inode(sbi, ino);
if (err) {
f2fs_put_page(page, 1);
- return err;
+ goto out;
}
}
f2fs_put_page(page, 1);
}
/* clear Orphan Flag */
clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG);
- return 0;
+out:
+ set_sbi_flag(sbi, SBI_IS_RECOVERED);
+
+#ifdef CONFIG_QUOTA
+ /* Turn quotas off */
+ if (quota_enabled)
+ f2fs_quota_off_umount(sbi->sb);
+#endif
+ sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+
+ return err;
}
static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
@@ -646,7 +723,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
/* loop for each orphan inode entry and write them in Jornal block */
list_for_each_entry(orphan, head, list) {
if (!page) {
- page = grab_meta_page(sbi, start_blk++);
+ page = f2fs_grab_meta_page(sbi, start_blk++);
orphan_blk =
(struct f2fs_orphan_block *)page_address(page);
memset(orphan_blk, 0, sizeof(*orphan_blk));
@@ -688,19 +765,21 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr,
size_t crc_offset = 0;
__u32 crc = 0;
- *cp_page = get_meta_page(sbi, cp_addr);
+ *cp_page = f2fs_get_meta_page(sbi, cp_addr);
+ if (IS_ERR(*cp_page))
+ return PTR_ERR(*cp_page);
+
*cp_block = (struct f2fs_checkpoint *)page_address(*cp_page);
crc_offset = le32_to_cpu((*cp_block)->checksum_offset);
- if (crc_offset >= blk_size) {
+ if (crc_offset > (blk_size - sizeof(__le32))) {
f2fs_put_page(*cp_page, 1);
f2fs_msg(sbi->sb, KERN_WARNING,
"invalid crc_offset: %zu", crc_offset);
return -EINVAL;
}
- crc = le32_to_cpu(*((__le32 *)((unsigned char *)*cp_block
- + crc_offset)));
+ crc = cur_cp_crc(*cp_block);
if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) {
f2fs_put_page(*cp_page, 1);
f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value");
@@ -751,7 +830,7 @@ invalid_cp:
return NULL;
}
-int get_valid_checkpoint(struct f2fs_sb_info *sbi)
+int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi)
{
struct f2fs_checkpoint *cp_block;
struct f2fs_super_block *fsb = sbi->raw_super;
@@ -763,7 +842,8 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
block_t cp_blk_no;
int i;
- sbi->ckpt = kzalloc(cp_blks * blk_size, GFP_KERNEL);
+ sbi->ckpt = f2fs_kzalloc(sbi, array_size(blk_size, cp_blks),
+ GFP_KERNEL);
if (!sbi->ckpt)
return -ENOMEM;
/*
@@ -800,7 +880,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
sbi->cur_cp_pack = 2;
/* Sanity checking of checkpoint */
- if (sanity_check_ckpt(sbi))
+ if (f2fs_sanity_check_ckpt(sbi))
goto free_fail_no_cp;
if (cp_blks <= 1)
@@ -814,7 +894,9 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
void *sit_bitmap_ptr;
unsigned char *ckpt = (unsigned char *)sbi->ckpt;
- cur_page = get_meta_page(sbi, cp_blk_no + i);
+ cur_page = f2fs_get_meta_page(sbi, cp_blk_no + i);
+ if (IS_ERR(cur_page))
+ goto free_fail_no_cp;
sit_bitmap_ptr = page_address(cur_page);
memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size);
f2fs_put_page(cur_page, 1);
@@ -828,7 +910,7 @@ free_fail_no_cp:
f2fs_put_page(cp1, 1);
f2fs_put_page(cp2, 1);
fail_no_cp:
- kfree(sbi->ckpt);
+ kvfree(sbi->ckpt);
return -EINVAL;
}
@@ -841,7 +923,9 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type)
return;
set_inode_flag(inode, flag);
- list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]);
+ if (!f2fs_is_volatile_file(inode))
+ list_add_tail(&F2FS_I(inode)->dirty_list,
+ &sbi->inode_list[type]);
stat_inc_dirty_inode(sbi, type);
}
@@ -857,7 +941,7 @@ static void __remove_dirty_inode(struct inode *inode, enum inode_type type)
stat_dec_dirty_inode(F2FS_I_SB(inode), type);
}
-void update_dirty_page(struct inode *inode, struct page *page)
+void f2fs_update_dirty_page(struct inode *inode, struct page *page)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
@@ -876,7 +960,7 @@ void update_dirty_page(struct inode *inode, struct page *page)
f2fs_trace_pid(page);
}
-void remove_dirty_inode(struct inode *inode)
+void f2fs_remove_dirty_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
@@ -893,12 +977,13 @@ void remove_dirty_inode(struct inode *inode)
spin_unlock(&sbi->inode_lock[type]);
}
-int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type)
+int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type)
{
struct list_head *head;
struct inode *inode;
struct f2fs_inode_info *fi;
bool is_dir = (type == DIR_INODE);
+ unsigned long ino = 0;
trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir,
get_pages(sbi, is_dir ?
@@ -917,18 +1002,32 @@ retry:
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
return 0;
}
- fi = list_entry(head->next, struct f2fs_inode_info, dirty_list);
+ fi = list_first_entry(head, struct f2fs_inode_info, dirty_list);
inode = igrab(&fi->vfs_inode);
spin_unlock(&sbi->inode_lock[type]);
if (inode) {
+ unsigned long cur_ino = inode->i_ino;
+
+ if (is_dir)
+ F2FS_I(inode)->cp_task = current;
+
filemap_fdatawrite(inode->i_mapping);
+
+ if (is_dir)
+ F2FS_I(inode)->cp_task = NULL;
+
iput(inode);
+ /* We need to give cpu to another writers. */
+ if (ino == cur_ino)
+ cond_resched();
+ else
+ ino = cur_ino;
} else {
/*
* We should submit bio, since it exists several
* wribacking dentry pages in the freeing inode.
*/
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ f2fs_submit_merged_write(sbi, DATA);
cond_resched();
}
goto retry;
@@ -950,18 +1049,50 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi)
spin_unlock(&sbi->inode_lock[DIRTY_META]);
return 0;
}
- fi = list_entry(head->next, struct f2fs_inode_info,
+ fi = list_first_entry(head, struct f2fs_inode_info,
gdirty_list);
inode = igrab(&fi->vfs_inode);
spin_unlock(&sbi->inode_lock[DIRTY_META]);
if (inode) {
- update_inode_page(inode);
+ sync_inode_metadata(inode, 0);
+
+ /* it's on eviction */
+ if (is_inode_flag_set(inode, FI_DIRTY_INODE))
+ f2fs_update_inode_page(inode);
iput(inode);
}
- };
+ }
return 0;
}
+static void __prepare_cp_block(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ nid_t last_nid = nm_i->next_scan_nid;
+
+ next_free_nid(sbi, &last_nid);
+ ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
+ ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
+ ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
+ ckpt->next_free_nid = cpu_to_le32(last_nid);
+}
+
+static bool __need_flush_quota(struct f2fs_sb_info *sbi)
+{
+ if (!is_journalled_quota(sbi))
+ return false;
+ if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH))
+ return false;
+ if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR))
+ return false;
+ if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_FLUSH))
+ return true;
+ if (get_pages(sbi, F2FS_DIRTY_QDATA))
+ return true;
+ return false;
+}
+
/*
* Freeze all the FS-operations for checkpoint.
*/
@@ -973,45 +1104,91 @@ static int block_operations(struct f2fs_sb_info *sbi)
.for_reclaim = 0,
};
struct blk_plug plug;
- int err = 0;
+ int err = 0, cnt = 0;
blk_start_plug(&plug);
-retry_flush_dents:
+retry_flush_quotas:
+ if (__need_flush_quota(sbi)) {
+ int locked;
+
+ if (++cnt > DEFAULT_RETRY_QUOTA_FLUSH_COUNT) {
+ set_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH);
+ f2fs_lock_all(sbi);
+ goto retry_flush_dents;
+ }
+ clear_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH);
+
+ /* only failed during mount/umount/freeze/quotactl */
+ locked = down_read_trylock(&sbi->sb->s_umount);
+ f2fs_quota_sync(sbi->sb, -1);
+ if (locked)
+ up_read(&sbi->sb->s_umount);
+ }
+
f2fs_lock_all(sbi);
+ if (__need_flush_quota(sbi)) {
+ f2fs_unlock_all(sbi);
+ cond_resched();
+ goto retry_flush_quotas;
+ }
+
+retry_flush_dents:
/* write all the dirty dentry pages */
if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
f2fs_unlock_all(sbi);
- err = sync_dirty_inodes(sbi, DIR_INODE);
+ err = f2fs_sync_dirty_inodes(sbi, DIR_INODE);
if (err)
goto out;
- goto retry_flush_dents;
+ cond_resched();
+ goto retry_flush_quotas;
+ }
+
+ /*
+ * POR: we should ensure that there are no dirty node pages
+ * until finishing nat/sit flush. inode->i_blocks can be updated.
+ */
+ down_write(&sbi->node_change);
+
+ if (__need_flush_quota(sbi)) {
+ up_write(&sbi->node_change);
+ f2fs_unlock_all(sbi);
+ goto retry_flush_quotas;
}
if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
+ up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
err = f2fs_sync_inode_meta(sbi);
if (err)
goto out;
- goto retry_flush_dents;
+ cond_resched();
+ goto retry_flush_quotas;
}
- /*
- * POR: we should ensure that there are no dirty node pages
- * until finishing nat/sit flush.
- */
retry_flush_nodes:
down_write(&sbi->node_write);
if (get_pages(sbi, F2FS_DIRTY_NODES)) {
up_write(&sbi->node_write);
- err = sync_node_pages(sbi, &wbc);
+ atomic_inc(&sbi->wb_sync_req[NODE]);
+ err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO);
+ atomic_dec(&sbi->wb_sync_req[NODE]);
if (err) {
+ up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
goto out;
}
+ cond_resched();
goto retry_flush_nodes;
}
+
+ /*
+ * sbi->node_change is used only for AIO write_begin path which produces
+ * dirty node blocks and some checkpoint values by block allocation.
+ */
+ __prepare_cp_block(sbi);
+ up_write(&sbi->node_change);
out:
blk_finish_plug(&plug);
return err;
@@ -1020,19 +1197,20 @@ out:
static void unblock_operations(struct f2fs_sb_info *sbi)
{
up_write(&sbi->node_write);
-
- build_free_nids(sbi);
f2fs_unlock_all(sbi);
}
-static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
+void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
{
DEFINE_WAIT(wait);
for (;;) {
prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
- if (!atomic_read(&sbi->nr_wb_bios))
+ if (!get_pages(sbi, F2FS_WB_CP_DATA))
+ break;
+
+ if (unlikely(f2fs_cp_error(sbi)))
break;
io_schedule_timeout(5*HZ);
@@ -1044,15 +1222,26 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ unsigned long flags;
- spin_lock(&sbi->cp_lock);
+ spin_lock_irqsave(&sbi->cp_lock, flags);
- if (cpc->reason == CP_UMOUNT)
+ if ((cpc->reason & CP_UMOUNT) &&
+ le32_to_cpu(ckpt->cp_pack_total_block_count) >
+ sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks)
+ disable_nat_bits(sbi, false);
+
+ if (cpc->reason & CP_TRIMMED)
+ __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG);
+ else
+ __clear_ckpt_flags(ckpt, CP_TRIMMED_FLAG);
+
+ if (cpc->reason & CP_UMOUNT)
__set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
else
__clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
- if (cpc->reason == CP_FASTBOOT)
+ if (cpc->reason & CP_FASTBOOT)
__set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
else
__clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
@@ -1065,18 +1254,68 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
__set_ckpt_flags(ckpt, CP_FSCK_FLAG);
+ if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
+ __set_ckpt_flags(ckpt, CP_DISABLED_FLAG);
+ else
+ __clear_ckpt_flags(ckpt, CP_DISABLED_FLAG);
+
+ if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH))
+ __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
+ else
+ __clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
+
+ if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR))
+ __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
+
/* set this flag to activate crc|cp_ver for recovery */
__set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG);
+ __clear_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG);
- spin_unlock(&sbi->cp_lock);
+ spin_unlock_irqrestore(&sbi->cp_lock, flags);
+}
+
+static void commit_checkpoint(struct f2fs_sb_info *sbi,
+ void *src, block_t blk_addr)
+{
+ struct writeback_control wbc = {
+ .for_reclaim = 0,
+ };
+
+ /*
+ * pagevec_lookup_tag and lock_page again will take
+ * some extra time. Therefore, f2fs_update_meta_pages and
+ * f2fs_sync_meta_pages are combined in this function.
+ */
+ struct page *page = f2fs_grab_meta_page(sbi, blk_addr);
+ int err;
+
+ f2fs_wait_on_page_writeback(page, META, true, true);
+
+ memcpy(page_address(page), src, PAGE_SIZE);
+
+ set_page_dirty(page);
+ if (unlikely(!clear_page_dirty_for_io(page)))
+ f2fs_bug_on(sbi, 1);
+
+ /* writeout cp pack 2 page */
+ err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO);
+ if (unlikely(err && f2fs_cp_error(sbi))) {
+ f2fs_put_page(page, 1);
+ return;
+ }
+
+ f2fs_bug_on(sbi, err);
+ f2fs_put_page(page, 0);
+
+ /* submit checkpoint (with barrier if NOBARRIER is not set) */
+ f2fs_submit_merged_write(sbi, META_FLUSH);
}
static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
struct f2fs_nm_info *nm_i = NM_I(sbi);
- unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
- nid_t last_nid = nm_i->next_scan_nid;
+ unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num, flags;
block_t start_blk;
unsigned int data_sum_blocks, orphan_blocks;
__u32 crc32 = 0;
@@ -1085,22 +1324,18 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct super_block *sb = sbi->sb;
struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
u64 kbytes_written;
+ int err;
/* Flush all the NAT/SIT pages */
- while (get_pages(sbi, F2FS_DIRTY_META)) {
- sync_meta_pages(sbi, META, LONG_MAX);
- if (unlikely(f2fs_cp_error(sbi)))
- return -EIO;
- }
-
- next_free_nid(sbi, &last_nid);
+ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
+ f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) &&
+ !f2fs_cp_error(sbi));
/*
* modify checkpoint
* version number is already updated
*/
- ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
- ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
+ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true));
ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
ckpt->cur_node_segno[i] =
@@ -1119,18 +1354,14 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
}
- ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
- ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
- ckpt->next_free_nid = cpu_to_le32(last_nid);
-
/* 2 cp + n data seg summary + orphan inode blocks */
- data_sum_blocks = npages_for_summary_flush(sbi, false);
- spin_lock(&sbi->cp_lock);
+ data_sum_blocks = f2fs_npages_for_summary_flush(sbi, false);
+ spin_lock_irqsave(&sbi->cp_lock, flags);
if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
__set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
else
__clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
- spin_unlock(&sbi->cp_lock);
+ spin_unlock_irqrestore(&sbi->cp_lock, flags);
orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num);
ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
@@ -1159,16 +1390,25 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
start_blk = __start_cp_next_addr(sbi);
- /* need to wait for end_io results */
- wait_on_all_pages_writeback(sbi);
- if (unlikely(f2fs_cp_error(sbi)))
- return -EIO;
+ /* write nat bits */
+ if (enabled_nat_bits(sbi, cpc)) {
+ __u64 cp_ver = cur_cp_version(ckpt);
+ block_t blk;
+
+ cp_ver |= ((__u64)crc32 << 32);
+ *(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver);
+
+ blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks;
+ for (i = 0; i < nm_i->nat_bits_blocks; i++)
+ f2fs_update_meta_page(sbi, nm_i->nat_bits +
+ (i << F2FS_BLKSIZE_BITS), blk + i);
+ }
/* write out checkpoint buffer at block 0 */
- update_meta_page(sbi, ckpt, start_blk++);
+ f2fs_update_meta_page(sbi, ckpt, start_blk++);
for (i = 1; i < 1 + cp_payload_blks; i++)
- update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE,
+ f2fs_update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE,
start_blk++);
if (orphan_num) {
@@ -1176,7 +1416,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
start_blk += orphan_blocks;
}
- write_data_summaries(sbi, start_blk);
+ f2fs_write_data_summaries(sbi, start_blk);
start_blk += data_sum_blocks;
/* Record write statistics in the hot node summary */
@@ -1187,40 +1427,47 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written);
if (__remain_node_summaries(cpc->reason)) {
- write_node_summaries(sbi, start_blk);
+ f2fs_write_node_summaries(sbi, start_blk);
start_blk += NR_CURSEG_NODE_TYPE;
}
- /* writeout checkpoint block */
- update_meta_page(sbi, ckpt, start_blk);
-
- /* wait for previous submitted node/meta pages writeback */
- wait_on_all_pages_writeback(sbi);
-
- if (unlikely(f2fs_cp_error(sbi)))
- return -EIO;
-
- filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX);
- filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX);
-
/* update user_block_counts */
sbi->last_valid_block_count = sbi->total_valid_block_count;
percpu_counter_set(&sbi->alloc_valid_block_count, 0);
- /* Here, we only have one bio having CP pack */
- sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
+ /* Here, we have one bio having CP pack except cp pack 2 page */
+ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
+ f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) &&
+ !f2fs_cp_error(sbi));
/* wait for previous submitted meta pages writeback */
- wait_on_all_pages_writeback(sbi);
+ f2fs_wait_on_all_pages_writeback(sbi);
- release_ino_entry(sbi, false);
+ /* flush all device cache */
+ err = f2fs_flush_device_cache(sbi);
+ if (err)
+ return err;
- if (unlikely(f2fs_cp_error(sbi)))
- return -EIO;
+ /* barrier and flush checkpoint cp pack 2 page if it can */
+ commit_checkpoint(sbi, ckpt, start_blk);
+ f2fs_wait_on_all_pages_writeback(sbi);
+
+ /*
+ * invalidate intermediate page cache borrowed from meta inode
+ * which are used for migration of encrypted inode's blocks.
+ */
+ if (f2fs_sb_has_encrypt(sbi))
+ invalidate_mapping_pages(META_MAPPING(sbi),
+ MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1);
+
+ f2fs_release_ino_entry(sbi, false);
+
+ f2fs_reset_fsync_node_info(sbi);
- clear_prefree_segments(sbi, cpc);
clear_sbi_flag(sbi, SBI_IS_DIRTY);
clear_sbi_flag(sbi, SBI_NEED_CP);
+ clear_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH);
+ sbi->unusable_block_count = 0;
__set_cp_next_pack(sbi);
/*
@@ -1233,23 +1480,29 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS));
- return 0;
+ return unlikely(f2fs_cp_error(sbi)) ? -EIO : 0;
}
/*
* We guarantee that this checkpoint procedure will not fail.
*/
-int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
unsigned long long ckpt_ver;
int err = 0;
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+ if (cpc->reason != CP_PAUSE)
+ return 0;
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "Start checkpoint disabled!");
+ }
mutex_lock(&sbi->cp_mutex);
if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
- (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC ||
- (cpc->reason == CP_DISCARD && !sbi->discard_blks)))
+ ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
+ ((cpc->reason & CP_DISCARD) && !sbi->discard_blks)))
goto out;
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
@@ -1268,18 +1521,23 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
- f2fs_flush_merged_bios(sbi);
+ f2fs_flush_merged_writes(sbi);
/* this is the case of multiple fstrims without any changes */
- if (cpc->reason == CP_DISCARD && !is_sbi_flag_set(sbi, SBI_IS_DIRTY)) {
- f2fs_bug_on(sbi, NM_I(sbi)->dirty_nat_cnt);
- f2fs_bug_on(sbi, SIT_I(sbi)->dirty_sentries);
- f2fs_bug_on(sbi, prefree_segments(sbi));
- flush_sit_entries(sbi, cpc);
- clear_prefree_segments(sbi, cpc);
- f2fs_wait_all_discard_bio(sbi);
- unblock_operations(sbi);
- goto out;
+ if (cpc->reason & CP_DISCARD) {
+ if (!f2fs_exist_trim_candidates(sbi, cpc)) {
+ unblock_operations(sbi);
+ goto out;
+ }
+
+ if (NM_I(sbi)->dirty_nat_cnt == 0 &&
+ SIT_I(sbi)->dirty_sentries == 0 &&
+ prefree_segments(sbi) == 0) {
+ f2fs_flush_sit_entries(sbi, cpc);
+ f2fs_clear_prefree_segments(sbi, cpc);
+ unblock_operations(sbi);
+ goto out;
+ }
}
/*
@@ -1291,18 +1549,23 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
/* write cached NAT/SIT entries to NAT/SIT area */
- flush_nat_entries(sbi);
- flush_sit_entries(sbi, cpc);
+ err = f2fs_flush_nat_entries(sbi, cpc);
+ if (err)
+ goto stop;
+
+ f2fs_flush_sit_entries(sbi, cpc);
/* unlock all the fs_lock[] in do_checkpoint() */
err = do_checkpoint(sbi, cpc);
-
- f2fs_wait_all_discard_bio(sbi);
-
+ if (err)
+ f2fs_release_discard_addrs(sbi);
+ else
+ f2fs_clear_prefree_segments(sbi, cpc);
+stop:
unblock_operations(sbi);
stat_inc_cp_count(sbi->stat_info);
- if (cpc->reason == CP_RECOVERY)
+ if (cpc->reason & CP_RECOVERY)
f2fs_msg(sbi->sb, KERN_NOTICE,
"checkpoint: version = %llx", ckpt_ver);
@@ -1314,7 +1577,7 @@ out:
return err;
}
-void init_ino_entry_info(struct f2fs_sb_info *sbi)
+void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi)
{
int i;
@@ -1332,23 +1595,23 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi)
F2FS_ORPHANS_PER_BLOCK;
}
-int __init create_checkpoint_caches(void)
+int __init f2fs_create_checkpoint_caches(void)
{
ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry",
sizeof(struct ino_entry));
if (!ino_entry_slab)
return -ENOMEM;
- inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry",
+ f2fs_inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry",
sizeof(struct inode_entry));
- if (!inode_entry_slab) {
+ if (!f2fs_inode_entry_slab) {
kmem_cache_destroy(ino_entry_slab);
return -ENOMEM;
}
return 0;
}
-void destroy_checkpoint_caches(void)
+void f2fs_destroy_checkpoint_caches(void)
{
kmem_cache_destroy(ino_entry_slab);
- kmem_cache_destroy(inode_entry_slab);
+ kmem_cache_destroy(f2fs_inode_entry_slab);
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 9041805096e0..8f7d8f7f8f0a 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/data.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
@@ -19,8 +16,6 @@
#include <linux/bio.h>
#include <linux/prefetch.h>
#include <linux/uio.h>
-#include <linux/mm.h>
-#include <linux/memcontrol.h>
#include <linux/cleancache.h>
#include "f2fs.h"
@@ -28,59 +23,194 @@
#include "segment.h"
#include "trace.h"
#include <trace/events/f2fs.h>
+#include <trace/events/android_fs.h>
-static void f2fs_read_end_io(struct bio *bio)
+#define NUM_PREALLOC_POST_READ_CTXS 128
+
+static struct kmem_cache *bio_post_read_ctx_cache;
+static mempool_t *bio_post_read_ctx_pool;
+
+static bool __is_cp_guaranteed(struct page *page)
{
- struct bio_vec *bvec;
- int i;
+ struct address_space *mapping = page->mapping;
+ struct inode *inode;
+ struct f2fs_sb_info *sbi;
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO))
- bio->bi_error = -EIO;
-#endif
+ if (!mapping)
+ return false;
- if (f2fs_bio_encrypted(bio)) {
- if (bio->bi_error) {
- fscrypt_release_ctx(bio->bi_private);
- } else {
- fscrypt_decrypt_bio_pages(bio->bi_private, bio);
- return;
- }
+ inode = mapping->host;
+ sbi = F2FS_I_SB(inode);
+
+ if (inode->i_ino == F2FS_META_INO(sbi) ||
+ inode->i_ino == F2FS_NODE_INO(sbi) ||
+ S_ISDIR(inode->i_mode) ||
+ (S_ISREG(inode->i_mode) &&
+ (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) ||
+ is_cold_data(page))
+ return true;
+ return false;
+}
+
+static enum count_type __read_io_type(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+
+ if (mapping) {
+ struct inode *inode = mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (inode->i_ino == F2FS_META_INO(sbi))
+ return F2FS_RD_META;
+
+ if (inode->i_ino == F2FS_NODE_INO(sbi))
+ return F2FS_RD_NODE;
}
+ return F2FS_RD_DATA;
+}
- bio_for_each_segment_all(bvec, bio, i) {
- struct page *page = bvec->bv_page;
+/* postprocessing steps for read bios */
+enum bio_post_read_step {
+ STEP_INITIAL = 0,
+ STEP_DECRYPT,
+};
- if (!bio->bi_error) {
- if (!PageUptodate(page))
- SetPageUptodate(page);
- } else {
+struct bio_post_read_ctx {
+ struct bio *bio;
+ struct work_struct work;
+ unsigned int cur_step;
+ unsigned int enabled_steps;
+};
+
+static void __read_end_io(struct bio *bio)
+{
+ struct page *page;
+ struct bio_vec *bv;
+ int i;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ page = bv->bv_page;
+
+ /* PG_error was set if any post_read step failed */
+ if (bio->bi_error || PageError(page)) {
ClearPageUptodate(page);
- SetPageError(page);
+ /* will re-read again later */
+ ClearPageError(page);
+ } else {
+ SetPageUptodate(page);
}
+ dec_page_count(F2FS_P_SB(page), __read_io_type(page));
unlock_page(page);
}
+ if (bio->bi_private)
+ mempool_free(bio->bi_private, bio_post_read_ctx_pool);
bio_put(bio);
}
+static void bio_post_read_processing(struct bio_post_read_ctx *ctx);
+
+static void decrypt_work(struct work_struct *work)
+{
+ struct bio_post_read_ctx *ctx =
+ container_of(work, struct bio_post_read_ctx, work);
+
+ fscrypt_decrypt_bio(ctx->bio);
+
+ bio_post_read_processing(ctx);
+}
+
+static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
+{
+ switch (++ctx->cur_step) {
+ case STEP_DECRYPT:
+ if (ctx->enabled_steps & (1 << STEP_DECRYPT)) {
+ INIT_WORK(&ctx->work, decrypt_work);
+ fscrypt_enqueue_decrypt_work(&ctx->work);
+ return;
+ }
+ ctx->cur_step++;
+ /* fall-through */
+ default:
+ __read_end_io(ctx->bio);
+ }
+}
+
+static bool f2fs_bio_post_read_required(struct bio *bio)
+{
+ return bio->bi_private && !bio->bi_error;
+}
+
+static void f2fs_read_end_io(struct bio *bio)
+{
+ struct page *first_page = bio->bi_io_vec[0].bv_page;
+
+ if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_READ_IO)) {
+ f2fs_show_injection_info(FAULT_READ_IO);
+ bio->bi_error = -EIO;
+ }
+
+ if (f2fs_bio_post_read_required(bio)) {
+ struct bio_post_read_ctx *ctx = bio->bi_private;
+
+ ctx->cur_step = STEP_INITIAL;
+ bio_post_read_processing(ctx);
+ return;
+ }
+
+ if (first_page != NULL &&
+ __read_io_type(first_page) == F2FS_RD_DATA) {
+ trace_android_fs_dataread_end(first_page->mapping->host,
+ page_offset(first_page),
+ bio->bi_iter.bi_size);
+ }
+
+ __read_end_io(bio);
+}
+
static void f2fs_write_end_io(struct bio *bio)
{
struct f2fs_sb_info *sbi = bio->bi_private;
struct bio_vec *bvec;
int i;
+ if (time_to_inject(sbi, FAULT_WRITE_IO)) {
+ f2fs_show_injection_info(FAULT_WRITE_IO);
+ bio->bi_error = -EIO;
+ }
+
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
+ enum count_type type = WB_DATA_TYPE(page);
+
+ if (IS_DUMMY_WRITTEN_PAGE(page)) {
+ set_page_private(page, (unsigned long)NULL);
+ ClearPagePrivate(page);
+ unlock_page(page);
+ mempool_free(page, sbi->write_io_dummy);
+
+ if (unlikely(bio->bi_error))
+ f2fs_stop_checkpoint(sbi, true);
+ continue;
+ }
fscrypt_pullback_bio_page(&page, true);
if (unlikely(bio->bi_error)) {
mapping_set_error(page->mapping, -EIO);
- f2fs_stop_checkpoint(sbi, true);
+ if (type == F2FS_WB_CP_DATA)
+ f2fs_stop_checkpoint(sbi, true);
}
+
+ f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) &&
+ page->index != nid_of_node(page));
+
+ dec_page_count(sbi, type);
+ if (f2fs_in_warm_node_list(sbi, page))
+ f2fs_del_fsync_node_entry(sbi, page);
+ clear_cold_data(page);
end_page_writeback(page);
}
- if (atomic_dec_and_test(&sbi->nr_wb_bios) &&
+ if (!get_pages(sbi, F2FS_WB_CP_DATA) &&
wq_has_sleeper(&sbi->cp_wait))
wake_up(&sbi->cp_wait);
@@ -88,19 +218,68 @@ static void f2fs_write_end_io(struct bio *bio)
}
/*
+ * Return true, if pre_bio's bdev is same as its target device.
+ */
+struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
+ block_t blk_addr, struct bio *bio)
+{
+ struct block_device *bdev = sbi->sb->s_bdev;
+ int i;
+
+ for (i = 0; i < sbi->s_ndevs; i++) {
+ if (FDEV(i).start_blk <= blk_addr &&
+ FDEV(i).end_blk >= blk_addr) {
+ blk_addr -= FDEV(i).start_blk;
+ bdev = FDEV(i).bdev;
+ break;
+ }
+ }
+ if (bio) {
+ bio->bi_bdev = bdev;
+ bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
+ }
+ return bdev;
+}
+
+int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+ int i;
+
+ for (i = 0; i < sbi->s_ndevs; i++)
+ if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr)
+ return i;
+ return 0;
+}
+
+static bool __same_bdev(struct f2fs_sb_info *sbi,
+ block_t blk_addr, struct bio *bio)
+{
+ return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev;
+}
+
+/*
* Low-level block read/write IO operations.
*/
static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
- int npages, bool is_read)
+ struct writeback_control *wbc,
+ int npages, bool is_read,
+ enum page_type type, enum temp_type temp)
{
struct bio *bio;
- bio = f2fs_bio_alloc(npages);
+ bio = f2fs_bio_alloc(sbi, npages, true);
- bio->bi_bdev = sbi->sb->s_bdev;
- bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
- bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
- bio->bi_private = is_read ? NULL : sbi;
+ f2fs_target_device(sbi, blk_addr, bio);
+ if (is_read) {
+ bio->bi_end_io = f2fs_read_end_io;
+ bio->bi_private = NULL;
+ } else {
+ bio->bi_end_io = f2fs_write_end_io;
+ bio->bi_private = sbi;
+ bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, type, temp);
+ }
+ if (wbc)
+ wbc_init_bio(wbc, bio);
return bio;
}
@@ -109,14 +288,74 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi,
struct bio *bio, enum page_type type)
{
if (!is_read_io(bio_op(bio))) {
- atomic_inc(&sbi->nr_wb_bios);
- if (f2fs_sb_mounted_hmsmr(sbi->sb) &&
- current->plug && (type == DATA || type == NODE))
+ unsigned int start;
+
+ if (type != DATA && type != NODE)
+ goto submit_io;
+
+ if (test_opt(sbi, LFS) && current->plug)
blk_finish_plug(current->plug);
+
+ start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS;
+ start %= F2FS_IO_SIZE(sbi);
+
+ if (start == 0)
+ goto submit_io;
+
+ /* fill dummy pages */
+ for (; start < F2FS_IO_SIZE(sbi); start++) {
+ struct page *page =
+ mempool_alloc(sbi->write_io_dummy,
+ GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL);
+ f2fs_bug_on(sbi, !page);
+
+ SetPagePrivate(page);
+ set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE);
+ lock_page(page);
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
+ f2fs_bug_on(sbi, 1);
+ }
+ /*
+ * In the NODE case, we lose next block address chain. So, we
+ * need to do checkpoint in f2fs_sync_file.
+ */
+ if (type == NODE)
+ set_sbi_flag(sbi, SBI_NEED_CP);
}
+submit_io:
+ if (is_read_io(bio_op(bio)))
+ trace_f2fs_submit_read_bio(sbi->sb, type, bio);
+ else
+ trace_f2fs_submit_write_bio(sbi->sb, type, bio);
submit_bio(bio);
}
+static void __f2fs_submit_read_bio(struct f2fs_sb_info *sbi,
+ struct bio *bio, enum page_type type)
+{
+ if (trace_android_fs_dataread_start_enabled() && (type == DATA)) {
+ struct page *first_page = bio->bi_io_vec[0].bv_page;
+
+ if (first_page != NULL &&
+ __read_io_type(first_page) == F2FS_RD_DATA) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ first_page->mapping->host);
+
+ trace_android_fs_dataread_start(
+ first_page->mapping->host,
+ page_offset(first_page),
+ bio->bi_iter.bi_size,
+ current->pid,
+ path,
+ current->comm);
+ }
+ }
+ __submit_bio(sbi, bio, type);
+}
+
static void __submit_merged_bio(struct f2fs_bio_info *io)
{
struct f2fs_io_info *fio = &io->fio;
@@ -124,12 +363,12 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
if (!io->bio)
return;
+ bio_set_op_attrs(io->bio, fio->op, fio->op_flags);
+
if (is_read_io(fio->op))
- trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio);
+ trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio);
else
- trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio);
-
- bio_set_op_attrs(io->bio, fio->op, fio->op_flags);
+ trace_f2fs_prepare_write_bio(io->sbi->sb, fio->type, io->bio);
__submit_bio(io->sbi, io->bio, fio->type);
io->bio = NULL;
@@ -166,73 +405,73 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
return false;
}
-static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode,
- struct page *page, nid_t ino,
- enum page_type type)
-{
- enum page_type btype = PAGE_TYPE_OF_BIO(type);
- struct f2fs_bio_info *io = &sbi->write_io[btype];
- bool ret;
-
- down_read(&io->io_rwsem);
- ret = __has_merged_page(io, inode, page, ino);
- up_read(&io->io_rwsem);
- return ret;
-}
-
-static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
- struct inode *inode, struct page *page,
- nid_t ino, enum page_type type, int rw)
+static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi,
+ enum page_type type, enum temp_type temp)
{
enum page_type btype = PAGE_TYPE_OF_BIO(type);
- struct f2fs_bio_info *io;
-
- io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype];
+ struct f2fs_bio_info *io = sbi->write_io[btype] + temp;
down_write(&io->io_rwsem);
- if (!__has_merged_page(io, inode, page, ino))
- goto out;
-
/* change META to META_FLUSH in the checkpoint procedure */
if (type >= META_FLUSH) {
io->fio.type = META_FLUSH;
io->fio.op = REQ_OP_WRITE;
- if (test_opt(sbi, NOBARRIER))
- io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO;
- else
- io->fio.op_flags = WRITE_FLUSH_FUA | REQ_META |
- REQ_PRIO;
+ io->fio.op_flags = REQ_META | REQ_PRIO | REQ_SYNC;
+ if (!test_opt(sbi, NOBARRIER))
+ io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
}
__submit_merged_bio(io);
-out:
up_write(&io->io_rwsem);
}
-void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type,
- int rw)
+static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
+ struct inode *inode, struct page *page,
+ nid_t ino, enum page_type type, bool force)
+{
+ enum temp_type temp;
+ bool ret = true;
+
+ for (temp = HOT; temp < NR_TEMP_TYPE; temp++) {
+ if (!force) {
+ enum page_type btype = PAGE_TYPE_OF_BIO(type);
+ struct f2fs_bio_info *io = sbi->write_io[btype] + temp;
+
+ down_read(&io->io_rwsem);
+ ret = __has_merged_page(io, inode, page, ino);
+ up_read(&io->io_rwsem);
+ }
+ if (ret)
+ __f2fs_submit_merged_write(sbi, type, temp);
+
+ /* TODO: use HOT temp only for meta pages now. */
+ if (type >= META)
+ break;
+ }
+}
+
+void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type)
{
- __f2fs_submit_merged_bio(sbi, NULL, NULL, 0, type, rw);
+ __submit_merged_write_cond(sbi, NULL, 0, 0, type, true);
}
-void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi,
+void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
struct inode *inode, struct page *page,
- nid_t ino, enum page_type type, int rw)
+ nid_t ino, enum page_type type)
{
- if (has_merged_page(sbi, inode, page, ino, type))
- __f2fs_submit_merged_bio(sbi, inode, page, ino, type, rw);
+ __submit_merged_write_cond(sbi, inode, page, ino, type, false);
}
-void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi)
+void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi)
{
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
- f2fs_submit_merged_bio(sbi, NODE, WRITE);
- f2fs_submit_merged_bio(sbi, META, WRITE);
+ f2fs_submit_merged_write(sbi, DATA);
+ f2fs_submit_merged_write(sbi, NODE);
+ f2fs_submit_merged_write(sbi, META);
}
/*
* Fill the locked page with data located in the block address.
- * Return unlocked page.
+ * A caller needs to unlock the page on failure.
*/
int f2fs_submit_page_bio(struct f2fs_io_info *fio)
{
@@ -248,69 +487,168 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
f2fs_trace_ios(fio, 0);
/* Allocate a new bio */
- bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->op));
+ bio = __bio_alloc(fio->sbi, fio->new_blkaddr, fio->io_wbc,
+ 1, is_read_io(fio->op), fio->type, fio->temp);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
bio_put(bio);
return -EFAULT;
}
+
+ if (fio->io_wbc && !is_read_io(fio->op))
+ wbc_account_io(fio->io_wbc, page, PAGE_SIZE);
+
bio_set_op_attrs(bio, fio->op, fio->op_flags);
- __submit_bio(fio->sbi, bio, fio->type);
+ inc_page_count(fio->sbi, is_read_io(fio->op) ?
+ __read_io_type(page): WB_DATA_TYPE(fio->page));
+
+ __f2fs_submit_read_bio(fio->sbi, bio, fio->type);
return 0;
}
-void f2fs_submit_page_mbio(struct f2fs_io_info *fio)
+void f2fs_submit_page_write(struct f2fs_io_info *fio)
{
struct f2fs_sb_info *sbi = fio->sbi;
enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
- struct f2fs_bio_info *io;
- bool is_read = is_read_io(fio->op);
+ struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp;
struct page *bio_page;
- io = is_read ? &sbi->read_io : &sbi->write_io[btype];
+ f2fs_bug_on(sbi, is_read_io(fio->op));
+
+ down_write(&io->io_rwsem);
+next:
+ if (fio->in_list) {
+ spin_lock(&io->io_lock);
+ if (list_empty(&io->io_list)) {
+ spin_unlock(&io->io_lock);
+ goto out;
+ }
+ fio = list_first_entry(&io->io_list,
+ struct f2fs_io_info, list);
+ list_del(&fio->list);
+ spin_unlock(&io->io_lock);
+ }
if (__is_valid_data_blkaddr(fio->old_blkaddr))
verify_block_addr(fio, fio->old_blkaddr);
verify_block_addr(fio, fio->new_blkaddr);
- down_write(&io->io_rwsem);
+ bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
+
+ /* set submitted = true as a return value */
+ fio->submitted = true;
+
+ inc_page_count(sbi, WB_DATA_TYPE(bio_page));
if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 ||
- (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags)))
+ (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) ||
+ !__same_bdev(sbi, fio->new_blkaddr, io->bio)))
__submit_merged_bio(io);
alloc_new:
if (io->bio == NULL) {
- int bio_blocks = MAX_BIO_BLOCKS(sbi);
-
- io->bio = __bio_alloc(sbi, fio->new_blkaddr,
- bio_blocks, is_read);
+ if ((fio->type == DATA || fio->type == NODE) &&
+ fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) {
+ dec_page_count(sbi, WB_DATA_TYPE(bio_page));
+ fio->retry = true;
+ goto skip;
+ }
+ io->bio = __bio_alloc(sbi, fio->new_blkaddr, fio->io_wbc,
+ BIO_MAX_PAGES, false,
+ fio->type, fio->temp);
io->fio = *fio;
}
- bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
-
- if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) <
- PAGE_SIZE) {
+ if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) {
__submit_merged_bio(io);
goto alloc_new;
}
+ if (fio->io_wbc)
+ wbc_account_io(fio->io_wbc, bio_page, PAGE_SIZE);
+
io->last_block_in_bio = fio->new_blkaddr;
f2fs_trace_ios(fio, 0);
+ trace_f2fs_submit_page_write(fio->page, fio);
+skip:
+ if (fio->in_list)
+ goto next;
+out:
+ if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
+ f2fs_is_checkpoint_ready(sbi))
+ __submit_merged_bio(io);
up_write(&io->io_rwsem);
- trace_f2fs_submit_page_mbio(fio->page, fio);
+}
+
+static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
+ unsigned nr_pages, unsigned op_flag)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct bio *bio;
+ struct bio_post_read_ctx *ctx;
+ unsigned int post_read_steps = 0;
+
+ if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC))
+ return ERR_PTR(-EFAULT);
+
+ bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false);
+ if (!bio)
+ return ERR_PTR(-ENOMEM);
+ f2fs_target_device(sbi, blkaddr, bio);
+ bio->bi_end_io = f2fs_read_end_io;
+ bio_set_op_attrs(bio, REQ_OP_READ, op_flag);
+
+ if (f2fs_encrypted_file(inode))
+ post_read_steps |= 1 << STEP_DECRYPT;
+ if (post_read_steps) {
+ ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
+ if (!ctx) {
+ bio_put(bio);
+ return ERR_PTR(-ENOMEM);
+ }
+ ctx->bio = bio;
+ ctx->enabled_steps = post_read_steps;
+ bio->bi_private = ctx;
+ }
+
+ return bio;
+}
+
+/* This can handle encryption stuffs */
+static int f2fs_submit_page_read(struct inode *inode, struct page *page,
+ block_t blkaddr)
+{
+ struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0);
+
+ if (IS_ERR(bio))
+ return PTR_ERR(bio);
+
+ /* wait for GCed page writeback via META_MAPPING */
+ f2fs_wait_on_block_writeback(inode, blkaddr);
+
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+ bio_put(bio);
+ return -EFAULT;
+ }
+ ClearPageError(page);
+ inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
+ __f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
+ return 0;
}
static void __set_data_blkaddr(struct dnode_of_data *dn)
{
struct f2fs_node *rn = F2FS_NODE(dn->node_page);
__le32 *addr_array;
+ int base = 0;
+
+ if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode))
+ base = get_extra_isize(dn->inode);
/* Get physical address of data block */
addr_array = blkaddr_in_node(rn);
- addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
+ addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
}
/*
@@ -319,9 +657,9 @@ static void __set_data_blkaddr(struct dnode_of_data *dn)
* ->node_page
* update block addresses in the node page
*/
-void set_data_blkaddr(struct dnode_of_data *dn)
+void f2fs_set_data_blkaddr(struct dnode_of_data *dn)
{
- f2fs_wait_on_page_writeback(dn->node_page, NODE, true);
+ f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true);
__set_data_blkaddr(dn);
if (set_page_dirty(dn->node_page))
dn->node_changed = true;
@@ -330,31 +668,32 @@ void set_data_blkaddr(struct dnode_of_data *dn)
void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
{
dn->data_blkaddr = blkaddr;
- set_data_blkaddr(dn);
+ f2fs_set_data_blkaddr(dn);
f2fs_update_extent_cache(dn);
}
/* dn->ofs_in_node will be returned with up-to-date last block pointer */
-int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
+int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+ int err;
if (!count)
return 0;
if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
return -EPERM;
- if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count)))
- return -ENOSPC;
+ if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count))))
+ return err;
trace_f2fs_reserve_new_blocks(dn->inode, dn->nid,
dn->ofs_in_node, count);
- f2fs_wait_on_page_writeback(dn->node_page, NODE, true);
+ f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true);
for (; count > 0; dn->ofs_in_node++) {
- block_t blkaddr =
- datablock_addr(dn->node_page, dn->ofs_in_node);
+ block_t blkaddr = datablock_addr(dn->inode,
+ dn->node_page, dn->ofs_in_node);
if (blkaddr == NULL_ADDR) {
dn->data_blkaddr = NEW_ADDR;
__set_data_blkaddr(dn);
@@ -368,12 +707,12 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
}
/* Should keep dn->ofs_in_node unchanged */
-int reserve_new_block(struct dnode_of_data *dn)
+int f2fs_reserve_new_block(struct dnode_of_data *dn)
{
unsigned int ofs_in_node = dn->ofs_in_node;
int ret;
- ret = reserve_new_blocks(dn, 1);
+ ret = f2fs_reserve_new_blocks(dn, 1);
dn->ofs_in_node = ofs_in_node;
return ret;
}
@@ -383,12 +722,12 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
bool need_put = dn->inode_page ? false : true;
int err;
- err = get_dnode_of_data(dn, index, ALLOC_NODE);
+ err = f2fs_get_dnode_of_data(dn, index, ALLOC_NODE);
if (err)
return err;
if (dn->data_blkaddr == NULL_ADDR)
- err = reserve_new_block(dn);
+ err = f2fs_reserve_new_block(dn);
if (err || need_put)
f2fs_put_dnode(dn);
return err;
@@ -396,7 +735,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
{
- struct extent_info ei;
+ struct extent_info ei = {0,0,0};
struct inode *inode = dn->inode;
if (f2fs_lookup_extent_cache(inode, index, &ei)) {
@@ -407,24 +746,14 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
return f2fs_reserve_block(dn, index);
}
-struct page *get_read_data_page(struct inode *inode, pgoff_t index,
+struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
int op_flags, bool for_write)
{
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
struct page *page;
- struct extent_info ei;
+ struct extent_info ei = {0,0,0};
int err;
- struct f2fs_io_info fio = {
- .sbi = F2FS_I_SB(inode),
- .type = DATA,
- .op = REQ_OP_READ,
- .op_flags = op_flags,
- .encrypted_page = NULL,
- };
-
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
- return read_mapping_page(mapping, index, NULL);
page = f2fs_grab_cache_page(mapping, index, for_write);
if (!page)
@@ -436,7 +765,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index,
}
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
+ err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
if (err)
goto put_err;
f2fs_put_dnode(&dn);
@@ -455,7 +784,8 @@ got_it:
* A new dentry page is allocated but not able to be written, since its
* new inode page couldn't be allocated due to -ENOSPC.
* In such the case, its blkaddr can be remained as NEW_ADDR.
- * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata.
+ * see, f2fs_add_link -> f2fs_get_new_data_page ->
+ * f2fs_init_inode_metadata.
*/
if (dn.data_blkaddr == NEW_ADDR) {
zero_user_segment(page, 0, PAGE_SIZE);
@@ -465,9 +795,7 @@ got_it:
return page;
}
- fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
- fio.page = page;
- err = f2fs_submit_page_bio(&fio);
+ err = f2fs_submit_page_read(inode, page, dn.data_blkaddr);
if (err)
goto put_err;
return page;
@@ -477,7 +805,7 @@ put_err:
return ERR_PTR(err);
}
-struct page *find_data_page(struct inode *inode, pgoff_t index)
+struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
@@ -487,7 +815,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index)
return page;
f2fs_put_page(page, 0);
- page = get_read_data_page(inode, index, READ_SYNC, false);
+ page = f2fs_get_read_data_page(inode, index, 0, false);
if (IS_ERR(page))
return page;
@@ -507,13 +835,13 @@ struct page *find_data_page(struct inode *inode, pgoff_t index)
* Because, the callers, functions in dir.c and GC, should be able to know
* whether this page exists or not.
*/
-struct page *get_lock_data_page(struct inode *inode, pgoff_t index,
+struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
bool for_write)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
repeat:
- page = get_read_data_page(inode, index, READ_SYNC, for_write);
+ page = f2fs_get_read_data_page(inode, index, 0, for_write);
if (IS_ERR(page))
return page;
@@ -539,7 +867,7 @@ repeat:
* Note that, ipage is set only by make_empty_dir, and if any error occur,
* ipage should be released by this function.
*/
-struct page *get_new_data_page(struct inode *inode,
+struct page *f2fs_get_new_data_page(struct inode *inode,
struct page *ipage, pgoff_t index, bool new_i_size)
{
struct address_space *mapping = inode->i_mapping;
@@ -578,7 +906,7 @@ struct page *get_new_data_page(struct inode *inode,
/* if ipage exists, blkaddr should be NEW_ADDR */
f2fs_bug_on(F2FS_I_SB(inode), ipage);
- page = get_lock_data_page(inode, index, true);
+ page = f2fs_get_lock_data_page(inode, index, true);
if (IS_ERR(page))
return page;
}
@@ -589,50 +917,67 @@ got_it:
return page;
}
-static int __allocate_data_block(struct dnode_of_data *dn)
+static int __allocate_data_block(struct dnode_of_data *dn, int seg_type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct f2fs_summary sum;
struct node_info ni;
- int seg = CURSEG_WARM_DATA;
- pgoff_t fofs;
+ block_t old_blkaddr;
blkcnt_t count = 1;
+ int err;
if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
return -EPERM;
- dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
- if (dn->data_blkaddr == NEW_ADDR)
+ err = f2fs_get_node_info(sbi, dn->nid, &ni);
+ if (err)
+ return err;
+
+ dn->data_blkaddr = datablock_addr(dn->inode,
+ dn->node_page, dn->ofs_in_node);
+ if (dn->data_blkaddr != NULL_ADDR)
goto alloc;
- if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count)))
- return -ENOSPC;
+ if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count))))
+ return err;
alloc:
- get_node_info(sbi, dn->nid, &ni);
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
+ old_blkaddr = dn->data_blkaddr;
+ f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr,
+ &sum, seg_type, NULL, false);
+ if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
+ invalidate_mapping_pages(META_MAPPING(sbi),
+ old_blkaddr, old_blkaddr);
+ f2fs_set_data_blkaddr(dn);
- if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page)
- seg = CURSEG_DIRECT_IO;
-
- allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr,
- &sum, seg);
- set_data_blkaddr(dn);
-
- /* update i_size */
- fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
- dn->ofs_in_node;
- if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT))
- f2fs_i_size_write(dn->inode,
- ((loff_t)(fofs + 1) << PAGE_SHIFT));
+ /*
+ * i_size will be updated by direct_IO. Otherwise, we'll get stale
+ * data from unwritten block via dio_read.
+ */
return 0;
}
-ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
+int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
struct f2fs_map_blocks map;
- ssize_t ret = 0;
+ int flag;
+ int err = 0;
+ bool direct_io = iocb->ki_flags & IOCB_DIRECT;
+
+ /* convert inline data for Direct I/O*/
+ if (direct_io) {
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
+ }
+
+ if (direct_io && allow_outplace_dio(inode, iocb, from))
+ return 0;
+
+ if (is_inode_flag_set(inode, FI_NO_PREALLOC))
+ return 0;
map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos);
map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from));
@@ -642,21 +987,50 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
map.m_len = 0;
map.m_next_pgofs = NULL;
+ map.m_next_extent = NULL;
+ map.m_seg_type = NO_CHECK_TYPE;
+ map.m_may_create = true;
+
+ if (direct_io) {
+ map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint);
+ flag = f2fs_force_buffered_io(inode, iocb, from) ?
+ F2FS_GET_BLOCK_PRE_AIO :
+ F2FS_GET_BLOCK_PRE_DIO;
+ goto map_blocks;
+ }
+ if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) {
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
+ }
+ if (f2fs_has_inline_data(inode))
+ return err;
- if (iocb->ki_flags & IOCB_DIRECT) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
- return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
+ flag = F2FS_GET_BLOCK_PRE_AIO;
+
+map_blocks:
+ err = f2fs_map_blocks(inode, &map, 1, flag);
+ if (map.m_len > 0 && err == -ENOSPC) {
+ if (!direct_io)
+ set_inode_flag(inode, FI_NO_PREALLOC);
+ err = 0;
}
- if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
+ return err;
+}
+
+void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
+{
+ if (flag == F2FS_GET_BLOCK_PRE_AIO) {
+ if (lock)
+ down_read(&sbi->node_change);
+ else
+ up_read(&sbi->node_change);
+ } else {
+ if (lock)
+ f2fs_lock_op(sbi);
+ else
+ f2fs_unlock_op(sbi);
}
- if (!f2fs_has_inline_data(inode))
- return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
- return ret;
}
/*
@@ -674,14 +1048,14 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
unsigned int maxblocks = map->m_len;
struct dnode_of_data dn;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- int mode = create ? ALLOC_NODE : LOOKUP_NODE;
+ int mode = map->m_may_create ? ALLOC_NODE : LOOKUP_NODE;
pgoff_t pgofs, end_offset, end;
int err = 0, ofs = 1;
unsigned int ofs_in_node, last_ofs_in_node;
blkcnt_t prealloc;
- struct extent_info ei;
- bool allocated = false;
+ struct extent_info ei = {0,0,0};
block_t blkaddr;
+ unsigned int start_pgofs;
if (!maxblocks)
return 0;
@@ -694,19 +1068,30 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
end = pgofs + maxblocks;
if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
+ if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO &&
+ map->m_may_create)
+ goto next_dnode;
+
map->m_pblk = ei.blk + pgofs - ei.fofs;
map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs);
map->m_flags = F2FS_MAP_MAPPED;
+ if (map->m_next_extent)
+ *map->m_next_extent = pgofs + map->m_len;
+
+ /* for hardware encryption, but to avoid potential issue in future */
+ if (flag == F2FS_GET_BLOCK_DIO)
+ f2fs_wait_on_block_writeback_range(inode,
+ map->m_pblk, map->m_len);
goto out;
}
next_dnode:
- if (create)
- f2fs_lock_op(sbi);
+ if (map->m_may_create)
+ __do_map_lock(sbi, flag, true);
/* When reading holes, we need its node page */
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, pgofs, mode);
+ err = f2fs_get_dnode_of_data(&dn, pgofs, mode);
if (err) {
if (flag == F2FS_GET_BLOCK_BMAP)
map->m_pblk = 0;
@@ -714,17 +1099,21 @@ next_dnode:
err = 0;
if (map->m_next_pgofs)
*map->m_next_pgofs =
- get_next_page_offset(&dn, pgofs);
+ f2fs_get_next_page_offset(&dn, pgofs);
+ if (map->m_next_extent)
+ *map->m_next_extent =
+ f2fs_get_next_page_offset(&dn, pgofs);
}
goto unlock_out;
}
+ start_pgofs = pgofs;
prealloc = 0;
last_ofs_in_node = ofs_in_node = dn.ofs_in_node;
end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
next_block:
- blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+ blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node);
if (__is_valid_data_blkaddr(blkaddr) &&
!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) {
@@ -732,7 +1121,17 @@ next_block:
goto sync_out;
}
- if (!is_valid_data_blkaddr(sbi, blkaddr)) {
+ if (is_valid_data_blkaddr(sbi, blkaddr)) {
+ /* use out-place-update for driect IO under LFS mode */
+ if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO &&
+ map->m_may_create) {
+ err = __allocate_data_block(&dn, map->m_seg_type);
+ if (!err) {
+ blkaddr = dn.data_blkaddr;
+ set_inode_flag(inode, FI_APPEND_WRITE);
+ }
+ }
+ } else {
if (create) {
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
@@ -744,29 +1143,36 @@ next_block:
last_ofs_in_node = dn.ofs_in_node;
}
} else {
- err = __allocate_data_block(&dn);
- if (!err) {
+ WARN_ON(flag != F2FS_GET_BLOCK_PRE_DIO &&
+ flag != F2FS_GET_BLOCK_DIO);
+ err = __allocate_data_block(&dn,
+ map->m_seg_type);
+ if (!err)
set_inode_flag(inode, FI_APPEND_WRITE);
- allocated = true;
- }
}
if (err)
goto sync_out;
- map->m_flags = F2FS_MAP_NEW;
+ map->m_flags |= F2FS_MAP_NEW;
blkaddr = dn.data_blkaddr;
} else {
if (flag == F2FS_GET_BLOCK_BMAP) {
map->m_pblk = 0;
goto sync_out;
}
+ if (flag == F2FS_GET_BLOCK_PRECACHE)
+ goto sync_out;
if (flag == F2FS_GET_BLOCK_FIEMAP &&
blkaddr == NULL_ADDR) {
if (map->m_next_pgofs)
*map->m_next_pgofs = pgofs + 1;
+ goto sync_out;
}
- if (flag != F2FS_GET_BLOCK_FIEMAP ||
- blkaddr != NEW_ADDR)
+ if (flag != F2FS_GET_BLOCK_FIEMAP) {
+ /* for defragment case */
+ if (map->m_next_pgofs)
+ *map->m_next_pgofs = pgofs + 1;
goto sync_out;
+ }
}
}
@@ -800,10 +1206,9 @@ skip:
(pgofs == end || dn.ofs_in_node == end_offset)) {
dn.ofs_in_node = ofs_in_node;
- err = reserve_new_blocks(&dn, prealloc);
+ err = f2fs_reserve_new_blocks(&dn, prealloc);
if (err)
goto sync_out;
- allocated = dn.node_changed;
map->m_len += dn.ofs_in_node - ofs_in_node;
if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) {
@@ -818,45 +1223,100 @@ skip:
else if (dn.ofs_in_node < end_offset)
goto next_block;
+ if (flag == F2FS_GET_BLOCK_PRECACHE) {
+ if (map->m_flags & F2FS_MAP_MAPPED) {
+ unsigned int ofs = start_pgofs - map->m_lblk;
+
+ f2fs_update_extent_cache_range(&dn,
+ start_pgofs, map->m_pblk + ofs,
+ map->m_len - ofs);
+ }
+ }
+
f2fs_put_dnode(&dn);
- if (create) {
- f2fs_unlock_op(sbi);
- f2fs_balance_fs(sbi, allocated);
+ if (map->m_may_create) {
+ __do_map_lock(sbi, flag, false);
+ f2fs_balance_fs(sbi, dn.node_changed);
}
- allocated = false;
goto next_dnode;
sync_out:
+
+ /* for hardware encryption, but to avoid potential issue in future */
+ if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED)
+ f2fs_wait_on_block_writeback_range(inode,
+ map->m_pblk, map->m_len);
+
+ if (flag == F2FS_GET_BLOCK_PRECACHE) {
+ if (map->m_flags & F2FS_MAP_MAPPED) {
+ unsigned int ofs = start_pgofs - map->m_lblk;
+
+ f2fs_update_extent_cache_range(&dn,
+ start_pgofs, map->m_pblk + ofs,
+ map->m_len - ofs);
+ }
+ if (map->m_next_extent)
+ *map->m_next_extent = pgofs + 1;
+ }
f2fs_put_dnode(&dn);
unlock_out:
- if (create) {
- f2fs_unlock_op(sbi);
- f2fs_balance_fs(sbi, allocated);
+ if (map->m_may_create) {
+ __do_map_lock(sbi, flag, false);
+ f2fs_balance_fs(sbi, dn.node_changed);
}
out:
trace_f2fs_map_blocks(inode, map, err);
return err;
}
+bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len)
+{
+ struct f2fs_map_blocks map;
+ block_t last_lblk;
+ int err;
+
+ if (pos + len > i_size_read(inode))
+ return false;
+
+ map.m_lblk = F2FS_BYTES_TO_BLK(pos);
+ map.m_next_pgofs = NULL;
+ map.m_next_extent = NULL;
+ map.m_seg_type = NO_CHECK_TYPE;
+ map.m_may_create = false;
+ last_lblk = F2FS_BLK_ALIGN(pos + len);
+
+ while (map.m_lblk < last_lblk) {
+ map.m_len = last_lblk - map.m_lblk;
+ err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT);
+ if (err || map.m_len == 0)
+ return false;
+ map.m_lblk += map.m_len;
+ }
+ return true;
+}
+
static int __get_data_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create, int flag,
- pgoff_t *next_pgofs)
+ pgoff_t *next_pgofs, int seg_type, bool may_write)
{
struct f2fs_map_blocks map;
- int ret;
+ int err;
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
map.m_next_pgofs = next_pgofs;
+ map.m_next_extent = NULL;
+ map.m_seg_type = seg_type;
+ map.m_may_create = may_write;
- ret = f2fs_map_blocks(inode, &map, create, flag);
- if (!ret) {
+ err = f2fs_map_blocks(inode, &map, create, flag);
+ if (!err) {
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags;
bh->b_size = (u64)map.m_len << inode->i_blkbits;
}
- return ret;
+ return err;
}
static int get_data_block(struct inode *inode, sector_t iblock,
@@ -864,14 +1324,26 @@ static int get_data_block(struct inode *inode, sector_t iblock,
pgoff_t *next_pgofs)
{
return __get_data_block(inode, iblock, bh_result, create,
- flag, next_pgofs);
+ flag, next_pgofs,
+ NO_CHECK_TYPE, create);
+}
+
+static int get_data_block_dio_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ return __get_data_block(inode, iblock, bh_result, create,
+ F2FS_GET_BLOCK_DIO, NULL,
+ f2fs_rw_hint_to_seg_type(inode->i_write_hint),
+ true);
}
static int get_data_block_dio(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
return __get_data_block(inode, iblock, bh_result, create,
- F2FS_GET_BLOCK_DIO, NULL);
+ F2FS_GET_BLOCK_DIO, NULL,
+ f2fs_rw_hint_to_seg_type(inode->i_write_hint),
+ false);
}
static int get_data_block_bmap(struct inode *inode, sector_t iblock,
@@ -882,7 +1354,8 @@ static int get_data_block_bmap(struct inode *inode, sector_t iblock,
return -EFBIG;
return __get_data_block(inode, iblock, bh_result, create,
- F2FS_GET_BLOCK_BMAP, NULL);
+ F2FS_GET_BLOCK_BMAP, NULL,
+ NO_CHECK_TYPE, create);
}
static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
@@ -895,35 +1368,108 @@ static inline loff_t blk_to_logical(struct inode *inode, sector_t blk)
return (blk << inode->i_blkbits);
}
+static int f2fs_xattr_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct page *page;
+ struct node_info ni;
+ __u64 phys = 0, len;
+ __u32 flags;
+ nid_t xnid = F2FS_I(inode)->i_xattr_nid;
+ int err = 0;
+
+ if (f2fs_has_inline_xattr(inode)) {
+ int offset;
+
+ page = f2fs_grab_cache_page(NODE_MAPPING(sbi),
+ inode->i_ino, false);
+ if (!page)
+ return -ENOMEM;
+
+ err = f2fs_get_node_info(sbi, inode->i_ino, &ni);
+ if (err) {
+ f2fs_put_page(page, 1);
+ return err;
+ }
+
+ phys = (__u64)blk_to_logical(inode, ni.blk_addr);
+ offset = offsetof(struct f2fs_inode, i_addr) +
+ sizeof(__le32) * (DEF_ADDRS_PER_INODE -
+ get_inline_xattr_addrs(inode));
+
+ phys += offset;
+ len = inline_xattr_size(inode);
+
+ f2fs_put_page(page, 1);
+
+ flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED;
+
+ if (!xnid)
+ flags |= FIEMAP_EXTENT_LAST;
+
+ err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags);
+ if (err || err == 1)
+ return err;
+ }
+
+ if (xnid) {
+ page = f2fs_grab_cache_page(NODE_MAPPING(sbi), xnid, false);
+ if (!page)
+ return -ENOMEM;
+
+ err = f2fs_get_node_info(sbi, xnid, &ni);
+ if (err) {
+ f2fs_put_page(page, 1);
+ return err;
+ }
+
+ phys = (__u64)blk_to_logical(inode, ni.blk_addr);
+ len = inode->i_sb->s_blocksize;
+
+ f2fs_put_page(page, 1);
+
+ flags = FIEMAP_EXTENT_LAST;
+ }
+
+ if (phys)
+ err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags);
+
+ return (err < 0 ? err : 0);
+}
+
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
struct buffer_head map_bh;
sector_t start_blk, last_blk;
pgoff_t next_pgofs;
- loff_t isize;
u64 logical = 0, phys = 0, size = 0;
u32 flags = 0;
int ret = 0;
- ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
- if (ret)
- return ret;
-
- if (f2fs_has_inline_data(inode)) {
- ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len);
- if (ret != -EAGAIN)
+ if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
+ ret = f2fs_precache_extents(inode);
+ if (ret)
return ret;
}
+ ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR);
+ if (ret)
+ return ret;
+
inode_lock(inode);
- isize = i_size_read(inode);
- if (start >= isize)
+ if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
+ ret = f2fs_xattr_fiemap(inode, fieinfo);
goto out;
+ }
- if (start + len > isize)
- len = isize - start;
+ if (f2fs_has_inline_data(inode)) {
+ ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len);
+ if (ret != -EAGAIN)
+ goto out;
+ }
if (logical_to_blk(inode, len) == 0)
len = blk_to_logical(inode, 1);
@@ -943,13 +1489,11 @@ next:
/* HOLE */
if (!buffer_mapped(&map_bh)) {
start_blk = next_pgofs;
- /* Go through holes util pass the EOF */
- if (blk_to_logical(inode, start_blk) < isize)
+
+ if (blk_to_logical(inode, start_blk) < blk_to_logical(inode,
+ F2FS_I_SB(inode)->max_file_blocks))
goto prep_next;
- /* Found a hole beyond isize means no more extents.
- * Note that the premise is that filesystems don't
- * punch holes beyond isize and keep size unchanged.
- */
+
flags |= FIEMAP_EXTENT_LAST;
}
@@ -987,50 +1531,20 @@ out:
return ret;
}
-static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
- unsigned nr_pages)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct fscrypt_ctx *ctx = NULL;
- struct block_device *bdev = sbi->sb->s_bdev;
- struct bio *bio;
-
- if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC))
- return ERR_PTR(-EFAULT);
-
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
- ctx = fscrypt_get_ctx(inode, GFP_NOFS);
- if (IS_ERR(ctx))
- return ERR_CAST(ctx);
-
- /* wait the page to be moved by cleaning */
- f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr);
- }
-
- bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES));
- if (!bio) {
- if (ctx)
- fscrypt_release_ctx(ctx);
- return ERR_PTR(-ENOMEM);
- }
- bio->bi_bdev = bdev;
- bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr);
- bio->bi_end_io = f2fs_read_end_io;
- bio->bi_private = ctx;
-
- return bio;
-}
-
/*
* This function was originally taken from fs/mpage.c, and customized for f2fs.
* Major change was from block_size == page_size in f2fs by default.
+ *
+ * Note that the aops->readpages() function is ONLY used for read-ahead. If
+ * this function ever deviates from doing just read-ahead, it should either
+ * use ->readpage() or do the necessary surgery to decouple ->readpages()
+ * from read-ahead.
*/
static int f2fs_mpage_readpages(struct address_space *mapping,
struct list_head *pages, struct page *page,
- unsigned nr_pages)
+ unsigned nr_pages, bool is_readahead)
{
struct bio *bio = NULL;
- unsigned page_idx;
sector_t last_block_in_bio = 0;
struct inode *inode = mapping->host;
const unsigned blkbits = inode->i_blkbits;
@@ -1046,12 +1560,15 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
map.m_len = 0;
map.m_flags = 0;
map.m_next_pgofs = NULL;
+ map.m_next_extent = NULL;
+ map.m_seg_type = NO_CHECK_TYPE;
+ map.m_may_create = false;
- for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
-
- prefetchw(&page->flags);
+ for (; nr_pages; nr_pages--) {
if (pages) {
- page = list_entry(pages->prev, struct page, lru);
+ page = list_last_entry(pages, struct page, lru);
+
+ prefetchw(&page->flags);
list_del(&page->lru);
if (add_to_page_cache_lru(page, mapping,
page->index,
@@ -1085,7 +1602,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
map.m_len = last_block - block_in_file;
if (f2fs_map_blocks(inode, &map, 0,
- F2FS_GET_BLOCK_READ))
+ F2FS_GET_BLOCK_DEFAULT))
goto set_error_page;
}
got_it:
@@ -1113,23 +1630,32 @@ got_it:
* This page will go to BIO. Do we need to send this
* BIO off first?
*/
- if (bio && (last_block_in_bio != block_nr - 1)) {
+ if (bio && (last_block_in_bio != block_nr - 1 ||
+ !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) {
submit_and_realloc:
- __submit_bio(F2FS_I_SB(inode), bio, DATA);
+ __f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
bio = NULL;
}
if (bio == NULL) {
- bio = f2fs_grab_bio(inode, block_nr, nr_pages);
+ bio = f2fs_grab_read_bio(inode, block_nr, nr_pages,
+ is_readahead ? REQ_RAHEAD : 0);
if (IS_ERR(bio)) {
bio = NULL;
goto set_error_page;
}
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
}
+ /*
+ * If the page is under writeback, we need to wait for
+ * its completion to see the correct decrypted data.
+ */
+ f2fs_wait_on_block_writeback(inode, block_nr);
+
if (bio_add_page(bio, page, blocksize, 0) < blocksize)
goto submit_and_realloc;
+ inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
+ ClearPageError(page);
last_block_in_bio = block_nr;
goto next_page;
set_error_page:
@@ -1139,7 +1665,7 @@ set_error_page:
goto next_page;
confused:
if (bio) {
- __submit_bio(F2FS_I_SB(inode), bio, DATA);
+ __f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
bio = NULL;
}
unlock_page(page);
@@ -1149,7 +1675,7 @@ next_page:
}
BUG_ON(pages && !list_empty(pages));
if (bio)
- __submit_bio(F2FS_I_SB(inode), bio, DATA);
+ __f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
return 0;
}
@@ -1164,7 +1690,7 @@ static int f2fs_read_data_page(struct file *file, struct page *page)
if (f2fs_has_inline_data(inode))
ret = f2fs_read_inline_data(inode, page);
if (ret == -EAGAIN)
- ret = f2fs_mpage_readpages(page->mapping, NULL, page, 1);
+ ret = f2fs_mpage_readpages(page->mapping, NULL, page, 1, false);
return ret;
}
@@ -1172,8 +1698,8 @@ static int f2fs_read_data_pages(struct file *file,
struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
- struct inode *inode = file->f_mapping->host;
- struct page *page = list_entry(pages->prev, struct page, lru);
+ struct inode *inode = mapping->host;
+ struct page *page = list_last_entry(pages, struct page, lru);
trace_f2fs_readpages(inode, page, nr_pages);
@@ -1181,54 +1707,170 @@ static int f2fs_read_data_pages(struct file *file,
if (f2fs_has_inline_data(inode))
return 0;
- return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages);
+ return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages, true);
+}
+
+static int encrypt_one_page(struct f2fs_io_info *fio)
+{
+ struct inode *inode = fio->page->mapping->host;
+ struct page *mpage;
+ gfp_t gfp_flags = GFP_NOFS;
+
+ if (!f2fs_encrypted_file(inode))
+ return 0;
+
+ /* wait for GCed page writeback via META_MAPPING */
+ f2fs_wait_on_block_writeback(inode, fio->old_blkaddr);
+
+retry_encrypt:
+ fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
+ PAGE_SIZE, 0, fio->page->index, gfp_flags);
+ if (IS_ERR(fio->encrypted_page)) {
+ /* flush pending IOs and wait for a while in the ENOMEM case */
+ if (PTR_ERR(fio->encrypted_page) == -ENOMEM) {
+ f2fs_flush_merged_writes(fio->sbi);
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ gfp_flags |= __GFP_NOFAIL;
+ goto retry_encrypt;
+ }
+ return PTR_ERR(fio->encrypted_page);
+ }
+
+ mpage = find_lock_page(META_MAPPING(fio->sbi), fio->old_blkaddr);
+ if (mpage) {
+ if (PageUptodate(mpage))
+ memcpy(page_address(mpage),
+ page_address(fio->encrypted_page), PAGE_SIZE);
+ f2fs_put_page(mpage, 1);
+ }
+ return 0;
+}
+
+static inline bool check_inplace_update_policy(struct inode *inode,
+ struct f2fs_io_info *fio)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ unsigned int policy = SM_I(sbi)->ipu_policy;
+
+ if (policy & (0x1 << F2FS_IPU_FORCE))
+ return true;
+ if (policy & (0x1 << F2FS_IPU_SSR) && f2fs_need_SSR(sbi))
+ return true;
+ if (policy & (0x1 << F2FS_IPU_UTIL) &&
+ utilization(sbi) > SM_I(sbi)->min_ipu_util)
+ return true;
+ if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && f2fs_need_SSR(sbi) &&
+ utilization(sbi) > SM_I(sbi)->min_ipu_util)
+ return true;
+
+ /*
+ * IPU for rewrite async pages
+ */
+ if (policy & (0x1 << F2FS_IPU_ASYNC) &&
+ fio && fio->op == REQ_OP_WRITE &&
+ !(fio->op_flags & REQ_SYNC) &&
+ !f2fs_encrypted_inode(inode))
+ return true;
+
+ /* this is only set during fdatasync */
+ if (policy & (0x1 << F2FS_IPU_FSYNC) &&
+ is_inode_flag_set(inode, FI_NEED_IPU))
+ return true;
+
+ if (unlikely(fio && is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
+ !f2fs_is_checkpointed_data(sbi, fio->old_blkaddr)))
+ return true;
+
+ return false;
+}
+
+bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio)
+{
+ if (f2fs_is_pinned_file(inode))
+ return true;
+
+ /* if this is cold file, we should overwrite to avoid fragmentation */
+ if (file_is_cold(inode))
+ return true;
+
+ return check_inplace_update_policy(inode, fio);
+}
+
+bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (test_opt(sbi, LFS))
+ return true;
+ if (S_ISDIR(inode->i_mode))
+ return true;
+ if (IS_NOQUOTA(inode))
+ return true;
+ if (f2fs_is_atomic_file(inode))
+ return true;
+ if (fio) {
+ if (is_cold_data(fio->page))
+ return true;
+ if (IS_ATOMIC_WRITTEN_PAGE(fio->page))
+ return true;
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
+ f2fs_is_checkpointed_data(sbi, fio->old_blkaddr)))
+ return true;
+ }
+ return false;
+}
+
+static inline bool need_inplace_update(struct f2fs_io_info *fio)
+{
+ struct inode *inode = fio->page->mapping->host;
+
+ if (f2fs_should_update_outplace(inode, fio))
+ return false;
+
+ return f2fs_should_update_inplace(inode, fio);
}
-int do_write_data_page(struct f2fs_io_info *fio)
+int f2fs_do_write_data_page(struct f2fs_io_info *fio)
{
struct page *page = fio->page;
struct inode *inode = page->mapping->host;
struct dnode_of_data dn;
+ struct extent_info ei = {0,0,0};
+ struct node_info ni;
+ bool ipu_force = false;
int err = 0;
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
+ if (need_inplace_update(fio) &&
+ f2fs_lookup_extent_cache(inode, page->index, &ei)) {
+ fio->old_blkaddr = ei.blk + page->index - ei.fofs;
+
+ if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
+ DATA_GENERIC))
+ return -EFAULT;
+
+ ipu_force = true;
+ fio->need_lock = LOCK_DONE;
+ goto got_it;
+ }
+
+ /* Deadlock due to between page->lock and f2fs_lock_op */
+ if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi))
+ return -EAGAIN;
+
+ err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
if (err)
- return err;
+ goto out;
fio->old_blkaddr = dn.data_blkaddr;
/* This page is already truncated */
if (fio->old_blkaddr == NULL_ADDR) {
ClearPageUptodate(page);
+ clear_cold_data(page);
goto out_writepage;
}
-
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
- gfp_t gfp_flags = GFP_NOFS;
-
- /* wait for GCed encrypted page writeback */
- f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode),
- fio->old_blkaddr);
-retry_encrypt:
- fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
- gfp_flags);
- if (IS_ERR(fio->encrypted_page)) {
- err = PTR_ERR(fio->encrypted_page);
- if (err == -ENOMEM) {
- /* flush pending ios and wait for a while */
- f2fs_flush_merged_bios(F2FS_I_SB(inode));
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- gfp_flags |= __GFP_NOFAIL;
- err = 0;
- goto retry_encrypt;
- }
- goto out_writepage;
- }
- }
-
- set_page_writeback(page);
-
+got_it:
if (__is_valid_data_blkaddr(fio->old_blkaddr) &&
!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
DATA_GENERIC)) {
@@ -1239,27 +1881,63 @@ retry_encrypt:
* If current allocation needs SSR,
* it had better in-place writes for updated data.
*/
- if (unlikely(is_valid_data_blkaddr(fio->sbi, fio->old_blkaddr) &&
- !is_cold_data(page) &&
- !IS_ATOMIC_WRITTEN_PAGE(page) &&
- need_inplace_update(inode))) {
- rewrite_data_page(fio);
+ if (ipu_force || (is_valid_data_blkaddr(fio->sbi, fio->old_blkaddr) &&
+ need_inplace_update(fio))) {
+ err = encrypt_one_page(fio);
+ if (err)
+ goto out_writepage;
+
+ set_page_writeback(page);
+ ClearPageError(page);
+ f2fs_put_dnode(&dn);
+ if (fio->need_lock == LOCK_REQ)
+ f2fs_unlock_op(fio->sbi);
+ err = f2fs_inplace_write_data(fio);
+ if (err && PageWriteback(page))
+ end_page_writeback(page);
+ trace_f2fs_do_write_data_page(fio->page, IPU);
set_inode_flag(inode, FI_UPDATE_WRITE);
- trace_f2fs_do_write_data_page(page, IPU);
- } else {
- write_data_page(&dn, fio);
- trace_f2fs_do_write_data_page(page, OPU);
- set_inode_flag(inode, FI_APPEND_WRITE);
- if (page->index == 0)
- set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
+ return err;
+ }
+
+ if (fio->need_lock == LOCK_RETRY) {
+ if (!f2fs_trylock_op(fio->sbi)) {
+ err = -EAGAIN;
+ goto out_writepage;
+ }
+ fio->need_lock = LOCK_REQ;
}
+
+ err = f2fs_get_node_info(fio->sbi, dn.nid, &ni);
+ if (err)
+ goto out_writepage;
+
+ fio->version = ni.version;
+
+ err = encrypt_one_page(fio);
+ if (err)
+ goto out_writepage;
+
+ set_page_writeback(page);
+ ClearPageError(page);
+
+ /* LFS mode write path */
+ f2fs_outplace_write_data(&dn, fio);
+ trace_f2fs_do_write_data_page(page, OPU);
+ set_inode_flag(inode, FI_APPEND_WRITE);
+ if (page->index == 0)
+ set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
out_writepage:
f2fs_put_dnode(&dn);
+out:
+ if (fio->need_lock == LOCK_REQ)
+ f2fs_unlock_op(fio->sbi);
return err;
}
-static int f2fs_write_data_page(struct page *page,
- struct writeback_control *wbc)
+static int __write_data_page(struct page *page, bool *submitted,
+ struct writeback_control *wbc,
+ enum iostat_type io_type)
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -1272,15 +1950,36 @@ static int f2fs_write_data_page(struct page *page,
int err = 0;
struct f2fs_io_info fio = {
.sbi = sbi,
+ .ino = inode->i_ino,
.type = DATA,
.op = REQ_OP_WRITE,
- .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
+ .op_flags = wbc_to_write_flags(wbc),
+ .old_blkaddr = NULL_ADDR,
.page = page,
.encrypted_page = NULL,
+ .submitted = false,
+ .need_lock = LOCK_RETRY,
+ .io_type = io_type,
+ .io_wbc = wbc,
};
trace_f2fs_writepage(page, DATA);
+ /* we should bypass data pages to proceed the kworkder jobs */
+ if (unlikely(f2fs_cp_error(sbi))) {
+ mapping_set_error(page->mapping, -EIO);
+ /*
+ * don't drop any dirty dentry pages for keeping lastest
+ * directory structure.
+ */
+ if (S_ISDIR(inode->i_mode))
+ goto redirty_out;
+ goto out;
+ }
+
+ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+ goto redirty_out;
+
if (page->index < end_index)
goto write;
@@ -1294,25 +1993,18 @@ static int f2fs_write_data_page(struct page *page,
zero_user_segment(page, offset, PAGE_SIZE);
write:
- if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
- goto redirty_out;
if (f2fs_is_drop_cache(inode))
goto out;
/* we should not write 0'th page having journal header */
if (f2fs_is_volatile_file(inode) && (!page->index ||
(!wbc->for_reclaim &&
- available_free_memory(sbi, BASE_CHECK))))
+ f2fs_available_free_memory(sbi, BASE_CHECK))))
goto redirty_out;
- /* we should bypass data pages to proceed the kworkder jobs */
- if (unlikely(f2fs_cp_error(sbi))) {
- mapping_set_error(page->mapping, -EIO);
- goto out;
- }
-
/* Dentry blocks are controlled by checkpoint */
if (S_ISDIR(inode->i_mode)) {
- err = do_write_data_page(&fio);
+ fio.need_lock = LOCK_DONE;
+ err = f2fs_do_write_data_page(&fio);
goto done;
}
@@ -1320,56 +2012,98 @@ write:
need_balance_fs = true;
else if (has_not_enough_free_secs(sbi, 0, 0))
goto redirty_out;
+ else
+ set_inode_flag(inode, FI_HOT_DATA);
err = -EAGAIN;
- f2fs_lock_op(sbi);
- if (f2fs_has_inline_data(inode))
+ if (f2fs_has_inline_data(inode)) {
err = f2fs_write_inline_data(inode, page);
- if (err == -EAGAIN)
- err = do_write_data_page(&fio);
- if (F2FS_I(inode)->last_disk_size < psize)
- F2FS_I(inode)->last_disk_size = psize;
- f2fs_unlock_op(sbi);
+ if (!err)
+ goto out;
+ }
+
+ if (err == -EAGAIN) {
+ err = f2fs_do_write_data_page(&fio);
+ if (err == -EAGAIN) {
+ fio.need_lock = LOCK_REQ;
+ err = f2fs_do_write_data_page(&fio);
+ }
+ }
+
+ if (err) {
+ file_set_keep_isize(inode);
+ } else {
+ down_write(&F2FS_I(inode)->i_sem);
+ if (F2FS_I(inode)->last_disk_size < psize)
+ F2FS_I(inode)->last_disk_size = psize;
+ up_write(&F2FS_I(inode)->i_sem);
+ }
+
done:
if (err && err != -ENOENT)
goto redirty_out;
- clear_cold_data(page);
out:
inode_dec_dirty_pages(inode);
- if (err)
+ if (err) {
ClearPageUptodate(page);
+ clear_cold_data(page);
+ }
if (wbc->for_reclaim) {
- f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE);
- remove_dirty_inode(inode);
+ f2fs_submit_merged_write_cond(sbi, NULL, page, 0, DATA);
+ clear_inode_flag(inode, FI_HOT_DATA);
+ f2fs_remove_dirty_inode(inode);
+ submitted = NULL;
}
unlock_page(page);
- f2fs_balance_fs(sbi, need_balance_fs);
+ if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode))
+ f2fs_balance_fs(sbi, need_balance_fs);
- if (unlikely(f2fs_cp_error(sbi)))
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ if (unlikely(f2fs_cp_error(sbi))) {
+ f2fs_submit_merged_write(sbi, DATA);
+ submitted = NULL;
+ }
+
+ if (submitted)
+ *submitted = fio.submitted;
return 0;
redirty_out:
redirty_page_for_writepage(wbc, page);
+ /*
+ * pageout() in MM traslates EAGAIN, so calls handle_write_error()
+ * -> mapping_set_error() -> set_bit(AS_EIO, ...).
+ * file_write_and_wait_range() will see EIO error, which is critical
+ * to return value of fsync() followed by atomic_write failure to user.
+ */
+ if (!err || wbc->for_reclaim)
+ return AOP_WRITEPAGE_ACTIVATE;
unlock_page(page);
return err;
}
+static int f2fs_write_data_page(struct page *page,
+ struct writeback_control *wbc)
+{
+ return __write_data_page(page, NULL, wbc, FS_DATA_IO);
+}
+
/*
* This function was copied from write_cche_pages from mm/page-writeback.c.
* The major change is making write step of cold data page separately from
* warm/hot data page.
*/
static int f2fs_write_cache_pages(struct address_space *mapping,
- struct writeback_control *wbc)
+ struct writeback_control *wbc,
+ enum iostat_type io_type)
{
int ret = 0;
int done = 0;
struct pagevec pvec;
+ struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
int nr_pages;
pgoff_t uninitialized_var(writeback_index);
pgoff_t index;
@@ -1382,6 +2116,12 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
pagevec_init(&pvec, 0);
+ if (get_dirty_pages(mapping->host) <=
+ SM_I(F2FS_M_SB(mapping))->min_hot_blocks)
+ set_inode_flag(mapping->host, FI_HOT_DATA);
+ else
+ clear_inode_flag(mapping->host, FI_HOT_DATA);
+
if (wbc->range_cyclic) {
writeback_index = mapping->writeback_index; /* prev offset */
index = writeback_index;
@@ -1408,21 +2148,24 @@ retry:
while (!done && (index <= end)) {
int i;
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1);
+ nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
+ tag);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+ bool submitted = false;
- if (page->index > end) {
+ /* give a priority to WB_SYNC threads */
+ if (atomic_read(&sbi->wb_sync_req[DATA]) &&
+ wbc->sync_mode == WB_SYNC_NONE) {
done = 1;
break;
}
done_index = page->index;
-
+retry_write:
lock_page(page);
if (unlikely(page->mapping != mapping)) {
@@ -1439,26 +2182,43 @@ continue_unlock:
if (PageWriteback(page)) {
if (wbc->sync_mode != WB_SYNC_NONE)
f2fs_wait_on_page_writeback(page,
- DATA, true);
+ DATA, true, true);
else
goto continue_unlock;
}
- BUG_ON(PageWriteback(page));
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
- ret = mapping->a_ops->writepage(page, wbc);
+ ret = __write_data_page(page, &submitted, wbc, io_type);
if (unlikely(ret)) {
+ /*
+ * keep nr_to_write, since vfs uses this to
+ * get # of written pages.
+ */
+ if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ unlock_page(page);
+ ret = 0;
+ continue;
+ } else if (ret == -EAGAIN) {
+ ret = 0;
+ if (wbc->sync_mode == WB_SYNC_ALL) {
+ cond_resched();
+ congestion_wait(BLK_RW_ASYNC,
+ HZ/50);
+ goto retry_write;
+ }
+ continue;
+ }
done_index = page->index + 1;
done = 1;
break;
- } else {
+ } else if (submitted) {
nwritten++;
}
if (--wbc->nr_to_write <= 0 &&
- wbc->sync_mode == WB_SYNC_NONE) {
+ wbc->sync_mode == WB_SYNC_NONE) {
done = 1;
break;
}
@@ -1477,19 +2237,35 @@ continue_unlock:
mapping->writeback_index = done_index;
if (nwritten)
- f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host,
- NULL, 0, DATA, WRITE);
+ f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host,
+ NULL, 0, DATA);
return ret;
}
-static int f2fs_write_data_pages(struct address_space *mapping,
- struct writeback_control *wbc)
+static inline bool __should_serialize_io(struct inode *inode,
+ struct writeback_control *wbc)
+{
+ if (!S_ISREG(inode->i_mode))
+ return false;
+ if (IS_NOQUOTA(inode))
+ return false;
+ if (wbc->sync_mode != WB_SYNC_ALL)
+ return true;
+ if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks)
+ return true;
+ return false;
+}
+
+static int __f2fs_write_data_pages(struct address_space *mapping,
+ struct writeback_control *wbc,
+ enum iostat_type io_type)
{
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct blk_plug plug;
int ret;
+ bool locked = false;
/* deal with chardevs and other special file */
if (!mapping->a_ops->writepage)
@@ -1499,30 +2275,48 @@ static int f2fs_write_data_pages(struct address_space *mapping,
if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE)
return 0;
- if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
+ /* during POR, we don't need to trigger writepage at all. */
+ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+ goto skip_write;
+
+ if ((S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) &&
+ wbc->sync_mode == WB_SYNC_NONE &&
get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
- available_free_memory(sbi, DIRTY_DENTS))
+ f2fs_available_free_memory(sbi, DIRTY_DENTS))
goto skip_write;
/* skip writing during file defragment */
if (is_inode_flag_set(inode, FI_DO_DEFRAG))
goto skip_write;
- /* during POR, we don't need to trigger writepage at all. */
- if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+ trace_f2fs_writepages(mapping->host, wbc, DATA);
+
+ /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ atomic_inc(&sbi->wb_sync_req[DATA]);
+ else if (atomic_read(&sbi->wb_sync_req[DATA]))
goto skip_write;
- trace_f2fs_writepages(mapping->host, wbc, DATA);
+ if (__should_serialize_io(inode, wbc)) {
+ mutex_lock(&sbi->writepages);
+ locked = true;
+ }
blk_start_plug(&plug);
- ret = f2fs_write_cache_pages(mapping, wbc);
+ ret = f2fs_write_cache_pages(mapping, wbc, io_type);
blk_finish_plug(&plug);
+
+ if (locked)
+ mutex_unlock(&sbi->writepages);
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ atomic_dec(&sbi->wb_sync_req[DATA]);
/*
* if some pages were truncated, we cannot guarantee its mapping->host
* to detect pending bios.
*/
- remove_dirty_inode(inode);
+ f2fs_remove_dirty_inode(inode);
return ret;
skip_write:
@@ -1531,14 +2325,30 @@ skip_write:
return 0;
}
+static int f2fs_write_data_pages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+
+ return __f2fs_write_data_pages(mapping, wbc,
+ F2FS_I(inode)->cp_task == current ?
+ FS_CP_DATA_IO : FS_DATA_IO);
+}
+
static void f2fs_write_failed(struct address_space *mapping, loff_t to)
{
struct inode *inode = mapping->host;
loff_t i_size = i_size_read(inode);
if (to > i_size) {
+ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ down_write(&F2FS_I(inode)->i_mmap_sem);
+
truncate_pagecache(inode, i_size);
- truncate_blocks(inode, i_size, true);
+ f2fs_truncate_blocks(inode, i_size, true, true);
+
+ up_write(&F2FS_I(inode)->i_mmap_sem);
+ up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
}
}
@@ -1551,24 +2361,32 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
struct dnode_of_data dn;
struct page *ipage;
bool locked = false;
- struct extent_info ei;
+ struct extent_info ei = {0,0,0};
int err = 0;
+ int flag;
/*
* we already allocated all the blocks, so we don't need to get
* the block addresses when there is no need to fill the page.
*/
- if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE)
+ if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE &&
+ !is_inode_flag_set(inode, FI_NO_PREALLOC))
return 0;
+ /* f2fs_lock_op avoids race between write CP and convert_inline_page */
+ if (f2fs_has_inline_data(inode) && pos + len > MAX_INLINE_DATA(inode))
+ flag = F2FS_GET_BLOCK_DEFAULT;
+ else
+ flag = F2FS_GET_BLOCK_PRE_AIO;
+
if (f2fs_has_inline_data(inode) ||
(pos & PAGE_MASK) >= i_size_read(inode)) {
- f2fs_lock_op(sbi);
+ __do_map_lock(sbi, flag, true);
locked = true;
}
restart:
/* check inline_data */
- ipage = get_node_page(sbi, inode->i_ino);
+ ipage = f2fs_get_node_page(sbi, inode->i_ino);
if (IS_ERR(ipage)) {
err = PTR_ERR(ipage);
goto unlock_out;
@@ -1577,8 +2395,8 @@ restart:
set_new_dnode(&dn, inode, ipage, ipage, 0);
if (f2fs_has_inline_data(inode)) {
- if (pos + len <= MAX_INLINE_DATA) {
- read_inline_data(page, ipage);
+ if (pos + len <= MAX_INLINE_DATA(inode)) {
+ f2fs_do_read_inline_data(page, ipage);
set_inode_flag(inode, FI_DATA_EXIST);
if (inode->i_nlink)
set_inline_node(ipage);
@@ -1596,10 +2414,12 @@ restart:
dn.data_blkaddr = ei.blk + index - ei.fofs;
} else {
/* hole case */
- err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
+ err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
if (err || dn.data_blkaddr == NULL_ADDR) {
f2fs_put_dnode(&dn);
- f2fs_lock_op(sbi);
+ __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO,
+ true);
+ WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO);
locked = true;
goto restart;
}
@@ -1613,7 +2433,7 @@ out:
f2fs_put_dnode(&dn);
unlock_out:
if (locked)
- f2fs_unlock_op(sbi);
+ __do_map_lock(sbi, flag, false);
return err;
}
@@ -1625,12 +2445,34 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *page = NULL;
pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT;
- bool need_balance = false;
+ bool need_balance = false, drop_atomic = false;
block_t blkaddr = NULL_ADDR;
int err = 0;
+ if (trace_android_fs_datawrite_start_enabled()) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_datawrite_start(inode, pos, len,
+ current->pid, path,
+ current->comm);
+ }
trace_f2fs_write_begin(inode, pos, len, flags);
+ err = f2fs_is_checkpoint_ready(sbi);
+ if (err)
+ goto fail;
+
+ if ((f2fs_is_atomic_file(inode) &&
+ !f2fs_available_free_memory(sbi, INMEM_PAGES)) ||
+ is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) {
+ err = -ENOMEM;
+ drop_atomic = true;
+ goto fail;
+ }
+
/*
* We should check this at this moment to avoid deadlock on inode page
* and #0 page. The locking rule for inline_data conversion should be:
@@ -1646,7 +2488,7 @@ repeat:
* Do not use grab_cache_page_write_begin() to avoid deadlock due to
* wait_for_stable_page. Will wait that below with our IO control.
*/
- page = pagecache_get_page(mapping, index,
+ page = f2fs_pagecache_get_page(mapping, index,
FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS);
if (!page) {
err = -ENOMEM;
@@ -1660,7 +2502,8 @@ repeat:
if (err)
goto fail;
- if (need_balance && has_not_enough_free_secs(sbi, 0, 0)) {
+ if (need_balance && !IS_NOQUOTA(inode) &&
+ has_not_enough_free_secs(sbi, 0, 0)) {
unlock_page(page);
f2fs_balance_fs(sbi, true);
lock_page(page);
@@ -1671,34 +2514,23 @@ repeat:
}
}
- f2fs_wait_on_page_writeback(page, DATA, false);
-
- /* wait for GCed encrypted page writeback */
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
- f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr);
+ f2fs_wait_on_page_writeback(page, DATA, false, true);
if (len == PAGE_SIZE || PageUptodate(page))
return 0;
+ if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode)) {
+ zero_user_segment(page, len, PAGE_SIZE);
+ return 0;
+ }
+
if (blkaddr == NEW_ADDR) {
zero_user_segment(page, 0, PAGE_SIZE);
SetPageUptodate(page);
} else {
- struct bio *bio;
-
- bio = f2fs_grab_bio(inode, blkaddr, 1);
- if (IS_ERR(bio)) {
- err = PTR_ERR(bio);
- goto fail;
- }
- bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC);
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
- bio_put(bio);
- err = -EFAULT;
+ err = f2fs_submit_page_read(inode, page, blkaddr);
+ if (err)
goto fail;
- }
-
- __submit_bio(sbi, bio, DATA);
lock_page(page);
if (unlikely(page->mapping != mapping)) {
@@ -1715,6 +2547,8 @@ repeat:
fail:
f2fs_put_page(page, 1);
f2fs_write_failed(mapping, pos + len);
+ if (drop_atomic)
+ f2fs_drop_inmem_pages_all(sbi, false);
return err;
}
@@ -1725,6 +2559,7 @@ static int f2fs_write_end(struct file *file,
{
struct inode *inode = page->mapping->host;
+ trace_android_fs_datawrite_end(inode, pos, len);
trace_f2fs_write_end(inode, pos, len, copied);
/*
@@ -1733,7 +2568,7 @@ static int f2fs_write_end(struct file *file,
* let generic_perform_write() try to copy data again through copied=0.
*/
if (!PageUptodate(page)) {
- if (unlikely(copied != PAGE_SIZE))
+ if (unlikely(copied != len))
copied = 0;
else
SetPageUptodate(page);
@@ -1742,7 +2577,6 @@ static int f2fs_write_end(struct file *file,
goto unlock_out;
set_page_dirty(page);
- clear_cold_data(page);
if (pos + copied > i_size_read(inode))
f2fs_i_size_write(inode, pos + copied);
@@ -1755,47 +2589,167 @@ unlock_out:
static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
loff_t offset)
{
- unsigned blocksize_mask = inode->i_sb->s_blocksize - 1;
+ unsigned i_blkbits = READ_ONCE(inode->i_blkbits);
+ unsigned blkbits = i_blkbits;
+ unsigned blocksize_mask = (1 << blkbits) - 1;
+ unsigned long align = offset | iov_iter_alignment(iter);
+ struct block_device *bdev = inode->i_sb->s_bdev;
+
+ if (align & blocksize_mask) {
+ if (bdev)
+ blkbits = blksize_bits(bdev_logical_block_size(bdev));
+ blocksize_mask = (1 << blkbits) - 1;
+ if (align & blocksize_mask)
+ return -EINVAL;
+ return 1;
+ }
+ return 0;
+}
- if (offset & blocksize_mask)
- return -EINVAL;
+static void f2fs_dio_end_io(struct bio *bio)
+{
+ struct f2fs_private_dio *dio = bio->bi_private;
- if (iov_iter_alignment(iter) & blocksize_mask)
- return -EINVAL;
+ dec_page_count(F2FS_I_SB(dio->inode),
+ dio->write ? F2FS_DIO_WRITE : F2FS_DIO_READ);
- return 0;
+ bio->bi_private = dio->orig_private;
+ bio->bi_end_io = dio->orig_end_io;
+
+ kvfree(dio);
+
+ bio_endio(bio);
+}
+
+static void f2fs_dio_submit_bio(struct bio *bio, struct inode *inode,
+ loff_t file_offset)
+{
+ struct f2fs_private_dio *dio;
+ bool write = (bio_op(bio) == REQ_OP_WRITE);
+ int err;
+
+ dio = f2fs_kzalloc(F2FS_I_SB(inode),
+ sizeof(struct f2fs_private_dio), GFP_NOFS);
+ if (!dio) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ dio->inode = inode;
+ dio->orig_end_io = bio->bi_end_io;
+ dio->orig_private = bio->bi_private;
+ dio->write = write;
+
+ bio->bi_end_io = f2fs_dio_end_io;
+ bio->bi_private = dio;
+
+ inc_page_count(F2FS_I_SB(inode),
+ write ? F2FS_DIO_WRITE : F2FS_DIO_READ);
+
+ submit_bio(bio);
+ return;
+out:
+ bio->bi_error = -EIO;
+ bio_endio(bio);
}
static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
size_t count = iov_iter_count(iter);
loff_t offset = iocb->ki_pos;
int rw = iov_iter_rw(iter);
int err;
+ enum rw_hint hint = iocb->ki_hint;
+ int whint_mode = F2FS_OPTION(sbi).whint_mode;
+ bool do_opu;
err = check_direct_IO(inode, iter, offset);
if (err)
- return err;
+ return err < 0 ? err : 0;
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
- return 0;
- if (test_opt(F2FS_I_SB(inode), LFS))
+ if (f2fs_force_buffered_io(inode, iocb, iter))
return 0;
+ do_opu = allow_outplace_dio(inode, iocb, iter);
+
trace_f2fs_direct_IO_enter(inode, offset, count, rw);
- down_read(&F2FS_I(inode)->dio_rwsem[rw]);
- err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio);
- up_read(&F2FS_I(inode)->dio_rwsem[rw]);
+ if (trace_android_fs_dataread_start_enabled() &&
+ (rw == READ)) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_dataread_start(inode, offset,
+ count, current->pid, path,
+ current->comm);
+ }
+ if (trace_android_fs_datawrite_start_enabled() &&
+ (rw == WRITE)) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_datawrite_start(inode, offset, count,
+ current->pid, path,
+ current->comm);
+ }
+ if (rw == WRITE && whint_mode == WHINT_MODE_OFF)
+ iocb->ki_hint = WRITE_LIFE_NOT_SET;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!down_read_trylock(&fi->i_gc_rwsem[rw])) {
+ iocb->ki_hint = hint;
+ err = -EAGAIN;
+ goto out;
+ }
+ if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) {
+ up_read(&fi->i_gc_rwsem[rw]);
+ iocb->ki_hint = hint;
+ err = -EAGAIN;
+ goto out;
+ }
+ } else {
+ down_read(&fi->i_gc_rwsem[rw]);
+ if (do_opu)
+ down_read(&fi->i_gc_rwsem[READ]);
+ }
+
+ err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+ iter, rw == WRITE ? get_data_block_dio_write :
+ get_data_block_dio, NULL, f2fs_dio_submit_bio,
+ DIO_LOCKING | DIO_SKIP_HOLES);
+
+ if (do_opu)
+ up_read(&fi->i_gc_rwsem[READ]);
+
+ up_read(&fi->i_gc_rwsem[rw]);
if (rw == WRITE) {
- if (err > 0)
- set_inode_flag(inode, FI_UPDATE_WRITE);
- else if (err < 0)
+ if (whint_mode == WHINT_MODE_OFF)
+ iocb->ki_hint = hint;
+ if (err > 0) {
+ f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO,
+ err);
+ if (!do_opu)
+ set_inode_flag(inode, FI_UPDATE_WRITE);
+ } else if (err < 0) {
f2fs_write_failed(mapping, offset + count);
+ }
}
+out:
+ if (trace_android_fs_dataread_start_enabled() &&
+ (rw == READ))
+ trace_android_fs_dataread_end(inode, offset, count);
+ if (trace_android_fs_datawrite_start_enabled() &&
+ (rw == WRITE))
+ trace_android_fs_datawrite_end(inode, offset, count);
trace_f2fs_direct_IO_exit(inode, offset, count, rw, err);
@@ -1813,17 +2767,21 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
return;
if (PageDirty(page)) {
- if (inode->i_ino == F2FS_META_INO(sbi))
+ if (inode->i_ino == F2FS_META_INO(sbi)) {
dec_page_count(sbi, F2FS_DIRTY_META);
- else if (inode->i_ino == F2FS_NODE_INO(sbi))
+ } else if (inode->i_ino == F2FS_NODE_INO(sbi)) {
dec_page_count(sbi, F2FS_DIRTY_NODES);
- else
+ } else {
inode_dec_dirty_pages(inode);
+ f2fs_remove_dirty_inode(inode);
+ }
}
+ clear_cold_data(page);
+
/* This is atomic written page, keep Private */
if (IS_ATOMIC_WRITTEN_PAGE(page))
- return;
+ return f2fs_drop_inmem_page(inode, page);
set_page_private(page, 0);
ClearPagePrivate(page);
@@ -1839,40 +2797,12 @@ int f2fs_release_page(struct page *page, gfp_t wait)
if (IS_ATOMIC_WRITTEN_PAGE(page))
return 0;
+ clear_cold_data(page);
set_page_private(page, 0);
ClearPagePrivate(page);
return 1;
}
-/*
- * This was copied from __set_page_dirty_buffers which gives higher performance
- * in very high speed storages. (e.g., pmem)
- */
-void f2fs_set_page_dirty_nobuffers(struct page *page)
-{
- struct address_space *mapping = page->mapping;
- unsigned long flags;
-
- if (unlikely(!mapping))
- return;
-
- spin_lock(&mapping->private_lock);
- lock_page_memcg(page);
- SetPageDirty(page);
- spin_unlock(&mapping->private_lock);
-
- spin_lock_irqsave(&mapping->tree_lock, flags);
- WARN_ON_ONCE(!PageUptodate(page));
- account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->page_tree,
- page_index(page), PAGECACHE_TAG_DIRTY);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
- unlock_page_memcg(page);
-
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- return;
-}
-
static int f2fs_set_data_page_dirty(struct page *page)
{
struct address_space *mapping = page->mapping;
@@ -1883,9 +2813,9 @@ static int f2fs_set_data_page_dirty(struct page *page)
if (!PageUptodate(page))
SetPageUptodate(page);
- if (f2fs_is_atomic_file(inode)) {
+ if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) {
if (!IS_ATOMIC_WRITTEN_PAGE(page)) {
- register_inmem_page(inode, page);
+ f2fs_register_inmem_page(inode, page);
return 1;
}
/*
@@ -1896,8 +2826,8 @@ static int f2fs_set_data_page_dirty(struct page *page)
}
if (!PageDirty(page)) {
- f2fs_set_page_dirty_nobuffers(page);
- update_dirty_page(inode, page);
+ __set_page_dirty_nobuffers(page);
+ f2fs_update_dirty_page(inode, page);
return 1;
}
return 0;
@@ -1930,8 +2860,12 @@ int f2fs_migrate_page(struct address_space *mapping,
BUG_ON(PageWriteback(page));
/* migrating an atomic written page is safe with the inmem_lock hold */
- if (atomic_written && !mutex_trylock(&fi->inmem_lock))
- return -EAGAIN;
+ if (atomic_written) {
+ if (mode != MIGRATE_SYNC)
+ return -EBUSY;
+ if (!mutex_trylock(&fi->inmem_lock))
+ return -EAGAIN;
+ }
/*
* A reference is expected if PagePrivate set when move mapping,
@@ -1985,3 +2919,38 @@ const struct address_space_operations f2fs_dblock_aops = {
.migratepage = f2fs_migrate_page,
#endif
};
+
+void f2fs_clear_radix_tree_dirty_tag(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+ unsigned long flags;
+
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ radix_tree_tag_clear(&mapping->page_tree, page_index(page),
+ PAGECACHE_TAG_DIRTY);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+}
+
+int __init f2fs_init_post_read_processing(void)
+{
+ bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, 0);
+ if (!bio_post_read_ctx_cache)
+ goto fail;
+ bio_post_read_ctx_pool =
+ mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS,
+ bio_post_read_ctx_cache);
+ if (!bio_post_read_ctx_pool)
+ goto fail_free_cache;
+ return 0;
+
+fail_free_cache:
+ kmem_cache_destroy(bio_post_read_ctx_cache);
+fail:
+ return -ENOMEM;
+}
+
+void __exit f2fs_destroy_post_read_processing(void)
+{
+ mempool_destroy(bio_post_read_ctx_pool);
+ kmem_cache_destroy(bio_post_read_ctx_cache);
+}
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 687998e9557c..f05b37ef7182 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* f2fs debugging statistics
*
@@ -5,10 +6,6 @@
* http://www.samsung.com/
* Copyright (c) 2012 Linux Foundation
* Copyright (c) 2012 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
@@ -45,12 +42,41 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA);
+ si->ndirty_qdata = get_pages(sbi, F2FS_DIRTY_QDATA);
si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA);
si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE];
si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
+ si->nquota_files = sbi->nquota_files;
si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
- si->wb_bios = atomic_read(&sbi->nr_wb_bios);
+ si->aw_cnt = atomic_read(&sbi->aw_cnt);
+ si->vw_cnt = atomic_read(&sbi->vw_cnt);
+ si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt);
+ si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt);
+ si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ);
+ si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE);
+ si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA);
+ si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA);
+ si->nr_rd_data = get_pages(sbi, F2FS_RD_DATA);
+ si->nr_rd_node = get_pages(sbi, F2FS_RD_NODE);
+ si->nr_rd_meta = get_pages(sbi, F2FS_RD_META);
+ if (SM_I(sbi) && SM_I(sbi)->fcc_info) {
+ si->nr_flushed =
+ atomic_read(&SM_I(sbi)->fcc_info->issued_flush);
+ si->nr_flushing =
+ atomic_read(&SM_I(sbi)->fcc_info->queued_flush);
+ si->flush_list_empty =
+ llist_empty(&SM_I(sbi)->fcc_info->issue_list);
+ }
+ if (SM_I(sbi) && SM_I(sbi)->dcc_info) {
+ si->nr_discarded =
+ atomic_read(&SM_I(sbi)->dcc_info->issued_discard);
+ si->nr_discarding =
+ atomic_read(&SM_I(sbi)->dcc_info->queued_discard);
+ si->nr_discard_cmd =
+ atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt);
+ si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks;
+ }
si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
si->rsvd_segs = reserved_segments(sbi);
si->overp_segs = overprovision_segments(sbi);
@@ -61,6 +87,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->inline_xattr = atomic_read(&sbi->inline_xattr);
si->inline_inode = atomic_read(&sbi->inline_inode);
si->inline_dir = atomic_read(&sbi->inline_dir);
+ si->append = sbi->im[APPEND_INO].ino_num;
+ si->update = sbi->im[UPDATE_INO].ino_num;
si->orphans = sbi->im[ORPHAN_INO].ino_num;
si->utilization = utilization(sbi);
@@ -68,14 +96,22 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->free_secs = free_sections(sbi);
si->prefree_count = prefree_segments(sbi);
si->dirty_count = dirty_segments(sbi);
- si->node_pages = NODE_MAPPING(sbi)->nrpages;
- si->meta_pages = META_MAPPING(sbi)->nrpages;
+ if (sbi->node_inode)
+ si->node_pages = NODE_MAPPING(sbi)->nrpages;
+ if (sbi->meta_inode)
+ si->meta_pages = META_MAPPING(sbi)->nrpages;
si->nats = NM_I(sbi)->nat_cnt;
si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
si->sits = MAIN_SEGS(sbi);
si->dirty_sits = SIT_I(sbi)->dirty_sentries;
- si->fnids = NM_I(sbi)->fcnt;
+ si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID];
+ si->avail_nids = NM_I(sbi)->available_nids;
+ si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID];
si->bg_gc = sbi->bg_gc;
+ si->io_skip_bggc = sbi->io_skip_bggc;
+ si->other_skip_bggc = sbi->other_skip_bggc;
+ si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC];
+ si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC];
si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
/ 2;
@@ -87,10 +123,13 @@ static void update_general_status(struct f2fs_sb_info *sbi)
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) {
struct curseg_info *curseg = CURSEG_I(sbi, i);
si->curseg[i] = curseg->segno;
- si->cursec[i] = curseg->segno / sbi->segs_per_sec;
- si->curzone[i] = si->cursec[i] / sbi->secs_per_zone;
+ si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno);
+ si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]);
}
+ for (i = META_CP; i < META_MAX; i++)
+ si->meta_count[i] = atomic_read(&sbi->meta_count[i]);
+
for (i = 0; i < 2; i++) {
si->segment_count[i] = sbi->segment_count[i];
si->block_count[i] = sbi->block_count[i];
@@ -112,10 +151,10 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
bimodal = 0;
total_vblocks = 0;
- blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg;
+ blks_per_sec = BLKS_PER_SEC(sbi);
hblks_per_sec = blks_per_sec / 2;
for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
- vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+ vblocks = get_valid_blocks(sbi, segno, true);
dist = abs(vblocks - hblks_per_sec);
bimodal += dist * dist;
@@ -138,16 +177,18 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
static void update_mem_info(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
- unsigned npages;
int i;
if (si->base_mem)
goto get_cache;
- si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize;
+ /* build stat */
+ si->base_mem = sizeof(struct f2fs_stat_info);
+
+ /* build superblock */
+ si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize;
si->base_mem += 2 * sizeof(struct f2fs_inode_info);
si->base_mem += sizeof(*sbi->ckpt);
- si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE;
/* build sm */
si->base_mem += sizeof(struct f2fs_sm_info);
@@ -157,10 +198,9 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
- if (f2fs_discard_en(sbi))
- si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
+ si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
si->base_mem += SIT_VBLOCK_MAP_SIZE;
- if (sbi->segs_per_sec > 1)
+ if (__is_large_section(sbi))
si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
@@ -181,6 +221,11 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
/* build nm */
si->base_mem += sizeof(struct f2fs_nm_info);
si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
+ si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS);
+ si->base_mem += NM_I(sbi)->nat_blocks *
+ f2fs_bitmap_size(NAT_ENTRY_PER_BLOCK);
+ si->base_mem += NM_I(sbi)->nat_blocks / 8;
+ si->base_mem += NM_I(sbi)->nat_blocks * sizeof(unsigned short);
get_cache:
si->cache_mem = 0;
@@ -190,16 +235,23 @@ get_cache:
si->cache_mem += sizeof(struct f2fs_gc_kthread);
/* build merge flush thread */
- if (SM_I(sbi)->cmd_control_info)
+ if (SM_I(sbi)->fcc_info)
si->cache_mem += sizeof(struct flush_cmd_control);
+ if (SM_I(sbi)->dcc_info) {
+ si->cache_mem += sizeof(struct discard_cmd_control);
+ si->cache_mem += sizeof(struct discard_cmd) *
+ atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt);
+ }
/* free nids */
- si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid);
+ si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] +
+ NM_I(sbi)->nid_cnt[PREALLOC_NID]) *
+ sizeof(struct free_nid);
si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry);
si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
sizeof(struct nat_entry_set);
si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
- for (i = 0; i <= ORPHAN_INO; i++)
+ for (i = 0; i < MAX_INO_ENTRY; i++)
si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
si->cache_mem += atomic_read(&sbi->total_ext_tree) *
sizeof(struct extent_tree);
@@ -207,10 +259,14 @@ get_cache:
sizeof(struct extent_node);
si->page_mem = 0;
- npages = NODE_MAPPING(sbi)->nrpages;
- si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
- npages = META_MAPPING(sbi)->nrpages;
- si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
+ if (sbi->node_inode) {
+ unsigned npages = NODE_MAPPING(sbi)->nrpages;
+ si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
+ }
+ if (sbi->meta_inode) {
+ unsigned npages = META_MAPPING(sbi)->nrpages;
+ si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
+ }
}
static int stat_show(struct seq_file *s, void *v)
@@ -223,9 +279,11 @@ static int stat_show(struct seq_file *s, void *v)
list_for_each_entry(si, &f2fs_stat_list, stat_list) {
update_general_status(si->sbi);
- seq_printf(s, "\n=====[ partition info(%pg). #%d, %s]=====\n",
+ seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n",
si->sbi->sb->s_bdev, i++,
- f2fs_readonly(si->sbi->sb) ? "RO": "RW");
+ f2fs_readonly(si->sbi->sb) ? "RO": "RW",
+ is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ?
+ "Disabled": (f2fs_cp_error(si->sbi) ? "Error": "Good"));
seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
si->sit_area_segs, si->nat_area_segs);
seq_printf(s, "[SSA: %d] [MAIN: %d",
@@ -250,8 +308,8 @@ static int stat_show(struct seq_file *s, void *v)
si->inline_inode);
seq_printf(s, " - Inline_dentry Inode: %u\n",
si->inline_dir);
- seq_printf(s, " - Orphan Inode: %u\n",
- si->orphans);
+ seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n",
+ si->orphans, si->append, si->update);
seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
si->main_area_segs, si->main_area_sections,
si->main_area_zones);
@@ -287,6 +345,13 @@ static int stat_show(struct seq_file *s, void *v)
si->prefree_count, si->free_segs, si->free_secs);
seq_printf(s, "CP calls: %d (BG: %d)\n",
si->cp_count, si->bg_cp_count);
+ seq_printf(s, " - cp blocks : %u\n", si->meta_count[META_CP]);
+ seq_printf(s, " - sit blocks : %u\n",
+ si->meta_count[META_SIT]);
+ seq_printf(s, " - nat blocks : %u\n",
+ si->meta_count[META_NAT]);
+ seq_printf(s, " - ssa blocks : %u\n",
+ si->meta_count[META_SSA]);
seq_printf(s, "GC calls: %d (BG: %d)\n",
si->call_count, si->bg_gc);
seq_printf(s, " - data segments : %d (%d)\n",
@@ -299,6 +364,12 @@ static int stat_show(struct seq_file *s, void *v)
si->bg_data_blks);
seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks,
si->bg_node_blks);
+ seq_printf(s, "Skipped : atomic write %llu (%llu)\n",
+ si->skipped_atomic_files[BG_GC] +
+ si->skipped_atomic_files[FG_GC],
+ si->skipped_atomic_files[BG_GC]);
+ seq_printf(s, "BG skip : IO: %u, Other: %u\n",
+ si->io_skip_bggc, si->other_skip_bggc);
seq_puts(s, "\nExtent Cache:\n");
seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n",
si->hit_largest, si->hit_cached,
@@ -310,22 +381,37 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n",
si->ext_tree, si->zombie_tree, si->ext_node);
seq_puts(s, "\nBalancing F2FS Async:\n");
- seq_printf(s, " - inmem: %4d, wb_bios: %4d\n",
- si->inmem_pages, si->wb_bios);
+ seq_printf(s, " - DIO (R: %4d, W: %4d)\n",
+ si->nr_dio_read, si->nr_dio_write);
+ seq_printf(s, " - IO_R (Data: %4d, Node: %4d, Meta: %4d\n",
+ si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta);
+ seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), "
+ "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n",
+ si->nr_wb_cp_data, si->nr_wb_data,
+ si->nr_flushing, si->nr_flushed,
+ si->flush_list_empty,
+ si->nr_discarding, si->nr_discarded,
+ si->nr_discard_cmd, si->undiscard_blks);
+ seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), "
+ "volatile IO: %4d (Max. %4d)\n",
+ si->inmem_pages, si->aw_cnt, si->max_aw_cnt,
+ si->vw_cnt, si->max_vw_cnt);
seq_printf(s, " - nodes: %4d in %4d\n",
si->ndirty_node, si->node_pages);
seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n",
si->ndirty_dent, si->ndirty_dirs, si->ndirty_all);
seq_printf(s, " - datas: %4d in files:%4d\n",
si->ndirty_data, si->ndirty_files);
+ seq_printf(s, " - quota datas: %4d in quota files:%4d\n",
+ si->ndirty_qdata, si->nquota_files);
seq_printf(s, " - meta: %4d in %4d\n",
si->ndirty_meta, si->meta_pages);
seq_printf(s, " - imeta: %4d\n",
si->ndirty_imeta);
seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
si->dirty_nats, si->nats, si->dirty_sits, si->sits);
- seq_printf(s, " - free_nids: %9d\n",
- si->fnids);
+ seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n",
+ si->free_nids, si->avail_nids, si->alloc_nids);
seq_puts(s, "\nDistribution of User Blocks:");
seq_puts(s, " [ valid | invalid | free ]\n");
seq_puts(s, " [");
@@ -384,8 +470,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_stat_info *si;
+ int i;
- si = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL);
+ si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL);
if (!si)
return -ENOMEM;
@@ -409,6 +496,13 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
atomic_set(&sbi->inline_inode, 0);
atomic_set(&sbi->inline_dir, 0);
atomic_set(&sbi->inplace_count, 0);
+ for (i = META_CP; i < META_MAX; i++)
+ atomic_set(&sbi->meta_count[i], 0);
+
+ atomic_set(&sbi->aw_cnt, 0);
+ atomic_set(&sbi->vw_cnt, 0);
+ atomic_set(&sbi->max_aw_cnt, 0);
+ atomic_set(&sbi->max_vw_cnt, 0);
mutex_lock(&f2fs_stat_mutex);
list_add_tail(&si->stat_list, &f2fs_stat_list);
@@ -425,7 +519,7 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
list_del(&si->stat_list);
mutex_unlock(&f2fs_stat_mutex);
- kfree(si);
+ kvfree(si);
}
int __init f2fs_create_root_stats(void)
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index af719d93507e..8fd036039731 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -1,19 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/dir.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
+#include <linux/sched.h>
#include "f2fs.h"
#include "node.h"
#include "acl.h"
#include "xattr.h"
+#include <trace/events/f2fs.h>
static unsigned long dir_blocks(struct inode *inode)
{
@@ -58,12 +57,12 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFLNK >> S_SHIFT] = F2FS_FT_SYMLINK,
};
-void set_de_type(struct f2fs_dir_entry *de, umode_t mode)
+static void set_de_type(struct f2fs_dir_entry *de, umode_t mode)
{
de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
}
-unsigned char get_de_type(struct f2fs_dir_entry *de)
+unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de)
{
if (de->file_type < F2FS_FT_MAX)
return f2fs_filetype_table[de->file_type];
@@ -92,27 +91,23 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
struct f2fs_dir_entry *de;
struct f2fs_dentry_ptr d;
- dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page);
+ dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page);
- make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
- de = find_target_dentry(fname, namehash, max_slots, &d);
+ make_dentry_ptr_block(NULL, &d, dentry_blk);
+ de = f2fs_find_target_dentry(fname, namehash, max_slots, &d);
if (de)
*res_page = dentry_page;
- else
- kunmap(dentry_page);
return de;
}
-struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname,
+struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname,
f2fs_hash_t namehash, int *max_slots,
struct f2fs_dentry_ptr *d)
{
struct f2fs_dir_entry *de;
unsigned long bit_pos = 0;
int max_len = 0;
- struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
- struct fscrypt_str *name = &fname->disk_name;
if (max_slots)
*max_slots = 0;
@@ -130,29 +125,11 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname,
continue;
}
- if (de->hash_code != namehash)
- goto not_match;
-
- de_name.name = d->filename[bit_pos];
- de_name.len = le16_to_cpu(de->name_len);
-
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
- if (unlikely(!name->name)) {
- if (fname->usr_fname->name[0] == '_') {
- if (de_name.len > 32 &&
- !memcmp(de_name.name + ((de_name.len - 17) & ~15),
- fname->crypto_buf.name + 8, 16))
- goto found;
- goto not_match;
- }
- name->name = fname->crypto_buf.name;
- name->len = fname->crypto_buf.len;
- }
-#endif
- if (de_name.len == name->len &&
- !memcmp(de_name.name, name->name, name->len))
+ if (de->hash_code == namehash &&
+ fscrypt_match_name(fname, d->filename[bit_pos],
+ le16_to_cpu(de->name_len)))
goto found;
-not_match:
+
if (max_slots && max_len > *max_slots)
*max_slots = max_len;
max_len = 0;
@@ -191,7 +168,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
for (; bidx < end_block; bidx++) {
/* no need to allocate new dentry pages to all the indices */
- dentry_page = find_data_page(dir, bidx);
+ dentry_page = f2fs_find_data_page(dir, bidx);
if (IS_ERR(dentry_page)) {
if (PTR_ERR(dentry_page) == -ENOENT) {
room = true;
@@ -230,7 +207,7 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
if (f2fs_has_inline_dentry(dir)) {
*res_page = NULL;
- de = find_in_inline_dir(dir, fname, res_page);
+ de = f2fs_find_in_inline_dir(dir, fname, res_page);
goto out;
}
@@ -305,7 +282,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr,
de = f2fs_find_entry(dir, qstr, page);
if (de) {
res = le32_to_cpu(de->ino);
- f2fs_dentry_kunmap(dir, *page);
f2fs_put_page(*page, 0);
}
@@ -317,14 +293,13 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
{
enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA;
lock_page(page);
- f2fs_wait_on_page_writeback(page, type, true);
+ f2fs_wait_on_page_writeback(page, type, true, true);
de->ino = cpu_to_le32(inode->i_ino);
set_de_type(de, inode->i_mode);
- f2fs_dentry_kunmap(dir, page);
set_page_dirty(page);
dir->i_mtime = dir->i_ctime = current_time(dir);
- f2fs_mark_inode_dirty_sync(dir);
+ f2fs_mark_inode_dirty_sync(dir, false);
f2fs_put_page(page, 1);
}
@@ -332,7 +307,7 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
{
struct f2fs_inode *ri;
- f2fs_wait_on_page_writeback(ipage, NODE, true);
+ f2fs_wait_on_page_writeback(ipage, NODE, true, true);
/* copy name info. to this inode page */
ri = F2FS_INODE(ipage);
@@ -341,25 +316,7 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
set_page_dirty(ipage);
}
-int update_dent_inode(struct inode *inode, struct inode *to,
- const struct qstr *name)
-{
- struct page *page;
-
- if (file_enc_name(to))
- return 0;
-
- page = get_node_page(F2FS_I_SB(inode), inode->i_ino);
- if (IS_ERR(page))
- return PTR_ERR(page);
-
- init_dent_inode(name, page);
- f2fs_put_page(page, 1);
-
- return 0;
-}
-
-void do_make_empty_dir(struct inode *inode, struct inode *parent,
+void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent,
struct f2fs_dentry_ptr *d)
{
struct qstr dot = QSTR_INIT(".", 1);
@@ -380,33 +337,32 @@ static int make_empty_dir(struct inode *inode,
struct f2fs_dentry_ptr d;
if (f2fs_has_inline_dentry(inode))
- return make_empty_inline_dir(inode, parent, page);
+ return f2fs_make_empty_inline_dir(inode, parent, page);
- dentry_page = get_new_data_page(inode, page, 0, true);
+ dentry_page = f2fs_get_new_data_page(inode, page, 0, true);
if (IS_ERR(dentry_page))
return PTR_ERR(dentry_page);
- dentry_blk = kmap_atomic(dentry_page);
+ dentry_blk = page_address(dentry_page);
- make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
- do_make_empty_dir(inode, parent, &d);
-
- kunmap_atomic(dentry_blk);
+ make_dentry_ptr_block(NULL, &d, dentry_blk);
+ f2fs_do_make_empty_dir(inode, parent, &d);
set_page_dirty(dentry_page);
f2fs_put_page(dentry_page, 1);
return 0;
}
-struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
+struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
const struct qstr *new_name, const struct qstr *orig_name,
struct page *dpage)
{
struct page *page;
+ int dummy_encrypt = DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(dir));
int err;
if (is_inode_flag_set(inode, FI_NEW_INODE)) {
- page = new_inode_page(inode);
+ page = f2fs_new_inode_page(inode);
if (IS_ERR(page))
return page;
@@ -429,46 +385,49 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
if (err)
goto put_error;
- if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) {
+ if ((f2fs_encrypted_inode(dir) || dummy_encrypt) &&
+ f2fs_may_encrypt(inode)) {
err = fscrypt_inherit_context(dir, inode, page, false);
if (err)
goto put_error;
}
} else {
- page = get_node_page(F2FS_I_SB(dir), inode->i_ino);
+ page = f2fs_get_node_page(F2FS_I_SB(dir), inode->i_ino);
if (IS_ERR(page))
return page;
-
- set_cold_node(inode, page);
}
- if (new_name)
+ if (new_name) {
init_dent_inode(new_name, page);
+ if (f2fs_encrypted_inode(dir))
+ file_set_enc_name(inode);
+ }
/*
* This file should be checkpointed during fsync.
* We lost i_pino from now on.
*/
if (is_inode_flag_set(inode, FI_INC_LINK)) {
- file_lost_pino(inode);
+ if (!S_ISDIR(inode->i_mode))
+ file_lost_pino(inode);
/*
* If link the tmpfile to alias through linkat path,
* we should remove this inode from orphan list.
*/
if (inode->i_nlink == 0)
- remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino);
+ f2fs_remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino);
f2fs_i_links_write(inode, true);
}
return page;
put_error:
clear_nlink(inode);
- update_inode(inode, page);
+ f2fs_update_inode(inode, page);
f2fs_put_page(page, 1);
return ERR_PTR(err);
}
-void update_parent_metadata(struct inode *dir, struct inode *inode,
+void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode,
unsigned int current_depth)
{
if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) {
@@ -477,7 +436,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode,
clear_inode_flag(inode, FI_NEW_INODE);
}
dir->i_mtime = dir->i_ctime = current_time(dir);
- f2fs_mark_inode_dirty_sync(dir);
+ f2fs_mark_inode_dirty_sync(dir, false);
if (F2FS_I(dir)->i_current_depth != current_depth)
f2fs_i_depth_write(dir, current_depth);
@@ -486,7 +445,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode,
clear_inode_flag(inode, FI_INC_LINK);
}
-int room_for_filename(const void *bitmap, int slots, int max_slots)
+int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots)
{
int bit_start = 0;
int zero_start, zero_end;
@@ -555,10 +514,11 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name,
}
start:
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH))
+ if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) {
+ f2fs_show_injection_info(FAULT_DIR_DEPTH);
return -ENOSPC;
-#endif
+ }
+
if (unlikely(current_depth == MAX_DIR_HASH_DEPTH))
return -ENOSPC;
@@ -573,17 +533,16 @@ start:
(le32_to_cpu(dentry_hash) % nbucket));
for (block = bidx; block <= (bidx + nblock - 1); block++) {
- dentry_page = get_new_data_page(dir, NULL, block, true);
+ dentry_page = f2fs_get_new_data_page(dir, NULL, block, true);
if (IS_ERR(dentry_page))
return PTR_ERR(dentry_page);
- dentry_blk = kmap(dentry_page);
- bit_pos = room_for_filename(&dentry_blk->dentry_bitmap,
+ dentry_blk = page_address(dentry_page);
+ bit_pos = f2fs_room_for_filename(&dentry_blk->dentry_bitmap,
slots, NR_DENTRY_IN_BLOCK);
if (bit_pos < NR_DENTRY_IN_BLOCK)
goto add_dentry;
- kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
}
@@ -591,21 +550,19 @@ start:
++level;
goto start;
add_dentry:
- f2fs_wait_on_page_writeback(dentry_page, DATA, true);
+ f2fs_wait_on_page_writeback(dentry_page, DATA, true, true);
if (inode) {
down_write(&F2FS_I(inode)->i_sem);
- page = init_inode_metadata(inode, dir, new_name,
+ page = f2fs_init_inode_metadata(inode, dir, new_name,
orig_name, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto fail;
}
- if (f2fs_encrypted_inode(dir))
- file_set_enc_name(inode);
}
- make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
+ make_dentry_ptr_block(NULL, &d, dentry_blk);
f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos);
set_page_dirty(dentry_page);
@@ -615,18 +572,17 @@ add_dentry:
f2fs_put_page(page, 1);
}
- update_parent_metadata(dir, inode, current_depth);
+ f2fs_update_parent_metadata(dir, inode, current_depth);
fail:
if (inode)
up_write(&F2FS_I(inode)->i_sem);
- kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
return err;
}
-int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname,
+int f2fs_add_dentry(struct inode *dir, struct fscrypt_name *fname,
struct inode *inode, nid_t ino, umode_t mode)
{
struct qstr new_name;
@@ -650,7 +606,7 @@ int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname,
* Caller should grab and release a rwsem by calling f2fs_lock_op() and
* f2fs_unlock_op().
*/
-int __f2fs_add_link(struct inode *dir, const struct qstr *name,
+int f2fs_do_add_link(struct inode *dir, const struct qstr *name,
struct inode *inode, nid_t ino, umode_t mode)
{
struct fscrypt_name fname;
@@ -674,13 +630,12 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
F2FS_I(dir)->task = NULL;
}
if (de) {
- f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
err = -EEXIST;
} else if (IS_ERR(page)) {
err = PTR_ERR(page);
} else {
- err = __f2fs_do_add_link(dir, &fname, inode, ino, mode);
+ err = f2fs_add_dentry(dir, &fname, inode, ino, mode);
}
fscrypt_free_filename(&fname);
return err;
@@ -692,7 +647,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
int err = 0;
down_write(&F2FS_I(inode)->i_sem);
- page = init_inode_metadata(inode, dir, NULL, NULL, NULL);
+ page = f2fs_init_inode_metadata(inode, dir, NULL, NULL, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto fail;
@@ -700,9 +655,9 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
f2fs_put_page(page, 1);
clear_inode_flag(inode, FI_NEW_INODE);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
fail:
up_write(&F2FS_I(inode)->i_sem);
- f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return err;
}
@@ -724,9 +679,9 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
up_write(&F2FS_I(inode)->i_sem);
if (inode->i_nlink == 0)
- add_orphan_inode(inode);
+ f2fs_add_orphan_inode(inode);
else
- release_orphan_inode(sbi);
+ f2fs_release_orphan_inode(sbi);
}
/*
@@ -743,36 +698,41 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
+ if (F2FS_OPTION(F2FS_I_SB(dir)).fsync_mode == FSYNC_MODE_STRICT)
+ f2fs_add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO);
+
if (f2fs_has_inline_dentry(dir))
return f2fs_delete_inline_entry(dentry, page, dir, inode);
lock_page(page);
- f2fs_wait_on_page_writeback(page, DATA, true);
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
dentry_blk = page_address(page);
bit_pos = dentry - dentry_blk->dentry;
for (i = 0; i < slots; i++)
- clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+ __clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
/* Let's check and deallocate this dentry page */
bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
NR_DENTRY_IN_BLOCK,
0);
- kunmap(page); /* kunmap - pair of f2fs_find_entry */
set_page_dirty(page);
dir->i_ctime = dir->i_mtime = current_time(dir);
- f2fs_mark_inode_dirty_sync(dir);
+ f2fs_mark_inode_dirty_sync(dir, false);
if (inode)
f2fs_drop_nlink(dir, inode);
if (bit_pos == NR_DENTRY_IN_BLOCK &&
- !truncate_hole(dir, page->index, page->index + 1)) {
+ !f2fs_truncate_hole(dir, page->index, page->index + 1)) {
+ f2fs_clear_radix_tree_dirty_tag(page);
clear_page_dirty_for_io(page);
ClearPagePrivate(page);
ClearPageUptodate(page);
+ clear_cold_data(page);
inode_dec_dirty_pages(dir);
+ f2fs_remove_dirty_inode(dir);
}
f2fs_put_page(page, 1);
}
@@ -789,7 +749,7 @@ bool f2fs_empty_dir(struct inode *dir)
return f2fs_empty_inline_dir(dir);
for (bidx = 0; bidx < nblock; bidx++) {
- dentry_page = get_lock_data_page(dir, bidx, false);
+ dentry_page = f2fs_get_lock_data_page(dir, bidx, false);
if (IS_ERR(dentry_page)) {
if (PTR_ERR(dentry_page) == -ENOENT)
continue;
@@ -797,7 +757,7 @@ bool f2fs_empty_dir(struct inode *dir)
return false;
}
- dentry_blk = kmap_atomic(dentry_page);
+ dentry_blk = page_address(dentry_page);
if (bidx == 0)
bit_pos = 2;
else
@@ -805,7 +765,6 @@ bool f2fs_empty_dir(struct inode *dir)
bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
NR_DENTRY_IN_BLOCK,
bit_pos);
- kunmap_atomic(dentry_blk);
f2fs_put_page(dentry_page, 1);
@@ -815,16 +774,23 @@ bool f2fs_empty_dir(struct inode *dir)
return true;
}
-bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
+int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
unsigned int start_pos, struct fscrypt_str *fstr)
{
unsigned char d_type = DT_UNKNOWN;
unsigned int bit_pos;
struct f2fs_dir_entry *de = NULL;
struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode);
+ struct blk_plug plug;
+ bool readdir_ra = sbi->readdir_ra == 1;
+ int err = 0;
bit_pos = ((unsigned long)ctx->pos % d->max);
+ if (readdir_ra)
+ blk_start_plug(&plug);
+
while (bit_pos < d->max) {
bit_pos = find_next_bit_le(d->bitmap, d->max, bit_pos);
if (bit_pos >= d->max)
@@ -837,33 +803,50 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
continue;
}
- d_type = get_de_type(de);
+ d_type = f2fs_get_de_type(de);
de_name.name = d->filename[bit_pos];
de_name.len = le16_to_cpu(de->name_len);
+ /* check memory boundary before moving forward */
+ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+ if (unlikely(bit_pos > d->max)) {
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: corrupted namelen=%d, run fsck to fix.",
+ __func__, le16_to_cpu(de->name_len));
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ err = -EINVAL;
+ goto out;
+ }
+
if (f2fs_encrypted_inode(d->inode)) {
int save_len = fstr->len;
- int err;
err = fscrypt_fname_disk_to_usr(d->inode,
(u32)de->hash_code, 0,
&de_name, fstr);
if (err)
- return true;
+ goto out;
de_name = *fstr;
fstr->len = save_len;
}
if (!dir_emit(ctx, de_name.name, de_name.len,
- le32_to_cpu(de->ino), d_type))
- return true;
+ le32_to_cpu(de->ino), d_type)) {
+ err = 1;
+ goto out;
+ }
+
+ if (readdir_ra)
+ f2fs_ra_node_page(sbi, le32_to_cpu(de->ino));
- bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
ctx->pos = start_pos + bit_pos;
}
- return false;
+out:
+ if (readdir_ra)
+ blk_finish_plug(&plug);
+ return err;
}
static int f2fs_readdir(struct file *file, struct dir_context *ctx)
@@ -873,6 +856,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
struct f2fs_dentry_block *dentry_blk = NULL;
struct page *dentry_page = NULL;
struct file_ra_state *ra = &file->f_ra;
+ loff_t start_pos = ctx->pos;
unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
struct f2fs_dentry_ptr d;
struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
@@ -881,51 +865,61 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
if (f2fs_encrypted_inode(inode)) {
err = fscrypt_get_encryption_info(inode);
if (err && err != -ENOKEY)
- return err;
+ goto out;
err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr);
if (err < 0)
- return err;
+ goto out;
}
if (f2fs_has_inline_dentry(inode)) {
err = f2fs_read_inline_dir(file, ctx, &fstr);
- goto out;
+ goto out_free;
}
- /* readahead for multi pages of dir */
- if (npages - n > 1 && !ra_has_index(ra, n))
- page_cache_sync_readahead(inode->i_mapping, ra, file, n,
+ for (; n < npages; n++, ctx->pos = n * NR_DENTRY_IN_BLOCK) {
+
+ /* allow readdir() to be interrupted */
+ if (fatal_signal_pending(current)) {
+ err = -ERESTARTSYS;
+ goto out_free;
+ }
+ cond_resched();
+
+ /* readahead for multi pages of dir */
+ if (npages - n > 1 && !ra_has_index(ra, n))
+ page_cache_sync_readahead(inode->i_mapping, ra, file, n,
min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES));
- for (; n < npages; n++) {
- dentry_page = get_lock_data_page(inode, n, false);
+ dentry_page = f2fs_get_lock_data_page(inode, n, false);
if (IS_ERR(dentry_page)) {
err = PTR_ERR(dentry_page);
- if (err == -ENOENT)
+ if (err == -ENOENT) {
+ err = 0;
continue;
- else
- goto out;
+ } else {
+ goto out_free;
+ }
}
- dentry_blk = kmap(dentry_page);
+ dentry_blk = page_address(dentry_page);
- make_dentry_ptr(inode, &d, (void *)dentry_blk, 1);
+ make_dentry_ptr_block(inode, &d, dentry_blk);
- if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) {
- kunmap(dentry_page);
+ err = f2fs_fill_dentries(ctx, &d,
+ n * NR_DENTRY_IN_BLOCK, &fstr);
+ if (err) {
f2fs_put_page(dentry_page, 1);
break;
}
- ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
- kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
}
- err = 0;
-out:
+out_free:
fscrypt_fname_free_buffer(&fstr);
- return err;
+out:
+ trace_f2fs_readdir(inode, start_pos, ctx->pos, err);
+ return err < 0 ? err : 0;
}
static int f2fs_dir_open(struct inode *inode, struct file *filp)
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index d7b8c8b5fc39..904ad7ba5a45 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* f2fs extent cache support
*
@@ -5,10 +6,6 @@
* Copyright (c) 2015 Samsung Electronics
* Authors: Jaegeuk Kim <jaegeuk@kernel.org>
* Chao Yu <chao2.yu@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
@@ -18,6 +15,179 @@
#include "node.h"
#include <trace/events/f2fs.h>
+static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re,
+ unsigned int ofs)
+{
+ if (cached_re) {
+ if (cached_re->ofs <= ofs &&
+ cached_re->ofs + cached_re->len > ofs) {
+ return cached_re;
+ }
+ }
+ return NULL;
+}
+
+static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root,
+ unsigned int ofs)
+{
+ struct rb_node *node = root->rb_node;
+ struct rb_entry *re;
+
+ while (node) {
+ re = rb_entry(node, struct rb_entry, rb_node);
+
+ if (ofs < re->ofs)
+ node = node->rb_left;
+ else if (ofs >= re->ofs + re->len)
+ node = node->rb_right;
+ else
+ return re;
+ }
+ return NULL;
+}
+
+struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root,
+ struct rb_entry *cached_re, unsigned int ofs)
+{
+ struct rb_entry *re;
+
+ re = __lookup_rb_tree_fast(cached_re, ofs);
+ if (!re)
+ return __lookup_rb_tree_slow(root, ofs);
+
+ return re;
+}
+
+struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
+ struct rb_root *root, struct rb_node **parent,
+ unsigned int ofs)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_entry *re;
+
+ while (*p) {
+ *parent = *p;
+ re = rb_entry(*parent, struct rb_entry, rb_node);
+
+ if (ofs < re->ofs)
+ p = &(*p)->rb_left;
+ else if (ofs >= re->ofs + re->len)
+ p = &(*p)->rb_right;
+ else
+ f2fs_bug_on(sbi, 1);
+ }
+
+ return p;
+}
+
+/*
+ * lookup rb entry in position of @ofs in rb-tree,
+ * if hit, return the entry, otherwise, return NULL
+ * @prev_ex: extent before ofs
+ * @next_ex: extent after ofs
+ * @insert_p: insert point for new extent at ofs
+ * in order to simpfy the insertion after.
+ * tree must stay unchanged between lookup and insertion.
+ */
+struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root,
+ struct rb_entry *cached_re,
+ unsigned int ofs,
+ struct rb_entry **prev_entry,
+ struct rb_entry **next_entry,
+ struct rb_node ***insert_p,
+ struct rb_node **insert_parent,
+ bool force)
+{
+ struct rb_node **pnode = &root->rb_node;
+ struct rb_node *parent = NULL, *tmp_node;
+ struct rb_entry *re = cached_re;
+
+ *insert_p = NULL;
+ *insert_parent = NULL;
+ *prev_entry = NULL;
+ *next_entry = NULL;
+
+ if (RB_EMPTY_ROOT(root))
+ return NULL;
+
+ if (re) {
+ if (re->ofs <= ofs && re->ofs + re->len > ofs)
+ goto lookup_neighbors;
+ }
+
+ while (*pnode) {
+ parent = *pnode;
+ re = rb_entry(*pnode, struct rb_entry, rb_node);
+
+ if (ofs < re->ofs)
+ pnode = &(*pnode)->rb_left;
+ else if (ofs >= re->ofs + re->len)
+ pnode = &(*pnode)->rb_right;
+ else
+ goto lookup_neighbors;
+ }
+
+ *insert_p = pnode;
+ *insert_parent = parent;
+
+ re = rb_entry(parent, struct rb_entry, rb_node);
+ tmp_node = parent;
+ if (parent && ofs > re->ofs)
+ tmp_node = rb_next(parent);
+ *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+
+ tmp_node = parent;
+ if (parent && ofs < re->ofs)
+ tmp_node = rb_prev(parent);
+ *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+ return NULL;
+
+lookup_neighbors:
+ if (ofs == re->ofs || force) {
+ /* lookup prev node for merging backward later */
+ tmp_node = rb_prev(&re->rb_node);
+ *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+ }
+ if (ofs == re->ofs + re->len - 1 || force) {
+ /* lookup next node for merging frontward later */
+ tmp_node = rb_next(&re->rb_node);
+ *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+ }
+ return re;
+}
+
+bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi,
+ struct rb_root *root)
+{
+#ifdef CONFIG_F2FS_CHECK_FS
+ struct rb_node *cur = rb_first(root), *next;
+ struct rb_entry *cur_re, *next_re;
+
+ if (!cur)
+ return true;
+
+ while (cur) {
+ next = rb_next(cur);
+ if (!next)
+ return true;
+
+ cur_re = rb_entry(cur, struct rb_entry, rb_node);
+ next_re = rb_entry(next, struct rb_entry, rb_node);
+
+ if (cur_re->ofs + cur_re->len > next_re->ofs) {
+ f2fs_msg(sbi->sb, KERN_INFO, "inconsistent rbtree, "
+ "cur(%u, %u) next(%u, %u)",
+ cur_re->ofs, cur_re->len,
+ next_re->ofs, next_re->len);
+ return false;
+ }
+
+ cur = next;
+ }
+#endif
+ return true;
+}
+
static struct kmem_cache *extent_tree_slab;
static struct kmem_cache *extent_node_slab;
@@ -77,7 +247,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode)
struct extent_tree *et;
nid_t ino = inode->i_ino;
- down_write(&sbi->extent_tree_lock);
+ mutex_lock(&sbi->extent_tree_lock);
et = radix_tree_lookup(&sbi->extent_tree_root, ino);
if (!et) {
et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS);
@@ -94,7 +264,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode)
atomic_dec(&sbi->total_zombie_tree);
list_del_init(&et->list);
}
- up_write(&sbi->extent_tree_lock);
+ mutex_unlock(&sbi->extent_tree_lock);
/* never died until evict_inode */
F2FS_I(inode)->extent_tree = et;
@@ -102,36 +272,6 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode)
return et;
}
-static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi,
- struct extent_tree *et, unsigned int fofs)
-{
- struct rb_node *node = et->root.rb_node;
- struct extent_node *en = et->cached_en;
-
- if (en) {
- struct extent_info *cei = &en->ei;
-
- if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) {
- stat_inc_cached_node_hit(sbi);
- return en;
- }
- }
-
- while (node) {
- en = rb_entry(node, struct extent_node, rb_node);
-
- if (fofs < en->ei.fofs) {
- node = node->rb_left;
- } else if (fofs >= en->ei.fofs + en->ei.len) {
- node = node->rb_right;
- } else {
- stat_inc_rbtree_node_hit(sbi);
- return en;
- }
- }
- return NULL;
-}
-
static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_info *ei)
{
@@ -165,14 +305,13 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
return count - atomic_read(&et->node_cnt);
}
-static void __drop_largest_extent(struct inode *inode,
+static void __drop_largest_extent(struct extent_tree *et,
pgoff_t fofs, unsigned int len)
{
- struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest;
-
- if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) {
- largest->len = 0;
- f2fs_mark_inode_dirty_sync(inode);
+ if (fofs < et->largest.fofs + et->largest.len &&
+ fofs + len > et->largest.fofs) {
+ et->largest.len = 0;
+ et->largest_updated = true;
}
}
@@ -247,17 +386,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
goto out;
}
- en = __lookup_extent_tree(sbi, et, pgofs);
- if (en) {
- *ei = en->ei;
- spin_lock(&sbi->extent_lock);
- if (!list_empty(&en->list)) {
- list_move_tail(&en->list, &sbi->extent_list);
- et->cached_en = en;
- }
- spin_unlock(&sbi->extent_lock);
- ret = true;
+ en = (struct extent_node *)f2fs_lookup_rb_tree(&et->root,
+ (struct rb_entry *)et->cached_en, pgofs);
+ if (!en)
+ goto out;
+
+ if (en == et->cached_en)
+ stat_inc_cached_node_hit(sbi);
+ else
+ stat_inc_rbtree_node_hit(sbi);
+
+ *ei = en->ei;
+ spin_lock(&sbi->extent_lock);
+ if (!list_empty(&en->list)) {
+ list_move_tail(&en->list, &sbi->extent_list);
+ et->cached_en = en;
}
+ spin_unlock(&sbi->extent_lock);
+ ret = true;
out:
stat_inc_total_hit(sbi);
read_unlock(&et->lock);
@@ -266,93 +412,11 @@ out:
return ret;
}
-
-/*
- * lookup extent at @fofs, if hit, return the extent
- * if not, return NULL and
- * @prev_ex: extent before fofs
- * @next_ex: extent after fofs
- * @insert_p: insert point for new extent at fofs
- * in order to simpfy the insertion after.
- * tree must stay unchanged between lookup and insertion.
- */
-static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et,
- unsigned int fofs,
- struct extent_node **prev_ex,
- struct extent_node **next_ex,
- struct rb_node ***insert_p,
- struct rb_node **insert_parent)
-{
- struct rb_node **pnode = &et->root.rb_node;
- struct rb_node *parent = NULL, *tmp_node;
- struct extent_node *en = et->cached_en;
-
- *insert_p = NULL;
- *insert_parent = NULL;
- *prev_ex = NULL;
- *next_ex = NULL;
-
- if (RB_EMPTY_ROOT(&et->root))
- return NULL;
-
- if (en) {
- struct extent_info *cei = &en->ei;
-
- if (cei->fofs <= fofs && cei->fofs + cei->len > fofs)
- goto lookup_neighbors;
- }
-
- while (*pnode) {
- parent = *pnode;
- en = rb_entry(*pnode, struct extent_node, rb_node);
-
- if (fofs < en->ei.fofs)
- pnode = &(*pnode)->rb_left;
- else if (fofs >= en->ei.fofs + en->ei.len)
- pnode = &(*pnode)->rb_right;
- else
- goto lookup_neighbors;
- }
-
- *insert_p = pnode;
- *insert_parent = parent;
-
- en = rb_entry(parent, struct extent_node, rb_node);
- tmp_node = parent;
- if (parent && fofs > en->ei.fofs)
- tmp_node = rb_next(parent);
- *next_ex = tmp_node ?
- rb_entry(tmp_node, struct extent_node, rb_node) : NULL;
-
- tmp_node = parent;
- if (parent && fofs < en->ei.fofs)
- tmp_node = rb_prev(parent);
- *prev_ex = tmp_node ?
- rb_entry(tmp_node, struct extent_node, rb_node) : NULL;
- return NULL;
-
-lookup_neighbors:
- if (fofs == en->ei.fofs) {
- /* lookup prev node for merging backward later */
- tmp_node = rb_prev(&en->rb_node);
- *prev_ex = tmp_node ?
- rb_entry(tmp_node, struct extent_node, rb_node) : NULL;
- }
- if (fofs == en->ei.fofs + en->ei.len - 1) {
- /* lookup next node for merging frontward later */
- tmp_node = rb_next(&en->rb_node);
- *next_ex = tmp_node ?
- rb_entry(tmp_node, struct extent_node, rb_node) : NULL;
- }
- return en;
-}
-
-static struct extent_node *__try_merge_extent_node(struct inode *inode,
+static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_info *ei,
struct extent_node *prev_ex,
struct extent_node *next_ex)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct extent_node *en = NULL;
if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) {
@@ -374,7 +438,7 @@ static struct extent_node *__try_merge_extent_node(struct inode *inode,
if (!en)
return NULL;
- __try_update_largest_extent(inode, et, en);
+ __try_update_largest_extent(et, en);
spin_lock(&sbi->extent_lock);
if (!list_empty(&en->list)) {
@@ -385,13 +449,12 @@ static struct extent_node *__try_merge_extent_node(struct inode *inode,
return en;
}
-static struct extent_node *__insert_extent_tree(struct inode *inode,
+static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_info *ei,
struct rb_node **insert_p,
struct rb_node *insert_parent)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct rb_node **p = &et->root.rb_node;
+ struct rb_node **p;
struct rb_node *parent = NULL;
struct extent_node *en = NULL;
@@ -401,23 +464,13 @@ static struct extent_node *__insert_extent_tree(struct inode *inode,
goto do_insert;
}
- while (*p) {
- parent = *p;
- en = rb_entry(parent, struct extent_node, rb_node);
-
- if (ei->fofs < en->ei.fofs)
- p = &(*p)->rb_left;
- else if (ei->fofs >= en->ei.fofs + en->ei.len)
- p = &(*p)->rb_right;
- else
- f2fs_bug_on(sbi, 1);
- }
+ p = f2fs_lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs);
do_insert:
en = __attach_extent_node(sbi, et, ei, parent, p);
if (!en)
return NULL;
- __try_update_largest_extent(inode, et, en);
+ __try_update_largest_extent(et, en);
/* update in global extent list */
spin_lock(&sbi->extent_lock);
@@ -427,7 +480,7 @@ do_insert:
return en;
}
-static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
+static void f2fs_update_extent_tree_range(struct inode *inode,
pgoff_t fofs, block_t blkaddr, unsigned int len)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -438,9 +491,10 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
struct rb_node **insert_p = NULL, *insert_parent = NULL;
unsigned int end = fofs + len;
unsigned int pos = (unsigned int)fofs;
+ bool updated = false;
if (!et)
- return false;
+ return;
trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len);
@@ -448,7 +502,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
if (is_inode_flag_set(inode, FI_NO_EXTENT)) {
write_unlock(&et->lock);
- return false;
+ return;
}
prev = et->largest;
@@ -458,11 +512,14 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
* drop largest extent before lookup, in case it's already
* been shrunk from extent tree
*/
- __drop_largest_extent(inode, fofs, len);
+ __drop_largest_extent(et, fofs, len);
/* 1. lookup first extent node in range [fofs, fofs + len - 1] */
- en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en,
- &insert_p, &insert_parent);
+ en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root,
+ (struct rb_entry *)et->cached_en, fofs,
+ (struct rb_entry **)&prev_en,
+ (struct rb_entry **)&next_en,
+ &insert_p, &insert_parent, false);
if (!en)
en = next_en;
@@ -488,7 +545,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
set_extent_info(&ei, end,
end - dei.fofs + dei.blk,
org_end - end);
- en1 = __insert_extent_tree(inode, et, &ei,
+ en1 = __insert_extent_tree(sbi, et, &ei,
NULL, NULL);
next_en = en1;
} else {
@@ -503,13 +560,12 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
if (!next_en) {
struct rb_node *node = rb_next(&en->rb_node);
- next_en = node ?
- rb_entry(node, struct extent_node, rb_node)
- : NULL;
+ next_en = rb_entry_safe(node, struct extent_node,
+ rb_node);
}
if (parts)
- __try_update_largest_extent(inode, et, en);
+ __try_update_largest_extent(et, en);
else
__release_extent_node(sbi, et, en);
@@ -529,15 +585,16 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
if (blkaddr) {
set_extent_info(&ei, fofs, blkaddr, len);
- if (!__try_merge_extent_node(inode, et, &ei, prev_en, next_en))
- __insert_extent_tree(inode, et, &ei,
+ if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
+ __insert_extent_tree(sbi, et, &ei,
insert_p, insert_parent);
/* give up extent_cache, if split and small updates happen */
if (dei.len >= 1 &&
prev.len < F2FS_MIN_EXTENT_LEN &&
et->largest.len < F2FS_MIN_EXTENT_LEN) {
- __drop_largest_extent(inode, 0, UINT_MAX);
+ et->largest.len = 0;
+ et->largest_updated = true;
set_inode_flag(inode, FI_NO_EXTENT);
}
}
@@ -545,9 +602,15 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
if (is_inode_flag_set(inode, FI_NO_EXTENT))
__free_extent_tree(sbi, et);
+ if (et->largest_updated) {
+ et->largest_updated = false;
+ updated = true;
+ }
+
write_unlock(&et->lock);
- return !__is_extent_same(&prev, &et->largest);
+ if (updated)
+ f2fs_mark_inode_dirty_sync(inode, true);
}
unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
@@ -563,7 +626,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
if (!atomic_read(&sbi->total_zombie_tree))
goto free_node;
- if (!down_write_trylock(&sbi->extent_tree_lock))
+ if (!mutex_trylock(&sbi->extent_tree_lock))
goto out;
/* 1. remove unreferenced extent tree */
@@ -585,11 +648,11 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
goto unlock_out;
cond_resched();
}
- up_write(&sbi->extent_tree_lock);
+ mutex_unlock(&sbi->extent_tree_lock);
free_node:
/* 2. remove LRU extent entries */
- if (!down_write_trylock(&sbi->extent_tree_lock))
+ if (!mutex_trylock(&sbi->extent_tree_lock))
goto out;
remained = nr_shrink - (node_cnt + tree_cnt);
@@ -619,7 +682,7 @@ free_node:
spin_unlock(&sbi->extent_lock);
unlock_out:
- up_write(&sbi->extent_tree_lock);
+ mutex_unlock(&sbi->extent_tree_lock);
out:
trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt);
@@ -646,6 +709,7 @@ void f2fs_drop_extent_tree(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ bool updated = false;
if (!f2fs_may_extent_tree(inode))
return;
@@ -654,8 +718,13 @@ void f2fs_drop_extent_tree(struct inode *inode)
write_lock(&et->lock);
__free_extent_tree(sbi, et);
- __drop_largest_extent(inode, 0, UINT_MAX);
+ if (et->largest.len) {
+ et->largest.len = 0;
+ updated = true;
+ }
write_unlock(&et->lock);
+ if (updated)
+ f2fs_mark_inode_dirty_sync(inode, true);
}
void f2fs_destroy_extent_tree(struct inode *inode)
@@ -669,10 +738,10 @@ void f2fs_destroy_extent_tree(struct inode *inode)
if (inode->i_nlink && !is_bad_inode(inode) &&
atomic_read(&et->node_cnt)) {
- down_write(&sbi->extent_tree_lock);
+ mutex_lock(&sbi->extent_tree_lock);
list_add_tail(&et->list, &sbi->zombie_list);
atomic_inc(&sbi->total_zombie_tree);
- up_write(&sbi->extent_tree_lock);
+ mutex_unlock(&sbi->extent_tree_lock);
return;
}
@@ -680,12 +749,12 @@ void f2fs_destroy_extent_tree(struct inode *inode)
node_cnt = f2fs_destroy_extent_node(inode);
/* delete extent tree entry in radix tree */
- down_write(&sbi->extent_tree_lock);
+ mutex_lock(&sbi->extent_tree_lock);
f2fs_bug_on(sbi, atomic_read(&et->node_cnt));
radix_tree_delete(&sbi->extent_tree_root, inode->i_ino);
kmem_cache_free(extent_tree_slab, et);
atomic_dec(&sbi->total_ext_tree);
- up_write(&sbi->extent_tree_lock);
+ mutex_unlock(&sbi->extent_tree_lock);
F2FS_I(inode)->extent_tree = NULL;
@@ -714,7 +783,7 @@ void f2fs_update_extent_cache(struct dnode_of_data *dn)
else
blkaddr = dn->data_blkaddr;
- fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
+ fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
dn->ofs_in_node;
f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1);
}
@@ -729,10 +798,10 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn,
f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len);
}
-void init_extent_cache_info(struct f2fs_sb_info *sbi)
+void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi)
{
INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO);
- init_rwsem(&sbi->extent_tree_lock);
+ mutex_init(&sbi->extent_tree_lock);
INIT_LIST_HEAD(&sbi->extent_list);
spin_lock_init(&sbi->extent_lock);
atomic_set(&sbi->total_ext_tree, 0);
@@ -741,7 +810,7 @@ void init_extent_cache_info(struct f2fs_sb_info *sbi)
atomic_set(&sbi->total_ext_node, 0);
}
-int __init create_extent_cache(void)
+int __init f2fs_create_extent_cache(void)
{
extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree",
sizeof(struct extent_tree));
@@ -756,7 +825,7 @@ int __init create_extent_cache(void)
return 0;
}
-void destroy_extent_cache(void)
+void f2fs_destroy_extent_cache(void)
{
kmem_cache_destroy(extent_node_slab);
kmem_cache_destroy(extent_tree_slab);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 9c380885b0fc..520c4a0a7fb9 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1,16 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/f2fs.h
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#ifndef _LINUX_F2FS_H
#define _LINUX_F2FS_H
+#include <linux/uio.h>
#include <linux/types.h>
#include <linux/page-flags.h>
#include <linux/buffer_head.h>
@@ -19,11 +17,17 @@
#include <linux/magic.h>
#include <linux/kobject.h>
#include <linux/sched.h>
+#include <linux/writeback.h>
+#include <linux/cred.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
-#include <linux/fscrypto.h>
+#include <linux/quotaops.h>
#include <crypto/hash.h>
+#include <linux/overflow.h>
+
+#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_F2FS_FS_ENCRYPTION)
+#include <linux/fscrypt.h>
#ifdef CONFIG_F2FS_CHECK_FS
#define f2fs_bug_on(sbi, condition) BUG_ON(condition)
@@ -37,28 +41,36 @@
} while (0)
#endif
-#ifdef CONFIG_F2FS_FAULT_INJECTION
enum {
FAULT_KMALLOC,
+ FAULT_KVMALLOC,
FAULT_PAGE_ALLOC,
+ FAULT_PAGE_GET,
+ FAULT_ALLOC_BIO,
FAULT_ALLOC_NID,
FAULT_ORPHAN,
FAULT_BLOCK,
FAULT_DIR_DEPTH,
FAULT_EVICT_INODE,
- FAULT_IO,
+ FAULT_TRUNCATE,
+ FAULT_READ_IO,
FAULT_CHECKPOINT,
+ FAULT_DISCARD,
+ FAULT_WRITE_IO,
FAULT_MAX,
};
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+#define F2FS_ALL_FAULT_TYPE ((1 << FAULT_MAX) - 1)
+
struct f2fs_fault_info {
atomic_t inject_ops;
unsigned int inject_rate;
unsigned int inject_type;
};
-extern char *fault_name[FAULT_MAX];
-#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type)))
+extern const char *f2fs_fault_name[FAULT_MAX];
+#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type)))
#endif
/*
@@ -83,10 +95,18 @@ extern char *fault_name[FAULT_MAX];
#define F2FS_MOUNT_FAULT_INJECTION 0x00010000
#define F2FS_MOUNT_ADAPTIVE 0x00020000
#define F2FS_MOUNT_LFS 0x00040000
-
-#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
-#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
-#define test_opt(sbi, option) (sbi->mount_opt.opt & F2FS_MOUNT_##option)
+#define F2FS_MOUNT_USRQUOTA 0x00080000
+#define F2FS_MOUNT_GRPQUOTA 0x00100000
+#define F2FS_MOUNT_PRJQUOTA 0x00200000
+#define F2FS_MOUNT_QUOTA 0x00400000
+#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000
+#define F2FS_MOUNT_RESERVE_ROOT 0x01000000
+#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000
+
+#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
+#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
+#define set_opt(sbi, option) (F2FS_OPTION(sbi).opt |= F2FS_MOUNT_##option)
+#define test_opt(sbi, option) (F2FS_OPTION(sbi).opt & F2FS_MOUNT_##option)
#define ver_after(a, b) (typecheck(unsigned long long, a) && \
typecheck(unsigned long long, b) && \
@@ -99,18 +119,54 @@ typedef u32 block_t; /*
typedef u32 nid_t;
struct f2fs_mount_info {
- unsigned int opt;
+ unsigned int opt;
+ int write_io_size_bits; /* Write IO size bits */
+ block_t root_reserved_blocks; /* root reserved blocks */
+ kuid_t s_resuid; /* reserved blocks for uid */
+ kgid_t s_resgid; /* reserved blocks for gid */
+ int active_logs; /* # of active logs */
+ int inline_xattr_size; /* inline xattr size */
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ struct f2fs_fault_info fault_info; /* For fault injection */
+#endif
+#ifdef CONFIG_QUOTA
+ /* Names of quota files with journalled quota */
+ char *s_qf_names[MAXQUOTAS];
+ int s_jquota_fmt; /* Format of quota to use */
+#endif
+ /* For which write hints are passed down to block layer */
+ int whint_mode;
+ int alloc_mode; /* segment allocation policy */
+ int fsync_mode; /* fsync policy */
+ bool test_dummy_encryption; /* test dummy encryption */
};
-#define F2FS_FEATURE_ENCRYPT 0x0001
-#define F2FS_FEATURE_HMSMR 0x0002
+#define F2FS_FEATURE_ENCRYPT 0x0001
+#define F2FS_FEATURE_BLKZONED 0x0002
+#define F2FS_FEATURE_ATOMIC_WRITE 0x0004
+#define F2FS_FEATURE_EXTRA_ATTR 0x0008
+#define F2FS_FEATURE_PRJQUOTA 0x0010
+#define F2FS_FEATURE_INODE_CHKSUM 0x0020
+#define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040
+#define F2FS_FEATURE_QUOTA_INO 0x0080
+#define F2FS_FEATURE_INODE_CRTIME 0x0100
+#define F2FS_FEATURE_LOST_FOUND 0x0200
+#define F2FS_FEATURE_VERITY 0x0400 /* reserved */
+#define F2FS_FEATURE_SB_CHKSUM 0x0800
+
+#define __F2FS_HAS_FEATURE(raw_super, mask) \
+ ((raw_super->feature & cpu_to_le32(mask)) != 0)
+#define F2FS_HAS_FEATURE(sbi, mask) __F2FS_HAS_FEATURE(sbi->raw_super, mask)
+#define F2FS_SET_FEATURE(sbi, mask) \
+ (sbi->raw_super->feature |= cpu_to_le32(mask))
+#define F2FS_CLEAR_FEATURE(sbi, mask) \
+ (sbi->raw_super->feature &= ~cpu_to_le32(mask))
-#define F2FS_HAS_FEATURE(sb, mask) \
- ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
-#define F2FS_SET_FEATURE(sb, mask) \
- F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask)
-#define F2FS_CLEAR_FEATURE(sb, mask) \
- F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)
+/*
+ * Default values for user and/or group using reserved blocks
+ */
+#define F2FS_DEF_RESUID 0
+#define F2FS_DEF_RESGID 0
/*
* For checkpoint manager
@@ -120,28 +176,29 @@ enum {
SIT_BITMAP
};
-enum {
- CP_UMOUNT,
- CP_FASTBOOT,
- CP_SYNC,
- CP_RECOVERY,
- CP_DISCARD,
-};
-
-#define DEF_BATCHED_TRIM_SECTIONS 2
-#define BATCHED_TRIM_SEGMENTS(sbi) \
- (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec)
-#define BATCHED_TRIM_BLOCKS(sbi) \
- (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg)
+#define CP_UMOUNT 0x00000001
+#define CP_FASTBOOT 0x00000002
+#define CP_SYNC 0x00000004
+#define CP_RECOVERY 0x00000008
+#define CP_DISCARD 0x00000010
+#define CP_TRIMMED 0x00000020
+#define CP_PAUSE 0x00000040
+
+#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi)
+#define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */
+#define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */
+#define DEF_MID_DISCARD_ISSUE_TIME 500 /* 500 ms, if device busy */
+#define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */
+#define DEF_DISCARD_URGENT_UTIL 80 /* do more discard over 80% */
#define DEF_CP_INTERVAL 60 /* 60 secs */
#define DEF_IDLE_INTERVAL 5 /* 5 secs */
+#define DEF_DISABLE_INTERVAL 5 /* 5 secs */
struct cp_control {
int reason;
__u64 trim_start;
__u64 trim_end;
__u64 trim_minlen;
- __u64 trimmed;
};
/*
@@ -152,6 +209,7 @@ enum {
META_NAT,
META_SIT,
META_SSA,
+ META_MAX,
META_POR,
DATA_GENERIC,
META_GENERIC,
@@ -162,12 +220,15 @@ enum {
ORPHAN_INO, /* for orphan ino list */
APPEND_INO, /* for append ino list */
UPDATE_INO, /* for update ino list */
+ TRANS_DIR_INO, /* for trasactions dir ino list */
+ FLUSH_INO, /* for multiple device flushing */
MAX_INO_ENTRY, /* max. list */
};
struct ino_entry {
- struct list_head list; /* list head */
- nid_t ino; /* inode number */
+ struct list_head list; /* list head */
+ nid_t ino; /* inode number */
+ unsigned int dirty_device; /* dirty device bitmap */
};
/* for the list of inodes to be GCed */
@@ -176,18 +237,102 @@ struct inode_entry {
struct inode *inode; /* vfs inode pointer */
};
-/* for the list of blockaddresses to be discarded */
+struct fsync_node_entry {
+ struct list_head list; /* list head */
+ struct page *page; /* warm node page pointer */
+ unsigned int seq_id; /* sequence id */
+};
+
+/* for the bitmap indicate blocks to be discarded */
struct discard_entry {
struct list_head list; /* list head */
- block_t blkaddr; /* block address to be discarded */
- int len; /* # of consecutive blocks of the discard */
+ block_t start_blkaddr; /* start blockaddr of current segment */
+ unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */
};
-struct bio_entry {
- struct list_head list;
- struct bio *bio;
- struct completion event;
- int error;
+/* default discard granularity of inner discard thread, unit: block count */
+#define DEFAULT_DISCARD_GRANULARITY 16
+
+/* max discard pend list number */
+#define MAX_PLIST_NUM 512
+#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \
+ (MAX_PLIST_NUM - 1) : (blk_num - 1))
+
+enum {
+ D_PREP, /* initial */
+ D_PARTIAL, /* partially submitted */
+ D_SUBMIT, /* all submitted */
+ D_DONE, /* finished */
+};
+
+struct discard_info {
+ block_t lstart; /* logical start address */
+ block_t len; /* length */
+ block_t start; /* actual start address in dev */
+};
+
+struct discard_cmd {
+ struct rb_node rb_node; /* rb node located in rb-tree */
+ union {
+ struct {
+ block_t lstart; /* logical start address */
+ block_t len; /* length */
+ block_t start; /* actual start address in dev */
+ };
+ struct discard_info di; /* discard info */
+
+ };
+ struct list_head list; /* command list */
+ struct completion wait; /* compleation */
+ struct block_device *bdev; /* bdev */
+ unsigned short ref; /* reference count */
+ unsigned char state; /* state */
+ unsigned char queued; /* queued discard */
+ int error; /* bio error */
+ spinlock_t lock; /* for state/bio_ref updating */
+ unsigned short bio_ref; /* bio reference count */
+};
+
+enum {
+ DPOLICY_BG,
+ DPOLICY_FORCE,
+ DPOLICY_FSTRIM,
+ DPOLICY_UMOUNT,
+ MAX_DPOLICY,
+};
+
+struct discard_policy {
+ int type; /* type of discard */
+ unsigned int min_interval; /* used for candidates exist */
+ unsigned int mid_interval; /* used for device busy */
+ unsigned int max_interval; /* used for candidates not exist */
+ unsigned int max_requests; /* # of discards issued per round */
+ unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */
+ bool io_aware; /* issue discard in idle time */
+ bool sync; /* submit discard with REQ_SYNC flag */
+ bool ordered; /* issue discard by lba order */
+ unsigned int granularity; /* discard granularity */
+};
+
+struct discard_cmd_control {
+ struct task_struct *f2fs_issue_discard; /* discard thread */
+ struct list_head entry_list; /* 4KB discard entry list */
+ struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */
+ struct list_head wait_list; /* store on-flushing entries */
+ struct list_head fstrim_list; /* in-flight discard from fstrim */
+ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */
+ unsigned int discard_wake; /* to wake up discard thread */
+ struct mutex cmd_lock;
+ unsigned int nr_discards; /* # of discards in the list */
+ unsigned int max_discards; /* max. discards to be issued */
+ unsigned int discard_granularity; /* discard granularity */
+ unsigned int undiscard_blks; /* # of undiscard blocks */
+ unsigned int next_pos; /* next discard position */
+ atomic_t issued_discard; /* # of issued discard */
+ atomic_t queued_discard; /* # of queued discard */
+ atomic_t discard_cmd_cnt; /* # of cached cmd count */
+ struct rb_root root; /* root of discard rb-tree */
+ bool rbtree_check; /* config for consistence check */
};
/* for the list of fsync inodes, used only during recovery */
@@ -198,13 +343,13 @@ struct fsync_inode_entry {
block_t last_dentry; /* block address locating the last dentry */
};
-#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats))
-#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits))
+#define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats))
+#define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits))
-#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne)
-#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid)
-#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se)
-#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno)
+#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne)
+#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid)
+#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se)
+#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno)
#define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl))
#define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl))
@@ -212,6 +357,7 @@ struct fsync_inode_entry {
static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i)
{
int before = nats_in_cursum(journal);
+
journal->n_nats = cpu_to_le16(before + i);
return before;
}
@@ -219,6 +365,7 @@ static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i)
static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i)
{
int before = sits_in_cursum(journal);
+
journal->n_sits = cpu_to_le16(before + i);
return before;
}
@@ -244,11 +391,20 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3)
#define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4)
#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5)
-#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6)
+#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32)
#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7)
-#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8)
+#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \
+ struct f2fs_defragment)
#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \
struct f2fs_move_range)
+#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \
+ struct f2fs_flush_device)
+#define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \
+ struct f2fs_gc_range)
+#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32)
+#define F2FS_IOC_SET_PIN_FILE _IOW(F2FS_IOCTL_MAGIC, 13, __u32)
+#define F2FS_IOC_GET_PIN_FILE _IOR(F2FS_IOCTL_MAGIC, 14, __u32)
+#define F2FS_IOC_PRECACHE_EXTENTS _IO(F2FS_IOCTL_MAGIC, 15)
#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
@@ -263,6 +419,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
#define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */
#define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */
#define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */
+#define F2FS_GOING_DOWN_NEED_FSCK 0x4 /* going down to trigger fsck */
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -273,6 +430,12 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
#define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION
#endif
+struct f2fs_gc_range {
+ u32 sync;
+ u64 start;
+ u64 len;
+};
+
struct f2fs_defragment {
u64 start;
u64 len;
@@ -285,36 +448,70 @@ struct f2fs_move_range {
u64 len; /* size to move */
};
+struct f2fs_flush_device {
+ u32 dev_num; /* device number to flush */
+ u32 segments; /* # of segments to flush */
+};
+
+/* for inline stuff */
+#define DEF_INLINE_RESERVED_SIZE 1
+#define DEF_MIN_INLINE_SIZE 1
+static inline int get_extra_isize(struct inode *inode);
+static inline int get_inline_xattr_addrs(struct inode *inode);
+#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \
+ (CUR_ADDRS_PER_INODE(inode) - \
+ get_inline_xattr_addrs(inode) - \
+ DEF_INLINE_RESERVED_SIZE))
+
+/* for inline dir */
+#define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \
+ ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \
+ BITS_PER_BYTE + 1))
+#define INLINE_DENTRY_BITMAP_SIZE(inode) ((NR_INLINE_DENTRY(inode) + \
+ BITS_PER_BYTE - 1) / BITS_PER_BYTE)
+#define INLINE_RESERVED_SIZE(inode) (MAX_INLINE_DATA(inode) - \
+ ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \
+ NR_INLINE_DENTRY(inode) + \
+ INLINE_DENTRY_BITMAP_SIZE(inode)))
+
/*
* For INODE and NODE manager
*/
/* for directory operations */
struct f2fs_dentry_ptr {
struct inode *inode;
- const void *bitmap;
+ void *bitmap;
struct f2fs_dir_entry *dentry;
__u8 (*filename)[F2FS_SLOT_LEN];
int max;
+ int nr_bitmap;
};
-static inline void make_dentry_ptr(struct inode *inode,
- struct f2fs_dentry_ptr *d, void *src, int type)
+static inline void make_dentry_ptr_block(struct inode *inode,
+ struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t)
{
d->inode = inode;
+ d->max = NR_DENTRY_IN_BLOCK;
+ d->nr_bitmap = SIZE_OF_DENTRY_BITMAP;
+ d->bitmap = t->dentry_bitmap;
+ d->dentry = t->dentry;
+ d->filename = t->filename;
+}
- if (type == 1) {
- struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src;
- d->max = NR_DENTRY_IN_BLOCK;
- d->bitmap = &t->dentry_bitmap;
- d->dentry = t->dentry;
- d->filename = t->filename;
- } else {
- struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src;
- d->max = NR_INLINE_DENTRY;
- d->bitmap = &t->dentry_bitmap;
- d->dentry = t->dentry;
- d->filename = t->filename;
- }
+static inline void make_dentry_ptr_inline(struct inode *inode,
+ struct f2fs_dentry_ptr *d, void *t)
+{
+ int entry_cnt = NR_INLINE_DENTRY(inode);
+ int bitmap_size = INLINE_DENTRY_BITMAP_SIZE(inode);
+ int reserved_size = INLINE_RESERVED_SIZE(inode);
+
+ d->inode = inode;
+ d->max = entry_cnt;
+ d->nr_bitmap = bitmap_size;
+ d->bitmap = t;
+ d->dentry = t + bitmap_size + reserved_size;
+ d->filename = t + bitmap_size + reserved_size +
+ SIZE_OF_DIR_ENTRY * entry_cnt;
}
/*
@@ -333,29 +530,37 @@ enum {
*/
};
+#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */
+
+/* maximum retry quota flush count */
+#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8
+
#define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */
#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */
-/* vector size for gang look-up from extent cache that consists of radix tree */
-#define EXT_TREE_VEC_SIZE 64
-
/* for in-memory extent cache entry */
#define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */
/* number of extent info in extent cache we try to shrink */
#define EXTENT_CACHE_SHRINK_NUMBER 128
+struct rb_entry {
+ struct rb_node rb_node; /* rb node located in rb-tree */
+ unsigned int ofs; /* start offset of the entry */
+ unsigned int len; /* length of the entry */
+};
+
struct extent_info {
unsigned int fofs; /* start offset in a file */
- u32 blk; /* start block address of the extent */
unsigned int len; /* length of the extent */
+ u32 blk; /* start block address of the extent */
};
struct extent_node {
struct rb_node rb_node; /* rb node located in rb-tree */
- struct list_head list; /* node in global extent list of sbi */
struct extent_info ei; /* extent info */
+ struct list_head list; /* node in global extent list of sbi */
struct extent_tree *et; /* extent tree pointer */
};
@@ -367,6 +572,7 @@ struct extent_tree {
struct list_head list; /* to be used by sbi->zombie_list */
rwlock_t lock; /* protect extent info rb-tree */
atomic_t node_cnt; /* # of extent node in rb-tree*/
+ bool largest_updated; /* largest extent updated */
};
/*
@@ -386,15 +592,21 @@ struct f2fs_map_blocks {
unsigned int m_len;
unsigned int m_flags;
pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */
+ pgoff_t *m_next_extent; /* point to next possible extent */
+ int m_seg_type;
+ bool m_may_create; /* indicate it is from write path */
};
/* for flag in get_data_block */
-#define F2FS_GET_BLOCK_READ 0
-#define F2FS_GET_BLOCK_DIO 1
-#define F2FS_GET_BLOCK_FIEMAP 2
-#define F2FS_GET_BLOCK_BMAP 3
-#define F2FS_GET_BLOCK_PRE_DIO 4
-#define F2FS_GET_BLOCK_PRE_AIO 5
+enum {
+ F2FS_GET_BLOCK_DEFAULT,
+ F2FS_GET_BLOCK_FIEMAP,
+ F2FS_GET_BLOCK_BMAP,
+ F2FS_GET_BLOCK_DIO,
+ F2FS_GET_BLOCK_PRE_DIO,
+ F2FS_GET_BLOCK_PRE_AIO,
+ F2FS_GET_BLOCK_PRECACHE,
+};
/*
* i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
@@ -403,6 +615,11 @@ struct f2fs_map_blocks {
#define FADVISE_LOST_PINO_BIT 0x02
#define FADVISE_ENCRYPT_BIT 0x04
#define FADVISE_ENC_NAME_BIT 0x08
+#define FADVISE_KEEP_SIZE_BIT 0x10
+#define FADVISE_HOT_BIT 0x20
+#define FADVISE_VERITY_BIT 0x40 /* reserved */
+
+#define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT)
#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT)
#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT)
@@ -415,15 +632,28 @@ struct f2fs_map_blocks {
#define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT)
#define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT)
#define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT)
+#define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT)
+#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT)
+#define file_is_hot(inode) is_file(inode, FADVISE_HOT_BIT)
+#define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT)
+#define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT)
#define DEF_DIR_LEVEL 0
+enum {
+ GC_FAILURE_PIN,
+ GC_FAILURE_ATOMIC,
+ MAX_GC_FAILURE
+};
+
struct f2fs_inode_info {
struct inode vfs_inode; /* serve a vfs inode */
unsigned long i_flags; /* keep an inode flags for ioctl */
unsigned char i_advise; /* use to give file attribute hints */
unsigned char i_dir_level; /* use for dentry level for large dir */
- unsigned int i_current_depth; /* use only in directory structure */
+ unsigned int i_current_depth; /* only for directory depth */
+ /* for gc failure statistic */
+ unsigned int i_gc_failures[MAX_GC_FAILURE];
unsigned int i_pino; /* parent inode number */
umode_t i_acl_mode; /* keep file acl mode temporarily */
@@ -434,16 +664,34 @@ struct f2fs_inode_info {
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
struct task_struct *task; /* lookup and create consistency */
+ struct task_struct *cp_task; /* separate cp/wb IO stats*/
nid_t i_xattr_nid; /* node id that contains xattrs */
- unsigned long long xattr_ver; /* cp version of xattr modification */
loff_t last_disk_size; /* lastly written file size */
+#ifdef CONFIG_QUOTA
+ struct dquot *i_dquot[MAXQUOTAS];
+
+ /* quota space reservation, managed internally by quota code */
+ qsize_t i_reserved_quota;
+#endif
struct list_head dirty_list; /* dirty list for dirs and files */
struct list_head gdirty_list; /* linked in global dirty list */
+ struct list_head inmem_ilist; /* list for inmem inodes */
struct list_head inmem_pages; /* inmemory pages managed by f2fs */
+ struct task_struct *inmem_task; /* store inmemory task */
struct mutex inmem_lock; /* lock for inmemory pages */
struct extent_tree *extent_tree; /* cached extent_tree entry */
- struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */
+
+ /* avoid racing between foreground op and gc */
+ struct rw_semaphore i_gc_rwsem[2];
+ struct rw_semaphore i_mmap_sem;
+ struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */
+
+ int i_extra_isize; /* size of extra space located in i_addr */
+ kprojid_t i_projid; /* id for project quota */
+ int i_inline_xattr_size; /* inline xattr size */
+ struct timespec i_crtime; /* inode creation time */
+ struct timespec i_disk_time[4]; /* inode disk times */
};
static inline void get_extent_info(struct extent_info *ext,
@@ -470,11 +718,23 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs,
ei->len = len;
}
-static inline bool __is_extent_same(struct extent_info *ei1,
- struct extent_info *ei2)
+static inline bool __is_discard_mergeable(struct discard_info *back,
+ struct discard_info *front, unsigned int max_len)
{
- return (ei1->fofs == ei2->fofs && ei1->blk == ei2->blk &&
- ei1->len == ei2->len);
+ return (back->lstart + back->len == front->lstart) &&
+ (back->len + front->len <= max_len);
+}
+
+static inline bool __is_discard_back_mergeable(struct discard_info *cur,
+ struct discard_info *back, unsigned int max_len)
+{
+ return __is_discard_mergeable(back, cur, max_len);
+}
+
+static inline bool __is_discard_front_mergeable(struct discard_info *cur,
+ struct discard_info *front, unsigned int max_len)
+{
+ return __is_discard_mergeable(cur, front, max_len);
}
static inline bool __is_extent_mergeable(struct extent_info *back,
@@ -496,20 +756,29 @@ static inline bool __is_front_mergeable(struct extent_info *cur,
return __is_extent_mergeable(cur, front);
}
-extern void f2fs_mark_inode_dirty_sync(struct inode *);
-static inline void __try_update_largest_extent(struct inode *inode,
- struct extent_tree *et, struct extent_node *en)
+extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync);
+static inline void __try_update_largest_extent(struct extent_tree *et,
+ struct extent_node *en)
{
if (en->ei.len > et->largest.len) {
et->largest = en->ei;
- f2fs_mark_inode_dirty_sync(inode);
+ et->largest_updated = true;
}
}
+/*
+ * For free nid management
+ */
+enum nid_state {
+ FREE_NID, /* newly added to free nid list */
+ PREALLOC_NID, /* it is preallocated */
+ MAX_NID_STATE,
+};
+
struct f2fs_nm_info {
block_t nat_blkaddr; /* base disk address of NAT */
nid_t max_nid; /* maximum possible node ids */
- nid_t available_nids; /* maximum available node ids */
+ nid_t available_nids; /* # of available node ids */
nid_t next_scan_nid; /* the next nid to be scanned */
unsigned int ram_thresh; /* control the memory footprint */
unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */
@@ -520,18 +789,31 @@ struct f2fs_nm_info {
struct radix_tree_root nat_set_root;/* root of the nat set cache */
struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */
struct list_head nat_entries; /* cached nat entry list (clean) */
+ spinlock_t nat_list_lock; /* protect clean nat entry list */
unsigned int nat_cnt; /* the # of cached nat entries */
unsigned int dirty_nat_cnt; /* total num of nat entries in set */
+ unsigned int nat_blocks; /* # of nat blocks */
/* free node ids management */
struct radix_tree_root free_nid_root;/* root of the free_nid cache */
- struct list_head free_nid_list; /* a list for free nids */
- spinlock_t free_nid_list_lock; /* protect free nid list */
- unsigned int fcnt; /* the number of free node id */
+ struct list_head free_nid_list; /* list for free nids excluding preallocated nids */
+ unsigned int nid_cnt[MAX_NID_STATE]; /* the number of free node id */
+ spinlock_t nid_list_lock; /* protect nid lists ops */
struct mutex build_lock; /* lock for build free nids */
+ unsigned char **free_nid_bitmap;
+ unsigned char *nat_block_bitmap;
+ unsigned short *free_nid_count; /* free nid count of NAT block */
/* for checkpoint */
char *nat_bitmap; /* NAT bitmap pointer */
+
+ unsigned int nat_bits_blocks; /* # of nat bits blocks */
+ unsigned char *nat_bits; /* NAT bits blocks */
+ unsigned char *full_nat_bits; /* full NAT pages */
+ unsigned char *empty_nat_bits; /* empty NAT pages */
+#ifdef CONFIG_F2FS_CHECK_FS
+ char *nat_bitmap_mir; /* NAT bitmap mirror */
+#endif
int bitmap_size; /* bitmap size */
};
@@ -588,19 +870,20 @@ enum {
CURSEG_WARM_NODE, /* direct node blocks of normal files */
CURSEG_COLD_NODE, /* indirect node blocks */
NO_CHECK_TYPE,
- CURSEG_DIRECT_IO, /* to use for the direct IO path */
};
struct flush_cmd {
struct completion wait;
struct llist_node llnode;
+ nid_t ino;
int ret;
};
struct flush_cmd_control {
struct task_struct *f2fs_issue_flush; /* flush thread */
wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */
- atomic_t submit_flush; /* # of issued flushes */
+ atomic_t issued_flush; /* # of issued flushes */
+ atomic_t queued_flush; /* # of queued flushes */
struct llist_head issue_list; /* list for command issue */
struct llist_node *dispatch_list; /* list for command dispatch */
};
@@ -611,6 +894,8 @@ struct f2fs_sm_info {
struct dirty_seglist_info *dirty_info; /* dirty segment information */
struct curseg_info *curseg_array; /* active segment information */
+ struct rw_semaphore curseg_lock; /* for preventing curseg change */
+
block_t seg0_blkaddr; /* block address of 0'th segment */
block_t main_blkaddr; /* start block address of main area */
block_t ssa_blkaddr; /* start block address of SSA area */
@@ -623,12 +908,6 @@ struct f2fs_sm_info {
/* a threshold to reclaim prefree segments */
unsigned int rec_prefree_segments;
- /* for small discard management */
- struct list_head discard_list; /* 4KB discard list */
- struct list_head wait_list; /* linked with issued discard bio */
- int nr_discards; /* # of discards in the list */
- int max_discards; /* max. discards to be issued */
-
/* for batched trimming */
unsigned int trim_sections; /* # of sections to trim */
@@ -637,10 +916,15 @@ struct f2fs_sm_info {
unsigned int ipu_policy; /* in-place-update policy */
unsigned int min_ipu_util; /* in-place-update threshold */
unsigned int min_fsync_blocks; /* threshold for fsync */
+ unsigned int min_seq_blocks; /* threshold for sequential blocks */
+ unsigned int min_hot_blocks; /* threshold for hot block allocation */
+ unsigned int min_ssr_sections; /* threshold to trigger SSR allocation */
/* for flush command control */
- struct flush_cmd_control *cmd_control_info;
+ struct flush_cmd_control *fcc_info;
+ /* for discard command control */
+ struct discard_cmd_control *dcc_info;
};
/*
@@ -652,13 +936,22 @@ struct f2fs_sm_info {
* f2fs monitors the number of several block types such as on-writeback,
* dirty dentry blocks, dirty node blocks, and dirty meta blocks.
*/
+#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA)
enum count_type {
F2FS_DIRTY_DENTS,
F2FS_DIRTY_DATA,
+ F2FS_DIRTY_QDATA,
F2FS_DIRTY_NODES,
F2FS_DIRTY_META,
F2FS_INMEM_PAGES,
F2FS_DIRTY_IMETA,
+ F2FS_WB_CP_DATA,
+ F2FS_WB_DATA,
+ F2FS_RD_DATA,
+ F2FS_RD_NODE,
+ F2FS_RD_META,
+ F2FS_DIO_WRITE,
+ F2FS_DIO_READ,
NR_COUNT_TYPE,
};
@@ -682,36 +975,107 @@ enum page_type {
META_FLUSH,
INMEM, /* the below types are used by tracepoints only. */
INMEM_DROP,
+ INMEM_INVALIDATE,
INMEM_REVOKE,
IPU,
OPU,
};
+enum temp_type {
+ HOT = 0, /* must be zero for meta bio */
+ WARM,
+ COLD,
+ NR_TEMP_TYPE,
+};
+
+enum need_lock_type {
+ LOCK_REQ = 0,
+ LOCK_DONE,
+ LOCK_RETRY,
+};
+
+enum cp_reason_type {
+ CP_NO_NEEDED,
+ CP_NON_REGULAR,
+ CP_HARDLINK,
+ CP_SB_NEED_CP,
+ CP_WRONG_PINO,
+ CP_NO_SPC_ROLL,
+ CP_NODE_NEED_CP,
+ CP_FASTBOOT_MODE,
+ CP_SPEC_LOG_NUM,
+ CP_RECOVER_DIR,
+};
+
+enum iostat_type {
+ APP_DIRECT_IO, /* app direct IOs */
+ APP_BUFFERED_IO, /* app buffered IOs */
+ APP_WRITE_IO, /* app write IOs */
+ APP_MAPPED_IO, /* app mapped IOs */
+ FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */
+ FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */
+ FS_META_IO, /* meta IOs from kworker/reclaimer */
+ FS_GC_DATA_IO, /* data IOs from forground gc */
+ FS_GC_NODE_IO, /* node IOs from forground gc */
+ FS_CP_DATA_IO, /* data IOs from checkpoint */
+ FS_CP_NODE_IO, /* node IOs from checkpoint */
+ FS_CP_META_IO, /* meta IOs from checkpoint */
+ FS_DISCARD, /* discard */
+ NR_IO_TYPE,
+};
+
struct f2fs_io_info {
struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */
+ nid_t ino; /* inode number */
enum page_type type; /* contains DATA/NODE/META/META_FLUSH */
+ enum temp_type temp; /* contains HOT/WARM/COLD */
int op; /* contains REQ_OP_ */
- int op_flags; /* rq_flag_bits */
+ int op_flags; /* req_flag_bits */
block_t new_blkaddr; /* new block address to be written */
block_t old_blkaddr; /* old block address before Cow */
struct page *page; /* page to be written */
struct page *encrypted_page; /* encrypted page */
+ struct list_head list; /* serialize IOs */
+ bool submitted; /* indicate IO submission */
+ int need_lock; /* indicate we need to lock cp_rwsem */
+ bool in_list; /* indicate fio is in io_list */
bool is_meta; /* indicate borrow meta inode mapping or not */
+ bool retry; /* need to reallocate block address */
+ enum iostat_type io_type; /* io type */
+ struct writeback_control *io_wbc; /* writeback control */
+ unsigned char version; /* version of the node */
};
-#define is_read_io(rw) (rw == READ)
+#define is_read_io(rw) ((rw) == READ)
struct f2fs_bio_info {
struct f2fs_sb_info *sbi; /* f2fs superblock */
struct bio *bio; /* bios to merge */
sector_t last_block_in_bio; /* last block number */
struct f2fs_io_info fio; /* store buffered io info. */
struct rw_semaphore io_rwsem; /* blocking op for bio */
+ spinlock_t io_lock; /* serialize DATA/NODE IOs */
+ struct list_head io_list; /* track fios */
+};
+
+#define FDEV(i) (sbi->devs[i])
+#define RDEV(i) (raw_super->devs[i])
+struct f2fs_dev_info {
+ struct block_device *bdev;
+ char path[MAX_PATH_LEN];
+ unsigned int total_segments;
+ block_t start_blk;
+ block_t end_blk;
+#ifdef CONFIG_BLK_DEV_ZONED
+ unsigned int nr_blkz; /* Total number of zones */
+ u8 *blkz_type; /* Array of zones type */
+#endif
};
enum inode_type {
DIR_INODE, /* for dirty dir inode */
FILE_INODE, /* for dirty regular/symlink inode */
DIRTY_META, /* for all dirtied inode metadata */
+ ATOMIC_FILE, /* for all atomic files */
NR_INODE_TYPE,
};
@@ -731,29 +1095,68 @@ enum {
SBI_POR_DOING, /* recovery is doing or not */
SBI_NEED_SB_WRITE, /* need to recover superblock */
SBI_NEED_CP, /* need to checkpoint */
+ SBI_IS_SHUTDOWN, /* shutdown by ioctl */
+ SBI_IS_RECOVERED, /* recovered orphan/data */
+ SBI_CP_DISABLED, /* CP was disabled last mount */
+ SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */
+ SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */
+ SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */
};
enum {
CP_TIME,
REQ_TIME,
+ DISCARD_TIME,
+ GC_TIME,
+ DISABLE_TIME,
MAX_TIME,
};
+enum {
+ GC_NORMAL,
+ GC_IDLE_CB,
+ GC_IDLE_GREEDY,
+ GC_URGENT,
+};
+
+enum {
+ WHINT_MODE_OFF, /* not pass down write hints */
+ WHINT_MODE_USER, /* try to pass down hints given by users */
+ WHINT_MODE_FS, /* pass down hints with F2FS policy */
+};
+
+enum {
+ ALLOC_MODE_DEFAULT, /* stay default */
+ ALLOC_MODE_REUSE, /* reuse segments as much as possible */
+};
+
+enum fsync_mode {
+ FSYNC_MODE_POSIX, /* fsync follows posix semantics */
+ FSYNC_MODE_STRICT, /* fsync behaves in line with ext4 */
+ FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */
+};
+
#ifdef CONFIG_F2FS_FS_ENCRYPTION
-#define F2FS_KEY_DESC_PREFIX "f2fs:"
-#define F2FS_KEY_DESC_PREFIX_SIZE 5
+#define DUMMY_ENCRYPTION_ENABLED(sbi) \
+ (unlikely(F2FS_OPTION(sbi).test_dummy_encryption))
+#else
+#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
#endif
+
struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
struct proc_dir_entry *s_proc; /* proc entry */
struct f2fs_super_block *raw_super; /* raw super block pointer */
+ struct rw_semaphore sb_lock; /* lock for raw super block */
int valid_super_block; /* valid super block no */
unsigned long s_flag; /* flags for sbi */
+ struct mutex writepages; /* mutex for writepages() */
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
- u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE];
- u8 key_prefix_size;
+#ifdef CONFIG_BLK_DEV_ZONED
+ unsigned int blocks_per_blkz; /* F2FS blocks per zone */
+ unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */
#endif
+
/* for node-related operations */
struct f2fs_nm_info *nm_info; /* node manager */
struct inode *node_inode; /* cache node blocks */
@@ -762,9 +1165,10 @@ struct f2fs_sb_info {
struct f2fs_sm_info *sm_info; /* segment manager */
/* for bio operations */
- struct f2fs_bio_info read_io; /* for read bios */
- struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */
- struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */
+ struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */
+ /* keep migration IO order for LFS mode */
+ struct rw_semaphore io_order_lock;
+ mempool_t *write_io_dummy; /* Dummy pages */
/* for checkpoint */
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
@@ -774,12 +1178,18 @@ struct f2fs_sb_info {
struct mutex cp_mutex; /* checkpoint procedure lock */
struct rw_semaphore cp_rwsem; /* blocking FS operations */
struct rw_semaphore node_write; /* locking node writes */
+ struct rw_semaphore node_change; /* locking node change */
wait_queue_head_t cp_wait;
unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
long interval_time[MAX_TIME]; /* to store thresholds */
struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */
+ spinlock_t fsync_node_lock; /* for node entry lock */
+ struct list_head fsync_node_list; /* node list head */
+ unsigned int fsync_seg_id; /* sequence id */
+ unsigned int fsync_node_num; /* number of node entries */
+
/* for orphan inode, use 0'th array */
unsigned int max_orphans; /* max orphan inodes */
@@ -789,7 +1199,7 @@ struct f2fs_sb_info {
/* for extent tree cache */
struct radix_tree_root extent_tree_root;/* cache extent cache entries */
- struct rw_semaphore extent_tree_lock; /* locking extent radix tree */
+ struct mutex extent_tree_lock; /* locking extent radix tree */
struct list_head extent_list; /* lru list for shrinker */
spinlock_t extent_lock; /* locking extent lru list */
atomic_t total_ext_tree; /* extent tree count */
@@ -812,21 +1222,31 @@ struct f2fs_sb_info {
unsigned int total_node_count; /* total node block count */
unsigned int total_valid_node_count; /* valid node block count */
loff_t max_file_blocks; /* max block index of file */
- int active_logs; /* # of active logs */
int dir_level; /* directory level */
+ int readdir_ra; /* readahead inode in readdir */
block_t user_block_count; /* # of user blocks */
block_t total_valid_block_count; /* # of valid blocks */
block_t discard_blks; /* discard command candidats */
block_t last_valid_block_count; /* for recovery */
+ block_t reserved_blocks; /* configurable reserved blocks */
+ block_t current_reserved_blocks; /* current reserved blocks */
+
+ /* Additional tracking for no checkpoint mode */
+ block_t unusable_block_count; /* # of blocks saved by last cp */
+
+ unsigned int nquota_files; /* # of quota sysfile */
+
u32 s_next_generation; /* for NFS support */
- atomic_t nr_wb_bios; /* # of writeback bios */
/* # of pages, see count_type */
atomic_t nr_pages[NR_COUNT_TYPE];
/* # of allocated blocks */
struct percpu_counter alloc_valid_block_count;
+ /* writeback control */
+ atomic_t wb_sync_req[META]; /* count # of WB_SYNC threads */
+
/* valid inode count */
struct percpu_counter total_valid_inode_count;
@@ -836,12 +1256,19 @@ struct f2fs_sb_info {
struct mutex gc_mutex; /* mutex for GC */
struct f2fs_gc_kthread *gc_thread; /* GC thread */
unsigned int cur_victim_sec; /* current victim section num */
+ unsigned int gc_mode; /* current GC state */
+ unsigned int next_victim_seg[2]; /* next segment in victim section */
+ /* for skip statistic */
+ unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */
+ unsigned long long skipped_gc_rwsem; /* FG_GC only */
- /* threshold for converting bg victims for fg */
- u64 fggc_threshold;
+ /* threshold for gc trials on pinned files */
+ u64 gc_pin_file_threshold;
/* maximum # of trials to find a victim segment for SSR and GC */
unsigned int max_victim_search;
+ /* migration granularity of garbage collection, unit: segment */
+ unsigned int migration_granularity;
/*
* for stat information.
@@ -849,6 +1276,7 @@ struct f2fs_sb_info {
*/
#ifdef CONFIG_F2FS_STAT_FS
struct f2fs_stat_info *stat_info; /* FS status information */
+ atomic_t meta_count[META_MAX]; /* # of meta blocks */
unsigned int segment_count[2]; /* # of allocated segments */
unsigned int block_count[2]; /* # of allocated blocks */
atomic_t inplace_count; /* # of inplace update */
@@ -859,18 +1287,32 @@ struct f2fs_sb_info {
atomic_t inline_xattr; /* # of inline_xattr inodes */
atomic_t inline_inode; /* # of inline_data inodes */
atomic_t inline_dir; /* # of inline_dentry inodes */
+ atomic_t aw_cnt; /* # of atomic writes */
+ atomic_t vw_cnt; /* # of volatile writes */
+ atomic_t max_aw_cnt; /* max # of atomic writes */
+ atomic_t max_vw_cnt; /* max # of volatile writes */
int bg_gc; /* background gc calls */
+ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */
+ unsigned int other_skip_bggc; /* skip background gc for other reasons */
unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */
#endif
- unsigned int last_victim[2]; /* last victim segment # */
spinlock_t stat_lock; /* lock for stat operations */
+ /* For app/fs IO statistics */
+ spinlock_t iostat_lock;
+ unsigned long long write_iostat[NR_IO_TYPE];
+ bool iostat_enable;
+
/* For sysfs suppport */
struct kobject s_kobj;
struct completion s_kobj_unregister;
/* For shrinker support */
struct list_head s_list;
+ int s_ndevs; /* number of devices */
+ struct f2fs_dev_info *devs; /* for device list */
+ unsigned int dirty_device; /* for checkpoint data flush */
+ spinlock_t dev_lock; /* protect dirty_device */
struct mutex umount_mutex;
unsigned int shrinker_run_no;
@@ -881,16 +1323,25 @@ struct f2fs_sb_info {
/* Reference to checksum algorithm driver via cryptoapi */
struct crypto_shash *s_chksum_driver;
- /* For fault injection */
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- struct f2fs_fault_info fault_info;
-#endif
+ /* Precomputed FS UUID checksum for seeding other checksums */
+ __u32 s_chksum_seed;
+};
+
+struct f2fs_private_dio {
+ struct inode *inode;
+ void *orig_private;
+ bio_end_io_t *orig_end_io;
+ bool write;
};
#ifdef CONFIG_F2FS_FAULT_INJECTION
+#define f2fs_show_injection_info(type) \
+ printk_ratelimited("%sF2FS-fs : inject %s in %s of %pF\n", \
+ KERN_INFO, f2fs_fault_name[type], \
+ __func__, __builtin_return_address(0))
static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
{
- struct f2fs_fault_info *ffi = &sbi->fault_info;
+ struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info;
if (!ffi->inject_rate)
return false;
@@ -901,69 +1352,87 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
atomic_inc(&ffi->inject_ops);
if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) {
atomic_set(&ffi->inject_ops, 0);
- printk("%sF2FS-fs : inject %s in %pF\n",
- KERN_INFO,
- fault_name[type],
- __builtin_return_address(0));
return true;
}
return false;
}
+#else
+#define f2fs_show_injection_info(type) do { } while (0)
+static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
+{
+ return false;
+}
#endif
/* For write statistics. Suppose sector size is 512 bytes,
* and the return value is in kbytes. s is of struct f2fs_sb_info.
*/
#define BD_PART_WRITTEN(s) \
-(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \
- s->sectors_written_start) >> 1)
+(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) - \
+ (s)->sectors_written_start) >> 1)
static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
{
- sbi->last_time[type] = jiffies;
+ unsigned long now = jiffies;
+
+ sbi->last_time[type] = now;
+
+ /* DISCARD_TIME and GC_TIME are based on REQ_TIME */
+ if (type == REQ_TIME) {
+ sbi->last_time[DISCARD_TIME] = now;
+ sbi->last_time[GC_TIME] = now;
+ }
}
static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type)
{
- struct timespec ts = {sbi->interval_time[type], 0};
- unsigned long interval = timespec_to_jiffies(&ts);
+ unsigned long interval = sbi->interval_time[type] * HZ;
return time_after(jiffies, sbi->last_time[type] + interval);
}
-static inline bool is_idle(struct f2fs_sb_info *sbi)
+static inline unsigned int f2fs_time_to_wait(struct f2fs_sb_info *sbi,
+ int type)
{
- struct block_device *bdev = sbi->sb->s_bdev;
- struct request_queue *q = bdev_get_queue(bdev);
- struct request_list *rl = &q->root_rl;
+ unsigned long interval = sbi->interval_time[type] * HZ;
+ unsigned int wait_ms = 0;
+ long delta;
- if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC])
- return 0;
+ delta = (sbi->last_time[type] + interval) - jiffies;
+ if (delta > 0)
+ wait_ms = jiffies_to_msecs(delta);
- return f2fs_time_over(sbi, REQ_TIME);
+ return wait_ms;
}
/*
* Inline functions
*/
-static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address,
- unsigned int length)
+static inline u32 __f2fs_crc32(struct f2fs_sb_info *sbi, u32 crc,
+ const void *address, unsigned int length)
{
- SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver);
- u32 *ctx = (u32 *)shash_desc_ctx(shash);
- u32 retval;
+ struct {
+ struct shash_desc shash;
+ char ctx[4];
+ } desc;
int err;
- shash->tfm = sbi->s_chksum_driver;
- shash->flags = 0;
- *ctx = F2FS_SUPER_MAGIC;
+ BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx));
- err = crypto_shash_update(shash, address, length);
+ desc.shash.tfm = sbi->s_chksum_driver;
+ desc.shash.flags = 0;
+ *(u32 *)desc.ctx = crc;
+
+ err = crypto_shash_update(&desc.shash, address, length);
BUG_ON(err);
- retval = *ctx;
- barrier_data(ctx);
- return retval;
+ return *(u32 *)desc.ctx;
+}
+
+static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address,
+ unsigned int length)
+{
+ return __f2fs_crc32(sbi, F2FS_SUPER_MAGIC, address, length);
}
static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc,
@@ -972,6 +1441,12 @@ static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc,
return f2fs_crc32(sbi, buf, buf_size) == blk_crc;
}
+static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc,
+ const void *address, unsigned int length)
+{
+ return __f2fs_crc32(sbi, crc, address, length);
+}
+
static inline struct f2fs_inode_info *F2FS_I(struct inode *inode)
{
return container_of(inode, struct f2fs_inode_info, vfs_inode);
@@ -1072,6 +1547,19 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp)
return le64_to_cpu(cp->checkpoint_ver);
}
+static inline unsigned long f2fs_qf_ino(struct super_block *sb, int type)
+{
+ if (type < F2FS_MAX_QUOTAS)
+ return le32_to_cpu(F2FS_SB(sb)->raw_super->qf_ino[type]);
+ return 0;
+}
+
+static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp)
+{
+ size_t crc_offset = le32_to_cpu(cp->checksum_offset);
+ return le32_to_cpu(*((__le32 *)((unsigned char *)cp + crc_offset)));
+}
+
static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
{
unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
@@ -1095,9 +1583,11 @@ static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
{
- spin_lock(&sbi->cp_lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&sbi->cp_lock, flags);
__set_ckpt_flags(F2FS_CKPT(sbi), f);
- spin_unlock(&sbi->cp_lock);
+ spin_unlock_irqrestore(&sbi->cp_lock, flags);
}
static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
@@ -1111,16 +1601,38 @@ static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f
static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
{
- spin_lock(&sbi->cp_lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&sbi->cp_lock, flags);
__clear_ckpt_flags(F2FS_CKPT(sbi), f);
- spin_unlock(&sbi->cp_lock);
+ spin_unlock_irqrestore(&sbi->cp_lock, flags);
+}
+
+static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock)
+{
+ unsigned long flags;
+
+ /*
+ * In order to re-enable nat_bits we need to call fsck.f2fs by
+ * set_sbi_flag(sbi, SBI_NEED_FSCK). But it may give huge cost,
+ * so let's rely on regular fsck or unclean shutdown.
+ */
+
+ if (lock)
+ spin_lock_irqsave(&sbi->cp_lock, flags);
+ __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG);
+ kvfree(NM_I(sbi)->nat_bits);
+ NM_I(sbi)->nat_bits = NULL;
+ if (lock)
+ spin_unlock_irqrestore(&sbi->cp_lock, flags);
}
-static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi)
+static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi,
+ struct cp_control *cpc)
{
- struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
+ bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
- return blk_queue_discard(q);
+ return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set;
}
static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
@@ -1128,6 +1640,11 @@ static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
down_read(&sbi->cp_rwsem);
}
+static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi)
+{
+ return down_read_trylock(&sbi->cp_rwsem);
+}
+
static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
{
up_read(&sbi->cp_rwsem);
@@ -1156,7 +1673,7 @@ static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
static inline bool __remain_node_summaries(int reason)
{
- return (reason == CP_UMOUNT || reason == CP_FASTBOOT);
+ return (reason & (CP_UMOUNT | CP_FASTBOOT));
}
static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi)
@@ -1166,28 +1683,13 @@ static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi)
}
/*
- * Check whether the given nid is within node id range.
- */
-static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
-{
- if (unlikely(nid < F2FS_ROOT_INO(sbi)))
- return -EINVAL;
- if (unlikely(nid >= NM_I(sbi)->max_nid))
- return -EINVAL;
- return 0;
-}
-
-#define F2FS_DEFAULT_ALLOCATED_BLOCKS 1
-
-/*
* Check whether the inode has blocks or not
*/
static inline int F2FS_HAS_BLOCKS(struct inode *inode)
{
- if (F2FS_I(inode)->i_xattr_nid)
- return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1;
- else
- return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS;
+ block_t xattr_block = F2FS_I(inode)->i_xattr_nid ? 1 : 0;
+
+ return (inode->i_blocks >> F2FS_LOG_SECTORS_PER_BLOCK) > xattr_block;
}
static inline bool f2fs_has_xattr_block(unsigned int ofs)
@@ -1195,16 +1697,43 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs)
return ofs == XATTR_NODE_OFFSET;
}
-static inline void f2fs_i_blocks_write(struct inode *, blkcnt_t, bool);
-static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
+static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi,
+ struct inode *inode, bool cap)
+{
+ if (!inode)
+ return true;
+ if (!test_opt(sbi, RESERVE_ROOT))
+ return false;
+ if (IS_NOQUOTA(inode))
+ return true;
+ if (uid_eq(F2FS_OPTION(sbi).s_resuid, current_fsuid()))
+ return true;
+ if (!gid_eq(F2FS_OPTION(sbi).s_resgid, GLOBAL_ROOT_GID) &&
+ in_group_p(F2FS_OPTION(sbi).s_resgid))
+ return true;
+ if (cap && capable(CAP_SYS_RESOURCE))
+ return true;
+ return false;
+}
+
+static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool);
+static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
struct inode *inode, blkcnt_t *count)
{
- blkcnt_t diff;
+ blkcnt_t diff = 0, release = 0;
+ block_t avail_user_block_count;
+ int ret;
+
+ ret = dquot_reserve_block(inode, *count);
+ if (ret)
+ return ret;
+
+ if (time_to_inject(sbi, FAULT_BLOCK)) {
+ f2fs_show_injection_info(FAULT_BLOCK);
+ release = *count;
+ goto enospc;
+ }
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (time_to_inject(sbi, FAULT_BLOCK))
- return false;
-#endif
/*
* let's increase this in prior to actual block count change in order
* for f2fs_sync_file to avoid data races when deciding checkpoint.
@@ -1213,39 +1742,66 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
spin_lock(&sbi->stat_lock);
sbi->total_valid_block_count += (block_t)(*count);
- if (unlikely(sbi->total_valid_block_count > sbi->user_block_count)) {
- diff = sbi->total_valid_block_count - sbi->user_block_count;
+ avail_user_block_count = sbi->user_block_count -
+ sbi->current_reserved_blocks;
+
+ if (!__allow_reserved_blocks(sbi, inode, true))
+ avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks;
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+ avail_user_block_count -= sbi->unusable_block_count;
+ if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) {
+ diff = sbi->total_valid_block_count - avail_user_block_count;
+ if (diff > *count)
+ diff = *count;
*count -= diff;
- sbi->total_valid_block_count = sbi->user_block_count;
+ release = diff;
+ sbi->total_valid_block_count -= diff;
if (!*count) {
spin_unlock(&sbi->stat_lock);
- percpu_counter_sub(&sbi->alloc_valid_block_count, diff);
- return false;
+ goto enospc;
}
}
spin_unlock(&sbi->stat_lock);
- f2fs_i_blocks_write(inode, *count, true);
- return true;
+ if (unlikely(release)) {
+ percpu_counter_sub(&sbi->alloc_valid_block_count, release);
+ dquot_release_reservation_block(inode, release);
+ }
+ f2fs_i_blocks_write(inode, *count, true, true);
+ return 0;
+
+enospc:
+ percpu_counter_sub(&sbi->alloc_valid_block_count, release);
+ dquot_release_reservation_block(inode, release);
+ return -ENOSPC;
}
static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
struct inode *inode,
- blkcnt_t count)
+ block_t count)
{
+ blkcnt_t sectors = count << F2FS_LOG_SECTORS_PER_BLOCK;
+
spin_lock(&sbi->stat_lock);
f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count);
- f2fs_bug_on(sbi, inode->i_blocks < count);
+ f2fs_bug_on(sbi, inode->i_blocks < sectors);
sbi->total_valid_block_count -= (block_t)count;
+ if (sbi->reserved_blocks &&
+ sbi->current_reserved_blocks < sbi->reserved_blocks)
+ sbi->current_reserved_blocks = min(sbi->reserved_blocks,
+ sbi->current_reserved_blocks + count);
spin_unlock(&sbi->stat_lock);
- f2fs_i_blocks_write(inode, count, false);
+ f2fs_i_blocks_write(inode, count, false, true);
}
static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
{
atomic_inc(&sbi->nr_pages[count_type]);
- if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES)
+ if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES ||
+ count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA ||
+ count_type == F2FS_RD_DATA || count_type == F2FS_RD_NODE ||
+ count_type == F2FS_RD_META)
return;
set_sbi_flag(sbi, SBI_IS_DIRTY);
@@ -1256,6 +1812,8 @@ static inline void inode_inc_dirty_pages(struct inode *inode)
atomic_inc(&F2FS_I(inode)->dirty_pages);
inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
+ if (IS_NOQUOTA(inode))
+ inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA);
}
static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -1272,6 +1830,8 @@ static inline void inode_dec_dirty_pages(struct inode *inode)
atomic_dec(&F2FS_I(inode)->dirty_pages);
dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
+ if (IS_NOQUOTA(inode))
+ dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA);
}
static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type)
@@ -1326,6 +1886,12 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
int offset;
+ if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) {
+ offset = (flag == SIT_BITMAP) ?
+ le32_to_cpu(ckpt->nat_ver_bitmap_bytesize) : 0;
+ return &ckpt->sit_nat_version_bitmap + offset;
+ }
+
if (__cp_payload(sbi) > 0) {
if (flag == NAT_BITMAP)
return &ckpt->sit_nat_version_bitmap;
@@ -1366,51 +1932,96 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
}
-static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi,
- struct inode *inode)
+static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
+ struct inode *inode, bool is_inode)
{
block_t valid_block_count;
unsigned int valid_node_count;
+ int err;
+
+ if (is_inode) {
+ if (inode) {
+ err = dquot_alloc_inode(inode);
+ if (err)
+ return err;
+ }
+ } else {
+ err = dquot_reserve_block(inode, 1);
+ if (err)
+ return err;
+ }
+
+ if (time_to_inject(sbi, FAULT_BLOCK)) {
+ f2fs_show_injection_info(FAULT_BLOCK);
+ goto enospc;
+ }
spin_lock(&sbi->stat_lock);
- valid_block_count = sbi->total_valid_block_count + 1;
+ valid_block_count = sbi->total_valid_block_count +
+ sbi->current_reserved_blocks + 1;
+
+ if (!__allow_reserved_blocks(sbi, inode, false))
+ valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks;
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+ valid_block_count += sbi->unusable_block_count;
+
if (unlikely(valid_block_count > sbi->user_block_count)) {
spin_unlock(&sbi->stat_lock);
- return false;
+ goto enospc;
}
valid_node_count = sbi->total_valid_node_count + 1;
if (unlikely(valid_node_count > sbi->total_node_count)) {
spin_unlock(&sbi->stat_lock);
- return false;
+ goto enospc;
}
- if (inode)
- f2fs_i_blocks_write(inode, 1, true);
-
sbi->total_valid_node_count++;
sbi->total_valid_block_count++;
spin_unlock(&sbi->stat_lock);
+ if (inode) {
+ if (is_inode)
+ f2fs_mark_inode_dirty_sync(inode, true);
+ else
+ f2fs_i_blocks_write(inode, 1, true, true);
+ }
+
percpu_counter_inc(&sbi->alloc_valid_block_count);
- return true;
+ return 0;
+
+enospc:
+ if (is_inode) {
+ if (inode)
+ dquot_free_inode(inode);
+ } else {
+ dquot_release_reservation_block(inode, 1);
+ }
+ return -ENOSPC;
}
static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
- struct inode *inode)
+ struct inode *inode, bool is_inode)
{
spin_lock(&sbi->stat_lock);
f2fs_bug_on(sbi, !sbi->total_valid_block_count);
f2fs_bug_on(sbi, !sbi->total_valid_node_count);
- f2fs_bug_on(sbi, !inode->i_blocks);
+ f2fs_bug_on(sbi, !is_inode && !inode->i_blocks);
- f2fs_i_blocks_write(inode, 1, false);
sbi->total_valid_node_count--;
sbi->total_valid_block_count--;
+ if (sbi->reserved_blocks &&
+ sbi->current_reserved_blocks < sbi->reserved_blocks)
+ sbi->current_reserved_blocks++;
spin_unlock(&sbi->stat_lock);
+
+ if (is_inode)
+ dquot_free_inode(inode);
+ else
+ f2fs_i_blocks_write(inode, 1, false, true);
}
static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
@@ -1436,19 +2047,40 @@ static inline s64 valid_inode_count(struct f2fs_sb_info *sbi)
static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
pgoff_t index, bool for_write)
{
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- struct page *page = find_lock_page(mapping, index);
- if (page)
- return page;
+ struct page *page;
+
+ if (IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) {
+ if (!for_write)
+ page = find_get_page_flags(mapping, index,
+ FGP_LOCK | FGP_ACCESSED);
+ else
+ page = find_lock_page(mapping, index);
+ if (page)
+ return page;
+
+ if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) {
+ f2fs_show_injection_info(FAULT_PAGE_ALLOC);
+ return NULL;
+ }
+ }
- if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC))
- return NULL;
-#endif
if (!for_write)
return grab_cache_page(mapping, index);
return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
}
+static inline struct page *f2fs_pagecache_get_page(
+ struct address_space *mapping, pgoff_t index,
+ int fgp_flags, gfp_t gfp_mask)
+{
+ if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) {
+ f2fs_show_injection_info(FAULT_PAGE_GET);
+ return NULL;
+ }
+
+ return pagecache_get_page(mapping, index, fgp_flags, gfp_mask);
+}
+
static inline void f2fs_copy_page(struct page *src, struct page *dst)
{
char *src_kaddr = kmap(src);
@@ -1498,15 +2130,37 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
return entry;
}
-static inline struct bio *f2fs_bio_alloc(int npages)
+static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi,
+ int npages, bool no_fail)
{
struct bio *bio;
- /* No failure on bio allocation */
- bio = bio_alloc(GFP_NOIO, npages);
- if (!bio)
- bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages);
- return bio;
+ if (no_fail) {
+ /* No failure on bio allocation */
+ bio = bio_alloc(GFP_NOIO, npages);
+ if (!bio)
+ bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages);
+ return bio;
+ }
+ if (time_to_inject(sbi, FAULT_ALLOC_BIO)) {
+ f2fs_show_injection_info(FAULT_ALLOC_BIO);
+ return NULL;
+ }
+
+ return bio_alloc(GFP_KERNEL, npages);
+}
+
+static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
+{
+ if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) ||
+ get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) ||
+ get_pages(sbi, F2FS_WB_CP_DATA) ||
+ get_pages(sbi, F2FS_DIO_READ) ||
+ get_pages(sbi, F2FS_DIO_WRITE) ||
+ atomic_read(&SM_I(sbi)->dcc_info->queued_discard) ||
+ atomic_read(&SM_I(sbi)->fcc_info->queued_flush))
+ return false;
+ return f2fs_time_over(sbi, type);
}
static inline void f2fs_radix_tree_insert(struct radix_tree_root *root,
@@ -1521,22 +2175,42 @@ static inline void f2fs_radix_tree_insert(struct radix_tree_root *root,
static inline bool IS_INODE(struct page *page)
{
struct f2fs_node *p = F2FS_NODE(page);
+
return RAW_IS_INODE(p);
}
+static inline int offset_in_addr(struct f2fs_inode *i)
+{
+ return (i->i_inline & F2FS_EXTRA_ATTR) ?
+ (le16_to_cpu(i->i_extra_isize) / sizeof(__le32)) : 0;
+}
+
static inline __le32 *blkaddr_in_node(struct f2fs_node *node)
{
return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr;
}
-static inline block_t datablock_addr(struct page *node_page,
- unsigned int offset)
+static inline int f2fs_has_extra_attr(struct inode *inode);
+static inline block_t datablock_addr(struct inode *inode,
+ struct page *node_page, unsigned int offset)
{
struct f2fs_node *raw_node;
__le32 *addr_array;
+ int base = 0;
+ bool is_inode = IS_INODE(node_page);
+
raw_node = F2FS_NODE(node_page);
+
+ /* from GC path only */
+ if (is_inode) {
+ if (!inode)
+ base = offset_in_addr(&raw_node->i);
+ else if (f2fs_has_extra_attr(inode))
+ base = get_extra_isize(inode);
+ }
+
addr_array = blkaddr_in_node(raw_node);
- return le32_to_cpu(addr_array[offset]);
+ return le32_to_cpu(addr_array[base + offset]);
}
static inline int f2fs_test_bit(unsigned int nr, char *addr)
@@ -1599,6 +2273,71 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
*addr ^= mask;
}
+/*
+ * Inode flags
+ */
+#define F2FS_SECRM_FL 0x00000001 /* Secure deletion */
+#define F2FS_UNRM_FL 0x00000002 /* Undelete */
+#define F2FS_COMPR_FL 0x00000004 /* Compress file */
+#define F2FS_SYNC_FL 0x00000008 /* Synchronous updates */
+#define F2FS_IMMUTABLE_FL 0x00000010 /* Immutable file */
+#define F2FS_APPEND_FL 0x00000020 /* writes to file may only append */
+#define F2FS_NODUMP_FL 0x00000040 /* do not dump file */
+#define F2FS_NOATIME_FL 0x00000080 /* do not update atime */
+/* Reserved for compression usage... */
+#define F2FS_DIRTY_FL 0x00000100
+#define F2FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */
+#define F2FS_NOCOMPR_FL 0x00000400 /* Don't compress */
+#define F2FS_ENCRYPT_FL 0x00000800 /* encrypted file */
+/* End compression flags --- maybe not all used */
+#define F2FS_INDEX_FL 0x00001000 /* hash-indexed directory */
+#define F2FS_IMAGIC_FL 0x00002000 /* AFS directory */
+#define F2FS_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */
+#define F2FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */
+#define F2FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
+#define F2FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
+#define F2FS_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
+#define F2FS_EXTENTS_FL 0x00080000 /* Inode uses extents */
+#define F2FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */
+#define F2FS_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
+#define F2FS_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
+#define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
+#define F2FS_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
+
+#define F2FS_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */
+#define F2FS_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */
+
+/* Flags we can manipulate with through F2FS_IOC_FSSETXATTR */
+#define F2FS_FL_XFLAG_VISIBLE (F2FS_SYNC_FL | \
+ F2FS_IMMUTABLE_FL | \
+ F2FS_APPEND_FL | \
+ F2FS_NODUMP_FL | \
+ F2FS_NOATIME_FL | \
+ F2FS_PROJINHERIT_FL)
+
+/* Flags that should be inherited by new inodes from their parent. */
+#define F2FS_FL_INHERITED (F2FS_SECRM_FL | F2FS_UNRM_FL | F2FS_COMPR_FL |\
+ F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL |\
+ F2FS_NOCOMPR_FL | F2FS_JOURNAL_DATA_FL |\
+ F2FS_NOTAIL_FL | F2FS_DIRSYNC_FL |\
+ F2FS_PROJINHERIT_FL)
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define F2FS_OTHER_FLMASK (F2FS_NODUMP_FL | F2FS_NOATIME_FL)
+
+static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
+{
+ if (S_ISDIR(mode))
+ return flags;
+ else if (S_ISREG(mode))
+ return flags & F2FS_REG_FLMASK;
+ else
+ return flags & F2FS_OTHER_FLMASK;
+}
+
/* used for f2fs_inode_info->flags */
enum {
FI_NEW_INODE, /* indicate newly allocated inode */
@@ -1617,6 +2356,7 @@ enum {
FI_UPDATE_WRITE, /* inode has in-place-update data */
FI_NEED_IPU, /* used for ipu per file */
FI_ATOMIC_FILE, /* indicate atomic file */
+ FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */
FI_VOLATILE_FILE, /* indicate volatile file */
FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */
FI_DROP_CACHE, /* drop dirty page cache */
@@ -1624,6 +2364,12 @@ enum {
FI_INLINE_DOTS, /* indicate inline dot dentries */
FI_DO_DEFRAG, /* indicate defragment is running */
FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
+ FI_NO_PREALLOC, /* indicate skipped preallocated blocks */
+ FI_HOT_DATA, /* indicate file is hot */
+ FI_EXTRA_ATTR, /* indicate file has extra attribute */
+ FI_PROJ_INHERIT, /* indicate file inherits projectid */
+ FI_PIN_FILE, /* indicate file should not be gced */
+ FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */
};
static inline void __mark_inode_dirty_flag(struct inode *inode,
@@ -1633,11 +2379,14 @@ static inline void __mark_inode_dirty_flag(struct inode *inode,
case FI_INLINE_XATTR:
case FI_INLINE_DATA:
case FI_INLINE_DENTRY:
+ case FI_NEW_INODE:
if (set)
return;
+ /* fall through */
case FI_DATA_EXIST:
case FI_INLINE_DOTS:
- f2fs_mark_inode_dirty_sync(inode);
+ case FI_PIN_FILE:
+ f2fs_mark_inode_dirty_sync(inode, true);
}
}
@@ -1664,7 +2413,7 @@ static inline void set_acl_inode(struct inode *inode, umode_t mode)
{
F2FS_I(inode)->i_acl_mode = mode;
set_inode_flag(inode, FI_ACL_MODE);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, false);
}
static inline void f2fs_i_links_write(struct inode *inode, bool inc)
@@ -1673,18 +2422,26 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc)
inc_nlink(inode);
else
drop_nlink(inode);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
static inline void f2fs_i_blocks_write(struct inode *inode,
- blkcnt_t diff, bool add)
+ block_t diff, bool add, bool claim)
{
bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE);
bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER);
- inode->i_blocks = add ? inode->i_blocks + diff :
- inode->i_blocks - diff;
- f2fs_mark_inode_dirty_sync(inode);
+ /* add = 1, claim = 1 should be dquot_reserve_block in pair */
+ if (add) {
+ if (claim)
+ dquot_claim_block(inode, diff);
+ else
+ dquot_alloc_block_nofail(inode, diff);
+ } else {
+ dquot_free_block(inode, diff);
+ }
+
+ f2fs_mark_inode_dirty_sync(inode, true);
if (clean || recover)
set_inode_flag(inode, FI_AUTO_RECOVER);
}
@@ -1698,34 +2455,34 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size)
return;
i_size_write(inode, i_size);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
if (clean || recover)
set_inode_flag(inode, FI_AUTO_RECOVER);
}
-static inline bool f2fs_skip_inode_update(struct inode *inode)
+static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth)
{
- if (!is_inode_flag_set(inode, FI_AUTO_RECOVER))
- return false;
- return F2FS_I(inode)->last_disk_size == i_size_read(inode);
+ F2FS_I(inode)->i_current_depth = depth;
+ f2fs_mark_inode_dirty_sync(inode, true);
}
-static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth)
+static inline void f2fs_i_gc_failures_write(struct inode *inode,
+ unsigned int count)
{
- F2FS_I(inode)->i_current_depth = depth;
- f2fs_mark_inode_dirty_sync(inode);
+ F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = count;
+ f2fs_mark_inode_dirty_sync(inode, true);
}
static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid)
{
F2FS_I(inode)->i_xattr_nid = xnid;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino)
{
F2FS_I(inode)->i_pino = pino;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri)
@@ -1742,6 +2499,10 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri)
set_bit(FI_DATA_EXIST, &fi->flags);
if (ri->i_inline & F2FS_INLINE_DOTS)
set_bit(FI_INLINE_DOTS, &fi->flags);
+ if (ri->i_inline & F2FS_EXTRA_ATTR)
+ set_bit(FI_EXTRA_ATTR, &fi->flags);
+ if (ri->i_inline & F2FS_PIN_FILE)
+ set_bit(FI_PIN_FILE, &fi->flags);
}
static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri)
@@ -1758,6 +2519,15 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri)
ri->i_inline |= F2FS_DATA_EXIST;
if (is_inode_flag_set(inode, FI_INLINE_DOTS))
ri->i_inline |= F2FS_INLINE_DOTS;
+ if (is_inode_flag_set(inode, FI_EXTRA_ATTR))
+ ri->i_inline |= F2FS_EXTRA_ATTR;
+ if (is_inode_flag_set(inode, FI_PIN_FILE))
+ ri->i_inline |= F2FS_PIN_FILE;
+}
+
+static inline int f2fs_has_extra_attr(struct inode *inode)
+{
+ return is_inode_flag_set(inode, FI_EXTRA_ATTR);
}
static inline int f2fs_has_inline_xattr(struct inode *inode)
@@ -1767,24 +2537,20 @@ static inline int f2fs_has_inline_xattr(struct inode *inode)
static inline unsigned int addrs_per_inode(struct inode *inode)
{
- if (f2fs_has_inline_xattr(inode))
- return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS;
- return DEF_ADDRS_PER_INODE;
+ return CUR_ADDRS_PER_INODE(inode) - get_inline_xattr_addrs(inode);
}
-static inline void *inline_xattr_addr(struct page *page)
+static inline void *inline_xattr_addr(struct inode *inode, struct page *page)
{
struct f2fs_inode *ri = F2FS_INODE(page);
+
return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE -
- F2FS_INLINE_XATTR_ADDRS]);
+ get_inline_xattr_addrs(inode)]);
}
static inline int inline_xattr_size(struct inode *inode)
{
- if (f2fs_has_inline_xattr(inode))
- return F2FS_INLINE_XATTR_ADDRS << 2;
- else
- return 0;
+ return get_inline_xattr_addrs(inode) * sizeof(__le32);
}
static inline int f2fs_has_inline_data(struct inode *inode)
@@ -1792,12 +2558,6 @@ static inline int f2fs_has_inline_data(struct inode *inode)
return is_inode_flag_set(inode, FI_INLINE_DATA);
}
-static inline void f2fs_clear_inline_inode(struct inode *inode)
-{
- clear_inode_flag(inode, FI_INLINE_DATA);
- clear_inode_flag(inode, FI_DATA_EXIST);
-}
-
static inline int f2fs_exist_data(struct inode *inode)
{
return is_inode_flag_set(inode, FI_DATA_EXIST);
@@ -1808,11 +2568,21 @@ static inline int f2fs_has_inline_dots(struct inode *inode)
return is_inode_flag_set(inode, FI_INLINE_DOTS);
}
+static inline bool f2fs_is_pinned_file(struct inode *inode)
+{
+ return is_inode_flag_set(inode, FI_PIN_FILE);
+}
+
static inline bool f2fs_is_atomic_file(struct inode *inode)
{
return is_inode_flag_set(inode, FI_ATOMIC_FILE);
}
+static inline bool f2fs_is_commit_atomic_write(struct inode *inode)
+{
+ return is_inode_flag_set(inode, FI_ATOMIC_COMMIT);
+}
+
static inline bool f2fs_is_volatile_file(struct inode *inode)
{
return is_inode_flag_set(inode, FI_VOLATILE_FILE);
@@ -1828,10 +2598,12 @@ static inline bool f2fs_is_drop_cache(struct inode *inode)
return is_inode_flag_set(inode, FI_DROP_CACHE);
}
-static inline void *inline_data_addr(struct page *page)
+static inline void *inline_data_addr(struct inode *inode, struct page *page)
{
struct f2fs_inode *ri = F2FS_INODE(page);
- return (void *)&(ri->i_addr[1]);
+ int extra_size = get_extra_isize(inode);
+
+ return (void *)&(ri->i_addr[extra_size + DEF_INLINE_RESERVED_SIZE]);
}
static inline int f2fs_has_inline_dentry(struct inode *inode)
@@ -1839,12 +2611,6 @@ static inline int f2fs_has_inline_dentry(struct inode *inode)
return is_inode_flag_set(inode, FI_INLINE_DENTRY);
}
-static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page)
-{
- if (!f2fs_has_inline_dentry(dir))
- kunmap(page);
-}
-
static inline int is_file(struct inode *inode, int type)
{
return F2FS_I(inode)->i_advise & type;
@@ -1853,16 +2619,50 @@ static inline int is_file(struct inode *inode, int type)
static inline void set_file(struct inode *inode, int type)
{
F2FS_I(inode)->i_advise |= type;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
static inline void clear_file(struct inode *inode, int type)
{
F2FS_I(inode)->i_advise &= ~type;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
+}
+
+static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
+{
+ bool ret;
+
+ if (dsync) {
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ spin_lock(&sbi->inode_lock[DIRTY_META]);
+ ret = list_empty(&F2FS_I(inode)->gdirty_list);
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+ return ret;
+ }
+ if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) ||
+ file_keep_isize(inode) ||
+ i_size_read(inode) & ~PAGE_MASK)
+ return false;
+
+ if (!timespec_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime))
+ return false;
+ if (!timespec_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime))
+ return false;
+ if (!timespec_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime))
+ return false;
+ if (!timespec_equal(F2FS_I(inode)->i_disk_time + 3,
+ &F2FS_I(inode)->i_crtime))
+ return false;
+
+ down_read(&F2FS_I(inode)->i_sem);
+ ret = F2FS_I(inode)->last_disk_size == i_size_read(inode);
+ up_read(&F2FS_I(inode)->i_sem);
+
+ return ret;
}
-static inline int f2fs_readonly(struct super_block *sb)
+static inline bool f2fs_readonly(struct super_block *sb)
{
return sb->s_flags & MS_RDONLY;
}
@@ -1885,24 +2685,23 @@ static inline bool is_dot_dotdot(const struct qstr *str)
static inline bool f2fs_may_extent_tree(struct inode *inode)
{
- if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) ||
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (!test_opt(sbi, EXTENT_CACHE) ||
is_inode_flag_set(inode, FI_NO_EXTENT))
return false;
- return S_ISREG(inode->i_mode);
-}
+ /*
+ * for recovered files during mount do not create extents
+ * if shrinker is not registered.
+ */
+ if (list_empty(&sbi->s_list))
+ return false;
-static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
- size_t size, gfp_t flags)
-{
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (time_to_inject(sbi, FAULT_KMALLOC))
- return NULL;
-#endif
- return kmalloc(size, flags);
+ return S_ISREG(inode->i_mode);
}
-static inline void *f2fs_kvmalloc(size_t size, gfp_t flags)
+static inline void *kvmalloc(size_t size, gfp_t flags)
{
void *ret;
@@ -1912,7 +2711,24 @@ static inline void *f2fs_kvmalloc(size_t size, gfp_t flags)
return ret;
}
-static inline void *f2fs_kvzalloc(size_t size, gfp_t flags)
+static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
+ size_t size, gfp_t flags)
+{
+ void *ret;
+
+ if (time_to_inject(sbi, FAULT_KMALLOC)) {
+ f2fs_show_injection_info(FAULT_KMALLOC);
+ return NULL;
+ }
+
+ ret = kmalloc(size, flags);
+ if (ret)
+ return ret;
+
+ return kvmalloc(size, flags);
+}
+
+static inline void *kvzalloc(size_t size, gfp_t flags)
{
void *ret;
@@ -1922,15 +2738,89 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags)
return ret;
}
-#define get_inode_mode(i) \
+static inline int wbc_to_write_flags(struct writeback_control *wbc)
+{
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ return REQ_SYNC;
+ else if (wbc->for_kupdate || wbc->for_background)
+ return 0;
+
+ return 0;
+}
+
+static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi,
+ size_t size, gfp_t flags)
+{
+ return f2fs_kmalloc(sbi, size, flags | __GFP_ZERO);
+}
+
+static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi,
+ size_t size, gfp_t flags)
+{
+ if (time_to_inject(sbi, FAULT_KVMALLOC)) {
+ f2fs_show_injection_info(FAULT_KVMALLOC);
+ return NULL;
+ }
+
+ return kvmalloc(size, flags);
+}
+
+static inline void *f2fs_kvzalloc(struct f2fs_sb_info *sbi,
+ size_t size, gfp_t flags)
+{
+ return f2fs_kvmalloc(sbi, size, flags | __GFP_ZERO);
+}
+
+static inline int get_extra_isize(struct inode *inode)
+{
+ return F2FS_I(inode)->i_extra_isize / sizeof(__le32);
+}
+
+static inline int get_inline_xattr_addrs(struct inode *inode)
+{
+ return F2FS_I(inode)->i_inline_xattr_size;
+}
+
+#define f2fs_get_inode_mode(i) \
((is_inode_flag_set(i, FI_ACL_MODE)) ? \
(F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
-/* get offset of first page in next direct node */
-#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \
- ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) : \
- (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \
- ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode))
+#define F2FS_TOTAL_EXTRA_ATTR_SIZE \
+ (offsetof(struct f2fs_inode, i_extra_end) - \
+ offsetof(struct f2fs_inode, i_extra_isize)) \
+
+#define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr))
+#define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \
+ ((offsetof(typeof(*f2fs_inode), field) + \
+ sizeof((f2fs_inode)->field)) \
+ <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \
+
+static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
+{
+ int i;
+
+ spin_lock(&sbi->iostat_lock);
+ for (i = 0; i < NR_IO_TYPE; i++)
+ sbi->write_iostat[i] = 0;
+ spin_unlock(&sbi->iostat_lock);
+}
+
+static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi,
+ enum iostat_type type, unsigned long long io_bytes)
+{
+ if (!sbi->iostat_enable)
+ return;
+ spin_lock(&sbi->iostat_lock);
+ sbi->write_iostat[type] += io_bytes;
+
+ if (type == APP_WRITE_IO || type == APP_DIRECT_IO)
+ sbi->write_iostat[APP_BUFFERED_IO] =
+ sbi->write_iostat[APP_WRITE_IO] -
+ sbi->write_iostat[APP_DIRECT_IO];
+ spin_unlock(&sbi->iostat_lock);
+}
+
+#define __is_large_section(sbi) ((sbi)->segs_per_sec > 1)
#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO(fio->type) == META && \
(!is_read_io(fio->op) || fio->is_meta))
@@ -1968,89 +2858,104 @@ static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi,
/*
* file.c
*/
-int f2fs_sync_file(struct file *, loff_t, loff_t, int);
-void truncate_data_blocks(struct dnode_of_data *);
-int truncate_blocks(struct inode *, u64, bool);
-int f2fs_truncate(struct inode *);
-int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
-int f2fs_setattr(struct dentry *, struct iattr *);
-int truncate_hole(struct inode *, pgoff_t, pgoff_t);
-int truncate_data_blocks_range(struct dnode_of_data *, int);
-long f2fs_ioctl(struct file *, unsigned int, unsigned long);
-long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
+int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
+void f2fs_truncate_data_blocks(struct dnode_of_data *dn);
+int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock,
+ bool buf_write);
+int f2fs_truncate(struct inode *inode);
+int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat);
+int f2fs_setattr(struct dentry *dentry, struct iattr *attr);
+int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end);
+void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count);
+int f2fs_precache_extents(struct inode *inode);
+long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int f2fs_pin_file_control(struct inode *inode, bool inc);
/*
* inode.c
*/
-void f2fs_set_inode_flags(struct inode *);
-struct inode *f2fs_iget(struct super_block *, unsigned long);
-struct inode *f2fs_iget_retry(struct super_block *, unsigned long);
-int try_to_free_nats(struct f2fs_sb_info *, int);
-int update_inode(struct inode *, struct page *);
-int update_inode_page(struct inode *);
-int f2fs_write_inode(struct inode *, struct writeback_control *);
-void f2fs_evict_inode(struct inode *);
-void handle_failed_inode(struct inode *);
+void f2fs_set_inode_flags(struct inode *inode);
+bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page);
+void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page);
+struct inode *f2fs_iget(struct super_block *sb, unsigned long ino);
+struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino);
+int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink);
+void f2fs_update_inode(struct inode *inode, struct page *node_page);
+void f2fs_update_inode_page(struct inode *inode);
+int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc);
+void f2fs_evict_inode(struct inode *inode);
+void f2fs_handle_failed_inode(struct inode *inode);
/*
* namei.c
*/
+int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name,
+ bool hot, bool set);
struct dentry *f2fs_get_parent(struct dentry *child);
/*
* dir.c
*/
-void set_de_type(struct f2fs_dir_entry *, umode_t);
-unsigned char get_de_type(struct f2fs_dir_entry *);
-struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *,
- f2fs_hash_t, int *, struct f2fs_dentry_ptr *);
-bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
- unsigned int, struct fscrypt_str *);
-void do_make_empty_dir(struct inode *, struct inode *,
- struct f2fs_dentry_ptr *);
-struct page *init_inode_metadata(struct inode *, struct inode *,
- const struct qstr *, const struct qstr *, struct page *);
-void update_parent_metadata(struct inode *, struct inode *, unsigned int);
-int room_for_filename(const void *, int, int);
-void f2fs_drop_nlink(struct inode *, struct inode *);
-struct f2fs_dir_entry *__f2fs_find_entry(struct inode *, struct fscrypt_name *,
- struct page **);
-struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *,
- struct page **);
-struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
-ino_t f2fs_inode_by_name(struct inode *, const struct qstr *, struct page **);
-void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
- struct page *, struct inode *);
-int update_dent_inode(struct inode *, struct inode *, const struct qstr *);
-void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *,
- const struct qstr *, f2fs_hash_t , unsigned int);
-int f2fs_add_regular_entry(struct inode *, const struct qstr *,
- const struct qstr *, struct inode *, nid_t, umode_t);
-int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode *,
- nid_t, umode_t);
-int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t,
- umode_t);
-void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *,
- struct inode *);
-int f2fs_do_tmpfile(struct inode *, struct inode *);
-bool f2fs_empty_dir(struct inode *);
+unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de);
+struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname,
+ f2fs_hash_t namehash, int *max_slots,
+ struct f2fs_dentry_ptr *d);
+int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
+ unsigned int start_pos, struct fscrypt_str *fstr);
+void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent,
+ struct f2fs_dentry_ptr *d);
+struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
+ const struct qstr *new_name,
+ const struct qstr *orig_name, struct page *dpage);
+void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode,
+ unsigned int current_depth);
+int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots);
+void f2fs_drop_nlink(struct inode *dir, struct inode *inode);
+struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
+ struct fscrypt_name *fname, struct page **res_page);
+struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
+ const struct qstr *child, struct page **res_page);
+struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p);
+ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr,
+ struct page **page);
+void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
+ struct page *page, struct inode *inode);
+void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
+ const struct qstr *name, f2fs_hash_t name_hash,
+ unsigned int bit_pos);
+int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name,
+ const struct qstr *orig_name,
+ struct inode *inode, nid_t ino, umode_t mode);
+int f2fs_add_dentry(struct inode *dir, struct fscrypt_name *fname,
+ struct inode *inode, nid_t ino, umode_t mode);
+int f2fs_do_add_link(struct inode *dir, const struct qstr *name,
+ struct inode *inode, nid_t ino, umode_t mode);
+void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
+ struct inode *dir, struct inode *inode);
+int f2fs_do_tmpfile(struct inode *inode, struct inode *dir);
+bool f2fs_empty_dir(struct inode *dir);
static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
{
- return __f2fs_add_link(d_inode(dentry->d_parent), &dentry->d_name,
+ return f2fs_do_add_link(d_inode(dentry->d_parent), &dentry->d_name,
inode, inode->i_ino, inode->i_mode);
}
/*
* super.c
*/
-int f2fs_inode_dirtied(struct inode *);
-void f2fs_inode_synced(struct inode *);
-int f2fs_commit_super(struct f2fs_sb_info *, bool);
-int f2fs_sync_fs(struct super_block *, int);
+int f2fs_inode_dirtied(struct inode *inode, bool sync);
+void f2fs_inode_synced(struct inode *inode);
+int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly);
+int f2fs_quota_sync(struct super_block *sb, int type);
+void f2fs_quota_off_umount(struct super_block *sb);
+int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover);
+int f2fs_sync_fs(struct super_block *sb, int sync);
extern __printf(3, 4)
-void f2fs_msg(struct super_block *, const char *, const char *, ...);
-int sanity_check_ckpt(struct f2fs_sb_info *sbi);
+void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...);
+int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi);
/*
* hash.c
@@ -2064,161 +2969,221 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info,
struct dnode_of_data;
struct node_info;
-bool available_free_memory(struct f2fs_sb_info *, int);
-int need_dentry_mark(struct f2fs_sb_info *, nid_t);
-bool is_checkpointed_node(struct f2fs_sb_info *, nid_t);
-bool need_inode_block_update(struct f2fs_sb_info *, nid_t);
-void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
-pgoff_t get_next_page_offset(struct dnode_of_data *, pgoff_t);
-int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
-int truncate_inode_blocks(struct inode *, pgoff_t);
-int truncate_xattr_node(struct inode *, struct page *);
-int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t);
-int remove_inode_page(struct inode *);
-struct page *new_inode_page(struct inode *);
-struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
-void ra_node_page(struct f2fs_sb_info *, nid_t);
-struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
-struct page *get_node_page_ra(struct page *, int);
-void move_node_page(struct page *, int);
-int fsync_node_pages(struct f2fs_sb_info *, struct inode *,
- struct writeback_control *, bool);
-int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *);
-void build_free_nids(struct f2fs_sb_info *);
-bool alloc_nid(struct f2fs_sb_info *, nid_t *);
-void alloc_nid_done(struct f2fs_sb_info *, nid_t);
-void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
-int try_to_free_nids(struct f2fs_sb_info *, int);
-void recover_inline_xattr(struct inode *, struct page *);
-void recover_xattr_data(struct inode *, struct page *, block_t);
-int recover_inode_page(struct f2fs_sb_info *, struct page *);
-int restore_node_summary(struct f2fs_sb_info *, unsigned int,
- struct f2fs_summary_block *);
-void flush_nat_entries(struct f2fs_sb_info *);
-int build_node_manager(struct f2fs_sb_info *);
-void destroy_node_manager(struct f2fs_sb_info *);
-int __init create_node_manager_caches(void);
-void destroy_node_manager_caches(void);
+int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid);
+bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type);
+bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page);
+void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi);
+void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page);
+void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi);
+int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid);
+bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid);
+bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino);
+int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
+ struct node_info *ni);
+pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs);
+int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode);
+int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from);
+int f2fs_truncate_xattr_node(struct inode *inode);
+int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
+ unsigned int seq_id);
+int f2fs_remove_inode_page(struct inode *inode);
+struct page *f2fs_new_inode_page(struct inode *inode);
+struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs);
+void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid);
+struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid);
+struct page *f2fs_get_node_page_ra(struct page *parent, int start);
+int f2fs_move_node_page(struct page *node_page, int gc_type);
+int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
+ struct writeback_control *wbc, bool atomic,
+ unsigned int *seq_id);
+int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
+ struct writeback_control *wbc,
+ bool do_balance, enum iostat_type io_type);
+int f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount);
+bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid);
+void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid);
+void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid);
+int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink);
+void f2fs_recover_inline_xattr(struct inode *inode, struct page *page);
+int f2fs_recover_xattr_data(struct inode *inode, struct page *page);
+int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page);
+int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
+ unsigned int segno, struct f2fs_summary_block *sum);
+int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
+int f2fs_build_node_manager(struct f2fs_sb_info *sbi);
+void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi);
+int __init f2fs_create_node_manager_caches(void);
+void f2fs_destroy_node_manager_caches(void);
/*
* segment.c
*/
-void register_inmem_page(struct inode *, struct page *);
-void drop_inmem_pages(struct inode *);
-int commit_inmem_pages(struct inode *);
-void f2fs_balance_fs(struct f2fs_sb_info *, bool);
-void f2fs_balance_fs_bg(struct f2fs_sb_info *);
-int f2fs_issue_flush(struct f2fs_sb_info *);
-int create_flush_cmd_control(struct f2fs_sb_info *);
-void destroy_flush_cmd_control(struct f2fs_sb_info *);
-void invalidate_blocks(struct f2fs_sb_info *, block_t);
-bool is_checkpointed_data(struct f2fs_sb_info *, block_t);
-void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
-void f2fs_wait_all_discard_bio(struct f2fs_sb_info *);
-void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *);
-void release_discard_addrs(struct f2fs_sb_info *);
-int npages_for_summary_flush(struct f2fs_sb_info *, bool);
-void allocate_new_segments(struct f2fs_sb_info *);
-int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
-struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
-void update_meta_page(struct f2fs_sb_info *, void *, block_t);
-void write_meta_page(struct f2fs_sb_info *, struct page *);
-void write_node_page(unsigned int, struct f2fs_io_info *);
-void write_data_page(struct dnode_of_data *, struct f2fs_io_info *);
-void rewrite_data_page(struct f2fs_io_info *);
-void __f2fs_replace_block(struct f2fs_sb_info *, struct f2fs_summary *,
- block_t, block_t, bool, bool);
-void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *,
- block_t, block_t, unsigned char, bool, bool);
-void allocate_data_block(struct f2fs_sb_info *, struct page *,
- block_t, block_t *, struct f2fs_summary *, int);
-void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool);
-void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t);
-void write_data_summaries(struct f2fs_sb_info *, block_t);
-void write_node_summaries(struct f2fs_sb_info *, block_t);
-int lookup_journal_in_cursum(struct f2fs_journal *, int, unsigned int, int);
-void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *);
-int build_segment_manager(struct f2fs_sb_info *);
-void destroy_segment_manager(struct f2fs_sb_info *);
-int __init create_segment_manager_caches(void);
-void destroy_segment_manager_caches(void);
+bool f2fs_need_SSR(struct f2fs_sb_info *sbi);
+void f2fs_register_inmem_page(struct inode *inode, struct page *page);
+void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure);
+void f2fs_drop_inmem_pages(struct inode *inode);
+void f2fs_drop_inmem_page(struct inode *inode, struct page *page);
+int f2fs_commit_inmem_pages(struct inode *inode);
+void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need);
+void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi);
+int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino);
+int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi);
+int f2fs_flush_device_cache(struct f2fs_sb_info *sbi);
+void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free);
+void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr);
+bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr);
+void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi);
+void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi);
+bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi);
+void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
+ struct cp_control *cpc);
+void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi);
+int f2fs_disable_cp_again(struct f2fs_sb_info *sbi);
+void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
+int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
+void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
+int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
+bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
+ struct cp_control *cpc);
+struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno);
+void f2fs_update_meta_page(struct f2fs_sb_info *sbi, void *src,
+ block_t blk_addr);
+void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
+ enum iostat_type io_type);
+void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio);
+void f2fs_outplace_write_data(struct dnode_of_data *dn,
+ struct f2fs_io_info *fio);
+int f2fs_inplace_write_data(struct f2fs_io_info *fio);
+void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+ block_t old_blkaddr, block_t new_blkaddr,
+ bool recover_curseg, bool recover_newaddr);
+void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
+ block_t old_addr, block_t new_addr,
+ unsigned char version, bool recover_curseg,
+ bool recover_newaddr);
+void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
+ block_t old_blkaddr, block_t *new_blkaddr,
+ struct f2fs_summary *sum, int type,
+ struct f2fs_io_info *fio, bool add_list);
+void f2fs_wait_on_page_writeback(struct page *page,
+ enum page_type type, bool ordered, bool locked);
+void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr);
+void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr,
+ block_t len);
+void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk);
+void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk);
+int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
+ unsigned int val, int alloc);
+void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
+int f2fs_build_segment_manager(struct f2fs_sb_info *sbi);
+void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi);
+int __init f2fs_create_segment_manager_caches(void);
+void f2fs_destroy_segment_manager_caches(void);
+int f2fs_rw_hint_to_seg_type(enum rw_hint hint);
+enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
+ enum page_type type, enum temp_type temp);
/*
* checkpoint.c
*/
-void f2fs_stop_checkpoint(struct f2fs_sb_info *, bool);
-struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
-struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
-struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t);
+void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io);
+struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index);
+struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index);
+struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index);
+struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index);
bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type);
-int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool);
-void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t);
-long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
-void add_ino_entry(struct f2fs_sb_info *, nid_t, int type);
-void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type);
-void release_ino_entry(struct f2fs_sb_info *, bool);
-bool exist_written_data(struct f2fs_sb_info *, nid_t, int);
-int f2fs_sync_inode_meta(struct f2fs_sb_info *);
-int acquire_orphan_inode(struct f2fs_sb_info *);
-void release_orphan_inode(struct f2fs_sb_info *);
-void add_orphan_inode(struct inode *);
-void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
-int recover_orphan_inodes(struct f2fs_sb_info *);
-int get_valid_checkpoint(struct f2fs_sb_info *);
-void update_dirty_page(struct inode *, struct page *);
-void remove_dirty_inode(struct inode *);
-int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type);
-int write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
-void init_ino_entry_info(struct f2fs_sb_info *);
-int __init create_checkpoint_caches(void);
-void destroy_checkpoint_caches(void);
+int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
+ int type, bool sync);
+void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index);
+long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
+ long nr_to_write, enum iostat_type io_type);
+void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type);
+void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type);
+void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all);
+bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode);
+void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino,
+ unsigned int devidx, int type);
+bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino,
+ unsigned int devidx, int type);
+int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi);
+int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi);
+void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi);
+void f2fs_add_orphan_inode(struct inode *inode);
+void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino);
+int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi);
+int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi);
+void f2fs_update_dirty_page(struct inode *inode, struct page *page);
+void f2fs_remove_dirty_inode(struct inode *inode);
+int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type);
+void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi);
+int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc);
+void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi);
+int __init f2fs_create_checkpoint_caches(void);
+void f2fs_destroy_checkpoint_caches(void);
/*
* data.c
*/
-void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int);
-void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *,
- struct page *, nid_t, enum page_type, int);
-void f2fs_flush_merged_bios(struct f2fs_sb_info *);
-int f2fs_submit_page_bio(struct f2fs_io_info *);
-void f2fs_submit_page_mbio(struct f2fs_io_info *);
-void set_data_blkaddr(struct dnode_of_data *);
-void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t);
-int reserve_new_blocks(struct dnode_of_data *, blkcnt_t);
-int reserve_new_block(struct dnode_of_data *);
-int f2fs_get_block(struct dnode_of_data *, pgoff_t);
-ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *);
-int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
-struct page *get_read_data_page(struct inode *, pgoff_t, int, bool);
-struct page *find_data_page(struct inode *, pgoff_t);
-struct page *get_lock_data_page(struct inode *, pgoff_t, bool);
-struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
-int do_write_data_page(struct f2fs_io_info *);
-int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int);
-int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
-void f2fs_set_page_dirty_nobuffers(struct page *);
-void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
-int f2fs_release_page(struct page *, gfp_t);
+int f2fs_init_post_read_processing(void);
+void f2fs_destroy_post_read_processing(void);
+void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type);
+void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
+ struct inode *inode, struct page *page,
+ nid_t ino, enum page_type type);
+void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi);
+int f2fs_submit_page_bio(struct f2fs_io_info *fio);
+void f2fs_submit_page_write(struct f2fs_io_info *fio);
+struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
+ block_t blk_addr, struct bio *bio);
+int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr);
+void f2fs_set_data_blkaddr(struct dnode_of_data *dn);
+void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr);
+int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count);
+int f2fs_reserve_new_block(struct dnode_of_data *dn);
+int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index);
+int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from);
+int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index);
+struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
+ int op_flags, bool for_write);
+struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index);
+struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
+ bool for_write);
+struct page *f2fs_get_new_data_page(struct inode *inode,
+ struct page *ipage, pgoff_t index, bool new_i_size);
+int f2fs_do_write_data_page(struct f2fs_io_info *fio);
+void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock);
+int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
+ int create, int flag);
+int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len);
+bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio);
+bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio);
+void f2fs_invalidate_page(struct page *page, unsigned int offset,
+ unsigned int length);
+int f2fs_release_page(struct page *page, gfp_t wait);
#ifdef CONFIG_MIGRATION
-int f2fs_migrate_page(struct address_space *, struct page *, struct page *,
- enum migrate_mode);
+int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
+ struct page *page, enum migrate_mode mode);
#endif
+bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len);
+void f2fs_clear_radix_tree_dirty_tag(struct page *page);
/*
* gc.c
*/
-int start_gc_thread(struct f2fs_sb_info *);
-void stop_gc_thread(struct f2fs_sb_info *);
-block_t start_bidx_of_node(unsigned int, struct inode *);
-int f2fs_gc(struct f2fs_sb_info *, bool);
-void build_gc_manager(struct f2fs_sb_info *);
+int f2fs_start_gc_thread(struct f2fs_sb_info *sbi);
+void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi);
+block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode);
+int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background,
+ unsigned int segno);
+void f2fs_build_gc_manager(struct f2fs_sb_info *sbi);
/*
* recovery.c
*/
-int recover_fsync_data(struct f2fs_sb_info *, bool);
-bool space_for_roll_forward(struct f2fs_sb_info *);
+int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only);
+bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi);
/*
* debug.c
@@ -2232,13 +3197,23 @@ struct f2fs_stat_info {
unsigned long long hit_largest, hit_cached, hit_rbtree;
unsigned long long hit_total, total_ext;
int ext_tree, zombie_tree, ext_node;
- int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta;
+ int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta;
+ int ndirty_data, ndirty_qdata;
int inmem_pages;
- unsigned int ndirty_dirs, ndirty_files, ndirty_all;
- int nats, dirty_nats, sits, dirty_sits, fnids;
+ unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all;
+ int nats, dirty_nats, sits, dirty_sits;
+ int free_nids, avail_nids, alloc_nids;
int total_count, utilization;
- int bg_gc, wb_bios;
- int inline_xattr, inline_inode, inline_dir, orphans;
+ int bg_gc, nr_wb_cp_data, nr_wb_data;
+ int nr_rd_data, nr_rd_node, nr_rd_meta;
+ int nr_dio_read, nr_dio_write;
+ unsigned int io_skip_bggc, other_skip_bggc;
+ int nr_flushing, nr_flushed, flush_list_empty;
+ int nr_discarding, nr_discarded;
+ int nr_discard_cmd;
+ unsigned int undiscard_blks;
+ int inline_xattr, inline_inode, inline_dir, append, update, orphans;
+ int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt;
unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks;
unsigned int bimodal, avg_vblocks;
int util_free, util_valid, util_invalid;
@@ -2249,10 +3224,12 @@ struct f2fs_stat_info {
int bg_node_segs, bg_data_segs;
int tot_blks, data_blks, node_blks;
int bg_data_blks, bg_node_blks;
+ unsigned long long skipped_atomic_files[2];
int curseg[NR_CURSEG_TYPE];
int cursec[NR_CURSEG_TYPE];
int curzone[NR_CURSEG_TYPE];
+ unsigned int meta_count[META_MAX];
unsigned int segment_count[2];
unsigned int block_count[2];
unsigned int inplace_count;
@@ -2268,6 +3245,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++)
#define stat_inc_call_count(si) ((si)->call_count++)
#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++)
+#define stat_io_skip_bggc_count(sbi) ((sbi)->io_skip_bggc++)
+#define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++)
#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++)
#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--)
#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext))
@@ -2304,17 +3283,50 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
if (f2fs_has_inline_dentry(inode)) \
(atomic_dec(&F2FS_I_SB(inode)->inline_dir)); \
} while (0)
+#define stat_inc_meta_count(sbi, blkaddr) \
+ do { \
+ if (blkaddr < SIT_I(sbi)->sit_base_addr) \
+ atomic_inc(&(sbi)->meta_count[META_CP]); \
+ else if (blkaddr < NM_I(sbi)->nat_blkaddr) \
+ atomic_inc(&(sbi)->meta_count[META_SIT]); \
+ else if (blkaddr < SM_I(sbi)->ssa_blkaddr) \
+ atomic_inc(&(sbi)->meta_count[META_NAT]); \
+ else if (blkaddr < SM_I(sbi)->main_blkaddr) \
+ atomic_inc(&(sbi)->meta_count[META_SSA]); \
+ } while (0)
#define stat_inc_seg_type(sbi, curseg) \
((sbi)->segment_count[(curseg)->alloc_type]++)
#define stat_inc_block_count(sbi, curseg) \
((sbi)->block_count[(curseg)->alloc_type]++)
#define stat_inc_inplace_blocks(sbi) \
(atomic_inc(&(sbi)->inplace_count))
+#define stat_inc_atomic_write(inode) \
+ (atomic_inc(&F2FS_I_SB(inode)->aw_cnt))
+#define stat_dec_atomic_write(inode) \
+ (atomic_dec(&F2FS_I_SB(inode)->aw_cnt))
+#define stat_update_max_atomic_write(inode) \
+ do { \
+ int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \
+ int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \
+ if (cur > max) \
+ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \
+ } while (0)
+#define stat_inc_volatile_write(inode) \
+ (atomic_inc(&F2FS_I_SB(inode)->vw_cnt))
+#define stat_dec_volatile_write(inode) \
+ (atomic_dec(&F2FS_I_SB(inode)->vw_cnt))
+#define stat_update_max_volatile_write(inode) \
+ do { \
+ int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \
+ int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \
+ if (cur > max) \
+ atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \
+ } while (0)
#define stat_inc_seg_count(sbi, type, gc_type) \
do { \
struct f2fs_stat_info *si = F2FS_STAT(sbi); \
- (si)->tot_segs++; \
- if (type == SUM_TYPE_DATA) { \
+ si->tot_segs++; \
+ if ((type) == SUM_TYPE_DATA) { \
si->data_segs++; \
si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \
} else { \
@@ -2324,14 +3336,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
} while (0)
#define stat_inc_tot_blk_count(si, blks) \
- (si->tot_blks += (blks))
+ ((si)->tot_blks += (blks))
#define stat_inc_data_blk_count(sbi, blks, gc_type) \
do { \
struct f2fs_stat_info *si = F2FS_STAT(sbi); \
stat_inc_tot_blk_count(si, blks); \
si->data_blks += (blks); \
- si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0; \
+ si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0; \
} while (0)
#define stat_inc_node_blk_count(sbi, blks, gc_type) \
@@ -2339,37 +3351,46 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
struct f2fs_stat_info *si = F2FS_STAT(sbi); \
stat_inc_tot_blk_count(si, blks); \
si->node_blks += (blks); \
- si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \
+ si->bg_node_blks += ((gc_type) == BG_GC) ? (blks) : 0; \
} while (0)
-int f2fs_build_stats(struct f2fs_sb_info *);
-void f2fs_destroy_stats(struct f2fs_sb_info *);
+int f2fs_build_stats(struct f2fs_sb_info *sbi);
+void f2fs_destroy_stats(struct f2fs_sb_info *sbi);
int __init f2fs_create_root_stats(void);
void f2fs_destroy_root_stats(void);
#else
-#define stat_inc_cp_count(si)
-#define stat_inc_bg_cp_count(si)
-#define stat_inc_call_count(si)
-#define stat_inc_bggc_count(si)
-#define stat_inc_dirty_inode(sbi, type)
-#define stat_dec_dirty_inode(sbi, type)
-#define stat_inc_total_hit(sb)
-#define stat_inc_rbtree_node_hit(sb)
-#define stat_inc_largest_node_hit(sbi)
-#define stat_inc_cached_node_hit(sbi)
-#define stat_inc_inline_xattr(inode)
-#define stat_dec_inline_xattr(inode)
-#define stat_inc_inline_inode(inode)
-#define stat_dec_inline_inode(inode)
-#define stat_inc_inline_dir(inode)
-#define stat_dec_inline_dir(inode)
-#define stat_inc_seg_type(sbi, curseg)
-#define stat_inc_block_count(sbi, curseg)
-#define stat_inc_inplace_blocks(sbi)
-#define stat_inc_seg_count(sbi, type, gc_type)
-#define stat_inc_tot_blk_count(si, blks)
-#define stat_inc_data_blk_count(sbi, blks, gc_type)
-#define stat_inc_node_blk_count(sbi, blks, gc_type)
+#define stat_inc_cp_count(si) do { } while (0)
+#define stat_inc_bg_cp_count(si) do { } while (0)
+#define stat_inc_call_count(si) do { } while (0)
+#define stat_inc_bggc_count(si) do { } while (0)
+#define stat_io_skip_bggc_count(sbi) do { } while (0)
+#define stat_other_skip_bggc_count(sbi) do { } while (0)
+#define stat_inc_dirty_inode(sbi, type) do { } while (0)
+#define stat_dec_dirty_inode(sbi, type) do { } while (0)
+#define stat_inc_total_hit(sb) do { } while (0)
+#define stat_inc_rbtree_node_hit(sb) do { } while (0)
+#define stat_inc_largest_node_hit(sbi) do { } while (0)
+#define stat_inc_cached_node_hit(sbi) do { } while (0)
+#define stat_inc_inline_xattr(inode) do { } while (0)
+#define stat_dec_inline_xattr(inode) do { } while (0)
+#define stat_inc_inline_inode(inode) do { } while (0)
+#define stat_dec_inline_inode(inode) do { } while (0)
+#define stat_inc_inline_dir(inode) do { } while (0)
+#define stat_dec_inline_dir(inode) do { } while (0)
+#define stat_inc_atomic_write(inode) do { } while (0)
+#define stat_dec_atomic_write(inode) do { } while (0)
+#define stat_update_max_atomic_write(inode) do { } while (0)
+#define stat_inc_volatile_write(inode) do { } while (0)
+#define stat_dec_volatile_write(inode) do { } while (0)
+#define stat_update_max_volatile_write(inode) do { } while (0)
+#define stat_inc_meta_count(sbi, blkaddr) do { } while (0)
+#define stat_inc_seg_type(sbi, curseg) do { } while (0)
+#define stat_inc_block_count(sbi, curseg) do { } while (0)
+#define stat_inc_inplace_blocks(sbi) do { } while (0)
+#define stat_inc_seg_count(sbi, type, gc_type) do { } while (0)
+#define stat_inc_tot_blk_count(si, blks) do { } while (0)
+#define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0)
+#define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0)
static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
@@ -2387,56 +3408,84 @@ extern const struct inode_operations f2fs_dir_inode_operations;
extern const struct inode_operations f2fs_symlink_inode_operations;
extern const struct inode_operations f2fs_encrypted_symlink_inode_operations;
extern const struct inode_operations f2fs_special_inode_operations;
-extern struct kmem_cache *inode_entry_slab;
+extern struct kmem_cache *f2fs_inode_entry_slab;
/*
* inline.c
*/
-bool f2fs_may_inline_data(struct inode *);
-bool f2fs_may_inline_dentry(struct inode *);
-void read_inline_data(struct page *, struct page *);
-bool truncate_inline_inode(struct page *, u64);
-int f2fs_read_inline_data(struct inode *, struct page *);
-int f2fs_convert_inline_page(struct dnode_of_data *, struct page *);
-int f2fs_convert_inline_inode(struct inode *);
-int f2fs_write_inline_data(struct inode *, struct page *);
-bool recover_inline_data(struct inode *, struct page *);
-struct f2fs_dir_entry *find_in_inline_dir(struct inode *,
- struct fscrypt_name *, struct page **);
-int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *);
-int f2fs_add_inline_entry(struct inode *, const struct qstr *,
- const struct qstr *, struct inode *, nid_t, umode_t);
-void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
- struct inode *, struct inode *);
-bool f2fs_empty_inline_dir(struct inode *);
-int f2fs_read_inline_dir(struct file *, struct dir_context *,
- struct fscrypt_str *);
-int f2fs_inline_data_fiemap(struct inode *,
- struct fiemap_extent_info *, __u64, __u64);
+bool f2fs_may_inline_data(struct inode *inode);
+bool f2fs_may_inline_dentry(struct inode *inode);
+void f2fs_do_read_inline_data(struct page *page, struct page *ipage);
+void f2fs_truncate_inline_inode(struct inode *inode,
+ struct page *ipage, u64 from);
+int f2fs_read_inline_data(struct inode *inode, struct page *page);
+int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page);
+int f2fs_convert_inline_inode(struct inode *inode);
+int f2fs_write_inline_data(struct inode *inode, struct page *page);
+bool f2fs_recover_inline_data(struct inode *inode, struct page *npage);
+struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir,
+ struct fscrypt_name *fname, struct page **res_page);
+int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent,
+ struct page *ipage);
+int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
+ const struct qstr *orig_name,
+ struct inode *inode, nid_t ino, umode_t mode);
+void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry,
+ struct page *page, struct inode *dir,
+ struct inode *inode);
+bool f2fs_empty_inline_dir(struct inode *dir);
+int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
+ struct fscrypt_str *fstr);
+int f2fs_inline_data_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len);
/*
* shrinker.c
*/
-unsigned long f2fs_shrink_count(struct shrinker *, struct shrink_control *);
-unsigned long f2fs_shrink_scan(struct shrinker *, struct shrink_control *);
-void f2fs_join_shrinker(struct f2fs_sb_info *);
-void f2fs_leave_shrinker(struct f2fs_sb_info *);
+unsigned long f2fs_shrink_count(struct shrinker *shrink,
+ struct shrink_control *sc);
+unsigned long f2fs_shrink_scan(struct shrinker *shrink,
+ struct shrink_control *sc);
+void f2fs_join_shrinker(struct f2fs_sb_info *sbi);
+void f2fs_leave_shrinker(struct f2fs_sb_info *sbi);
/*
* extent_cache.c
*/
-unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int);
-bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *);
-void f2fs_drop_extent_tree(struct inode *);
-unsigned int f2fs_destroy_extent_node(struct inode *);
-void f2fs_destroy_extent_tree(struct inode *);
-bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *);
-void f2fs_update_extent_cache(struct dnode_of_data *);
+struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root,
+ struct rb_entry *cached_re, unsigned int ofs);
+struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
+ struct rb_root *root, struct rb_node **parent,
+ unsigned int ofs);
+struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root,
+ struct rb_entry *cached_re, unsigned int ofs,
+ struct rb_entry **prev_entry, struct rb_entry **next_entry,
+ struct rb_node ***insert_p, struct rb_node **insert_parent,
+ bool force);
+bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi,
+ struct rb_root *root);
+unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink);
+bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext);
+void f2fs_drop_extent_tree(struct inode *inode);
+unsigned int f2fs_destroy_extent_node(struct inode *inode);
+void f2fs_destroy_extent_tree(struct inode *inode);
+bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei);
+void f2fs_update_extent_cache(struct dnode_of_data *dn);
void f2fs_update_extent_cache_range(struct dnode_of_data *dn,
- pgoff_t, block_t, unsigned int);
-void init_extent_cache_info(struct f2fs_sb_info *);
-int __init create_extent_cache(void);
-void destroy_extent_cache(void);
+ pgoff_t fofs, block_t blkaddr, unsigned int len);
+void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi);
+int __init f2fs_create_extent_cache(void);
+void f2fs_destroy_extent_cache(void);
+
+/*
+ * sysfs.c
+ */
+int __init f2fs_init_sysfs(void);
+void f2fs_exit_sysfs(void);
+int f2fs_register_sysfs(struct f2fs_sb_info *sbi);
+void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi);
/*
* crypto support
@@ -2446,26 +3495,73 @@ static inline bool f2fs_encrypted_inode(struct inode *inode)
return file_is_encrypt(inode);
}
+static inline bool f2fs_encrypted_file(struct inode *inode)
+{
+ return f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode);
+}
+
static inline void f2fs_set_encrypted_inode(struct inode *inode)
{
#ifdef CONFIG_F2FS_FS_ENCRYPTION
file_set_encrypt(inode);
+ f2fs_set_inode_flags(inode);
#endif
}
-static inline bool f2fs_bio_encrypted(struct bio *bio)
+/*
+ * Returns true if the reads of the inode's data need to undergo some
+ * postprocessing step, like decryption or authenticity verification.
+ */
+static inline bool f2fs_post_read_required(struct inode *inode)
+{
+ return f2fs_encrypted_file(inode);
+}
+
+#define F2FS_FEATURE_FUNCS(name, flagname) \
+static inline int f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \
+{ \
+ return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname); \
+}
+
+F2FS_FEATURE_FUNCS(encrypt, ENCRYPT);
+F2FS_FEATURE_FUNCS(blkzoned, BLKZONED);
+F2FS_FEATURE_FUNCS(extra_attr, EXTRA_ATTR);
+F2FS_FEATURE_FUNCS(project_quota, PRJQUOTA);
+F2FS_FEATURE_FUNCS(inode_chksum, INODE_CHKSUM);
+F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR);
+F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO);
+F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME);
+F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND);
+F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM);
+
+#ifdef CONFIG_BLK_DEV_ZONED
+static inline int get_blkz_type(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t blkaddr)
{
- return bio->bi_private != NULL;
+ unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz;
+ int i;
+
+ for (i = 0; i < sbi->s_ndevs; i++)
+ if (FDEV(i).bdev == bdev)
+ return FDEV(i).blkz_type[zno];
+ return -EINVAL;
}
+#endif
-static inline int f2fs_sb_has_crypto(struct super_block *sb)
+static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi)
{
- return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT);
+ return f2fs_sb_has_blkzoned(sbi);
}
-static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb)
+static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi)
{
- return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR);
+ return blk_queue_discard(bdev_get_queue(sbi->sb->s_bdev));
+}
+
+static inline bool f2fs_realtime_discard_enable(struct f2fs_sb_info *sbi)
+{
+ return (test_opt(sbi, DISCARD) && f2fs_hw_support_discard(sbi)) ||
+ f2fs_hw_should_discard(sbi);
}
static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
@@ -2490,32 +3586,74 @@ static inline bool f2fs_may_encrypt(struct inode *inode)
return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
#else
- return 0;
+ return false;
#endif
}
-#ifndef CONFIG_F2FS_FS_ENCRYPTION
-#define fscrypt_set_d_op(i)
-#define fscrypt_get_ctx fscrypt_notsupp_get_ctx
-#define fscrypt_release_ctx fscrypt_notsupp_release_ctx
-#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page
-#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page
-#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages
-#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page
-#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page
-#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range
-#define fscrypt_process_policy fscrypt_notsupp_process_policy
-#define fscrypt_get_policy fscrypt_notsupp_get_policy
-#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context
-#define fscrypt_inherit_context fscrypt_notsupp_inherit_context
-#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info
-#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info
-#define fscrypt_setup_filename fscrypt_notsupp_setup_filename
-#define fscrypt_free_filename fscrypt_notsupp_free_filename
-#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size
-#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer
-#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer
-#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr
-#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk
+static inline int block_unaligned_IO(struct inode *inode,
+ struct kiocb *iocb, struct iov_iter *iter)
+{
+ unsigned int i_blkbits = READ_ONCE(inode->i_blkbits);
+ unsigned int blocksize_mask = (1 << i_blkbits) - 1;
+ loff_t offset = iocb->ki_pos;
+ unsigned long align = offset | iov_iter_alignment(iter);
+
+ return align & blocksize_mask;
+}
+
+static inline int allow_outplace_dio(struct inode *inode,
+ struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int rw = iov_iter_rw(iter);
+
+ return (test_opt(sbi, LFS) && (rw == WRITE) &&
+ !block_unaligned_IO(inode, iocb, iter));
+}
+
+static inline bool f2fs_force_buffered_io(struct inode *inode,
+ struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int rw = iov_iter_rw(iter);
+
+ if (f2fs_post_read_required(inode))
+ return true;
+ if (sbi->s_ndevs)
+ return true;
+ /*
+ * for blkzoned device, fallback direct IO to buffered IO, so
+ * all IOs can be serialized by log-structured write.
+ */
+ if (f2fs_sb_has_blkzoned(sbi))
+ return true;
+ if (test_opt(sbi, LFS) && (rw == WRITE) &&
+ block_unaligned_IO(inode, iocb, iter))
+ return true;
+ if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED))
+ return true;
+
+ return false;
+}
+
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
+ unsigned int type);
+#else
+#define f2fs_build_fault_attr(sbi, rate, type) do { } while (0)
#endif
+
#endif
+
+static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
+{
+#ifdef CONFIG_QUOTA
+ if (f2fs_sb_has_quota_ino(sbi))
+ return true;
+ if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] ||
+ F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] ||
+ F2FS_OPTION(sbi).s_qf_names[PRJQUOTA])
+ return true;
+#endif
+ return false;
+}
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index b768f495603e..d3504977547d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/file.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
@@ -20,6 +17,7 @@
#include <linux/uaccess.h>
#include <linux/mount.h>
#include <linux/pagevec.h>
+#include <linux/uio.h>
#include <linux/uuid.h>
#include <linux/file.h>
@@ -32,52 +30,76 @@
#include "trace.h"
#include <trace/events/f2fs.h>
+static int f2fs_filemap_fault(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ int err;
+
+ down_read(&F2FS_I(inode)->i_mmap_sem);
+ err = filemap_fault(vma, vmf);
+ up_read(&F2FS_I(inode)->i_mmap_sem);
+
+ return err;
+}
+
static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
struct vm_fault *vmf)
{
struct page *page = vmf->page;
struct inode *inode = file_inode(vma->vm_file);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct dnode_of_data dn;
+ struct dnode_of_data dn = { .node_changed = false };
int err;
+ if (unlikely(f2fs_cp_error(sbi))) {
+ err = -EIO;
+ goto err;
+ }
+
sb_start_pagefault(inode->i_sb);
f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
- /* block allocation */
- f2fs_lock_op(sbi);
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = f2fs_reserve_block(&dn, page->index);
- if (err) {
- f2fs_unlock_op(sbi);
- goto out;
- }
- f2fs_put_dnode(&dn);
- f2fs_unlock_op(sbi);
-
- f2fs_balance_fs(sbi, dn.node_changed);
-
file_update_time(vma->vm_file);
+ down_read(&F2FS_I(inode)->i_mmap_sem);
lock_page(page);
if (unlikely(page->mapping != inode->i_mapping ||
page_offset(page) > i_size_read(inode) ||
!PageUptodate(page))) {
unlock_page(page);
err = -EFAULT;
- goto out;
+ goto out_sem;
+ }
+
+ /* block allocation */
+ __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = f2fs_get_block(&dn, page->index);
+ f2fs_put_dnode(&dn);
+ __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
+ if (err) {
+ unlock_page(page);
+ goto out_sem;
}
+ /* fill the page */
+ f2fs_wait_on_page_writeback(page, DATA, false, true);
+
+ /* wait for GCed page writeback via META_MAPPING */
+ f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);
+
/*
* check to see if the page is mapped already (no holes)
*/
if (PageMappedToDisk(page))
- goto mapped;
+ goto out_sem;
/* page is wholly or partially inside EOF */
if (((loff_t)(page->index + 1) << PAGE_SHIFT) >
i_size_read(inode)) {
- unsigned offset;
+ loff_t offset;
+
offset = i_size_read(inode) & ~PAGE_MASK;
zero_user_segment(page, offset, PAGE_SIZE);
}
@@ -85,25 +107,22 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
if (!PageUptodate(page))
SetPageUptodate(page);
+ f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE);
+ f2fs_update_time(sbi, REQ_TIME);
+
trace_f2fs_vm_page_mkwrite(page, DATA);
-mapped:
- /* fill the page */
- f2fs_wait_on_page_writeback(page, DATA, false);
+out_sem:
+ up_read(&F2FS_I(inode)->i_mmap_sem);
- /* wait for GCed encrypted page writeback */
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
- f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr);
+ f2fs_balance_fs(sbi, dn.node_changed);
- /* if gced page is attached, don't write to cold segment */
- clear_cold_data(page);
-out:
sb_end_pagefault(inode->i_sb);
- f2fs_update_time(sbi, REQ_TIME);
+err:
return block_page_mkwrite_return(err);
}
static const struct vm_operations_struct f2fs_file_vm_ops = {
- .fault = filemap_fault,
+ .fault = f2fs_filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = f2fs_vm_page_mkwrite,
};
@@ -118,39 +137,39 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
if (!dentry)
return 0;
- if (update_dent_inode(inode, inode, &dentry->d_name)) {
- dput(dentry);
- return 0;
- }
-
*pino = parent_ino(dentry);
dput(dentry);
return 1;
}
-static inline bool need_do_checkpoint(struct inode *inode)
+static inline enum cp_reason_type need_do_checkpoint(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- bool need_cp = false;
+ enum cp_reason_type cp_reason = CP_NO_NEEDED;
- if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
- need_cp = true;
+ if (!S_ISREG(inode->i_mode))
+ cp_reason = CP_NON_REGULAR;
+ else if (inode->i_nlink != 1)
+ cp_reason = CP_HARDLINK;
else if (is_sbi_flag_set(sbi, SBI_NEED_CP))
- need_cp = true;
+ cp_reason = CP_SB_NEED_CP;
else if (file_wrong_pino(inode))
- need_cp = true;
- else if (!space_for_roll_forward(sbi))
- need_cp = true;
- else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
- need_cp = true;
- else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
- need_cp = true;
+ cp_reason = CP_WRONG_PINO;
+ else if (!f2fs_space_for_roll_forward(sbi))
+ cp_reason = CP_NO_SPC_ROLL;
+ else if (!f2fs_is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
+ cp_reason = CP_NODE_NEED_CP;
else if (test_opt(sbi, FASTBOOT))
- need_cp = true;
- else if (sbi->active_logs == 2)
- need_cp = true;
-
- return need_cp;
+ cp_reason = CP_FASTBOOT_MODE;
+ else if (F2FS_OPTION(sbi).active_logs == 2)
+ cp_reason = CP_SPEC_LOG_NUM;
+ else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT &&
+ f2fs_need_dentry_mark(sbi, inode->i_ino) &&
+ f2fs_exist_written_data(sbi, F2FS_I(inode)->i_pino,
+ TRANS_DIR_INO))
+ cp_reason = CP_RECOVER_DIR;
+
+ return cp_reason;
}
static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino)
@@ -158,7 +177,7 @@ static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino)
struct page *i = find_get_page(NODE_MAPPING(sbi), ino);
bool ret = false;
/* But we need to avoid that there are some inode updates */
- if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino))
+ if ((i && PageDirty(i)) || f2fs_need_inode_block_update(sbi, ino))
ret = true;
f2fs_put_page(i, 0);
return ret;
@@ -170,7 +189,6 @@ static void try_to_fix_pino(struct inode *inode)
nid_t pino;
down_write(&fi->i_sem);
- fi->xattr_ver = 0;
if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
get_parent_ino(inode, &pino)) {
f2fs_i_pino_write(inode, pino);
@@ -186,18 +204,23 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t ino = inode->i_ino;
int ret = 0;
- bool need_cp = false;
+ enum cp_reason_type cp_reason = 0;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = LONG_MAX,
.for_reclaim = 0,
};
+ unsigned int seq_id = 0;
- if (unlikely(f2fs_readonly(inode->i_sb)))
+ if (unlikely(f2fs_readonly(inode->i_sb) ||
+ is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
return 0;
trace_f2fs_sync_file_enter(inode);
+ if (S_ISDIR(inode->i_mode))
+ goto go_write;
+
/* if fdatasync is triggered, let's do in-place-update */
if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
set_inode_flag(inode, FI_NEED_IPU);
@@ -205,12 +228,12 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
clear_inode_flag(inode, FI_NEED_IPU);
if (ret) {
- trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
+ trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret);
return ret;
}
/* if the inode is dirty, let's recover all the time */
- if (!datasync && !f2fs_skip_inode_update(inode)) {
+ if (!f2fs_skip_inode_update(inode, datasync)) {
f2fs_write_inode(inode, NULL);
goto go_write;
}
@@ -219,14 +242,14 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
* if there is no written data, don't waste time to write recovery info.
*/
if (!is_inode_flag_set(inode, FI_APPEND_WRITE) &&
- !exist_written_data(sbi, ino, APPEND_INO)) {
+ !f2fs_exist_written_data(sbi, ino, APPEND_INO)) {
/* it may call write_inode just prior to fsync */
if (need_inode_page_update(sbi, ino))
goto go_write;
if (is_inode_flag_set(inode, FI_UPDATE_WRITE) ||
- exist_written_data(sbi, ino, UPDATE_INO))
+ f2fs_exist_written_data(sbi, ino, UPDATE_INO))
goto flush_out;
goto out;
}
@@ -236,10 +259,10 @@ go_write:
* sudden-power-off.
*/
down_read(&F2FS_I(inode)->i_sem);
- need_cp = need_do_checkpoint(inode);
+ cp_reason = need_do_checkpoint(inode);
up_read(&F2FS_I(inode)->i_sem);
- if (need_cp) {
+ if (cp_reason) {
/* all the dirty node pages should be flushed for POR */
ret = f2fs_sync_fs(inode->i_sb, 1);
@@ -253,7 +276,9 @@ go_write:
goto out;
}
sync_nodes:
- ret = fsync_node_pages(sbi, inode, &wbc, atomic);
+ atomic_inc(&sbi->wb_sync_req[NODE]);
+ ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic, &seq_id);
+ atomic_dec(&sbi->wb_sync_req[NODE]);
if (ret)
goto out;
@@ -263,50 +288,67 @@ sync_nodes:
goto out;
}
- if (need_inode_block_update(sbi, ino)) {
- f2fs_mark_inode_dirty_sync(inode);
+ if (f2fs_need_inode_block_update(sbi, ino)) {
+ f2fs_mark_inode_dirty_sync(inode, true);
f2fs_write_inode(inode, NULL);
goto sync_nodes;
}
- ret = wait_on_node_pages_writeback(sbi, ino);
- if (ret)
- goto out;
+ /*
+ * If it's atomic_write, it's just fine to keep write ordering. So
+ * here we don't need to wait for node write completion, since we use
+ * node chain which serializes node blocks. If one of node writes are
+ * reordered, we can see simply broken chain, resulting in stopping
+ * roll-forward recovery. It means we'll recover all or none node blocks
+ * given fsync mark.
+ */
+ if (!atomic) {
+ ret = f2fs_wait_on_node_pages_writeback(sbi, seq_id);
+ if (ret)
+ goto out;
+ }
/* once recovery info is written, don't need to tack this */
- remove_ino_entry(sbi, ino, APPEND_INO);
+ f2fs_remove_ino_entry(sbi, ino, APPEND_INO);
clear_inode_flag(inode, FI_APPEND_WRITE);
flush_out:
- remove_ino_entry(sbi, ino, UPDATE_INO);
- clear_inode_flag(inode, FI_UPDATE_WRITE);
- ret = f2fs_issue_flush(sbi);
+ if (!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER)
+ ret = f2fs_issue_flush(sbi, inode->i_ino);
+ if (!ret) {
+ f2fs_remove_ino_entry(sbi, ino, UPDATE_INO);
+ clear_inode_flag(inode, FI_UPDATE_WRITE);
+ f2fs_remove_ino_entry(sbi, ino, FLUSH_INO);
+ }
f2fs_update_time(sbi, REQ_TIME);
out:
- trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
+ trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret);
f2fs_trace_ios(NULL, 1);
return ret;
}
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
+ if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(file)))))
+ return -EIO;
return f2fs_do_sync_file(file, start, end, datasync, false);
}
static pgoff_t __get_first_dirty_index(struct address_space *mapping,
pgoff_t pgofs, int whence)
{
- struct pagevec pvec;
+ struct page *page;
int nr_pages;
if (whence != SEEK_DATA)
return 0;
/* find first dirty page index */
- pagevec_init(&pvec, 0);
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs,
- PAGECACHE_TAG_DIRTY, 1);
- pgofs = nr_pages ? pvec.pages[0]->index : ULONG_MAX;
- pagevec_release(&pvec);
+ nr_pages = find_get_pages_tag(mapping, &pgofs, PAGECACHE_TAG_DIRTY,
+ 1, &page);
+ if (!nr_pages)
+ return ULONG_MAX;
+ pgofs = page->index;
+ put_page(page);
return pgofs;
}
@@ -356,13 +398,13 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) {
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE);
+ err = f2fs_get_dnode_of_data(&dn, pgofs, LOOKUP_NODE);
if (err && err != -ENOENT) {
goto fail;
} else if (err == -ENOENT) {
/* direct node does not exists */
if (whence == SEEK_DATA) {
- pgofs = get_next_page_offset(&dn, pgofs);
+ pgofs = f2fs_get_next_page_offset(&dn, pgofs);
continue;
} else {
goto found;
@@ -376,7 +418,9 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
dn.ofs_in_node++, pgofs++,
data_ofs = (loff_t)pgofs << PAGE_SHIFT) {
block_t blkaddr;
- blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+
+ blkaddr = datablock_addr(dn.inode,
+ dn.node_page, dn.ofs_in_node);
if (__is_valid_data_blkaddr(blkaddr) &&
!f2fs_is_valid_blkaddr(F2FS_I_SB(inode),
@@ -432,13 +476,8 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
struct inode *inode = file_inode(file);
int err;
- if (f2fs_encrypted_inode(inode)) {
- err = fscrypt_get_encryption_info(inode);
- if (err)
- return 0;
- if (!f2fs_encrypted_inode(inode))
- return -ENOKEY;
- }
+ if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+ return -EIO;
/* we don't need to use inline_data strictly */
err = f2fs_convert_inline_inode(inode);
@@ -452,49 +491,44 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
static int f2fs_file_open(struct inode *inode, struct file *filp)
{
- int ret = generic_file_open(inode, filp);
- struct dentry *dir;
+ int err = fscrypt_file_open(inode, filp);
- if (!ret && f2fs_encrypted_inode(inode)) {
- ret = fscrypt_get_encryption_info(inode);
- if (ret)
- return -EACCES;
- if (!fscrypt_has_encryption_key(inode))
- return -ENOKEY;
- }
- dir = dget_parent(file_dentry(filp));
- if (f2fs_encrypted_inode(d_inode(dir)) &&
- !fscrypt_has_permitted_context(d_inode(dir), inode)) {
- dput(dir);
- return -EPERM;
- }
- dput(dir);
- return ret;
+ if (err)
+ return err;
+
+ filp->f_mode |= FMODE_NOWAIT;
+
+ return dquot_file_open(inode, filp);
}
-int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
+void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct f2fs_node *raw_node;
int nr_free = 0, ofs = dn->ofs_in_node, len = count;
__le32 *addr;
+ int base = 0;
+
+ if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode))
+ base = get_extra_isize(dn->inode);
raw_node = F2FS_NODE(dn->node_page);
- addr = blkaddr_in_node(raw_node) + ofs;
+ addr = blkaddr_in_node(raw_node) + base + ofs;
for (; count > 0; count--, addr++, dn->ofs_in_node++) {
block_t blkaddr = le32_to_cpu(*addr);
+
if (blkaddr == NULL_ADDR)
continue;
dn->data_blkaddr = NULL_ADDR;
- set_data_blkaddr(dn);
+ f2fs_set_data_blkaddr(dn);
if (__is_valid_data_blkaddr(blkaddr) &&
!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC))
continue;
- invalidate_blocks(sbi, blkaddr);
+ f2fs_invalidate_blocks(sbi, blkaddr);
if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page))
clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN);
nr_free++;
@@ -506,7 +540,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
* once we invalidate valid blkaddr in range [ofs, ofs + count],
* we will invalidate all blkaddr in the whole range.
*/
- fofs = start_bidx_of_node(ofs_of_node(dn->node_page),
+ fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page),
dn->inode) + ofs;
f2fs_update_extent_cache_range(dn, fofs, 0, len);
dec_valid_block_count(sbi, dn->inode, nr_free);
@@ -516,18 +550,17 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
f2fs_update_time(sbi, REQ_TIME);
trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid,
dn->ofs_in_node, nr_free);
- return nr_free;
}
-void truncate_data_blocks(struct dnode_of_data *dn)
+void f2fs_truncate_data_blocks(struct dnode_of_data *dn)
{
- truncate_data_blocks_range(dn, ADDRS_PER_BLOCK);
+ f2fs_truncate_data_blocks_range(dn, ADDRS_PER_BLOCK);
}
static int truncate_partial_data_page(struct inode *inode, u64 from,
bool cache_only)
{
- unsigned offset = from & (PAGE_SIZE - 1);
+ loff_t offset = from & (PAGE_SIZE - 1);
pgoff_t index = from >> PAGE_SHIFT;
struct address_space *mapping = inode->i_mapping;
struct page *page;
@@ -543,55 +576,57 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
return 0;
}
- page = get_lock_data_page(inode, index, true);
+ page = f2fs_get_lock_data_page(inode, index, true);
if (IS_ERR(page))
- return 0;
+ return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page);
truncate_out:
- f2fs_wait_on_page_writeback(page, DATA, true);
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
zero_user(page, offset, PAGE_SIZE - offset);
- if (!cache_only || !f2fs_encrypted_inode(inode) ||
- !S_ISREG(inode->i_mode))
+
+ /* An encrypted inode should have a key and truncate the last page. */
+ f2fs_bug_on(F2FS_I_SB(inode), cache_only && f2fs_encrypted_inode(inode));
+ if (!cache_only)
set_page_dirty(page);
f2fs_put_page(page, 1);
return 0;
}
-int truncate_blocks(struct inode *inode, u64 from, bool lock)
+int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock,
+ bool buf_write)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- unsigned int blocksize = inode->i_sb->s_blocksize;
struct dnode_of_data dn;
pgoff_t free_from;
int count = 0, err = 0;
struct page *ipage;
bool truncate_page = false;
+ int flag = buf_write ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO;
trace_f2fs_truncate_blocks_enter(inode, from);
- free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1);
+ free_from = (pgoff_t)F2FS_BLK_ALIGN(from);
if (free_from >= sbi->max_file_blocks)
goto free_partial;
if (lock)
- f2fs_lock_op(sbi);
+ __do_map_lock(sbi, flag, true);
- ipage = get_node_page(sbi, inode->i_ino);
+ ipage = f2fs_get_node_page(sbi, inode->i_ino);
if (IS_ERR(ipage)) {
err = PTR_ERR(ipage);
goto out;
}
if (f2fs_has_inline_data(inode)) {
- if (truncate_inline_inode(ipage, from))
- set_page_dirty(ipage);
+ f2fs_truncate_inline_inode(inode, ipage, from);
f2fs_put_page(ipage, 1);
truncate_page = true;
goto out;
}
set_new_dnode(&dn, inode, ipage, NULL, 0);
- err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA);
+ err = f2fs_get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA);
if (err) {
if (err == -ENOENT)
goto free_next;
@@ -604,16 +639,16 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
f2fs_bug_on(sbi, count < 0);
if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
- truncate_data_blocks_range(&dn, count);
+ f2fs_truncate_data_blocks_range(&dn, count);
free_from += count;
}
f2fs_put_dnode(&dn);
free_next:
- err = truncate_inode_blocks(inode, free_from);
+ err = f2fs_truncate_inode_blocks(inode, free_from);
out:
if (lock)
- f2fs_unlock_op(sbi);
+ __do_map_lock(sbi, flag, false);
free_partial:
/* lastly zero out the first data page */
if (!err)
@@ -627,12 +662,20 @@ int f2fs_truncate(struct inode *inode)
{
int err;
+ if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+ return -EIO;
+
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)))
return 0;
trace_f2fs_truncate(inode);
+ if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) {
+ f2fs_show_injection_info(FAULT_TRUNCATE);
+ return -EIO;
+ }
+
/* we should check inline_data size */
if (!f2fs_may_inline_data(inode)) {
err = f2fs_convert_inline_inode(inode);
@@ -640,21 +683,57 @@ int f2fs_truncate(struct inode *inode)
return err;
}
- err = truncate_blocks(inode, i_size_read(inode), true);
+ err = f2fs_truncate_blocks(inode, i_size_read(inode), true, false);
if (err)
return err;
inode->i_mtime = inode->i_ctime = current_time(inode);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, false);
return 0;
}
int f2fs_getattr(struct vfsmount *mnt,
- struct dentry *dentry, struct kstat *stat)
+ struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = d_inode(dentry);
+#if 0
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_inode *ri;
+ unsigned int flags;
+
+ if (f2fs_has_extra_attr(inode) &&
+ f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)) &&
+ F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) {
+ stat->result_mask |= STATX_BTIME;
+ stat->btime.tv_sec = fi->i_crtime.tv_sec;
+ stat->btime.tv_nsec = fi->i_crtime.tv_nsec;
+ }
+
+ flags = fi->i_flags & F2FS_FL_USER_VISIBLE;
+ if (flags & F2FS_APPEND_FL)
+ stat->attributes |= STATX_ATTR_APPEND;
+ if (flags & F2FS_COMPR_FL)
+ stat->attributes |= STATX_ATTR_COMPRESSED;
+ if (f2fs_encrypted_inode(inode))
+ stat->attributes |= STATX_ATTR_ENCRYPTED;
+ if (flags & F2FS_IMMUTABLE_FL)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ if (flags & F2FS_NODUMP_FL)
+ stat->attributes |= STATX_ATTR_NODUMP;
+
+ stat->attributes_mask |= (STATX_ATTR_APPEND |
+ STATX_ATTR_COMPRESSED |
+ STATX_ATTR_ENCRYPTED |
+ STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_NODUMP);
+#endif
generic_fillattr(inode, stat);
- stat->blocks <<= 3;
+
+ /* we need to show initial sectors used for inline_data/dentries */
+ if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) ||
+ f2fs_has_inline_dentry(inode))
+ stat->blocks += (stat->size + 511) >> 9;
+
return 0;
}
@@ -692,29 +771,69 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
int err;
+ bool size_changed = false;
+
+ if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+ return -EIO;
err = setattr_prepare(dentry, attr);
if (err)
return err;
+ err = fscrypt_prepare_setattr(dentry, attr);
+ if (err)
+ return err;
+
+ if (is_quota_modification(inode, attr)) {
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
+ }
+ if ((attr->ia_valid & ATTR_UID &&
+ !uid_eq(attr->ia_uid, inode->i_uid)) ||
+ (attr->ia_valid & ATTR_GID &&
+ !gid_eq(attr->ia_gid, inode->i_gid))) {
+ f2fs_lock_op(F2FS_I_SB(inode));
+ err = dquot_transfer(inode, attr);
+ if (err) {
+ set_sbi_flag(F2FS_I_SB(inode),
+ SBI_QUOTA_NEED_REPAIR);
+ f2fs_unlock_op(F2FS_I_SB(inode));
+ return err;
+ }
+ /*
+ * update uid/gid under lock_op(), so that dquot and inode can
+ * be updated atomically.
+ */
+ if (attr->ia_valid & ATTR_UID)
+ inode->i_uid = attr->ia_uid;
+ if (attr->ia_valid & ATTR_GID)
+ inode->i_gid = attr->ia_gid;
+ f2fs_mark_inode_dirty_sync(inode, true);
+ f2fs_unlock_op(F2FS_I_SB(inode));
+ }
+
if (attr->ia_valid & ATTR_SIZE) {
- if (f2fs_encrypted_inode(inode) &&
- fscrypt_get_encryption_info(inode))
- return -EACCES;
+ bool to_smaller = (attr->ia_size <= i_size_read(inode));
+
+ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ down_write(&F2FS_I(inode)->i_mmap_sem);
- if (attr->ia_size <= i_size_read(inode)) {
- truncate_setsize(inode, attr->ia_size);
+ truncate_setsize(inode, attr->ia_size);
+
+ if (to_smaller)
err = f2fs_truncate(inode);
- if (err)
- return err;
- f2fs_balance_fs(F2FS_I_SB(inode), true);
- } else {
- /*
- * do not trim all blocks after i_size if target size is
- * larger than i_size.
- */
- truncate_setsize(inode, attr->ia_size);
+ /*
+ * do not trim all blocks after i_size if target size is
+ * larger than i_size.
+ */
+ up_write(&F2FS_I(inode)->i_mmap_sem);
+ up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ if (err)
+ return err;
+
+ if (!to_smaller) {
/* should convert inline inode here */
if (!f2fs_may_inline_data(inode)) {
err = f2fs_convert_inline_inode(inode);
@@ -723,19 +842,30 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
}
inode->i_mtime = inode->i_ctime = current_time(inode);
}
+
+ down_write(&F2FS_I(inode)->i_sem);
+ F2FS_I(inode)->last_disk_size = i_size_read(inode);
+ up_write(&F2FS_I(inode)->i_sem);
+
+ size_changed = true;
}
__setattr_copy(inode, attr);
if (attr->ia_valid & ATTR_MODE) {
- err = posix_acl_chmod(inode, get_inode_mode(inode));
+ err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode));
if (err || is_inode_flag_set(inode, FI_ACL_MODE)) {
inode->i_mode = F2FS_I(inode)->i_acl_mode;
clear_inode_flag(inode, FI_ACL_MODE);
}
}
- f2fs_mark_inode_dirty_sync(inode);
+ /* file size may changed here */
+ f2fs_mark_inode_dirty_sync(inode, size_changed);
+
+ /* inode change will produce dirty node pages flushed by checkpoint */
+ f2fs_balance_fs(F2FS_I_SB(inode), true);
+
return err;
}
@@ -762,20 +892,20 @@ static int fill_zero(struct inode *inode, pgoff_t index,
f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
- page = get_new_data_page(inode, NULL, index, false);
+ page = f2fs_get_new_data_page(inode, NULL, index, false);
f2fs_unlock_op(sbi);
if (IS_ERR(page))
return PTR_ERR(page);
- f2fs_wait_on_page_writeback(page, DATA, true);
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
zero_user(page, start, len);
set_page_dirty(page);
f2fs_put_page(page, 1);
return 0;
}
-int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
+int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
{
int err;
@@ -784,10 +914,11 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
pgoff_t end_offset, count;
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE);
+ err = f2fs_get_dnode_of_data(&dn, pg_start, LOOKUP_NODE);
if (err) {
if (err == -ENOENT) {
- pg_start++;
+ pg_start = f2fs_get_next_page_offset(&dn,
+ pg_start);
continue;
}
return err;
@@ -798,7 +929,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset);
- truncate_data_blocks_range(&dn, count);
+ f2fs_truncate_data_blocks_range(&dn, count);
f2fs_put_dnode(&dn);
pg_start += count;
@@ -849,12 +980,19 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
blk_start = (loff_t)pg_start << PAGE_SHIFT;
blk_end = (loff_t)pg_end << PAGE_SHIFT;
+
+ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ down_write(&F2FS_I(inode)->i_mmap_sem);
+
truncate_inode_pages_range(mapping, blk_start,
blk_end - 1);
f2fs_lock_op(sbi);
- ret = truncate_hole(inode, pg_start, pg_end);
+ ret = f2fs_truncate_hole(inode, pg_start, pg_end);
f2fs_unlock_op(sbi);
+
+ up_write(&F2FS_I(inode)->i_mmap_sem);
+ up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
}
}
@@ -870,7 +1008,7 @@ static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr,
next_dnode:
set_new_dnode(&dn, inode, NULL, NULL, 0);
- ret = get_dnode_of_data(&dn, off, LOOKUP_NODE_RA);
+ ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA);
if (ret && ret != -ENOENT) {
return ret;
} else if (ret == -ENOENT) {
@@ -885,8 +1023,9 @@ next_dnode:
done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) -
dn.ofs_in_node, len);
for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) {
- *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
- if (!is_checkpointed_data(sbi, *blkaddr)) {
+ *blkaddr = datablock_addr(dn.inode,
+ dn.node_page, dn.ofs_in_node);
+ if (!f2fs_is_checkpointed_data(sbi, *blkaddr)) {
if (test_opt(sbi, LFS)) {
f2fs_put_dnode(&dn);
@@ -919,10 +1058,10 @@ static int __roll_back_blkaddrs(struct inode *inode, block_t *blkaddr,
continue;
set_new_dnode(&dn, inode, NULL, NULL, 0);
- ret = get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA);
+ ret = f2fs_get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA);
if (ret) {
dec_valid_block_count(sbi, inode, 1);
- invalidate_blocks(sbi, *blkaddr);
+ f2fs_invalidate_blocks(sbi, *blkaddr);
} else {
f2fs_update_data_blkaddr(&dn, *blkaddr);
}
@@ -952,24 +1091,29 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
pgoff_t ilen;
set_new_dnode(&dn, dst_inode, NULL, NULL, 0);
- ret = get_dnode_of_data(&dn, dst + i, ALLOC_NODE);
+ ret = f2fs_get_dnode_of_data(&dn, dst + i, ALLOC_NODE);
if (ret)
return ret;
- get_node_info(sbi, dn.nid, &ni);
+ ret = f2fs_get_node_info(sbi, dn.nid, &ni);
+ if (ret) {
+ f2fs_put_dnode(&dn);
+ return ret;
+ }
+
ilen = min((pgoff_t)
ADDRS_PER_PAGE(dn.node_page, dst_inode) -
dn.ofs_in_node, len - i);
do {
- dn.data_blkaddr = datablock_addr(dn.node_page,
- dn.ofs_in_node);
- truncate_data_blocks_range(&dn, 1);
+ dn.data_blkaddr = datablock_addr(dn.inode,
+ dn.node_page, dn.ofs_in_node);
+ f2fs_truncate_data_blocks_range(&dn, 1);
if (do_replace[i]) {
f2fs_i_blocks_write(src_inode,
- 1, false);
+ 1, false, false);
f2fs_i_blocks_write(dst_inode,
- 1, true);
+ 1, true, false);
f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
blkaddr[i], ni.version, true, false);
@@ -986,10 +1130,11 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
} else {
struct page *psrc, *pdst;
- psrc = get_lock_data_page(src_inode, src + i, true);
+ psrc = f2fs_get_lock_data_page(src_inode,
+ src + i, true);
if (IS_ERR(psrc))
return PTR_ERR(psrc);
- pdst = get_new_data_page(dst_inode, NULL, dst + i,
+ pdst = f2fs_get_new_data_page(dst_inode, NULL, dst + i,
true);
if (IS_ERR(pdst)) {
f2fs_put_page(psrc, 1);
@@ -1000,7 +1145,8 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
f2fs_put_page(pdst, 1);
f2fs_put_page(psrc, 1);
- ret = truncate_hole(src_inode, src + i, src + i + 1);
+ ret = f2fs_truncate_hole(src_inode,
+ src + i, src + i + 1);
if (ret)
return ret;
i++;
@@ -1021,11 +1167,15 @@ static int __exchange_data_block(struct inode *src_inode,
while (len) {
olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len);
- src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL);
+ src_blkaddr = f2fs_kvzalloc(F2FS_I_SB(src_inode),
+ array_size(olen, sizeof(block_t)),
+ GFP_KERNEL);
if (!src_blkaddr)
return -ENOMEM;
- do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL);
+ do_replace = f2fs_kvzalloc(F2FS_I_SB(src_inode),
+ array_size(olen, sizeof(int)),
+ GFP_KERNEL);
if (!do_replace) {
kvfree(src_blkaddr);
return -ENOMEM;
@@ -1051,31 +1201,39 @@ static int __exchange_data_block(struct inode *src_inode,
return 0;
roll_back:
- __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, len);
+ __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, olen);
kvfree(src_blkaddr);
kvfree(do_replace);
return ret;
}
-static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
+static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
+ pgoff_t start = offset >> PAGE_SHIFT;
+ pgoff_t end = (offset + len) >> PAGE_SHIFT;
int ret;
f2fs_balance_fs(sbi, true);
- f2fs_lock_op(sbi);
- f2fs_drop_extent_tree(inode);
+ /* avoid gc operation during block exchange */
+ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ down_write(&F2FS_I(inode)->i_mmap_sem);
+ f2fs_lock_op(sbi);
+ f2fs_drop_extent_tree(inode);
+ truncate_pagecache(inode, offset);
ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true);
f2fs_unlock_op(sbi);
+
+ up_write(&F2FS_I(inode)->i_mmap_sem);
+ up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
return ret;
}
static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
{
- pgoff_t pg_start, pg_end;
loff_t new_size;
int ret;
@@ -1090,31 +1248,27 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
if (ret)
return ret;
- pg_start = offset >> PAGE_SHIFT;
- pg_end = (offset + len) >> PAGE_SHIFT;
-
/* write out all dirty pages from offset */
ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
if (ret)
return ret;
- truncate_pagecache(inode, offset);
-
- ret = f2fs_do_collapse(inode, pg_start, pg_end);
+ ret = f2fs_do_collapse(inode, offset, len);
if (ret)
return ret;
/* write out all moved pages, if possible */
+ down_write(&F2FS_I(inode)->i_mmap_sem);
filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
truncate_pagecache(inode, offset);
new_size = i_size_read(inode) - len;
truncate_pagecache(inode, new_size);
- ret = truncate_blocks(inode, new_size, true);
+ ret = f2fs_truncate_blocks(inode, new_size, true, false);
+ up_write(&F2FS_I(inode)->i_mmap_sem);
if (!ret)
f2fs_i_size_write(inode, new_size);
-
return ret;
}
@@ -1128,21 +1282,22 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
int ret;
for (; index < end; index++, dn->ofs_in_node++) {
- if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR)
+ if (datablock_addr(dn->inode, dn->node_page,
+ dn->ofs_in_node) == NULL_ADDR)
count++;
}
dn->ofs_in_node = ofs_in_node;
- ret = reserve_new_blocks(dn, count);
+ ret = f2fs_reserve_new_blocks(dn, count);
if (ret)
return ret;
dn->ofs_in_node = ofs_in_node;
for (index = start; index < end; index++, dn->ofs_in_node++) {
- dn->data_blkaddr =
- datablock_addr(dn->node_page, dn->ofs_in_node);
+ dn->data_blkaddr = datablock_addr(dn->inode,
+ dn->node_page, dn->ofs_in_node);
/*
- * reserve_new_blocks will not guarantee entire block
+ * f2fs_reserve_new_blocks will not guarantee entire block
* allocation.
*/
if (dn->data_blkaddr == NULL_ADDR) {
@@ -1150,9 +1305,9 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
break;
}
if (dn->data_blkaddr != NEW_ADDR) {
- invalidate_blocks(sbi, dn->data_blkaddr);
+ f2fs_invalidate_blocks(sbi, dn->data_blkaddr);
dn->data_blkaddr = NEW_ADDR;
- set_data_blkaddr(dn);
+ f2fs_set_data_blkaddr(dn);
}
}
@@ -1183,8 +1338,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
if (ret)
return ret;
- truncate_pagecache_range(inode, offset, offset + len - 1);
-
pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
@@ -1197,8 +1350,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
if (ret)
return ret;
- if (offset + len > new_size)
- new_size = offset + len;
new_size = max_t(loff_t, new_size, offset + len);
} else {
if (off_start) {
@@ -1216,12 +1367,21 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
unsigned int end_offset;
pgoff_t end;
+ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ down_write(&F2FS_I(inode)->i_mmap_sem);
+
+ truncate_pagecache_range(inode,
+ (loff_t)index << PAGE_SHIFT,
+ ((loff_t)pg_end << PAGE_SHIFT) - 1);
+
f2fs_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
- ret = get_dnode_of_data(&dn, index, ALLOC_NODE);
+ ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE);
if (ret) {
f2fs_unlock_op(sbi);
+ up_write(&F2FS_I(inode)->i_mmap_sem);
+ up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
goto out;
}
@@ -1230,7 +1390,13 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
ret = f2fs_do_zero_range(&dn, index, end);
f2fs_put_dnode(&dn);
+
f2fs_unlock_op(sbi);
+ up_write(&F2FS_I(inode)->i_mmap_sem);
+ up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+
+ f2fs_balance_fs(sbi, dn.node_changed);
+
if (ret)
goto out;
@@ -1249,9 +1415,12 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
}
out:
- if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size)
- f2fs_i_size_write(inode, new_size);
-
+ if (new_size > i_size_read(inode)) {
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ file_set_keep_isize(inode);
+ else
+ f2fs_i_size_write(inode, new_size);
+ }
return ret;
}
@@ -1263,8 +1432,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
int ret = 0;
new_size = i_size_read(inode) + len;
- if (new_size > inode->i_sb->s_maxbytes)
- return -EFBIG;
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ return ret;
if (offset >= i_size_read(inode))
return -EINVAL;
@@ -1279,7 +1449,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
f2fs_balance_fs(sbi, true);
- ret = truncate_blocks(inode, i_size_read(inode), true);
+ down_write(&F2FS_I(inode)->i_mmap_sem);
+ ret = f2fs_truncate_blocks(inode, i_size_read(inode), true, false);
+ up_write(&F2FS_I(inode)->i_mmap_sem);
if (ret)
return ret;
@@ -1288,13 +1460,16 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
if (ret)
return ret;
- truncate_pagecache(inode, offset);
-
pg_start = offset >> PAGE_SHIFT;
pg_end = (offset + len) >> PAGE_SHIFT;
delta = pg_end - pg_start;
idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
+ /* avoid gc operation during block exchange */
+ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ down_write(&F2FS_I(inode)->i_mmap_sem);
+ truncate_pagecache(inode, offset);
+
while (!ret && idx > pg_start) {
nr = idx - pg_start;
if (nr > delta)
@@ -1308,10 +1483,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
idx + delta, nr, false);
f2fs_unlock_op(sbi);
}
+ up_write(&F2FS_I(inode)->i_mmap_sem);
+ up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
/* write out all moved pages, if possible */
+ down_write(&F2FS_I(inode)->i_mmap_sem);
filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
truncate_pagecache(inode, offset);
+ up_write(&F2FS_I(inode)->i_mmap_sem);
if (!ret)
f2fs_i_size_write(inode, new_size);
@@ -1322,19 +1501,21 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
loff_t len, int mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct f2fs_map_blocks map = { .m_next_pgofs = NULL };
+ struct f2fs_map_blocks map = { .m_next_pgofs = NULL,
+ .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE,
+ .m_may_create = true };
pgoff_t pg_end;
loff_t new_size = i_size_read(inode);
loff_t off_end;
- int ret;
+ int err;
- ret = inode_newsize_ok(inode, (len + offset));
- if (ret)
- return ret;
+ err = inode_newsize_ok(inode, (len + offset));
+ if (err)
+ return err;
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
f2fs_balance_fs(sbi, true);
@@ -1346,26 +1527,30 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
if (off_end)
map.m_len++;
- ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
- if (ret) {
+ err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
+ if (err) {
pgoff_t last_off;
if (!map.m_len)
- return ret;
+ return err;
last_off = map.m_lblk + map.m_len - 1;
/* update new size to the failed position */
- new_size = (last_off == pg_end) ? offset + len:
+ new_size = (last_off == pg_end) ? offset + len :
(loff_t)(last_off + 1) << PAGE_SHIFT;
} else {
new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end;
}
- if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size)
- f2fs_i_size_write(inode, new_size);
+ if (new_size > i_size_read(inode)) {
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ file_set_keep_isize(inode);
+ else
+ f2fs_i_size_write(inode, new_size);
+ }
- return ret;
+ return err;
}
static long f2fs_fallocate(struct file *file, int mode,
@@ -1374,6 +1559,9 @@ static long f2fs_fallocate(struct file *file, int mode,
struct inode *inode = file_inode(file);
long ret = 0;
+ if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+ return -EIO;
+
/* f2fs only support ->fallocate for regular file */
if (!S_ISREG(inode->i_mode))
return -EINVAL;
@@ -1406,7 +1594,7 @@ static long f2fs_fallocate(struct file *file, int mode,
if (!ret) {
inode->i_mtime = inode->i_ctime = current_time(inode);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, false);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
}
@@ -1429,34 +1617,46 @@ static int f2fs_release_file(struct inode *inode, struct file *filp)
/* some remained atomic pages should discarded */
if (f2fs_is_atomic_file(inode))
- drop_inmem_pages(inode);
+ f2fs_drop_inmem_pages(inode);
if (f2fs_is_volatile_file(inode)) {
- clear_inode_flag(inode, FI_VOLATILE_FILE);
set_inode_flag(inode, FI_DROP_CACHE);
filemap_fdatawrite(inode->i_mapping);
clear_inode_flag(inode, FI_DROP_CACHE);
+ clear_inode_flag(inode, FI_VOLATILE_FILE);
+ stat_dec_volatile_write(inode);
}
return 0;
}
-#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
-#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
-
-static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
+static int f2fs_file_flush(struct file *file, fl_owner_t id)
{
- if (S_ISDIR(mode))
- return flags;
- else if (S_ISREG(mode))
- return flags & F2FS_REG_FLMASK;
- else
- return flags & F2FS_OTHER_FLMASK;
+ struct inode *inode = file_inode(file);
+
+ /*
+ * If the process doing a transaction is crashed, we should do
+ * roll-back. Otherwise, other reader/write can see corrupted database
+ * until all the writers close its file. Since this should be done
+ * before dropping file lock, it needs to do in ->flush.
+ */
+ if (f2fs_is_atomic_file(inode) &&
+ F2FS_I(inode)->inmem_task == current)
+ f2fs_drop_inmem_pages(inode);
+ return 0;
}
static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
- unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
+ unsigned int flags = fi->i_flags;
+
+ if (f2fs_encrypted_inode(inode))
+ flags |= F2FS_ENCRYPT_FL;
+ if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode))
+ flags |= F2FS_INLINE_DATA_FL;
+
+ flags &= F2FS_FL_USER_VISIBLE;
+
return put_user(flags, (int __user *)arg);
}
@@ -1478,28 +1678,34 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
if (ret)
return ret;
- flags = f2fs_mask_flags(inode->i_mode, flags);
-
inode_lock(inode);
+ /* Is it quota file? Do not allow user to mess with it */
+ if (IS_NOQUOTA(inode)) {
+ ret = -EPERM;
+ goto unlock_out;
+ }
+
+ flags = f2fs_mask_flags(inode->i_mode, flags);
+
oldflags = fi->i_flags;
- if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+ if ((flags ^ oldflags) & (F2FS_APPEND_FL | F2FS_IMMUTABLE_FL)) {
if (!capable(CAP_LINUX_IMMUTABLE)) {
- inode_unlock(inode);
ret = -EPERM;
- goto out;
+ goto unlock_out;
}
}
- flags = flags & FS_FL_USER_MODIFIABLE;
- flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
+ flags = flags & (F2FS_FL_USER_MODIFIABLE);
+ flags |= oldflags & ~(F2FS_FL_USER_MODIFIABLE);
fi->i_flags = flags;
- inode_unlock(inode);
inode->i_ctime = current_time(inode);
f2fs_set_inode_flags(inode);
-out:
+ f2fs_mark_inode_dirty_sync(inode, false);
+unlock_out:
+ inode_unlock(inode);
mnt_drop_write_file(filp);
return ret;
}
@@ -1519,35 +1725,50 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
if (!inode_owner_or_capable(inode))
return -EACCES;
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
ret = mnt_want_write_file(filp);
if (ret)
return ret;
inode_lock(inode);
- down_write(&F2FS_I(inode)->dio_rwsem[WRITE]);
-
- if (f2fs_is_atomic_file(inode))
+ if (f2fs_is_atomic_file(inode)) {
+ if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST))
+ ret = -EINVAL;
goto out;
+ }
ret = f2fs_convert_inline_inode(inode);
if (ret)
goto out;
- set_inode_flag(inode, FI_ATOMIC_FILE);
- f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
-
- if (!get_dirty_pages(inode))
- goto out;
+ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
+ /*
+ * Should wait end_io to count F2FS_WB_CP_DATA correctly by
+ * f2fs_is_atomic_file.
+ */
+ if (get_dirty_pages(inode))
+ f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
"Unexpected flush for atomic writes: ino=%lu, npages=%u",
inode->i_ino, get_dirty_pages(inode));
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
- if (ret)
- clear_inode_flag(inode, FI_ATOMIC_FILE);
+ if (ret) {
+ up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ goto out;
+ }
+
+ set_inode_flag(inode, FI_ATOMIC_FILE);
+ clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
+ up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+ F2FS_I(inode)->inmem_task = current;
+ stat_inc_atomic_write(inode);
+ stat_update_max_atomic_write(inode);
out:
- up_write(&F2FS_I(inode)->dio_rwsem[WRITE]);
inode_unlock(inode);
mnt_drop_write_file(filp);
return ret;
@@ -1565,22 +1786,34 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
if (ret)
return ret;
+ f2fs_balance_fs(F2FS_I_SB(inode), true);
+
inode_lock(inode);
- if (f2fs_is_volatile_file(inode))
+ if (f2fs_is_volatile_file(inode)) {
+ ret = -EINVAL;
goto err_out;
+ }
if (f2fs_is_atomic_file(inode)) {
- clear_inode_flag(inode, FI_ATOMIC_FILE);
- ret = commit_inmem_pages(inode);
- if (ret) {
- set_inode_flag(inode, FI_ATOMIC_FILE);
+ ret = f2fs_commit_inmem_pages(inode);
+ if (ret)
goto err_out;
+
+ ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
+ if (!ret) {
+ clear_inode_flag(inode, FI_ATOMIC_FILE);
+ F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC] = 0;
+ stat_dec_atomic_write(inode);
}
+ } else {
+ ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false);
}
-
- ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
err_out:
+ if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) {
+ clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
+ ret = -EINVAL;
+ }
inode_unlock(inode);
mnt_drop_write_file(filp);
return ret;
@@ -1594,6 +1827,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
if (!inode_owner_or_capable(inode))
return -EACCES;
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
ret = mnt_want_write_file(filp);
if (ret)
return ret;
@@ -1607,6 +1843,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
if (ret)
goto out;
+ stat_inc_volatile_write(inode);
+ stat_update_max_volatile_write(inode);
+
set_inode_flag(inode, FI_VOLATILE_FILE);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
out:
@@ -1659,12 +1898,15 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
inode_lock(inode);
if (f2fs_is_atomic_file(inode))
- drop_inmem_pages(inode);
+ f2fs_drop_inmem_pages(inode);
if (f2fs_is_volatile_file(inode)) {
clear_inode_flag(inode, FI_VOLATILE_FILE);
+ stat_dec_volatile_write(inode);
ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
}
+ clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
+
inode_unlock(inode);
mnt_drop_write_file(filp);
@@ -1695,27 +1937,51 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
switch (in) {
case F2FS_GOING_DOWN_FULLSYNC:
sb = freeze_bdev(sb->s_bdev);
- if (sb && !IS_ERR(sb)) {
+ if (IS_ERR(sb)) {
+ ret = PTR_ERR(sb);
+ goto out;
+ }
+ if (sb) {
f2fs_stop_checkpoint(sbi, false);
+ set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
thaw_bdev(sb->s_bdev, sb);
}
break;
case F2FS_GOING_DOWN_METASYNC:
/* do checkpoint only */
- f2fs_sync_fs(sb, 1);
+ ret = f2fs_sync_fs(sb, 1);
+ if (ret)
+ goto out;
f2fs_stop_checkpoint(sbi, false);
+ set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
break;
case F2FS_GOING_DOWN_NOSYNC:
f2fs_stop_checkpoint(sbi, false);
+ set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
break;
case F2FS_GOING_DOWN_METAFLUSH:
- sync_meta_pages(sbi, META, LONG_MAX);
+ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO);
f2fs_stop_checkpoint(sbi, false);
+ set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
+ break;
+ case F2FS_GOING_DOWN_NEED_FSCK:
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ /* do checkpoint only */
+ ret = f2fs_sync_fs(sb, 1);
+ if (ret)
+ goto out;
break;
default:
ret = -EINVAL;
goto out;
}
+
+ f2fs_stop_gc_thread(sbi);
+ f2fs_stop_discard_thread(sbi);
+
+ f2fs_drop_discard_cmd(sbi);
+ clear_opt(sbi, DISCARD);
+
f2fs_update_time(sbi, REQ_TIME);
out:
if (in != F2FS_GOING_DOWN_FULLSYNC)
@@ -1734,7 +2000,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!f2fs_hw_support_discard(F2FS_SB(sb)))
return -EOPNOTSUPP;
if (copy_from_user(&range, (struct fstrim_range __user *)arg,
@@ -1771,31 +2037,21 @@ static bool uuid_is_nonzero(__u8 u[16])
static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
{
- struct fscrypt_policy policy;
struct inode *inode = file_inode(filp);
- if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg,
- sizeof(policy)))
- return -EFAULT;
+ if (!f2fs_sb_has_encrypt(F2FS_I_SB(inode)))
+ return -EOPNOTSUPP;
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
- return fscrypt_process_policy(filp, &policy);
+ return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
}
static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
{
- struct fscrypt_policy policy;
- struct inode *inode = file_inode(filp);
- int err;
-
- err = fscrypt_get_policy(inode, &policy);
- if (err)
- return err;
-
- if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy)))
- return -EFAULT;
- return 0;
+ if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp))))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
}
static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
@@ -1804,16 +2060,18 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int err;
- if (!f2fs_sb_has_crypto(inode->i_sb))
+ if (!f2fs_sb_has_encrypt(sbi))
return -EOPNOTSUPP;
- if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt))
- goto got_it;
-
err = mnt_want_write_file(filp);
if (err)
return err;
+ down_write(&sbi->sb_lock);
+
+ if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt))
+ goto got_it;
+
/* update superblock with uuid */
generate_random_uuid(sbi->raw_super->encrypt_pw_salt);
@@ -1821,15 +2079,16 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
if (err) {
/* undo new data */
memset(sbi->raw_super->encrypt_pw_salt, 0, 16);
- mnt_drop_write_file(filp);
- return err;
+ goto out_err;
}
- mnt_drop_write_file(filp);
got_it:
if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt,
16))
- return -EFAULT;
- return 0;
+ err = -EFAULT;
+out_err:
+ up_write(&sbi->sb_lock);
+ mnt_drop_write_file(filp);
+ return err;
}
static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
@@ -1861,7 +2120,53 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
mutex_lock(&sbi->gc_mutex);
}
- ret = f2fs_gc(sbi, sync);
+ ret = f2fs_gc(sbi, sync, true, NULL_SEGNO);
+out:
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
+static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_gc_range range;
+ u64 end;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ if (f2fs_readonly(sbi->sb))
+ return -EROFS;
+
+ end = range.start + range.len;
+ if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) {
+ return -EINVAL;
+ }
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+do_more:
+ if (!range.sync) {
+ if (!mutex_trylock(&sbi->gc_mutex)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ } else {
+ mutex_lock(&sbi->gc_mutex);
+ }
+
+ ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start));
+ range.start += BLKS_PER_SEC(sbi);
+ if (range.start <= end)
+ goto do_more;
out:
mnt_drop_write_file(filp);
return ret;
@@ -1879,6 +2184,12 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
if (f2fs_readonly(sbi->sb))
return -EROFS;
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "Skipping Checkpoint. Checkpoints currently disabled.");
+ return -EINVAL;
+ }
+
ret = mnt_want_write_file(filp);
if (ret)
return ret;
@@ -1894,18 +2205,19 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
struct f2fs_defragment *range)
{
struct inode *inode = file_inode(filp);
- struct f2fs_map_blocks map = { .m_next_pgofs = NULL };
- struct extent_info ei;
- pgoff_t pg_start, pg_end;
+ struct f2fs_map_blocks map = { .m_next_extent = NULL,
+ .m_seg_type = NO_CHECK_TYPE ,
+ .m_may_create = false };
+ struct extent_info ei = {0, 0, 0};
+ pgoff_t pg_start, pg_end, next_pgofs;
unsigned int blk_per_seg = sbi->blocks_per_seg;
unsigned int total = 0, sec_num;
- unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg;
block_t blk_end = 0;
bool fragmented = false;
int err;
/* if in-place-update policy is enabled, don't waste time here */
- if (need_inplace_update(inode))
+ if (f2fs_should_update_inplace(inode, NULL))
return -EINVAL;
pg_start = range->start >> PAGE_SHIFT;
@@ -1931,6 +2243,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
}
map.m_lblk = pg_start;
+ map.m_next_pgofs = &next_pgofs;
/*
* lookup mapping info in dnode page cache, skip defragmenting if all
@@ -1939,19 +2252,21 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
*/
while (map.m_lblk < pg_end) {
map.m_len = pg_end - map.m_lblk;
- err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+ err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT);
if (err)
goto out;
if (!(map.m_flags & F2FS_MAP_FLAGS)) {
- map.m_lblk++;
+ map.m_lblk = next_pgofs;
continue;
}
- if (blk_end && blk_end != map.m_pblk) {
+ if (blk_end && blk_end != map.m_pblk)
fragmented = true;
- break;
- }
+
+ /* record total count of block that we're going to move */
+ total += map.m_len;
+
blk_end = map.m_pblk + map.m_len;
map.m_lblk += map.m_len;
@@ -1960,10 +2275,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
if (!fragmented)
goto out;
- map.m_lblk = pg_start;
- map.m_len = pg_end - pg_start;
-
- sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec;
+ sec_num = (total + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi);
/*
* make sure there are enough free section for LFS allocation, this can
@@ -1975,18 +2287,22 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
goto out;
}
+ map.m_lblk = pg_start;
+ map.m_len = pg_end - pg_start;
+ total = 0;
+
while (map.m_lblk < pg_end) {
pgoff_t idx;
int cnt = 0;
do_map:
map.m_len = pg_end - map.m_lblk;
- err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+ err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT);
if (err)
goto clear_out;
if (!(map.m_flags & F2FS_MAP_FLAGS)) {
- map.m_lblk++;
+ map.m_lblk = next_pgofs;
continue;
}
@@ -1996,7 +2312,7 @@ do_map:
while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
struct page *page;
- page = get_lock_data_page(inode, idx, true);
+ page = f2fs_get_lock_data_page(inode, idx, true);
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto clear_out;
@@ -2040,42 +2356,40 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!S_ISREG(inode->i_mode))
+ if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode))
return -EINVAL;
- err = mnt_want_write_file(filp);
- if (err)
- return err;
-
- if (f2fs_readonly(sbi->sb)) {
- err = -EROFS;
- goto out;
- }
+ if (f2fs_readonly(sbi->sb))
+ return -EROFS;
if (copy_from_user(&range, (struct f2fs_defragment __user *)arg,
- sizeof(range))) {
- err = -EFAULT;
- goto out;
- }
+ sizeof(range)))
+ return -EFAULT;
/* verify alignment of offset & size */
- if (range.start & (F2FS_BLKSIZE - 1) ||
- range.len & (F2FS_BLKSIZE - 1)) {
- err = -EINVAL;
- goto out;
- }
+ if (range.start & (F2FS_BLKSIZE - 1) || range.len & (F2FS_BLKSIZE - 1))
+ return -EINVAL;
+
+ if (unlikely((range.start + range.len) >> PAGE_SHIFT >
+ sbi->max_file_blocks))
+ return -EINVAL;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
err = f2fs_defragment_range(sbi, filp, &range);
+ mnt_drop_write_file(filp);
+
f2fs_update_time(sbi, REQ_TIME);
if (err < 0)
- goto out;
+ return err;
if (copy_to_user((struct f2fs_defragment __user *)arg, &range,
sizeof(range)))
- err = -EFAULT;
-out:
- mnt_drop_write_file(filp);
- return err;
+ return -EFAULT;
+
+ return 0;
}
static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
@@ -2110,10 +2424,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
inode_lock(src);
if (src != dst) {
- if (!inode_trylock(dst)) {
- ret = -EBUSY;
+ ret = -EBUSY;
+ if (!inode_trylock(dst))
goto out;
- }
}
ret = -EINVAL;
@@ -2158,6 +2471,14 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
goto out_unlock;
f2fs_balance_fs(sbi, true);
+
+ down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
+ if (src != dst) {
+ ret = -EBUSY;
+ if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE]))
+ goto out_src;
+ }
+
f2fs_lock_op(sbi);
ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS,
pos_out >> F2FS_BLKSIZE_BITS,
@@ -2170,6 +2491,11 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
f2fs_i_size_write(dst, dst_osize);
}
f2fs_unlock_op(sbi);
+
+ if (src != dst)
+ up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]);
+out_src:
+ up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
out_unlock:
if (src != dst)
inode_unlock(dst);
@@ -2209,6 +2535,8 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg)
range.pos_out, range.len);
mnt_drop_write_file(filp);
+ if (err)
+ goto err_out;
if (copy_to_user((struct f2fs_move_range __user *)arg,
&range, sizeof(range)))
@@ -2218,8 +2546,209 @@ err_out:
return err;
}
+static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct sit_info *sm = SIT_I(sbi);
+ unsigned int start_segno = 0, end_segno = 0;
+ unsigned int dev_start_segno = 0, dev_end_segno = 0;
+ struct f2fs_flush_device range;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (f2fs_readonly(sbi->sb))
+ return -EROFS;
+
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+ return -EINVAL;
+
+ if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num ||
+ __is_large_section(sbi)) {
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "Can't flush %u in %d for segs_per_sec %u != 1\n",
+ range.dev_num, sbi->s_ndevs,
+ sbi->segs_per_sec);
+ return -EINVAL;
+ }
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ if (range.dev_num != 0)
+ dev_start_segno = GET_SEGNO(sbi, FDEV(range.dev_num).start_blk);
+ dev_end_segno = GET_SEGNO(sbi, FDEV(range.dev_num).end_blk);
+
+ start_segno = sm->last_victim[FLUSH_DEVICE];
+ if (start_segno < dev_start_segno || start_segno >= dev_end_segno)
+ start_segno = dev_start_segno;
+ end_segno = min(start_segno + range.segments, dev_end_segno);
+
+ while (start_segno < end_segno) {
+ if (!mutex_trylock(&sbi->gc_mutex)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ sm->last_victim[GC_CB] = end_segno + 1;
+ sm->last_victim[GC_GREEDY] = end_segno + 1;
+ sm->last_victim[ALLOC_NEXT] = end_segno + 1;
+ ret = f2fs_gc(sbi, true, true, start_segno);
+ if (ret == -EAGAIN)
+ ret = 0;
+ else if (ret < 0)
+ break;
+ start_segno++;
+ }
+out:
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
+static int f2fs_ioc_get_features(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ u32 sb_feature = le32_to_cpu(F2FS_I_SB(inode)->raw_super->feature);
+
+ /* Must validate to set it with SQLite behavior in Android. */
+ sb_feature |= F2FS_FEATURE_ATOMIC_WRITE;
+
+ return put_user(sb_feature, (u32 __user *)arg);
+}
+
+int f2fs_pin_file_control(struct inode *inode, bool inc)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ /* Use i_gc_failures for normal file as a risk signal. */
+ if (inc)
+ f2fs_i_gc_failures_write(inode,
+ fi->i_gc_failures[GC_FAILURE_PIN] + 1);
+
+ if (fi->i_gc_failures[GC_FAILURE_PIN] > sbi->gc_pin_file_threshold) {
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: Enable GC = ino %lx after %x GC trials\n",
+ __func__, inode->i_ino,
+ fi->i_gc_failures[GC_FAILURE_PIN]);
+ clear_inode_flag(inode, FI_PIN_FILE);
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ __u32 pin;
+ int ret = 0;
+
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ if (get_user(pin, (__u32 __user *)arg))
+ return -EFAULT;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ if (f2fs_readonly(F2FS_I_SB(inode)->sb))
+ return -EROFS;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ inode_lock(inode);
+
+ if (f2fs_should_update_outplace(inode, NULL)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!pin) {
+ clear_inode_flag(inode, FI_PIN_FILE);
+ f2fs_i_gc_failures_write(inode, 0);
+ goto done;
+ }
+
+ if (f2fs_pin_file_control(inode, false)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ goto out;
+
+ set_inode_flag(inode, FI_PIN_FILE);
+ ret = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN];
+done:
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+out:
+ inode_unlock(inode);
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
+static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ __u32 pin = 0;
+
+ if (is_inode_flag_set(inode, FI_PIN_FILE))
+ pin = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN];
+ return put_user(pin, (u32 __user *)arg);
+}
+
+int f2fs_precache_extents(struct inode *inode)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_map_blocks map;
+ pgoff_t m_next_extent;
+ loff_t end;
+ int err;
+
+ if (is_inode_flag_set(inode, FI_NO_EXTENT))
+ return -EOPNOTSUPP;
+
+ map.m_lblk = 0;
+ map.m_next_pgofs = NULL;
+ map.m_next_extent = &m_next_extent;
+ map.m_seg_type = NO_CHECK_TYPE;
+ map.m_may_create = false;
+ end = F2FS_I_SB(inode)->max_file_blocks;
+
+ while (map.m_lblk < end) {
+ map.m_len = end - map.m_lblk;
+
+ down_write(&fi->i_gc_rwsem[WRITE]);
+ err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE);
+ up_write(&fi->i_gc_rwsem[WRITE]);
+ if (err)
+ return err;
+
+ map.m_lblk = m_next_extent;
+ }
+
+ return err;
+}
+
+static int f2fs_ioc_precache_extents(struct file *filp, unsigned long arg)
+{
+ return f2fs_precache_extents(file_inode(filp));
+}
+
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
+ if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp)))))
+ return -EIO;
+
switch (cmd) {
case F2FS_IOC_GETFLAGS:
return f2fs_ioc_getflags(filp, arg);
@@ -2249,12 +2778,24 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_get_encryption_pwsalt(filp, arg);
case F2FS_IOC_GARBAGE_COLLECT:
return f2fs_ioc_gc(filp, arg);
+ case F2FS_IOC_GARBAGE_COLLECT_RANGE:
+ return f2fs_ioc_gc_range(filp, arg);
case F2FS_IOC_WRITE_CHECKPOINT:
return f2fs_ioc_write_checkpoint(filp, arg);
case F2FS_IOC_DEFRAGMENT:
return f2fs_ioc_defragment(filp, arg);
case F2FS_IOC_MOVE_RANGE:
return f2fs_ioc_move_range(filp, arg);
+ case F2FS_IOC_FLUSH_DEVICE:
+ return f2fs_ioc_flush_device(filp, arg);
+ case F2FS_IOC_GET_FEATURES:
+ return f2fs_ioc_get_features(filp, arg);
+ case F2FS_IOC_GET_PIN_FILE:
+ return f2fs_ioc_get_pin_file(filp, arg);
+ case F2FS_IOC_SET_PIN_FILE:
+ return f2fs_ioc_set_pin_file(filp, arg);
+ case F2FS_IOC_PRECACHE_EXTENTS:
+ return f2fs_ioc_precache_extents(filp, arg);
default:
return -ENOTTY;
}
@@ -2264,23 +2805,62 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- struct blk_plug plug;
ssize_t ret;
- if (f2fs_encrypted_inode(inode) &&
- !fscrypt_has_encryption_key(inode) &&
- fscrypt_get_encryption_info(inode))
- return -EACCES;
+ if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+ return -EIO;
+
+ if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ return -EINVAL;
+
+ if (!inode_trylock(inode)) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ inode_lock(inode);
+ }
- inode_lock(inode);
ret = generic_write_checks(iocb, from);
if (ret > 0) {
- ret = f2fs_preallocate_blocks(iocb, from);
- if (!ret) {
- blk_start_plug(&plug);
- ret = __generic_file_write_iter(iocb, from);
- blk_finish_plug(&plug);
+ bool preallocated = false;
+ size_t target_size = 0;
+ int err;
+
+ if (iov_iter_fault_in_readable(from, iov_iter_count(from)))
+ set_inode_flag(inode, FI_NO_PREALLOC);
+
+ if ((iocb->ki_flags & IOCB_NOWAIT) &&
+ (iocb->ki_flags & IOCB_DIRECT)) {
+ if (!f2fs_overwrite_io(inode, iocb->ki_pos,
+ iov_iter_count(from)) ||
+ f2fs_has_inline_data(inode) ||
+ f2fs_force_buffered_io(inode,
+ iocb, from)) {
+ clear_inode_flag(inode,
+ FI_NO_PREALLOC);
+ inode_unlock(inode);
+ return -EAGAIN;
+ }
+
+ } else {
+ preallocated = true;
+ target_size = iocb->ki_pos + iov_iter_count(from);
+
+ err = f2fs_preallocate_blocks(iocb, from);
+ if (err) {
+ clear_inode_flag(inode, FI_NO_PREALLOC);
+ inode_unlock(inode);
+ return err;
+ }
}
+ ret = __generic_file_write_iter(iocb, from);
+ clear_inode_flag(inode, FI_NO_PREALLOC);
+
+ /* if we couldn't write data, we should deallocate blocks. */
+ if (preallocated && i_size_read(inode) < target_size)
+ f2fs_truncate(inode);
+
+ if (ret > 0)
+ f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret);
}
inode_unlock(inode);
@@ -2312,10 +2892,15 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_GET_ENCRYPTION_PWSALT:
case F2FS_IOC_GET_ENCRYPTION_POLICY:
case F2FS_IOC_GARBAGE_COLLECT:
+ case F2FS_IOC_GARBAGE_COLLECT_RANGE:
case F2FS_IOC_WRITE_CHECKPOINT:
case F2FS_IOC_DEFRAGMENT:
- break;
case F2FS_IOC_MOVE_RANGE:
+ case F2FS_IOC_FLUSH_DEVICE:
+ case F2FS_IOC_GET_FEATURES:
+ case F2FS_IOC_GET_PIN_FILE:
+ case F2FS_IOC_SET_PIN_FILE:
+ case F2FS_IOC_PRECACHE_EXTENTS:
break;
default:
return -ENOIOCTLCMD;
@@ -2331,6 +2916,7 @@ const struct file_operations f2fs_file_operations = {
.open = f2fs_file_open,
.release = f2fs_release_file,
.mmap = f2fs_file_mmap,
+ .flush = f2fs_file_flush,
.fsync = f2fs_sync_file,
.fallocate = f2fs_fallocate,
.unlocked_ioctl = f2fs_ioctl,
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 759056e776e5..d421720cfb17 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/gc.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/module.h>
@@ -28,29 +25,43 @@ static int gc_thread_func(void *data)
struct f2fs_sb_info *sbi = data;
struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
- long wait_ms;
+ unsigned int wait_ms;
wait_ms = gc_th->min_sleep_time;
+ set_freezable();
do {
- if (try_to_freeze())
+ wait_event_interruptible_timeout(*wq,
+ kthread_should_stop() || freezing(current) ||
+ gc_th->gc_wake,
+ msecs_to_jiffies(wait_ms));
+
+ /* give it a try one time */
+ if (gc_th->gc_wake)
+ gc_th->gc_wake = 0;
+
+ if (try_to_freeze()) {
+ stat_other_skip_bggc_count(sbi);
continue;
- else
- wait_event_interruptible_timeout(*wq,
- kthread_should_stop(),
- msecs_to_jiffies(wait_ms));
+ }
if (kthread_should_stop())
break;
if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
increase_sleep_time(gc_th, &wait_ms);
+ stat_other_skip_bggc_count(sbi);
continue;
}
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (time_to_inject(sbi, FAULT_CHECKPOINT))
+ if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
+ f2fs_show_injection_info(FAULT_CHECKPOINT);
f2fs_stop_checkpoint(sbi, false);
-#endif
+ }
+
+ if (!sb_start_write_trylock(sbi->sb)) {
+ stat_other_skip_bggc_count(sbi);
+ continue;
+ }
/*
* [GC triggering condition]
@@ -65,24 +76,33 @@ static int gc_thread_func(void *data)
* invalidated soon after by user update or deletion.
* So, I'd like to wait some time to collect dirty segments.
*/
- if (!mutex_trylock(&sbi->gc_mutex))
- continue;
+ if (sbi->gc_mode == GC_URGENT) {
+ wait_ms = gc_th->urgent_sleep_time;
+ mutex_lock(&sbi->gc_mutex);
+ goto do_gc;
+ }
- if (!is_idle(sbi)) {
+ if (!mutex_trylock(&sbi->gc_mutex)) {
+ stat_other_skip_bggc_count(sbi);
+ goto next;
+ }
+
+ if (!is_idle(sbi, GC_TIME)) {
increase_sleep_time(gc_th, &wait_ms);
mutex_unlock(&sbi->gc_mutex);
- continue;
+ stat_io_skip_bggc_count(sbi);
+ goto next;
}
if (has_enough_invalid_blocks(sbi))
decrease_sleep_time(gc_th, &wait_ms);
else
increase_sleep_time(gc_th, &wait_ms);
-
+do_gc:
stat_inc_bggc_count(sbi);
/* if return value is not zero, no victim was selected */
- if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC)))
+ if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO))
wait_ms = gc_th->no_gc_sleep_time;
trace_f2fs_background_gc(sbi->sb, wait_ms,
@@ -90,12 +110,14 @@ static int gc_thread_func(void *data)
/* balancing f2fs's metadata periodically */
f2fs_balance_fs_bg(sbi);
+next:
+ sb_end_write(sbi->sb);
} while (!kthread_should_stop());
return 0;
}
-int start_gc_thread(struct f2fs_sb_info *sbi)
+int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
{
struct f2fs_gc_kthread *gc_th;
dev_t dev = sbi->sb->s_bdev->bd_dev;
@@ -107,11 +129,12 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
goto out;
}
+ gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
- gc_th->gc_idle = 0;
+ gc_th->gc_wake= 0;
sbi->gc_thread = gc_th;
init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
@@ -119,32 +142,35 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(gc_th->f2fs_gc_task)) {
err = PTR_ERR(gc_th->f2fs_gc_task);
- kfree(gc_th);
+ kvfree(gc_th);
sbi->gc_thread = NULL;
}
out:
return err;
}
-void stop_gc_thread(struct f2fs_sb_info *sbi)
+void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi)
{
struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
if (!gc_th)
return;
kthread_stop(gc_th->f2fs_gc_task);
- kfree(gc_th);
+ kvfree(gc_th);
sbi->gc_thread = NULL;
}
-static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type)
+static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type)
{
int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY;
- if (gc_th && gc_th->gc_idle) {
- if (gc_th->gc_idle == 1)
- gc_mode = GC_CB;
- else if (gc_th->gc_idle == 2)
- gc_mode = GC_GREEDY;
+ switch (sbi->gc_mode) {
+ case GC_IDLE_CB:
+ gc_mode = GC_CB;
+ break;
+ case GC_IDLE_GREEDY:
+ case GC_URGENT:
+ gc_mode = GC_GREEDY;
+ break;
}
return gc_mode;
}
@@ -160,17 +186,24 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
p->max_search = dirty_i->nr_dirty[type];
p->ofs_unit = 1;
} else {
- p->gc_mode = select_gc_type(sbi->gc_thread, gc_type);
+ p->gc_mode = select_gc_type(sbi, gc_type);
p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
p->max_search = dirty_i->nr_dirty[DIRTY];
p->ofs_unit = sbi->segs_per_sec;
}
/* we need to check every dirty segments in the FG_GC case */
- if (gc_type != FG_GC && p->max_search > sbi->max_victim_search)
+ if (gc_type != FG_GC &&
+ (sbi->gc_mode != GC_URGENT) &&
+ p->max_search > sbi->max_victim_search)
p->max_search = sbi->max_victim_search;
- p->offset = sbi->last_victim[p->gc_mode];
+ /* let's select beginning hot/small space first in no_heap mode*/
+ if (test_opt(sbi, NOHEAP) &&
+ (type == CURSEG_HOT_DATA || IS_NODESEG(type)))
+ p->offset = 0;
+ else
+ p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
}
static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
@@ -180,7 +213,7 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
if (p->alloc_mode == SSR)
return sbi->blocks_per_seg;
if (p->gc_mode == GC_GREEDY)
- return sbi->blocks_per_seg * p->ofs_unit;
+ return 2 * sbi->blocks_per_seg * p->ofs_unit;
else if (p->gc_mode == GC_CB)
return UINT_MAX;
else /* No other gc_mode */
@@ -200,12 +233,8 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
if (sec_usage_check(sbi, secno))
continue;
-
- if (no_fggc_candidate(sbi, secno))
- continue;
-
clear_bit(secno, dirty_i->victim_secmap);
- return secno * sbi->segs_per_sec;
+ return GET_SEG_FROM_SEC(sbi, secno);
}
return NULL_SEGNO;
}
@@ -213,8 +242,8 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct sit_info *sit_i = SIT_I(sbi);
- unsigned int secno = GET_SECNO(sbi, segno);
- unsigned int start = secno * sbi->segs_per_sec;
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+ unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
unsigned long long mtime = 0;
unsigned int vblocks;
unsigned char age = 0;
@@ -223,7 +252,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
for (i = 0; i < sbi->segs_per_sec; i++)
mtime += get_seg_entry(sbi, start + i)->mtime;
- vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+ vblocks = get_valid_blocks(sbi, segno, true);
mtime = div_u64(mtime, sbi->segs_per_sec);
vblocks = div_u64(vblocks, sbi->segs_per_sec);
@@ -250,7 +279,7 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
/* alloc_mode == LFS */
if (p->gc_mode == GC_GREEDY)
- return get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+ return get_valid_blocks(sbi, segno, true);
else
return get_cb_cost(sbi, segno);
}
@@ -279,6 +308,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
unsigned int *result, int gc_type, int type, char alloc_mode)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ struct sit_info *sm = SIT_I(sbi);
struct victim_sel_policy p;
unsigned int secno, last_victim;
unsigned int last_segment = MAIN_SEGS(sbi);
@@ -292,10 +322,33 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
p.min_segno = NULL_SEGNO;
p.min_cost = get_max_cost(sbi, &p);
+ if (*result != NULL_SEGNO) {
+ if (get_valid_blocks(sbi, *result, false) &&
+ !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))
+ p.min_segno = *result;
+ goto out;
+ }
+
if (p.max_search == 0)
goto out;
- last_victim = sbi->last_victim[p.gc_mode];
+ if (__is_large_section(sbi) && p.alloc_mode == LFS) {
+ if (sbi->next_victim_seg[BG_GC] != NULL_SEGNO) {
+ p.min_segno = sbi->next_victim_seg[BG_GC];
+ *result = p.min_segno;
+ sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
+ goto got_result;
+ }
+ if (gc_type == FG_GC &&
+ sbi->next_victim_seg[FG_GC] != NULL_SEGNO) {
+ p.min_segno = sbi->next_victim_seg[FG_GC];
+ *result = p.min_segno;
+ sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
+ goto got_result;
+ }
+ }
+
+ last_victim = sm->last_victim[p.gc_mode];
if (p.alloc_mode == LFS && gc_type == FG_GC) {
p.min_segno = check_bg_victims(sbi);
if (p.min_segno != NULL_SEGNO)
@@ -308,9 +361,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
segno = find_next_bit(p.dirty_segmap, last_segment, p.offset);
if (segno >= last_segment) {
- if (sbi->last_victim[p.gc_mode]) {
- last_segment = sbi->last_victim[p.gc_mode];
- sbi->last_victim[p.gc_mode] = 0;
+ if (sm->last_victim[p.gc_mode]) {
+ last_segment =
+ sm->last_victim[p.gc_mode];
+ sm->last_victim[p.gc_mode] = 0;
p.offset = 0;
continue;
}
@@ -327,14 +381,15 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
nsearched++;
}
- secno = GET_SECNO(sbi, segno);
+ secno = GET_SEC_FROM_SEG(sbi, segno);
if (sec_usage_check(sbi, secno))
goto next;
- if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
+ /* Don't touch checkpointed data */
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
+ get_ckpt_valid_blocks(sbi, segno)))
goto next;
- if (gc_type == FG_GC && p.alloc_mode == LFS &&
- no_fggc_candidate(sbi, secno))
+ if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
goto next;
cost = get_gc_cost(sbi, segno, &p);
@@ -345,29 +400,32 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
}
next:
if (nsearched >= p.max_search) {
- if (!sbi->last_victim[p.gc_mode] && segno <= last_victim)
- sbi->last_victim[p.gc_mode] = last_victim + 1;
+ if (!sm->last_victim[p.gc_mode] && segno <= last_victim)
+ sm->last_victim[p.gc_mode] = last_victim + 1;
else
- sbi->last_victim[p.gc_mode] = segno + 1;
+ sm->last_victim[p.gc_mode] = segno + 1;
+ sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi);
break;
}
}
if (p.min_segno != NULL_SEGNO) {
got_it:
+ *result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
+got_result:
if (p.alloc_mode == LFS) {
- secno = GET_SECNO(sbi, p.min_segno);
+ secno = GET_SEC_FROM_SEG(sbi, p.min_segno);
if (gc_type == FG_GC)
sbi->cur_victim_sec = secno;
else
set_bit(secno, dirty_i->victim_secmap);
}
- *result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
+ }
+out:
+ if (p.min_segno != NULL_SEGNO)
trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
sbi->cur_victim_sec,
prefree_segments(sbi), free_segments(sbi));
- }
-out:
mutex_unlock(&dirty_i->seglist_lock);
return (p.min_segno == NULL_SEGNO) ? 0 : 1;
@@ -395,7 +453,7 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
iput(inode);
return;
}
- new_ie = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+ new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, GFP_NOFS);
new_ie->inode = inode;
f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
@@ -409,7 +467,7 @@ static void put_gc_inode(struct gc_inode_list *gc_list)
radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
iput(ie->inode);
list_del(&ie->list);
- kmem_cache_free(inode_entry_slab, ie);
+ kmem_cache_free(f2fs_inode_entry_slab, ie);
}
}
@@ -420,10 +478,10 @@ static int check_valid_map(struct f2fs_sb_info *sbi,
struct seg_entry *sentry;
int ret;
- mutex_lock(&sit_i->sentry_lock);
+ down_read(&sit_i->sentry_lock);
sentry = get_seg_entry(sbi, segno);
ret = f2fs_test_bit(offset, sentry->cur_valid_map);
- mutex_unlock(&sit_i->sentry_lock);
+ up_read(&sit_i->sentry_lock);
return ret;
}
@@ -432,65 +490,81 @@ static int check_valid_map(struct f2fs_sb_info *sbi,
* On validity, copy that node with cold status, otherwise (invalid node)
* ignore that.
*/
-static void gc_node_segment(struct f2fs_sb_info *sbi,
+static int gc_node_segment(struct f2fs_sb_info *sbi,
struct f2fs_summary *sum, unsigned int segno, int gc_type)
{
struct f2fs_summary *entry;
block_t start_addr;
int off;
int phase = 0;
+ bool fggc = (gc_type == FG_GC);
+ int submitted = 0;
start_addr = START_BLOCK(sbi, segno);
next_step:
entry = sum;
+ if (fggc && phase == 2)
+ atomic_inc(&sbi->wb_sync_req[NODE]);
+
for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
nid_t nid = le32_to_cpu(entry->nid);
struct page *node_page;
struct node_info ni;
+ int err;
/* stop BG_GC if there is not enough free sections. */
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
- return;
+ return submitted;
if (check_valid_map(sbi, segno, off) == 0)
continue;
if (phase == 0) {
- ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
+ f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
META_NAT, true);
continue;
}
if (phase == 1) {
- ra_node_page(sbi, nid);
+ f2fs_ra_node_page(sbi, nid);
continue;
}
/* phase == 2 */
- node_page = get_node_page(sbi, nid);
+ node_page = f2fs_get_node_page(sbi, nid);
if (IS_ERR(node_page))
continue;
- /* block may become invalid during get_node_page */
+ /* block may become invalid during f2fs_get_node_page */
if (check_valid_map(sbi, segno, off) == 0) {
f2fs_put_page(node_page, 1);
continue;
}
- get_node_info(sbi, nid, &ni);
+ if (f2fs_get_node_info(sbi, nid, &ni)) {
+ f2fs_put_page(node_page, 1);
+ continue;
+ }
+
if (ni.blk_addr != start_addr + off) {
f2fs_put_page(node_page, 1);
continue;
}
- move_node_page(node_page, gc_type);
+ err = f2fs_move_node_page(node_page, gc_type);
+ if (!err && gc_type == FG_GC)
+ submitted++;
stat_inc_node_blk_count(sbi, 1, gc_type);
}
if (++phase < 3)
goto next_step;
+
+ if (fggc)
+ atomic_dec(&sbi->wb_sync_req[NODE]);
+ return submitted;
}
/*
@@ -500,7 +574,7 @@ next_step:
* as indirect or double indirect node blocks, are given, it must be a caller's
* bug.
*/
-block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
+block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
{
unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
unsigned int bidx;
@@ -531,11 +605,14 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
nid = le32_to_cpu(sum->nid);
ofs_in_node = le16_to_cpu(sum->ofs_in_node);
- node_page = get_node_page(sbi, nid);
+ node_page = f2fs_get_node_page(sbi, nid);
if (IS_ERR(node_page))
return false;
- get_node_info(sbi, nid, dni);
+ if (f2fs_get_node_info(sbi, nid, dni)) {
+ f2fs_put_page(node_page, 1);
+ return false;
+ }
if (sum->version != dni->version) {
f2fs_msg(sbi->sb, KERN_WARNING,
@@ -545,7 +622,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
}
*nofs = ofs_of_node(node_page);
- source_blkaddr = datablock_addr(node_page, ofs_in_node);
+ source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node);
f2fs_put_page(node_page, 1);
if (source_blkaddr != blkaddr)
@@ -553,34 +630,137 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
return true;
}
-static void move_encrypted_block(struct inode *inode, block_t bidx)
+static int ra_data_block(struct inode *inode, pgoff_t index)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct address_space *mapping = inode->i_mapping;
+ struct dnode_of_data dn;
+ struct page *page;
+ struct extent_info ei = {0, 0, 0};
+ struct f2fs_io_info fio = {
+ .sbi = sbi,
+ .ino = inode->i_ino,
+ .type = DATA,
+ .temp = COLD,
+ .op = REQ_OP_READ,
+ .op_flags = 0,
+ .encrypted_page = NULL,
+ .in_list = false,
+ .retry = false,
+ };
+ int err;
+
+ page = f2fs_grab_cache_page(mapping, index, true);
+ if (!page)
+ return -ENOMEM;
+
+ if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ dn.data_blkaddr = ei.blk + index - ei.fofs;
+ goto got_it;
+ }
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
+ if (err)
+ goto put_page;
+ f2fs_put_dnode(&dn);
+
+ if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
+ DATA_GENERIC))) {
+ err = -EFAULT;
+ goto put_page;
+ }
+got_it:
+ /* read page */
+ fio.page = page;
+ fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
+
+ /*
+ * don't cache encrypted data into meta inode until previous dirty
+ * data were writebacked to avoid racing between GC and flush.
+ */
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
+
+ f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);
+
+ fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi),
+ dn.data_blkaddr,
+ FGP_LOCK | FGP_CREAT, GFP_NOFS);
+ if (!fio.encrypted_page) {
+ err = -ENOMEM;
+ goto put_page;
+ }
+
+ err = f2fs_submit_page_bio(&fio);
+ if (err)
+ goto put_encrypted_page;
+ f2fs_put_page(fio.encrypted_page, 0);
+ f2fs_put_page(page, 1);
+ return 0;
+put_encrypted_page:
+ f2fs_put_page(fio.encrypted_page, 1);
+put_page:
+ f2fs_put_page(page, 1);
+ return err;
+}
+
+/*
+ * Move data block via META_MAPPING while keeping locked data page.
+ * This can be used to move blocks, aka LBAs, directly on disk.
+ */
+static int move_data_block(struct inode *inode, block_t bidx,
+ int gc_type, unsigned int segno, int off)
{
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(inode),
+ .ino = inode->i_ino,
.type = DATA,
+ .temp = COLD,
.op = REQ_OP_READ,
- .op_flags = READ_SYNC,
+ .op_flags = 0,
.encrypted_page = NULL,
+ .in_list = false,
+ .retry = false,
};
struct dnode_of_data dn;
struct f2fs_summary sum;
struct node_info ni;
- struct page *page;
+ struct page *page, *mpage;
block_t newaddr;
- int err;
+ int err = 0;
+ bool lfs_mode = test_opt(fio.sbi, LFS);
/* do not read out */
page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
if (!page)
- return;
+ return -ENOMEM;
+
+ if (!check_valid_map(F2FS_I_SB(inode), segno, off)) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (f2fs_is_atomic_file(inode)) {
+ F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
+ F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
+ err = -EAGAIN;
+ goto out;
+ }
+
+ if (f2fs_is_pinned_file(inode)) {
+ f2fs_pin_file_control(inode, true);
+ err = -EAGAIN;
+ goto out;
+ }
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
+ err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
if (err)
goto out;
if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
ClearPageUptodate(page);
+ err = -ENOENT;
goto put_out;
}
@@ -588,25 +768,50 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
* don't cache encrypted data into meta inode until previous dirty
* data were writebacked to avoid racing between GC and flush.
*/
- f2fs_wait_on_page_writeback(page, DATA, true);
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
+
+ f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);
+
+ err = f2fs_get_node_info(fio.sbi, dn.nid, &ni);
+ if (err)
+ goto put_out;
- get_node_info(fio.sbi, dn.nid, &ni);
set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
/* read page */
fio.page = page;
fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
- allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
- &sum, CURSEG_COLD_DATA);
+ if (lfs_mode)
+ down_write(&fio.sbi->io_order_lock);
- fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr,
- FGP_LOCK | FGP_CREAT, GFP_NOFS);
+ f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
+ &sum, CURSEG_COLD_DATA, NULL, false);
+
+ fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
+ newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
if (!fio.encrypted_page) {
err = -ENOMEM;
goto recover_block;
}
+ mpage = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
+ fio.old_blkaddr, FGP_LOCK, GFP_NOFS);
+ if (mpage) {
+ bool updated = false;
+
+ if (PageUptodate(mpage)) {
+ memcpy(page_address(fio.encrypted_page),
+ page_address(mpage), PAGE_SIZE);
+ updated = true;
+ }
+ f2fs_put_page(mpage, 1);
+ invalidate_mapping_pages(META_MAPPING(fio.sbi),
+ fio.old_blkaddr, fio.old_blkaddr);
+ if (updated)
+ goto write_page;
+ }
+
err = f2fs_submit_page_bio(&fio);
if (err)
goto put_page_out;
@@ -623,20 +828,30 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
goto put_page_out;
}
+write_page:
+ f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true, true);
set_page_dirty(fio.encrypted_page);
- f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true);
if (clear_page_dirty_for_io(fio.encrypted_page))
dec_page_count(fio.sbi, F2FS_DIRTY_META);
set_page_writeback(fio.encrypted_page);
+ ClearPageError(page);
/* allocate block address */
- f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
+ f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true);
fio.op = REQ_OP_WRITE;
- fio.op_flags = WRITE_SYNC;
+ fio.op_flags = REQ_SYNC;
fio.new_blkaddr = newaddr;
- f2fs_submit_page_mbio(&fio);
+ f2fs_submit_page_write(&fio);
+ if (fio.retry) {
+ err = -EAGAIN;
+ if (PageWriteback(fio.encrypted_page))
+ end_page_writeback(fio.encrypted_page);
+ goto put_page_out;
+ }
+
+ f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE);
f2fs_update_data_blkaddr(&dn, newaddr);
set_inode_flag(inode, FI_APPEND_WRITE);
@@ -645,49 +860,81 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
put_page_out:
f2fs_put_page(fio.encrypted_page, 1);
recover_block:
+ if (lfs_mode)
+ up_write(&fio.sbi->io_order_lock);
if (err)
- __f2fs_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
+ f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
true, true);
put_out:
f2fs_put_dnode(&dn);
out:
f2fs_put_page(page, 1);
+ return err;
}
-static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
+static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
+ unsigned int segno, int off)
{
struct page *page;
+ int err = 0;
- page = get_lock_data_page(inode, bidx, true);
+ page = f2fs_get_lock_data_page(inode, bidx, true);
if (IS_ERR(page))
- return;
+ return PTR_ERR(page);
+
+ if (!check_valid_map(F2FS_I_SB(inode), segno, off)) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (f2fs_is_atomic_file(inode)) {
+ F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
+ F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
+ err = -EAGAIN;
+ goto out;
+ }
+ if (f2fs_is_pinned_file(inode)) {
+ if (gc_type == FG_GC)
+ f2fs_pin_file_control(inode, true);
+ err = -EAGAIN;
+ goto out;
+ }
if (gc_type == BG_GC) {
- if (PageWriteback(page))
+ if (PageWriteback(page)) {
+ err = -EAGAIN;
goto out;
+ }
set_page_dirty(page);
set_cold_data(page);
} else {
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(inode),
+ .ino = inode->i_ino,
.type = DATA,
+ .temp = COLD,
.op = REQ_OP_WRITE,
- .op_flags = WRITE_SYNC,
+ .op_flags = REQ_SYNC,
+ .old_blkaddr = NULL_ADDR,
.page = page,
.encrypted_page = NULL,
+ .need_lock = LOCK_REQ,
+ .io_type = FS_GC_DATA_IO,
};
bool is_dirty = PageDirty(page);
- int err;
retry:
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
+
set_page_dirty(page);
- f2fs_wait_on_page_writeback(page, DATA, true);
- if (clear_page_dirty_for_io(page))
+ if (clear_page_dirty_for_io(page)) {
inode_dec_dirty_pages(inode);
+ f2fs_remove_dirty_inode(inode);
+ }
set_cold_data(page);
- err = do_write_data_page(&fio);
+ err = f2fs_do_write_data_page(&fio);
if (err) {
clear_cold_data(page);
if (err == -ENOMEM) {
@@ -697,11 +944,10 @@ retry:
if (is_dirty)
set_page_dirty(page);
}
-
- clear_cold_data(page);
}
out:
f2fs_put_page(page, 1);
+ return err;
}
/*
@@ -711,7 +957,7 @@ out:
* If the parent node is not valid or the data block address is different,
* the victim data block is ignored.
*/
-static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
{
struct super_block *sb = sbi->sb;
@@ -719,6 +965,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
block_t start_addr;
int off;
int phase = 0;
+ int submitted = 0;
start_addr = START_BLOCK(sbi, segno);
@@ -735,19 +982,19 @@ next_step:
/* stop BG_GC if there is not enough free sections. */
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
- return;
+ return submitted;
if (check_valid_map(sbi, segno, off) == 0)
continue;
if (phase == 0) {
- ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
+ f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
META_NAT, true);
continue;
}
if (phase == 1) {
- ra_node_page(sbi, nid);
+ f2fs_ra_node_page(sbi, nid);
continue;
}
@@ -756,7 +1003,7 @@ next_step:
continue;
if (phase == 2) {
- ra_node_page(sbi, dni.ino);
+ f2fs_ra_node_page(sbi, dni.ino);
continue;
}
@@ -767,17 +1014,31 @@ next_step:
if (IS_ERR(inode) || is_bad_inode(inode))
continue;
- /* if encrypted inode, let's go phase 3 */
- if (f2fs_encrypted_inode(inode) &&
- S_ISREG(inode->i_mode)) {
+ if (!down_write_trylock(
+ &F2FS_I(inode)->i_gc_rwsem[WRITE])) {
+ iput(inode);
+ sbi->skipped_gc_rwsem++;
+ continue;
+ }
+
+ start_bidx = f2fs_start_bidx_of_node(nofs, inode) +
+ ofs_in_node;
+
+ if (f2fs_post_read_required(inode)) {
+ int err = ra_data_block(inode, start_bidx);
+
+ up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ if (err) {
+ iput(inode);
+ continue;
+ }
add_gc_inode(gc_list, inode);
continue;
}
- start_bidx = start_bidx_of_node(nofs, inode);
- data_page = get_read_data_page(inode,
- start_bidx + ofs_in_node, REQ_RAHEAD,
- true);
+ data_page = f2fs_get_read_data_page(inode,
+ start_bidx, REQ_RAHEAD, true);
+ up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (IS_ERR(data_page)) {
iput(inode);
continue;
@@ -793,28 +1054,39 @@ next_step:
if (inode) {
struct f2fs_inode_info *fi = F2FS_I(inode);
bool locked = false;
+ int err;
if (S_ISREG(inode->i_mode)) {
- if (!down_write_trylock(&fi->dio_rwsem[READ]))
+ if (!down_write_trylock(&fi->i_gc_rwsem[READ]))
continue;
if (!down_write_trylock(
- &fi->dio_rwsem[WRITE])) {
- up_write(&fi->dio_rwsem[READ]);
+ &fi->i_gc_rwsem[WRITE])) {
+ sbi->skipped_gc_rwsem++;
+ up_write(&fi->i_gc_rwsem[READ]);
continue;
}
locked = true;
+
+ /* wait for all inflight aio data */
+ inode_dio_wait(inode);
}
- start_bidx = start_bidx_of_node(nofs, inode)
+ start_bidx = f2fs_start_bidx_of_node(nofs, inode)
+ ofs_in_node;
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
- move_encrypted_block(inode, start_bidx);
+ if (f2fs_post_read_required(inode))
+ err = move_data_block(inode, start_bidx,
+ gc_type, segno, off);
else
- move_data_page(inode, start_bidx, gc_type);
+ err = move_data_page(inode, start_bidx, gc_type,
+ segno, off);
+
+ if (!err && (gc_type == FG_GC ||
+ f2fs_post_read_required(inode)))
+ submitted++;
if (locked) {
- up_write(&fi->dio_rwsem[WRITE]);
- up_write(&fi->dio_rwsem[READ]);
+ up_write(&fi->i_gc_rwsem[WRITE]);
+ up_write(&fi->i_gc_rwsem[READ]);
}
stat_inc_data_blk_count(sbi, 1, gc_type);
@@ -823,6 +1095,8 @@ next_step:
if (++phase < 5)
goto next_step;
+
+ return submitted;
}
static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
@@ -831,10 +1105,10 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
struct sit_info *sit_i = SIT_I(sbi);
int ret;
- mutex_lock(&sit_i->sentry_lock);
+ down_write(&sit_i->sentry_lock);
ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type,
NO_CHECK_TYPE, LFS);
- mutex_unlock(&sit_i->sentry_lock);
+ up_write(&sit_i->sentry_lock);
return ret;
}
@@ -847,18 +1121,34 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
struct blk_plug plug;
unsigned int segno = start_segno;
unsigned int end_segno = start_segno + sbi->segs_per_sec;
- int sec_freed = 0;
+ int seg_freed = 0, migrated = 0;
unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
SUM_TYPE_DATA : SUM_TYPE_NODE;
+ int submitted = 0;
+
+ if (__is_large_section(sbi))
+ end_segno = rounddown(end_segno, sbi->segs_per_sec);
/* readahead multi ssa blocks those have contiguous address */
- if (sbi->segs_per_sec > 1)
- ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
- sbi->segs_per_sec, META_SSA, true);
+ if (__is_large_section(sbi))
+ f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
+ end_segno - segno, META_SSA, true);
/* reference all summary page */
while (segno < end_segno) {
- sum_page = get_sum_page(sbi, segno++);
+ sum_page = f2fs_get_sum_page(sbi, segno++);
+ if (IS_ERR(sum_page)) {
+ int err = PTR_ERR(sum_page);
+
+ end_segno = segno - 1;
+ for (segno = start_segno; segno < end_segno; segno++) {
+ sum_page = find_get_page(META_MAPPING(sbi),
+ GET_SUM_BLOCK(sbi, segno));
+ f2fs_put_page(sum_page, 0);
+ f2fs_put_page(sum_page, 0);
+ }
+ return err;
+ }
unlock_page(sum_page);
}
@@ -871,10 +1161,13 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
GET_SUM_BLOCK(sbi, segno));
f2fs_put_page(sum_page, 0);
- if (get_valid_blocks(sbi, segno, 1) == 0 ||
- !PageUptodate(sum_page) ||
- unlikely(f2fs_cp_error(sbi)))
- goto next;
+ if (get_valid_blocks(sbi, segno, false) == 0)
+ goto freed;
+ if (__is_large_section(sbi) &&
+ migrated >= sbi->migration_granularity)
+ goto skip;
+ if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi)))
+ goto skip;
sum = page_address(sum_page);
if (type != GET_SUM_TYPE((&sum->footer))) {
@@ -882,126 +1175,178 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
"type [%d, %d] in SSA and SIT",
segno, type, GET_SUM_TYPE((&sum->footer)));
set_sbi_flag(sbi, SBI_NEED_FSCK);
- goto next;
+ goto skip;
}
/*
* this is to avoid deadlock:
* - lock_page(sum_page) - f2fs_replace_block
- * - check_valid_map() - mutex_lock(sentry_lock)
- * - mutex_lock(sentry_lock) - change_curseg()
+ * - check_valid_map() - down_write(sentry_lock)
+ * - down_read(sentry_lock) - change_curseg()
* - lock_page(sum_page)
*/
-
if (type == SUM_TYPE_NODE)
- gc_node_segment(sbi, sum->entries, segno, gc_type);
- else
- gc_data_segment(sbi, sum->entries, gc_list, segno,
+ submitted += gc_node_segment(sbi, sum->entries, segno,
gc_type);
+ else
+ submitted += gc_data_segment(sbi, sum->entries, gc_list,
+ segno, gc_type);
stat_inc_seg_count(sbi, type, gc_type);
-next:
+
+freed:
+ if (gc_type == FG_GC &&
+ get_valid_blocks(sbi, segno, false) == 0)
+ seg_freed++;
+ migrated++;
+
+ if (__is_large_section(sbi) && segno + 1 < end_segno)
+ sbi->next_victim_seg[gc_type] = segno + 1;
+skip:
f2fs_put_page(sum_page, 0);
}
- if (gc_type == FG_GC)
- f2fs_submit_merged_bio(sbi,
- (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE);
+ if (submitted)
+ f2fs_submit_merged_write(sbi,
+ (type == SUM_TYPE_NODE) ? NODE : DATA);
blk_finish_plug(&plug);
- if (gc_type == FG_GC &&
- get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0)
- sec_freed = 1;
-
stat_inc_call_count(sbi->stat_info);
- return sec_freed;
+ return seg_freed;
}
-int f2fs_gc(struct f2fs_sb_info *sbi, bool sync)
+int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
+ bool background, unsigned int segno)
{
- unsigned int segno;
int gc_type = sync ? FG_GC : BG_GC;
- int sec_freed = 0;
- int ret = -EINVAL;
+ int sec_freed = 0, seg_freed = 0, total_freed = 0;
+ int ret = 0;
struct cp_control cpc;
+ unsigned int init_segno = segno;
struct gc_inode_list gc_list = {
.ilist = LIST_HEAD_INIT(gc_list.ilist),
.iroot = RADIX_TREE_INIT(GFP_NOFS),
};
+ unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC];
+ unsigned long long first_skipped;
+ unsigned int skipped_round = 0, round = 0;
+
+ trace_f2fs_gc_begin(sbi->sb, sync, background,
+ get_pages(sbi, F2FS_DIRTY_NODES),
+ get_pages(sbi, F2FS_DIRTY_DENTS),
+ get_pages(sbi, F2FS_DIRTY_IMETA),
+ free_sections(sbi),
+ free_segments(sbi),
+ reserved_segments(sbi),
+ prefree_segments(sbi));
cpc.reason = __get_cp_reason(sbi);
+ sbi->skipped_gc_rwsem = 0;
+ first_skipped = last_skipped;
gc_more:
- segno = NULL_SEGNO;
-
- if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
+ if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) {
+ ret = -EINVAL;
goto stop;
+ }
if (unlikely(f2fs_cp_error(sbi))) {
ret = -EIO;
goto stop;
}
- if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed, 0)) {
- gc_type = FG_GC;
+ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) {
/*
- * If there is no victim and no prefree segment but still not
- * enough free sections, we should flush dent/node blocks and do
- * garbage collections.
+ * For example, if there are many prefree_segments below given
+ * threshold, we can make them free by checkpoint. Then, we
+ * secure free segments which doesn't need fggc any more.
*/
- if (__get_victim(sbi, &segno, gc_type) ||
- prefree_segments(sbi)) {
- ret = write_checkpoint(sbi, &cpc);
- if (ret)
- goto stop;
- segno = NULL_SEGNO;
- } else if (has_not_enough_free_secs(sbi, 0, 0)) {
- ret = write_checkpoint(sbi, &cpc);
+ if (prefree_segments(sbi) &&
+ !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) {
+ ret = f2fs_write_checkpoint(sbi, &cpc);
if (ret)
goto stop;
}
+ if (has_not_enough_free_secs(sbi, 0, 0))
+ gc_type = FG_GC;
}
- if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type))
+ /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */
+ if (gc_type == BG_GC && !background) {
+ ret = -EINVAL;
goto stop;
- ret = 0;
+ }
+ if (!__get_victim(sbi, &segno, gc_type)) {
+ ret = -ENODATA;
+ goto stop;
+ }
- if (do_garbage_collect(sbi, segno, &gc_list, gc_type) &&
- gc_type == FG_GC)
+ seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
+ if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec)
sec_freed++;
+ total_freed += seg_freed;
+
+ if (gc_type == FG_GC) {
+ if (sbi->skipped_atomic_files[FG_GC] > last_skipped ||
+ sbi->skipped_gc_rwsem)
+ skipped_round++;
+ last_skipped = sbi->skipped_atomic_files[FG_GC];
+ round++;
+ }
if (gc_type == FG_GC)
sbi->cur_victim_sec = NULL_SEGNO;
- if (!sync) {
- if (has_not_enough_free_secs(sbi, sec_freed, 0))
+ if (sync)
+ goto stop;
+
+ if (has_not_enough_free_secs(sbi, sec_freed, 0)) {
+ if (skipped_round <= MAX_SKIP_GC_COUNT ||
+ skipped_round * 2 < round) {
+ segno = NULL_SEGNO;
goto gc_more;
+ }
- if (gc_type == FG_GC)
- ret = write_checkpoint(sbi, &cpc);
+ if (first_skipped < last_skipped &&
+ (last_skipped - first_skipped) >
+ sbi->skipped_gc_rwsem) {
+ f2fs_drop_inmem_pages_all(sbi, true);
+ segno = NULL_SEGNO;
+ goto gc_more;
+ }
+ if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))
+ ret = f2fs_write_checkpoint(sbi, &cpc);
}
stop:
+ SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
+ SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno;
+
+ trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed,
+ get_pages(sbi, F2FS_DIRTY_NODES),
+ get_pages(sbi, F2FS_DIRTY_DENTS),
+ get_pages(sbi, F2FS_DIRTY_IMETA),
+ free_sections(sbi),
+ free_segments(sbi),
+ reserved_segments(sbi),
+ prefree_segments(sbi));
+
mutex_unlock(&sbi->gc_mutex);
put_gc_inode(&gc_list);
- if (sync)
+ if (sync && !ret)
ret = sec_freed ? 0 : -EAGAIN;
return ret;
}
-void build_gc_manager(struct f2fs_sb_info *sbi)
+void f2fs_build_gc_manager(struct f2fs_sb_info *sbi)
{
- u64 main_count, resv_count, ovp_count, blocks_per_sec;
-
DIRTY_I(sbi)->v_ops = &default_v_ops;
- /* threshold of # of valid blocks in a section for victims of FG_GC */
- main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg;
- resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg;
- ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg;
- blocks_per_sec = sbi->blocks_per_seg * sbi->segs_per_sec;
+ sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES;
- sbi->fggc_threshold = div_u64((main_count - ovp_count) * blocks_per_sec,
- (main_count - resv_count));
+ /* give warm/cold data area from slower device */
+ if (sbi->s_ndevs && !__is_large_section(sbi))
+ SIT_I(sbi)->last_victim[ALLOC_NEXT] =
+ GET_SEGNO(sbi, FDEV(0).end_blk) + 1;
}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index a993967dcdb9..bbac9d3787bd 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -1,24 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/gc.h
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define GC_THREAD_MIN_WB_PAGES 1 /*
* a threshold to determine
* whether IO subsystem is idle
* or not
*/
+#define DEF_GC_THREAD_URGENT_SLEEP_TIME 500 /* 500 ms */
#define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */
#define DEF_GC_THREAD_MAX_SLEEP_TIME 60000
#define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */
#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */
#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
+#define DEF_GC_FAILED_PINNED_FILES 2048
+
/* Search max. number of dirty segments to select a victim segment */
#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */
@@ -27,12 +27,13 @@ struct f2fs_gc_kthread {
wait_queue_head_t gc_wait_queue_head;
/* for gc sleep time */
+ unsigned int urgent_sleep_time;
unsigned int min_sleep_time;
unsigned int max_sleep_time;
unsigned int no_gc_sleep_time;
/* for changing gc mode */
- unsigned int gc_idle;
+ unsigned int gc_wake;
};
struct gc_inode_list {
@@ -65,25 +66,32 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
}
static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th,
- long *wait)
+ unsigned int *wait)
{
+ unsigned int min_time = gc_th->min_sleep_time;
+ unsigned int max_time = gc_th->max_sleep_time;
+
if (*wait == gc_th->no_gc_sleep_time)
return;
- *wait += gc_th->min_sleep_time;
- if (*wait > gc_th->max_sleep_time)
- *wait = gc_th->max_sleep_time;
+ if ((long long)*wait + (long long)min_time > (long long)max_time)
+ *wait = max_time;
+ else
+ *wait += min_time;
}
static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th,
- long *wait)
+ unsigned int *wait)
{
+ unsigned int min_time = gc_th->min_sleep_time;
+
if (*wait == gc_th->no_gc_sleep_time)
*wait = gc_th->max_sleep_time;
- *wait -= gc_th->min_sleep_time;
- if (*wait <= gc_th->min_sleep_time)
- *wait = gc_th->min_sleep_time;
+ if ((long long)*wait - (long long)min_time < (long long)min_time)
+ *wait = min_time;
+ else
+ *wait -= min_time;
}
static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index eb2e031ea887..cc82f142f811 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/hash.c
*
@@ -7,10 +8,6 @@
* Portions of this code from linux/fs/ext3/hash.c
*
* Copyright (C) 2002 by Theodore Ts'o
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/types.h>
#include <linux/fs.h>
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 482888ee8942..b8676a28db4a 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/inline.c
* Copyright (c) 2013, Intel Corporation
* Authors: Huajun Li <huajun.li@intel.com>
* Haicheng Li <haicheng.li@intel.com>
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
@@ -13,6 +11,7 @@
#include "f2fs.h"
#include "node.h"
+#include <trace/events/android_fs.h>
bool f2fs_may_inline_data(struct inode *inode)
{
@@ -22,10 +21,10 @@ bool f2fs_may_inline_data(struct inode *inode)
if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))
return false;
- if (i_size_read(inode) > MAX_INLINE_DATA)
+ if (i_size_read(inode) > MAX_INLINE_DATA(inode))
return false;
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+ if (f2fs_post_read_required(inode))
return false;
return true;
@@ -42,8 +41,9 @@ bool f2fs_may_inline_dentry(struct inode *inode)
return true;
}
-void read_inline_data(struct page *page, struct page *ipage)
+void f2fs_do_read_inline_data(struct page *page, struct page *ipage)
{
+ struct inode *inode = page->mapping->host;
void *src_addr, *dst_addr;
if (PageUptodate(page))
@@ -51,56 +51,76 @@ void read_inline_data(struct page *page, struct page *ipage)
f2fs_bug_on(F2FS_P_SB(page), page->index);
- zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE);
+ zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE);
/* Copy the whole inline data block */
- src_addr = inline_data_addr(ipage);
+ src_addr = inline_data_addr(inode, ipage);
dst_addr = kmap_atomic(page);
- memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
+ memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode));
flush_dcache_page(page);
kunmap_atomic(dst_addr);
if (!PageUptodate(page))
SetPageUptodate(page);
}
-bool truncate_inline_inode(struct page *ipage, u64 from)
+void f2fs_truncate_inline_inode(struct inode *inode,
+ struct page *ipage, u64 from)
{
void *addr;
- if (from >= MAX_INLINE_DATA)
- return false;
+ if (from >= MAX_INLINE_DATA(inode))
+ return;
- addr = inline_data_addr(ipage);
+ addr = inline_data_addr(inode, ipage);
- f2fs_wait_on_page_writeback(ipage, NODE, true);
- memset(addr + from, 0, MAX_INLINE_DATA - from);
+ f2fs_wait_on_page_writeback(ipage, NODE, true, true);
+ memset(addr + from, 0, MAX_INLINE_DATA(inode) - from);
set_page_dirty(ipage);
- return true;
+
+ if (from == 0)
+ clear_inode_flag(inode, FI_DATA_EXIST);
}
int f2fs_read_inline_data(struct inode *inode, struct page *page)
{
struct page *ipage;
- ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
+ if (trace_android_fs_dataread_start_enabled()) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_dataread_start(inode, page_offset(page),
+ PAGE_SIZE, current->pid,
+ path, current->comm);
+ }
+
+ ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage)) {
+ trace_android_fs_dataread_end(inode, page_offset(page),
+ PAGE_SIZE);
unlock_page(page);
return PTR_ERR(ipage);
}
if (!f2fs_has_inline_data(inode)) {
f2fs_put_page(ipage, 1);
+ trace_android_fs_dataread_end(inode, page_offset(page),
+ PAGE_SIZE);
return -EAGAIN;
}
if (page->index)
zero_user_segment(page, 0, PAGE_SIZE);
else
- read_inline_data(page, ipage);
+ f2fs_do_read_inline_data(page, ipage);
if (!PageUptodate(page))
SetPageUptodate(page);
f2fs_put_page(ipage, 1);
+ trace_android_fs_dataread_end(inode, page_offset(page),
+ PAGE_SIZE);
unlock_page(page);
return 0;
}
@@ -109,12 +129,15 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
{
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(dn->inode),
+ .ino = dn->inode->i_ino,
.type = DATA,
.op = REQ_OP_WRITE,
- .op_flags = WRITE_SYNC | REQ_PRIO,
+ .op_flags = REQ_SYNC | REQ_PRIO,
.page = page,
.encrypted_page = NULL,
+ .io_type = FS_DATA_IO,
};
+ struct node_info ni;
int dirty, err;
if (!f2fs_exist_data(dn->inode))
@@ -124,6 +147,14 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
if (err)
return err;
+ err = f2fs_get_node_info(fio.sbi, dn->nid, &ni);
+ if (err) {
+ f2fs_put_dnode(dn);
+ return err;
+ }
+
+ fio.version = ni.version;
+
if (unlikely(dn->data_blkaddr != NEW_ADDR)) {
f2fs_put_dnode(dn);
set_sbi_flag(fio.sbi, SBI_NEED_FSCK);
@@ -136,7 +167,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page));
- read_inline_data(page, dn->inode_page);
+ f2fs_do_read_inline_data(page, dn->inode_page);
set_page_dirty(page);
/* clear dirty state */
@@ -144,21 +175,25 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
/* write data page to try to make data consistent */
set_page_writeback(page);
+ ClearPageError(page);
fio.old_blkaddr = dn->data_blkaddr;
- write_data_page(dn, &fio);
- f2fs_wait_on_page_writeback(page, DATA, true);
- if (dirty)
+ set_inode_flag(dn->inode, FI_HOT_DATA);
+ f2fs_outplace_write_data(dn, &fio);
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
+ if (dirty) {
inode_dec_dirty_pages(dn->inode);
+ f2fs_remove_dirty_inode(dn->inode);
+ }
/* this converted inline_data should be recovered. */
set_inode_flag(dn->inode, FI_APPEND_WRITE);
/* clear inline data and flag after data writeback */
- truncate_inline_inode(dn->inode_page, 0);
+ f2fs_truncate_inline_inode(dn->inode, dn->inode_page, 0);
clear_inline_node(dn->inode_page);
clear_out:
stat_dec_inline_inode(dn->inode);
- f2fs_clear_inline_inode(dn->inode);
+ clear_inode_flag(dn->inode, FI_INLINE_DATA);
f2fs_put_dnode(dn);
return 0;
}
@@ -179,7 +214,7 @@ int f2fs_convert_inline_inode(struct inode *inode)
f2fs_lock_op(sbi);
- ipage = get_node_page(sbi, inode->i_ino);
+ ipage = f2fs_get_node_page(sbi, inode->i_ino);
if (IS_ERR(ipage)) {
err = PTR_ERR(ipage);
goto out;
@@ -208,7 +243,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
int err;
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, 0, LOOKUP_NODE);
+ err = f2fs_get_dnode_of_data(&dn, 0, LOOKUP_NODE);
if (err)
return err;
@@ -219,13 +254,15 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
f2fs_bug_on(F2FS_I_SB(inode), page->index);
- f2fs_wait_on_page_writeback(dn.inode_page, NODE, true);
+ f2fs_wait_on_page_writeback(dn.inode_page, NODE, true, true);
src_addr = kmap_atomic(page);
- dst_addr = inline_data_addr(dn.inode_page);
- memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
+ dst_addr = inline_data_addr(inode, dn.inode_page);
+ memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode));
kunmap_atomic(src_addr);
set_page_dirty(dn.inode_page);
+ f2fs_clear_radix_tree_dirty_tag(page);
+
set_inode_flag(inode, FI_APPEND_WRITE);
set_inode_flag(inode, FI_DATA_EXIST);
@@ -234,7 +271,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
return 0;
}
-bool recover_inline_data(struct inode *inode, struct page *npage)
+bool f2fs_recover_inline_data(struct inode *inode, struct page *npage)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode *ri = NULL;
@@ -255,14 +292,14 @@ bool recover_inline_data(struct inode *inode, struct page *npage)
if (f2fs_has_inline_data(inode) &&
ri && (ri->i_inline & F2FS_INLINE_DATA)) {
process_inline:
- ipage = get_node_page(sbi, inode->i_ino);
+ ipage = f2fs_get_node_page(sbi, inode->i_ino);
f2fs_bug_on(sbi, IS_ERR(ipage));
- f2fs_wait_on_page_writeback(ipage, NODE, true);
+ f2fs_wait_on_page_writeback(ipage, NODE, true, true);
- src_addr = inline_data_addr(npage);
- dst_addr = inline_data_addr(ipage);
- memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
+ src_addr = inline_data_addr(inode, npage);
+ dst_addr = inline_data_addr(inode, ipage);
+ memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode));
set_inode_flag(inode, FI_INLINE_DATA);
set_inode_flag(inode, FI_DATA_EXIST);
@@ -273,32 +310,31 @@ process_inline:
}
if (f2fs_has_inline_data(inode)) {
- ipage = get_node_page(sbi, inode->i_ino);
+ ipage = f2fs_get_node_page(sbi, inode->i_ino);
f2fs_bug_on(sbi, IS_ERR(ipage));
- if (!truncate_inline_inode(ipage, 0))
- return false;
- f2fs_clear_inline_inode(inode);
+ f2fs_truncate_inline_inode(inode, ipage, 0);
+ clear_inode_flag(inode, FI_INLINE_DATA);
f2fs_put_page(ipage, 1);
} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
- if (truncate_blocks(inode, 0, false))
+ if (f2fs_truncate_blocks(inode, 0, false, false))
return false;
goto process_inline;
}
return false;
}
-struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
+struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir,
struct fscrypt_name *fname, struct page **res_page)
{
struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
- struct f2fs_inline_dentry *inline_dentry;
struct qstr name = FSTR_TO_QSTR(&fname->disk_name);
struct f2fs_dir_entry *de;
struct f2fs_dentry_ptr d;
struct page *ipage;
+ void *inline_dentry;
f2fs_hash_t namehash;
- ipage = get_node_page(sbi, dir->i_ino);
+ ipage = f2fs_get_node_page(sbi, dir->i_ino);
if (IS_ERR(ipage)) {
*res_page = ipage;
return NULL;
@@ -306,10 +342,10 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
namehash = f2fs_dentry_hash(&name, fname);
- inline_dentry = inline_data_addr(ipage);
+ inline_dentry = inline_data_addr(dir, ipage);
- make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2);
- de = find_target_dentry(fname, namehash, NULL, &d);
+ make_dentry_ptr_inline(dir, &d, inline_dentry);
+ de = f2fs_find_target_dentry(fname, namehash, NULL, &d);
unlock_page(ipage);
if (de)
*res_page = ipage;
@@ -319,22 +355,22 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
return de;
}
-int make_empty_inline_dir(struct inode *inode, struct inode *parent,
+int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent,
struct page *ipage)
{
- struct f2fs_inline_dentry *dentry_blk;
struct f2fs_dentry_ptr d;
+ void *inline_dentry;
- dentry_blk = inline_data_addr(ipage);
+ inline_dentry = inline_data_addr(inode, ipage);
- make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2);
- do_make_empty_dir(inode, parent, &d);
+ make_dentry_ptr_inline(inode, &d, inline_dentry);
+ f2fs_do_make_empty_dir(inode, parent, &d);
set_page_dirty(ipage);
/* update i_size to MAX_INLINE_DATA */
- if (i_size_read(inode) < MAX_INLINE_DATA)
- f2fs_i_size_write(inode, MAX_INLINE_DATA);
+ if (i_size_read(inode) < MAX_INLINE_DATA(inode))
+ f2fs_i_size_write(inode, MAX_INLINE_DATA(inode));
return 0;
}
@@ -343,11 +379,12 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent,
* release ipage in this function.
*/
static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
- struct f2fs_inline_dentry *inline_dentry)
+ void *inline_dentry)
{
struct page *page;
struct dnode_of_data dn;
struct f2fs_dentry_block *dentry_blk;
+ struct f2fs_dentry_ptr src, dst;
int err;
page = f2fs_grab_cache_page(dir->i_mapping, 0, false);
@@ -372,34 +409,31 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
goto out;
}
- f2fs_wait_on_page_writeback(page, DATA, true);
- zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE);
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
- dentry_blk = kmap_atomic(page);
+ dentry_blk = page_address(page);
+
+ make_dentry_ptr_inline(dir, &src, inline_dentry);
+ make_dentry_ptr_block(dir, &dst, dentry_blk);
/* copy data from inline dentry block to new dentry block */
- memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap,
- INLINE_DENTRY_BITMAP_SIZE);
- memset(dentry_blk->dentry_bitmap + INLINE_DENTRY_BITMAP_SIZE, 0,
- SIZE_OF_DENTRY_BITMAP - INLINE_DENTRY_BITMAP_SIZE);
+ memcpy(dst.bitmap, src.bitmap, src.nr_bitmap);
+ memset(dst.bitmap + src.nr_bitmap, 0, dst.nr_bitmap - src.nr_bitmap);
/*
* we do not need to zero out remainder part of dentry and filename
* field, since we have used bitmap for marking the usage status of
* them, besides, we can also ignore copying/zeroing reserved space
* of dentry block, because them haven't been used so far.
*/
- memcpy(dentry_blk->dentry, inline_dentry->dentry,
- sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY);
- memcpy(dentry_blk->filename, inline_dentry->filename,
- NR_INLINE_DENTRY * F2FS_SLOT_LEN);
+ memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max);
+ memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN);
- kunmap_atomic(dentry_blk);
if (!PageUptodate(page))
SetPageUptodate(page);
set_page_dirty(page);
/* clear inline dir and flag after data writeback */
- truncate_inline_inode(ipage, 0);
+ f2fs_truncate_inline_inode(dir, ipage, 0);
stat_dec_inline_dir(dir);
clear_inode_flag(dir, FI_INLINE_DENTRY);
@@ -412,14 +446,13 @@ out:
return err;
}
-static int f2fs_add_inline_entries(struct inode *dir,
- struct f2fs_inline_dentry *inline_dentry)
+static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry)
{
struct f2fs_dentry_ptr d;
unsigned long bit_pos = 0;
int err = 0;
- make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2);
+ make_dentry_ptr_inline(dir, &d, inline_dentry);
while (bit_pos < d.max) {
struct f2fs_dir_entry *de;
@@ -440,10 +473,10 @@ static int f2fs_add_inline_entries(struct inode *dir,
}
new_name.name = d.filename[bit_pos];
- new_name.len = de->name_len;
+ new_name.len = le16_to_cpu(de->name_len);
ino = le32_to_cpu(de->ino);
- fake_mode = get_de_type(de) << S_SHIFT;
+ fake_mode = f2fs_get_de_type(de) << S_SHIFT;
err = f2fs_add_regular_entry(dir, &new_name, NULL, NULL,
ino, fake_mode);
@@ -455,26 +488,26 @@ static int f2fs_add_inline_entries(struct inode *dir,
return 0;
punch_dentry_pages:
truncate_inode_pages(&dir->i_data, 0);
- truncate_blocks(dir, 0, false);
- remove_dirty_inode(dir);
+ f2fs_truncate_blocks(dir, 0, false, false);
+ f2fs_remove_dirty_inode(dir);
return err;
}
static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
- struct f2fs_inline_dentry *inline_dentry)
+ void *inline_dentry)
{
- struct f2fs_inline_dentry *backup_dentry;
+ void *backup_dentry;
int err;
backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir),
- sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO);
+ MAX_INLINE_DATA(dir), GFP_F2FS_ZERO);
if (!backup_dentry) {
f2fs_put_page(ipage, 1);
return -ENOMEM;
}
- memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA);
- truncate_inline_inode(ipage, 0);
+ memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir));
+ f2fs_truncate_inline_inode(dir, ipage, 0);
unlock_page(ipage);
@@ -486,22 +519,23 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
stat_dec_inline_dir(dir);
clear_inode_flag(dir, FI_INLINE_DENTRY);
- kfree(backup_dentry);
+ kvfree(backup_dentry);
return 0;
recover:
lock_page(ipage);
- memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA);
+ f2fs_wait_on_page_writeback(ipage, NODE, true, true);
+ memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir));
f2fs_i_depth_write(dir, 0);
- f2fs_i_size_write(dir, MAX_INLINE_DATA);
+ f2fs_i_size_write(dir, MAX_INLINE_DATA(dir));
set_page_dirty(ipage);
f2fs_put_page(ipage, 1);
- kfree(backup_dentry);
+ kvfree(backup_dentry);
return err;
}
static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
- struct f2fs_inline_dentry *inline_dentry)
+ void *inline_dentry)
{
if (!F2FS_I(dir)->i_dir_level)
return f2fs_move_inline_dirents(dir, ipage, inline_dentry);
@@ -517,21 +551,22 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
struct page *ipage;
unsigned int bit_pos;
f2fs_hash_t name_hash;
- struct f2fs_inline_dentry *dentry_blk = NULL;
+ void *inline_dentry = NULL;
struct f2fs_dentry_ptr d;
int slots = GET_DENTRY_SLOTS(new_name->len);
struct page *page = NULL;
int err = 0;
- ipage = get_node_page(sbi, dir->i_ino);
+ ipage = f2fs_get_node_page(sbi, dir->i_ino);
if (IS_ERR(ipage))
return PTR_ERR(ipage);
- dentry_blk = inline_data_addr(ipage);
- bit_pos = room_for_filename(&dentry_blk->dentry_bitmap,
- slots, NR_INLINE_DENTRY);
- if (bit_pos >= NR_INLINE_DENTRY) {
- err = f2fs_convert_inline_dir(dir, ipage, dentry_blk);
+ inline_dentry = inline_data_addr(dir, ipage);
+ make_dentry_ptr_inline(dir, &d, inline_dentry);
+
+ bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max);
+ if (bit_pos >= d.max) {
+ err = f2fs_convert_inline_dir(dir, ipage, inline_dentry);
if (err)
return err;
err = -EAGAIN;
@@ -540,20 +575,17 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
if (inode) {
down_write(&F2FS_I(inode)->i_sem);
- page = init_inode_metadata(inode, dir, new_name,
+ page = f2fs_init_inode_metadata(inode, dir, new_name,
orig_name, ipage);
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto fail;
}
- if (f2fs_encrypted_inode(dir))
- file_set_enc_name(inode);
}
- f2fs_wait_on_page_writeback(ipage, NODE, true);
+ f2fs_wait_on_page_writeback(ipage, NODE, true, true);
name_hash = f2fs_dentry_hash(new_name, NULL);
- make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2);
f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos);
set_page_dirty(ipage);
@@ -564,7 +596,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
f2fs_put_page(page, 1);
}
- update_parent_metadata(dir, inode, 0);
+ f2fs_update_parent_metadata(dir, inode, 0);
fail:
if (inode)
up_write(&F2FS_I(inode)->i_sem);
@@ -576,25 +608,27 @@ out:
void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
struct inode *dir, struct inode *inode)
{
- struct f2fs_inline_dentry *inline_dentry;
+ struct f2fs_dentry_ptr d;
+ void *inline_dentry;
int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
unsigned int bit_pos;
int i;
lock_page(page);
- f2fs_wait_on_page_writeback(page, NODE, true);
+ f2fs_wait_on_page_writeback(page, NODE, true, true);
+
+ inline_dentry = inline_data_addr(dir, page);
+ make_dentry_ptr_inline(dir, &d, inline_dentry);
- inline_dentry = inline_data_addr(page);
- bit_pos = dentry - inline_dentry->dentry;
+ bit_pos = dentry - d.dentry;
for (i = 0; i < slots; i++)
- __clear_bit_le(bit_pos + i,
- &inline_dentry->dentry_bitmap);
+ __clear_bit_le(bit_pos + i, d.bitmap);
set_page_dirty(page);
f2fs_put_page(page, 1);
dir->i_ctime = dir->i_mtime = current_time(dir);
- f2fs_mark_inode_dirty_sync(dir);
+ f2fs_mark_inode_dirty_sync(dir, false);
if (inode)
f2fs_drop_nlink(dir, inode);
@@ -605,20 +639,21 @@ bool f2fs_empty_inline_dir(struct inode *dir)
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct page *ipage;
unsigned int bit_pos = 2;
- struct f2fs_inline_dentry *dentry_blk;
+ void *inline_dentry;
+ struct f2fs_dentry_ptr d;
- ipage = get_node_page(sbi, dir->i_ino);
+ ipage = f2fs_get_node_page(sbi, dir->i_ino);
if (IS_ERR(ipage))
return false;
- dentry_blk = inline_data_addr(ipage);
- bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
- NR_INLINE_DENTRY,
- bit_pos);
+ inline_dentry = inline_data_addr(dir, ipage);
+ make_dentry_ptr_inline(dir, &d, inline_dentry);
+
+ bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos);
f2fs_put_page(ipage, 1);
- if (bit_pos < NR_INLINE_DENTRY)
+ if (bit_pos < d.max)
return false;
return true;
@@ -628,26 +663,30 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
struct fscrypt_str *fstr)
{
struct inode *inode = file_inode(file);
- struct f2fs_inline_dentry *inline_dentry = NULL;
struct page *ipage = NULL;
struct f2fs_dentry_ptr d;
+ void *inline_dentry = NULL;
+ int err;
- if (ctx->pos == NR_INLINE_DENTRY)
+ make_dentry_ptr_inline(inode, &d, inline_dentry);
+
+ if (ctx->pos == d.max)
return 0;
- ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
+ ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage))
return PTR_ERR(ipage);
- inline_dentry = inline_data_addr(ipage);
+ inline_dentry = inline_data_addr(inode, ipage);
- make_dentry_ptr(inode, &d, (void *)inline_dentry, 2);
+ make_dentry_ptr_inline(inode, &d, inline_dentry);
- if (!f2fs_fill_dentries(ctx, &d, 0, fstr))
- ctx->pos = NR_INLINE_DENTRY;
+ err = f2fs_fill_dentries(ctx, &d, 0, fstr);
+ if (!err)
+ ctx->pos = d.max;
f2fs_put_page(ipage, 1);
- return 0;
+ return err < 0 ? err : 0;
}
int f2fs_inline_data_fiemap(struct inode *inode,
@@ -660,7 +699,7 @@ int f2fs_inline_data_fiemap(struct inode *inode,
struct page *ipage;
int err = 0;
- ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
+ ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage))
return PTR_ERR(ipage);
@@ -669,16 +708,20 @@ int f2fs_inline_data_fiemap(struct inode *inode,
goto out;
}
- ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode));
+ ilen = min_t(size_t, MAX_INLINE_DATA(inode), i_size_read(inode));
if (start >= ilen)
goto out;
if (start + len < ilen)
ilen = start + len;
ilen -= start;
- get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni);
+ err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni);
+ if (err)
+ goto out;
+
byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits;
- byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage);
+ byteaddr += (char *)inline_data_addr(inode, ipage) -
+ (char *)F2FS_INODE(ipage);
err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags);
out:
f2fs_put_page(ipage, 1);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 1de02c31756b..bec52961630b 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/inode.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
@@ -16,13 +13,18 @@
#include "f2fs.h"
#include "node.h"
+#include "segment.h"
#include <trace/events/f2fs.h>
-void f2fs_mark_inode_dirty_sync(struct inode *inode)
+void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync)
{
- if (f2fs_inode_dirtied(inode))
+ if (is_inode_flag_set(inode, FI_NEW_INODE))
+ return;
+
+ if (f2fs_inode_dirtied(inode, sync))
return;
+
mark_inode_dirty_sync(inode);
}
@@ -31,38 +33,42 @@ void f2fs_set_inode_flags(struct inode *inode)
unsigned int flags = F2FS_I(inode)->i_flags;
unsigned int new_fl = 0;
- if (flags & FS_SYNC_FL)
+ if (flags & F2FS_SYNC_FL)
new_fl |= S_SYNC;
- if (flags & FS_APPEND_FL)
+ if (flags & F2FS_APPEND_FL)
new_fl |= S_APPEND;
- if (flags & FS_IMMUTABLE_FL)
+ if (flags & F2FS_IMMUTABLE_FL)
new_fl |= S_IMMUTABLE;
- if (flags & FS_NOATIME_FL)
+ if (flags & F2FS_NOATIME_FL)
new_fl |= S_NOATIME;
- if (flags & FS_DIRSYNC_FL)
+ if (flags & F2FS_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
+ if (f2fs_encrypted_inode(inode))
+ new_fl |= S_ENCRYPTED;
inode_set_flags(inode, new_fl,
- S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
- f2fs_mark_inode_dirty_sync(inode);
+ S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|
+ S_ENCRYPTED);
}
static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
{
+ int extra_size = get_extra_isize(inode);
+
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
- if (ri->i_addr[0])
- inode->i_rdev =
- old_decode_dev(le32_to_cpu(ri->i_addr[0]));
+ if (ri->i_addr[extra_size])
+ inode->i_rdev = old_decode_dev(
+ le32_to_cpu(ri->i_addr[extra_size]));
else
- inode->i_rdev =
- new_decode_dev(le32_to_cpu(ri->i_addr[1]));
+ inode->i_rdev = new_decode_dev(
+ le32_to_cpu(ri->i_addr[extra_size + 1]));
}
}
static int __written_first_block(struct f2fs_sb_info *sbi,
struct f2fs_inode *ri)
{
- block_t addr = le32_to_cpu(ri->i_addr[0]);
+ block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]);
if (!__is_valid_data_blkaddr(addr))
return 1;
@@ -73,29 +79,31 @@ static int __written_first_block(struct f2fs_sb_info *sbi,
static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
{
+ int extra_size = get_extra_isize(inode);
+
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
if (old_valid_dev(inode->i_rdev)) {
- ri->i_addr[0] =
+ ri->i_addr[extra_size] =
cpu_to_le32(old_encode_dev(inode->i_rdev));
- ri->i_addr[1] = 0;
+ ri->i_addr[extra_size + 1] = 0;
} else {
- ri->i_addr[0] = 0;
- ri->i_addr[1] =
+ ri->i_addr[extra_size] = 0;
+ ri->i_addr[extra_size + 1] =
cpu_to_le32(new_encode_dev(inode->i_rdev));
- ri->i_addr[2] = 0;
+ ri->i_addr[extra_size + 2] = 0;
}
}
}
static void __recover_inline_status(struct inode *inode, struct page *ipage)
{
- void *inline_data = inline_data_addr(ipage);
+ void *inline_data = inline_data_addr(inode, ipage);
__le32 *start = inline_data;
- __le32 *end = start + MAX_INLINE_DATA / sizeof(__le32);
+ __le32 *end = start + MAX_INLINE_DATA(inode) / sizeof(__le32);
while (start < end) {
if (*start++) {
- f2fs_wait_on_page_writeback(ipage, NODE, true);
+ f2fs_wait_on_page_writeback(ipage, NODE, true, true);
set_inode_flag(inode, FI_DATA_EXIST);
set_raw_inline(inode, F2FS_INODE(ipage));
@@ -106,9 +114,88 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage)
return;
}
+static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page)
+{
+ struct f2fs_inode *ri = &F2FS_NODE(page)->i;
+
+ if (!f2fs_sb_has_inode_chksum(sbi))
+ return false;
+
+ if (!IS_INODE(page) || !(ri->i_inline & F2FS_EXTRA_ATTR))
+ return false;
+
+ if (!F2FS_FITS_IN_INODE(ri, le16_to_cpu(ri->i_extra_isize),
+ i_inode_checksum))
+ return false;
+
+ return true;
+}
+
+static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page)
+{
+ struct f2fs_node *node = F2FS_NODE(page);
+ struct f2fs_inode *ri = &node->i;
+ __le32 ino = node->footer.ino;
+ __le32 gen = ri->i_generation;
+ __u32 chksum, chksum_seed;
+ __u32 dummy_cs = 0;
+ unsigned int offset = offsetof(struct f2fs_inode, i_inode_checksum);
+ unsigned int cs_size = sizeof(dummy_cs);
+
+ chksum = f2fs_chksum(sbi, sbi->s_chksum_seed, (__u8 *)&ino,
+ sizeof(ino));
+ chksum_seed = f2fs_chksum(sbi, chksum, (__u8 *)&gen, sizeof(gen));
+
+ chksum = f2fs_chksum(sbi, chksum_seed, (__u8 *)ri, offset);
+ chksum = f2fs_chksum(sbi, chksum, (__u8 *)&dummy_cs, cs_size);
+ offset += cs_size;
+ chksum = f2fs_chksum(sbi, chksum, (__u8 *)ri + offset,
+ F2FS_BLKSIZE - offset);
+ return chksum;
+}
+
+bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page)
+{
+ struct f2fs_inode *ri;
+ __u32 provided, calculated;
+
+ if (unlikely(is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)))
+ return true;
+
+#ifdef CONFIG_F2FS_CHECK_FS
+ if (!f2fs_enable_inode_chksum(sbi, page))
+#else
+ if (!f2fs_enable_inode_chksum(sbi, page) ||
+ PageDirty(page) || PageWriteback(page))
+#endif
+ return true;
+
+ ri = &F2FS_NODE(page)->i;
+ provided = le32_to_cpu(ri->i_inode_checksum);
+ calculated = f2fs_inode_chksum(sbi, page);
+
+ if (provided != calculated)
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "checksum invalid, ino = %x, %x vs. %x",
+ ino_of_node(page), provided, calculated);
+
+ return provided == calculated;
+}
+
+void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page)
+{
+ struct f2fs_inode *ri = &F2FS_NODE(page)->i;
+
+ if (!f2fs_enable_inode_chksum(sbi, page))
+ return;
+
+ ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page));
+}
+
static bool sanity_check_inode(struct inode *inode, struct page *node_page)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
unsigned long long iblocks;
iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks);
@@ -131,6 +218,36 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
+ if (f2fs_sb_has_flexible_inline_xattr(sbi)
+ && !f2fs_has_extra_attr(inode)) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: corrupted inode ino=%lx, run fsck to fix.",
+ __func__, inode->i_ino);
+ return false;
+ }
+
+ if (f2fs_has_extra_attr(inode) &&
+ !f2fs_sb_has_extra_attr(sbi)) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: inode (ino=%lx) is with extra_attr, "
+ "but extra_attr feature is off",
+ __func__, inode->i_ino);
+ return false;
+ }
+
+ if (fi->i_extra_isize > F2FS_TOTAL_EXTRA_ATTR_SIZE ||
+ fi->i_extra_isize % sizeof(__le32)) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: inode (ino=%lx) has corrupted i_extra_isize: %d, "
+ "max: %zu",
+ __func__, inode->i_ino, fi->i_extra_isize,
+ F2FS_TOTAL_EXTRA_ATTR_SIZE);
+ return false;
+ }
+
if (F2FS_I(inode)->extent_tree) {
struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest;
@@ -147,6 +264,26 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
}
+
+ if (f2fs_has_inline_data(inode) &&
+ (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: inode (ino=%lx, mode=%u) should not have "
+ "inline_data, run fsck to fix",
+ __func__, inode->i_ino, inode->i_mode);
+ return false;
+ }
+
+ if (f2fs_has_inline_dentry(inode) && !S_ISDIR(inode->i_mode)) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: inode (ino=%lx, mode=%u) should not have "
+ "inline_dentry, run fsck to fix",
+ __func__, inode->i_ino, inode->i_mode);
+ return false;
+ }
+
return true;
}
@@ -156,17 +293,14 @@ static int do_read_inode(struct inode *inode)
struct f2fs_inode_info *fi = F2FS_I(inode);
struct page *node_page;
struct f2fs_inode *ri;
+ projid_t i_projid;
int err;
/* Check if ino is within scope */
- if (check_nid_range(sbi, inode->i_ino)) {
- f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu",
- (unsigned long) inode->i_ino);
- WARN_ON(1);
+ if (f2fs_check_nid_range(sbi, inode->i_ino))
return -EINVAL;
- }
- node_page = get_node_page(sbi, inode->i_ino);
+ node_page = f2fs_get_node_page(sbi, inode->i_ino);
if (IS_ERR(node_page))
return PTR_ERR(node_page);
@@ -177,7 +311,7 @@ static int do_read_inode(struct inode *inode)
i_gid_write(inode, le32_to_cpu(ri->i_gid));
set_nlink(inode, le32_to_cpu(ri->i_links));
inode->i_size = le64_to_cpu(ri->i_size);
- inode->i_blocks = le64_to_cpu(ri->i_blocks);
+ inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1);
inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime);
inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime);
@@ -186,8 +320,11 @@ static int do_read_inode(struct inode *inode)
inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
inode->i_generation = le32_to_cpu(ri->i_generation);
-
- fi->i_current_depth = le32_to_cpu(ri->i_current_depth);
+ if (S_ISDIR(inode->i_mode))
+ fi->i_current_depth = le32_to_cpu(ri->i_current_depth);
+ else if (S_ISREG(inode->i_mode))
+ fi->i_gc_failures[GC_FAILURE_PIN] =
+ le16_to_cpu(ri->i_gc_failures);
fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid);
fi->i_flags = le32_to_cpu(ri->i_flags);
fi->flags = 0;
@@ -200,6 +337,25 @@ static int do_read_inode(struct inode *inode)
get_inline_info(inode, ri);
+ fi->i_extra_isize = f2fs_has_extra_attr(inode) ?
+ le16_to_cpu(ri->i_extra_isize) : 0;
+
+ if (f2fs_sb_has_flexible_inline_xattr(sbi)) {
+ fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size);
+ } else if (f2fs_has_inline_xattr(inode) ||
+ f2fs_has_inline_dentry(inode)) {
+ fi->i_inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
+ } else {
+
+ /*
+ * Previous inline data or directory always reserved 200 bytes
+ * in inode layout, even if inline_xattr is disabled. In order
+ * to keep inline_dentry's structure for backward compatibility,
+ * we get the space back only from inline_data.
+ */
+ fi->i_inline_xattr_size = 0;
+ }
+
if (!sanity_check_inode(inode, node_page)) {
f2fs_put_page(node_page, 1);
return -EINVAL;
@@ -209,20 +365,48 @@ static int do_read_inode(struct inode *inode)
if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode))
__recover_inline_status(inode, node_page);
+ /* try to recover cold bit for non-dir inode */
+ if (!S_ISDIR(inode->i_mode) && !is_cold_node(node_page)) {
+ set_cold_node(node_page, false);
+ set_page_dirty(node_page);
+ }
+
/* get rdev by using inline_info */
__get_inode_rdev(inode, ri);
- err = __written_first_block(sbi, ri);
- if (err < 0) {
- f2fs_put_page(node_page, 1);
- return err;
+ if (S_ISREG(inode->i_mode)) {
+ err = __written_first_block(sbi, ri);
+ if (err < 0) {
+ f2fs_put_page(node_page, 1);
+ return err;
+ }
+ if (!err)
+ set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
}
- if (!err)
- set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
- if (!need_inode_block_update(sbi, inode->i_ino))
+ if (!f2fs_need_inode_block_update(sbi, inode->i_ino))
fi->last_disk_size = inode->i_size;
+ if (fi->i_flags & F2FS_PROJINHERIT_FL)
+ set_inode_flag(inode, FI_PROJ_INHERIT);
+
+ if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi) &&
+ F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid))
+ i_projid = (projid_t)le32_to_cpu(ri->i_projid);
+ else
+ i_projid = F2FS_DEF_PROJID;
+ fi->i_projid = make_kprojid(&init_user_ns, i_projid);
+
+ if (f2fs_has_extra_attr(inode) && f2fs_sb_has_inode_crtime(sbi) &&
+ F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) {
+ fi->i_crtime.tv_sec = le64_to_cpu(ri->i_crtime);
+ fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec);
+ }
+
+ F2FS_I(inode)->i_disk_time[0] = inode->i_atime;
+ F2FS_I(inode)->i_disk_time[1] = inode->i_ctime;
+ F2FS_I(inode)->i_disk_time[2] = inode->i_mtime;
+ F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime;
f2fs_put_page(node_page, 1);
stat_inc_inline_xattr(inode);
@@ -255,10 +439,10 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
make_now:
if (ino == F2FS_NODE_INO(sbi)) {
inode->i_mapping->a_ops = &f2fs_node_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
} else if (ino == F2FS_META_INO(sbi)) {
inode->i_mapping->a_ops = &f2fs_meta_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
} else if (S_ISREG(inode->i_mode)) {
inode->i_op = &f2fs_file_inode_operations;
inode->i_fop = &f2fs_file_operations;
@@ -267,7 +451,7 @@ make_now:
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
+ inode_nohighmem(inode);
} else if (S_ISLNK(inode->i_mode)) {
if (f2fs_encrypted_inode(inode))
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
@@ -283,6 +467,7 @@ make_now:
ret = -EIO;
goto bad_inode;
}
+ f2fs_set_inode_flags(inode);
unlock_new_inode(inode);
trace_f2fs_iget(inode);
return inode;
@@ -307,13 +492,15 @@ retry:
return inode;
}
-int update_inode(struct inode *inode, struct page *node_page)
+void f2fs_update_inode(struct inode *inode, struct page *node_page)
{
struct f2fs_inode *ri;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree;
- f2fs_inode_synced(inode);
+ f2fs_wait_on_page_writeback(node_page, NODE, true, true);
+ set_page_dirty(node_page);
- f2fs_wait_on_page_writeback(node_page, NODE, true);
+ f2fs_inode_synced(inode);
ri = F2FS_INODE(node_page);
@@ -323,13 +510,15 @@ int update_inode(struct inode *inode, struct page *node_page)
ri->i_gid = cpu_to_le32(i_gid_read(inode));
ri->i_links = cpu_to_le32(inode->i_nlink);
ri->i_size = cpu_to_le64(i_size_read(inode));
- ri->i_blocks = cpu_to_le64(inode->i_blocks);
+ ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1);
- if (F2FS_I(inode)->extent_tree)
- set_raw_extent(&F2FS_I(inode)->extent_tree->largest,
- &ri->i_ext);
- else
+ if (et) {
+ read_lock(&et->lock);
+ set_raw_extent(&et->largest, &ri->i_ext);
+ read_unlock(&et->lock);
+ } else {
memset(&ri->i_ext, 0, sizeof(ri->i_ext));
+ }
set_raw_inline(inode, ri);
ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
@@ -338,30 +527,67 @@ int update_inode(struct inode *inode, struct page *node_page)
ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth);
+ if (S_ISDIR(inode->i_mode))
+ ri->i_current_depth =
+ cpu_to_le32(F2FS_I(inode)->i_current_depth);
+ else if (S_ISREG(inode->i_mode))
+ ri->i_gc_failures =
+ cpu_to_le16(F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]);
ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid);
ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags);
ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino);
ri->i_generation = cpu_to_le32(inode->i_generation);
ri->i_dir_level = F2FS_I(inode)->i_dir_level;
+ if (f2fs_has_extra_attr(inode)) {
+ ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize);
+
+ if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode)))
+ ri->i_inline_xattr_size =
+ cpu_to_le16(F2FS_I(inode)->i_inline_xattr_size);
+
+ if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)) &&
+ F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize,
+ i_projid)) {
+ projid_t i_projid;
+
+ i_projid = from_kprojid(&init_user_ns,
+ F2FS_I(inode)->i_projid);
+ ri->i_projid = cpu_to_le32(i_projid);
+ }
+
+ if (f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)) &&
+ F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize,
+ i_crtime)) {
+ ri->i_crtime =
+ cpu_to_le64(F2FS_I(inode)->i_crtime.tv_sec);
+ ri->i_crtime_nsec =
+ cpu_to_le32(F2FS_I(inode)->i_crtime.tv_nsec);
+ }
+ }
+
__set_inode_rdev(inode, ri);
- set_cold_node(inode, node_page);
/* deleted inode */
if (inode->i_nlink == 0)
clear_inline_node(node_page);
- return set_page_dirty(node_page);
+ F2FS_I(inode)->i_disk_time[0] = inode->i_atime;
+ F2FS_I(inode)->i_disk_time[1] = inode->i_ctime;
+ F2FS_I(inode)->i_disk_time[2] = inode->i_mtime;
+ F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime;
+
+#ifdef CONFIG_F2FS_CHECK_FS
+ f2fs_inode_chksum_set(F2FS_I_SB(inode), node_page);
+#endif
}
-int update_inode_page(struct inode *inode)
+void f2fs_update_inode_page(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *node_page;
- int ret = 0;
retry:
- node_page = get_node_page(sbi, inode->i_ino);
+ node_page = f2fs_get_node_page(sbi, inode->i_ino);
if (IS_ERR(node_page)) {
int err = PTR_ERR(node_page);
if (err == -ENOMEM) {
@@ -370,12 +596,10 @@ retry:
} else if (err != -ENOENT) {
f2fs_stop_checkpoint(sbi, false);
}
- f2fs_inode_synced(inode);
- return 0;
+ return;
}
- ret = update_inode(inode, node_page);
+ f2fs_update_inode(inode, node_page);
f2fs_put_page(node_page, 1);
- return ret;
}
int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -389,11 +613,15 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
if (!is_inode_flag_set(inode, FI_DIRTY_INODE))
return 0;
+ if (f2fs_is_checkpoint_ready(sbi))
+ return -ENOSPC;
+
/*
* We need to balance fs here to prevent from producing dirty node pages
* during the urgent cleaning time when runing out of free sections.
*/
- if (update_inode_page(inode))
+ f2fs_update_inode_page(inode);
+ if (wbc && wbc->nr_to_write)
f2fs_balance_fs(sbi, true);
return 0;
}
@@ -409,7 +637,7 @@ void f2fs_evict_inode(struct inode *inode)
/* some remained atomic pages should discarded */
if (f2fs_is_atomic_file(inode))
- drop_inmem_pages(inode);
+ f2fs_drop_inmem_pages(inode);
trace_f2fs_evict_inode(inode);
truncate_inode_pages_final(&inode->i_data);
@@ -419,17 +647,22 @@ void f2fs_evict_inode(struct inode *inode)
goto out_clear;
f2fs_bug_on(sbi, get_dirty_pages(inode));
- remove_dirty_inode(inode);
+ f2fs_remove_dirty_inode(inode);
f2fs_destroy_extent_tree(inode);
if (inode->i_nlink || is_bad_inode(inode))
goto no_delete;
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (time_to_inject(sbi, FAULT_EVICT_INODE))
- goto no_delete;
-#endif
+ err = dquot_initialize(inode);
+ if (err) {
+ err = 0;
+ set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+ }
+
+ f2fs_remove_ino_entry(sbi, inode->i_ino, APPEND_INO);
+ f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
+ f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO);
sb_start_intwrite(inode->i_sb);
set_inode_flag(inode, FI_NO_ALLOC);
@@ -438,10 +671,17 @@ retry:
if (F2FS_HAS_BLOCKS(inode))
err = f2fs_truncate(inode);
+ if (time_to_inject(sbi, FAULT_EVICT_INODE)) {
+ f2fs_show_injection_info(FAULT_EVICT_INODE);
+ err = -EIO;
+ }
+
if (!err) {
f2fs_lock_op(sbi);
- err = remove_inode_page(inode);
+ err = f2fs_remove_inode_page(inode);
f2fs_unlock_op(sbi);
+ if (err == -ENOENT)
+ err = 0;
}
/* give more chances, if ENOMEM case */
@@ -450,37 +690,70 @@ retry:
goto retry;
}
- if (err)
- update_inode_page(inode);
+ if (err) {
+ f2fs_update_inode_page(inode);
+ set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+ }
sb_end_intwrite(inode->i_sb);
no_delete:
+ dquot_drop(inode);
+
stat_dec_inline_xattr(inode);
stat_dec_inline_dir(inode);
stat_dec_inline_inode(inode);
- invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
+ if (likely(!is_set_ckpt_flags(sbi, CP_ERROR_FLAG) &&
+ !is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+ f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE));
+ else
+ f2fs_inode_synced(inode);
+
+ /* ino == 0, if f2fs_new_inode() was failed t*/
+ if (inode->i_ino)
+ invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino,
+ inode->i_ino);
if (xnid)
invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
- if (is_inode_flag_set(inode, FI_APPEND_WRITE))
- add_ino_entry(sbi, inode->i_ino, APPEND_INO);
- if (is_inode_flag_set(inode, FI_UPDATE_WRITE))
- add_ino_entry(sbi, inode->i_ino, UPDATE_INO);
+ if (inode->i_nlink) {
+ if (is_inode_flag_set(inode, FI_APPEND_WRITE))
+ f2fs_add_ino_entry(sbi, inode->i_ino, APPEND_INO);
+ if (is_inode_flag_set(inode, FI_UPDATE_WRITE))
+ f2fs_add_ino_entry(sbi, inode->i_ino, UPDATE_INO);
+ }
if (is_inode_flag_set(inode, FI_FREE_NID)) {
- alloc_nid_failed(sbi, inode->i_ino);
+ f2fs_alloc_nid_failed(sbi, inode->i_ino);
clear_inode_flag(inode, FI_FREE_NID);
+ } else {
+ /*
+ * If xattr nid is corrupted, we can reach out error condition,
+ * err & !f2fs_exist_written_data(sbi, inode->i_ino, ORPHAN_INO)).
+ * In that case, f2fs_check_nid_range() is enough to give a clue.
+ */
}
- f2fs_bug_on(sbi, err &&
- !exist_written_data(sbi, inode->i_ino, ORPHAN_INO));
out_clear:
- fscrypt_put_encryption_info(inode, NULL);
+ fscrypt_put_encryption_info(inode);
clear_inode(inode);
}
/* caller should call f2fs_lock_op() */
-void handle_failed_inode(struct inode *inode)
+void f2fs_handle_failed_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct node_info ni;
+ int err;
+
+ /*
+ * clear nlink of inode in order to release resource of inode
+ * immediately.
+ */
+ clear_nlink(inode);
+
+ /*
+ * we must call this to avoid inode being remained as dirty, resulting
+ * in a panic when flushing dirty inodes in gdirty_list.
+ */
+ f2fs_update_inode_page(inode);
+ f2fs_inode_synced(inode);
/* don't make bad inode, since it becomes a regular file. */
unlock_new_inode(inode);
@@ -490,22 +763,29 @@ void handle_failed_inode(struct inode *inode)
* so we can prevent losing this orphan when encoutering checkpoint
* and following suddenly power-off.
*/
- get_node_info(sbi, inode->i_ino, &ni);
+ err = f2fs_get_node_info(sbi, inode->i_ino, &ni);
+ if (err) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "May loss orphan inode, run fsck to fix.");
+ goto out;
+ }
if (ni.blk_addr != NULL_ADDR) {
- int err = acquire_orphan_inode(sbi);
+ err = f2fs_acquire_orphan_inode(sbi);
if (err) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_msg(sbi->sb, KERN_WARNING,
"Too many orphan inodes, run fsck to fix.");
} else {
- add_orphan_inode(inode);
+ f2fs_add_orphan_inode(inode);
}
- alloc_nid_done(sbi, inode->i_ino);
+ f2fs_alloc_nid_done(sbi, inode->i_ino);
} else {
set_inode_flag(inode, FI_FREE_NID);
}
+out:
f2fs_unlock_op(sbi);
/* iput will drop the inode object */
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index ccb99d5cfd8b..98fe5a72bb3d 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/namei.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
@@ -15,9 +12,11 @@
#include <linux/ctype.h>
#include <linux/dcache.h>
#include <linux/namei.h>
+#include <linux/quotaops.h>
#include "f2fs.h"
#include "node.h"
+#include "segment.h"
#include "xattr.h"
#include "acl.h"
#include <trace/events/f2fs.h>
@@ -28,6 +27,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
nid_t ino;
struct inode *inode;
bool nid_free = false;
+ int xattr_size = 0;
int err;
inode = new_inode(dir->i_sb);
@@ -35,46 +35,91 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
return ERR_PTR(-ENOMEM);
f2fs_lock_op(sbi);
- if (!alloc_nid(sbi, &ino)) {
+ if (!f2fs_alloc_nid(sbi, &ino)) {
f2fs_unlock_op(sbi);
err = -ENOSPC;
goto fail;
}
f2fs_unlock_op(sbi);
+ nid_free = true;
+
inode_init_owner(inode, dir, mode);
inode->i_ino = ino;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode->i_ctime =
+ F2FS_I(inode)->i_crtime = current_time(inode);
inode->i_generation = sbi->s_next_generation++;
+ if (S_ISDIR(inode->i_mode))
+ F2FS_I(inode)->i_current_depth = 1;
+
err = insert_inode_locked(inode);
if (err) {
err = -EINVAL;
- nid_free = true;
goto fail;
}
+ if (f2fs_sb_has_project_quota(sbi) &&
+ (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL))
+ F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid;
+ else
+ F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns,
+ F2FS_DEF_PROJID);
+
+ err = dquot_initialize(inode);
+ if (err)
+ goto fail_drop;
+
+ set_inode_flag(inode, FI_NEW_INODE);
+
/* If the directory encrypted, then we should encrypt the inode. */
- if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode))
+ if ((f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
+ f2fs_may_encrypt(inode))
f2fs_set_encrypted_inode(inode);
- set_inode_flag(inode, FI_NEW_INODE);
+ if (f2fs_sb_has_extra_attr(sbi)) {
+ set_inode_flag(inode, FI_EXTRA_ATTR);
+ F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE;
+ }
if (test_opt(sbi, INLINE_XATTR))
set_inode_flag(inode, FI_INLINE_XATTR);
+
if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
set_inode_flag(inode, FI_INLINE_DATA);
if (f2fs_may_inline_dentry(inode))
set_inode_flag(inode, FI_INLINE_DENTRY);
+ if (f2fs_sb_has_flexible_inline_xattr(sbi)) {
+ f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode));
+ if (f2fs_has_inline_xattr(inode))
+ xattr_size = F2FS_OPTION(sbi).inline_xattr_size;
+ /* Otherwise, will be 0 */
+ } else if (f2fs_has_inline_xattr(inode) ||
+ f2fs_has_inline_dentry(inode)) {
+ xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
+ }
+ F2FS_I(inode)->i_inline_xattr_size = xattr_size;
+
f2fs_init_extent_tree(inode, NULL);
stat_inc_inline_xattr(inode);
stat_inc_inline_inode(inode);
stat_inc_inline_dir(inode);
+ F2FS_I(inode)->i_flags =
+ f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED);
+
+ if (S_ISDIR(inode->i_mode))
+ F2FS_I(inode)->i_flags |= F2FS_INDEX_FL;
+
+ if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL)
+ set_inode_flag(inode, FI_PROJ_INHERIT);
+
+ f2fs_set_inode_flags(inode);
+
trace_f2fs_new_inode(inode, 0);
return inode;
@@ -85,9 +130,19 @@ fail:
set_inode_flag(inode, FI_FREE_NID);
iput(inode);
return ERR_PTR(err);
+fail_drop:
+ trace_f2fs_new_inode(inode, err);
+ dquot_drop(inode);
+ inode->i_flags |= S_NOQUOTA;
+ if (nid_free)
+ set_inode_flag(inode, FI_FREE_NID);
+ clear_nlink(inode);
+ unlock_new_inode(inode);
+ iput(inode);
+ return ERR_PTR(err);
}
-static int is_multimedia_file(const unsigned char *s, const char *sub)
+static int is_extension_exist(const unsigned char *s, const char *sub)
{
size_t slen = strlen(s);
size_t sublen = strlen(sub);
@@ -113,19 +168,97 @@ static int is_multimedia_file(const unsigned char *s, const char *sub)
/*
* Set multimedia files as cold files for hot/cold data separation
*/
-static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
+static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode,
const unsigned char *name)
{
- int i;
- __u8 (*extlist)[8] = sbi->raw_super->extension_list;
+ __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
+ int i, cold_count, hot_count;
+
+ down_read(&sbi->sb_lock);
+
+ cold_count = le32_to_cpu(sbi->raw_super->extension_count);
+ hot_count = sbi->raw_super->hot_ext_count;
- int count = le32_to_cpu(sbi->raw_super->extension_count);
- for (i = 0; i < count; i++) {
- if (is_multimedia_file(name, extlist[i])) {
- file_set_cold(inode);
+ for (i = 0; i < cold_count + hot_count; i++) {
+ if (is_extension_exist(name, extlist[i]))
break;
- }
}
+
+ up_read(&sbi->sb_lock);
+
+ if (i == cold_count + hot_count)
+ return;
+
+ if (i < cold_count)
+ file_set_cold(inode);
+ else
+ file_set_hot(inode);
+}
+
+int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name,
+ bool hot, bool set)
+{
+ __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
+ int cold_count = le32_to_cpu(sbi->raw_super->extension_count);
+ int hot_count = sbi->raw_super->hot_ext_count;
+ int total_count = cold_count + hot_count;
+ int start, count;
+ int i;
+
+ if (set) {
+ if (total_count == F2FS_MAX_EXTENSION)
+ return -EINVAL;
+ } else {
+ if (!hot && !cold_count)
+ return -EINVAL;
+ if (hot && !hot_count)
+ return -EINVAL;
+ }
+
+ if (hot) {
+ start = cold_count;
+ count = total_count;
+ } else {
+ start = 0;
+ count = cold_count;
+ }
+
+ for (i = start; i < count; i++) {
+ if (strcmp(name, extlist[i]))
+ continue;
+
+ if (set)
+ return -EINVAL;
+
+ memcpy(extlist[i], extlist[i + 1],
+ F2FS_EXTENSION_LEN * (total_count - i - 1));
+ memset(extlist[total_count - 1], 0, F2FS_EXTENSION_LEN);
+ if (hot)
+ sbi->raw_super->hot_ext_count = hot_count - 1;
+ else
+ sbi->raw_super->extension_count =
+ cpu_to_le32(cold_count - 1);
+ return 0;
+ }
+
+ if (!set)
+ return -EINVAL;
+
+ if (hot) {
+ memcpy(extlist[count], name, strlen(name));
+ sbi->raw_super->hot_ext_count = hot_count + 1;
+ } else {
+ char buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN];
+
+ memcpy(buf, &extlist[cold_count],
+ F2FS_EXTENSION_LEN * hot_count);
+ memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN);
+ memcpy(extlist[cold_count], name, strlen(name));
+ memcpy(&extlist[cold_count + 1], buf,
+ F2FS_EXTENSION_LEN * hot_count);
+ sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1);
+ }
+ return 0;
}
static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
@@ -136,35 +269,45 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
nid_t ino = 0;
int err;
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
+ err = f2fs_is_checkpoint_ready(sbi);
+ if (err)
+ return err;
+
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
+
inode = f2fs_new_inode(dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
if (!test_opt(sbi, DISABLE_EXT_IDENTIFY))
- set_cold_files(sbi, inode, dentry->d_name.name);
+ set_file_temperature(sbi, inode, dentry->d_name.name);
inode->i_op = &f2fs_file_inode_operations;
inode->i_fop = &f2fs_file_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
ino = inode->i_ino;
- f2fs_balance_fs(sbi, true);
-
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
goto out;
f2fs_unlock_op(sbi);
- alloc_nid_done(sbi, ino);
+ f2fs_alloc_nid_done(sbi, ino);
d_instantiate_new(dentry, inode);
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
+
+ f2fs_balance_fs(sbi, true);
return 0;
out:
- handle_failed_inode(inode);
+ f2fs_handle_failed_inode(inode);
return err;
}
@@ -175,9 +318,24 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
int err;
- if (f2fs_encrypted_inode(dir) &&
- !fscrypt_has_permitted_context(dir, inode))
- return -EPERM;
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
+ err = f2fs_is_checkpoint_ready(sbi);
+ if (err)
+ return err;
+
+ err = fscrypt_prepare_link(old_dentry, dir, dentry);
+ if (err)
+ return err;
+
+ if (is_inode_flag_set(dir, FI_PROJ_INHERIT) &&
+ (!projid_eq(F2FS_I(dir)->i_projid,
+ F2FS_I(old_dentry->d_inode)->i_projid)))
+ return -EXDEV;
+
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
f2fs_balance_fs(sbi, true);
@@ -232,32 +390,33 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
return 0;
}
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
+
f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
de = f2fs_find_entry(dir, &dot, &page);
if (de) {
- f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
} else if (IS_ERR(page)) {
err = PTR_ERR(page);
goto out;
} else {
- err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR);
+ err = f2fs_do_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR);
if (err)
goto out;
}
de = f2fs_find_entry(dir, &dotdot, &page);
- if (de) {
- f2fs_dentry_kunmap(dir, page);
+ if (de)
f2fs_put_page(page, 0);
- } else if (IS_ERR(page)) {
+ else if (IS_ERR(page))
err = PTR_ERR(page);
- } else {
- err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR);
- }
+ else
+ err = f2fs_do_add_link(dir, &dotdot, NULL, pino, S_IFDIR);
out:
if (!err)
clear_inode_flag(dir, FI_INLINE_DOTS);
@@ -272,66 +431,70 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
struct inode *inode = NULL;
struct f2fs_dir_entry *de;
struct page *page;
- nid_t ino;
+ struct dentry *new;
+ nid_t ino = -1;
int err = 0;
unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir));
- if (f2fs_encrypted_inode(dir)) {
- int res = fscrypt_get_encryption_info(dir);
+ trace_f2fs_lookup_start(dir, dentry, flags);
- /*
- * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is
- * created while the directory was encrypted and we
- * don't have access to the key.
- */
- if (fscrypt_has_encryption_key(dir))
- fscrypt_set_encrypted_dentry(dentry);
- fscrypt_set_d_op(dentry);
- if (res && res != -ENOKEY)
- return ERR_PTR(res);
- }
+ err = fscrypt_prepare_lookup(dir, dentry, flags);
+ if (err)
+ goto out;
- if (dentry->d_name.len > F2FS_NAME_LEN)
- return ERR_PTR(-ENAMETOOLONG);
+ if (dentry->d_name.len > F2FS_NAME_LEN) {
+ err = -ENAMETOOLONG;
+ goto out;
+ }
de = f2fs_find_entry(dir, &dentry->d_name, &page);
if (!de) {
- if (IS_ERR(page))
- return (struct dentry *)page;
- return d_splice_alias(inode, dentry);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto out;
+ }
+ goto out_splice;
}
ino = le32_to_cpu(de->ino);
- f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
inode = f2fs_iget(dir->i_sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out;
+ }
if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) {
err = __recover_dot_dentries(dir, root_ino);
if (err)
- goto err_out;
+ goto out_iput;
}
if (f2fs_has_inline_dots(inode)) {
err = __recover_dot_dentries(inode, dir->i_ino);
if (err)
- goto err_out;
+ goto out_iput;
}
- if (!IS_ERR(inode) && f2fs_encrypted_inode(dir) &&
- (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
- !fscrypt_has_permitted_context(dir, inode)) {
- bool nokey = f2fs_encrypted_inode(inode) &&
- !fscrypt_has_encryption_key(inode);
- err = nokey ? -ENOKEY : -EPERM;
- goto err_out;
+ if (f2fs_encrypted_inode(dir) &&
+ (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
+ !fscrypt_has_permitted_context(dir, inode)) {
+ f2fs_msg(inode->i_sb, KERN_WARNING,
+ "Inconsistent encryption contexts: %lu/%lu",
+ dir->i_ino, inode->i_ino);
+ err = -EPERM;
+ goto out_iput;
}
- return d_splice_alias(inode, dentry);
-
-err_out:
+out_splice:
+ new = d_splice_alias(inode, dentry);
+ if (IS_ERR(new))
+ err = PTR_ERR(new);
+ trace_f2fs_lookup_end(dir, dentry, ino, err);
+ return new;
+out_iput:
iput(inode);
+out:
+ trace_f2fs_lookup_end(dir, dentry, ino, err);
return ERR_PTR(err);
}
@@ -345,6 +508,16 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
trace_f2fs_unlink_enter(dir, dentry);
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
+
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
+
de = f2fs_find_entry(dir, &dentry->d_name, &page);
if (!de) {
if (IS_ERR(page))
@@ -355,10 +528,9 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
- err = acquire_orphan_inode(sbi);
+ err = f2fs_acquire_orphan_inode(sbi);
if (err) {
f2fs_unlock_op(sbi);
- f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
goto fail;
}
@@ -392,73 +564,45 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
size_t len = strlen(symname);
- struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1);
- struct fscrypt_symlink_data *sd = NULL;
+ struct fscrypt_str disk_link;
int err;
- if (f2fs_encrypted_inode(dir)) {
- err = fscrypt_get_encryption_info(dir);
- if (err)
- return err;
-
- if (!fscrypt_has_encryption_key(dir))
- return -ENOKEY;
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
+ err = f2fs_is_checkpoint_ready(sbi);
+ if (err)
+ return err;
- disk_link.len = (fscrypt_fname_encrypted_size(dir, len) +
- sizeof(struct fscrypt_symlink_data));
- }
+ err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize,
+ &disk_link);
+ if (err)
+ return err;
- if (disk_link.len > dir->i_sb->s_blocksize)
- return -ENAMETOOLONG;
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (f2fs_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
else
inode->i_op = &f2fs_symlink_inode_operations;
inode_nohighmem(inode);
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- f2fs_balance_fs(sbi, true);
-
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
- goto out;
+ goto out_f2fs_handle_failed_inode;
f2fs_unlock_op(sbi);
- alloc_nid_done(sbi, inode->i_ino);
-
- if (f2fs_encrypted_inode(inode)) {
- struct qstr istr = QSTR_INIT(symname, len);
- struct fscrypt_str ostr;
-
- sd = kzalloc(disk_link.len, GFP_NOFS);
- if (!sd) {
- err = -ENOMEM;
- goto err_out;
- }
-
- err = fscrypt_get_encryption_info(inode);
- if (err)
- goto err_out;
-
- if (!fscrypt_has_encryption_key(inode)) {
- err = -ENOKEY;
- goto err_out;
- }
-
- ostr.name = sd->encrypted_path;
- ostr.len = disk_link.len;
- err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr);
- if (err)
- goto err_out;
+ f2fs_alloc_nid_done(sbi, inode->i_ino);
- sd->len = cpu_to_le16(ostr.len);
- disk_link.name = (char *)sd;
- }
+ err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link);
+ if (err)
+ goto err_out;
err = page_symlink(inode, disk_link.name, disk_link.len);
@@ -484,10 +628,14 @@ err_out:
f2fs_unlink(dir, dentry);
}
- kfree(sd);
- return err;
-out:
- handle_failed_inode(inode);
+ f2fs_balance_fs(sbi, true);
+ goto out_free_encrypted_link;
+
+out_f2fs_handle_failed_inode:
+ f2fs_handle_failed_inode(inode);
+out_free_encrypted_link:
+ if (disk_link.name != (unsigned char *)symname)
+ kvfree(disk_link.name);
return err;
}
@@ -497,6 +645,13 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct inode *inode;
int err;
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
+
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
+
inode = f2fs_new_inode(dir, S_IFDIR | mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -504,9 +659,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
-
- f2fs_balance_fs(sbi, true);
+ inode_nohighmem(inode);
set_inode_flag(inode, FI_INC_LINK);
f2fs_lock_op(sbi);
@@ -515,17 +668,19 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
goto out_fail;
f2fs_unlock_op(sbi);
- alloc_nid_done(sbi, inode->i_ino);
+ f2fs_alloc_nid_done(sbi, inode->i_ino);
d_instantiate_new(dentry, inode);
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
+
+ f2fs_balance_fs(sbi, true);
return 0;
out_fail:
clear_inode_flag(inode, FI_INC_LINK);
- handle_failed_inode(inode);
+ f2fs_handle_failed_inode(inode);
return err;
}
@@ -544,6 +699,16 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
struct inode *inode;
int err = 0;
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
+ err = f2fs_is_checkpoint_ready(sbi);
+ if (err)
+ return err;
+
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
+
inode = f2fs_new_inode(dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -551,23 +716,23 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &f2fs_special_inode_operations;
- f2fs_balance_fs(sbi, true);
-
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
goto out;
f2fs_unlock_op(sbi);
- alloc_nid_done(sbi, inode->i_ino);
+ f2fs_alloc_nid_done(sbi, inode->i_ino);
d_instantiate_new(dentry, inode);
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
+
+ f2fs_balance_fs(sbi, true);
return 0;
out:
- handle_failed_inode(inode);
+ f2fs_handle_failed_inode(inode);
return err;
}
@@ -578,6 +743,10 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
struct inode *inode;
int err;
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
+
inode = f2fs_new_inode(dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -591,10 +760,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
inode->i_mapping->a_ops = &f2fs_dblock_aops;
}
- f2fs_balance_fs(sbi, true);
-
f2fs_lock_op(sbi);
- err = acquire_orphan_inode(sbi);
+ err = f2fs_acquire_orphan_inode(sbi);
if (err)
goto out;
@@ -606,8 +773,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
* add this non-linked tmpfile to orphan list, in this way we could
* remove all unused data of tmpfile after abnormal power-off.
*/
- add_orphan_inode(inode);
- alloc_nid_done(sbi, inode->i_ino);
+ f2fs_add_orphan_inode(inode);
+ f2fs_alloc_nid_done(sbi, inode->i_ino);
if (whiteout) {
f2fs_i_links_write(inode, false);
@@ -618,18 +785,25 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
/* link_count was changed by d_tmpfile as well. */
f2fs_unlock_op(sbi);
unlock_new_inode(inode);
+
+ f2fs_balance_fs(sbi, true);
return 0;
release_out:
- release_orphan_inode(sbi);
+ f2fs_release_orphan_inode(sbi);
out:
- handle_failed_inode(inode);
+ f2fs_handle_failed_inode(inode);
return err;
}
static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
- if (f2fs_encrypted_inode(dir)) {
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
+
+ if (f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) {
int err = fscrypt_get_encryption_info(dir);
if (err)
return err;
@@ -640,6 +814,9 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout)
{
+ if (unlikely(f2fs_cp_error(F2FS_I_SB(dir))))
+ return -EIO;
+
return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout);
}
@@ -657,20 +834,34 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct f2fs_dir_entry *old_entry;
struct f2fs_dir_entry *new_entry;
bool is_old_inline = f2fs_has_inline_dentry(old_dir);
- int err = -ENOENT;
+ int err;
- if ((f2fs_encrypted_inode(old_dir) &&
- !fscrypt_has_encryption_key(old_dir)) ||
- (f2fs_encrypted_inode(new_dir) &&
- !fscrypt_has_encryption_key(new_dir)))
- return -ENOKEY;
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
+ err = f2fs_is_checkpoint_ready(sbi);
+ if (err)
+ return err;
- if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) &&
- !fscrypt_has_permitted_context(new_dir, old_inode)) {
- err = -EPERM;
+ if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) &&
+ (!projid_eq(F2FS_I(new_dir)->i_projid,
+ F2FS_I(old_dentry->d_inode)->i_projid)))
+ return -EXDEV;
+
+ err = dquot_initialize(old_dir);
+ if (err)
goto out;
+
+ err = dquot_initialize(new_dir);
+ if (err)
+ goto out;
+
+ if (new_inode) {
+ err = dquot_initialize(new_inode);
+ if (err)
+ goto out;
}
+ err = -ENOENT;
old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
if (!old_entry) {
if (IS_ERR(old_page))
@@ -712,17 +903,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_lock_op(sbi);
- err = acquire_orphan_inode(sbi);
+ err = f2fs_acquire_orphan_inode(sbi);
if (err)
goto put_out_dir;
- err = update_dent_inode(old_inode, new_inode,
- &new_dentry->d_name);
- if (err) {
- release_orphan_inode(sbi);
- goto put_out_dir;
- }
-
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
new_inode->i_ctime = current_time(new_inode);
@@ -733,9 +917,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
up_write(&F2FS_I(new_inode)->i_sem);
if (!new_inode->i_nlink)
- add_orphan_inode(new_inode);
+ f2fs_add_orphan_inode(new_inode);
else
- release_orphan_inode(sbi);
+ f2fs_release_orphan_inode(sbi);
} else {
f2fs_balance_fs(sbi, true);
@@ -774,13 +958,14 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
down_write(&F2FS_I(old_inode)->i_sem);
- file_lost_pino(old_inode);
- if (new_inode && file_enc_name(new_inode))
- file_set_enc_name(old_inode);
+ if (!old_dir_entry || whiteout)
+ file_lost_pino(old_inode);
+ else
+ F2FS_I(old_inode)->i_pino = new_dir->i_ino;
up_write(&F2FS_I(old_inode)->i_sem);
old_inode->i_ctime = current_time(old_inode);
- f2fs_mark_inode_dirty_sync(old_inode);
+ f2fs_mark_inode_dirty_sync(old_inode, false);
f2fs_delete_entry(old_entry, old_page, old_dir, NULL);
@@ -795,38 +980,39 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
if (old_dir_entry) {
- if (old_dir != new_dir && !whiteout) {
+ if (old_dir != new_dir && !whiteout)
f2fs_set_link(old_inode, old_dir_entry,
old_dir_page, new_dir);
- } else {
- f2fs_dentry_kunmap(old_inode, old_dir_page);
+ else
f2fs_put_page(old_dir_page, 0);
- }
f2fs_i_links_write(old_dir, false);
}
+ if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) {
+ f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO);
+ if (S_ISDIR(old_inode->i_mode))
+ f2fs_add_ino_entry(sbi, old_inode->i_ino,
+ TRANS_DIR_INO);
+ }
f2fs_unlock_op(sbi);
if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
f2fs_sync_fs(sbi->sb, 1);
+
+ f2fs_update_time(sbi, REQ_TIME);
return 0;
put_out_dir:
f2fs_unlock_op(sbi);
- if (new_page) {
- f2fs_dentry_kunmap(new_dir, new_page);
+ if (new_page)
f2fs_put_page(new_page, 0);
- }
out_whiteout:
if (whiteout)
iput(whiteout);
out_dir:
- if (old_dir_entry) {
- f2fs_dentry_kunmap(old_inode, old_dir_page);
+ if (old_dir_entry)
f2fs_put_page(old_dir_page, 0);
- }
out_old:
- f2fs_dentry_kunmap(old_dir, old_page);
f2fs_put_page(old_page, 0);
out:
return err;
@@ -843,20 +1029,31 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL;
struct f2fs_dir_entry *old_entry, *new_entry;
int old_nlink = 0, new_nlink = 0;
- int err = -ENOENT;
+ int err;
+
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
+ err = f2fs_is_checkpoint_ready(sbi);
+ if (err)
+ return err;
- if ((f2fs_encrypted_inode(old_dir) &&
- !fscrypt_has_encryption_key(old_dir)) ||
- (f2fs_encrypted_inode(new_dir) &&
- !fscrypt_has_encryption_key(new_dir)))
- return -ENOKEY;
+ if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) &&
+ !projid_eq(F2FS_I(new_dir)->i_projid,
+ F2FS_I(old_dentry->d_inode)->i_projid)) ||
+ (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) &&
+ !projid_eq(F2FS_I(old_dir)->i_projid,
+ F2FS_I(new_dentry->d_inode)->i_projid)))
+ return -EXDEV;
- if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) &&
- (old_dir != new_dir) &&
- (!fscrypt_has_permitted_context(new_dir, old_inode) ||
- !fscrypt_has_permitted_context(old_dir, new_inode)))
- return -EPERM;
+ err = dquot_initialize(old_dir);
+ if (err)
+ goto out;
+ err = dquot_initialize(new_dir);
+ if (err)
+ goto out;
+
+ err = -ENOENT;
old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
if (!old_entry) {
if (IS_ERR(old_page))
@@ -904,8 +1101,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
old_nlink = old_dir_entry ? -1 : 1;
new_nlink = -old_nlink;
err = -EMLINK;
- if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) ||
- (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX))
+ if ((old_nlink > 0 && old_dir->i_nlink >= F2FS_LINK_MAX) ||
+ (new_nlink > 0 && new_dir->i_nlink >= F2FS_LINK_MAX))
goto out_new_dir;
}
@@ -913,18 +1110,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_lock_op(sbi);
- err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name);
- if (err)
- goto out_unlock;
- if (file_enc_name(new_inode))
- file_set_enc_name(old_inode);
-
- err = update_dent_inode(new_inode, old_inode, &old_dentry->d_name);
- if (err)
- goto out_undo;
- if (file_enc_name(old_inode))
- file_set_enc_name(new_inode);
-
/* update ".." directory entry info of old dentry */
if (old_dir_entry)
f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir);
@@ -946,7 +1131,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_i_links_write(old_dir, old_nlink > 0);
up_write(&F2FS_I(old_dir)->i_sem);
}
- f2fs_mark_inode_dirty_sync(old_dir);
+ f2fs_mark_inode_dirty_sync(old_dir, false);
/* update directory entry info of new dir inode */
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
@@ -961,36 +1146,31 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_i_links_write(new_dir, new_nlink > 0);
up_write(&F2FS_I(new_dir)->i_sem);
}
- f2fs_mark_inode_dirty_sync(new_dir);
+ f2fs_mark_inode_dirty_sync(new_dir, false);
+
+ if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) {
+ f2fs_add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO);
+ f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO);
+ }
f2fs_unlock_op(sbi);
if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
f2fs_sync_fs(sbi->sb, 1);
+
+ f2fs_update_time(sbi, REQ_TIME);
return 0;
-out_undo:
- /*
- * Still we may fail to recover name info of f2fs_inode here
- * Drop it, once its name is set as encrypted
- */
- update_dent_inode(old_inode, old_inode, &old_dentry->d_name);
-out_unlock:
- f2fs_unlock_op(sbi);
out_new_dir:
if (new_dir_entry) {
- f2fs_dentry_kunmap(new_inode, new_dir_page);
f2fs_put_page(new_dir_page, 0);
}
out_old_dir:
if (old_dir_entry) {
- f2fs_dentry_kunmap(old_inode, old_dir_page);
f2fs_put_page(old_dir_page, 0);
}
out_new:
- f2fs_dentry_kunmap(new_dir, new_page);
f2fs_put_page(new_page, 0);
out_old:
- f2fs_dentry_kunmap(old_dir, old_page);
f2fs_put_page(old_page, 0);
out:
return err;
@@ -1000,9 +1180,16 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
+ int err;
+
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
+ err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry,
+ flags);
+ if (err)
+ return err;
+
if (flags & RENAME_EXCHANGE) {
return f2fs_cross_rename(old_dir, old_dentry,
new_dir, new_dentry);
@@ -1018,68 +1205,20 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- struct page *cpage = NULL;
- char *caddr, *paddr = NULL;
- struct fscrypt_str cstr = FSTR_INIT(NULL, 0);
- struct fscrypt_str pstr = FSTR_INIT(NULL, 0);
- struct fscrypt_symlink_data *sd;
- u32 max_size = inode->i_sb->s_blocksize;
- int res;
+ struct page *page;
+ const char *target;
if (!dentry)
return ERR_PTR(-ECHILD);
- res = fscrypt_get_encryption_info(inode);
- if (res)
- return ERR_PTR(res);
-
- cpage = read_mapping_page(inode->i_mapping, 0, NULL);
- if (IS_ERR(cpage))
- return ERR_CAST(cpage);
- caddr = page_address(cpage);
-
- /* Symlink is encrypted */
- sd = (struct fscrypt_symlink_data *)caddr;
- cstr.name = sd->encrypted_path;
- cstr.len = le16_to_cpu(sd->len);
-
- /* this is broken symlink case */
- if (unlikely(cstr.len == 0)) {
- res = -ENOENT;
- goto errout;
- }
-
- if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) {
- /* Symlink data on the disk is corrupted */
- res = -EIO;
- goto errout;
- }
- res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr);
- if (res)
- goto errout;
-
- res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr);
- if (res)
- goto errout;
-
- /* this is broken symlink case */
- if (unlikely(pstr.name[0] == 0)) {
- res = -ENOENT;
- goto errout;
- }
-
- paddr = pstr.name;
-
- /* Null-terminate the name */
- paddr[pstr.len] = '\0';
+ page = read_mapping_page(inode->i_mapping, 0, NULL);
+ if (IS_ERR(page))
+ return ERR_CAST(page);
- put_page(cpage);
- set_delayed_call(done, kfree_link, paddr);
- return paddr;
-errout:
- fscrypt_fname_free_buffer(&pstr);
- put_page(cpage);
- return ERR_PTR(res);
+ target = fscrypt_get_symlink(inode, page_address(page),
+ inode->i_sb->s_blocksize, done);
+ put_page(page);
+ return target;
}
const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index f4fe54047fb7..46fdaebd6185 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/node.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
@@ -19,16 +16,33 @@
#include "f2fs.h"
#include "node.h"
#include "segment.h"
+#include "xattr.h"
#include "trace.h"
#include <trace/events/f2fs.h>
-#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock)
+#define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock)
static struct kmem_cache *nat_entry_slab;
static struct kmem_cache *free_nid_slab;
static struct kmem_cache *nat_entry_set_slab;
+static struct kmem_cache *fsync_node_entry_slab;
-bool available_free_memory(struct f2fs_sb_info *sbi, int type)
+/*
+ * Check whether the given nid is within node id range.
+ */
+int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
+{
+ if (unlikely(nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid)) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: out-of-range nid=%x, run fsck to fix.",
+ __func__, nid);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct sysinfo val;
@@ -45,8 +59,8 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
* give 25%, 25%, 50%, 50%, 50% memory for each components respectively
*/
if (type == FREE_NIDS) {
- mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >>
- PAGE_SHIFT;
+ mem_size = (nm_i->nid_cnt[FREE_NID] *
+ sizeof(struct free_nid)) >> PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
} else if (type == NAT_ENTRIES) {
mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
@@ -62,9 +76,10 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
} else if (type == INO_ENTRIES) {
int i;
- for (i = 0; i <= UPDATE_INO; i++)
- mem_size += (sbi->im[i].ino_num *
- sizeof(struct ino_entry)) >> PAGE_SHIFT;
+ for (i = 0; i < MAX_INO_ENTRY; i++)
+ mem_size += sbi->im[i].ino_num *
+ sizeof(struct ino_entry);
+ mem_size >>= PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
} else if (type == EXTENT_CACHE) {
mem_size = (atomic_read(&sbi->total_ext_tree) *
@@ -72,6 +87,10 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
atomic_read(&sbi->total_ext_node) *
sizeof(struct extent_node)) >> PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+ } else if (type == INMEM_PAGES) {
+ /* it allows 20% / total_ram for inmemory pages */
+ mem_size = get_pages(sbi, F2FS_INMEM_PAGES);
+ res = mem_size < (val.totalram / 5);
} else {
if (!sbi->sb->s_bdi->wb.dirty_exceeded)
return true;
@@ -81,44 +100,35 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
static void clear_node_page_dirty(struct page *page)
{
- struct address_space *mapping = page->mapping;
- unsigned int long flags;
-
if (PageDirty(page)) {
- spin_lock_irqsave(&mapping->tree_lock, flags);
- radix_tree_tag_clear(&mapping->page_tree,
- page_index(page),
- PAGECACHE_TAG_DIRTY);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
-
+ f2fs_clear_radix_tree_dirty_tag(page);
clear_page_dirty_for_io(page);
- dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
+ dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
}
ClearPageUptodate(page);
}
static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
{
- pgoff_t index = current_nat_addr(sbi, nid);
- return get_meta_page(sbi, index);
+ return f2fs_get_meta_page_nofail(sbi, current_nat_addr(sbi, nid));
}
static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
{
struct page *src_page;
struct page *dst_page;
- pgoff_t src_off;
pgoff_t dst_off;
void *src_addr;
void *dst_addr;
struct f2fs_nm_info *nm_i = NM_I(sbi);
- src_off = current_nat_addr(sbi, nid);
- dst_off = next_nat_addr(sbi, src_off);
+ dst_off = next_nat_addr(sbi, current_nat_addr(sbi, nid));
/* get current nat block page with lock */
- src_page = get_meta_page(sbi, src_off);
- dst_page = grab_meta_page(sbi, dst_off);
+ src_page = get_current_nat_page(sbi, nid);
+ if (IS_ERR(src_page))
+ return src_page;
+ dst_page = f2fs_grab_meta_page(sbi, dst_off);
f2fs_bug_on(sbi, PageDirty(src_page));
src_addr = page_address(src_page);
@@ -132,9 +142,61 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
return dst_page;
}
+static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail)
+{
+ struct nat_entry *new;
+
+ if (no_fail)
+ new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO);
+ else
+ new = kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO);
+ if (new) {
+ nat_set_nid(new, nid);
+ nat_reset_flag(new);
+ }
+ return new;
+}
+
+static void __free_nat_entry(struct nat_entry *e)
+{
+ kmem_cache_free(nat_entry_slab, e);
+}
+
+/* must be locked by nat_tree_lock */
+static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i,
+ struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail)
+{
+ if (no_fail)
+ f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne);
+ else if (radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne))
+ return NULL;
+
+ if (raw_ne)
+ node_info_from_raw_nat(&ne->ni, raw_ne);
+
+ spin_lock(&nm_i->nat_list_lock);
+ list_add_tail(&ne->list, &nm_i->nat_entries);
+ spin_unlock(&nm_i->nat_list_lock);
+
+ nm_i->nat_cnt++;
+ return ne;
+}
+
static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
{
- return radix_tree_lookup(&nm_i->nat_root, n);
+ struct nat_entry *ne;
+
+ ne = radix_tree_lookup(&nm_i->nat_root, n);
+
+ /* for recent accessed nat entry, move it to tail of lru list */
+ if (ne && !get_nat_flag(ne, IS_DIRTY)) {
+ spin_lock(&nm_i->nat_list_lock);
+ if (!list_empty(&ne->list))
+ list_move_tail(&ne->list, &nm_i->nat_entries);
+ spin_unlock(&nm_i->nat_list_lock);
+ }
+
+ return ne;
}
static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
@@ -145,21 +207,17 @@ static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
{
- list_del(&e->list);
radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
nm_i->nat_cnt--;
- kmem_cache_free(nat_entry_slab, e);
+ __free_nat_entry(e);
}
-static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
- struct nat_entry *ne)
+static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
+ struct nat_entry *ne)
{
nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
struct nat_entry_set *head;
- if (get_nat_flag(ne, IS_DIRTY))
- return;
-
head = radix_tree_lookup(&nm_i->nat_set_root, set);
if (!head) {
head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS);
@@ -170,25 +228,53 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
head->entry_cnt = 0;
f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
}
- list_move_tail(&ne->list, &head->entry_list);
+ return head;
+}
+
+static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
+ struct nat_entry *ne)
+{
+ struct nat_entry_set *head;
+ bool new_ne = nat_get_blkaddr(ne) == NEW_ADDR;
+
+ if (!new_ne)
+ head = __grab_nat_entry_set(nm_i, ne);
+
+ /*
+ * update entry_cnt in below condition:
+ * 1. update NEW_ADDR to valid block address;
+ * 2. update old block address to new one;
+ */
+ if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) ||
+ !get_nat_flag(ne, IS_DIRTY)))
+ head->entry_cnt++;
+
+ set_nat_flag(ne, IS_PREALLOC, new_ne);
+
+ if (get_nat_flag(ne, IS_DIRTY))
+ goto refresh_list;
+
nm_i->dirty_nat_cnt++;
- head->entry_cnt++;
set_nat_flag(ne, IS_DIRTY, true);
+refresh_list:
+ spin_lock(&nm_i->nat_list_lock);
+ if (new_ne)
+ list_del_init(&ne->list);
+ else
+ list_move_tail(&ne->list, &head->entry_list);
+ spin_unlock(&nm_i->nat_list_lock);
}
static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
- struct nat_entry *ne)
+ struct nat_entry_set *set, struct nat_entry *ne)
{
- nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
- struct nat_entry_set *head;
+ spin_lock(&nm_i->nat_list_lock);
+ list_move_tail(&ne->list, &nm_i->nat_entries);
+ spin_unlock(&nm_i->nat_list_lock);
- head = radix_tree_lookup(&nm_i->nat_set_root, set);
- if (head) {
- list_move_tail(&ne->list, &nm_i->nat_entries);
- set_nat_flag(ne, IS_DIRTY, false);
- head->entry_cnt--;
- nm_i->dirty_nat_cnt--;
- }
+ set_nat_flag(ne, IS_DIRTY, false);
+ set->entry_cnt--;
+ nm_i->dirty_nat_cnt--;
}
static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
@@ -198,7 +284,73 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
start, nr);
}
-int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
+bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page)
+{
+ return NODE_MAPPING(sbi) == page->mapping &&
+ IS_DNODE(page) && is_cold_node(page);
+}
+
+void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi)
+{
+ spin_lock_init(&sbi->fsync_node_lock);
+ INIT_LIST_HEAD(&sbi->fsync_node_list);
+ sbi->fsync_seg_id = 0;
+ sbi->fsync_node_num = 0;
+}
+
+static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi,
+ struct page *page)
+{
+ struct fsync_node_entry *fn;
+ unsigned long flags;
+ unsigned int seq_id;
+
+ fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS);
+
+ get_page(page);
+ fn->page = page;
+ INIT_LIST_HEAD(&fn->list);
+
+ spin_lock_irqsave(&sbi->fsync_node_lock, flags);
+ list_add_tail(&fn->list, &sbi->fsync_node_list);
+ fn->seq_id = sbi->fsync_seg_id++;
+ seq_id = fn->seq_id;
+ sbi->fsync_node_num++;
+ spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
+
+ return seq_id;
+}
+
+void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page)
+{
+ struct fsync_node_entry *fn;
+ unsigned long flags;
+
+ spin_lock_irqsave(&sbi->fsync_node_lock, flags);
+ list_for_each_entry(fn, &sbi->fsync_node_list, list) {
+ if (fn->page == page) {
+ list_del(&fn->list);
+ sbi->fsync_node_num--;
+ spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
+ kmem_cache_free(fsync_node_entry_slab, fn);
+ put_page(page);
+ return;
+ }
+ }
+ spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
+ f2fs_bug_on(sbi, 1);
+}
+
+void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&sbi->fsync_node_lock, flags);
+ sbi->fsync_seg_id = 0;
+ spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
+}
+
+int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
@@ -215,7 +367,7 @@ int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
return need;
}
-bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
+bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
@@ -229,7 +381,7 @@ bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
return is_cp;
}
-bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
+bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
@@ -245,35 +397,29 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
return need_update;
}
-static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
-{
- struct nat_entry *new;
-
- new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS);
- f2fs_radix_tree_insert(&nm_i->nat_root, nid, new);
- memset(new, 0, sizeof(struct nat_entry));
- nat_set_nid(new, nid);
- nat_reset_flag(new);
- list_add_tail(&new->list, &nm_i->nat_entries);
- nm_i->nat_cnt++;
- return new;
-}
-
+/* must be locked by nat_tree_lock */
static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
struct f2fs_nat_entry *ne)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
- struct nat_entry *e;
+ struct nat_entry *new, *e;
+
+ new = __alloc_nat_entry(nid, false);
+ if (!new)
+ return;
+ down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
- if (!e) {
- e = grab_nat_entry(nm_i, nid);
- node_info_from_raw_nat(&e->ni, ne);
- } else {
- f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino ||
- nat_get_blkaddr(e) != ne->block_addr ||
+ if (!e)
+ e = __init_nat_entry(nm_i, new, ne, false);
+ else
+ f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) ||
+ nat_get_blkaddr(e) !=
+ le32_to_cpu(ne->block_addr) ||
nat_get_version(e) != ne->version);
- }
+ up_write(&nm_i->nat_tree_lock);
+ if (e != new)
+ __free_nat_entry(new);
}
static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
@@ -281,11 +427,12 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
+ struct nat_entry *new = __alloc_nat_entry(ni->nid, true);
down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ni->nid);
if (!e) {
- e = grab_nat_entry(nm_i, ni->nid);
+ e = __init_nat_entry(nm_i, new, NULL, true);
copy_node_info(&e->ni, ni);
f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
} else if (new_blkaddr == NEW_ADDR) {
@@ -297,6 +444,9 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
copy_node_info(&e->ni, ni);
f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
}
+ /* let's free early to reduce memory consumption */
+ if (e != new)
+ __free_nat_entry(new);
/* sanity check */
f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr);
@@ -311,10 +461,6 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
unsigned char version = nat_get_version(e);
nat_set_version(e, inc_node_version(version));
-
- /* in order to reuse the nid */
- if (nm_i->next_scan_nid > ni->nid)
- nm_i->next_scan_nid = ni->nid;
}
/* change address */
@@ -334,7 +480,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
up_write(&nm_i->nat_tree_lock);
}
-int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
+int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
int nr = nr_shrink;
@@ -342,13 +488,25 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
if (!down_write_trylock(&nm_i->nat_tree_lock))
return 0;
- while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
+ spin_lock(&nm_i->nat_list_lock);
+ while (nr_shrink) {
struct nat_entry *ne;
+
+ if (list_empty(&nm_i->nat_entries))
+ break;
+
ne = list_first_entry(&nm_i->nat_entries,
struct nat_entry, list);
+ list_del(&ne->list);
+ spin_unlock(&nm_i->nat_list_lock);
+
__del_from_nat_cache(nm_i, ne);
nr_shrink--;
+
+ spin_lock(&nm_i->nat_list_lock);
}
+ spin_unlock(&nm_i->nat_list_lock);
+
up_write(&nm_i->nat_tree_lock);
return nr - nr_shrink;
}
@@ -356,7 +514,8 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
/*
* This function always returns success
*/
-void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
+int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
+ struct node_info *ni)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
@@ -366,6 +525,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
struct page *page = NULL;
struct f2fs_nat_entry ne;
struct nat_entry *e;
+ pgoff_t index;
int i;
ni->nid = nid;
@@ -378,40 +538,46 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
ni->blk_addr = nat_get_blkaddr(e);
ni->version = nat_get_version(e);
up_read(&nm_i->nat_tree_lock);
- return;
+ return 0;
}
memset(&ne, 0, sizeof(struct f2fs_nat_entry));
/* Check current segment summary */
down_read(&curseg->journal_rwsem);
- i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
+ i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
if (i >= 0) {
ne = nat_in_journal(journal, i);
node_info_from_raw_nat(ni, &ne);
}
up_read(&curseg->journal_rwsem);
- if (i >= 0)
+ if (i >= 0) {
+ up_read(&nm_i->nat_tree_lock);
goto cache;
+ }
/* Fill node_info from nat page */
- page = get_current_nat_page(sbi, start_nid);
+ index = current_nat_addr(sbi, nid);
+ up_read(&nm_i->nat_tree_lock);
+
+ page = f2fs_get_meta_page(sbi, index);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
nat_blk = (struct f2fs_nat_block *)page_address(page);
ne = nat_blk->entries[nid - start_nid];
node_info_from_raw_nat(ni, &ne);
f2fs_put_page(page, 1);
cache:
- up_read(&nm_i->nat_tree_lock);
/* cache nat entry */
- down_write(&nm_i->nat_tree_lock);
cache_nat_entry(sbi, nid, &ne);
- up_write(&nm_i->nat_tree_lock);
+ return 0;
}
/*
* readahead MAX_RA_NODE number of node pages.
*/
-static void ra_node_pages(struct page *parent, int start, int n)
+static void f2fs_ra_node_pages(struct page *parent, int start, int n)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
struct blk_plug plug;
@@ -425,13 +591,13 @@ static void ra_node_pages(struct page *parent, int start, int n)
end = min(end, NIDS_PER_BLOCK);
for (i = start; i < end; i++) {
nid = get_nid(parent, i, false);
- ra_node_page(sbi, nid);
+ f2fs_ra_node_page(sbi, nid);
}
blk_finish_plug(&plug);
}
-pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs)
+pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs)
{
const long direct_index = ADDRS_PER_INODE(dn->inode);
const long direct_blks = ADDRS_PER_BLOCK;
@@ -534,7 +700,7 @@ static int get_node_path(struct inode *inode, long block,
level = 3;
goto got;
} else {
- BUG();
+ return -E2BIG;
}
got:
return level;
@@ -546,7 +712,7 @@ got:
* f2fs_unlock_op() only if ro is not set RDONLY_NODE.
* In the case of RDONLY_NODE, we don't need to care about mutex.
*/
-int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
+int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct page *npage[4];
@@ -558,12 +724,14 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
int err = 0;
level = get_node_path(dn->inode, index, offset, noffset);
+ if (level < 0)
+ return level;
nids[0] = dn->inode->i_ino;
npage[0] = dn->inode_page;
if (!npage[0]) {
- npage[0] = get_node_page(sbi, nids[0]);
+ npage[0] = f2fs_get_node_page(sbi, nids[0]);
if (IS_ERR(npage[0]))
return PTR_ERR(npage[0]);
}
@@ -587,24 +755,24 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
if (!nids[i] && mode == ALLOC_NODE) {
/* alloc new node */
- if (!alloc_nid(sbi, &(nids[i]))) {
+ if (!f2fs_alloc_nid(sbi, &(nids[i]))) {
err = -ENOSPC;
goto release_pages;
}
dn->nid = nids[i];
- npage[i] = new_node_page(dn, noffset[i], NULL);
+ npage[i] = f2fs_new_node_page(dn, noffset[i]);
if (IS_ERR(npage[i])) {
- alloc_nid_failed(sbi, nids[i]);
+ f2fs_alloc_nid_failed(sbi, nids[i]);
err = PTR_ERR(npage[i]);
goto release_pages;
}
set_nid(parent, offset[i - 1], nids[i], i == 1);
- alloc_nid_done(sbi, nids[i]);
+ f2fs_alloc_nid_done(sbi, nids[i]);
done = true;
} else if (mode == LOOKUP_NODE_RA && i == level && level > 1) {
- npage[i] = get_node_page_ra(parent, offset[i - 1]);
+ npage[i] = f2fs_get_node_page_ra(parent, offset[i - 1]);
if (IS_ERR(npage[i])) {
err = PTR_ERR(npage[i]);
goto release_pages;
@@ -619,7 +787,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
}
if (!done) {
- npage[i] = get_node_page(sbi, nids[i]);
+ npage[i] = f2fs_get_node_page(sbi, nids[i]);
if (IS_ERR(npage[i])) {
err = PTR_ERR(npage[i]);
f2fs_put_page(npage[0], 0);
@@ -634,7 +802,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
dn->nid = nids[level];
dn->ofs_in_node = offset[level];
dn->node_page = npage[level];
- dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
+ dn->data_blkaddr = datablock_addr(dn->inode,
+ dn->node_page, dn->ofs_in_node);
return 0;
release_pages:
@@ -652,50 +821,53 @@ release_out:
return err;
}
-static void truncate_node(struct dnode_of_data *dn)
+static int truncate_node(struct dnode_of_data *dn)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct node_info ni;
+ int err;
+ pgoff_t index;
- get_node_info(sbi, dn->nid, &ni);
- if (dn->inode->i_blocks == 0) {
- f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR);
- goto invalidate;
- }
- f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
+ err = f2fs_get_node_info(sbi, dn->nid, &ni);
+ if (err)
+ return err;
/* Deallocate node address */
- invalidate_blocks(sbi, ni.blk_addr);
- dec_valid_node_count(sbi, dn->inode);
+ f2fs_invalidate_blocks(sbi, ni.blk_addr);
+ dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino);
set_node_addr(sbi, &ni, NULL_ADDR, false);
if (dn->nid == dn->inode->i_ino) {
- remove_orphan_inode(sbi, dn->nid);
+ f2fs_remove_orphan_inode(sbi, dn->nid);
dec_valid_inode_count(sbi);
f2fs_inode_synced(dn->inode);
}
-invalidate:
+
clear_node_page_dirty(dn->node_page);
set_sbi_flag(sbi, SBI_IS_DIRTY);
+ index = dn->node_page->index;
f2fs_put_page(dn->node_page, 1);
invalidate_mapping_pages(NODE_MAPPING(sbi),
- dn->node_page->index, dn->node_page->index);
+ index, index);
dn->node_page = NULL;
trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
+
+ return 0;
}
static int truncate_dnode(struct dnode_of_data *dn)
{
struct page *page;
+ int err;
if (dn->nid == 0)
return 1;
/* get direct node */
- page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
+ page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid);
if (IS_ERR(page) && PTR_ERR(page) == -ENOENT)
return 1;
else if (IS_ERR(page))
@@ -704,8 +876,11 @@ static int truncate_dnode(struct dnode_of_data *dn)
/* Make dnode_of_data for parameter */
dn->node_page = page;
dn->ofs_in_node = 0;
- truncate_data_blocks(dn);
- truncate_node(dn);
+ f2fs_truncate_data_blocks(dn);
+ err = truncate_node(dn);
+ if (err)
+ return err;
+
return 1;
}
@@ -725,13 +900,13 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
- page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
+ page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid);
if (IS_ERR(page)) {
trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
return PTR_ERR(page);
}
- ra_node_pages(page, ofs, NIDS_PER_BLOCK);
+ f2fs_ra_node_pages(page, ofs, NIDS_PER_BLOCK);
rn = F2FS_NODE(page);
if (depth < 3) {
@@ -770,7 +945,9 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
if (!ofs) {
/* remove current indirect node */
dn->node_page = page;
- truncate_node(dn);
+ ret = truncate_node(dn);
+ if (ret)
+ goto out_err;
freed++;
} else {
f2fs_put_page(page, 1);
@@ -801,7 +978,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
/* get indirect nodes in the path */
for (i = 0; i < idx + 1; i++) {
/* reference count'll be increased */
- pages[i] = get_node_page(F2FS_I_SB(dn->inode), nid[i]);
+ pages[i] = f2fs_get_node_page(F2FS_I_SB(dn->inode), nid[i]);
if (IS_ERR(pages[i])) {
err = PTR_ERR(pages[i]);
idx = i - 1;
@@ -810,7 +987,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
}
- ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK);
+ f2fs_ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK);
/* free direct nodes linked to a partial indirect node */
for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
@@ -828,7 +1005,9 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
if (offset[idx + 1] == 0) {
dn->node_page = pages[idx];
dn->nid = nid[idx];
- truncate_node(dn);
+ err = truncate_node(dn);
+ if (err)
+ goto fail;
} else {
f2fs_put_page(pages[idx], 1);
}
@@ -847,7 +1026,7 @@ fail:
/*
* All the block addresses of data and nodes should be nullified.
*/
-int truncate_inode_blocks(struct inode *inode, pgoff_t from)
+int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int err = 0, cont = 1;
@@ -860,8 +1039,10 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from)
trace_f2fs_truncate_inode_blocks_enter(inode, from);
level = get_node_path(inode, from, offset, noffset);
+ if (level < 0)
+ return level;
- page = get_node_page(sbi, inode->i_ino);
+ page = f2fs_get_node_page(sbi, inode->i_ino);
if (IS_ERR(page)) {
trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page));
return PTR_ERR(page);
@@ -925,7 +1106,7 @@ skip_partial:
ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
lock_page(page);
BUG_ON(page->mapping != NODE_MAPPING(sbi));
- f2fs_wait_on_page_writeback(page, NODE, true);
+ f2fs_wait_on_page_writeback(page, NODE, true, true);
ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
set_page_dirty(page);
unlock_page(page);
@@ -940,30 +1121,31 @@ fail:
return err > 0 ? 0 : err;
}
-int truncate_xattr_node(struct inode *inode, struct page *page)
+/* caller must lock inode page */
+int f2fs_truncate_xattr_node(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t nid = F2FS_I(inode)->i_xattr_nid;
struct dnode_of_data dn;
struct page *npage;
+ int err;
if (!nid)
return 0;
- npage = get_node_page(sbi, nid);
+ npage = f2fs_get_node_page(sbi, nid);
if (IS_ERR(npage))
return PTR_ERR(npage);
- f2fs_i_xnid_write(inode, 0);
-
- /* need to do checkpoint during fsync */
- F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi));
+ set_new_dnode(&dn, inode, NULL, npage, nid);
+ err = truncate_node(&dn);
+ if (err) {
+ f2fs_put_page(npage, 1);
+ return err;
+ }
- set_new_dnode(&dn, inode, page, npage, nid);
+ f2fs_i_xnid_write(inode, 0);
- if (page)
- dn.inode_page_locked = true;
- truncate_node(&dn);
return 0;
}
@@ -971,17 +1153,17 @@ int truncate_xattr_node(struct inode *inode, struct page *page)
* Caller should grab and release a rwsem by calling f2fs_lock_op() and
* f2fs_unlock_op().
*/
-int remove_inode_page(struct inode *inode)
+int f2fs_remove_inode_page(struct inode *inode)
{
struct dnode_of_data dn;
int err;
set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
- err = get_dnode_of_data(&dn, 0, LOOKUP_NODE);
+ err = f2fs_get_dnode_of_data(&dn, 0, LOOKUP_NODE);
if (err)
return err;
- err = truncate_xattr_node(inode, dn.inode_page);
+ err = f2fs_truncate_xattr_node(inode);
if (err) {
f2fs_put_dnode(&dn);
return err;
@@ -990,18 +1172,26 @@ int remove_inode_page(struct inode *inode)
/* remove potential inline_data blocks */
if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode))
- truncate_data_blocks_range(&dn, 1);
+ f2fs_truncate_data_blocks_range(&dn, 1);
/* 0 is possible, after f2fs_new_inode() has failed */
+ if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) {
+ f2fs_put_dnode(&dn);
+ return -EIO;
+ }
f2fs_bug_on(F2FS_I_SB(inode),
- inode->i_blocks != 0 && inode->i_blocks != 1);
+ inode->i_blocks != 0 && inode->i_blocks != 8);
/* will put inode & node pages */
- truncate_node(&dn);
+ err = truncate_node(&dn);
+ if (err) {
+ f2fs_put_dnode(&dn);
+ return err;
+ }
return 0;
}
-struct page *new_inode_page(struct inode *inode)
+struct page *f2fs_new_inode_page(struct inode *inode)
{
struct dnode_of_data dn;
@@ -1009,14 +1199,13 @@ struct page *new_inode_page(struct inode *inode)
set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
/* caller should f2fs_put_page(page, 1); */
- return new_node_page(&dn, 0, NULL);
+ return f2fs_new_node_page(&dn, 0);
}
-struct page *new_node_page(struct dnode_of_data *dn,
- unsigned int ofs, struct page *ipage)
+struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
- struct node_info old_ni, new_ni;
+ struct node_info new_ni;
struct page *page;
int err;
@@ -1027,22 +1216,27 @@ struct page *new_node_page(struct dnode_of_data *dn,
if (!page)
return ERR_PTR(-ENOMEM);
- if (unlikely(!inc_valid_node_count(sbi, dn->inode))) {
- err = -ENOSPC;
+ if (unlikely((err = inc_valid_node_count(sbi, dn->inode, !ofs))))
goto fail;
- }
- get_node_info(sbi, dn->nid, &old_ni);
-
- /* Reinitialize old_ni with new node page */
- f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR);
- new_ni = old_ni;
+#ifdef CONFIG_F2FS_CHECK_FS
+ err = f2fs_get_node_info(sbi, dn->nid, &new_ni);
+ if (err) {
+ dec_valid_node_count(sbi, dn->inode, !ofs);
+ goto fail;
+ }
+ f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR);
+#endif
+ new_ni.nid = dn->nid;
new_ni.ino = dn->inode->i_ino;
+ new_ni.blk_addr = NULL_ADDR;
+ new_ni.flag = 0;
+ new_ni.version = 0;
set_node_addr(sbi, &new_ni, NEW_ADDR, false);
- f2fs_wait_on_page_writeback(page, NODE, true);
+ f2fs_wait_on_page_writeback(page, NODE, true, true);
fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
- set_cold_node(dn->inode, page);
+ set_cold_node(page, S_ISDIR(dn->inode->i_mode));
if (!PageUptodate(page))
SetPageUptodate(page);
if (set_page_dirty(page))
@@ -1078,13 +1272,21 @@ static int read_node_page(struct page *page, int op_flags)
.page = page,
.encrypted_page = NULL,
};
+ int err;
- if (PageUptodate(page))
+ if (PageUptodate(page)) {
+#ifdef CONFIG_F2FS_CHECK_FS
+ f2fs_bug_on(sbi, !f2fs_inode_chksum_verify(sbi, page));
+#endif
return LOCKED_PAGE;
+ }
- get_node_info(sbi, page->index, &ni);
+ err = f2fs_get_node_info(sbi, page->index, &ni);
+ if (err)
+ return err;
- if (unlikely(ni.blk_addr == NULL_ADDR)) {
+ if (unlikely(ni.blk_addr == NULL_ADDR) ||
+ is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) {
ClearPageUptodate(page);
return -ENOENT;
}
@@ -1096,14 +1298,15 @@ static int read_node_page(struct page *page, int op_flags)
/*
* Readahead a node page
*/
-void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
+void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
{
struct page *apage;
int err;
if (!nid)
return;
- f2fs_bug_on(sbi, check_nid_range(sbi, nid));
+ if (f2fs_check_nid_range(sbi, nid))
+ return;
rcu_read_lock();
apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid);
@@ -1127,22 +1330,24 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
if (!nid)
return ERR_PTR(-ENOENT);
- f2fs_bug_on(sbi, check_nid_range(sbi, nid));
+ if (f2fs_check_nid_range(sbi, nid))
+ return ERR_PTR(-EINVAL);
repeat:
page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
if (!page)
return ERR_PTR(-ENOMEM);
- err = read_node_page(page, READ_SYNC);
+ err = read_node_page(page, 0);
if (err < 0) {
f2fs_put_page(page, 1);
return ERR_PTR(err);
} else if (err == LOCKED_PAGE) {
+ err = 0;
goto page_hit;
}
if (parent)
- ra_node_pages(parent, start + 1, MAX_RA_NODE);
+ f2fs_ra_node_pages(parent, start + 1, MAX_RA_NODE);
lock_page(page);
@@ -1151,25 +1356,37 @@ repeat:
goto repeat;
}
- if (unlikely(!PageUptodate(page)))
+ if (unlikely(!PageUptodate(page))) {
+ err = -EIO;
goto out_err;
+ }
+
+ if (!f2fs_inode_chksum_verify(sbi, page)) {
+ err = -EBADMSG;
+ goto out_err;
+ }
page_hit:
if(unlikely(nid != nid_of_node(page))) {
- f2fs_bug_on(sbi, 1);
- ClearPageUptodate(page);
+ f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, "
+ "nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
+ nid, nid_of_node(page), ino_of_node(page),
+ ofs_of_node(page), cpver_of_node(page),
+ next_blkaddr_of_node(page));
+ err = -EINVAL;
out_err:
+ ClearPageUptodate(page);
f2fs_put_page(page, 1);
- return ERR_PTR(-EIO);
+ return ERR_PTR(err);
}
return page;
}
-struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
+struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
{
return __get_node_page(sbi, nid, NULL, 0);
}
-struct page *get_node_page_ra(struct page *parent, int start)
+struct page *f2fs_get_node_page_ra(struct page *parent, int start)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
nid_t nid = get_nid(parent, start, false);
@@ -1188,7 +1405,8 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
if (!inode)
return;
- page = pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0);
+ page = f2fs_pagecache_get_page(inode->i_mapping, 0,
+ FGP_LOCK|FGP_NOWAIT, 0);
if (!page)
goto iput_out;
@@ -1203,6 +1421,7 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
ret = f2fs_write_inline_data(inode, page);
inode_dec_dirty_pages(inode);
+ f2fs_remove_dirty_inode(inode);
if (ret)
set_page_dirty(page);
page_out:
@@ -1211,54 +1430,19 @@ iput_out:
iput(inode);
}
-void move_node_page(struct page *node_page, int gc_type)
-{
- if (gc_type == FG_GC) {
- struct f2fs_sb_info *sbi = F2FS_P_SB(node_page);
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = 1,
- .for_reclaim = 0,
- };
-
- set_page_dirty(node_page);
- f2fs_wait_on_page_writeback(node_page, NODE, true);
-
- f2fs_bug_on(sbi, PageWriteback(node_page));
- if (!clear_page_dirty_for_io(node_page))
- goto out_page;
-
- if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc))
- unlock_page(node_page);
- goto release_page;
- } else {
- /* set page dirty and write it */
- if (!PageWriteback(node_page))
- set_page_dirty(node_page);
- }
-out_page:
- unlock_page(node_page);
-release_page:
- f2fs_put_page(node_page, 0);
-}
-
static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
{
- pgoff_t index, end;
+ pgoff_t index;
struct pagevec pvec;
struct page *last_page = NULL;
+ int nr_pages;
pagevec_init(&pvec, 0);
index = 0;
- end = ULONG_MAX;
-
- while (index <= end) {
- int i, nr_pages;
- nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
- PAGECACHE_TAG_DIRTY,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
- if (nr_pages == 0)
- break;
+
+ while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
+ PAGECACHE_TAG_DIRTY))) {
+ int i;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
@@ -1302,15 +1486,165 @@ continue_unlock:
return last_page;
}
-int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
- struct writeback_control *wbc, bool atomic)
+static int __write_node_page(struct page *page, bool atomic, bool *submitted,
+ struct writeback_control *wbc, bool do_balance,
+ enum iostat_type io_type, unsigned int *seq_id)
+{
+ struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+ nid_t nid;
+ struct node_info ni;
+ struct f2fs_io_info fio = {
+ .sbi = sbi,
+ .ino = ino_of_node(page),
+ .type = NODE,
+ .op = REQ_OP_WRITE,
+ .op_flags = wbc_to_write_flags(wbc),
+ .page = page,
+ .encrypted_page = NULL,
+ .submitted = false,
+ .io_type = io_type,
+ .io_wbc = wbc,
+ };
+ unsigned int seq;
+
+ trace_f2fs_writepage(page, NODE);
+
+ if (unlikely(f2fs_cp_error(sbi)))
+ goto redirty_out;
+
+ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+ goto redirty_out;
+
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ IS_DNODE(page) && is_cold_node(page))
+ goto redirty_out;
+
+ /* get old block addr of this node page */
+ nid = nid_of_node(page);
+ f2fs_bug_on(sbi, page->index != nid);
+
+ if (f2fs_get_node_info(sbi, nid, &ni))
+ goto redirty_out;
+
+ if (wbc->for_reclaim) {
+ if (!down_read_trylock(&sbi->node_write))
+ goto redirty_out;
+ } else {
+ down_read(&sbi->node_write);
+ }
+
+ /* This page is already truncated */
+ if (unlikely(ni.blk_addr == NULL_ADDR)) {
+ ClearPageUptodate(page);
+ dec_page_count(sbi, F2FS_DIRTY_NODES);
+ up_read(&sbi->node_write);
+ unlock_page(page);
+ return 0;
+ }
+
+ if (__is_valid_data_blkaddr(ni.blk_addr) &&
+ !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC)) {
+ up_read(&sbi->node_write);
+ goto redirty_out;
+ }
+
+ if (atomic && !test_opt(sbi, NOBARRIER))
+ fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
+
+ set_page_writeback(page);
+ ClearPageError(page);
+
+ if (f2fs_in_warm_node_list(sbi, page)) {
+ seq = f2fs_add_fsync_node_entry(sbi, page);
+ if (seq_id)
+ *seq_id = seq;
+ }
+
+ fio.old_blkaddr = ni.blk_addr;
+ f2fs_do_write_node_page(nid, &fio);
+ set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
+ dec_page_count(sbi, F2FS_DIRTY_NODES);
+ up_read(&sbi->node_write);
+
+ if (wbc->for_reclaim) {
+ f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE);
+ submitted = NULL;
+ }
+
+ unlock_page(page);
+
+ if (unlikely(f2fs_cp_error(sbi))) {
+ f2fs_submit_merged_write(sbi, NODE);
+ submitted = NULL;
+ }
+ if (submitted)
+ *submitted = fio.submitted;
+
+ if (do_balance)
+ f2fs_balance_fs(sbi, false);
+ return 0;
+
+redirty_out:
+ redirty_page_for_writepage(wbc, page);
+ return AOP_WRITEPAGE_ACTIVATE;
+}
+
+int f2fs_move_node_page(struct page *node_page, int gc_type)
+{
+ int err = 0;
+
+ if (gc_type == FG_GC) {
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = 1,
+ .for_reclaim = 0,
+ };
+
+ f2fs_wait_on_page_writeback(node_page, NODE, true, true);
+
+ set_page_dirty(node_page);
+
+ if (!clear_page_dirty_for_io(node_page)) {
+ err = -EAGAIN;
+ goto out_page;
+ }
+
+ if (__write_node_page(node_page, false, NULL,
+ &wbc, false, FS_GC_NODE_IO, NULL)) {
+ err = -EAGAIN;
+ unlock_page(node_page);
+ }
+ goto release_page;
+ } else {
+ /* set page dirty and write it */
+ if (!PageWriteback(node_page))
+ set_page_dirty(node_page);
+ }
+out_page:
+ unlock_page(node_page);
+release_page:
+ f2fs_put_page(node_page, 0);
+ return err;
+}
+
+static int f2fs_write_node_page(struct page *page,
+ struct writeback_control *wbc)
+{
+ return __write_node_page(page, false, NULL, wbc, false,
+ FS_NODE_IO, NULL);
+}
+
+int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
+ struct writeback_control *wbc, bool atomic,
+ unsigned int *seq_id)
{
- pgoff_t index, end;
+ pgoff_t index;
struct pagevec pvec;
int ret = 0;
struct page *last_page = NULL;
bool marked = false;
nid_t ino = inode->i_ino;
+ int nr_pages;
int nwritten = 0;
if (atomic) {
@@ -1321,23 +1655,20 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
retry:
pagevec_init(&pvec, 0);
index = 0;
- end = ULONG_MAX;
-
- while (index <= end) {
- int i, nr_pages;
- nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
- PAGECACHE_TAG_DIRTY,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
- if (nr_pages == 0)
- break;
+
+ while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
+ PAGECACHE_TAG_DIRTY))) {
+ int i;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+ bool submitted = false;
if (unlikely(f2fs_cp_error(sbi))) {
f2fs_put_page(last_page, 0);
pagevec_release(&pvec);
- return -EIO;
+ ret = -EIO;
+ goto out;
}
if (!IS_DNODE(page) || !is_cold_node(page))
@@ -1360,17 +1691,19 @@ continue_unlock:
goto continue_unlock;
}
- f2fs_wait_on_page_writeback(page, NODE, true);
- BUG_ON(PageWriteback(page));
+ f2fs_wait_on_page_writeback(page, NODE, true, true);
+
+ set_fsync_mark(page, 0);
+ set_dentry_mark(page, 0);
if (!atomic || page == last_page) {
set_fsync_mark(page, 1);
if (IS_INODE(page)) {
if (is_inode_flag_set(inode,
FI_DIRTY_INODE))
- update_inode(inode, page);
+ f2fs_update_inode(inode, page);
set_dentry_mark(page,
- need_dentry_mark(sbi, ino));
+ f2fs_need_dentry_mark(sbi, ino));
}
/* may be written by other thread */
if (!PageDirty(page))
@@ -1380,12 +1713,15 @@ continue_unlock:
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
- ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc);
+ ret = __write_node_page(page, atomic &&
+ page == last_page,
+ &submitted, wbc, true,
+ FS_NODE_IO, seq_id);
if (ret) {
unlock_page(page);
f2fs_put_page(last_page, 0);
break;
- } else {
+ } else if (submitted) {
nwritten++;
}
@@ -1406,45 +1742,46 @@ continue_unlock:
"Retry to write fsync mark: ino=%u, idx=%lx",
ino, last_page->index);
lock_page(last_page);
+ f2fs_wait_on_page_writeback(last_page, NODE, true, true);
set_page_dirty(last_page);
unlock_page(last_page);
goto retry;
}
-
+out:
if (nwritten)
- f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE);
+ f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE);
return ret ? -EIO: 0;
}
-int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc)
+int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
+ struct writeback_control *wbc,
+ bool do_balance, enum iostat_type io_type)
{
- pgoff_t index, end;
+ pgoff_t index;
struct pagevec pvec;
int step = 0;
int nwritten = 0;
int ret = 0;
+ int nr_pages, done = 0;
pagevec_init(&pvec, 0);
next_step:
index = 0;
- end = ULONG_MAX;
-
- while (index <= end) {
- int i, nr_pages;
- nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
- PAGECACHE_TAG_DIRTY,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
- if (nr_pages == 0)
- break;
+
+ while (!done && (nr_pages = pagevec_lookup_tag(&pvec,
+ NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) {
+ int i;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+ bool submitted = false;
- if (unlikely(f2fs_cp_error(sbi))) {
- pagevec_release(&pvec);
- ret = -EIO;
- goto out;
+ /* give a priority to WB_SYNC threads */
+ if (atomic_read(&sbi->wb_sync_req[NODE]) &&
+ wbc->sync_mode == WB_SYNC_NONE) {
+ done = 1;
+ break;
}
/*
@@ -1486,18 +1823,19 @@ continue_unlock:
goto lock_node;
}
- f2fs_wait_on_page_writeback(page, NODE, true);
+ f2fs_wait_on_page_writeback(page, NODE, true, true);
- BUG_ON(PageWriteback(page));
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
set_fsync_mark(page, 0);
set_dentry_mark(page, 0);
- if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc))
+ ret = __write_node_page(page, false, &submitted,
+ wbc, do_balance, io_type, NULL);
+ if (ret)
unlock_page(page);
- else
+ else if (submitted)
nwritten++;
if (--wbc->nr_to_write == 0)
@@ -1513,124 +1851,61 @@ continue_unlock:
}
if (step < 2) {
+ if (wbc->sync_mode == WB_SYNC_NONE && step == 1)
+ goto out;
step++;
goto next_step;
}
out:
if (nwritten)
- f2fs_submit_merged_bio(sbi, NODE, WRITE);
+ f2fs_submit_merged_write(sbi, NODE);
+
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
return ret;
}
-int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
+int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
+ unsigned int seq_id)
{
- pgoff_t index = 0, end = ULONG_MAX;
- struct pagevec pvec;
+ struct fsync_node_entry *fn;
+ struct page *page;
+ struct list_head *head = &sbi->fsync_node_list;
+ unsigned long flags;
+ unsigned int cur_seq_id = 0;
int ret2, ret = 0;
- pagevec_init(&pvec, 0);
-
- while (index <= end) {
- int i, nr_pages;
- nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
- PAGECACHE_TAG_WRITEBACK,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
- if (nr_pages == 0)
+ while (seq_id && cur_seq_id < seq_id) {
+ spin_lock_irqsave(&sbi->fsync_node_lock, flags);
+ if (list_empty(head)) {
+ spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
+ break;
+ }
+ fn = list_first_entry(head, struct fsync_node_entry, list);
+ if (fn->seq_id > seq_id) {
+ spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
break;
+ }
+ cur_seq_id = fn->seq_id;
+ page = fn->page;
+ get_page(page);
+ spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
+ f2fs_wait_on_page_writeback(page, NODE, true, false);
+ if (TestClearPageError(page))
+ ret = -EIO;
- /* until radix tree lookup accepts end_index */
- if (unlikely(page->index > end))
- continue;
+ put_page(page);
- if (ino && ino_of_node(page) == ino) {
- f2fs_wait_on_page_writeback(page, NODE, true);
- if (TestClearPageError(page))
- ret = -EIO;
- }
- }
- pagevec_release(&pvec);
- cond_resched();
+ if (ret)
+ break;
}
ret2 = filemap_check_errors(NODE_MAPPING(sbi));
if (!ret)
ret = ret2;
- return ret;
-}
-
-static int f2fs_write_node_page(struct page *page,
- struct writeback_control *wbc)
-{
- struct f2fs_sb_info *sbi = F2FS_P_SB(page);
- nid_t nid;
- struct node_info ni;
- struct f2fs_io_info fio = {
- .sbi = sbi,
- .type = NODE,
- .op = REQ_OP_WRITE,
- .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
- .page = page,
- .encrypted_page = NULL,
- };
-
- trace_f2fs_writepage(page, NODE);
-
- if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
- goto redirty_out;
- if (unlikely(f2fs_cp_error(sbi)))
- goto redirty_out;
-
- /* get old block addr of this node page */
- nid = nid_of_node(page);
- f2fs_bug_on(sbi, page->index != nid);
-
- if (wbc->for_reclaim) {
- if (!down_read_trylock(&sbi->node_write))
- goto redirty_out;
- } else {
- down_read(&sbi->node_write);
- }
-
- get_node_info(sbi, nid, &ni);
-
- /* This page is already truncated */
- if (unlikely(ni.blk_addr == NULL_ADDR)) {
- ClearPageUptodate(page);
- dec_page_count(sbi, F2FS_DIRTY_NODES);
- up_read(&sbi->node_write);
- unlock_page(page);
- return 0;
- }
- if (__is_valid_data_blkaddr(ni.blk_addr) &&
- !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC)) {
- up_read(&sbi->node_write);
- goto redirty_out;
- }
-
- set_page_writeback(page);
- fio.old_blkaddr = ni.blk_addr;
- write_node_page(nid, &fio);
- set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
- dec_page_count(sbi, F2FS_DIRTY_NODES);
- up_read(&sbi->node_write);
-
- if (wbc->for_reclaim)
- f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE);
-
- unlock_page(page);
-
- if (unlikely(f2fs_cp_error(sbi)))
- f2fs_submit_merged_bio(sbi, NODE, WRITE);
-
- return 0;
-
-redirty_out:
- redirty_page_for_writepage(wbc, page);
- return AOP_WRITEPAGE_ACTIVATE;
+ return ret;
}
static int f2fs_write_node_pages(struct address_space *mapping,
@@ -1640,6 +1915,9 @@ static int f2fs_write_node_pages(struct address_space *mapping,
struct blk_plug plug;
long diff;
+ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+ goto skip_write;
+
/* balancing f2fs's metadata in background */
f2fs_balance_fs_bg(sbi);
@@ -1647,14 +1925,21 @@ static int f2fs_write_node_pages(struct address_space *mapping,
if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
goto skip_write;
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ atomic_inc(&sbi->wb_sync_req[NODE]);
+ else if (atomic_read(&sbi->wb_sync_req[NODE]))
+ goto skip_write;
+
trace_f2fs_writepages(mapping->host, wbc, NODE);
diff = nr_pages_to_write(sbi, NODE, wbc);
- wbc->sync_mode = WB_SYNC_NONE;
blk_start_plug(&plug);
- sync_node_pages(sbi, wbc);
+ f2fs_sync_node_pages(sbi, wbc, true, FS_NODE_IO);
blk_finish_plug(&plug);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ atomic_dec(&sbi->wb_sync_req[NODE]);
return 0;
skip_write:
@@ -1669,8 +1954,12 @@ static int f2fs_set_node_page_dirty(struct page *page)
if (!PageUptodate(page))
SetPageUptodate(page);
+#ifdef CONFIG_F2FS_CHECK_FS
+ if (IS_INODE(page))
+ f2fs_inode_chksum_set(F2FS_P_SB(page), page);
+#endif
if (!PageDirty(page)) {
- f2fs_set_page_dirty_nobuffers(page);
+ __set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
SetPagePrivate(page);
f2fs_trace_pid(page);
@@ -1699,57 +1988,123 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
return radix_tree_lookup(&nm_i->free_nid_root, n);
}
-static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i,
- struct free_nid *i)
+static int __insert_free_nid(struct f2fs_sb_info *sbi,
+ struct free_nid *i, enum nid_state state)
{
- list_del(&i->list);
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+ int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i);
+ if (err)
+ return err;
+
+ f2fs_bug_on(sbi, state != i->state);
+ nm_i->nid_cnt[state]++;
+ if (state == FREE_NID)
+ list_add_tail(&i->list, &nm_i->free_nid_list);
+ return 0;
+}
+
+static void __remove_free_nid(struct f2fs_sb_info *sbi,
+ struct free_nid *i, enum nid_state state)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+ f2fs_bug_on(sbi, state != i->state);
+ nm_i->nid_cnt[state]--;
+ if (state == FREE_NID)
+ list_del(&i->list);
radix_tree_delete(&nm_i->free_nid_root, i->nid);
}
-static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
+static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i,
+ enum nid_state org_state, enum nid_state dst_state)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+ f2fs_bug_on(sbi, org_state != i->state);
+ i->state = dst_state;
+ nm_i->nid_cnt[org_state]--;
+ nm_i->nid_cnt[dst_state]++;
+
+ switch (dst_state) {
+ case PREALLOC_NID:
+ list_del(&i->list);
+ break;
+ case FREE_NID:
+ list_add_tail(&i->list, &nm_i->free_nid_list);
+ break;
+ default:
+ BUG_ON(1);
+ }
+}
+
+static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
+ bool set, bool build)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid);
+ unsigned int nid_ofs = nid - START_NID(nid);
+
+ if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap))
+ return;
+
+ if (set) {
+ if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]))
+ return;
+ __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
+ nm_i->free_nid_count[nat_ofs]++;
+ } else {
+ if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]))
+ return;
+ __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
+ if (!build)
+ nm_i->free_nid_count[nat_ofs]--;
+ }
+}
+
+/* return if the nid is recognized as free */
+static bool add_free_nid(struct f2fs_sb_info *sbi,
+ nid_t nid, bool build, bool update)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i, *e;
struct nat_entry *ne;
int err = -EINVAL;
-
- if (!available_free_memory(sbi, FREE_NIDS))
- return -1;
+ bool ret = false;
/* 0 nid should not be used */
if (unlikely(nid == 0))
- return 0;
+ return false;
i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
i->nid = nid;
- i->state = NID_NEW;
+ i->state = FREE_NID;
- if (radix_tree_preload(GFP_NOFS))
- goto err;
+ radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
if (build) {
/*
* Thread A Thread B
* - f2fs_create
* - f2fs_new_inode
- * - alloc_nid
- * - __insert_nid_to_list(ALLOC_NID_LIST)
+ * - f2fs_alloc_nid
+ * - __insert_nid_to_list(PREALLOC_NID)
* - f2fs_balance_fs_bg
- * - build_free_nids
- * - __build_free_nids
+ * - f2fs_build_free_nids
+ * - __f2fs_build_free_nids
* - scan_nat_page
* - add_free_nid
* - __lookup_nat_cache
* - f2fs_add_link
- * - init_inode_metadata
- * - new_inode_page
- * - new_node_page
+ * - f2fs_init_inode_metadata
+ * - f2fs_new_inode_page
+ * - f2fs_new_node_page
* - set_node_addr
- * - alloc_nid_done
- * - __remove_nid_from_list(ALLOC_NID_LIST)
- * - __insert_nid_to_list(FREE_NID_LIST)
+ * - f2fs_alloc_nid_done
+ * - __remove_nid_from_list(PREALLOC_NID)
+ * - __insert_nid_to_list(FREE_NID)
*/
ne = __lookup_nat_cache(nm_i, nid);
if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
@@ -1757,88 +2112,184 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
goto err_out;
e = __lookup_free_nid_list(nm_i, nid);
- if (e)
+ if (e) {
+ if (e->state == FREE_NID)
+ ret = true;
goto err_out;
+ }
}
- if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i))
- goto err_out;
- err = 0;
- list_add_tail(&i->list, &nm_i->free_nid_list);
- nm_i->fcnt++;
+ ret = true;
+ err = __insert_free_nid(sbi, i, FREE_NID);
err_out:
- spin_unlock(&nm_i->free_nid_list_lock);
+ if (update) {
+ update_free_nid_bitmap(sbi, nid, ret, build);
+ if (!build)
+ nm_i->available_nids++;
+ }
+ spin_unlock(&nm_i->nid_list_lock);
radix_tree_preload_end();
-err:
+
if (err)
kmem_cache_free(free_nid_slab, i);
- return !err;
+ return ret;
}
-static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
+static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i;
bool need_free = false;
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
i = __lookup_free_nid_list(nm_i, nid);
- if (i && i->state == NID_NEW) {
- __del_from_free_nid_list(nm_i, i);
- nm_i->fcnt--;
+ if (i && i->state == FREE_NID) {
+ __remove_free_nid(sbi, i, FREE_NID);
need_free = true;
}
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_unlock(&nm_i->nid_list_lock);
if (need_free)
kmem_cache_free(free_nid_slab, i);
}
-static void scan_nat_page(struct f2fs_sb_info *sbi,
+static int scan_nat_page(struct f2fs_sb_info *sbi,
struct page *nat_page, nid_t start_nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct f2fs_nat_block *nat_blk = page_address(nat_page);
block_t blk_addr;
+ unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid);
int i;
+ __set_bit_le(nat_ofs, nm_i->nat_block_bitmap);
+
i = start_nid % NAT_ENTRY_PER_BLOCK;
for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
-
if (unlikely(start_nid >= nm_i->max_nid))
break;
blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
- f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
+
+ if (blk_addr == NEW_ADDR)
+ return -EINVAL;
+
if (blk_addr == NULL_ADDR) {
- if (add_free_nid(sbi, start_nid, true) < 0)
- break;
+ add_free_nid(sbi, start_nid, true, true);
+ } else {
+ spin_lock(&NM_I(sbi)->nid_list_lock);
+ update_free_nid_bitmap(sbi, start_nid, false, true);
+ spin_unlock(&NM_I(sbi)->nid_list_lock);
}
}
+
+ return 0;
}
-void build_free_nids(struct f2fs_sb_info *sbi)
+static void scan_curseg_cache(struct f2fs_sb_info *sbi)
{
- struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
struct f2fs_journal *journal = curseg->journal;
- int i = 0;
+ int i;
+
+ down_read(&curseg->journal_rwsem);
+ for (i = 0; i < nats_in_cursum(journal); i++) {
+ block_t addr;
+ nid_t nid;
+
+ addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
+ nid = le32_to_cpu(nid_in_journal(journal, i));
+ if (addr == NULL_ADDR)
+ add_free_nid(sbi, nid, true, false);
+ else
+ remove_free_nid(sbi, nid);
+ }
+ up_read(&curseg->journal_rwsem);
+}
+
+static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ unsigned int i, idx;
+ nid_t nid;
+
+ down_read(&nm_i->nat_tree_lock);
+
+ for (i = 0; i < nm_i->nat_blocks; i++) {
+ if (!test_bit_le(i, nm_i->nat_block_bitmap))
+ continue;
+ if (!nm_i->free_nid_count[i])
+ continue;
+ for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) {
+ idx = find_next_bit_le(nm_i->free_nid_bitmap[i],
+ NAT_ENTRY_PER_BLOCK, idx);
+ if (idx >= NAT_ENTRY_PER_BLOCK)
+ break;
+
+ nid = i * NAT_ENTRY_PER_BLOCK + idx;
+ add_free_nid(sbi, nid, true, false);
+
+ if (nm_i->nid_cnt[FREE_NID] >= MAX_FREE_NIDS)
+ goto out;
+ }
+ }
+out:
+ scan_curseg_cache(sbi);
+
+ up_read(&nm_i->nat_tree_lock);
+}
+
+static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
+ bool sync, bool mount)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ int i = 0, ret;
nid_t nid = nm_i->next_scan_nid;
+ if (unlikely(nid >= nm_i->max_nid))
+ nid = 0;
+
/* Enough entries */
- if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK)
- return;
+ if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK)
+ return 0;
+
+ if (!sync && !f2fs_available_free_memory(sbi, FREE_NIDS))
+ return 0;
+
+ if (!mount) {
+ /* try to find free nids in free_nid_bitmap */
+ scan_free_nid_bits(sbi);
+
+ if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK)
+ return 0;
+ }
/* readahead nat pages to be scanned */
- ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
+ f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
META_NAT, true);
down_read(&nm_i->nat_tree_lock);
while (1) {
- struct page *page = get_current_nat_page(sbi, nid);
+ if (!test_bit_le(NAT_BLOCK_OFFSET(nid),
+ nm_i->nat_block_bitmap)) {
+ struct page *page = get_current_nat_page(sbi, nid);
- scan_nat_page(sbi, page, nid);
- f2fs_put_page(page, 1);
+ if (IS_ERR(page)) {
+ ret = PTR_ERR(page);
+ } else {
+ ret = scan_nat_page(sbi, page, nid);
+ f2fs_put_page(page, 1);
+ }
+
+ if (ret) {
+ up_read(&nm_i->nat_tree_lock);
+ f2fs_bug_on(sbi, !mount);
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "NAT is corrupt, run fsck to fix it");
+ return ret;
+ }
+ }
nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
if (unlikely(nid >= nm_i->max_nid))
@@ -1852,22 +2303,25 @@ void build_free_nids(struct f2fs_sb_info *sbi)
nm_i->next_scan_nid = nid;
/* find free nids from current sum_pages */
- down_read(&curseg->journal_rwsem);
- for (i = 0; i < nats_in_cursum(journal); i++) {
- block_t addr;
+ scan_curseg_cache(sbi);
- addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
- nid = le32_to_cpu(nid_in_journal(journal, i));
- if (addr == NULL_ADDR)
- add_free_nid(sbi, nid, true);
- else
- remove_free_nid(nm_i, nid);
- }
- up_read(&curseg->journal_rwsem);
up_read(&nm_i->nat_tree_lock);
- ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
+ f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
nm_i->ra_nid_pages, META_NAT, false);
+
+ return 0;
+}
+
+int f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
+{
+ int ret;
+
+ mutex_lock(&NM_I(sbi)->build_lock);
+ ret = __f2fs_build_free_nids(sbi, sync, mount);
+ mutex_unlock(&NM_I(sbi)->build_lock);
+
+ return ret;
}
/*
@@ -1875,64 +2329,67 @@ void build_free_nids(struct f2fs_sb_info *sbi)
* from second parameter of this function.
* The returned nid could be used ino as well as nid when inode is created.
*/
-bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
+bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i = NULL;
retry:
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (time_to_inject(sbi, FAULT_ALLOC_NID))
- return false;
-#endif
- if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids))
+ if (time_to_inject(sbi, FAULT_ALLOC_NID)) {
+ f2fs_show_injection_info(FAULT_ALLOC_NID);
return false;
+ }
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
- /* We should not use stale free nids created by build_free_nids */
- if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
- f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
- list_for_each_entry(i, &nm_i->free_nid_list, list)
- if (i->state == NID_NEW)
- break;
+ if (unlikely(nm_i->available_nids == 0)) {
+ spin_unlock(&nm_i->nid_list_lock);
+ return false;
+ }
- f2fs_bug_on(sbi, i->state != NID_NEW);
+ /* We should not use stale free nids created by f2fs_build_free_nids */
+ if (nm_i->nid_cnt[FREE_NID] && !on_f2fs_build_free_nids(nm_i)) {
+ f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
+ i = list_first_entry(&nm_i->free_nid_list,
+ struct free_nid, list);
*nid = i->nid;
- i->state = NID_ALLOC;
- nm_i->fcnt--;
- spin_unlock(&nm_i->free_nid_list_lock);
+
+ __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID);
+ nm_i->available_nids--;
+
+ update_free_nid_bitmap(sbi, *nid, false, false);
+
+ spin_unlock(&nm_i->nid_list_lock);
return true;
}
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_unlock(&nm_i->nid_list_lock);
/* Let's scan nat pages and its caches to get free nids */
- mutex_lock(&nm_i->build_lock);
- build_free_nids(sbi);
- mutex_unlock(&nm_i->build_lock);
- goto retry;
+ if (!f2fs_build_free_nids(sbi, true, false))
+ goto retry;
+ return false;
}
/*
- * alloc_nid() should be called prior to this function.
+ * f2fs_alloc_nid() should be called prior to this function.
*/
-void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
+void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i;
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
i = __lookup_free_nid_list(nm_i, nid);
- f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
- __del_from_free_nid_list(nm_i, i);
- spin_unlock(&nm_i->free_nid_list_lock);
+ f2fs_bug_on(sbi, !i);
+ __remove_free_nid(sbi, i, PREALLOC_NID);
+ spin_unlock(&nm_i->nid_list_lock);
kmem_cache_free(free_nid_slab, i);
}
/*
- * alloc_nid() should be called prior to this function.
+ * f2fs_alloc_nid() should be called prior to this function.
*/
-void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
+void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i;
@@ -1941,120 +2398,141 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
if (!nid)
return;
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
i = __lookup_free_nid_list(nm_i, nid);
- f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
- if (!available_free_memory(sbi, FREE_NIDS)) {
- __del_from_free_nid_list(nm_i, i);
+ f2fs_bug_on(sbi, !i);
+
+ if (!f2fs_available_free_memory(sbi, FREE_NIDS)) {
+ __remove_free_nid(sbi, i, PREALLOC_NID);
need_free = true;
} else {
- i->state = NID_NEW;
- nm_i->fcnt++;
+ __move_free_nid(sbi, i, PREALLOC_NID, FREE_NID);
}
- spin_unlock(&nm_i->free_nid_list_lock);
+
+ nm_i->available_nids++;
+
+ update_free_nid_bitmap(sbi, nid, true, false);
+
+ spin_unlock(&nm_i->nid_list_lock);
if (need_free)
kmem_cache_free(free_nid_slab, i);
}
-int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
+int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i, *next;
int nr = nr_shrink;
- if (nm_i->fcnt <= MAX_FREE_NIDS)
+ if (nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS)
return 0;
if (!mutex_trylock(&nm_i->build_lock))
return 0;
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) {
- if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS)
+ if (nr_shrink <= 0 ||
+ nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS)
break;
- if (i->state == NID_ALLOC)
- continue;
- __del_from_free_nid_list(nm_i, i);
+
+ __remove_free_nid(sbi, i, FREE_NID);
kmem_cache_free(free_nid_slab, i);
- nm_i->fcnt--;
nr_shrink--;
}
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_unlock(&nm_i->nid_list_lock);
mutex_unlock(&nm_i->build_lock);
return nr - nr_shrink;
}
-void recover_inline_xattr(struct inode *inode, struct page *page)
+void f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
{
void *src_addr, *dst_addr;
size_t inline_size;
struct page *ipage;
struct f2fs_inode *ri;
- ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
+ ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage));
ri = F2FS_INODE(page);
- if (!(ri->i_inline & F2FS_INLINE_XATTR)) {
+ if (ri->i_inline & F2FS_INLINE_XATTR) {
+ set_inode_flag(inode, FI_INLINE_XATTR);
+ } else {
clear_inode_flag(inode, FI_INLINE_XATTR);
goto update_inode;
}
- dst_addr = inline_xattr_addr(ipage);
- src_addr = inline_xattr_addr(page);
+ dst_addr = inline_xattr_addr(inode, ipage);
+ src_addr = inline_xattr_addr(inode, page);
inline_size = inline_xattr_size(inode);
- f2fs_wait_on_page_writeback(ipage, NODE, true);
+ f2fs_wait_on_page_writeback(ipage, NODE, true, true);
memcpy(dst_addr, src_addr, inline_size);
update_inode:
- update_inode(inode, ipage);
+ f2fs_update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
}
-void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
+int f2fs_recover_xattr_data(struct inode *inode, struct page *page)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
- nid_t new_xnid = nid_of_node(page);
+ nid_t new_xnid;
+ struct dnode_of_data dn;
struct node_info ni;
+ struct page *xpage;
+ int err;
- /* 1: invalidate the previous xattr nid */
if (!prev_xnid)
goto recover_xnid;
- /* Deallocate node address */
- get_node_info(sbi, prev_xnid, &ni);
- f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
- invalidate_blocks(sbi, ni.blk_addr);
- dec_valid_node_count(sbi, inode);
+ /* 1: invalidate the previous xattr nid */
+ err = f2fs_get_node_info(sbi, prev_xnid, &ni);
+ if (err)
+ return err;
+
+ f2fs_invalidate_blocks(sbi, ni.blk_addr);
+ dec_valid_node_count(sbi, inode, false);
set_node_addr(sbi, &ni, NULL_ADDR, false);
recover_xnid:
- /* 2: allocate new xattr nid */
- if (unlikely(!inc_valid_node_count(sbi, inode)))
- f2fs_bug_on(sbi, 1);
+ /* 2: update xattr nid in inode */
+ if (!f2fs_alloc_nid(sbi, &new_xnid))
+ return -ENOSPC;
+
+ set_new_dnode(&dn, inode, NULL, NULL, new_xnid);
+ xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET);
+ if (IS_ERR(xpage)) {
+ f2fs_alloc_nid_failed(sbi, new_xnid);
+ return PTR_ERR(xpage);
+ }
- remove_free_nid(NM_I(sbi), new_xnid);
- get_node_info(sbi, new_xnid, &ni);
- ni.ino = inode->i_ino;
- set_node_addr(sbi, &ni, NEW_ADDR, false);
- f2fs_i_xnid_write(inode, new_xnid);
+ f2fs_alloc_nid_done(sbi, new_xnid);
+ f2fs_update_inode_page(inode);
- /* 3: update xattr blkaddr */
- refresh_sit_entry(sbi, NEW_ADDR, blkaddr);
- set_node_addr(sbi, &ni, blkaddr, false);
+ /* 3: update and set xattr node page dirty */
+ memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE);
+
+ set_page_dirty(xpage);
+ f2fs_put_page(xpage, 1);
+
+ return 0;
}
-int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
+int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
{
struct f2fs_inode *src, *dst;
nid_t ino = ino_of_node(page);
struct node_info old_ni, new_ni;
struct page *ipage;
+ int err;
- get_node_info(sbi, ino, &old_ni);
+ err = f2fs_get_node_info(sbi, ino, &old_ni);
+ if (err)
+ return err;
if (unlikely(old_ni.blk_addr != NULL_ADDR))
return -EINVAL;
@@ -2066,11 +2544,12 @@ retry:
}
/* Should not use this inode from free nid list */
- remove_free_nid(NM_I(sbi), ino);
+ remove_free_nid(sbi, ino);
if (!PageUptodate(ipage))
SetPageUptodate(ipage);
fill_node_footer(ipage, ino, ino, 0, true);
+ set_cold_node(ipage, false);
src = F2FS_INODE(page);
dst = F2FS_INODE(ipage);
@@ -2080,12 +2559,32 @@ retry:
dst->i_blocks = cpu_to_le64(1);
dst->i_links = cpu_to_le32(1);
dst->i_xattr_nid = 0;
- dst->i_inline = src->i_inline & F2FS_INLINE_XATTR;
+ dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR);
+ if (dst->i_inline & F2FS_EXTRA_ATTR) {
+ dst->i_extra_isize = src->i_extra_isize;
+
+ if (f2fs_sb_has_flexible_inline_xattr(sbi) &&
+ F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
+ i_inline_xattr_size))
+ dst->i_inline_xattr_size = src->i_inline_xattr_size;
+
+ if (f2fs_sb_has_project_quota(sbi) &&
+ F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
+ i_projid))
+ dst->i_projid = src->i_projid;
+
+ if (f2fs_sb_has_inode_crtime(sbi) &&
+ F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
+ i_crtime_nsec)) {
+ dst->i_crtime = src->i_crtime;
+ dst->i_crtime_nsec = src->i_crtime_nsec;
+ }
+ }
new_ni = old_ni;
new_ni.ino = ino;
- if (unlikely(!inc_valid_node_count(sbi, NULL)))
+ if (unlikely(inc_valid_node_count(sbi, NULL, true)))
WARN_ON(1);
set_node_addr(sbi, &new_ni, NEW_ADDR, false);
inc_valid_inode_count(sbi);
@@ -2094,13 +2593,12 @@ retry:
return 0;
}
-int restore_node_summary(struct f2fs_sb_info *sbi,
+int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
unsigned int segno, struct f2fs_summary_block *sum)
{
struct f2fs_node *rn;
struct f2fs_summary *sum_entry;
block_t addr;
- int bio_blocks = MAX_BIO_BLOCKS(sbi);
int i, idx, last_offset, nrpages;
/* scan the node segment */
@@ -2109,13 +2607,16 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
sum_entry = &sum->entries[0];
for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
- nrpages = min(last_offset - i, bio_blocks);
+ nrpages = min(last_offset - i, BIO_MAX_PAGES);
/* readahead node pages */
- ra_meta_pages(sbi, addr, nrpages, META_POR, true);
+ f2fs_ra_meta_pages(sbi, addr, nrpages, META_POR, true);
for (idx = addr; idx < addr + nrpages; idx++) {
- struct page *page = get_tmp_page(sbi, idx);
+ struct page *page = f2fs_get_tmp_page(sbi, idx);
+
+ if (IS_ERR(page))
+ return PTR_ERR(page);
rn = F2FS_NODE(page);
sum_entry->nid = rn->footer.nid;
@@ -2148,9 +2649,22 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
ne = __lookup_nat_cache(nm_i, nid);
if (!ne) {
- ne = grab_nat_entry(nm_i, nid);
- node_info_from_raw_nat(&ne->ni, &raw_ne);
+ ne = __alloc_nat_entry(nid, true);
+ __init_nat_entry(nm_i, ne, &raw_ne, true);
}
+
+ /*
+ * if a free nat in journal has not been used after last
+ * checkpoint, we should remove it from available nids,
+ * since later we will add it again.
+ */
+ if (!get_nat_flag(ne, IS_DIRTY) &&
+ le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) {
+ spin_lock(&nm_i->nid_list_lock);
+ nm_i->available_nids--;
+ spin_unlock(&nm_i->nid_list_lock);
+ }
+
__set_nat_cache_dirty(nm_i, ne);
}
update_nats_in_cursum(journal, -i);
@@ -2175,8 +2689,41 @@ add_out:
list_add_tail(&nes->set_list, head);
}
-static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
- struct nat_entry_set *set)
+static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
+ struct page *page)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK;
+ struct f2fs_nat_block *nat_blk = page_address(page);
+ int valid = 0;
+ int i = 0;
+
+ if (!enabled_nat_bits(sbi, NULL))
+ return;
+
+ if (nat_index == 0) {
+ valid = 1;
+ i = 1;
+ }
+ for (; i < NAT_ENTRY_PER_BLOCK; i++) {
+ if (nat_blk->entries[i].block_addr != NULL_ADDR)
+ valid++;
+ }
+ if (valid == 0) {
+ __set_bit_le(nat_index, nm_i->empty_nat_bits);
+ __clear_bit_le(nat_index, nm_i->full_nat_bits);
+ return;
+ }
+
+ __clear_bit_le(nat_index, nm_i->empty_nat_bits);
+ if (valid == NAT_ENTRY_PER_BLOCK)
+ __set_bit_le(nat_index, nm_i->full_nat_bits);
+ else
+ __clear_bit_le(nat_index, nm_i->full_nat_bits);
+}
+
+static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
+ struct nat_entry_set *set, struct cp_control *cpc)
{
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
struct f2fs_journal *journal = curseg->journal;
@@ -2191,13 +2738,17 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
* #1, flush nat entries to journal in current hot data summary block.
* #2, flush nat entries to nat page.
*/
- if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
+ if (enabled_nat_bits(sbi, cpc) ||
+ !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
to_journal = false;
if (to_journal) {
down_write(&curseg->journal_rwsem);
} else {
page = get_next_nat_page(sbi, start_nid);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
nat_blk = page_address(page);
f2fs_bug_on(sbi, !nat_blk);
}
@@ -2208,11 +2759,10 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
nid_t nid = nat_get_nid(ne);
int offset;
- if (nat_get_blkaddr(ne) == NEW_ADDR)
- continue;
+ f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR);
if (to_journal) {
- offset = lookup_journal_in_cursum(journal,
+ offset = f2fs_lookup_journal_in_cursum(journal,
NAT_JOURNAL, nid, 1);
f2fs_bug_on(sbi, offset < 0);
raw_ne = &nat_in_journal(journal, offset);
@@ -2222,26 +2772,35 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
}
raw_nat_from_node_info(raw_ne, &ne->ni);
nat_reset_flag(ne);
- __clear_nat_cache_dirty(NM_I(sbi), ne);
- if (nat_get_blkaddr(ne) == NULL_ADDR)
- add_free_nid(sbi, nid, false);
+ __clear_nat_cache_dirty(NM_I(sbi), set, ne);
+ if (nat_get_blkaddr(ne) == NULL_ADDR) {
+ add_free_nid(sbi, nid, false, true);
+ } else {
+ spin_lock(&NM_I(sbi)->nid_list_lock);
+ update_free_nid_bitmap(sbi, nid, false, false);
+ spin_unlock(&NM_I(sbi)->nid_list_lock);
+ }
}
- if (to_journal)
+ if (to_journal) {
up_write(&curseg->journal_rwsem);
- else
+ } else {
+ __update_nat_bits(sbi, start_nid, page);
f2fs_put_page(page, 1);
+ }
- f2fs_bug_on(sbi, set->entry_cnt);
-
- radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
- kmem_cache_free(nat_entry_set_slab, set);
+ /* Allow dirty nats by node block allocation in write_begin */
+ if (!set->entry_cnt) {
+ radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
+ kmem_cache_free(nat_entry_set_slab, set);
+ }
+ return 0;
}
/*
* This function is called during the checkpointing process.
*/
-void flush_nat_entries(struct f2fs_sb_info *sbi)
+int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
@@ -2251,9 +2810,17 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
unsigned int found;
nid_t set_idx = 0;
LIST_HEAD(sets);
+ int err = 0;
+
+ /* during unmount, let's flush nat_bits before checking dirty_nat_cnt */
+ if (enabled_nat_bits(sbi, cpc)) {
+ down_write(&nm_i->nat_tree_lock);
+ remove_nats_in_journal(sbi);
+ up_write(&nm_i->nat_tree_lock);
+ }
if (!nm_i->dirty_nat_cnt)
- return;
+ return 0;
down_write(&nm_i->nat_tree_lock);
@@ -2262,7 +2829,8 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
* entries, remove all entries from journal and merge them
* into nat entry set.
*/
- if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
+ if (enabled_nat_bits(sbi, cpc) ||
+ !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
remove_nats_in_journal(sbi);
while ((found = __gang_lookup_nat_set(nm_i,
@@ -2275,12 +2843,95 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
}
/* flush dirty nats in nat entry set */
- list_for_each_entry_safe(set, tmp, &sets, set_list)
- __flush_nat_entry_set(sbi, set);
+ list_for_each_entry_safe(set, tmp, &sets, set_list) {
+ err = __flush_nat_entry_set(sbi, set, cpc);
+ if (err)
+ break;
+ }
up_write(&nm_i->nat_tree_lock);
+ /* Allow dirty nats by node block allocation in write_begin */
+
+ return err;
+}
+
+static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE;
+ unsigned int i;
+ __u64 cp_ver = cur_cp_version(ckpt);
+ block_t nat_bits_addr;
+
+ if (!enabled_nat_bits(sbi, NULL))
+ return 0;
+
+ nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8);
+ nm_i->nat_bits = f2fs_kzalloc(sbi,
+ nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL);
+ if (!nm_i->nat_bits)
+ return -ENOMEM;
+
+ nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg -
+ nm_i->nat_bits_blocks;
+ for (i = 0; i < nm_i->nat_bits_blocks; i++) {
+ struct page *page;
+
+ page = f2fs_get_meta_page(sbi, nat_bits_addr++);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS),
+ page_address(page), F2FS_BLKSIZE);
+ f2fs_put_page(page, 1);
+ }
+
+ cp_ver |= (cur_cp_crc(ckpt) << 32);
+ if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) {
+ disable_nat_bits(sbi, true);
+ return 0;
+ }
+
+ nm_i->full_nat_bits = nm_i->nat_bits + 8;
+ nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
+
+ f2fs_msg(sbi->sb, KERN_NOTICE, "Found nat_bits in checkpoint");
+ return 0;
+}
+
+static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ unsigned int i = 0;
+ nid_t nid, last_nid;
+
+ if (!enabled_nat_bits(sbi, NULL))
+ return;
+
+ for (i = 0; i < nm_i->nat_blocks; i++) {
+ i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i);
+ if (i >= nm_i->nat_blocks)
+ break;
+
+ __set_bit_le(i, nm_i->nat_block_bitmap);
+
+ nid = i * NAT_ENTRY_PER_BLOCK;
+ last_nid = nid + NAT_ENTRY_PER_BLOCK;
+
+ spin_lock(&NM_I(sbi)->nid_list_lock);
+ for (; nid < last_nid; nid++)
+ update_free_nid_bitmap(sbi, nid, true, true);
+ spin_unlock(&NM_I(sbi)->nid_list_lock);
+ }
- f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
+ for (i = 0; i < nm_i->nat_blocks; i++) {
+ i = find_next_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i);
+ if (i >= nm_i->nat_blocks)
+ break;
+
+ __set_bit_le(i, nm_i->nat_block_bitmap);
+ }
}
static int init_node_manager(struct f2fs_sb_info *sbi)
@@ -2288,19 +2939,21 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
struct f2fs_nm_info *nm_i = NM_I(sbi);
unsigned char *version_bitmap;
- unsigned int nat_segs, nat_blocks;
+ unsigned int nat_segs;
+ int err;
nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
/* segment_count_nat includes pair segment so divide to 2. */
nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
- nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
-
- nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
+ nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
+ nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks;
/* not used nids: 0, node, meta, (and root counted as valid node) */
- nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM;
- nm_i->fcnt = 0;
+ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count -
+ sbi->nquota_files - F2FS_RESERVED_NODE_NUM;
+ nm_i->nid_cnt[FREE_NID] = 0;
+ nm_i->nid_cnt[PREALLOC_NID] = 0;
nm_i->nat_cnt = 0;
nm_i->ram_thresh = DEF_RAM_THRESHOLD;
nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
@@ -2311,9 +2964,10 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO);
INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO);
INIT_LIST_HEAD(&nm_i->nat_entries);
+ spin_lock_init(&nm_i->nat_list_lock);
mutex_init(&nm_i->build_lock);
- spin_lock_init(&nm_i->free_nid_list_lock);
+ spin_lock_init(&nm_i->nid_list_lock);
init_rwsem(&nm_i->nat_tree_lock);
nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
@@ -2326,14 +2980,60 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
GFP_KERNEL);
if (!nm_i->nat_bitmap)
return -ENOMEM;
+
+ err = __get_nat_bitmaps(sbi);
+ if (err)
+ return err;
+
+#ifdef CONFIG_F2FS_CHECK_FS
+ nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size,
+ GFP_KERNEL);
+ if (!nm_i->nat_bitmap_mir)
+ return -ENOMEM;
+#endif
+
return 0;
}
-int build_node_manager(struct f2fs_sb_info *sbi)
+static int init_free_nid_cache(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ int i;
+
+ nm_i->free_nid_bitmap =
+ f2fs_kzalloc(sbi, array_size(sizeof(unsigned char *),
+ nm_i->nat_blocks),
+ GFP_KERNEL);
+ if (!nm_i->free_nid_bitmap)
+ return -ENOMEM;
+
+ for (i = 0; i < nm_i->nat_blocks; i++) {
+ nm_i->free_nid_bitmap[i] = f2fs_kvzalloc(sbi,
+ f2fs_bitmap_size(NAT_ENTRY_PER_BLOCK), GFP_KERNEL);
+ if (!nm_i->free_nid_bitmap[i])
+ return -ENOMEM;
+ }
+
+ nm_i->nat_block_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks / 8,
+ GFP_KERNEL);
+ if (!nm_i->nat_block_bitmap)
+ return -ENOMEM;
+
+ nm_i->free_nid_count =
+ f2fs_kvzalloc(sbi, array_size(sizeof(unsigned short),
+ nm_i->nat_blocks),
+ GFP_KERNEL);
+ if (!nm_i->free_nid_count)
+ return -ENOMEM;
+ return 0;
+}
+
+int f2fs_build_node_manager(struct f2fs_sb_info *sbi)
{
int err;
- sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL);
+ sbi->nm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_nm_info),
+ GFP_KERNEL);
if (!sbi->nm_info)
return -ENOMEM;
@@ -2341,11 +3041,17 @@ int build_node_manager(struct f2fs_sb_info *sbi)
if (err)
return err;
- build_free_nids(sbi);
- return 0;
+ err = init_free_nid_cache(sbi);
+ if (err)
+ return err;
+
+ /* load free nid status from nat_bits table */
+ load_free_nid_bitmap(sbi);
+
+ return f2fs_build_free_nids(sbi, true, true);
}
-void destroy_node_manager(struct f2fs_sb_info *sbi)
+void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i, *next_i;
@@ -2358,17 +3064,17 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
return;
/* destroy free nid list */
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
- f2fs_bug_on(sbi, i->state == NID_ALLOC);
- __del_from_free_nid_list(nm_i, i);
- nm_i->fcnt--;
- spin_unlock(&nm_i->free_nid_list_lock);
+ __remove_free_nid(sbi, i, FREE_NID);
+ spin_unlock(&nm_i->nid_list_lock);
kmem_cache_free(free_nid_slab, i);
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
}
- f2fs_bug_on(sbi, nm_i->fcnt);
- spin_unlock(&nm_i->free_nid_list_lock);
+ f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID]);
+ f2fs_bug_on(sbi, nm_i->nid_cnt[PREALLOC_NID]);
+ f2fs_bug_on(sbi, !list_empty(&nm_i->free_nid_list));
+ spin_unlock(&nm_i->nid_list_lock);
/* destroy nat cache */
down_write(&nm_i->nat_tree_lock);
@@ -2377,8 +3083,13 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
unsigned idx;
nid = nat_get_nid(natvec[found - 1]) + 1;
- for (idx = 0; idx < found; idx++)
+ for (idx = 0; idx < found; idx++) {
+ spin_lock(&nm_i->nat_list_lock);
+ list_del(&natvec[idx]->list);
+ spin_unlock(&nm_i->nat_list_lock);
+
__del_from_nat_cache(nm_i, natvec[idx]);
+ }
}
f2fs_bug_on(sbi, nm_i->nat_cnt);
@@ -2398,12 +3109,26 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
}
up_write(&nm_i->nat_tree_lock);
- kfree(nm_i->nat_bitmap);
+ kvfree(nm_i->nat_block_bitmap);
+ if (nm_i->free_nid_bitmap) {
+ int i;
+
+ for (i = 0; i < nm_i->nat_blocks; i++)
+ kvfree(nm_i->free_nid_bitmap[i]);
+ kvfree(nm_i->free_nid_bitmap);
+ }
+ kvfree(nm_i->free_nid_count);
+
+ kvfree(nm_i->nat_bitmap);
+ kvfree(nm_i->nat_bits);
+#ifdef CONFIG_F2FS_CHECK_FS
+ kvfree(nm_i->nat_bitmap_mir);
+#endif
sbi->nm_info = NULL;
- kfree(nm_i);
+ kvfree(nm_i);
}
-int __init create_node_manager_caches(void)
+int __init f2fs_create_node_manager_caches(void)
{
nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
sizeof(struct nat_entry));
@@ -2419,8 +3144,15 @@ int __init create_node_manager_caches(void)
sizeof(struct nat_entry_set));
if (!nat_entry_set_slab)
goto destroy_free_nid;
+
+ fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry",
+ sizeof(struct fsync_node_entry));
+ if (!fsync_node_entry_slab)
+ goto destroy_nat_entry_set;
return 0;
+destroy_nat_entry_set:
+ kmem_cache_destroy(nat_entry_set_slab);
destroy_free_nid:
kmem_cache_destroy(free_nid_slab);
destroy_nat_entry:
@@ -2429,8 +3161,9 @@ fail:
return -ENOMEM;
}
-void destroy_node_manager_caches(void)
+void f2fs_destroy_node_manager_caches(void)
{
+ kmem_cache_destroy(fsync_node_entry_slab);
kmem_cache_destroy(nat_entry_set_slab);
kmem_cache_destroy(free_nid_slab);
kmem_cache_destroy(nat_entry_slab);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 868bec65e51c..e05af5df5648 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -1,18 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/node.h
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
/* start node id of a node block dedicated to the given node id */
-#define START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK)
+#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK)
/* node block offset on the NAT area dedicated to the given start node id */
-#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK)
+#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK)
/* # of pages to perform synchronous readahead before building free nids */
#define FREE_NID_PAGES 8
@@ -44,6 +41,7 @@ enum {
HAS_FSYNCED_INODE, /* is the inode fsynced before? */
HAS_LAST_FSYNC, /* has the latest node fsync mark? */
IS_DIRTY, /* this nat entry is dirty? */
+ IS_PREALLOC, /* nat entry is preallocated */
};
/*
@@ -62,16 +60,16 @@ struct nat_entry {
struct node_info ni; /* in-memory node information */
};
-#define nat_get_nid(nat) (nat->ni.nid)
-#define nat_set_nid(nat, n) (nat->ni.nid = n)
-#define nat_get_blkaddr(nat) (nat->ni.blk_addr)
-#define nat_set_blkaddr(nat, b) (nat->ni.blk_addr = b)
-#define nat_get_ino(nat) (nat->ni.ino)
-#define nat_set_ino(nat, i) (nat->ni.ino = i)
-#define nat_get_version(nat) (nat->ni.version)
-#define nat_set_version(nat, v) (nat->ni.version = v)
+#define nat_get_nid(nat) ((nat)->ni.nid)
+#define nat_set_nid(nat, n) ((nat)->ni.nid = (n))
+#define nat_get_blkaddr(nat) ((nat)->ni.blk_addr)
+#define nat_set_blkaddr(nat, b) ((nat)->ni.blk_addr = (b))
+#define nat_get_ino(nat) ((nat)->ni.ino)
+#define nat_set_ino(nat, i) ((nat)->ni.ino = (i))
+#define nat_get_version(nat) ((nat)->ni.version)
+#define nat_set_version(nat, v) ((nat)->ni.version = (v))
-#define inc_node_version(version) (++version)
+#define inc_node_version(version) (++(version))
static inline void copy_node_info(struct node_info *dst,
struct node_info *src)
@@ -134,12 +132,18 @@ static inline bool excess_cached_nats(struct f2fs_sb_info *sbi)
return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD;
}
+static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi)
+{
+ return get_pages(sbi, F2FS_DIRTY_NODES) >= sbi->blocks_per_seg * 8;
+}
+
enum mem_type {
FREE_NIDS, /* indicates the free nid list */
NAT_ENTRIES, /* indicates the cached nat entry */
DIRTY_DENTS, /* indicates dirty dentry pages */
INO_ENTRIES, /* indicates inode entries */
EXTENT_CACHE, /* indicates extent cache */
+ INMEM_PAGES, /* indicates inmemory pages */
BASE_CHECK, /* check kernel status */
};
@@ -150,18 +154,10 @@ struct nat_entry_set {
unsigned int entry_cnt; /* the # of nat entries in set */
};
-/*
- * For free nid mangement
- */
-enum nid_state {
- NID_NEW, /* newly added to free nid list */
- NID_ALLOC /* it is allocated */
-};
-
struct free_nid {
struct list_head list; /* for free node id list */
nid_t nid; /* node id */
- int state; /* in use or not: NID_NEW or NID_ALLOC */
+ int state; /* in use or not: FREE_NID or PREALLOC_NID */
};
static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
@@ -169,14 +165,14 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *fnid;
- spin_lock(&nm_i->free_nid_list_lock);
- if (nm_i->fcnt <= 0) {
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
+ if (nm_i->nid_cnt[FREE_NID] <= 0) {
+ spin_unlock(&nm_i->nid_list_lock);
return;
}
- fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list);
+ fnid = list_first_entry(&nm_i->free_nid_list, struct free_nid, list);
*nid = fnid->nid;
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_unlock(&nm_i->nid_list_lock);
}
/*
@@ -185,6 +181,12 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+#ifdef CONFIG_F2FS_CHECK_FS
+ if (memcmp(nm_i->nat_bitmap, nm_i->nat_bitmap_mir,
+ nm_i->bitmap_size))
+ f2fs_bug_on(sbi, 1);
+#endif
memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size);
}
@@ -193,13 +195,16 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start)
struct f2fs_nm_info *nm_i = NM_I(sbi);
pgoff_t block_off;
pgoff_t block_addr;
- int seg_off;
+ /*
+ * block_off = segment_off * 512 + off_in_segment
+ * OLD = (segment_off * 512) * 2 + off_in_segment
+ * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment
+ */
block_off = NAT_BLOCK_OFFSET(start);
- seg_off = block_off >> sbi->log_blocks_per_seg;
block_addr = (pgoff_t)(nm_i->nat_blkaddr +
- (seg_off << sbi->log_blocks_per_seg << 1) +
+ (block_off << 1) -
(block_off & (sbi->blocks_per_seg - 1)));
if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
@@ -214,11 +219,7 @@ static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi,
struct f2fs_nm_info *nm_i = NM_I(sbi);
block_addr -= nm_i->nat_blkaddr;
- if ((block_addr >> sbi->log_blocks_per_seg) % 2)
- block_addr -= sbi->blocks_per_seg;
- else
- block_addr += sbi->blocks_per_seg;
-
+ block_addr ^= 1 << sbi->log_blocks_per_seg;
return block_addr + nm_i->nat_blkaddr;
}
@@ -227,6 +228,9 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid)
unsigned int block_off = NAT_BLOCK_OFFSET(start_nid);
f2fs_change_bit(block_off, nm_i->nat_bitmap);
+#ifdef CONFIG_F2FS_CHECK_FS
+ f2fs_change_bit(block_off, nm_i->nat_bitmap_mir);
+#endif
}
static inline nid_t ino_of_node(struct page *node_page)
@@ -290,14 +294,11 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
struct f2fs_node *rn = F2FS_NODE(page);
- size_t crc_offset = le32_to_cpu(ckpt->checksum_offset);
- __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver);
+ __u64 cp_ver = cur_cp_version(ckpt);
+
+ if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG))
+ cp_ver |= (cur_cp_crc(ckpt) << 32);
- if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) {
- __u64 crc = le32_to_cpu(*((__le32 *)
- ((unsigned char *)ckpt + crc_offset)));
- cp_ver |= (crc << 32);
- }
rn->footer.cp_ver = cpu_to_le64(cp_ver);
rn->footer.next_blkaddr = cpu_to_le32(blkaddr);
}
@@ -305,15 +306,16 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
static inline bool is_recoverable_dnode(struct page *page)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
- size_t crc_offset = le32_to_cpu(ckpt->checksum_offset);
__u64 cp_ver = cur_cp_version(ckpt);
- if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) {
- __u64 crc = le32_to_cpu(*((__le32 *)
- ((unsigned char *)ckpt + crc_offset)));
- cp_ver |= (crc << 32);
- }
- return cpu_to_le64(cp_ver) == cpver_of_node(page);
+ /* Don't care crc part, if fsck.f2fs sets it. */
+ if (__is_set_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG))
+ return (cp_ver << 32) == (cpver_of_node(page) << 32);
+
+ if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG))
+ cp_ver |= (cur_cp_crc(ckpt) << 32);
+
+ return cp_ver == cpver_of_node(page);
}
/*
@@ -342,7 +344,7 @@ static inline bool IS_DNODE(struct page *node_page)
unsigned int ofs = ofs_of_node(node_page);
if (f2fs_has_xattr_block(ofs))
- return false;
+ return true;
if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK ||
ofs == 5 + 2 * NIDS_PER_BLOCK)
@@ -359,7 +361,7 @@ static inline int set_nid(struct page *p, int off, nid_t nid, bool i)
{
struct f2fs_node *rn = F2FS_NODE(p);
- f2fs_wait_on_page_writeback(p, NODE, true);
+ f2fs_wait_on_page_writeback(p, NODE, true, true);
if (i)
rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid);
@@ -423,12 +425,12 @@ static inline void clear_inline_node(struct page *page)
ClearPageChecked(page);
}
-static inline void set_cold_node(struct inode *inode, struct page *page)
+static inline void set_cold_node(struct page *page, bool is_dir)
{
struct f2fs_node *rn = F2FS_NODE(page);
unsigned int flag = le32_to_cpu(rn->footer.flag);
- if (S_ISDIR(inode->i_mode))
+ if (is_dir)
flag &= ~(0x1 << COLD_BIT_SHIFT);
else
flag |= (0x1 << COLD_BIT_SHIFT);
@@ -444,6 +446,10 @@ static inline void set_mark(struct page *page, int mark, int type)
else
flag &= ~(0x1 << type);
rn->footer.flag = cpu_to_le32(flag);
+
+#ifdef CONFIG_F2FS_CHECK_FS
+ f2fs_inode_chksum_set(F2FS_P_SB(page), page);
+#endif
}
#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT)
#define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index ab4cbb4be423..5a99911d6641 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/recovery.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
@@ -47,7 +44,7 @@
static struct kmem_cache *fsync_entry_slab;
-bool space_for_roll_forward(struct f2fs_sb_info *sbi)
+bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi)
{
s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count);
@@ -69,24 +66,42 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
}
static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi,
- struct list_head *head, nid_t ino)
+ struct list_head *head, nid_t ino, bool quota_inode)
{
struct inode *inode;
struct fsync_inode_entry *entry;
+ int err;
inode = f2fs_iget_retry(sbi->sb, ino);
if (IS_ERR(inode))
return ERR_CAST(inode);
+ err = dquot_initialize(inode);
+ if (err)
+ goto err_out;
+
+ if (quota_inode) {
+ err = dquot_alloc_inode(inode);
+ if (err)
+ goto err_out;
+ }
+
entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
entry->inode = inode;
list_add_tail(&entry->list, head);
return entry;
+err_out:
+ iput(inode);
+ return ERR_PTR(err);
}
-static void del_fsync_inode(struct fsync_inode_entry *entry)
+static void del_fsync_inode(struct fsync_inode_entry *entry, int drop)
{
+ if (drop) {
+ /* inode should not be recovered, drop it */
+ f2fs_inode_synced(entry->inode);
+ }
iput(entry->inode);
list_del(&entry->list);
kmem_cache_free(fsync_entry_slab, entry);
@@ -107,7 +122,8 @@ static int recover_dentry(struct inode *inode, struct page *ipage,
entry = get_fsync_inode(dir_list, pino);
if (!entry) {
- entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino);
+ entry = add_fsync_inode(F2FS_I_SB(inode), dir_list,
+ pino, false);
if (IS_ERR(entry)) {
dir = ERR_CAST(entry);
err = PTR_ERR(entry);
@@ -129,7 +145,7 @@ static int recover_dentry(struct inode *inode, struct page *ipage,
retry:
de = __f2fs_find_entry(dir, &fname, &page);
if (de && inode->i_ino == le32_to_cpu(de->ino))
- goto out_unmap_put;
+ goto out_put;
if (de) {
einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino));
@@ -138,12 +154,19 @@ retry:
err = PTR_ERR(einode);
if (err == -ENOENT)
err = -EEXIST;
- goto out_unmap_put;
+ goto out_put;
}
- err = acquire_orphan_inode(F2FS_I_SB(inode));
+
+ err = dquot_initialize(einode);
+ if (err) {
+ iput(einode);
+ goto out_put;
+ }
+
+ err = f2fs_acquire_orphan_inode(F2FS_I_SB(inode));
if (err) {
iput(einode);
- goto out_unmap_put;
+ goto out_put;
}
f2fs_delete_entry(de, page, dir, einode);
iput(einode);
@@ -151,15 +174,14 @@ retry:
} else if (IS_ERR(page)) {
err = PTR_ERR(page);
} else {
- err = __f2fs_do_add_link(dir, &fname, inode,
+ err = f2fs_add_dentry(dir, &fname, inode,
inode->i_ino, inode->i_mode);
}
if (err == -ENOMEM)
goto retry;
goto out;
-out_unmap_put:
- f2fs_dentry_kunmap(dir, page);
+out_put:
f2fs_put_page(page, 0);
out:
if (file_enc_name(inode))
@@ -173,60 +195,110 @@ out:
return err;
}
-static void recover_inode(struct inode *inode, struct page *page)
+static int recover_quota_data(struct inode *inode, struct page *page)
+{
+ struct f2fs_inode *raw = F2FS_INODE(page);
+ struct iattr attr;
+ uid_t i_uid = le32_to_cpu(raw->i_uid);
+ gid_t i_gid = le32_to_cpu(raw->i_gid);
+ int err;
+
+ memset(&attr, 0, sizeof(attr));
+
+ attr.ia_uid = make_kuid(inode->i_sb->s_user_ns, i_uid);
+ attr.ia_gid = make_kgid(inode->i_sb->s_user_ns, i_gid);
+
+ if (!uid_eq(attr.ia_uid, inode->i_uid))
+ attr.ia_valid |= ATTR_UID;
+ if (!gid_eq(attr.ia_gid, inode->i_gid))
+ attr.ia_valid |= ATTR_GID;
+
+ if (!attr.ia_valid)
+ return 0;
+
+ err = dquot_transfer(inode, &attr);
+ if (err)
+ set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR);
+ return err;
+}
+
+static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri)
+{
+ if (ri->i_inline & F2FS_PIN_FILE)
+ set_inode_flag(inode, FI_PIN_FILE);
+ else
+ clear_inode_flag(inode, FI_PIN_FILE);
+ if (ri->i_inline & F2FS_DATA_EXIST)
+ set_inode_flag(inode, FI_DATA_EXIST);
+ else
+ clear_inode_flag(inode, FI_DATA_EXIST);
+}
+
+static int recover_inode(struct inode *inode, struct page *page)
{
struct f2fs_inode *raw = F2FS_INODE(page);
char *name;
+ int err;
inode->i_mode = le16_to_cpu(raw->i_mode);
+
+ err = recover_quota_data(inode, page);
+ if (err)
+ return err;
+
+ i_uid_write(inode, le32_to_cpu(raw->i_uid));
+ i_gid_write(inode, le32_to_cpu(raw->i_gid));
+
+ if (raw->i_inline & F2FS_EXTRA_ATTR) {
+ if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)) &&
+ F2FS_FITS_IN_INODE(raw, le16_to_cpu(raw->i_extra_isize),
+ i_projid)) {
+ projid_t i_projid;
+
+ i_projid = (projid_t)le32_to_cpu(raw->i_projid);
+ F2FS_I(inode)->i_projid =
+ make_kprojid(&init_user_ns, i_projid);
+ }
+ }
+
f2fs_i_size_write(inode, le64_to_cpu(raw->i_size));
- inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime);
+ inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime);
inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
- inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
+ inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec);
inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
+ F2FS_I(inode)->i_advise = raw->i_advise;
+ F2FS_I(inode)->i_flags = le32_to_cpu(raw->i_flags);
+ f2fs_set_inode_flags(inode);
+ F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] =
+ le16_to_cpu(raw->i_gc_failures);
+
+ recover_inline_flags(inode, raw);
+
+ f2fs_mark_inode_dirty_sync(inode, true);
+
if (file_enc_name(inode))
name = "<encrypted>";
else
name = F2FS_INODE(page)->i_name;
- f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
- ino_of_node(page), name);
-}
-
-static bool is_same_inode(struct inode *inode, struct page *ipage)
-{
- struct f2fs_inode *ri = F2FS_INODE(ipage);
- struct timespec disk;
-
- if (!IS_INODE(ipage))
- return true;
-
- disk.tv_sec = le64_to_cpu(ri->i_ctime);
- disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
- if (timespec_compare(&inode->i_ctime, &disk) > 0)
- return false;
-
- disk.tv_sec = le64_to_cpu(ri->i_atime);
- disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
- if (timespec_compare(&inode->i_atime, &disk) > 0)
- return false;
-
- disk.tv_sec = le64_to_cpu(ri->i_mtime);
- disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
- if (timespec_compare(&inode->i_mtime, &disk) > 0)
- return false;
-
- return true;
+ f2fs_msg(inode->i_sb, KERN_NOTICE,
+ "recover_inode: ino = %x, name = %s, inline = %x",
+ ino_of_node(page), name, raw->i_inline);
+ return 0;
}
-static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
+static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
+ bool check_only)
{
struct curseg_info *curseg;
struct page *page = NULL;
block_t blkaddr;
+ unsigned int loop_cnt = 0;
+ unsigned int free_blocks = MAIN_SEGS(sbi) * sbi->blocks_per_seg -
+ valid_user_blocks(sbi);
int err = 0;
/* get node pages in the current segment */
@@ -239,7 +311,11 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR))
return 0;
- page = get_tmp_page(sbi, blkaddr);
+ page = f2fs_get_tmp_page(sbi, blkaddr);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ break;
+ }
if (!is_recoverable_dnode(page))
break;
@@ -248,21 +324,23 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
goto next;
entry = get_fsync_inode(head, ino_of_node(page));
- if (entry) {
- if (!is_same_inode(entry->inode, page))
- goto next;
- } else {
- if (IS_INODE(page) && is_dent_dnode(page)) {
- err = recover_inode_page(sbi, page);
+ if (!entry) {
+ bool quota_inode = false;
+
+ if (!check_only &&
+ IS_INODE(page) && is_dent_dnode(page)) {
+ err = f2fs_recover_inode_page(sbi, page);
if (err)
break;
+ quota_inode = true;
}
/*
* CP | dnode(F) | inode(DF)
* For this case, we should not give up now.
*/
- entry = add_fsync_inode(sbi, head, ino_of_node(page));
+ entry = add_fsync_inode(sbi, head, ino_of_node(page),
+ quota_inode);
if (IS_ERR(entry)) {
err = PTR_ERR(entry);
if (err == -ENOENT) {
@@ -277,22 +355,33 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
if (IS_INODE(page) && is_dent_dnode(page))
entry->last_dentry = blkaddr;
next:
+ /* sanity check in order to detect looped node chain */
+ if (++loop_cnt >= free_blocks ||
+ blkaddr == next_blkaddr_of_node(page)) {
+ f2fs_msg(sbi->sb, KERN_NOTICE,
+ "%s: detect looped node chain, "
+ "blkaddr:%u, next:%u",
+ __func__, blkaddr, next_blkaddr_of_node(page));
+ err = -EINVAL;
+ break;
+ }
+
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
f2fs_put_page(page, 1);
- ra_meta_pages_cond(sbi, blkaddr);
+ f2fs_ra_meta_pages_cond(sbi, blkaddr);
}
f2fs_put_page(page, 1);
return err;
}
-static void destroy_fsync_dnodes(struct list_head *head)
+static void destroy_fsync_dnodes(struct list_head *head, int drop)
{
struct fsync_inode_entry *entry, *tmp;
list_for_each_entry_safe(entry, tmp, head, list)
- del_fsync_inode(entry);
+ del_fsync_inode(entry, drop);
}
static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
@@ -324,7 +413,9 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
}
}
- sum_page = get_sum_page(sbi, segno);
+ sum_page = f2fs_get_sum_page(sbi, segno);
+ if (IS_ERR(sum_page))
+ return PTR_ERR(sum_page);
sum_node = (struct f2fs_summary_block *)page_address(sum_page);
sum = sum_node->entries[blkoff];
f2fs_put_page(sum_page, 1);
@@ -344,7 +435,7 @@ got_it:
}
/* Get the node page */
- node_page = get_node_page(sbi, nid);
+ node_page = f2fs_get_node_page(sbi, nid);
if (IS_ERR(node_page))
return PTR_ERR(node_page);
@@ -353,15 +444,24 @@ got_it:
f2fs_put_page(node_page, 1);
if (ino != dn->inode->i_ino) {
+ int ret;
+
/* Deallocate previous index in the node page */
inode = f2fs_iget_retry(sbi->sb, ino);
if (IS_ERR(inode))
return PTR_ERR(inode);
+
+ ret = dquot_initialize(inode);
+ if (ret) {
+ iput(inode);
+ return ret;
+ }
} else {
inode = dn->inode;
}
- bidx = start_bidx_of_node(offset, inode) + le16_to_cpu(sum.ofs_in_node);
+ bidx = f2fs_start_bidx_of_node(offset, inode) +
+ le16_to_cpu(sum.ofs_in_node);
/*
* if inode page is locked, unlock temporarily, but its reference
@@ -371,11 +471,11 @@ got_it:
unlock_page(dn->inode_page);
set_new_dnode(&tdn, inode, NULL, NULL, 0);
- if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE))
+ if (f2fs_get_dnode_of_data(&tdn, bidx, LOOKUP_NODE))
goto out;
if (tdn.data_blkaddr == blkaddr)
- truncate_data_blocks_range(&tdn, 1);
+ f2fs_truncate_data_blocks_range(&tdn, 1);
f2fs_put_dnode(&tdn);
out:
@@ -386,15 +486,16 @@ out:
return 0;
truncate_out:
- if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr)
- truncate_data_blocks_range(&tdn, 1);
+ if (datablock_addr(tdn.inode, tdn.node_page,
+ tdn.ofs_in_node) == blkaddr)
+ f2fs_truncate_data_blocks_range(&tdn, 1);
if (dn->inode->i_ino == nid && !dn->inode_page_locked)
unlock_page(dn->inode_page);
return 0;
}
static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
- struct page *page, block_t blkaddr)
+ struct page *page)
{
struct dnode_of_data dn;
struct node_info ni;
@@ -403,27 +504,25 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
/* step 1: recover xattr */
if (IS_INODE(page)) {
- recover_inline_xattr(inode, page);
+ f2fs_recover_inline_xattr(inode, page);
} else if (f2fs_has_xattr_block(ofs_of_node(page))) {
- /*
- * Deprecated; xattr blocks should be found from cold log.
- * But, we should remain this for backward compatibility.
- */
- recover_xattr_data(inode, page, blkaddr);
+ err = f2fs_recover_xattr_data(inode, page);
+ if (!err)
+ recovered++;
goto out;
}
/* step 2: recover inline data */
- if (recover_inline_data(inode, page))
+ if (f2fs_recover_inline_data(inode, page))
goto out;
/* step 3: recover data indices */
- start = start_bidx_of_node(ofs_of_node(page), inode);
+ start = f2fs_start_bidx_of_node(ofs_of_node(page), inode);
end = start + ADDRS_PER_PAGE(page, inode);
set_new_dnode(&dn, inode, NULL, NULL, 0);
retry_dn:
- err = get_dnode_of_data(&dn, start, ALLOC_NODE);
+ err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE);
if (err) {
if (err == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC, HZ/50);
@@ -432,17 +531,20 @@ retry_dn:
goto out;
}
- f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
+ f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true);
+
+ err = f2fs_get_node_info(sbi, dn.nid, &ni);
+ if (err)
+ goto err;
- get_node_info(sbi, dn.nid, &ni);
f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page));
for (; start < end; start++, dn.ofs_in_node++) {
block_t src, dest;
- src = datablock_addr(dn.node_page, dn.ofs_in_node);
- dest = datablock_addr(page, dn.ofs_in_node);
+ src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node);
+ dest = datablock_addr(dn.inode, page, dn.ofs_in_node);
/* skip recovering if dest is the same as src */
if (src == dest)
@@ -450,20 +552,22 @@ retry_dn:
/* dest is invalid, just invalidate src block */
if (dest == NULL_ADDR) {
- truncate_data_blocks_range(&dn, 1);
+ f2fs_truncate_data_blocks_range(&dn, 1);
continue;
}
- if ((start + 1) << PAGE_SHIFT > i_size_read(inode))
- f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT);
+ if (!file_keep_isize(inode) &&
+ (i_size_read(inode) <= ((loff_t)start << PAGE_SHIFT)))
+ f2fs_i_size_write(inode,
+ (loff_t)(start + 1) << PAGE_SHIFT);
/*
* dest is reserved block, invalidate src block
* and then reserve one new block in dnode page.
*/
if (dest == NEW_ADDR) {
- truncate_data_blocks_range(&dn, 1);
- reserve_new_block(&dn);
+ f2fs_truncate_data_blocks_range(&dn, 1);
+ f2fs_reserve_new_block(&dn);
continue;
}
@@ -471,11 +575,10 @@ retry_dn:
if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) {
if (src == NULL_ADDR) {
- err = reserve_new_block(&dn);
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- while (err)
- err = reserve_new_block(&dn);
-#endif
+ err = f2fs_reserve_new_block(&dn);
+ while (err &&
+ IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION))
+ err = f2fs_reserve_new_block(&dn);
/* We should not get -ENOSPC */
f2fs_bug_on(sbi, err);
if (err)
@@ -507,13 +610,15 @@ err:
f2fs_put_dnode(&dn);
out:
f2fs_msg(sbi->sb, KERN_NOTICE,
- "recover_data: ino = %lx, recovered = %d blocks, err = %d",
- inode->i_ino, recovered, err);
+ "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d",
+ inode->i_ino,
+ file_keep_isize(inode) ? "keep" : "recover",
+ recovered, err);
return err;
}
static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
- struct list_head *dir_list)
+ struct list_head *tmp_inode_list, struct list_head *dir_list)
{
struct curseg_info *curseg;
struct page *page = NULL;
@@ -530,9 +635,13 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR))
break;
- ra_meta_pages_cond(sbi, blkaddr);
+ f2fs_ra_meta_pages_cond(sbi, blkaddr);
- page = get_tmp_page(sbi, blkaddr);
+ page = f2fs_get_tmp_page(sbi, blkaddr);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ break;
+ }
if (!is_recoverable_dnode(page)) {
f2fs_put_page(page, 1);
@@ -547,8 +656,11 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
* In this case, we can lose the latest inode(x).
* So, call recover_inode for the inode update.
*/
- if (IS_INODE(page))
- recover_inode(entry->inode, page);
+ if (IS_INODE(page)) {
+ err = recover_inode(entry->inode, page);
+ if (err)
+ break;
+ }
if (entry->last_dentry == blkaddr) {
err = recover_dentry(entry->inode, page, dir_list);
if (err) {
@@ -556,65 +668,86 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
break;
}
}
- err = do_recover_data(sbi, entry->inode, page, blkaddr);
+ err = do_recover_data(sbi, entry->inode, page);
if (err) {
f2fs_put_page(page, 1);
break;
}
if (entry->blkaddr == blkaddr)
- del_fsync_inode(entry);
+ list_move_tail(&entry->list, tmp_inode_list);
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
f2fs_put_page(page, 1);
}
if (!err)
- allocate_new_segments(sbi);
+ f2fs_allocate_new_segments(sbi);
return err;
}
-int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
+int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
{
- struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
- struct list_head inode_list;
+ struct list_head inode_list, tmp_inode_list;
struct list_head dir_list;
- block_t blkaddr;
int err;
int ret = 0;
+ unsigned long s_flags = sbi->sb->s_flags;
bool need_writecp = false;
+#ifdef CONFIG_QUOTA
+ int quota_enabled;
+#endif
+
+ if (s_flags & MS_RDONLY) {
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "recover fsync data on readonly fs");
+ sbi->sb->s_flags &= ~MS_RDONLY;
+ }
+
+#ifdef CONFIG_QUOTA
+ /* Needed for iput() to work correctly and not trash data */
+ sbi->sb->s_flags |= MS_ACTIVE;
+ /* Turn on quotas so that they are updated correctly */
+ quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY);
+#endif
fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
sizeof(struct fsync_inode_entry));
- if (!fsync_entry_slab)
- return -ENOMEM;
+ if (!fsync_entry_slab) {
+ err = -ENOMEM;
+ goto out;
+ }
INIT_LIST_HEAD(&inode_list);
+ INIT_LIST_HEAD(&tmp_inode_list);
INIT_LIST_HEAD(&dir_list);
/* prevent checkpoint */
mutex_lock(&sbi->cp_mutex);
- blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
-
/* step #1: find fsynced inode numbers */
- err = find_fsync_dnodes(sbi, &inode_list);
+ err = find_fsync_dnodes(sbi, &inode_list, check_only);
if (err || list_empty(&inode_list))
- goto out;
+ goto skip;
if (check_only) {
ret = 1;
- goto out;
+ goto skip;
}
need_writecp = true;
/* step #2: recover data */
- err = recover_data(sbi, &inode_list, &dir_list);
+ err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list);
if (!err)
f2fs_bug_on(sbi, !list_empty(&inode_list));
-out:
- destroy_fsync_dnodes(&inode_list);
+ else {
+ /* restore s_flags to let iput() trash data */
+ sbi->sb->s_flags = s_flags;
+ }
+skip:
+ destroy_fsync_dnodes(&inode_list, err);
+ destroy_fsync_dnodes(&tmp_inode_list, err);
/* truncate meta pages to be used by the recovery */
truncate_inode_pages_range(META_MAPPING(sbi),
@@ -623,21 +756,33 @@ out:
if (err) {
truncate_inode_pages_final(NODE_MAPPING(sbi));
truncate_inode_pages_final(META_MAPPING(sbi));
+ } else {
+ clear_sbi_flag(sbi, SBI_POR_DOING);
}
-
- clear_sbi_flag(sbi, SBI_POR_DOING);
mutex_unlock(&sbi->cp_mutex);
/* let's drop all the directory inodes for clean checkpoint */
- destroy_fsync_dnodes(&dir_list);
+ destroy_fsync_dnodes(&dir_list, err);
- if (!err && need_writecp) {
- struct cp_control cpc = {
- .reason = CP_RECOVERY,
- };
- err = write_checkpoint(sbi, &cpc);
+ if (need_writecp) {
+ set_sbi_flag(sbi, SBI_IS_RECOVERED);
+
+ if (!err) {
+ struct cp_control cpc = {
+ .reason = CP_RECOVERY,
+ };
+ err = f2fs_write_checkpoint(sbi, &cpc);
+ }
}
kmem_cache_destroy(fsync_entry_slab);
+out:
+#ifdef CONFIG_QUOTA
+ /* Turn quotas off */
+ if (quota_enabled)
+ f2fs_quota_off_umount(sbi->sb);
+#endif
+ sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+
return ret ? ret: err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 2fb99a081de8..eaf7be527b67 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/segment.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
@@ -16,17 +13,20 @@
#include <linux/kthread.h>
#include <linux/swap.h>
#include <linux/timer.h>
+#include <linux/freezer.h>
+#include <linux/sched.h>
#include "f2fs.h"
#include "segment.h"
#include "node.h"
+#include "gc.h"
#include "trace.h"
#include <trace/events/f2fs.h>
#define __reverse_ffz(x) __reverse_ffs(~(x))
static struct kmem_cache *discard_entry_slab;
-static struct kmem_cache *bio_entry_slab;
+static struct kmem_cache *discard_cmd_slab;
static struct kmem_cache *sit_entry_set_slab;
static struct kmem_cache *inmem_entry_slab;
@@ -166,8 +166,26 @@ found:
return result - size + __reverse_ffz(tmp);
}
-void register_inmem_page(struct inode *inode, struct page *page)
+bool f2fs_need_SSR(struct f2fs_sb_info *sbi)
{
+ int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
+ int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
+ int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
+
+ if (test_opt(sbi, LFS))
+ return false;
+ if (sbi->gc_mode == GC_URGENT)
+ return true;
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+ return true;
+
+ return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs +
+ SM_I(sbi)->min_ssr_sections + reserved_sections(sbi));
+}
+
+void f2fs_register_inmem_page(struct inode *inode, struct page *page)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct inmem_pages *new;
@@ -186,6 +204,10 @@ void register_inmem_page(struct inode *inode, struct page *page)
mutex_lock(&fi->inmem_lock);
get_page(page);
list_add_tail(&new->list, &fi->inmem_pages);
+ spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+ if (list_empty(&fi->inmem_ilist))
+ list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]);
+ spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
mutex_unlock(&fi->inmem_lock);
@@ -207,28 +229,47 @@ static int __revoke_inmem_pages(struct inode *inode,
lock_page(page);
- f2fs_wait_on_page_writeback(page, DATA, true);
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
if (recover) {
struct dnode_of_data dn;
struct node_info ni;
trace_f2fs_commit_inmem_page(page, INMEM_REVOKE);
-
+retry:
set_new_dnode(&dn, inode, NULL, NULL, 0);
- if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) {
+ err = f2fs_get_dnode_of_data(&dn, page->index,
+ LOOKUP_NODE);
+ if (err) {
+ if (err == -ENOMEM) {
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ cond_resched();
+ goto retry;
+ }
err = -EAGAIN;
goto next;
}
- get_node_info(sbi, dn.nid, &ni);
- f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
+
+ err = f2fs_get_node_info(sbi, dn.nid, &ni);
+ if (err) {
+ f2fs_put_dnode(&dn);
+ return err;
+ }
+
+ if (cur->old_addr == NEW_ADDR) {
+ f2fs_invalidate_blocks(sbi, dn.data_blkaddr);
+ f2fs_update_data_blkaddr(&dn, NEW_ADDR);
+ } else
+ f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
cur->old_addr, ni.version, true, true);
f2fs_put_dnode(&dn);
}
next:
/* we don't need to invalidate this in the sccessful status */
- if (drop || recover)
+ if (drop || recover) {
ClearPageUptodate(page);
+ clear_cold_data(page);
+ }
set_page_private(page, 0);
ClearPagePrivate(page);
f2fs_put_page(page, 1);
@@ -240,33 +281,105 @@ next:
return err;
}
-void drop_inmem_pages(struct inode *inode)
+void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure)
{
+ struct list_head *head = &sbi->inode_list[ATOMIC_FILE];
+ struct inode *inode;
+ struct f2fs_inode_info *fi;
+next:
+ spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+ if (list_empty(head)) {
+ spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+ return;
+ }
+ fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist);
+ inode = igrab(&fi->vfs_inode);
+ spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+
+ if (inode) {
+ if (gc_failure) {
+ if (fi->i_gc_failures[GC_FAILURE_ATOMIC])
+ goto drop;
+ goto skip;
+ }
+drop:
+ set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
+ f2fs_drop_inmem_pages(inode);
+ iput(inode);
+ }
+skip:
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ cond_resched();
+ goto next;
+}
+
+void f2fs_drop_inmem_pages(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
+ mutex_lock(&fi->inmem_lock);
+ __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
+ spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+ if (!list_empty(&fi->inmem_ilist))
+ list_del_init(&fi->inmem_ilist);
+ spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+ mutex_unlock(&fi->inmem_lock);
+
clear_inode_flag(inode, FI_ATOMIC_FILE);
+ fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0;
+ stat_dec_atomic_write(inode);
+}
+
+void f2fs_drop_inmem_page(struct inode *inode, struct page *page)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct list_head *head = &fi->inmem_pages;
+ struct inmem_pages *cur = NULL;
+
+ f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page));
mutex_lock(&fi->inmem_lock);
- __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
+ list_for_each_entry(cur, head, list) {
+ if (cur->page == page)
+ break;
+ }
+
+ f2fs_bug_on(sbi, list_empty(head) || cur->page != page);
+ list_del(&cur->list);
mutex_unlock(&fi->inmem_lock);
+
+ dec_page_count(sbi, F2FS_INMEM_PAGES);
+ kmem_cache_free(inmem_entry_slab, cur);
+
+ ClearPageUptodate(page);
+ set_page_private(page, 0);
+ ClearPagePrivate(page);
+ f2fs_put_page(page, 0);
+
+ trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE);
}
-static int __commit_inmem_pages(struct inode *inode,
- struct list_head *revoke_list)
+static int __f2fs_commit_inmem_pages(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct inmem_pages *cur, *tmp;
struct f2fs_io_info fio = {
.sbi = sbi,
+ .ino = inode->i_ino,
.type = DATA,
.op = REQ_OP_WRITE,
- .op_flags = WRITE_SYNC | REQ_PRIO,
- .encrypted_page = NULL,
+ .op_flags = REQ_SYNC | REQ_PRIO,
+ .io_type = FS_DATA_IO,
};
+ struct list_head revoke_list;
bool submit_bio = false;
int err = 0;
+ INIT_LIST_HEAD(&revoke_list);
+
list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
struct page *page = cur->page;
@@ -274,70 +387,86 @@ static int __commit_inmem_pages(struct inode *inode,
if (page->mapping == inode->i_mapping) {
trace_f2fs_commit_inmem_page(page, INMEM);
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
+
set_page_dirty(page);
- f2fs_wait_on_page_writeback(page, DATA, true);
- if (clear_page_dirty_for_io(page))
+ if (clear_page_dirty_for_io(page)) {
inode_dec_dirty_pages(inode);
-
+ f2fs_remove_dirty_inode(inode);
+ }
+retry:
fio.page = page;
- err = do_write_data_page(&fio);
+ fio.old_blkaddr = NULL_ADDR;
+ fio.encrypted_page = NULL;
+ fio.need_lock = LOCK_DONE;
+ err = f2fs_do_write_data_page(&fio);
if (err) {
+ if (err == -ENOMEM) {
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ cond_resched();
+ goto retry;
+ }
unlock_page(page);
break;
}
-
/* record old blkaddr for revoking */
cur->old_addr = fio.old_blkaddr;
-
- clear_cold_data(page);
submit_bio = true;
}
unlock_page(page);
- list_move_tail(&cur->list, revoke_list);
+ list_move_tail(&cur->list, &revoke_list);
}
if (submit_bio)
- f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE);
+ f2fs_submit_merged_write_cond(sbi, inode, NULL, 0, DATA);
- if (!err)
- __revoke_inmem_pages(inode, revoke_list, false, false);
+ if (err) {
+ /*
+ * try to revoke all committed pages, but still we could fail
+ * due to no memory or other reason, if that happened, EAGAIN
+ * will be returned, which means in such case, transaction is
+ * already not integrity, caller should use journal to do the
+ * recovery or rewrite & commit last transaction. For other
+ * error number, revoking was done by filesystem itself.
+ */
+ err = __revoke_inmem_pages(inode, &revoke_list, false, true);
+
+ /* drop all uncommitted pages */
+ __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
+ } else {
+ __revoke_inmem_pages(inode, &revoke_list, false, false);
+ }
return err;
}
-int commit_inmem_pages(struct inode *inode)
+int f2fs_commit_inmem_pages(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
- struct list_head revoke_list;
int err;
- INIT_LIST_HEAD(&revoke_list);
f2fs_balance_fs(sbi, true);
+
+ down_write(&fi->i_gc_rwsem[WRITE]);
+
f2fs_lock_op(sbi);
+ set_inode_flag(inode, FI_ATOMIC_COMMIT);
mutex_lock(&fi->inmem_lock);
- err = __commit_inmem_pages(inode, &revoke_list);
- if (err) {
- int ret;
- /*
- * try to revoke all committed pages, but still we could fail
- * due to no memory or other reason, if that happened, EAGAIN
- * will be returned, which means in such case, transaction is
- * already not integrity, caller should use journal to do the
- * recovery or rewrite & commit last transaction. For other
- * error number, revoking was done by filesystem itself.
- */
- ret = __revoke_inmem_pages(inode, &revoke_list, false, true);
- if (ret)
- err = ret;
+ err = __f2fs_commit_inmem_pages(inode);
- /* drop all uncommitted pages */
- __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
- }
+ spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+ if (!list_empty(&fi->inmem_ilist))
+ list_del_init(&fi->inmem_ilist);
+ spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
mutex_unlock(&fi->inmem_lock);
+ clear_inode_flag(inode, FI_ATOMIC_COMMIT);
+
f2fs_unlock_op(sbi);
+ up_write(&fi->i_gc_rwsem[WRITE]);
+
return err;
}
@@ -347,25 +476,25 @@ int commit_inmem_pages(struct inode *inode)
*/
void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
{
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (time_to_inject(sbi, FAULT_CHECKPOINT))
+ if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
+ f2fs_show_injection_info(FAULT_CHECKPOINT);
f2fs_stop_checkpoint(sbi, false);
-#endif
-
- if (!need)
- return;
+ }
/* balance_fs_bg is able to be pending */
- if (excess_cached_nats(sbi))
+ if (need && excess_cached_nats(sbi))
f2fs_balance_fs_bg(sbi);
+ if (f2fs_is_checkpoint_ready(sbi))
+ return;
+
/*
* We should do GC or end up with checkpoint, if there are so many dirty
* dir/node pages without enough free segments.
*/
if (has_not_enough_free_secs(sbi, 0, 0)) {
mutex_lock(&sbi->gc_mutex);
- f2fs_gc(sbi, false);
+ f2fs_gc(sbi, false, false, NULL_SEGNO);
}
}
@@ -375,29 +504,34 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
return;
/* try to shrink extent cache when there is no enough memory */
- if (!available_free_memory(sbi, EXTENT_CACHE))
+ if (!f2fs_available_free_memory(sbi, EXTENT_CACHE))
f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER);
/* check the # of cached NAT entries */
- if (!available_free_memory(sbi, NAT_ENTRIES))
- try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK);
+ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES))
+ f2fs_try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK);
- if (!available_free_memory(sbi, FREE_NIDS))
- try_to_free_nids(sbi, MAX_FREE_NIDS);
+ if (!f2fs_available_free_memory(sbi, FREE_NIDS))
+ f2fs_try_to_free_nids(sbi, MAX_FREE_NIDS);
else
- build_free_nids(sbi);
+ f2fs_build_free_nids(sbi, false, false);
+
+ if (!is_idle(sbi, REQ_TIME) &&
+ (!excess_dirty_nats(sbi) && !excess_dirty_nodes(sbi)))
+ return;
/* checkpoint is the only way to shrink partial cached entries */
- if (!available_free_memory(sbi, NAT_ENTRIES) ||
- !available_free_memory(sbi, INO_ENTRIES) ||
+ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES) ||
+ !f2fs_available_free_memory(sbi, INO_ENTRIES) ||
excess_prefree_segs(sbi) ||
excess_dirty_nats(sbi) ||
- (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) {
+ excess_dirty_nodes(sbi) ||
+ f2fs_time_over(sbi, CP_TIME)) {
if (test_opt(sbi, DATA_FLUSH)) {
struct blk_plug plug;
blk_start_plug(&plug);
- sync_dirty_inodes(sbi, FILE_INODE);
+ f2fs_sync_dirty_inodes(sbi, FILE_INODE);
blk_finish_plug(&plug);
}
f2fs_sync_fs(sbi->sb, true);
@@ -405,117 +539,217 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
}
}
+static int __submit_flush_wait(struct f2fs_sb_info *sbi,
+ struct block_device *bdev)
+{
+ struct bio *bio = f2fs_bio_alloc(sbi, 0, true);
+ int ret;
+
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
+ bio->bi_bdev = bdev;
+ ret = submit_bio_wait(bio);
+ bio_put(bio);
+
+ trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER),
+ test_opt(sbi, FLUSH_MERGE), ret);
+ return ret;
+}
+
+static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ int ret = 0;
+ int i;
+
+ if (!sbi->s_ndevs)
+ return __submit_flush_wait(sbi, sbi->sb->s_bdev);
+
+ for (i = 0; i < sbi->s_ndevs; i++) {
+ if (!f2fs_is_dirty_device(sbi, ino, i, FLUSH_INO))
+ continue;
+ ret = __submit_flush_wait(sbi, FDEV(i).bdev);
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
static int issue_flush_thread(void *data)
{
struct f2fs_sb_info *sbi = data;
- struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info;
+ struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info;
wait_queue_head_t *q = &fcc->flush_wait_queue;
repeat:
if (kthread_should_stop())
return 0;
+ sb_start_intwrite(sbi->sb);
+
if (!llist_empty(&fcc->issue_list)) {
- struct bio *bio;
struct flush_cmd *cmd, *next;
int ret;
- bio = f2fs_bio_alloc(0);
-
fcc->dispatch_list = llist_del_all(&fcc->issue_list);
fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
- bio->bi_bdev = sbi->sb->s_bdev;
- bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
- ret = submit_bio_wait(bio);
+ cmd = llist_entry(fcc->dispatch_list, struct flush_cmd, llnode);
+
+ ret = submit_flush_wait(sbi, cmd->ino);
+ atomic_inc(&fcc->issued_flush);
llist_for_each_entry_safe(cmd, next,
fcc->dispatch_list, llnode) {
cmd->ret = ret;
complete(&cmd->wait);
}
- bio_put(bio);
fcc->dispatch_list = NULL;
}
+ sb_end_intwrite(sbi->sb);
+
wait_event_interruptible(*q,
kthread_should_stop() || !llist_empty(&fcc->issue_list));
goto repeat;
}
-int f2fs_issue_flush(struct f2fs_sb_info *sbi)
+int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino)
{
- struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info;
+ struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info;
struct flush_cmd cmd;
-
- trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER),
- test_opt(sbi, FLUSH_MERGE));
+ int ret;
if (test_opt(sbi, NOBARRIER))
return 0;
- if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) {
- struct bio *bio = f2fs_bio_alloc(0);
- int ret;
+ if (!test_opt(sbi, FLUSH_MERGE)) {
+ atomic_inc(&fcc->queued_flush);
+ ret = submit_flush_wait(sbi, ino);
+ atomic_dec(&fcc->queued_flush);
+ atomic_inc(&fcc->issued_flush);
+ return ret;
+ }
+
+ if (atomic_inc_return(&fcc->queued_flush) == 1 || sbi->s_ndevs > 1) {
+ ret = submit_flush_wait(sbi, ino);
+ atomic_dec(&fcc->queued_flush);
- atomic_inc(&fcc->submit_flush);
- bio->bi_bdev = sbi->sb->s_bdev;
- bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
- ret = submit_bio_wait(bio);
- atomic_dec(&fcc->submit_flush);
- bio_put(bio);
+ atomic_inc(&fcc->issued_flush);
return ret;
}
+ cmd.ino = ino;
init_completion(&cmd.wait);
- atomic_inc(&fcc->submit_flush);
llist_add(&cmd.llnode, &fcc->issue_list);
- if (!fcc->dispatch_list)
+ /* update issue_list before we wake up issue_flush thread */
+ smp_mb();
+
+ if (waitqueue_active(&fcc->flush_wait_queue))
wake_up(&fcc->flush_wait_queue);
- wait_for_completion(&cmd.wait);
- atomic_dec(&fcc->submit_flush);
+ if (fcc->f2fs_issue_flush) {
+ wait_for_completion(&cmd.wait);
+ atomic_dec(&fcc->queued_flush);
+ } else {
+ struct llist_node *list;
+
+ list = llist_del_all(&fcc->issue_list);
+ if (!list) {
+ wait_for_completion(&cmd.wait);
+ atomic_dec(&fcc->queued_flush);
+ } else {
+ struct flush_cmd *tmp, *next;
+
+ ret = submit_flush_wait(sbi, ino);
+
+ llist_for_each_entry_safe(tmp, next, list, llnode) {
+ if (tmp == &cmd) {
+ cmd.ret = ret;
+ atomic_dec(&fcc->queued_flush);
+ continue;
+ }
+ tmp->ret = ret;
+ complete(&tmp->wait);
+ }
+ }
+ }
return cmd.ret;
}
-int create_flush_cmd_control(struct f2fs_sb_info *sbi)
+int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi)
{
dev_t dev = sbi->sb->s_bdev->bd_dev;
struct flush_cmd_control *fcc;
int err = 0;
- fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL);
+ if (SM_I(sbi)->fcc_info) {
+ fcc = SM_I(sbi)->fcc_info;
+ if (fcc->f2fs_issue_flush)
+ return err;
+ goto init_thread;
+ }
+
+ fcc = f2fs_kzalloc(sbi, sizeof(struct flush_cmd_control), GFP_KERNEL);
if (!fcc)
return -ENOMEM;
- atomic_set(&fcc->submit_flush, 0);
+ atomic_set(&fcc->issued_flush, 0);
+ atomic_set(&fcc->queued_flush, 0);
init_waitqueue_head(&fcc->flush_wait_queue);
init_llist_head(&fcc->issue_list);
- SM_I(sbi)->cmd_control_info = fcc;
+ SM_I(sbi)->fcc_info = fcc;
if (!test_opt(sbi, FLUSH_MERGE))
return err;
+init_thread:
fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
"f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(fcc->f2fs_issue_flush)) {
err = PTR_ERR(fcc->f2fs_issue_flush);
- kfree(fcc);
- SM_I(sbi)->cmd_control_info = NULL;
+ kvfree(fcc);
+ SM_I(sbi)->fcc_info = NULL;
return err;
}
return err;
}
-void destroy_flush_cmd_control(struct f2fs_sb_info *sbi)
+void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free)
{
- struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info;
+ struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info;
+
+ if (fcc && fcc->f2fs_issue_flush) {
+ struct task_struct *flush_thread = fcc->f2fs_issue_flush;
- if (fcc && fcc->f2fs_issue_flush)
- kthread_stop(fcc->f2fs_issue_flush);
- kfree(fcc);
- SM_I(sbi)->cmd_control_info = NULL;
+ fcc->f2fs_issue_flush = NULL;
+ kthread_stop(flush_thread);
+ }
+ if (free) {
+ kvfree(fcc);
+ SM_I(sbi)->fcc_info = NULL;
+ }
+}
+
+int f2fs_flush_device_cache(struct f2fs_sb_info *sbi)
+{
+ int ret = 0, i;
+
+ if (!sbi->s_ndevs)
+ return 0;
+
+ for (i = 1; i < sbi->s_ndevs; i++) {
+ if (!f2fs_test_bit(i, (char *)&sbi->dirty_device))
+ continue;
+ ret = __submit_flush_wait(sbi, FDEV(i).bdev);
+ if (ret)
+ break;
+
+ spin_lock(&sbi->dev_lock);
+ f2fs_clear_bit(i, (char *)&sbi->dirty_device);
+ spin_unlock(&sbi->dev_lock);
+ }
+
+ return ret;
}
static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
@@ -558,8 +792,8 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
dirty_i->nr_dirty[t]--;
- if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0)
- clear_bit(GET_SECNO(sbi, segno),
+ if (get_valid_blocks(sbi, segno, true) == 0)
+ clear_bit(GET_SEC_FROM_SEG(sbi, segno),
dirty_i->victim_secmap);
}
}
@@ -572,16 +806,18 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- unsigned short valid_blocks;
+ unsigned short valid_blocks, ckpt_valid_blocks;
if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno))
return;
mutex_lock(&dirty_i->seglist_lock);
- valid_blocks = get_valid_blocks(sbi, segno, 0);
+ valid_blocks = get_valid_blocks(sbi, segno, false);
+ ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno);
- if (valid_blocks == 0) {
+ if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) ||
+ ckpt_valid_blocks == sbi->blocks_per_seg)) {
__locate_dirty_segment(sbi, segno, PRE);
__remove_dirty_segment(sbi, segno, DIRTY);
} else if (valid_blocks < sbi->blocks_per_seg) {
@@ -594,120 +830,962 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
mutex_unlock(&dirty_i->seglist_lock);
}
-static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi,
- struct bio *bio)
+/* This moves currently empty dirty blocks to prefree. Must hold seglist_lock */
+void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ unsigned int segno;
+
+ mutex_lock(&dirty_i->seglist_lock);
+ for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) {
+ if (get_valid_blocks(sbi, segno, false))
+ continue;
+ if (IS_CURSEG(sbi, segno))
+ continue;
+ __locate_dirty_segment(sbi, segno, PRE);
+ __remove_dirty_segment(sbi, segno, DIRTY);
+ }
+ mutex_unlock(&dirty_i->seglist_lock);
+}
+
+int f2fs_disable_cp_again(struct f2fs_sb_info *sbi)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ block_t ovp = overprovision_segments(sbi) << sbi->log_blocks_per_seg;
+ block_t holes[2] = {0, 0}; /* DATA and NODE */
+ struct seg_entry *se;
+ unsigned int segno;
+
+ mutex_lock(&dirty_i->seglist_lock);
+ for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) {
+ se = get_seg_entry(sbi, segno);
+ if (IS_NODESEG(se->type))
+ holes[NODE] += sbi->blocks_per_seg - se->valid_blocks;
+ else
+ holes[DATA] += sbi->blocks_per_seg - se->valid_blocks;
+ }
+ mutex_unlock(&dirty_i->seglist_lock);
+
+ if (holes[DATA] > ovp || holes[NODE] > ovp)
+ return -EAGAIN;
+ return 0;
+}
+
+/* This is only used by SBI_CP_DISABLED */
+static unsigned int get_free_segment(struct f2fs_sb_info *sbi)
{
- struct list_head *wait_list = &(SM_I(sbi)->wait_list);
- struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS);
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ unsigned int segno = 0;
- INIT_LIST_HEAD(&be->list);
- be->bio = bio;
- init_completion(&be->event);
- list_add_tail(&be->list, wait_list);
+ mutex_lock(&dirty_i->seglist_lock);
+ for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) {
+ if (get_valid_blocks(sbi, segno, false))
+ continue;
+ if (get_ckpt_valid_blocks(sbi, segno))
+ continue;
+ mutex_unlock(&dirty_i->seglist_lock);
+ return segno;
+ }
+ mutex_unlock(&dirty_i->seglist_lock);
+ return NULL_SEGNO;
+}
- return be;
+static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t lstart,
+ block_t start, block_t len)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct list_head *pend_list;
+ struct discard_cmd *dc;
+
+ f2fs_bug_on(sbi, !len);
+
+ pend_list = &dcc->pend_list[plist_idx(len)];
+
+ dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS);
+ INIT_LIST_HEAD(&dc->list);
+ dc->bdev = bdev;
+ dc->lstart = lstart;
+ dc->start = start;
+ dc->len = len;
+ dc->ref = 0;
+ dc->state = D_PREP;
+ dc->queued = 0;
+ dc->error = 0;
+ init_completion(&dc->wait);
+ list_add_tail(&dc->list, pend_list);
+ spin_lock_init(&dc->lock);
+ dc->bio_ref = 0;
+ atomic_inc(&dcc->discard_cmd_cnt);
+ dcc->undiscard_blks += len;
+
+ return dc;
}
-void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi)
+static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t lstart,
+ block_t start, block_t len,
+ struct rb_node *parent, struct rb_node **p)
{
- struct list_head *wait_list = &(SM_I(sbi)->wait_list);
- struct bio_entry *be, *tmp;
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct discard_cmd *dc;
- list_for_each_entry_safe(be, tmp, wait_list, list) {
- struct bio *bio = be->bio;
- int err;
+ dc = __create_discard_cmd(sbi, bdev, lstart, start, len);
- wait_for_completion_io(&be->event);
- err = be->error;
- if (err == -EOPNOTSUPP)
- err = 0;
+ rb_link_node(&dc->rb_node, parent, p);
+ rb_insert_color(&dc->rb_node, &dcc->root);
- if (err)
- f2fs_msg(sbi->sb, KERN_INFO,
- "Issue discard failed, ret: %d", err);
+ return dc;
+}
+
+static void __detach_discard_cmd(struct discard_cmd_control *dcc,
+ struct discard_cmd *dc)
+{
+ if (dc->state == D_DONE)
+ atomic_sub(dc->queued, &dcc->queued_discard);
+
+ list_del(&dc->list);
+ rb_erase(&dc->rb_node, &dcc->root);
+ dcc->undiscard_blks -= dc->len;
+
+ kmem_cache_free(discard_cmd_slab, dc);
- bio_put(bio);
- list_del(&be->list);
- kmem_cache_free(bio_entry_slab, be);
+ atomic_dec(&dcc->discard_cmd_cnt);
+}
+
+static void __remove_discard_cmd(struct f2fs_sb_info *sbi,
+ struct discard_cmd *dc)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ unsigned long flags;
+
+ trace_f2fs_remove_discard(dc->bdev, dc->start, dc->len);
+
+ spin_lock_irqsave(&dc->lock, flags);
+ if (dc->bio_ref) {
+ spin_unlock_irqrestore(&dc->lock, flags);
+ return;
}
+ spin_unlock_irqrestore(&dc->lock, flags);
+
+ f2fs_bug_on(sbi, dc->ref);
+
+ if (dc->error == -EOPNOTSUPP)
+ dc->error = 0;
+
+ if (dc->error)
+ printk_ratelimited(
+ "%sF2FS-fs: Issue discard(%u, %u, %u) failed, ret: %d",
+ KERN_INFO, dc->lstart, dc->start, dc->len, dc->error);
+ __detach_discard_cmd(dcc, dc);
}
-static void f2fs_submit_bio_wait_endio(struct bio *bio)
+static void f2fs_submit_discard_endio(struct bio *bio)
{
- struct bio_entry *be = (struct bio_entry *)bio->bi_private;
+ struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private;
+ unsigned long flags;
+
+ dc->error = bio->bi_error;
+
+ spin_lock_irqsave(&dc->lock, flags);
+ dc->bio_ref--;
+ if (!dc->bio_ref && dc->state == D_SUBMIT) {
+ dc->state = D_DONE;
+ complete_all(&dc->wait);
+ }
+ spin_unlock_irqrestore(&dc->lock, flags);
+ bio_put(bio);
+}
- be->error = bio->bi_error;
- complete(&be->event);
+static void __check_sit_bitmap(struct f2fs_sb_info *sbi,
+ block_t start, block_t end)
+{
+#ifdef CONFIG_F2FS_CHECK_FS
+ struct seg_entry *sentry;
+ unsigned int segno;
+ block_t blk = start;
+ unsigned long offset, size, max_blocks = sbi->blocks_per_seg;
+ unsigned long *map;
+
+ while (blk < end) {
+ segno = GET_SEGNO(sbi, blk);
+ sentry = get_seg_entry(sbi, segno);
+ offset = GET_BLKOFF_FROM_SEG0(sbi, blk);
+
+ if (end < START_BLOCK(sbi, segno + 1))
+ size = GET_BLKOFF_FROM_SEG0(sbi, end);
+ else
+ size = max_blocks;
+ map = (unsigned long *)(sentry->cur_valid_map);
+ offset = __find_rev_next_bit(map, size, offset);
+ f2fs_bug_on(sbi, offset != size);
+ blk = START_BLOCK(sbi, segno + 1);
+ }
+#endif
+}
+
+static void __init_discard_policy(struct f2fs_sb_info *sbi,
+ struct discard_policy *dpolicy,
+ int discard_type, unsigned int granularity)
+{
+ /* common policy */
+ dpolicy->type = discard_type;
+ dpolicy->sync = true;
+ dpolicy->ordered = false;
+ dpolicy->granularity = granularity;
+
+ dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
+ dpolicy->io_aware_gran = MAX_PLIST_NUM;
+
+ if (discard_type == DPOLICY_BG) {
+ dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
+ dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME;
+ dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME;
+ dpolicy->io_aware = true;
+ dpolicy->sync = false;
+ dpolicy->ordered = true;
+ if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) {
+ dpolicy->granularity = 1;
+ dpolicy->max_interval = DEF_MIN_DISCARD_ISSUE_TIME;
+ }
+ } else if (discard_type == DPOLICY_FORCE) {
+ dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
+ dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME;
+ dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME;
+ dpolicy->io_aware = false;
+ } else if (discard_type == DPOLICY_FSTRIM) {
+ dpolicy->io_aware = false;
+ } else if (discard_type == DPOLICY_UMOUNT) {
+ dpolicy->max_requests = UINT_MAX;
+ dpolicy->io_aware = false;
+ }
}
+static void __update_discard_tree_range(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t lstart,
+ block_t start, block_t len);
/* this function is copied from blkdev_issue_discard from block/blk-lib.c */
-int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
+ struct discard_policy *dpolicy,
+ struct discard_cmd *dc,
+ unsigned int *issued)
{
- struct block_device *bdev = sbi->sb->s_bdev;
- struct bio *bio = NULL;
- int err;
+ struct block_device *bdev = dc->bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
+ unsigned int max_discard_blocks =
+ SECTOR_TO_BLOCK(q->limits.max_discard_sectors);
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ?
+ &(dcc->fstrim_list) : &(dcc->wait_list);
+ int flag = dpolicy->sync ? REQ_SYNC : 0;
+ block_t lstart, start, len, total_len;
+ int err = 0;
+
+ if (dc->state != D_PREP)
+ return 0;
+
+ if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
+ return 0;
+
+ trace_f2fs_issue_discard(bdev, dc->start, dc->len);
+
+ lstart = dc->lstart;
+ start = dc->start;
+ len = dc->len;
+ total_len = len;
+
+ dc->len = 0;
+
+ while (total_len && *issued < dpolicy->max_requests && !err) {
+ struct bio *bio = NULL;
+ unsigned long flags;
+ bool last = true;
+
+ if (len > max_discard_blocks) {
+ len = max_discard_blocks;
+ last = false;
+ }
+
+ (*issued)++;
+ if (*issued == dpolicy->max_requests)
+ last = true;
+
+ dc->len += len;
+
+ if (time_to_inject(sbi, FAULT_DISCARD)) {
+ f2fs_show_injection_info(FAULT_DISCARD);
+ err = -EIO;
+ goto submit;
+ }
+ err = __blkdev_issue_discard(bdev,
+ SECTOR_FROM_BLOCK(start),
+ SECTOR_FROM_BLOCK(len),
+ GFP_NOFS, 0, &bio);
+submit:
+ if (err) {
+ spin_lock_irqsave(&dc->lock, flags);
+ if (dc->state == D_PARTIAL)
+ dc->state = D_SUBMIT;
+ spin_unlock_irqrestore(&dc->lock, flags);
+
+ break;
+ }
+
+ f2fs_bug_on(sbi, !bio);
+
+ /*
+ * should keep before submission to avoid D_DONE
+ * right away
+ */
+ spin_lock_irqsave(&dc->lock, flags);
+ if (last)
+ dc->state = D_SUBMIT;
+ else
+ dc->state = D_PARTIAL;
+ dc->bio_ref++;
+ spin_unlock_irqrestore(&dc->lock, flags);
- err = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags,
- &bio);
- if (!err && bio) {
- struct bio_entry *be = __add_bio_entry(sbi, bio);
+ atomic_inc(&dcc->queued_discard);
+ dc->queued++;
+ list_move_tail(&dc->list, wait_list);
- bio->bi_private = be;
- bio->bi_end_io = f2fs_submit_bio_wait_endio;
- bio->bi_opf |= REQ_SYNC;
+ /* sanity check on discard range */
+ __check_sit_bitmap(sbi, lstart, lstart + len);
+
+ bio->bi_private = dc;
+ bio->bi_end_io = f2fs_submit_discard_endio;
+ bio->bi_opf |= flag;
submit_bio(bio);
+
+ atomic_inc(&dcc->issued_discard);
+
+ f2fs_update_iostat(sbi, FS_DISCARD, 1);
+
+ lstart += len;
+ start += len;
+ total_len -= len;
+ len = total_len;
}
+ if (!err && len)
+ __update_discard_tree_range(sbi, bdev, lstart, start, len);
return err;
}
+static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t lstart,
+ block_t start, block_t len,
+ struct rb_node **insert_p,
+ struct rb_node *insert_parent)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct discard_cmd *dc = NULL;
+
+ if (insert_p && insert_parent) {
+ parent = insert_parent;
+ p = insert_p;
+ goto do_insert;
+ }
+
+ p = f2fs_lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart);
+do_insert:
+ dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p);
+ if (!dc)
+ return NULL;
+
+ return dc;
+}
+
+static void __relocate_discard_cmd(struct discard_cmd_control *dcc,
+ struct discard_cmd *dc)
+{
+ list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]);
+}
+
+static void __punch_discard_cmd(struct f2fs_sb_info *sbi,
+ struct discard_cmd *dc, block_t blkaddr)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct discard_info di = dc->di;
+ bool modified = false;
+
+ if (dc->state == D_DONE || dc->len == 1) {
+ __remove_discard_cmd(sbi, dc);
+ return;
+ }
+
+ dcc->undiscard_blks -= di.len;
+
+ if (blkaddr > di.lstart) {
+ dc->len = blkaddr - dc->lstart;
+ dcc->undiscard_blks += dc->len;
+ __relocate_discard_cmd(dcc, dc);
+ modified = true;
+ }
+
+ if (blkaddr < di.lstart + di.len - 1) {
+ if (modified) {
+ __insert_discard_tree(sbi, dc->bdev, blkaddr + 1,
+ di.start + blkaddr + 1 - di.lstart,
+ di.lstart + di.len - 1 - blkaddr,
+ NULL, NULL);
+ } else {
+ dc->lstart++;
+ dc->len--;
+ dc->start++;
+ dcc->undiscard_blks += dc->len;
+ __relocate_discard_cmd(dcc, dc);
+ }
+ }
+}
+
+static void __update_discard_tree_range(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t lstart,
+ block_t start, block_t len)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct discard_cmd *prev_dc = NULL, *next_dc = NULL;
+ struct discard_cmd *dc;
+ struct discard_info di = {0};
+ struct rb_node **insert_p = NULL, *insert_parent = NULL;
+ struct request_queue *q = bdev_get_queue(bdev);
+ unsigned int max_discard_blocks =
+ SECTOR_TO_BLOCK(q->limits.max_discard_sectors);
+ block_t end = lstart + len;
+
+ dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root,
+ NULL, lstart,
+ (struct rb_entry **)&prev_dc,
+ (struct rb_entry **)&next_dc,
+ &insert_p, &insert_parent, true);
+ if (dc)
+ prev_dc = dc;
+
+ if (!prev_dc) {
+ di.lstart = lstart;
+ di.len = next_dc ? next_dc->lstart - lstart : len;
+ di.len = min(di.len, len);
+ di.start = start;
+ }
+
+ while (1) {
+ struct rb_node *node;
+ bool merged = false;
+ struct discard_cmd *tdc = NULL;
+
+ if (prev_dc) {
+ di.lstart = prev_dc->lstart + prev_dc->len;
+ if (di.lstart < lstart)
+ di.lstart = lstart;
+ if (di.lstart >= end)
+ break;
+
+ if (!next_dc || next_dc->lstart > end)
+ di.len = end - di.lstart;
+ else
+ di.len = next_dc->lstart - di.lstart;
+ di.start = start + di.lstart - lstart;
+ }
+
+ if (!di.len)
+ goto next;
+
+ if (prev_dc && prev_dc->state == D_PREP &&
+ prev_dc->bdev == bdev &&
+ __is_discard_back_mergeable(&di, &prev_dc->di,
+ max_discard_blocks)) {
+ prev_dc->di.len += di.len;
+ dcc->undiscard_blks += di.len;
+ __relocate_discard_cmd(dcc, prev_dc);
+ di = prev_dc->di;
+ tdc = prev_dc;
+ merged = true;
+ }
+
+ if (next_dc && next_dc->state == D_PREP &&
+ next_dc->bdev == bdev &&
+ __is_discard_front_mergeable(&di, &next_dc->di,
+ max_discard_blocks)) {
+ next_dc->di.lstart = di.lstart;
+ next_dc->di.len += di.len;
+ next_dc->di.start = di.start;
+ dcc->undiscard_blks += di.len;
+ __relocate_discard_cmd(dcc, next_dc);
+ if (tdc)
+ __remove_discard_cmd(sbi, tdc);
+ merged = true;
+ }
+
+ if (!merged) {
+ __insert_discard_tree(sbi, bdev, di.lstart, di.start,
+ di.len, NULL, NULL);
+ }
+ next:
+ prev_dc = next_dc;
+ if (!prev_dc)
+ break;
+
+ node = rb_next(&prev_dc->rb_node);
+ next_dc = rb_entry_safe(node, struct discard_cmd, rb_node);
+ }
+}
+
+static int __queue_discard_cmd(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t blkstart, block_t blklen)
+{
+ block_t lblkstart = blkstart;
+
+ trace_f2fs_queue_discard(bdev, blkstart, blklen);
+
+ if (sbi->s_ndevs) {
+ int devi = f2fs_target_device_index(sbi, blkstart);
+
+ blkstart -= FDEV(devi).start_blk;
+ }
+ mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock);
+ __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen);
+ mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock);
+ return 0;
+}
+
+static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi,
+ struct discard_policy *dpolicy)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct discard_cmd *prev_dc = NULL, *next_dc = NULL;
+ struct rb_node **insert_p = NULL, *insert_parent = NULL;
+ struct discard_cmd *dc;
+ struct blk_plug plug;
+ unsigned int pos = dcc->next_pos;
+ unsigned int issued = 0;
+ bool io_interrupted = false;
+
+ mutex_lock(&dcc->cmd_lock);
+ dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root,
+ NULL, pos,
+ (struct rb_entry **)&prev_dc,
+ (struct rb_entry **)&next_dc,
+ &insert_p, &insert_parent, true);
+ if (!dc)
+ dc = next_dc;
+
+ blk_start_plug(&plug);
+
+ while (dc) {
+ struct rb_node *node;
+ int err = 0;
+
+ if (dc->state != D_PREP)
+ goto next;
+
+ if (dpolicy->io_aware && !is_idle(sbi, DISCARD_TIME)) {
+ io_interrupted = true;
+ break;
+ }
+
+ dcc->next_pos = dc->lstart + dc->len;
+ err = __submit_discard_cmd(sbi, dpolicy, dc, &issued);
+
+ if (issued >= dpolicy->max_requests)
+ break;
+next:
+ node = rb_next(&dc->rb_node);
+ if (err)
+ __remove_discard_cmd(sbi, dc);
+ dc = rb_entry_safe(node, struct discard_cmd, rb_node);
+ }
+
+ blk_finish_plug(&plug);
+
+ if (!dc)
+ dcc->next_pos = 0;
+
+ mutex_unlock(&dcc->cmd_lock);
+
+ if (!issued && io_interrupted)
+ issued = -1;
+
+ return issued;
+}
+
+static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
+ struct discard_policy *dpolicy)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct list_head *pend_list;
+ struct discard_cmd *dc, *tmp;
+ struct blk_plug plug;
+ int i, issued = 0;
+ bool io_interrupted = false;
+
+ for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
+ if (i + 1 < dpolicy->granularity)
+ break;
+
+ if (i < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered)
+ return __issue_discard_cmd_orderly(sbi, dpolicy);
+
+ pend_list = &dcc->pend_list[i];
+
+ mutex_lock(&dcc->cmd_lock);
+ if (list_empty(pend_list))
+ goto next;
+ if (unlikely(dcc->rbtree_check))
+ f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi,
+ &dcc->root));
+ blk_start_plug(&plug);
+ list_for_each_entry_safe(dc, tmp, pend_list, list) {
+ f2fs_bug_on(sbi, dc->state != D_PREP);
+
+ if (dpolicy->io_aware && i < dpolicy->io_aware_gran &&
+ !is_idle(sbi, DISCARD_TIME)) {
+ io_interrupted = true;
+ break;
+ }
+
+ __submit_discard_cmd(sbi, dpolicy, dc, &issued);
+
+ if (issued >= dpolicy->max_requests)
+ break;
+ }
+ blk_finish_plug(&plug);
+next:
+ mutex_unlock(&dcc->cmd_lock);
+
+ if (issued >= dpolicy->max_requests || io_interrupted)
+ break;
+ }
+
+ if (!issued && io_interrupted)
+ issued = -1;
+
+ return issued;
+}
+
+static bool __drop_discard_cmd(struct f2fs_sb_info *sbi)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct list_head *pend_list;
+ struct discard_cmd *dc, *tmp;
+ int i;
+ bool dropped = false;
+
+ mutex_lock(&dcc->cmd_lock);
+ for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
+ pend_list = &dcc->pend_list[i];
+ list_for_each_entry_safe(dc, tmp, pend_list, list) {
+ f2fs_bug_on(sbi, dc->state != D_PREP);
+ __remove_discard_cmd(sbi, dc);
+ dropped = true;
+ }
+ }
+ mutex_unlock(&dcc->cmd_lock);
+
+ return dropped;
+}
+
+void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi)
+{
+ __drop_discard_cmd(sbi);
+}
+
+static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi,
+ struct discard_cmd *dc)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ unsigned int len = 0;
+
+ wait_for_completion_io(&dc->wait);
+ mutex_lock(&dcc->cmd_lock);
+ f2fs_bug_on(sbi, dc->state != D_DONE);
+ dc->ref--;
+ if (!dc->ref) {
+ if (!dc->error)
+ len = dc->len;
+ __remove_discard_cmd(sbi, dc);
+ }
+ mutex_unlock(&dcc->cmd_lock);
+
+ return len;
+}
+
+static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi,
+ struct discard_policy *dpolicy,
+ block_t start, block_t end)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ?
+ &(dcc->fstrim_list) : &(dcc->wait_list);
+ struct discard_cmd *dc, *tmp;
+ bool need_wait;
+ unsigned int trimmed = 0;
+
+next:
+ need_wait = false;
+
+ mutex_lock(&dcc->cmd_lock);
+ list_for_each_entry_safe(dc, tmp, wait_list, list) {
+ if (dc->lstart + dc->len <= start || end <= dc->lstart)
+ continue;
+ if (dc->len < dpolicy->granularity)
+ continue;
+ if (dc->state == D_DONE && !dc->ref) {
+ wait_for_completion_io(&dc->wait);
+ if (!dc->error)
+ trimmed += dc->len;
+ __remove_discard_cmd(sbi, dc);
+ } else {
+ dc->ref++;
+ need_wait = true;
+ break;
+ }
+ }
+ mutex_unlock(&dcc->cmd_lock);
+
+ if (need_wait) {
+ trimmed += __wait_one_discard_bio(sbi, dc);
+ goto next;
+ }
+
+ return trimmed;
+}
+
+static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi,
+ struct discard_policy *dpolicy)
+{
+ struct discard_policy dp;
+ unsigned int discard_blks;
+
+ if (dpolicy)
+ return __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX);
+
+ /* wait all */
+ __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, 1);
+ discard_blks = __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX);
+ __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, 1);
+ discard_blks += __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX);
+
+ return discard_blks;
+}
+
+/* This should be covered by global mutex, &sit_i->sentry_lock */
+static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct discard_cmd *dc;
+ bool need_wait = false;
+
+ mutex_lock(&dcc->cmd_lock);
+ dc = (struct discard_cmd *)f2fs_lookup_rb_tree(&dcc->root,
+ NULL, blkaddr);
+ if (dc) {
+ if (dc->state == D_PREP) {
+ __punch_discard_cmd(sbi, dc, blkaddr);
+ } else {
+ dc->ref++;
+ need_wait = true;
+ }
+ }
+ mutex_unlock(&dcc->cmd_lock);
+
+ if (need_wait)
+ __wait_one_discard_bio(sbi, dc);
+}
+
+void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+
+ if (dcc && dcc->f2fs_issue_discard) {
+ struct task_struct *discard_thread = dcc->f2fs_issue_discard;
+
+ dcc->f2fs_issue_discard = NULL;
+ kthread_stop(discard_thread);
+ }
+}
+
+/* This comes from f2fs_put_super */
+bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct discard_policy dpolicy;
+ bool dropped;
+
+ __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT,
+ dcc->discard_granularity);
+ __issue_discard_cmd(sbi, &dpolicy);
+ dropped = __drop_discard_cmd(sbi);
+
+ /* just to make sure there is no pending discard commands */
+ __wait_all_discard_cmd(sbi, NULL);
+
+ f2fs_bug_on(sbi, atomic_read(&dcc->discard_cmd_cnt));
+ return dropped;
+}
+
+static int issue_discard_thread(void *data)
+{
+ struct f2fs_sb_info *sbi = data;
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ wait_queue_head_t *q = &dcc->discard_wait_queue;
+ struct discard_policy dpolicy;
+ unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME;
+ int issued;
+
+ set_freezable();
+
+ do {
+ __init_discard_policy(sbi, &dpolicy, DPOLICY_BG,
+ dcc->discard_granularity);
+
+ wait_event_interruptible_timeout(*q,
+ kthread_should_stop() || freezing(current) ||
+ dcc->discard_wake,
+ msecs_to_jiffies(wait_ms));
+
+ if (dcc->discard_wake)
+ dcc->discard_wake = 0;
+
+ /* clean up pending candidates before going to sleep */
+ if (atomic_read(&dcc->queued_discard))
+ __wait_all_discard_cmd(sbi, NULL);
+
+ if (try_to_freeze())
+ continue;
+ if (f2fs_readonly(sbi->sb))
+ continue;
+ if (kthread_should_stop())
+ return 0;
+ if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) {
+ wait_ms = dpolicy.max_interval;
+ continue;
+ }
+
+ if (sbi->gc_mode == GC_URGENT)
+ __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1);
+
+ sb_start_intwrite(sbi->sb);
+
+ issued = __issue_discard_cmd(sbi, &dpolicy);
+ if (issued > 0) {
+ __wait_all_discard_cmd(sbi, &dpolicy);
+ wait_ms = dpolicy.min_interval;
+ } else if (issued == -1){
+ wait_ms = f2fs_time_to_wait(sbi, DISCARD_TIME);
+ if (!wait_ms)
+ wait_ms = dpolicy.mid_interval;
+ } else {
+ wait_ms = dpolicy.max_interval;
+ }
+
+ sb_end_intwrite(sbi->sb);
+
+ } while (!kthread_should_stop());
+ return 0;
+}
+
+#ifdef CONFIG_BLK_DEV_ZONED
+static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t blkstart, block_t blklen)
+{
+ sector_t sector, nr_sects;
+ block_t lblkstart = blkstart;
+ int devi = 0;
+
+ if (sbi->s_ndevs) {
+ devi = f2fs_target_device_index(sbi, blkstart);
+ blkstart -= FDEV(devi).start_blk;
+ }
+
+ /*
+ * We need to know the type of the zone: for conventional zones,
+ * use regular discard if the drive supports it. For sequential
+ * zones, reset the zone write pointer.
+ */
+ switch (get_blkz_type(sbi, bdev, blkstart)) {
+
+ case BLK_ZONE_TYPE_CONVENTIONAL:
+ if (!blk_queue_discard(bdev_get_queue(bdev)))
+ return 0;
+ return __queue_discard_cmd(sbi, bdev, lblkstart, blklen);
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
+ case BLK_ZONE_TYPE_SEQWRITE_PREF:
+ sector = SECTOR_FROM_BLOCK(blkstart);
+ nr_sects = SECTOR_FROM_BLOCK(blklen);
+
+ if (sector & (bdev_zone_sectors(bdev) - 1) ||
+ nr_sects != bdev_zone_sectors(bdev)) {
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "(%d) %s: Unaligned discard attempted (block %x + %x)",
+ devi, sbi->s_ndevs ? FDEV(devi).path: "",
+ blkstart, blklen);
+ return -EIO;
+ }
+ trace_f2fs_issue_reset_zone(bdev, blkstart);
+ return blkdev_reset_zones(bdev, sector,
+ nr_sects, GFP_NOFS);
+ default:
+ /* Unknown zone type: broken device ? */
+ return -EIO;
+ }
+}
+#endif
+
+static int __issue_discard_async(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t blkstart, block_t blklen)
+{
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (f2fs_sb_has_blkzoned(sbi) &&
+ bdev_zoned_model(bdev) != BLK_ZONED_NONE)
+ return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
+#endif
+ return __queue_discard_cmd(sbi, bdev, blkstart, blklen);
+}
+
static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
block_t blkstart, block_t blklen)
{
- sector_t start = SECTOR_FROM_BLOCK(blkstart);
- sector_t len = SECTOR_FROM_BLOCK(blklen);
+ sector_t start = blkstart, len = 0;
+ struct block_device *bdev;
struct seg_entry *se;
unsigned int offset;
block_t i;
+ int err = 0;
+
+ bdev = f2fs_target_device(sbi, blkstart, NULL);
+
+ for (i = blkstart; i < blkstart + blklen; i++, len++) {
+ if (i != start) {
+ struct block_device *bdev2 =
+ f2fs_target_device(sbi, i, NULL);
+
+ if (bdev2 != bdev) {
+ err = __issue_discard_async(sbi, bdev,
+ start, len);
+ if (err)
+ return err;
+ bdev = bdev2;
+ start = i;
+ len = 0;
+ }
+ }
- for (i = blkstart; i < blkstart + blklen; i++) {
se = get_seg_entry(sbi, GET_SEGNO(sbi, i));
offset = GET_BLKOFF_FROM_SEG0(sbi, i);
if (!f2fs_test_and_set_bit(offset, se->discard_map))
sbi->discard_blks--;
}
- trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
- return __f2fs_issue_discard_async(sbi, start, len, GFP_NOFS, 0);
-}
-
-static void __add_discard_entry(struct f2fs_sb_info *sbi,
- struct cp_control *cpc, struct seg_entry *se,
- unsigned int start, unsigned int end)
-{
- struct list_head *head = &SM_I(sbi)->discard_list;
- struct discard_entry *new, *last;
-
- if (!list_empty(head)) {
- last = list_last_entry(head, struct discard_entry, list);
- if (START_BLOCK(sbi, cpc->trim_start) + start ==
- last->blkaddr + last->len) {
- last->len += end - start;
- goto done;
- }
- }
- new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
- INIT_LIST_HEAD(&new->list);
- new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start;
- new->len = end - start;
- list_add_tail(&new->list, head);
-done:
- SM_I(sbi)->nr_discards += end - start;
+ if (len)
+ err = __issue_discard_async(sbi, bdev, start, len);
+ return err;
}
-static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
+ bool check_only)
{
int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
int max_blocks = sbi->blocks_per_seg;
@@ -717,16 +1795,19 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
unsigned long *discard_map = (unsigned long *)se->discard_map;
unsigned long *dmap = SIT_I(sbi)->tmp_map;
unsigned int start = 0, end = -1;
- bool force = (cpc->reason == CP_DISCARD);
+ bool force = (cpc->reason & CP_DISCARD);
+ struct discard_entry *de = NULL;
+ struct list_head *head = &SM_I(sbi)->dcc_info->entry_list;
int i;
- if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi))
- return;
+ if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi))
+ return false;
if (!force) {
- if (!test_opt(sbi, DISCARD) || !se->valid_blocks ||
- SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards)
- return;
+ if (!f2fs_realtime_discard_enable(sbi) || !se->valid_blocks ||
+ SM_I(sbi)->dcc_info->nr_discards >=
+ SM_I(sbi)->dcc_info->max_discards)
+ return false;
}
/* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */
@@ -734,7 +1815,8 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
dmap[i] = force ? ~ckpt_map[i] & ~discard_map[i] :
(cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
- while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
+ while (force || SM_I(sbi)->dcc_info->nr_discards <=
+ SM_I(sbi)->dcc_info->max_discards) {
start = __find_rev_next_bit(dmap, max_blocks, end + 1);
if (start >= max_blocks)
break;
@@ -744,24 +1826,42 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
&& (end - start) < cpc->trim_minlen)
continue;
- __add_discard_entry(sbi, cpc, se, start, end);
+ if (check_only)
+ return true;
+
+ if (!de) {
+ de = f2fs_kmem_cache_alloc(discard_entry_slab,
+ GFP_F2FS_ZERO);
+ de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start);
+ list_add_tail(&de->list, head);
+ }
+
+ for (i = start; i < end; i++)
+ __set_bit_le(i, (void *)de->discard_map);
+
+ SM_I(sbi)->dcc_info->nr_discards += end - start;
}
+ return false;
+}
+
+static void release_discard_addr(struct discard_entry *entry)
+{
+ list_del(&entry->list);
+ kmem_cache_free(discard_entry_slab, entry);
}
-void release_discard_addrs(struct f2fs_sb_info *sbi)
+void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi)
{
- struct list_head *head = &(SM_I(sbi)->discard_list);
+ struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list);
struct discard_entry *entry, *this;
/* drop caches */
- list_for_each_entry_safe(entry, this, head, list) {
- list_del(&entry->list);
- kmem_cache_free(discard_entry_slab, entry);
- }
+ list_for_each_entry_safe(entry, this, head, list)
+ release_discard_addr(entry);
}
/*
- * Should call clear_prefree_segments after checkpoint is done.
+ * Should call f2fs_clear_prefree_segments after checkpoint is done.
*/
static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
{
@@ -774,47 +1874,59 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
mutex_unlock(&dirty_i->seglist_lock);
}
-void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
+ struct cp_control *cpc)
{
- struct list_head *head = &(SM_I(sbi)->discard_list);
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct list_head *head = &dcc->entry_list;
struct discard_entry *entry, *this;
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- struct blk_plug plug;
unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
unsigned int start = 0, end = -1;
unsigned int secno, start_segno;
- bool force = (cpc->reason == CP_DISCARD);
-
- blk_start_plug(&plug);
+ bool force = (cpc->reason & CP_DISCARD);
+ bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi);
mutex_lock(&dirty_i->seglist_lock);
while (1) {
int i;
+
+ if (need_align && end != -1)
+ end--;
start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
if (start >= MAIN_SEGS(sbi))
break;
end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi),
start + 1);
- for (i = start; i < end; i++)
- clear_bit(i, prefree_map);
+ if (need_align) {
+ start = rounddown(start, sbi->segs_per_sec);
+ end = roundup(end, sbi->segs_per_sec);
+ }
- dirty_i->nr_dirty[PRE] -= end - start;
+ for (i = start; i < end; i++) {
+ if (test_and_clear_bit(i, prefree_map))
+ dirty_i->nr_dirty[PRE]--;
+ }
- if (force || !test_opt(sbi, DISCARD))
+ if (!f2fs_realtime_discard_enable(sbi))
continue;
- if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) {
+ if (force && start >= cpc->trim_start &&
+ (end - 1) <= cpc->trim_end)
+ continue;
+
+ if (!test_opt(sbi, LFS) || !__is_large_section(sbi)) {
f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
(end - start) << sbi->log_blocks_per_seg);
continue;
}
next:
- secno = GET_SECNO(sbi, start);
- start_segno = secno * sbi->segs_per_sec;
+ secno = GET_SEC_FROM_SEG(sbi, start);
+ start_segno = GET_SEG_FROM_SEC(sbi, secno);
if (!IS_CURSEC(sbi, secno) &&
- !get_valid_blocks(sbi, start, sbi->segs_per_sec))
+ !get_valid_blocks(sbi, start, true))
f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno),
sbi->segs_per_sec << sbi->log_blocks_per_seg);
@@ -828,17 +1940,98 @@ next:
/* send small discards */
list_for_each_entry_safe(entry, this, head, list) {
- if (force && entry->len < cpc->trim_minlen)
- goto skip;
- f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
- cpc->trimmed += entry->len;
+ unsigned int cur_pos = 0, next_pos, len, total_len = 0;
+ bool is_valid = test_bit_le(0, entry->discard_map);
+
+find_next:
+ if (is_valid) {
+ next_pos = find_next_zero_bit_le(entry->discard_map,
+ sbi->blocks_per_seg, cur_pos);
+ len = next_pos - cur_pos;
+
+ if (f2fs_sb_has_blkzoned(sbi) ||
+ (force && len < cpc->trim_minlen))
+ goto skip;
+
+ f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos,
+ len);
+ total_len += len;
+ } else {
+ next_pos = find_next_bit_le(entry->discard_map,
+ sbi->blocks_per_seg, cur_pos);
+ }
skip:
- list_del(&entry->list);
- SM_I(sbi)->nr_discards -= entry->len;
- kmem_cache_free(discard_entry_slab, entry);
+ cur_pos = next_pos;
+ is_valid = !is_valid;
+
+ if (cur_pos < sbi->blocks_per_seg)
+ goto find_next;
+
+ release_discard_addr(entry);
+ dcc->nr_discards -= total_len;
}
- blk_finish_plug(&plug);
+ wake_up_discard_thread(sbi, false);
+}
+
+static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
+{
+ dev_t dev = sbi->sb->s_bdev->bd_dev;
+ struct discard_cmd_control *dcc;
+ int err = 0, i;
+
+ if (SM_I(sbi)->dcc_info) {
+ dcc = SM_I(sbi)->dcc_info;
+ goto init_thread;
+ }
+
+ dcc = f2fs_kzalloc(sbi, sizeof(struct discard_cmd_control), GFP_KERNEL);
+ if (!dcc)
+ return -ENOMEM;
+
+ dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY;
+ INIT_LIST_HEAD(&dcc->entry_list);
+ for (i = 0; i < MAX_PLIST_NUM; i++)
+ INIT_LIST_HEAD(&dcc->pend_list[i]);
+ INIT_LIST_HEAD(&dcc->wait_list);
+ INIT_LIST_HEAD(&dcc->fstrim_list);
+ mutex_init(&dcc->cmd_lock);
+ atomic_set(&dcc->issued_discard, 0);
+ atomic_set(&dcc->queued_discard, 0);
+ atomic_set(&dcc->discard_cmd_cnt, 0);
+ dcc->nr_discards = 0;
+ dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg;
+ dcc->undiscard_blks = 0;
+ dcc->next_pos = 0;
+ dcc->root = RB_ROOT;
+ dcc->rbtree_check = false;
+
+ init_waitqueue_head(&dcc->discard_wait_queue);
+ SM_I(sbi)->dcc_info = dcc;
+init_thread:
+ dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi,
+ "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev));
+ if (IS_ERR(dcc->f2fs_issue_discard)) {
+ err = PTR_ERR(dcc->f2fs_issue_discard);
+ kvfree(dcc);
+ SM_I(sbi)->dcc_info = NULL;
+ return err;
+ }
+
+ return err;
+}
+
+static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+
+ if (!dcc)
+ return;
+
+ f2fs_stop_discard_thread(sbi);
+
+ kvfree(dcc);
+ SM_I(sbi)->dcc_info = NULL;
}
static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
@@ -867,6 +2060,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
struct seg_entry *se;
unsigned int segno, offset;
long int new_vblocks;
+ bool exist;
+#ifdef CONFIG_F2FS_CHECK_FS
+ bool mir_exist;
+#endif
segno = GET_SEGNO(sbi, blkaddr);
@@ -878,21 +2075,70 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
(new_vblocks > sbi->blocks_per_seg)));
se->valid_blocks = new_vblocks;
- se->mtime = get_mtime(sbi);
- SIT_I(sbi)->max_mtime = se->mtime;
+ se->mtime = get_mtime(sbi, false);
+ if (se->mtime > SIT_I(sbi)->max_mtime)
+ SIT_I(sbi)->max_mtime = se->mtime;
/* Update valid block bitmap */
if (del > 0) {
- if (f2fs_test_and_set_bit(offset, se->cur_valid_map))
+ exist = f2fs_test_and_set_bit(offset, se->cur_valid_map);
+#ifdef CONFIG_F2FS_CHECK_FS
+ mir_exist = f2fs_test_and_set_bit(offset,
+ se->cur_valid_map_mir);
+ if (unlikely(exist != mir_exist)) {
+ f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error "
+ "when setting bitmap, blk:%u, old bit:%d",
+ blkaddr, exist);
+ f2fs_bug_on(sbi, 1);
+ }
+#endif
+ if (unlikely(exist)) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Bitmap was wrongly set, blk:%u", blkaddr);
f2fs_bug_on(sbi, 1);
- if (f2fs_discard_en(sbi) &&
- !f2fs_test_and_set_bit(offset, se->discard_map))
+ se->valid_blocks--;
+ del = 0;
+ }
+
+ if (!f2fs_test_and_set_bit(offset, se->discard_map))
sbi->discard_blks--;
+
+ /* don't overwrite by SSR to keep node chain */
+ if (IS_NODESEG(se->type) &&
+ !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) {
+ if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map))
+ se->ckpt_valid_blocks++;
+ }
} else {
- if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map))
+ exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map);
+#ifdef CONFIG_F2FS_CHECK_FS
+ mir_exist = f2fs_test_and_clear_bit(offset,
+ se->cur_valid_map_mir);
+ if (unlikely(exist != mir_exist)) {
+ f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error "
+ "when clearing bitmap, blk:%u, old bit:%d",
+ blkaddr, exist);
f2fs_bug_on(sbi, 1);
- if (f2fs_discard_en(sbi) &&
- f2fs_test_and_clear_bit(offset, se->discard_map))
+ }
+#endif
+ if (unlikely(!exist)) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Bitmap was wrongly cleared, blk:%u", blkaddr);
+ f2fs_bug_on(sbi, 1);
+ se->valid_blocks++;
+ del = 0;
+ } else if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+ /*
+ * If checkpoints are off, we must not reuse data that
+ * was used in the previous checkpoint. If it was used
+ * before, we must track that to know how much space we
+ * really have.
+ */
+ if (f2fs_test_bit(offset, se->ckpt_valid_map))
+ sbi->unusable_block_count++;
+ }
+
+ if (f2fs_test_and_clear_bit(offset, se->discard_map))
sbi->discard_blks++;
}
if (!f2fs_test_bit(offset, se->ckpt_valid_map))
@@ -903,21 +2149,11 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
/* update total number of valid blocks to be written in ckpt area */
SIT_I(sbi)->written_valid_blocks += del;
- if (sbi->segs_per_sec > 1)
+ if (__is_large_section(sbi))
get_sec_entry(sbi, segno)->valid_blocks += del;
}
-void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new)
-{
- update_sit_entry(sbi, new, 1);
- if (GET_SEGNO(sbi, old) != NULL_SEGNO)
- update_sit_entry(sbi, old, -1);
-
- locate_dirty_segment(sbi, GET_SEGNO(sbi, old));
- locate_dirty_segment(sbi, GET_SEGNO(sbi, new));
-}
-
-void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
+void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
{
unsigned int segno = GET_SEGNO(sbi, addr);
struct sit_info *sit_i = SIT_I(sbi);
@@ -926,18 +2162,20 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
if (addr == NEW_ADDR)
return;
+ invalidate_mapping_pages(META_MAPPING(sbi), addr, addr);
+
/* add it into sit main buffer */
- mutex_lock(&sit_i->sentry_lock);
+ down_write(&sit_i->sentry_lock);
update_sit_entry(sbi, addr, -1);
/* add it into dirty seglist */
locate_dirty_segment(sbi, segno);
- mutex_unlock(&sit_i->sentry_lock);
+ up_write(&sit_i->sentry_lock);
}
-bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr)
+bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr)
{
struct sit_info *sit_i = SIT_I(sbi);
unsigned int segno, offset;
@@ -947,7 +2185,7 @@ bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr)
if (!is_valid_data_blkaddr(sbi, blkaddr))
return true;
- mutex_lock(&sit_i->sentry_lock);
+ down_read(&sit_i->sentry_lock);
segno = GET_SEGNO(sbi, blkaddr);
se = get_seg_entry(sbi, segno);
@@ -956,7 +2194,7 @@ bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr)
if (f2fs_test_bit(offset, se->ckpt_valid_map))
is_cp = true;
- mutex_unlock(&sit_i->sentry_lock);
+ up_read(&sit_i->sentry_lock);
return is_cp;
}
@@ -976,7 +2214,7 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
/*
* Calculate the number of current summary pages for writing
*/
-int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra)
+int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra)
{
int valid_sum_count = 0;
int i, sum_in_page;
@@ -1006,20 +2244,17 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra)
/*
* Caller should put this summary page
*/
-struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno)
+struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno)
{
- return get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno));
+ return f2fs_get_meta_page_nofail(sbi, GET_SUM_BLOCK(sbi, segno));
}
-void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr)
+void f2fs_update_meta_page(struct f2fs_sb_info *sbi,
+ void *src, block_t blk_addr)
{
- struct page *page = grab_meta_page(sbi, blk_addr);
- void *dst = page_address(page);
+ struct page *page = f2fs_grab_meta_page(sbi, blk_addr);
- if (src)
- memcpy(dst, src, PAGE_SIZE);
- else
- memset(dst, 0, PAGE_SIZE);
+ memcpy(page_address(page), src, PAGE_SIZE);
set_page_dirty(page);
f2fs_put_page(page, 1);
}
@@ -1027,18 +2262,19 @@ void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr)
static void write_sum_page(struct f2fs_sb_info *sbi,
struct f2fs_summary_block *sum_blk, block_t blk_addr)
{
- update_meta_page(sbi, (void *)sum_blk, blk_addr);
+ f2fs_update_meta_page(sbi, (void *)sum_blk, blk_addr);
}
static void write_current_sum_page(struct f2fs_sb_info *sbi,
int type, block_t blk_addr)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
- struct page *page = grab_meta_page(sbi, blk_addr);
+ struct page *page = f2fs_grab_meta_page(sbi, blk_addr);
struct f2fs_summary_block *src = curseg->sum_blk;
struct f2fs_summary_block *dst;
dst = (struct f2fs_summary_block *)page_address(page);
+ memset(dst, 0, PAGE_SIZE);
mutex_lock(&curseg->curseg_mutex);
@@ -1076,8 +2312,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
struct free_segmap_info *free_i = FREE_I(sbi);
unsigned int segno, secno, zoneno;
unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
- unsigned int hint = *newseg / sbi->segs_per_sec;
- unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
+ unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg);
+ unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg);
unsigned int left_start = hint;
bool init = true;
int go_left = 0;
@@ -1087,8 +2323,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
segno = find_next_zero_bit(free_i->free_segmap,
- (hint + 1) * sbi->segs_per_sec, *newseg + 1);
- if (segno < (hint + 1) * sbi->segs_per_sec)
+ GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1);
+ if (segno < GET_SEG_FROM_SEC(sbi, hint + 1))
goto got_it;
}
find_other_zone:
@@ -1118,9 +2354,8 @@ find_other_zone:
}
secno = left_start;
skip_left:
- hint = secno;
- segno = secno * sbi->segs_per_sec;
- zoneno = secno / sbi->secs_per_zone;
+ segno = GET_SEG_FROM_SEC(sbi, secno);
+ zoneno = GET_ZONE_FROM_SEC(sbi, secno);
/* give up on finding another zone */
if (!init)
@@ -1164,7 +2399,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
struct summary_footer *sum_footer;
curseg->segno = curseg->next_segno;
- curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno);
+ curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno);
curseg->next_blkoff = 0;
curseg->next_segno = NULL_SEGNO;
@@ -1177,6 +2412,29 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
__set_sit_entry_type(sbi, type, curseg->segno, modified);
}
+static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
+{
+ /* if segs_per_sec is large than 1, we need to keep original policy. */
+ if (__is_large_section(sbi))
+ return CURSEG_I(sbi, type)->segno;
+
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+ return 0;
+
+ if (test_opt(sbi, NOHEAP) &&
+ (type == CURSEG_HOT_DATA || IS_NODESEG(type)))
+ return 0;
+
+ if (SIT_I(sbi)->last_victim[ALLOC_NEXT])
+ return SIT_I(sbi)->last_victim[ALLOC_NEXT];
+
+ /* find segments from 0 to reuse freed segments */
+ if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE)
+ return 0;
+
+ return CURSEG_I(sbi, type)->segno;
+}
+
/*
* Allocate a current working segment.
* This function always allocates a free segment in LFS manner.
@@ -1195,6 +2453,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
if (test_opt(sbi, NOHEAP))
dir = ALLOC_RIGHT;
+ segno = __get_next_segno(sbi, type);
get_new_segment(sbi, &segno, new_sec, dir);
curseg->next_segno = segno;
reset_curseg(sbi, type, 1);
@@ -1237,7 +2496,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
* This function always allocates a used segment(from dirty seglist) by SSR
* manner, so it should recover the existing segment information of valid blocks
*/
-static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
+static void change_curseg(struct f2fs_sb_info *sbi, int type)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -1258,28 +2517,63 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
curseg->alloc_type = SSR;
__next_free_blkoff(sbi, curseg, 0);
- if (reuse) {
- sum_page = get_sum_page(sbi, new_segno);
- sum_node = (struct f2fs_summary_block *)page_address(sum_page);
- memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
- f2fs_put_page(sum_page, 1);
- }
+ sum_page = f2fs_get_sum_page(sbi, new_segno);
+ f2fs_bug_on(sbi, IS_ERR(sum_page));
+ sum_node = (struct f2fs_summary_block *)page_address(sum_page);
+ memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
+ f2fs_put_page(sum_page, 1);
}
static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops;
+ unsigned segno = NULL_SEGNO;
+ int i, cnt;
+ bool reversed = false;
- if (IS_NODESEG(type))
- return v_ops->get_victim(sbi,
- &(curseg)->next_segno, BG_GC, type, SSR);
+ /* f2fs_need_SSR() already forces to do this */
+ if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) {
+ curseg->next_segno = segno;
+ return 1;
+ }
+
+ /* For node segments, let's do SSR more intensively */
+ if (IS_NODESEG(type)) {
+ if (type >= CURSEG_WARM_NODE) {
+ reversed = true;
+ i = CURSEG_COLD_NODE;
+ } else {
+ i = CURSEG_HOT_NODE;
+ }
+ cnt = NR_CURSEG_NODE_TYPE;
+ } else {
+ if (type >= CURSEG_WARM_DATA) {
+ reversed = true;
+ i = CURSEG_COLD_DATA;
+ } else {
+ i = CURSEG_HOT_DATA;
+ }
+ cnt = NR_CURSEG_DATA_TYPE;
+ }
+
+ for (; cnt-- > 0; reversed ? i-- : i++) {
+ if (i == type)
+ continue;
+ if (v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) {
+ curseg->next_segno = segno;
+ return 1;
+ }
+ }
- /* For data segments, let's do SSR more intensively */
- for (; type >= CURSEG_HOT_DATA; type--)
- if (v_ops->get_victim(sbi, &(curseg)->next_segno,
- BG_GC, type, SSR))
+ /* find valid_blocks=0 in dirty list */
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+ segno = get_free_segment(sbi);
+ if (segno != NULL_SEGNO) {
+ curseg->next_segno = segno;
return 1;
+ }
+ }
return 0;
}
@@ -1294,95 +2588,201 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
if (force)
new_curseg(sbi, type, true);
- else if (type == CURSEG_WARM_NODE)
+ else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) &&
+ type == CURSEG_WARM_NODE)
new_curseg(sbi, type, false);
- else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
+ else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type) &&
+ likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
new_curseg(sbi, type, false);
- else if (need_SSR(sbi) && get_ssr_segment(sbi, type))
- change_curseg(sbi, type, true);
+ else if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type))
+ change_curseg(sbi, type);
else
new_curseg(sbi, type, false);
stat_inc_seg_type(sbi, curseg);
}
-static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type)
+void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
{
- struct curseg_info *curseg = CURSEG_I(sbi, type);
+ struct curseg_info *curseg;
unsigned int old_segno;
-
- old_segno = curseg->segno;
- SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true);
- locate_dirty_segment(sbi, old_segno);
-}
-
-void allocate_new_segments(struct f2fs_sb_info *sbi)
-{
int i;
- if (test_opt(sbi, LFS))
- return;
+ down_write(&SIT_I(sbi)->sentry_lock);
+
+ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+ curseg = CURSEG_I(sbi, i);
+ old_segno = curseg->segno;
+ SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
+ locate_dirty_segment(sbi, old_segno);
+ }
- for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
- __allocate_new_segments(sbi, i);
+ up_write(&SIT_I(sbi)->sentry_lock);
}
static const struct segment_allocation default_salloc_ops = {
.allocate_segment = allocate_segment_by_default,
};
+bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
+ struct cp_control *cpc)
+{
+ __u64 trim_start = cpc->trim_start;
+ bool has_candidate = false;
+
+ down_write(&SIT_I(sbi)->sentry_lock);
+ for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) {
+ if (add_discard_addrs(sbi, cpc, true)) {
+ has_candidate = true;
+ break;
+ }
+ }
+ up_write(&SIT_I(sbi)->sentry_lock);
+
+ cpc->trim_start = trim_start;
+ return has_candidate;
+}
+
+static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi,
+ struct discard_policy *dpolicy,
+ unsigned int start, unsigned int end)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct discard_cmd *prev_dc = NULL, *next_dc = NULL;
+ struct rb_node **insert_p = NULL, *insert_parent = NULL;
+ struct discard_cmd *dc;
+ struct blk_plug plug;
+ int issued;
+ unsigned int trimmed = 0;
+
+next:
+ issued = 0;
+
+ mutex_lock(&dcc->cmd_lock);
+ if (unlikely(dcc->rbtree_check))
+ f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi,
+ &dcc->root));
+
+ dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root,
+ NULL, start,
+ (struct rb_entry **)&prev_dc,
+ (struct rb_entry **)&next_dc,
+ &insert_p, &insert_parent, true);
+ if (!dc)
+ dc = next_dc;
+
+ blk_start_plug(&plug);
+
+ while (dc && dc->lstart <= end) {
+ struct rb_node *node;
+ int err = 0;
+
+ if (dc->len < dpolicy->granularity)
+ goto skip;
+
+ if (dc->state != D_PREP) {
+ list_move_tail(&dc->list, &dcc->fstrim_list);
+ goto skip;
+ }
+
+ err = __submit_discard_cmd(sbi, dpolicy, dc, &issued);
+
+ if (issued >= dpolicy->max_requests) {
+ start = dc->lstart + dc->len;
+
+ if (err)
+ __remove_discard_cmd(sbi, dc);
+
+ blk_finish_plug(&plug);
+ mutex_unlock(&dcc->cmd_lock);
+ trimmed += __wait_all_discard_cmd(sbi, NULL);
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto next;
+ }
+skip:
+ node = rb_next(&dc->rb_node);
+ if (err)
+ __remove_discard_cmd(sbi, dc);
+ dc = rb_entry_safe(node, struct discard_cmd, rb_node);
+
+ if (fatal_signal_pending(current))
+ break;
+ }
+
+ blk_finish_plug(&plug);
+ mutex_unlock(&dcc->cmd_lock);
+
+ return trimmed;
+}
+
int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
{
__u64 start = F2FS_BYTES_TO_BLK(range->start);
__u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1;
unsigned int start_segno, end_segno;
+ block_t start_block, end_block;
struct cp_control cpc;
+ struct discard_policy dpolicy;
+ unsigned long long trimmed = 0;
int err = 0;
+ bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi);
if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize)
return -EINVAL;
- cpc.trimmed = 0;
- if (end <= MAIN_BLKADDR(sbi))
+ if (end < MAIN_BLKADDR(sbi))
goto out;
if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) {
f2fs_msg(sbi->sb, KERN_WARNING,
"Found FS corruption, run fsck to fix.");
- goto out;
+ return -EIO;
}
/* start/end segment number in main_area */
start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start);
end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
GET_SEGNO(sbi, end);
+ if (need_align) {
+ start_segno = rounddown(start_segno, sbi->segs_per_sec);
+ end_segno = roundup(end_segno + 1, sbi->segs_per_sec) - 1;
+ }
+
cpc.reason = CP_DISCARD;
cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen));
+ cpc.trim_start = start_segno;
+ cpc.trim_end = end_segno;
- /* do checkpoint to issue discard commands safely */
- for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) {
- cpc.trim_start = start_segno;
+ if (sbi->discard_blks == 0)
+ goto out;
- if (sbi->discard_blks == 0)
- break;
- else if (sbi->discard_blks < BATCHED_TRIM_BLOCKS(sbi))
- cpc.trim_end = end_segno;
- else
- cpc.trim_end = min_t(unsigned int,
- rounddown(start_segno +
- BATCHED_TRIM_SEGMENTS(sbi),
- sbi->segs_per_sec) - 1, end_segno);
+ mutex_lock(&sbi->gc_mutex);
+ err = f2fs_write_checkpoint(sbi, &cpc);
+ mutex_unlock(&sbi->gc_mutex);
+ if (err)
+ goto out;
- mutex_lock(&sbi->gc_mutex);
- err = write_checkpoint(sbi, &cpc);
- mutex_unlock(&sbi->gc_mutex);
- if (err)
- break;
+ /*
+ * We filed discard candidates, but actually we don't need to wait for
+ * all of them, since they'll be issued in idle time along with runtime
+ * discard option. User configuration looks like using runtime discard
+ * or periodic fstrim instead of it.
+ */
+ if (f2fs_realtime_discard_enable(sbi))
+ goto out;
- schedule();
- }
+ start_block = START_BLOCK(sbi, start_segno);
+ end_block = START_BLOCK(sbi, end_segno + 1);
+
+ __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen);
+ trimmed = __issue_discard_cmd_range(sbi, &dpolicy,
+ start_block, end_block);
+
+ trimmed += __wait_discard_cmd_range(sbi, &dpolicy,
+ start_block, end_block);
out:
- range->len = F2FS_BLK_TO_BYTES(cpc.trimmed);
+ if (!err)
+ range->len = F2FS_BLK_TO_BYTES(trimmed);
return err;
}
@@ -1394,87 +2794,204 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
return false;
}
-static int __get_segment_type_2(struct page *page, enum page_type p_type)
+int f2fs_rw_hint_to_seg_type(enum rw_hint hint)
{
- if (p_type == DATA)
+ switch (hint) {
+ case WRITE_LIFE_SHORT:
+ return CURSEG_HOT_DATA;
+ case WRITE_LIFE_EXTREME:
+ return CURSEG_COLD_DATA;
+ default:
+ return CURSEG_WARM_DATA;
+ }
+}
+
+/* This returns write hints for each segment type. This hints will be
+ * passed down to block layer. There are mapping tables which depend on
+ * the mount option 'whint_mode'.
+ *
+ * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET.
+ *
+ * 2) whint_mode=user-based. F2FS tries to pass down hints given by users.
+ *
+ * User F2FS Block
+ * ---- ---- -----
+ * META WRITE_LIFE_NOT_SET
+ * HOT_NODE "
+ * WARM_NODE "
+ * COLD_NODE "
+ * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME
+ * extension list " "
+ *
+ * -- buffered io
+ * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
+ * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
+ * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
+ * WRITE_LIFE_NONE " "
+ * WRITE_LIFE_MEDIUM " "
+ * WRITE_LIFE_LONG " "
+ *
+ * -- direct io
+ * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
+ * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
+ * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
+ * WRITE_LIFE_NONE " WRITE_LIFE_NONE
+ * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
+ * WRITE_LIFE_LONG " WRITE_LIFE_LONG
+ *
+ * 3) whint_mode=fs-based. F2FS passes down hints with its policy.
+ *
+ * User F2FS Block
+ * ---- ---- -----
+ * META WRITE_LIFE_MEDIUM;
+ * HOT_NODE WRITE_LIFE_NOT_SET
+ * WARM_NODE "
+ * COLD_NODE WRITE_LIFE_NONE
+ * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME
+ * extension list " "
+ *
+ * -- buffered io
+ * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
+ * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
+ * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG
+ * WRITE_LIFE_NONE " "
+ * WRITE_LIFE_MEDIUM " "
+ * WRITE_LIFE_LONG " "
+ *
+ * -- direct io
+ * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
+ * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
+ * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
+ * WRITE_LIFE_NONE " WRITE_LIFE_NONE
+ * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
+ * WRITE_LIFE_LONG " WRITE_LIFE_LONG
+ */
+
+enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
+ enum page_type type, enum temp_type temp)
+{
+ if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) {
+ if (type == DATA) {
+ if (temp == WARM)
+ return WRITE_LIFE_NOT_SET;
+ else if (temp == HOT)
+ return WRITE_LIFE_SHORT;
+ else if (temp == COLD)
+ return WRITE_LIFE_EXTREME;
+ } else {
+ return WRITE_LIFE_NOT_SET;
+ }
+ } else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) {
+ if (type == DATA) {
+ if (temp == WARM)
+ return WRITE_LIFE_LONG;
+ else if (temp == HOT)
+ return WRITE_LIFE_SHORT;
+ else if (temp == COLD)
+ return WRITE_LIFE_EXTREME;
+ } else if (type == NODE) {
+ if (temp == WARM || temp == HOT)
+ return WRITE_LIFE_NOT_SET;
+ else if (temp == COLD)
+ return WRITE_LIFE_NONE;
+ } else if (type == META) {
+ return WRITE_LIFE_MEDIUM;
+ }
+ }
+ return WRITE_LIFE_NOT_SET;
+}
+
+static int __get_segment_type_2(struct f2fs_io_info *fio)
+{
+ if (fio->type == DATA)
return CURSEG_HOT_DATA;
else
return CURSEG_HOT_NODE;
}
-static int __get_segment_type_4(struct page *page, enum page_type p_type)
+static int __get_segment_type_4(struct f2fs_io_info *fio)
{
- if (p_type == DATA) {
- struct inode *inode = page->mapping->host;
+ if (fio->type == DATA) {
+ struct inode *inode = fio->page->mapping->host;
if (S_ISDIR(inode->i_mode))
return CURSEG_HOT_DATA;
else
return CURSEG_COLD_DATA;
} else {
- if (IS_DNODE(page) && is_cold_node(page))
+ if (IS_DNODE(fio->page) && is_cold_node(fio->page))
return CURSEG_WARM_NODE;
else
return CURSEG_COLD_NODE;
}
}
-static int __get_segment_type_6(struct page *page, enum page_type p_type)
+static int __get_segment_type_6(struct f2fs_io_info *fio)
{
- if (p_type == DATA) {
- struct inode *inode = page->mapping->host;
+ if (fio->type == DATA) {
+ struct inode *inode = fio->page->mapping->host;
- if (S_ISDIR(inode->i_mode))
- return CURSEG_HOT_DATA;
- else if (is_cold_data(page) || file_is_cold(inode))
+ if (is_cold_data(fio->page) || file_is_cold(inode))
return CURSEG_COLD_DATA;
- else
- return CURSEG_WARM_DATA;
+ if (file_is_hot(inode) ||
+ is_inode_flag_set(inode, FI_HOT_DATA) ||
+ f2fs_is_atomic_file(inode) ||
+ f2fs_is_volatile_file(inode))
+ return CURSEG_HOT_DATA;
+ /* f2fs_rw_hint_to_seg_type(inode->i_write_hint); */
+ return CURSEG_WARM_DATA;
} else {
- if (IS_DNODE(page))
- return is_cold_node(page) ? CURSEG_WARM_NODE :
+ if (IS_DNODE(fio->page))
+ return is_cold_node(fio->page) ? CURSEG_WARM_NODE :
CURSEG_HOT_NODE;
- else
- return CURSEG_COLD_NODE;
+ return CURSEG_COLD_NODE;
}
}
-static int __get_segment_type(struct page *page, enum page_type p_type)
+static int __get_segment_type(struct f2fs_io_info *fio)
{
- switch (F2FS_P_SB(page)->active_logs) {
+ int type = 0;
+
+ switch (F2FS_OPTION(fio->sbi).active_logs) {
case 2:
- return __get_segment_type_2(page, p_type);
+ type = __get_segment_type_2(fio);
+ break;
case 4:
- return __get_segment_type_4(page, p_type);
+ type = __get_segment_type_4(fio);
+ break;
+ case 6:
+ type = __get_segment_type_6(fio);
+ break;
+ default:
+ f2fs_bug_on(fio->sbi, true);
}
- /* NR_CURSEG_TYPE(6) logs by default */
- f2fs_bug_on(F2FS_P_SB(page),
- F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE);
- return __get_segment_type_6(page, p_type);
+
+ if (IS_HOT(type))
+ fio->temp = HOT;
+ else if (IS_WARM(type))
+ fio->temp = WARM;
+ else
+ fio->temp = COLD;
+ return type;
}
-void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
+void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
block_t old_blkaddr, block_t *new_blkaddr,
- struct f2fs_summary *sum, int type)
+ struct f2fs_summary *sum, int type,
+ struct f2fs_io_info *fio, bool add_list)
{
struct sit_info *sit_i = SIT_I(sbi);
- struct curseg_info *curseg;
- bool direct_io = (type == CURSEG_DIRECT_IO);
-
- type = direct_io ? CURSEG_WARM_DATA : type;
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
- curseg = CURSEG_I(sbi, type);
+ down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
- mutex_lock(&sit_i->sentry_lock);
-
- /* direct_io'ed data is aligned to the segment for better performance */
- if (direct_io && curseg->next_blkoff &&
- !has_not_enough_free_secs(sbi, 0, 0))
- __allocate_new_segments(sbi, type);
+ down_write(&sit_i->sentry_lock);
*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
+ f2fs_wait_discard_bio(sbi, *new_blkaddr);
+
/*
* __add_sum_entry should be resided under the curseg_mutex
* because, this function updates a summary entry in the
@@ -1486,88 +3003,185 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
stat_inc_block_count(sbi, curseg);
- if (!__has_curseg_space(sbi, type))
- sit_i->s_ops->allocate_segment(sbi, type, false);
/*
* SIT information should be updated before segment allocation,
* since SSR needs latest valid block information.
*/
- refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
+ update_sit_entry(sbi, *new_blkaddr, 1);
+ if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
+ update_sit_entry(sbi, old_blkaddr, -1);
+
+ if (!__has_curseg_space(sbi, type))
+ sit_i->s_ops->allocate_segment(sbi, type, false);
+
+ /*
+ * segment dirty status should be updated after segment allocation,
+ * so we just need to update status only one time after previous
+ * segment being closed.
+ */
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr));
- mutex_unlock(&sit_i->sentry_lock);
+ up_write(&sit_i->sentry_lock);
- if (page && IS_NODESEG(type))
+ if (page && IS_NODESEG(type)) {
fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
+ f2fs_inode_chksum_set(sbi, page);
+ }
+
+ if (add_list) {
+ struct f2fs_bio_info *io;
+
+ INIT_LIST_HEAD(&fio->list);
+ fio->in_list = true;
+ fio->retry = false;
+ io = sbi->write_io[fio->type] + fio->temp;
+ spin_lock(&io->io_lock);
+ list_add_tail(&fio->list, &io->io_list);
+ spin_unlock(&io->io_lock);
+ }
+
mutex_unlock(&curseg->curseg_mutex);
+
+ up_read(&SM_I(sbi)->curseg_lock);
}
-static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
+static void update_device_state(struct f2fs_io_info *fio)
{
- int type = __get_segment_type(fio->page, fio->type);
+ struct f2fs_sb_info *sbi = fio->sbi;
+ unsigned int devidx;
+
+ if (!sbi->s_ndevs)
+ return;
+
+ devidx = f2fs_target_device_index(sbi, fio->new_blkaddr);
+
+ /* update device state for fsync */
+ f2fs_set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO);
- if (fio->type == NODE || fio->type == DATA)
- mutex_lock(&fio->sbi->wio_mutex[fio->type]);
+ /* update device state for checkpoint */
+ if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) {
+ spin_lock(&sbi->dev_lock);
+ f2fs_set_bit(devidx, (char *)&sbi->dirty_device);
+ spin_unlock(&sbi->dev_lock);
+ }
+}
- allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
- &fio->new_blkaddr, sum, type);
+static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
+{
+ int type = __get_segment_type(fio);
+ bool keep_order = (test_opt(fio->sbi, LFS) && type == CURSEG_COLD_DATA);
+
+ if (keep_order)
+ down_read(&fio->sbi->io_order_lock);
+reallocate:
+ f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
+ &fio->new_blkaddr, sum, type, fio, true);
+ if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO)
+ invalidate_mapping_pages(META_MAPPING(fio->sbi),
+ fio->old_blkaddr, fio->old_blkaddr);
/* writeout dirty page into bdev */
- f2fs_submit_page_mbio(fio);
+ f2fs_submit_page_write(fio);
+ if (fio->retry) {
+ fio->old_blkaddr = fio->new_blkaddr;
+ goto reallocate;
+ }
+
+ update_device_state(fio);
- if (fio->type == NODE || fio->type == DATA)
- mutex_unlock(&fio->sbi->wio_mutex[fio->type]);
+ if (keep_order)
+ up_read(&fio->sbi->io_order_lock);
}
-void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
+void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
+ enum iostat_type io_type)
{
struct f2fs_io_info fio = {
.sbi = sbi,
.type = META,
+ .temp = HOT,
.op = REQ_OP_WRITE,
- .op_flags = WRITE_SYNC | REQ_META | REQ_PRIO,
+ .op_flags = REQ_SYNC | REQ_META | REQ_PRIO,
.old_blkaddr = page->index,
.new_blkaddr = page->index,
.page = page,
.encrypted_page = NULL,
+ .in_list = false,
};
if (unlikely(page->index >= MAIN_BLKADDR(sbi)))
fio.op_flags &= ~REQ_META;
set_page_writeback(page);
- f2fs_submit_page_mbio(&fio);
+ ClearPageError(page);
+ f2fs_submit_page_write(&fio);
+
+ stat_inc_meta_count(sbi, page->index);
+ f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE);
}
-void write_node_page(unsigned int nid, struct f2fs_io_info *fio)
+void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio)
{
struct f2fs_summary sum;
set_summary(&sum, nid, 0, 0);
do_write_page(&sum, fio);
+
+ f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
}
-void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio)
+void f2fs_outplace_write_data(struct dnode_of_data *dn,
+ struct f2fs_io_info *fio)
{
struct f2fs_sb_info *sbi = fio->sbi;
struct f2fs_summary sum;
- struct node_info ni;
f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
- get_node_info(sbi, dn->nid, &ni);
- set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
+ set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version);
do_write_page(&sum, fio);
f2fs_update_data_blkaddr(dn, fio->new_blkaddr);
+
+ f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE);
}
-void rewrite_data_page(struct f2fs_io_info *fio)
+int f2fs_inplace_write_data(struct f2fs_io_info *fio)
{
+ int err;
+ struct f2fs_sb_info *sbi = fio->sbi;
+
fio->new_blkaddr = fio->old_blkaddr;
+ /* i/o temperature is needed for passing down write hints */
+ __get_segment_type(fio);
+
+ f2fs_bug_on(sbi, !IS_DATASEG(get_seg_entry(sbi,
+ GET_SEGNO(sbi, fio->new_blkaddr))->type));
+
stat_inc_inplace_blocks(fio->sbi);
- f2fs_submit_page_mbio(fio);
+
+ err = f2fs_submit_page_bio(fio);
+ if (!err)
+ update_device_state(fio);
+
+ f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
+
+ return err;
+}
+
+static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi,
+ unsigned int segno)
+{
+ int i;
+
+ for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) {
+ if (CURSEG_I(sbi, i)->segno == segno)
+ break;
+ }
+ return i;
}
-void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
block_t old_blkaddr, block_t new_blkaddr,
bool recover_curseg, bool recover_newaddr)
{
@@ -1582,6 +3196,8 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
se = get_seg_entry(sbi, segno);
type = se->type;
+ down_write(&SM_I(sbi)->curseg_lock);
+
if (!recover_curseg) {
/* for recovery flow */
if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) {
@@ -1591,14 +3207,20 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
type = CURSEG_WARM_DATA;
}
} else {
- if (!IS_CURSEG(sbi, segno))
+ if (IS_CURSEG(sbi, segno)) {
+ /* se->type is volatile as SSR allocation */
+ type = __f2fs_get_curseg(sbi, segno);
+ f2fs_bug_on(sbi, type == NO_CHECK_TYPE);
+ } else {
type = CURSEG_WARM_DATA;
+ }
}
+ f2fs_bug_on(sbi, !IS_DATASEG(type));
curseg = CURSEG_I(sbi, type);
mutex_lock(&curseg->curseg_mutex);
- mutex_lock(&sit_i->sentry_lock);
+ down_write(&sit_i->sentry_lock);
old_cursegno = curseg->segno;
old_blkoff = curseg->next_blkoff;
@@ -1606,7 +3228,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
/* change the current segment */
if (segno != curseg->segno) {
curseg->next_segno = segno;
- change_curseg(sbi, type, true);
+ change_curseg(sbi, type);
}
curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
@@ -1614,8 +3236,11 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
if (!recover_curseg || recover_newaddr)
update_sit_entry(sbi, new_blkaddr, 1);
- if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
+ if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) {
+ invalidate_mapping_pages(META_MAPPING(sbi),
+ old_blkaddr, old_blkaddr);
update_sit_entry(sbi, old_blkaddr, -1);
+ }
locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr));
@@ -1625,13 +3250,14 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
if (recover_curseg) {
if (old_cursegno != curseg->segno) {
curseg->next_segno = old_cursegno;
- change_curseg(sbi, type, true);
+ change_curseg(sbi, type);
}
curseg->next_blkoff = old_blkoff;
}
- mutex_unlock(&sit_i->sentry_lock);
+ up_write(&sit_i->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
+ up_write(&SM_I(sbi)->curseg_lock);
}
void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
@@ -1643,41 +3269,55 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
set_summary(&sum, dn->nid, dn->ofs_in_node, version);
- __f2fs_replace_block(sbi, &sum, old_addr, new_addr,
+ f2fs_do_replace_block(sbi, &sum, old_addr, new_addr,
recover_curseg, recover_newaddr);
f2fs_update_data_blkaddr(dn, new_addr);
}
void f2fs_wait_on_page_writeback(struct page *page,
- enum page_type type, bool ordered)
+ enum page_type type, bool ordered, bool locked)
{
if (PageWriteback(page)) {
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
- f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, type, WRITE);
- if (ordered)
+ f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type);
+ if (ordered) {
wait_on_page_writeback(page);
- else
+ f2fs_bug_on(sbi, locked && PageWriteback(page));
+ } else {
wait_for_stable_page(page);
+ }
}
}
-void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi,
- block_t blkaddr)
+void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr)
{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *cpage;
+ if (!f2fs_post_read_required(inode))
+ return;
+
if (!is_valid_data_blkaddr(sbi, blkaddr))
return;
cpage = find_lock_page(META_MAPPING(sbi), blkaddr);
if (cpage) {
- f2fs_wait_on_page_writeback(cpage, DATA, true);
+ f2fs_wait_on_page_writeback(cpage, DATA, true, true);
f2fs_put_page(cpage, 1);
}
}
+void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr,
+ block_t len)
+{
+ block_t i;
+
+ for (i = 0; i < len; i++)
+ f2fs_wait_on_block_writeback(inode, blkaddr + i);
+}
+
static int read_compacted_summaries(struct f2fs_sb_info *sbi)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -1689,7 +3329,9 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
start = start_sum_block(sbi);
- page = get_meta_page(sbi, start++);
+ page = f2fs_get_meta_page(sbi, start++);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
kaddr = (unsigned char *)page_address(page);
/* Step 1: restore nat cache */
@@ -1729,7 +3371,9 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
f2fs_put_page(page, 1);
page = NULL;
- page = get_meta_page(sbi, start++);
+ page = f2fs_get_meta_page(sbi, start++);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
kaddr = (unsigned char *)page_address(page);
offset = 0;
}
@@ -1747,6 +3391,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
unsigned short blk_off;
unsigned int segno = 0;
block_t blk_addr = 0;
+ int err = 0;
/* get segment number and block addr */
if (IS_DATASEG(type)) {
@@ -1769,7 +3414,9 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
blk_addr = GET_SUM_BLOCK(sbi, segno);
}
- new = get_meta_page(sbi, blk_addr);
+ new = f2fs_get_meta_page(sbi, blk_addr);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
sum = (struct f2fs_summary_block *)page_address(new);
if (IS_NODESEG(type)) {
@@ -1781,13 +3428,9 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
ns->ofs_in_node = 0;
}
} else {
- int err;
-
- err = restore_node_summary(sbi, segno, sum);
- if (err) {
- f2fs_put_page(new, 1);
- return err;
- }
+ err = f2fs_restore_node_summary(sbi, segno, sum);
+ if (err)
+ goto out;
}
}
@@ -1807,8 +3450,9 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
curseg->alloc_type = ckpt->alloc_type[type];
curseg->next_blkoff = blk_off;
mutex_unlock(&curseg->curseg_mutex);
+out:
f2fs_put_page(new, 1);
- return 0;
+ return err;
}
static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
@@ -1819,20 +3463,21 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
int err;
if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) {
- int npages = npages_for_summary_flush(sbi, true);
+ int npages = f2fs_npages_for_summary_flush(sbi, true);
if (npages >= 2)
- ra_meta_pages(sbi, start_sum_block(sbi), npages,
+ f2fs_ra_meta_pages(sbi, start_sum_block(sbi), npages,
META_CP, true);
/* restore for compacted data summary */
- if (read_compacted_summaries(sbi))
- return -EINVAL;
+ err = read_compacted_summaries(sbi);
+ if (err)
+ return err;
type = CURSEG_HOT_NODE;
}
if (__exist_node_summaries(sbi))
- ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type),
+ f2fs_ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type),
NR_CURSEG_TYPE - type, META_CP, true);
for (; type <= CURSEG_COLD_NODE; type++) {
@@ -1858,8 +3503,9 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
int written_size = 0;
int i, j;
- page = grab_meta_page(sbi, blkaddr++);
+ page = f2fs_grab_meta_page(sbi, blkaddr++);
kaddr = (unsigned char *)page_address(page);
+ memset(kaddr, 0, PAGE_SIZE);
/* Step 1: write nat cache */
seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
@@ -1882,8 +3528,9 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
for (j = 0; j < blkoff; j++) {
if (!page) {
- page = grab_meta_page(sbi, blkaddr++);
+ page = f2fs_grab_meta_page(sbi, blkaddr++);
kaddr = (unsigned char *)page_address(page);
+ memset(kaddr, 0, PAGE_SIZE);
written_size = 0;
}
summary = (struct f2fs_summary *)(kaddr + written_size);
@@ -1918,7 +3565,7 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi,
write_current_sum_page(sbi, i, blkaddr + (i - type));
}
-void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
+void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
{
if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG))
write_compacted_summaries(sbi, start_blk);
@@ -1926,12 +3573,12 @@ void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA);
}
-void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
+void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
{
write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
}
-int lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
+int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
unsigned int val, int alloc)
{
int i;
@@ -1956,35 +3603,26 @@ int lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
unsigned int segno)
{
- return get_meta_page(sbi, current_sit_addr(sbi, segno));
+ return f2fs_get_meta_page_nofail(sbi, current_sit_addr(sbi, segno));
}
static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
unsigned int start)
{
struct sit_info *sit_i = SIT_I(sbi);
- struct page *src_page, *dst_page;
+ struct page *page;
pgoff_t src_off, dst_off;
- void *src_addr, *dst_addr;
src_off = current_sit_addr(sbi, start);
dst_off = next_sit_addr(sbi, src_off);
- /* get current sit block page without lock */
- src_page = get_meta_page(sbi, src_off);
- dst_page = grab_meta_page(sbi, dst_off);
- f2fs_bug_on(sbi, PageDirty(src_page));
-
- src_addr = page_address(src_page);
- dst_addr = page_address(dst_page);
- memcpy(dst_addr, src_addr, PAGE_SIZE);
-
- set_page_dirty(dst_page);
- f2fs_put_page(src_page, 1);
+ page = f2fs_grab_meta_page(sbi, dst_off);
+ seg_info_to_sit_page(sbi, page, start);
+ set_page_dirty(page);
set_to_next_sit(sit_i, start);
- return dst_page;
+ return page;
}
static struct sit_entry_set *grab_sit_entry_set(void)
@@ -2074,7 +3712,7 @@ static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
* CP calls this function, which flushes SIT entries including sit_journal,
* and moves prefree segs to free segs.
*/
-void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct sit_info *sit_i = SIT_I(sbi);
unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
@@ -2085,7 +3723,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
bool to_journal = true;
struct seg_entry *se;
- mutex_lock(&sit_i->sentry_lock);
+ down_write(&sit_i->sentry_lock);
if (!sit_i->dirty_sentries)
goto out;
@@ -2133,25 +3771,34 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
int offset, sit_offset;
se = get_seg_entry(sbi, segno);
+#ifdef CONFIG_F2FS_CHECK_FS
+ if (memcmp(se->cur_valid_map, se->cur_valid_map_mir,
+ SIT_VBLOCK_MAP_SIZE))
+ f2fs_bug_on(sbi, 1);
+#endif
/* add discard candidates */
- if (cpc->reason != CP_DISCARD) {
+ if (!(cpc->reason & CP_DISCARD)) {
cpc->trim_start = segno;
- add_discard_addrs(sbi, cpc);
+ add_discard_addrs(sbi, cpc, false);
}
if (to_journal) {
- offset = lookup_journal_in_cursum(journal,
+ offset = f2fs_lookup_journal_in_cursum(journal,
SIT_JOURNAL, segno, 1);
f2fs_bug_on(sbi, offset < 0);
segno_in_journal(journal, offset) =
cpu_to_le32(segno);
seg_info_to_raw_sit(se,
&sit_in_journal(journal, offset));
+ check_block_count(sbi, segno,
+ &sit_in_journal(journal, offset));
} else {
sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
seg_info_to_raw_sit(se,
&raw_sit->entries[sit_offset]);
+ check_block_count(sbi, segno,
+ &raw_sit->entries[sit_offset]);
}
__clear_bit(segno, bitmap);
@@ -2171,11 +3818,15 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_bug_on(sbi, !list_empty(head));
f2fs_bug_on(sbi, sit_i->dirty_sentries);
out:
- if (cpc->reason == CP_DISCARD) {
+ if (cpc->reason & CP_DISCARD) {
+ __u64 trim_start = cpc->trim_start;
+
for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++)
- add_discard_addrs(sbi, cpc);
+ add_discard_addrs(sbi, cpc, false);
+
+ cpc->trim_start = trim_start;
}
- mutex_unlock(&sit_i->sentry_lock);
+ up_write(&sit_i->sentry_lock);
set_prefree_as_free_segments(sbi);
}
@@ -2183,53 +3834,63 @@ out:
static int build_sit_info(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
- struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
struct sit_info *sit_i;
unsigned int sit_segs, start;
- char *src_bitmap, *dst_bitmap;
+ char *src_bitmap;
unsigned int bitmap_size;
/* allocate memory for SIT information */
- sit_i = kzalloc(sizeof(struct sit_info), GFP_KERNEL);
+ sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL);
if (!sit_i)
return -ENOMEM;
SM_I(sbi)->sit_info = sit_i;
- sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) *
- sizeof(struct seg_entry), GFP_KERNEL);
+ sit_i->sentries =
+ f2fs_kvzalloc(sbi, array_size(sizeof(struct seg_entry),
+ MAIN_SEGS(sbi)),
+ GFP_KERNEL);
if (!sit_i->sentries)
return -ENOMEM;
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
- sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
+ sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(sbi, bitmap_size,
+ GFP_KERNEL);
if (!sit_i->dirty_sentries_bitmap)
return -ENOMEM;
for (start = 0; start < MAIN_SEGS(sbi); start++) {
sit_i->sentries[start].cur_valid_map
- = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+ = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
sit_i->sentries[start].ckpt_valid_map
- = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+ = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
if (!sit_i->sentries[start].cur_valid_map ||
!sit_i->sentries[start].ckpt_valid_map)
return -ENOMEM;
- if (f2fs_discard_en(sbi)) {
- sit_i->sentries[start].discard_map
- = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
- if (!sit_i->sentries[start].discard_map)
- return -ENOMEM;
- }
+#ifdef CONFIG_F2FS_CHECK_FS
+ sit_i->sentries[start].cur_valid_map_mir
+ = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+ if (!sit_i->sentries[start].cur_valid_map_mir)
+ return -ENOMEM;
+#endif
+
+ sit_i->sentries[start].discard_map
+ = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE,
+ GFP_KERNEL);
+ if (!sit_i->sentries[start].discard_map)
+ return -ENOMEM;
}
- sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+ sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
if (!sit_i->tmp_map)
return -ENOMEM;
- if (sbi->segs_per_sec > 1) {
- sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) *
- sizeof(struct sec_entry), GFP_KERNEL);
+ if (__is_large_section(sbi)) {
+ sit_i->sec_entries =
+ f2fs_kvzalloc(sbi, array_size(sizeof(struct sec_entry),
+ MAIN_SECS(sbi)),
+ GFP_KERNEL);
if (!sit_i->sec_entries)
return -ENOMEM;
}
@@ -2241,23 +3902,28 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
bitmap_size = __bitmap_size(sbi, SIT_BITMAP);
src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP);
- dst_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL);
- if (!dst_bitmap)
+ sit_i->sit_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL);
+ if (!sit_i->sit_bitmap)
return -ENOMEM;
+#ifdef CONFIG_F2FS_CHECK_FS
+ sit_i->sit_bitmap_mir = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL);
+ if (!sit_i->sit_bitmap_mir)
+ return -ENOMEM;
+#endif
+
/* init SIT information */
sit_i->s_ops = &default_salloc_ops;
sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
- sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count);
- sit_i->sit_bitmap = dst_bitmap;
+ sit_i->written_valid_blocks = 0;
sit_i->bitmap_size = bitmap_size;
sit_i->dirty_sentries = 0;
sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
- sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec;
- mutex_init(&sit_i->sentry_lock);
+ sit_i->mounted_time = ktime_get_real_seconds();
+ init_rwsem(&sit_i->sentry_lock);
return 0;
}
@@ -2267,19 +3933,19 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
unsigned int bitmap_size, sec_bitmap_size;
/* allocate memory for free segmap information */
- free_i = kzalloc(sizeof(struct free_segmap_info), GFP_KERNEL);
+ free_i = f2fs_kzalloc(sbi, sizeof(struct free_segmap_info), GFP_KERNEL);
if (!free_i)
return -ENOMEM;
SM_I(sbi)->free_info = free_i;
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
- free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL);
+ free_i->free_segmap = f2fs_kvmalloc(sbi, bitmap_size, GFP_KERNEL);
if (!free_i->free_segmap)
return -ENOMEM;
sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
- free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL);
+ free_i->free_secmap = f2fs_kvmalloc(sbi, sec_bitmap_size, GFP_KERNEL);
if (!free_i->free_secmap)
return -ENOMEM;
@@ -2300,7 +3966,8 @@ static int build_curseg(struct f2fs_sb_info *sbi)
struct curseg_info *array;
int i;
- array = kcalloc(NR_CURSEG_TYPE, sizeof(*array), GFP_KERNEL);
+ array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, sizeof(*array)),
+ GFP_KERNEL);
if (!array)
return -ENOMEM;
@@ -2308,12 +3975,12 @@ static int build_curseg(struct f2fs_sb_info *sbi)
for (i = 0; i < NR_CURSEG_TYPE; i++) {
mutex_init(&array[i].curseg_mutex);
- array[i].sum_blk = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL);
if (!array[i].sum_blk)
return -ENOMEM;
init_rwsem(&array[i].journal_rwsem);
- array[i].journal = kzalloc(sizeof(struct f2fs_journal),
- GFP_KERNEL);
+ array[i].journal = f2fs_kzalloc(sbi,
+ sizeof(struct f2fs_journal), GFP_KERNEL);
if (!array[i].journal)
return -ENOMEM;
array[i].segno = NULL_SEGNO;
@@ -2332,11 +3999,12 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
int sit_blk_cnt = SIT_BLK_CNT(sbi);
unsigned int i, start, end;
unsigned int readed, start_blk = 0;
- int nrpages = MAX_BIO_BLOCKS(sbi) * 8;
int err = 0;
+ block_t total_node_blocks = 0;
do {
- readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true);
+ readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES,
+ META_SIT, true);
start = start_blk * sit_i->sents_per_block;
end = (start_blk + readed) * sit_i->sents_per_block;
@@ -2347,6 +4015,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
se = &sit_i->sentries[start];
page = get_current_sit_page(sbi, start);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
sit_blk = (struct f2fs_sit_block *)page_address(page);
sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
f2fs_put_page(page, 1);
@@ -2355,16 +4025,23 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
if (err)
return err;
seg_info_from_raw_sit(se, &sit);
+ if (IS_NODESEG(se->type))
+ total_node_blocks += se->valid_blocks;
/* build discard map only one time */
- if (f2fs_discard_en(sbi)) {
- memcpy(se->discard_map, se->cur_valid_map,
- SIT_VBLOCK_MAP_SIZE);
- sbi->discard_blks += sbi->blocks_per_seg -
- se->valid_blocks;
+ if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
+ memset(se->discard_map, 0xff,
+ SIT_VBLOCK_MAP_SIZE);
+ } else {
+ memcpy(se->discard_map,
+ se->cur_valid_map,
+ SIT_VBLOCK_MAP_SIZE);
+ sbi->discard_blks +=
+ sbi->blocks_per_seg -
+ se->valid_blocks;
}
- if (sbi->segs_per_sec > 1)
+ if (__is_large_section(sbi))
get_sec_entry(sbi, start)->valid_blocks +=
se->valid_blocks;
}
@@ -2389,24 +4066,42 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
sit = sit_in_journal(journal, i);
old_valid_blocks = se->valid_blocks;
+ if (IS_NODESEG(se->type))
+ total_node_blocks -= old_valid_blocks;
err = check_block_count(sbi, start, &sit);
if (err)
break;
seg_info_from_raw_sit(se, &sit);
+ if (IS_NODESEG(se->type))
+ total_node_blocks += se->valid_blocks;
- if (f2fs_discard_en(sbi)) {
+ if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
+ memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE);
+ } else {
memcpy(se->discard_map, se->cur_valid_map,
SIT_VBLOCK_MAP_SIZE);
- sbi->discard_blks += old_valid_blocks -
- se->valid_blocks;
+ sbi->discard_blks += old_valid_blocks;
+ sbi->discard_blks -= se->valid_blocks;
}
- if (sbi->segs_per_sec > 1)
+ if (__is_large_section(sbi)) {
get_sec_entry(sbi, start)->valid_blocks +=
- se->valid_blocks - old_valid_blocks;
+ se->valid_blocks;
+ get_sec_entry(sbi, start)->valid_blocks -=
+ old_valid_blocks;
+ }
}
up_read(&curseg->journal_rwsem);
+
+ if (!err && total_node_blocks != valid_node_count(sbi)) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "SIT is corrupted node# %u vs %u",
+ total_node_blocks, valid_node_count(sbi));
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ err = -EINVAL;
+ }
+
return err;
}
@@ -2419,6 +4114,9 @@ static void init_free_segmap(struct f2fs_sb_info *sbi)
struct seg_entry *sentry = get_seg_entry(sbi, start);
if (!sentry->valid_blocks)
__set_free(sbi, start);
+ else
+ SIT_I(sbi)->written_valid_blocks +=
+ sentry->valid_blocks;
}
/* set use the current segments */
@@ -2441,7 +4139,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
if (segno >= MAIN_SEGS(sbi))
break;
offset = segno + 1;
- valid_blocks = get_valid_blocks(sbi, segno, 0);
+ valid_blocks = get_valid_blocks(sbi, segno, false);
if (valid_blocks == sbi->blocks_per_seg || !valid_blocks)
continue;
if (valid_blocks > sbi->blocks_per_seg) {
@@ -2459,7 +4157,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi)
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
- dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
+ dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL);
if (!dirty_i->victim_secmap)
return -ENOMEM;
return 0;
@@ -2471,7 +4169,8 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
unsigned int bitmap_size, i;
/* allocate memory for dirty segments list information */
- dirty_i = kzalloc(sizeof(struct dirty_seglist_info), GFP_KERNEL);
+ dirty_i = f2fs_kzalloc(sbi, sizeof(struct dirty_seglist_info),
+ GFP_KERNEL);
if (!dirty_i)
return -ENOMEM;
@@ -2481,7 +4180,8 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
for (i = 0; i < NR_DIRTY_TYPE; i++) {
- dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
+ dirty_i->dirty_segmap[i] = f2fs_kvzalloc(sbi, bitmap_size,
+ GFP_KERNEL);
if (!dirty_i->dirty_segmap[i])
return -ENOMEM;
}
@@ -2498,9 +4198,9 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi)
struct sit_info *sit_i = SIT_I(sbi);
unsigned int segno;
- mutex_lock(&sit_i->sentry_lock);
+ down_write(&sit_i->sentry_lock);
- sit_i->min_mtime = LLONG_MAX;
+ sit_i->min_mtime = ULLONG_MAX;
for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
unsigned int i;
@@ -2514,18 +4214,18 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi)
if (sit_i->min_mtime > mtime)
sit_i->min_mtime = mtime;
}
- sit_i->max_mtime = get_mtime(sbi);
- mutex_unlock(&sit_i->sentry_lock);
+ sit_i->max_mtime = get_mtime(sbi, false);
+ up_write(&sit_i->sentry_lock);
}
-int build_segment_manager(struct f2fs_sb_info *sbi)
+int f2fs_build_segment_manager(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
struct f2fs_sm_info *sm_info;
int err;
- sm_info = kzalloc(sizeof(struct f2fs_sm_info), GFP_KERNEL);
+ sm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_sm_info), GFP_KERNEL);
if (!sm_info)
return -ENOMEM;
@@ -2547,22 +4247,24 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
-
- INIT_LIST_HEAD(&sm_info->discard_list);
- INIT_LIST_HEAD(&sm_info->wait_list);
- sm_info->nr_discards = 0;
- sm_info->max_discards = 0;
-
- sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS;
+ sm_info->min_seq_blocks = sbi->blocks_per_seg * sbi->segs_per_sec;
+ sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS;
+ sm_info->min_ssr_sections = reserved_sections(sbi);
INIT_LIST_HEAD(&sm_info->sit_entry_set);
+ init_rwsem(&sm_info->curseg_lock);
+
if (!f2fs_readonly(sbi->sb)) {
- err = create_flush_cmd_control(sbi);
+ err = f2fs_create_flush_cmd_control(sbi);
if (err)
return err;
}
+ err = create_discard_cmd_control(sbi);
+ if (err)
+ return err;
+
err = build_sit_info(sbi);
if (err)
return err;
@@ -2618,7 +4320,7 @@ static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
destroy_victim_secmap(sbi);
SM_I(sbi)->dirty_info = NULL;
- kfree(dirty_i);
+ kvfree(dirty_i);
}
static void destroy_curseg(struct f2fs_sb_info *sbi)
@@ -2630,10 +4332,10 @@ static void destroy_curseg(struct f2fs_sb_info *sbi)
return;
SM_I(sbi)->curseg_array = NULL;
for (i = 0; i < NR_CURSEG_TYPE; i++) {
- kfree(array[i].sum_blk);
- kfree(array[i].journal);
+ kvfree(array[i].sum_blk);
+ kvfree(array[i].journal);
}
- kfree(array);
+ kvfree(array);
}
static void destroy_free_segmap(struct f2fs_sb_info *sbi)
@@ -2644,7 +4346,7 @@ static void destroy_free_segmap(struct f2fs_sb_info *sbi)
SM_I(sbi)->free_info = NULL;
kvfree(free_i->free_segmap);
kvfree(free_i->free_secmap);
- kfree(free_i);
+ kvfree(free_i);
}
static void destroy_sit_info(struct f2fs_sb_info *sbi)
@@ -2657,53 +4359,60 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
if (sit_i->sentries) {
for (start = 0; start < MAIN_SEGS(sbi); start++) {
- kfree(sit_i->sentries[start].cur_valid_map);
- kfree(sit_i->sentries[start].ckpt_valid_map);
- kfree(sit_i->sentries[start].discard_map);
+ kvfree(sit_i->sentries[start].cur_valid_map);
+#ifdef CONFIG_F2FS_CHECK_FS
+ kvfree(sit_i->sentries[start].cur_valid_map_mir);
+#endif
+ kvfree(sit_i->sentries[start].ckpt_valid_map);
+ kvfree(sit_i->sentries[start].discard_map);
}
}
- kfree(sit_i->tmp_map);
+ kvfree(sit_i->tmp_map);
kvfree(sit_i->sentries);
kvfree(sit_i->sec_entries);
kvfree(sit_i->dirty_sentries_bitmap);
SM_I(sbi)->sit_info = NULL;
- kfree(sit_i->sit_bitmap);
- kfree(sit_i);
+ kvfree(sit_i->sit_bitmap);
+#ifdef CONFIG_F2FS_CHECK_FS
+ kvfree(sit_i->sit_bitmap_mir);
+#endif
+ kvfree(sit_i);
}
-void destroy_segment_manager(struct f2fs_sb_info *sbi)
+void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi)
{
struct f2fs_sm_info *sm_info = SM_I(sbi);
if (!sm_info)
return;
- destroy_flush_cmd_control(sbi);
+ f2fs_destroy_flush_cmd_control(sbi, true);
+ destroy_discard_cmd_control(sbi);
destroy_dirty_segmap(sbi);
destroy_curseg(sbi);
destroy_free_segmap(sbi);
destroy_sit_info(sbi);
sbi->sm_info = NULL;
- kfree(sm_info);
+ kvfree(sm_info);
}
-int __init create_segment_manager_caches(void)
+int __init f2fs_create_segment_manager_caches(void)
{
discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
sizeof(struct discard_entry));
if (!discard_entry_slab)
goto fail;
- bio_entry_slab = f2fs_kmem_cache_create("bio_entry",
- sizeof(struct bio_entry));
- if (!bio_entry_slab)
+ discard_cmd_slab = f2fs_kmem_cache_create("discard_cmd",
+ sizeof(struct discard_cmd));
+ if (!discard_cmd_slab)
goto destroy_discard_entry;
sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set",
sizeof(struct sit_entry_set));
if (!sit_entry_set_slab)
- goto destroy_bio_entry;
+ goto destroy_discard_cmd;
inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry",
sizeof(struct inmem_pages));
@@ -2713,18 +4422,18 @@ int __init create_segment_manager_caches(void)
destroy_sit_entry_set:
kmem_cache_destroy(sit_entry_set_slab);
-destroy_bio_entry:
- kmem_cache_destroy(bio_entry_slab);
+destroy_discard_cmd:
+ kmem_cache_destroy(discard_cmd_slab);
destroy_discard_entry:
kmem_cache_destroy(discard_entry_slab);
fail:
return -ENOMEM;
}
-void destroy_segment_manager_caches(void)
+void f2fs_destroy_segment_manager_caches(void)
{
kmem_cache_destroy(sit_entry_set_slab);
- kmem_cache_destroy(bio_entry_slab);
+ kmem_cache_destroy(discard_cmd_slab);
kmem_cache_destroy(discard_entry_slab);
kmem_cache_destroy(inmem_entry_slab);
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 893723978f5e..a77f76f528b6 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/segment.h
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
@@ -21,33 +18,37 @@
#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */
/* L: Logical segment # in volume, R: Relative segment # in main area */
-#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
-#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno)
+#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno)
+#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno)
+
+#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA)
+#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE)
-#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA)
-#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE)
+#define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA)
+#define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA)
+#define IS_COLD(t) ((t) == CURSEG_COLD_NODE || (t) == CURSEG_COLD_DATA)
#define IS_CURSEG(sbi, seg) \
- ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
+ (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
#define IS_CURSEC(sbi, secno) \
- ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
- sbi->segs_per_sec)) \
+ (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
+ (sbi)->segs_per_sec)) \
#define MAIN_BLKADDR(sbi) \
(SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \
@@ -57,48 +58,54 @@
le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment0_blkaddr))
#define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments)
-#define MAIN_SECS(sbi) (sbi->total_sections)
+#define MAIN_SECS(sbi) ((sbi)->total_sections)
#define TOTAL_SEGS(sbi) \
(SM_I(sbi) ? SM_I(sbi)->segment_count : \
le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count))
-#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg)
+#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg)
#define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi))
-#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \
- sbi->log_blocks_per_seg))
+#define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \
+ (sbi)->log_blocks_per_seg))
#define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \
- (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg))
+ (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg))
#define NEXT_FREE_BLKADDR(sbi, curseg) \
- (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff)
+ (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff)
#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi))
#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
- (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
+ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg)
#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \
- (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1))
+ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1))
#define GET_SEGNO(sbi, blk_addr) \
((!is_valid_data_blkaddr(sbi, blk_addr)) ? \
NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \
GET_SEGNO_FROM_SEG0(sbi, blk_addr)))
-#define GET_SECNO(sbi, segno) \
- ((segno) / sbi->segs_per_sec)
-#define GET_ZONENO_FROM_SEGNO(sbi, segno) \
- ((segno / sbi->segs_per_sec) / sbi->secs_per_zone)
+#define BLKS_PER_SEC(sbi) \
+ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg)
+#define GET_SEC_FROM_SEG(sbi, segno) \
+ ((segno) / (sbi)->segs_per_sec)
+#define GET_SEG_FROM_SEC(sbi, secno) \
+ ((secno) * (sbi)->segs_per_sec)
+#define GET_ZONE_FROM_SEC(sbi, secno) \
+ ((secno) / (sbi)->secs_per_zone)
+#define GET_ZONE_FROM_SEG(sbi, segno) \
+ GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno))
#define GET_SUM_BLOCK(sbi, segno) \
- ((sbi->sm_info->ssa_blkaddr) + segno)
+ ((sbi)->sm_info->ssa_blkaddr + (segno))
#define GET_SUM_TYPE(footer) ((footer)->entry_type)
-#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type)
+#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type))
#define SIT_ENTRY_OFFSET(sit_i, segno) \
- (segno % sit_i->sents_per_block)
+ ((segno) % (sit_i)->sents_per_block)
#define SIT_BLOCK_OFFSET(segno) \
- (segno / SIT_ENTRY_PER_BLOCK)
+ ((segno) / SIT_ENTRY_PER_BLOCK)
#define START_SEGNO(segno) \
(SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK)
#define SIT_BLK_CNT(sbi) \
@@ -109,9 +116,7 @@
#define SECTOR_FROM_BLOCK(blk_addr) \
(((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK)
#define SECTOR_TO_BLOCK(sectors) \
- (sectors >> F2FS_LOG_SECTORS_PER_BLOCK)
-#define MAX_BIO_BLOCKS(sbi) \
- ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES))
+ ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK)
/*
* indicate a block allocation direction: RIGHT and LEFT.
@@ -140,7 +145,10 @@ enum {
*/
enum {
GC_CB = 0,
- GC_GREEDY
+ GC_GREEDY,
+ ALLOC_NEXT,
+ FLUSH_DEVICE,
+ MAX_GC_POLICY,
};
/*
@@ -172,6 +180,9 @@ struct seg_entry {
unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */
unsigned int padding:6; /* padding */
unsigned char *cur_valid_map; /* validity bitmap of blocks */
+#ifdef CONFIG_F2FS_CHECK_FS
+ unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */
+#endif
/*
* # of valid blocks and the validity bitmap stored in the the last
* checkpoint pack. This information is used by the SSR mode.
@@ -194,9 +205,14 @@ struct segment_allocation {
* the page is atomically written, and it is in inmem_pages list.
*/
#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1)
+#define DUMMY_WRITTEN_PAGE ((unsigned long)-2)
#define IS_ATOMIC_WRITTEN_PAGE(page) \
(page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE)
+#define IS_DUMMY_WRITTEN_PAGE(page) \
+ (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE)
+
+#define MAX_SKIP_GC_COUNT 16
struct inmem_pages {
struct list_head list;
@@ -211,13 +227,16 @@ struct sit_info {
block_t sit_blocks; /* # of blocks used by SIT area */
block_t written_valid_blocks; /* # of valid blocks in main area */
char *sit_bitmap; /* SIT bitmap pointer */
+#ifdef CONFIG_F2FS_CHECK_FS
+ char *sit_bitmap_mir; /* SIT bitmap mirror */
+#endif
unsigned int bitmap_size; /* SIT bitmap size */
unsigned long *tmp_map; /* bitmap for temporal use */
unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */
unsigned int dirty_sentries; /* # of dirty sentries */
unsigned int sents_per_block; /* # of SIT entries per block */
- struct mutex sentry_lock; /* to protect SIT cache */
+ struct rw_semaphore sentry_lock; /* to protect SIT cache */
struct seg_entry *sentries; /* SIT segment-level cache */
struct sec_entry *sec_entries; /* SIT section-level cache */
@@ -226,6 +245,8 @@ struct sit_info {
unsigned long long mounted_time; /* mount time */
unsigned long long min_mtime; /* min. modification time */
unsigned long long max_mtime; /* max. modification time */
+
+ unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */
};
struct free_segmap_info {
@@ -302,22 +323,28 @@ static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi,
unsigned int segno)
{
struct sit_info *sit_i = SIT_I(sbi);
- return &sit_i->sec_entries[GET_SECNO(sbi, segno)];
+ return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)];
}
static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi,
- unsigned int segno, int section)
+ unsigned int segno, bool use_section)
{
/*
* In order to get # of valid blocks in a section instantly from many
* segments, f2fs manages two counting structures separately.
*/
- if (section > 1)
+ if (use_section && __is_large_section(sbi))
return get_sec_entry(sbi, segno)->valid_blocks;
else
return get_seg_entry(sbi, segno)->valid_blocks;
}
+static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi,
+ unsigned int segno)
+{
+ return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
+}
+
static inline void seg_info_from_raw_sit(struct seg_entry *se,
struct f2fs_sit_entry *rs)
{
@@ -325,20 +352,49 @@ static inline void seg_info_from_raw_sit(struct seg_entry *se,
se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs);
memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
+#ifdef CONFIG_F2FS_CHECK_FS
+ memcpy(se->cur_valid_map_mir, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
+#endif
se->type = GET_SIT_TYPE(rs);
se->mtime = le64_to_cpu(rs->mtime);
}
-static inline void seg_info_to_raw_sit(struct seg_entry *se,
+static inline void __seg_info_to_raw_sit(struct seg_entry *se,
struct f2fs_sit_entry *rs)
{
unsigned short raw_vblocks = (se->type << SIT_VBLOCKS_SHIFT) |
se->valid_blocks;
rs->vblocks = cpu_to_le16(raw_vblocks);
memcpy(rs->valid_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE);
+ rs->mtime = cpu_to_le64(se->mtime);
+}
+
+static inline void seg_info_to_sit_page(struct f2fs_sb_info *sbi,
+ struct page *page, unsigned int start)
+{
+ struct f2fs_sit_block *raw_sit;
+ struct seg_entry *se;
+ struct f2fs_sit_entry *rs;
+ unsigned int end = min(start + SIT_ENTRY_PER_BLOCK,
+ (unsigned long)MAIN_SEGS(sbi));
+ int i;
+
+ raw_sit = (struct f2fs_sit_block *)page_address(page);
+ memset(raw_sit, 0, PAGE_SIZE);
+ for (i = 0; i < end - start; i++) {
+ rs = &raw_sit->entries[i];
+ se = get_seg_entry(sbi, start + i);
+ __seg_info_to_raw_sit(se, rs);
+ }
+}
+
+static inline void seg_info_to_raw_sit(struct seg_entry *se,
+ struct f2fs_sit_entry *rs)
+{
+ __seg_info_to_raw_sit(se, rs);
+
memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
se->ckpt_valid_blocks = se->valid_blocks;
- rs->mtime = cpu_to_le64(se->mtime);
}
static inline unsigned int find_next_inuse(struct free_segmap_info *free_i,
@@ -354,8 +410,8 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i,
static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int secno = segno / sbi->segs_per_sec;
- unsigned int start_segno = secno * sbi->segs_per_sec;
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+ unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
unsigned int next;
spin_lock(&free_i->segmap_lock);
@@ -375,7 +431,8 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi,
unsigned int segno)
{
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int secno = segno / sbi->segs_per_sec;
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+
set_bit(segno, free_i->free_segmap);
free_i->free_segments--;
if (!test_and_set_bit(secno, free_i->free_secmap))
@@ -386,8 +443,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
unsigned int segno)
{
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int secno = segno / sbi->segs_per_sec;
- unsigned int start_segno = secno * sbi->segs_per_sec;
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+ unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
unsigned int next;
spin_lock(&free_i->segmap_lock);
@@ -411,7 +468,8 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
unsigned int segno)
{
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int secno = segno / sbi->segs_per_sec;
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+
spin_lock(&free_i->segmap_lock);
if (!test_and_set_bit(segno, free_i->free_segmap)) {
free_i->free_segments--;
@@ -425,6 +483,12 @@ static inline void get_sit_bitmap(struct f2fs_sb_info *sbi,
void *dst_addr)
{
struct sit_info *sit_i = SIT_I(sbi);
+
+#ifdef CONFIG_F2FS_CHECK_FS
+ if (memcmp(sit_i->sit_bitmap, sit_i->sit_bitmap_mir,
+ sit_i->bitmap_size))
+ f2fs_bug_on(sbi, 1);
+#endif
memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size);
}
@@ -468,26 +532,36 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi)
return SM_I(sbi)->ovp_segments;
}
-static inline int overprovision_sections(struct f2fs_sb_info *sbi)
-{
- return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec;
-}
-
static inline int reserved_sections(struct f2fs_sb_info *sbi)
{
- return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec;
+ return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi));
}
-static inline bool need_SSR(struct f2fs_sb_info *sbi)
+static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi)
{
- int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
- int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
+ unsigned int node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) +
+ get_pages(sbi, F2FS_DIRTY_DENTS);
+ unsigned int dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS);
+ unsigned int segno, left_blocks;
+ int i;
- if (test_opt(sbi, LFS))
- return false;
+ /* check current node segment */
+ for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) {
+ segno = CURSEG_I(sbi, i)->segno;
+ left_blocks = sbi->blocks_per_seg -
+ get_seg_entry(sbi, segno)->ckpt_valid_blocks;
- return free_sections(sbi) <= (node_secs + 2 * dent_secs +
- reserved_sections(sbi) + 1);
+ if (node_blocks > left_blocks)
+ return false;
+ }
+
+ /* check current data segment */
+ segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno;
+ left_blocks = sbi->blocks_per_seg -
+ get_seg_entry(sbi, segno)->ckpt_valid_blocks;
+ if (dent_blocks > left_blocks)
+ return false;
+ return true;
}
static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
@@ -495,14 +569,26 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
{
int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
-
- node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
+ int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
return false;
+ if (free_sections(sbi) + freed == reserved_sections(sbi) + needed &&
+ has_curseg_enough_space(sbi))
+ return false;
return (free_sections(sbi) + freed) <=
- (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed);
+ (node_secs + 2 * dent_secs + imeta_secs +
+ reserved_sections(sbi) + needed);
+}
+
+static inline int f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi)
+{
+ if (likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+ return 0;
+ if (likely(!has_not_enough_free_secs(sbi, 0, 0)))
+ return 0;
+ return -ENOSPC;
}
static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi)
@@ -532,6 +618,9 @@ static inline int utilization(struct f2fs_sb_info *sbi)
*/
#define DEF_MIN_IPU_UTIL 70
#define DEF_MIN_FSYNC_BLOCKS 8
+#define DEF_MIN_HOT_BLOCKS 16
+
+#define SMALL_VOLUME_SEGMENTS (16 * 512) /* 16GB */
enum {
F2FS_IPU_FORCE,
@@ -539,39 +628,9 @@ enum {
F2FS_IPU_UTIL,
F2FS_IPU_SSR_UTIL,
F2FS_IPU_FSYNC,
+ F2FS_IPU_ASYNC,
};
-static inline bool need_inplace_update(struct inode *inode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- unsigned int policy = SM_I(sbi)->ipu_policy;
-
- /* IPU can be done only for the user data */
- if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
- return false;
-
- if (test_opt(sbi, LFS))
- return false;
-
- if (policy & (0x1 << F2FS_IPU_FORCE))
- return true;
- if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi))
- return true;
- if (policy & (0x1 << F2FS_IPU_UTIL) &&
- utilization(sbi) > SM_I(sbi)->min_ipu_util)
- return true;
- if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) &&
- utilization(sbi) > SM_I(sbi)->min_ipu_util)
- return true;
-
- /* this is only set during fdatasync */
- if (policy & (0x1 << F2FS_IPU_FSYNC) &&
- is_inode_flag_set(inode, FI_NEED_IPU))
- return true;
-
- return false;
-}
-
static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi,
int type)
{
@@ -662,6 +721,12 @@ static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
check_seg_range(sbi, start);
+#ifdef CONFIG_F2FS_CHECK_FS
+ if (f2fs_test_bit(offset, sit_i->sit_bitmap) !=
+ f2fs_test_bit(offset, sit_i->sit_bitmap_mir))
+ f2fs_bug_on(sbi, 1);
+#endif
+
/* calculate sit block address */
if (f2fs_test_bit(offset, sit_i->sit_bitmap))
blk_addr += sit_i->sit_blocks;
@@ -687,13 +752,28 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
unsigned int block_off = SIT_BLOCK_OFFSET(start);
f2fs_change_bit(block_off, sit_i->sit_bitmap);
+#ifdef CONFIG_F2FS_CHECK_FS
+ f2fs_change_bit(block_off, sit_i->sit_bitmap_mir);
+#endif
}
-static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi)
+static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi,
+ bool base_time)
{
struct sit_info *sit_i = SIT_I(sbi);
- return sit_i->elapsed_time + CURRENT_TIME_SEC.tv_sec -
- sit_i->mounted_time;
+ time64_t diff, now = ktime_get_real_seconds();
+
+ if (now >= sit_i->mounted_time)
+ return sit_i->elapsed_time + now - sit_i->mounted_time;
+
+ /* system time is set to the past */
+ if (!base_time) {
+ diff = sit_i->mounted_time - now;
+ if (sit_i->elapsed_time >= diff)
+ return sit_i->elapsed_time - diff;
+ return 0;
+ }
+ return sit_i->elapsed_time;
}
static inline void set_summary(struct f2fs_summary *sum, nid_t nid,
@@ -717,15 +797,6 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type)
- (base + 1) + type;
}
-static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi,
- unsigned int secno)
-{
- if (get_valid_blocks(sbi, secno, sbi->segs_per_sec) >=
- sbi->fggc_threshold)
- return true;
- return false;
-}
-
static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno)
{
if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno))
@@ -733,19 +804,12 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno)
return false;
}
-static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
-{
- struct block_device *bdev = sbi->sb->s_bdev;
- struct request_queue *q = bdev_get_queue(bdev);
- return SECTOR_TO_BLOCK(queue_max_sectors(q));
-}
-
/*
* It is very important to gather dirty pages and write at once, so that we can
* submit a big bio without interfering other data writes.
* By default, 512 pages for directory data,
- * 512 pages (2MB) * 3 for three types of nodes, and
- * max_bio_blocks for meta are set.
+ * 512 pages (2MB) * 8 for nodes, and
+ * 256 pages * 8 for meta are set.
*/
static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
{
@@ -757,7 +821,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
else if (type == NODE)
return 8 * sbi->blocks_per_seg;
else if (type == META)
- return 8 * MAX_BIO_BLOCKS(sbi);
+ return 8 * BIO_MAX_PAGES;
else
return 0;
}
@@ -774,12 +838,36 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
return 0;
nr_to_write = wbc->nr_to_write;
-
+ desired = BIO_MAX_PAGES;
if (type == NODE)
- desired = 2 * max_hw_blocks(sbi);
- else
- desired = MAX_BIO_BLOCKS(sbi);
+ desired <<= 1;
wbc->nr_to_write = desired;
return desired - nr_to_write;
}
+
+static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ bool wakeup = false;
+ int i;
+
+ if (force)
+ goto wake_up;
+
+ mutex_lock(&dcc->cmd_lock);
+ for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
+ if (i + 1 < dcc->discard_granularity)
+ break;
+ if (!list_empty(&dcc->pend_list[i])) {
+ wakeup = true;
+ break;
+ }
+ }
+ mutex_unlock(&dcc->cmd_lock);
+ if (!wakeup)
+ return;
+wake_up:
+ dcc->discard_wake = 1;
+ wake_up_interruptible_all(&dcc->discard_wait_queue);
+}
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index 46c915425923..a467aca29cfe 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* f2fs shrinker support
* the basic infra was copied from fs/ubifs/shrinker.c
*
* Copyright (c) 2015 Motorola Mobility
* Copyright (c) 2015 Jaegeuk Kim <jaegeuk@kernel.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
@@ -21,14 +18,16 @@ static unsigned int shrinker_run_no;
static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi)
{
- return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt;
+ long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt;
+
+ return count > 0 ? count : 0;
}
static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)
{
- if (NM_I(sbi)->fcnt > MAX_FREE_NIDS)
- return NM_I(sbi)->fcnt - MAX_FREE_NIDS;
- return 0;
+ long count = NM_I(sbi)->nid_cnt[FREE_NID] - MAX_FREE_NIDS;
+
+ return count > 0 ? count : 0;
}
static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi)
@@ -107,11 +106,11 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink,
/* shrink clean nat cache entries */
if (freed < nr)
- freed += try_to_free_nats(sbi, nr - freed);
+ freed += f2fs_try_to_free_nats(sbi, nr - freed);
/* shrink free nids cache entries */
if (freed < nr)
- freed += try_to_free_nids(sbi, nr - freed);
+ freed += f2fs_try_to_free_nids(sbi, nr - freed);
spin_lock(&f2fs_list_lock);
p = p->next;
@@ -136,6 +135,6 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi)
f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi));
spin_lock(&f2fs_list_lock);
- list_del(&sbi->s_list);
+ list_del_init(&sbi->s_list);
spin_unlock(&f2fs_list_lock);
}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 83a96334dc07..3e0250b106e3 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/super.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/init.h>
@@ -22,8 +19,10 @@
#include <linux/random.h>
#include <linux/exportfs.h>
#include <linux/blkdev.h>
+#include <linux/quotaops.h>
#include <linux/f2fs_fs.h>
#include <linux/sysfs.h>
+#include <linux/quota.h>
#include "f2fs.h"
#include "node.h"
@@ -35,36 +34,43 @@
#define CREATE_TRACE_POINTS
#include <trace/events/f2fs.h>
-static struct proc_dir_entry *f2fs_proc_root;
static struct kmem_cache *f2fs_inode_cachep;
-static struct kset *f2fs_kset;
#ifdef CONFIG_F2FS_FAULT_INJECTION
-char *fault_name[FAULT_MAX] = {
+const char *f2fs_fault_name[FAULT_MAX] = {
[FAULT_KMALLOC] = "kmalloc",
+ [FAULT_KVMALLOC] = "kvmalloc",
[FAULT_PAGE_ALLOC] = "page alloc",
+ [FAULT_PAGE_GET] = "page get",
+ [FAULT_ALLOC_BIO] = "alloc bio",
[FAULT_ALLOC_NID] = "alloc nid",
[FAULT_ORPHAN] = "orphan",
[FAULT_BLOCK] = "no more block",
[FAULT_DIR_DEPTH] = "too big dir depth",
[FAULT_EVICT_INODE] = "evict_inode fail",
- [FAULT_IO] = "IO error",
+ [FAULT_TRUNCATE] = "truncate fail",
+ [FAULT_READ_IO] = "read IO error",
[FAULT_CHECKPOINT] = "checkpoint error",
+ [FAULT_DISCARD] = "discard error",
+ [FAULT_WRITE_IO] = "write IO error",
};
-static void f2fs_build_fault_attr(struct f2fs_sb_info *sbi,
- unsigned int rate)
+void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
+ unsigned int type)
{
- struct f2fs_fault_info *ffi = &sbi->fault_info;
+ struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info;
if (rate) {
atomic_set(&ffi->inject_ops, 0);
ffi->inject_rate = rate;
- ffi->inject_type = (1 << FAULT_MAX) - 1;
- } else {
- memset(ffi, 0, sizeof(struct f2fs_fault_info));
}
+
+ if (type)
+ ffi->inject_type = type;
+
+ if (!rate && !type)
+ memset(ffi, 0, sizeof(struct f2fs_fault_info));
}
#endif
@@ -82,6 +88,7 @@ enum {
Opt_discard,
Opt_nodiscard,
Opt_noheap,
+ Opt_heap,
Opt_user_xattr,
Opt_nouser_xattr,
Opt_acl,
@@ -89,6 +96,8 @@ enum {
Opt_active_logs,
Opt_disable_ext_identify,
Opt_inline_xattr,
+ Opt_noinline_xattr,
+ Opt_inline_xattr_size,
Opt_inline_data,
Opt_inline_dentry,
Opt_noinline_dentry,
@@ -100,10 +109,34 @@ enum {
Opt_noextent_cache,
Opt_noinline_data,
Opt_data_flush,
+ Opt_reserve_root,
+ Opt_resgid,
+ Opt_resuid,
Opt_mode,
+ Opt_io_size_bits,
Opt_fault_injection,
+ Opt_fault_type,
Opt_lazytime,
Opt_nolazytime,
+ Opt_quota,
+ Opt_noquota,
+ Opt_usrquota,
+ Opt_grpquota,
+ Opt_prjquota,
+ Opt_usrjquota,
+ Opt_grpjquota,
+ Opt_prjjquota,
+ Opt_offusrjquota,
+ Opt_offgrpjquota,
+ Opt_offprjjquota,
+ Opt_jqfmt_vfsold,
+ Opt_jqfmt_vfsv0,
+ Opt_jqfmt_vfsv1,
+ Opt_whint,
+ Opt_alloc,
+ Opt_fsync,
+ Opt_test_dummy_encryption,
+ Opt_checkpoint,
Opt_err,
};
@@ -114,6 +147,7 @@ static match_table_t f2fs_tokens = {
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
{Opt_noheap, "no_heap"},
+ {Opt_heap, "heap"},
{Opt_user_xattr, "user_xattr"},
{Opt_nouser_xattr, "nouser_xattr"},
{Opt_acl, "acl"},
@@ -121,6 +155,8 @@ static match_table_t f2fs_tokens = {
{Opt_active_logs, "active_logs=%u"},
{Opt_disable_ext_identify, "disable_ext_identify"},
{Opt_inline_xattr, "inline_xattr"},
+ {Opt_noinline_xattr, "noinline_xattr"},
+ {Opt_inline_xattr_size, "inline_xattr_size=%u"},
{Opt_inline_data, "inline_data"},
{Opt_inline_dentry, "inline_dentry"},
{Opt_noinline_dentry, "noinline_dentry"},
@@ -132,237 +168,206 @@ static match_table_t f2fs_tokens = {
{Opt_noextent_cache, "noextent_cache"},
{Opt_noinline_data, "noinline_data"},
{Opt_data_flush, "data_flush"},
+ {Opt_reserve_root, "reserve_root=%u"},
+ {Opt_resgid, "resgid=%u"},
+ {Opt_resuid, "resuid=%u"},
{Opt_mode, "mode=%s"},
+ {Opt_io_size_bits, "io_bits=%u"},
{Opt_fault_injection, "fault_injection=%u"},
+ {Opt_fault_type, "fault_type=%u"},
{Opt_lazytime, "lazytime"},
{Opt_nolazytime, "nolazytime"},
+ {Opt_quota, "quota"},
+ {Opt_noquota, "noquota"},
+ {Opt_usrquota, "usrquota"},
+ {Opt_grpquota, "grpquota"},
+ {Opt_prjquota, "prjquota"},
+ {Opt_usrjquota, "usrjquota=%s"},
+ {Opt_grpjquota, "grpjquota=%s"},
+ {Opt_prjjquota, "prjjquota=%s"},
+ {Opt_offusrjquota, "usrjquota="},
+ {Opt_offgrpjquota, "grpjquota="},
+ {Opt_offprjjquota, "prjjquota="},
+ {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
+ {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
+ {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
+ {Opt_whint, "whint_mode=%s"},
+ {Opt_alloc, "alloc_mode=%s"},
+ {Opt_fsync, "fsync_mode=%s"},
+ {Opt_test_dummy_encryption, "test_dummy_encryption"},
+ {Opt_checkpoint, "checkpoint=%s"},
{Opt_err, NULL},
};
-/* Sysfs support for f2fs */
-enum {
- GC_THREAD, /* struct f2fs_gc_thread */
- SM_INFO, /* struct f2fs_sm_info */
- NM_INFO, /* struct f2fs_nm_info */
- F2FS_SBI, /* struct f2fs_sb_info */
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- FAULT_INFO_RATE, /* struct f2fs_fault_info */
- FAULT_INFO_TYPE, /* struct f2fs_fault_info */
-#endif
-};
-
-struct f2fs_attr {
- struct attribute attr;
- ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *);
- ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *,
- const char *, size_t);
- int struct_type;
- int offset;
-};
-
-static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
+void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...)
{
- if (struct_type == GC_THREAD)
- return (unsigned char *)sbi->gc_thread;
- else if (struct_type == SM_INFO)
- return (unsigned char *)SM_I(sbi);
- else if (struct_type == NM_INFO)
- return (unsigned char *)NM_I(sbi);
- else if (struct_type == F2FS_SBI)
- return (unsigned char *)sbi;
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- else if (struct_type == FAULT_INFO_RATE ||
- struct_type == FAULT_INFO_TYPE)
- return (unsigned char *)&sbi->fault_info;
-#endif
- return NULL;
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf);
+ va_end(args);
}
-static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
- struct f2fs_sb_info *sbi, char *buf)
+static inline void limit_reserve_root(struct f2fs_sb_info *sbi)
{
- struct super_block *sb = sbi->sb;
-
- if (!sb->s_bdev->bd_part)
- return snprintf(buf, PAGE_SIZE, "0\n");
-
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- (unsigned long long)(sbi->kbytes_written +
- BD_PART_WRITTEN(sbi)));
+ block_t limit = (sbi->user_block_count << 1) / 1000;
+
+ /* limit is 0.2% */
+ if (test_opt(sbi, RESERVE_ROOT) &&
+ F2FS_OPTION(sbi).root_reserved_blocks > limit) {
+ F2FS_OPTION(sbi).root_reserved_blocks = limit;
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "Reduce reserved blocks for root = %u",
+ F2FS_OPTION(sbi).root_reserved_blocks);
+ }
+ if (!test_opt(sbi, RESERVE_ROOT) &&
+ (!uid_eq(F2FS_OPTION(sbi).s_resuid,
+ make_kuid(&init_user_ns, F2FS_DEF_RESUID)) ||
+ !gid_eq(F2FS_OPTION(sbi).s_resgid,
+ make_kgid(&init_user_ns, F2FS_DEF_RESGID))))
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root",
+ from_kuid_munged(&init_user_ns,
+ F2FS_OPTION(sbi).s_resuid),
+ from_kgid_munged(&init_user_ns,
+ F2FS_OPTION(sbi).s_resgid));
}
-static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
- struct f2fs_sb_info *sbi, char *buf)
+static void init_once(void *foo)
{
- unsigned char *ptr = NULL;
- unsigned int *ui;
-
- ptr = __struct_ptr(sbi, a->struct_type);
- if (!ptr)
- return -EINVAL;
-
- ui = (unsigned int *)(ptr + a->offset);
+ struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo;
- return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+ inode_init_once(&fi->vfs_inode);
}
-static ssize_t f2fs_sbi_store(struct f2fs_attr *a,
- struct f2fs_sb_info *sbi,
- const char *buf, size_t count)
+#ifdef CONFIG_QUOTA
+static const char * const quotatypes[] = INITQFNAMES;
+#define QTYPE2NAME(t) (quotatypes[t])
+static int f2fs_set_qf_name(struct super_block *sb, int qtype,
+ substring_t *args)
{
- unsigned char *ptr;
- unsigned long t;
- unsigned int *ui;
- ssize_t ret;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ char *qname;
+ int ret = -EINVAL;
- ptr = __struct_ptr(sbi, a->struct_type);
- if (!ptr)
+ if (sb_any_quota_loaded(sb) && !F2FS_OPTION(sbi).s_qf_names[qtype]) {
+ f2fs_msg(sb, KERN_ERR,
+ "Cannot change journaled "
+ "quota options when quota turned on");
return -EINVAL;
+ }
+ if (f2fs_sb_has_quota_ino(sbi)) {
+ f2fs_msg(sb, KERN_INFO,
+ "QUOTA feature is enabled, so ignore qf_name");
+ return 0;
+ }
- ui = (unsigned int *)(ptr + a->offset);
-
- ret = kstrtoul(skip_spaces(buf), 0, &t);
- if (ret < 0)
- return ret;
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX))
+ qname = match_strdup(args);
+ if (!qname) {
+ f2fs_msg(sb, KERN_ERR,
+ "Not enough memory for storing quotafile name");
return -EINVAL;
-#endif
- *ui = t;
- return count;
+ }
+ if (F2FS_OPTION(sbi).s_qf_names[qtype]) {
+ if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0)
+ ret = 0;
+ else
+ f2fs_msg(sb, KERN_ERR,
+ "%s quota file already specified",
+ QTYPE2NAME(qtype));
+ goto errout;
+ }
+ if (strchr(qname, '/')) {
+ f2fs_msg(sb, KERN_ERR,
+ "quotafile must be on filesystem root");
+ goto errout;
+ }
+ F2FS_OPTION(sbi).s_qf_names[qtype] = qname;
+ set_opt(sbi, QUOTA);
+ return 0;
+errout:
+ kvfree(qname);
+ return ret;
}
-static ssize_t f2fs_attr_show(struct kobject *kobj,
- struct attribute *attr, char *buf)
+static int f2fs_clear_qf_name(struct super_block *sb, int qtype)
{
- struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
- s_kobj);
- struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
- return a->show ? a->show(a, sbi, buf) : 0;
+ if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) {
+ f2fs_msg(sb, KERN_ERR, "Cannot change journaled quota options"
+ " when quota turned on");
+ return -EINVAL;
+ }
+ kvfree(F2FS_OPTION(sbi).s_qf_names[qtype]);
+ F2FS_OPTION(sbi).s_qf_names[qtype] = NULL;
+ return 0;
}
-static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr,
- const char *buf, size_t len)
+static int f2fs_check_quota_options(struct f2fs_sb_info *sbi)
{
- struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
- s_kobj);
- struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
-
- return a->store ? a->store(a, sbi, buf, len) : 0;
-}
+ /*
+ * We do the test below only for project quotas. 'usrquota' and
+ * 'grpquota' mount options are allowed even without quota feature
+ * to support legacy quotas in quota files.
+ */
+ if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi)) {
+ f2fs_msg(sbi->sb, KERN_ERR, "Project quota feature not enabled. "
+ "Cannot enable project quota enforcement.");
+ return -1;
+ }
+ if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] ||
+ F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] ||
+ F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) {
+ if (test_opt(sbi, USRQUOTA) &&
+ F2FS_OPTION(sbi).s_qf_names[USRQUOTA])
+ clear_opt(sbi, USRQUOTA);
+
+ if (test_opt(sbi, GRPQUOTA) &&
+ F2FS_OPTION(sbi).s_qf_names[GRPQUOTA])
+ clear_opt(sbi, GRPQUOTA);
+
+ if (test_opt(sbi, PRJQUOTA) &&
+ F2FS_OPTION(sbi).s_qf_names[PRJQUOTA])
+ clear_opt(sbi, PRJQUOTA);
+
+ if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) ||
+ test_opt(sbi, PRJQUOTA)) {
+ f2fs_msg(sbi->sb, KERN_ERR, "old and new quota "
+ "format mixing");
+ return -1;
+ }
-static void f2fs_sb_release(struct kobject *kobj)
-{
- struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
- s_kobj);
- complete(&sbi->s_kobj_unregister);
-}
+ if (!F2FS_OPTION(sbi).s_jquota_fmt) {
+ f2fs_msg(sbi->sb, KERN_ERR, "journaled quota format "
+ "not specified");
+ return -1;
+ }
+ }
-#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \
-static struct f2fs_attr f2fs_attr_##_name = { \
- .attr = {.name = __stringify(_name), .mode = _mode }, \
- .show = _show, \
- .store = _store, \
- .struct_type = _struct_type, \
- .offset = _offset \
+ if (f2fs_sb_has_quota_ino(sbi) && F2FS_OPTION(sbi).s_jquota_fmt) {
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "QUOTA feature is enabled, so ignore jquota_fmt");
+ F2FS_OPTION(sbi).s_jquota_fmt = 0;
+ }
+ return 0;
}
-
-#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \
- F2FS_ATTR_OFFSET(struct_type, name, 0644, \
- f2fs_sbi_show, f2fs_sbi_store, \
- offsetof(struct struct_name, elname))
-
-#define F2FS_GENERAL_RO_ATTR(name) \
-static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL)
-
-F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time);
-F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time);
-F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
-F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle);
-F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
-F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
-F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
-F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
-F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
-F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
-F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
-F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
-F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
-F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
-F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
-F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]);
-F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]);
-#ifdef CONFIG_F2FS_FAULT_INJECTION
-F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate);
-F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type);
#endif
-F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
-
-#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
-static struct attribute *f2fs_attrs[] = {
- ATTR_LIST(gc_min_sleep_time),
- ATTR_LIST(gc_max_sleep_time),
- ATTR_LIST(gc_no_gc_sleep_time),
- ATTR_LIST(gc_idle),
- ATTR_LIST(reclaim_segments),
- ATTR_LIST(max_small_discards),
- ATTR_LIST(batched_trim_sections),
- ATTR_LIST(ipu_policy),
- ATTR_LIST(min_ipu_util),
- ATTR_LIST(min_fsync_blocks),
- ATTR_LIST(max_victim_search),
- ATTR_LIST(dir_level),
- ATTR_LIST(ram_thresh),
- ATTR_LIST(ra_nid_pages),
- ATTR_LIST(dirty_nats_ratio),
- ATTR_LIST(cp_interval),
- ATTR_LIST(idle_interval),
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- ATTR_LIST(inject_rate),
- ATTR_LIST(inject_type),
-#endif
- ATTR_LIST(lifetime_write_kbytes),
- NULL,
-};
-
-static const struct sysfs_ops f2fs_attr_ops = {
- .show = f2fs_attr_show,
- .store = f2fs_attr_store,
-};
-
-static struct kobj_type f2fs_ktype = {
- .default_attrs = f2fs_attrs,
- .sysfs_ops = &f2fs_attr_ops,
- .release = f2fs_sb_release,
-};
-
-void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf);
- va_end(args);
-}
-
-static void init_once(void *foo)
-{
- struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo;
-
- inode_init_once(&fi->vfs_inode);
-}
static int parse_options(struct super_block *sb, char *options)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
- struct request_queue *q;
substring_t args[MAX_OPT_ARGS];
char *p, *name;
int arg = 0;
+ kuid_t uid;
+ kgid_t gid;
+#ifdef CONFIG_QUOTA
+ int ret;
+#endif
if (!options)
return 0;
@@ -394,10 +399,10 @@ static int parse_options(struct super_block *sb, char *options)
set_opt(sbi, BG_GC);
set_opt(sbi, FORCE_FG_GC);
} else {
- kfree(name);
+ kvfree(name);
return -EINVAL;
}
- kfree(name);
+ kvfree(name);
break;
case Opt_disable_roll_forward:
set_opt(sbi, DISABLE_ROLL_FORWARD);
@@ -409,20 +414,22 @@ static int parse_options(struct super_block *sb, char *options)
return -EINVAL;
break;
case Opt_discard:
- q = bdev_get_queue(sb->s_bdev);
- if (blk_queue_discard(q)) {
- set_opt(sbi, DISCARD);
- } else {
- f2fs_msg(sb, KERN_WARNING,
- "mounting with \"discard\" option, but "
- "the device does not support discard");
- }
+ set_opt(sbi, DISCARD);
break;
case Opt_nodiscard:
+ if (f2fs_sb_has_blkzoned(sbi)) {
+ f2fs_msg(sb, KERN_WARNING,
+ "discard is required for zoned block devices");
+ return -EINVAL;
+ }
clear_opt(sbi, DISCARD);
+ break;
case Opt_noheap:
set_opt(sbi, NOHEAP);
break;
+ case Opt_heap:
+ clear_opt(sbi, NOHEAP);
+ break;
#ifdef CONFIG_F2FS_FS_XATTR
case Opt_user_xattr:
set_opt(sbi, XATTR_USER);
@@ -433,6 +440,15 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_inline_xattr:
set_opt(sbi, INLINE_XATTR);
break;
+ case Opt_noinline_xattr:
+ clear_opt(sbi, INLINE_XATTR);
+ break;
+ case Opt_inline_xattr_size:
+ if (args->from && match_int(args, &arg))
+ return -EINVAL;
+ set_opt(sbi, INLINE_XATTR_SIZE);
+ F2FS_OPTION(sbi).inline_xattr_size = arg;
+ break;
#else
case Opt_user_xattr:
f2fs_msg(sb, KERN_INFO,
@@ -446,6 +462,10 @@ static int parse_options(struct super_block *sb, char *options)
f2fs_msg(sb, KERN_INFO,
"inline_xattr options not supported");
break;
+ case Opt_noinline_xattr:
+ f2fs_msg(sb, KERN_INFO,
+ "noinline_xattr options not supported");
+ break;
#endif
#ifdef CONFIG_F2FS_FS_POSIX_ACL
case Opt_acl:
@@ -467,7 +487,7 @@ static int parse_options(struct super_block *sb, char *options)
return -EINVAL;
if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
return -EINVAL;
- sbi->active_logs = arg;
+ F2FS_OPTION(sbi).active_logs = arg;
break;
case Opt_disable_ext_identify:
set_opt(sbi, DISABLE_EXT_IDENTIFY);
@@ -505,6 +525,40 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_data_flush:
set_opt(sbi, DATA_FLUSH);
break;
+ case Opt_reserve_root:
+ if (args->from && match_int(args, &arg))
+ return -EINVAL;
+ if (test_opt(sbi, RESERVE_ROOT)) {
+ f2fs_msg(sb, KERN_INFO,
+ "Preserve previous reserve_root=%u",
+ F2FS_OPTION(sbi).root_reserved_blocks);
+ } else {
+ F2FS_OPTION(sbi).root_reserved_blocks = arg;
+ set_opt(sbi, RESERVE_ROOT);
+ }
+ break;
+ case Opt_resuid:
+ if (args->from && match_int(args, &arg))
+ return -EINVAL;
+ uid = make_kuid(current_user_ns(), arg);
+ if (!uid_valid(uid)) {
+ f2fs_msg(sb, KERN_ERR,
+ "Invalid uid value %d", arg);
+ return -EINVAL;
+ }
+ F2FS_OPTION(sbi).s_resuid = uid;
+ break;
+ case Opt_resgid:
+ if (args->from && match_int(args, &arg))
+ return -EINVAL;
+ gid = make_kgid(current_user_ns(), arg);
+ if (!gid_valid(gid)) {
+ f2fs_msg(sb, KERN_ERR,
+ "Invalid gid value %d", arg);
+ return -EINVAL;
+ }
+ F2FS_OPTION(sbi).s_resgid = gid;
+ break;
case Opt_mode:
name = match_strdup(&args[0]);
@@ -512,32 +566,228 @@ static int parse_options(struct super_block *sb, char *options)
return -ENOMEM;
if (strlen(name) == 8 &&
!strncmp(name, "adaptive", 8)) {
+ if (f2fs_sb_has_blkzoned(sbi)) {
+ f2fs_msg(sb, KERN_WARNING,
+ "adaptive mode is not allowed with "
+ "zoned block device feature");
+ kvfree(name);
+ return -EINVAL;
+ }
set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
} else if (strlen(name) == 3 &&
!strncmp(name, "lfs", 3)) {
set_opt_mode(sbi, F2FS_MOUNT_LFS);
} else {
- kfree(name);
+ kvfree(name);
return -EINVAL;
}
- kfree(name);
+ kvfree(name);
break;
- case Opt_fault_injection:
+ case Opt_io_size_bits:
if (args->from && match_int(args, &arg))
return -EINVAL;
+ if (arg > __ilog2_u32(BIO_MAX_PAGES)) {
+ f2fs_msg(sb, KERN_WARNING,
+ "Not support %d, larger than %d",
+ 1 << arg, BIO_MAX_PAGES);
+ return -EINVAL;
+ }
+ F2FS_OPTION(sbi).write_io_size_bits = arg;
+ break;
#ifdef CONFIG_F2FS_FAULT_INJECTION
- f2fs_build_fault_attr(sbi, arg);
+ case Opt_fault_injection:
+ if (args->from && match_int(args, &arg))
+ return -EINVAL;
+ f2fs_build_fault_attr(sbi, arg, F2FS_ALL_FAULT_TYPE);
+ set_opt(sbi, FAULT_INJECTION);
+ break;
+
+ case Opt_fault_type:
+ if (args->from && match_int(args, &arg))
+ return -EINVAL;
+ f2fs_build_fault_attr(sbi, 0, arg);
+ set_opt(sbi, FAULT_INJECTION);
+ break;
#else
+ case Opt_fault_injection:
f2fs_msg(sb, KERN_INFO,
- "FAULT_INJECTION was not selected");
-#endif
+ "fault_injection options not supported");
+ break;
+
+ case Opt_fault_type:
+ f2fs_msg(sb, KERN_INFO,
+ "fault_type options not supported");
break;
+#endif
case Opt_lazytime:
sb->s_flags |= MS_LAZYTIME;
break;
case Opt_nolazytime:
sb->s_flags &= ~MS_LAZYTIME;
break;
+#ifdef CONFIG_QUOTA
+ case Opt_quota:
+ case Opt_usrquota:
+ set_opt(sbi, USRQUOTA);
+ break;
+ case Opt_grpquota:
+ set_opt(sbi, GRPQUOTA);
+ break;
+ case Opt_prjquota:
+ set_opt(sbi, PRJQUOTA);
+ break;
+ case Opt_usrjquota:
+ ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]);
+ if (ret)
+ return ret;
+ break;
+ case Opt_grpjquota:
+ ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]);
+ if (ret)
+ return ret;
+ break;
+ case Opt_prjjquota:
+ ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]);
+ if (ret)
+ return ret;
+ break;
+ case Opt_offusrjquota:
+ ret = f2fs_clear_qf_name(sb, USRQUOTA);
+ if (ret)
+ return ret;
+ break;
+ case Opt_offgrpjquota:
+ ret = f2fs_clear_qf_name(sb, GRPQUOTA);
+ if (ret)
+ return ret;
+ break;
+ case Opt_offprjjquota:
+ ret = f2fs_clear_qf_name(sb, PRJQUOTA);
+ if (ret)
+ return ret;
+ break;
+ case Opt_jqfmt_vfsold:
+ F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_OLD;
+ break;
+ case Opt_jqfmt_vfsv0:
+ F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V0;
+ break;
+ case Opt_jqfmt_vfsv1:
+ F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V1;
+ break;
+ case Opt_noquota:
+ clear_opt(sbi, QUOTA);
+ clear_opt(sbi, USRQUOTA);
+ clear_opt(sbi, GRPQUOTA);
+ clear_opt(sbi, PRJQUOTA);
+ break;
+#else
+ case Opt_quota:
+ case Opt_usrquota:
+ case Opt_grpquota:
+ case Opt_prjquota:
+ case Opt_usrjquota:
+ case Opt_grpjquota:
+ case Opt_prjjquota:
+ case Opt_offusrjquota:
+ case Opt_offgrpjquota:
+ case Opt_offprjjquota:
+ case Opt_jqfmt_vfsold:
+ case Opt_jqfmt_vfsv0:
+ case Opt_jqfmt_vfsv1:
+ case Opt_noquota:
+ f2fs_msg(sb, KERN_INFO,
+ "quota operations not supported");
+ break;
+#endif
+ case Opt_whint:
+ name = match_strdup(&args[0]);
+ if (!name)
+ return -ENOMEM;
+ if (strlen(name) == 10 &&
+ !strncmp(name, "user-based", 10)) {
+ F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER;
+ } else if (strlen(name) == 3 &&
+ !strncmp(name, "off", 3)) {
+ F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
+ } else if (strlen(name) == 8 &&
+ !strncmp(name, "fs-based", 8)) {
+ F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS;
+ } else {
+ kvfree(name);
+ return -EINVAL;
+ }
+ kvfree(name);
+ break;
+ case Opt_alloc:
+ name = match_strdup(&args[0]);
+ if (!name)
+ return -ENOMEM;
+
+ if (strlen(name) == 7 &&
+ !strncmp(name, "default", 7)) {
+ F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
+ } else if (strlen(name) == 5 &&
+ !strncmp(name, "reuse", 5)) {
+ F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
+ } else {
+ kvfree(name);
+ return -EINVAL;
+ }
+ kvfree(name);
+ break;
+ case Opt_fsync:
+ name = match_strdup(&args[0]);
+ if (!name)
+ return -ENOMEM;
+ if (strlen(name) == 5 &&
+ !strncmp(name, "posix", 5)) {
+ F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX;
+ } else if (strlen(name) == 6 &&
+ !strncmp(name, "strict", 6)) {
+ F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT;
+ } else if (strlen(name) == 9 &&
+ !strncmp(name, "nobarrier", 9)) {
+ F2FS_OPTION(sbi).fsync_mode =
+ FSYNC_MODE_NOBARRIER;
+ } else {
+ kvfree(name);
+ return -EINVAL;
+ }
+ kvfree(name);
+ break;
+ case Opt_test_dummy_encryption:
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ if (!f2fs_sb_has_encrypt(sbi)) {
+ f2fs_msg(sb, KERN_ERR, "Encrypt feature is off");
+ return -EINVAL;
+ }
+
+ F2FS_OPTION(sbi).test_dummy_encryption = true;
+ f2fs_msg(sb, KERN_INFO,
+ "Test dummy encryption mode enabled");
+#else
+ f2fs_msg(sb, KERN_INFO,
+ "Test dummy encryption mount option ignored");
+#endif
+ break;
+ case Opt_checkpoint:
+ name = match_strdup(&args[0]);
+ if (!name)
+ return -ENOMEM;
+
+ if (strlen(name) == 6 &&
+ !strncmp(name, "enable", 6)) {
+ clear_opt(sbi, DISABLE_CHECKPOINT);
+ } else if (strlen(name) == 7 &&
+ !strncmp(name, "disable", 7)) {
+ set_opt(sbi, DISABLE_CHECKPOINT);
+ } else {
+ kvfree(name);
+ return -EINVAL;
+ }
+ kvfree(name);
+ break;
default:
f2fs_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" or missing value",
@@ -545,6 +795,68 @@ static int parse_options(struct super_block *sb, char *options)
return -EINVAL;
}
}
+#ifdef CONFIG_QUOTA
+ if (f2fs_check_quota_options(sbi))
+ return -EINVAL;
+#else
+ if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sbi->sb)) {
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "Filesystem with quota feature cannot be mounted RDWR "
+ "without CONFIG_QUOTA");
+ return -EINVAL;
+ }
+ if (f2fs_sb_has_project_quota(sbi) && !f2fs_readonly(sbi->sb)) {
+ f2fs_msg(sb, KERN_ERR,
+ "Filesystem with project quota feature cannot be "
+ "mounted RDWR without CONFIG_QUOTA");
+ return -EINVAL;
+ }
+#endif
+
+ if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) {
+ f2fs_msg(sb, KERN_ERR,
+ "Should set mode=lfs with %uKB-sized IO",
+ F2FS_IO_SIZE_KB(sbi));
+ return -EINVAL;
+ }
+
+ if (test_opt(sbi, INLINE_XATTR_SIZE)) {
+ if (!f2fs_sb_has_extra_attr(sbi) ||
+ !f2fs_sb_has_flexible_inline_xattr(sbi)) {
+ f2fs_msg(sb, KERN_ERR,
+ "extra_attr or flexible_inline_xattr "
+ "feature is off");
+ return -EINVAL;
+ }
+ if (!test_opt(sbi, INLINE_XATTR)) {
+ f2fs_msg(sb, KERN_ERR,
+ "inline_xattr_size option should be "
+ "set with inline_xattr option");
+ return -EINVAL;
+ }
+ if (!F2FS_OPTION(sbi).inline_xattr_size ||
+ F2FS_OPTION(sbi).inline_xattr_size >=
+ DEF_ADDRS_PER_INODE -
+ F2FS_TOTAL_EXTRA_ATTR_SIZE -
+ DEF_INLINE_RESERVED_SIZE -
+ DEF_MIN_INLINE_SIZE) {
+ f2fs_msg(sb, KERN_ERR,
+ "inline xattr size is out of range");
+ return -EINVAL;
+ }
+ }
+
+ if (test_opt(sbi, DISABLE_CHECKPOINT) && test_opt(sbi, LFS)) {
+ f2fs_msg(sb, KERN_ERR,
+ "LFS not compatible with checkpoint=disable\n");
+ return -EINVAL;
+ }
+
+ /* Not pass down write hints if the number of active logs is lesser
+ * than NR_CURSEG_TYPE.
+ */
+ if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE)
+ F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
return 0;
}
@@ -559,25 +871,27 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
init_once((void *) fi);
/* Initialize f2fs-specific inode info */
- fi->vfs_inode.i_version = 1;
atomic_set(&fi->dirty_pages, 0);
- fi->i_current_depth = 1;
- fi->i_advise = 0;
init_rwsem(&fi->i_sem);
INIT_LIST_HEAD(&fi->dirty_list);
INIT_LIST_HEAD(&fi->gdirty_list);
+ INIT_LIST_HEAD(&fi->inmem_ilist);
INIT_LIST_HEAD(&fi->inmem_pages);
mutex_init(&fi->inmem_lock);
- init_rwsem(&fi->dio_rwsem[READ]);
- init_rwsem(&fi->dio_rwsem[WRITE]);
+ init_rwsem(&fi->i_gc_rwsem[READ]);
+ init_rwsem(&fi->i_gc_rwsem[WRITE]);
+ init_rwsem(&fi->i_mmap_sem);
+ init_rwsem(&fi->i_xattr_sem);
/* Will be used by directory only */
fi->i_dir_level = F2FS_SB(sb)->dir_level;
+
return &fi->vfs_inode;
}
static int f2fs_drop_inode(struct inode *inode)
{
+ int ret;
/*
* This is to avoid a deadlock condition like below.
* writeback_single_inode(inode)
@@ -593,7 +907,7 @@ static int f2fs_drop_inode(struct inode *inode)
/* some remained atomic pages should discarded */
if (f2fs_is_atomic_file(inode))
- drop_inmem_pages(inode);
+ f2fs_drop_inmem_pages(inode);
/* should remain fi->extent_tree for writepage */
f2fs_destroy_extent_node(inode);
@@ -606,34 +920,36 @@ static int f2fs_drop_inode(struct inode *inode)
sb_end_intwrite(inode->i_sb);
- fscrypt_put_encryption_info(inode, NULL);
spin_lock(&inode->i_lock);
atomic_dec(&inode->i_count);
}
+ trace_f2fs_drop_inode(inode, 0);
return 0;
}
-
- return generic_drop_inode(inode);
+ ret = generic_drop_inode(inode);
+ trace_f2fs_drop_inode(inode, ret);
+ return ret;
}
-int f2fs_inode_dirtied(struct inode *inode)
+int f2fs_inode_dirtied(struct inode *inode, bool sync)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int ret = 0;
spin_lock(&sbi->inode_lock[DIRTY_META]);
if (is_inode_flag_set(inode, FI_DIRTY_INODE)) {
- spin_unlock(&sbi->inode_lock[DIRTY_META]);
- return 1;
+ ret = 1;
+ } else {
+ set_inode_flag(inode, FI_DIRTY_INODE);
+ stat_inc_dirty_inode(sbi, DIRTY_META);
}
-
- set_inode_flag(inode, FI_DIRTY_INODE);
- list_add_tail(&F2FS_I(inode)->gdirty_list,
+ if (sync && list_empty(&F2FS_I(inode)->gdirty_list)) {
+ list_add_tail(&F2FS_I(inode)->gdirty_list,
&sbi->inode_list[DIRTY_META]);
- inc_page_count(sbi, F2FS_DIRTY_IMETA);
- stat_inc_dirty_inode(sbi, DIRTY_META);
+ inc_page_count(sbi, F2FS_DIRTY_IMETA);
+ }
spin_unlock(&sbi->inode_lock[DIRTY_META]);
-
- return 0;
+ return ret;
}
void f2fs_inode_synced(struct inode *inode)
@@ -645,10 +961,12 @@ void f2fs_inode_synced(struct inode *inode)
spin_unlock(&sbi->inode_lock[DIRTY_META]);
return;
}
- list_del_init(&F2FS_I(inode)->gdirty_list);
+ if (!list_empty(&F2FS_I(inode)->gdirty_list)) {
+ list_del_init(&F2FS_I(inode)->gdirty_list);
+ dec_page_count(sbi, F2FS_DIRTY_IMETA);
+ }
clear_inode_flag(inode, FI_DIRTY_INODE);
clear_inode_flag(inode, FI_AUTO_RECOVER);
- dec_page_count(sbi, F2FS_DIRTY_IMETA);
stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META);
spin_unlock(&sbi->inode_lock[DIRTY_META]);
}
@@ -672,7 +990,7 @@ static void f2fs_dirty_inode(struct inode *inode, int flags)
if (is_inode_flag_set(inode, FI_AUTO_RECOVER))
clear_inode_flag(inode, FI_AUTO_RECOVER);
- f2fs_inode_dirtied(inode);
+ f2fs_inode_dirtied(inode, false);
}
static void f2fs_i_callback(struct rcu_head *head)
@@ -692,18 +1010,26 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi)
percpu_counter_destroy(&sbi->total_valid_inode_count);
}
-static void f2fs_put_super(struct super_block *sb)
+static void destroy_device_list(struct f2fs_sb_info *sbi)
{
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ int i;
- if (sbi->s_proc) {
- remove_proc_entry("segment_info", sbi->s_proc);
- remove_proc_entry("segment_bits", sbi->s_proc);
- remove_proc_entry(sb->s_id, f2fs_proc_root);
+ for (i = 0; i < sbi->s_ndevs; i++) {
+ blkdev_put(FDEV(i).bdev, FMODE_EXCL);
+#ifdef CONFIG_BLK_DEV_ZONED
+ kvfree(FDEV(i).blkz_type);
+#endif
}
- kobject_del(&sbi->s_kobj);
+ kvfree(sbi->devs);
+}
+
+static void f2fs_put_super(struct super_block *sb)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ int i;
+ bool dropped;
- stop_gc_thread(sbi);
+ f2fs_quota_off_umount(sb);
/* prevent remaining shrinker jobs */
mutex_lock(&sbi->umount_mutex);
@@ -713,48 +1039,76 @@ static void f2fs_put_super(struct super_block *sb)
* But, the previous checkpoint was not done by umount, it needs to do
* clean checkpoint again.
*/
- if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) ||
- !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
+ if ((is_sbi_flag_set(sbi, SBI_IS_DIRTY) ||
+ !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG))) {
struct cp_control cpc = {
.reason = CP_UMOUNT,
};
- write_checkpoint(sbi, &cpc);
+ f2fs_write_checkpoint(sbi, &cpc);
}
- /* write_checkpoint can update stat informaion */
- f2fs_destroy_stats(sbi);
+ /* be sure to wait for any on-going discard commands */
+ dropped = f2fs_wait_discard_bios(sbi);
+
+ if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) &&
+ !sbi->discard_blks && !dropped) {
+ struct cp_control cpc = {
+ .reason = CP_UMOUNT | CP_TRIMMED,
+ };
+ f2fs_write_checkpoint(sbi, &cpc);
+ }
/*
* normally superblock is clean, so we need to release this.
* In addition, EIO will skip do checkpoint, we need this as well.
*/
- release_ino_entry(sbi, true);
- release_discard_addrs(sbi);
+ f2fs_release_ino_entry(sbi, true);
f2fs_leave_shrinker(sbi);
mutex_unlock(&sbi->umount_mutex);
/* our cp_error case, we can wait for any writeback page */
- f2fs_flush_merged_bios(sbi);
+ f2fs_flush_merged_writes(sbi);
+
+ f2fs_wait_on_all_pages_writeback(sbi);
+
+ f2fs_bug_on(sbi, sbi->fsync_node_num);
iput(sbi->node_inode);
+ sbi->node_inode = NULL;
+
iput(sbi->meta_inode);
+ sbi->meta_inode = NULL;
+
+ /*
+ * iput() can update stat information, if f2fs_write_checkpoint()
+ * above failed with error.
+ */
+ f2fs_destroy_stats(sbi);
/* destroy f2fs internal modules */
- destroy_node_manager(sbi);
- destroy_segment_manager(sbi);
+ f2fs_destroy_node_manager(sbi);
+ f2fs_destroy_segment_manager(sbi);
+
+ kvfree(sbi->ckpt);
- kfree(sbi->ckpt);
- kobject_put(&sbi->s_kobj);
- wait_for_completion(&sbi->s_kobj_unregister);
+ f2fs_unregister_sysfs(sbi);
sb->s_fs_info = NULL;
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
- kfree(sbi->raw_super);
+ kvfree(sbi->raw_super);
+ destroy_device_list(sbi);
+ mempool_destroy(sbi->write_io_dummy);
+#ifdef CONFIG_QUOTA
+ for (i = 0; i < MAXQUOTAS; i++)
+ kvfree(F2FS_OPTION(sbi).s_qf_names[i]);
+#endif
destroy_percpu_info(sbi);
- kfree(sbi);
+ for (i = 0; i < NR_PAGE_TYPE; i++)
+ kvfree(sbi->write_io[i]);
+ kvfree(sbi);
}
int f2fs_sync_fs(struct super_block *sb, int sync)
@@ -762,15 +1116,23 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
struct f2fs_sb_info *sbi = F2FS_SB(sb);
int err = 0;
+ if (unlikely(f2fs_cp_error(sbi)))
+ return 0;
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+ return 0;
+
trace_f2fs_sync_fs(sb, sync);
+ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+ return -EAGAIN;
+
if (sync) {
struct cp_control cpc;
cpc.reason = __get_cp_reason(sbi);
mutex_lock(&sbi->gc_mutex);
- err = write_checkpoint(sbi, &cpc);
+ err = f2fs_write_checkpoint(sbi, &cpc);
mutex_unlock(&sbi->gc_mutex);
}
f2fs_trace_ios(NULL, 1);
@@ -780,13 +1142,17 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
static int f2fs_freeze(struct super_block *sb)
{
- int err;
-
if (f2fs_readonly(sb))
return 0;
- err = f2fs_sync_fs(sb, 1);
- return err;
+ /* IO error happened before */
+ if (unlikely(f2fs_cp_error(F2FS_SB(sb))))
+ return -EIO;
+
+ /* must be clean, since sync_filesystem() was already called */
+ if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY))
+ return -EINVAL;
+ return 0;
}
static int f2fs_unfreeze(struct super_block *sb)
@@ -794,34 +1160,138 @@ static int f2fs_unfreeze(struct super_block *sb)
return 0;
}
+#ifdef CONFIG_QUOTA
+static int f2fs_statfs_project(struct super_block *sb,
+ kprojid_t projid, struct kstatfs *buf)
+{
+ struct kqid qid;
+ struct dquot *dquot;
+ u64 limit;
+ u64 curblock;
+
+ qid = make_kqid_projid(projid);
+ dquot = dqget(sb, qid);
+ if (IS_ERR(dquot))
+ return PTR_ERR(dquot);
+ spin_lock(&dq_data_lock);
+
+ limit = (dquot->dq_dqb.dqb_bsoftlimit ?
+ dquot->dq_dqb.dqb_bsoftlimit :
+ dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits;
+ if (limit && buf->f_blocks > limit) {
+ curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits;
+ buf->f_blocks = limit;
+ buf->f_bfree = buf->f_bavail =
+ (buf->f_blocks > curblock) ?
+ (buf->f_blocks - curblock) : 0;
+ }
+
+ limit = dquot->dq_dqb.dqb_isoftlimit ?
+ dquot->dq_dqb.dqb_isoftlimit :
+ dquot->dq_dqb.dqb_ihardlimit;
+ if (limit && buf->f_files > limit) {
+ buf->f_files = limit;
+ buf->f_ffree =
+ (buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
+ (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
+ }
+
+ spin_unlock(&dq_data_lock);
+ dqput(dquot);
+ return 0;
+}
+#endif
+
static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct f2fs_sb_info *sbi = F2FS_SB(sb);
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
- block_t total_count, user_block_count, start_count, ovp_count;
+ block_t total_count, user_block_count, start_count;
+ u64 avail_node_count;
total_count = le64_to_cpu(sbi->raw_super->block_count);
user_block_count = sbi->user_block_count;
start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr);
- ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg;
buf->f_type = F2FS_SUPER_MAGIC;
buf->f_bsize = sbi->blocksize;
buf->f_blocks = total_count - start_count;
- buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count;
- buf->f_bavail = user_block_count - valid_user_blocks(sbi);
+ buf->f_bfree = user_block_count - valid_user_blocks(sbi) -
+ sbi->current_reserved_blocks;
+ if (unlikely(buf->f_bfree <= sbi->unusable_block_count))
+ buf->f_bfree = 0;
+ else
+ buf->f_bfree -= sbi->unusable_block_count;
+
+ if (buf->f_bfree > F2FS_OPTION(sbi).root_reserved_blocks)
+ buf->f_bavail = buf->f_bfree -
+ F2FS_OPTION(sbi).root_reserved_blocks;
+ else
+ buf->f_bavail = 0;
+
+ avail_node_count = sbi->total_node_count - sbi->nquota_files -
+ F2FS_RESERVED_NODE_NUM;
- buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
- buf->f_ffree = buf->f_files - valid_inode_count(sbi);
+ if (avail_node_count > user_block_count) {
+ buf->f_files = user_block_count;
+ buf->f_ffree = buf->f_bavail;
+ } else {
+ buf->f_files = avail_node_count;
+ buf->f_ffree = min(avail_node_count - valid_node_count(sbi),
+ buf->f_bavail);
+ }
buf->f_namelen = F2FS_NAME_LEN;
buf->f_fsid.val[0] = (u32)id;
buf->f_fsid.val[1] = (u32)(id >> 32);
+#ifdef CONFIG_QUOTA
+ if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) &&
+ sb_has_quota_limits_enabled(sb, PRJQUOTA)) {
+ f2fs_statfs_project(sb, F2FS_I(dentry->d_inode)->i_projid, buf);
+ }
+#endif
return 0;
}
+static inline void f2fs_show_quota_options(struct seq_file *seq,
+ struct super_block *sb)
+{
+#ifdef CONFIG_QUOTA
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
+ if (F2FS_OPTION(sbi).s_jquota_fmt) {
+ char *fmtname = "";
+
+ switch (F2FS_OPTION(sbi).s_jquota_fmt) {
+ case QFMT_VFS_OLD:
+ fmtname = "vfsold";
+ break;
+ case QFMT_VFS_V0:
+ fmtname = "vfsv0";
+ break;
+ case QFMT_VFS_V1:
+ fmtname = "vfsv1";
+ break;
+ }
+ seq_printf(seq, ",jqfmt=%s", fmtname);
+ }
+
+ if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA])
+ seq_show_option(seq, "usrjquota",
+ F2FS_OPTION(sbi).s_qf_names[USRQUOTA]);
+
+ if (F2FS_OPTION(sbi).s_qf_names[GRPQUOTA])
+ seq_show_option(seq, "grpjquota",
+ F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]);
+
+ if (F2FS_OPTION(sbi).s_qf_names[PRJQUOTA])
+ seq_show_option(seq, "prjjquota",
+ F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]);
+#endif
+}
+
static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
{
struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
@@ -839,7 +1309,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
if (test_opt(sbi, DISCARD))
seq_puts(seq, ",discard");
if (test_opt(sbi, NOHEAP))
- seq_puts(seq, ",no_heap_alloc");
+ seq_puts(seq, ",no_heap");
+ else
+ seq_puts(seq, ",heap");
#ifdef CONFIG_F2FS_FS_XATTR
if (test_opt(sbi, XATTR_USER))
seq_puts(seq, ",user_xattr");
@@ -847,6 +1319,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",nouser_xattr");
if (test_opt(sbi, INLINE_XATTR))
seq_puts(seq, ",inline_xattr");
+ else
+ seq_puts(seq, ",noinline_xattr");
+ if (test_opt(sbi, INLINE_XATTR_SIZE))
+ seq_printf(seq, ",inline_xattr_size=%u",
+ F2FS_OPTION(sbi).inline_xattr_size);
#endif
#ifdef CONFIG_F2FS_FS_POSIX_ACL
if (test_opt(sbi, POSIX_ACL))
@@ -882,95 +1359,88 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, "adaptive");
else if (test_opt(sbi, LFS))
seq_puts(seq, "lfs");
- seq_printf(seq, ",active_logs=%u", sbi->active_logs);
-
- return 0;
-}
-
-static int segment_info_seq_show(struct seq_file *seq, void *offset)
-{
- struct super_block *sb = seq->private;
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
- unsigned int total_segs =
- le32_to_cpu(sbi->raw_super->segment_count_main);
- int i;
-
- seq_puts(seq, "format: segment_type|valid_blocks\n"
- "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n");
-
- for (i = 0; i < total_segs; i++) {
- struct seg_entry *se = get_seg_entry(sbi, i);
-
- if ((i % 10) == 0)
- seq_printf(seq, "%-10d", i);
- seq_printf(seq, "%d|%-3u", se->type,
- get_valid_blocks(sbi, i, 1));
- if ((i % 10) == 9 || i == (total_segs - 1))
- seq_putc(seq, '\n');
- else
- seq_putc(seq, ' ');
+ seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs);
+ if (test_opt(sbi, RESERVE_ROOT))
+ seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u",
+ F2FS_OPTION(sbi).root_reserved_blocks,
+ from_kuid_munged(&init_user_ns,
+ F2FS_OPTION(sbi).s_resuid),
+ from_kgid_munged(&init_user_ns,
+ F2FS_OPTION(sbi).s_resgid));
+ if (F2FS_IO_SIZE_BITS(sbi))
+ seq_printf(seq, ",io_bits=%u",
+ F2FS_OPTION(sbi).write_io_size_bits);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (test_opt(sbi, FAULT_INJECTION)) {
+ seq_printf(seq, ",fault_injection=%u",
+ F2FS_OPTION(sbi).fault_info.inject_rate);
+ seq_printf(seq, ",fault_type=%u",
+ F2FS_OPTION(sbi).fault_info.inject_type);
}
+#endif
+#ifdef CONFIG_QUOTA
+ if (test_opt(sbi, QUOTA))
+ seq_puts(seq, ",quota");
+ if (test_opt(sbi, USRQUOTA))
+ seq_puts(seq, ",usrquota");
+ if (test_opt(sbi, GRPQUOTA))
+ seq_puts(seq, ",grpquota");
+ if (test_opt(sbi, PRJQUOTA))
+ seq_puts(seq, ",prjquota");
+#endif
+ f2fs_show_quota_options(seq, sbi->sb);
+ if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER)
+ seq_printf(seq, ",whint_mode=%s", "user-based");
+ else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS)
+ seq_printf(seq, ",whint_mode=%s", "fs-based");
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ if (F2FS_OPTION(sbi).test_dummy_encryption)
+ seq_puts(seq, ",test_dummy_encryption");
+#endif
- return 0;
-}
-
-static int segment_bits_seq_show(struct seq_file *seq, void *offset)
-{
- struct super_block *sb = seq->private;
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
- unsigned int total_segs =
- le32_to_cpu(sbi->raw_super->segment_count_main);
- int i, j;
-
- seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n"
- "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n");
+ if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT)
+ seq_printf(seq, ",alloc_mode=%s", "default");
+ else if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE)
+ seq_printf(seq, ",alloc_mode=%s", "reuse");
- for (i = 0; i < total_segs; i++) {
- struct seg_entry *se = get_seg_entry(sbi, i);
+ if (test_opt(sbi, DISABLE_CHECKPOINT))
+ seq_puts(seq, ",checkpoint=disable");
- seq_printf(seq, "%-10d", i);
- seq_printf(seq, "%d|%-3u|", se->type,
- get_valid_blocks(sbi, i, 1));
- for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++)
- seq_printf(seq, " %.2x", se->cur_valid_map[j]);
- seq_putc(seq, '\n');
- }
+ if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX)
+ seq_printf(seq, ",fsync_mode=%s", "posix");
+ else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT)
+ seq_printf(seq, ",fsync_mode=%s", "strict");
+ else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_NOBARRIER)
+ seq_printf(seq, ",fsync_mode=%s", "nobarrier");
return 0;
}
-#define F2FS_PROC_FILE_DEF(_name) \
-static int _name##_open_fs(struct inode *inode, struct file *file) \
-{ \
- return single_open(file, _name##_seq_show, PDE_DATA(inode)); \
-} \
- \
-static const struct file_operations f2fs_seq_##_name##_fops = { \
- .open = _name##_open_fs, \
- .read = seq_read, \
- .llseek = seq_lseek, \
- .release = single_release, \
-};
-
-F2FS_PROC_FILE_DEF(segment_info);
-F2FS_PROC_FILE_DEF(segment_bits);
-
static void default_options(struct f2fs_sb_info *sbi)
{
/* init some FS parameters */
- sbi->active_logs = NR_CURSEG_TYPE;
+ F2FS_OPTION(sbi).active_logs = NR_CURSEG_TYPE;
+ F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
+ F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
+ F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
+ F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX;
+ F2FS_OPTION(sbi).test_dummy_encryption = false;
+ F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID);
+ F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID);
set_opt(sbi, BG_GC);
+ set_opt(sbi, INLINE_XATTR);
set_opt(sbi, INLINE_DATA);
set_opt(sbi, INLINE_DENTRY);
set_opt(sbi, EXTENT_CACHE);
+ set_opt(sbi, NOHEAP);
sbi->sb->s_flags |= MS_LAZYTIME;
+ clear_opt(sbi, DISABLE_CHECKPOINT);
set_opt(sbi, FLUSH_MERGE);
- if (f2fs_sb_mounted_hmsmr(sbi->sb)) {
+ set_opt(sbi, DISCARD);
+ if (f2fs_sb_has_blkzoned(sbi))
set_opt_mode(sbi, F2FS_MOUNT_LFS);
- set_opt(sbi, DISCARD);
- } else {
+ else
set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
- }
#ifdef CONFIG_F2FS_FS_XATTR
set_opt(sbi, XATTR_USER);
@@ -979,21 +1449,73 @@ static void default_options(struct f2fs_sb_info *sbi)
set_opt(sbi, POSIX_ACL);
#endif
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- f2fs_build_fault_attr(sbi, 0);
+ f2fs_build_fault_attr(sbi, 0, 0);
+}
+
+#ifdef CONFIG_QUOTA
+static int f2fs_enable_quotas(struct super_block *sb);
#endif
+
+static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
+{
+ struct cp_control cpc;
+ int err;
+
+ sbi->sb->s_flags |= MS_ACTIVE;
+
+ f2fs_update_time(sbi, DISABLE_TIME);
+
+ while (!f2fs_time_over(sbi, DISABLE_TIME)) {
+ mutex_lock(&sbi->gc_mutex);
+ err = f2fs_gc(sbi, true, false, NULL_SEGNO);
+ if (err == -ENODATA)
+ break;
+ if (err && err != -EAGAIN)
+ return err;
+ }
+
+ err = sync_filesystem(sbi->sb);
+ if (err)
+ return err;
+
+ if (f2fs_disable_cp_again(sbi))
+ return -EAGAIN;
+
+ mutex_lock(&sbi->gc_mutex);
+ cpc.reason = CP_PAUSE;
+ set_sbi_flag(sbi, SBI_CP_DISABLED);
+ f2fs_write_checkpoint(sbi, &cpc);
+
+ sbi->unusable_block_count = 0;
+ mutex_unlock(&sbi->gc_mutex);
+ return 0;
+}
+
+static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
+{
+ mutex_lock(&sbi->gc_mutex);
+ f2fs_dirty_to_prefree(sbi);
+
+ clear_sbi_flag(sbi, SBI_CP_DISABLED);
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
+ mutex_unlock(&sbi->gc_mutex);
+
+ f2fs_sync_fs(sbi->sb, 1);
}
static int f2fs_remount(struct super_block *sb, int *flags, char *data)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct f2fs_mount_info org_mount_opt;
- int err, active_logs;
+ unsigned long old_sb_flags;
+ int err;
bool need_restart_gc = false;
bool need_stop_gc = false;
bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- struct f2fs_fault_info ffi = sbi->fault_info;
+ bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT);
+ bool checkpoint_changed;
+#ifdef CONFIG_QUOTA
+ int i, j;
#endif
/*
@@ -1001,7 +1523,25 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
* need to restore them.
*/
org_mount_opt = sbi->mount_opt;
- active_logs = sbi->active_logs;
+ old_sb_flags = sb->s_flags;
+
+#ifdef CONFIG_QUOTA
+ org_mount_opt.s_jquota_fmt = F2FS_OPTION(sbi).s_jquota_fmt;
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (F2FS_OPTION(sbi).s_qf_names[i]) {
+ org_mount_opt.s_qf_names[i] =
+ kstrdup(F2FS_OPTION(sbi).s_qf_names[i],
+ GFP_KERNEL);
+ if (!org_mount_opt.s_qf_names[i]) {
+ for (j = 0; j < i; j++)
+ kvfree(org_mount_opt.s_qf_names[j]);
+ return -ENOMEM;
+ }
+ } else {
+ org_mount_opt.s_qf_names[i] = NULL;
+ }
+ }
+#endif
/* recover superblocks we couldn't write due to previous RO mount */
if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) {
@@ -1012,13 +1552,14 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
clear_sbi_flag(sbi, SBI_NEED_SB_WRITE);
}
- sbi->mount_opt.opt = 0;
default_options(sbi);
/* parse mount options */
err = parse_options(sb, data);
if (err)
goto restore_opts;
+ checkpoint_changed =
+ disable_checkpoint != test_opt(sbi, DISABLE_CHECKPOINT);
/*
* Previous and new state of filesystem is RO,
@@ -1027,6 +1568,23 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
if (f2fs_readonly(sb) && (*flags & MS_RDONLY))
goto skip;
+#ifdef CONFIG_QUOTA
+ if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) {
+ err = dquot_suspend(sb, -1);
+ if (err < 0)
+ goto restore_opts;
+ } else if (f2fs_readonly(sb) && !(*flags & MS_RDONLY)) {
+ /* dquot_resume needs RW */
+ sb->s_flags &= ~MS_RDONLY;
+ if (sb_any_quota_suspended(sb)) {
+ dquot_resume(sb, -1);
+ } else if (f2fs_sb_has_quota_ino(sbi)) {
+ err = f2fs_enable_quotas(sb);
+ if (err)
+ goto restore_opts;
+ }
+ }
+#endif
/* disallow enable/disable extent_cache dynamically */
if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) {
err = -EINVAL;
@@ -1035,6 +1593,13 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
+ if ((*flags & MS_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) {
+ err = -EINVAL;
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "disabling checkpoint not compatible with read-only");
+ goto restore_opts;
+ }
+
/*
* We stop the GC thread if FS is mounted as RO
* or if background_gc = off is passed in mount
@@ -1042,17 +1607,18 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
*/
if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) {
if (sbi->gc_thread) {
- stop_gc_thread(sbi);
+ f2fs_stop_gc_thread(sbi);
need_restart_gc = true;
}
} else if (!sbi->gc_thread) {
- err = start_gc_thread(sbi);
+ err = f2fs_start_gc_thread(sbi);
if (err)
goto restore_opts;
need_stop_gc = true;
}
- if (*flags & MS_RDONLY) {
+ if (*flags & MS_RDONLY ||
+ F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) {
writeback_inodes_sb(sb, WB_REASON_SYNC);
sync_inodes_sb(sb);
@@ -1062,47 +1628,526 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
clear_sbi_flag(sbi, SBI_IS_CLOSE);
}
+ if (checkpoint_changed) {
+ if (test_opt(sbi, DISABLE_CHECKPOINT)) {
+ err = f2fs_disable_checkpoint(sbi);
+ if (err)
+ goto restore_gc;
+ } else {
+ f2fs_enable_checkpoint(sbi);
+ }
+ }
+
/*
* We stop issue flush thread if FS is mounted as RO
* or if flush_merge is not passed in mount option.
*/
if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) {
- destroy_flush_cmd_control(sbi);
- } else if (!SM_I(sbi)->cmd_control_info) {
- err = create_flush_cmd_control(sbi);
+ clear_opt(sbi, FLUSH_MERGE);
+ f2fs_destroy_flush_cmd_control(sbi, false);
+ } else {
+ err = f2fs_create_flush_cmd_control(sbi);
if (err)
goto restore_gc;
}
skip:
+#ifdef CONFIG_QUOTA
+ /* Release old quota file names */
+ for (i = 0; i < MAXQUOTAS; i++)
+ kvfree(org_mount_opt.s_qf_names[i]);
+#endif
/* Update the POSIXACL Flag */
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+ limit_reserve_root(sbi);
+ *flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME);
return 0;
restore_gc:
if (need_restart_gc) {
- if (start_gc_thread(sbi))
+ if (f2fs_start_gc_thread(sbi))
f2fs_msg(sbi->sb, KERN_WARNING,
"background gc thread has stopped");
} else if (need_stop_gc) {
- stop_gc_thread(sbi);
+ f2fs_stop_gc_thread(sbi);
}
restore_opts:
- sbi->mount_opt = org_mount_opt;
- sbi->active_logs = active_logs;
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- sbi->fault_info = ffi;
+#ifdef CONFIG_QUOTA
+ F2FS_OPTION(sbi).s_jquota_fmt = org_mount_opt.s_jquota_fmt;
+ for (i = 0; i < MAXQUOTAS; i++) {
+ kvfree(F2FS_OPTION(sbi).s_qf_names[i]);
+ F2FS_OPTION(sbi).s_qf_names[i] = org_mount_opt.s_qf_names[i];
+ }
#endif
+ sbi->mount_opt = org_mount_opt;
+ sb->s_flags = old_sb_flags;
+ return err;
+}
+
+#ifdef CONFIG_QUOTA
+/* Read data from quotafile */
+static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data,
+ size_t len, loff_t off)
+{
+ struct inode *inode = sb_dqopt(sb)->files[type];
+ struct address_space *mapping = inode->i_mapping;
+ block_t blkidx = F2FS_BYTES_TO_BLK(off);
+ int offset = off & (sb->s_blocksize - 1);
+ int tocopy;
+ size_t toread;
+ loff_t i_size = i_size_read(inode);
+ struct page *page;
+ char *kaddr;
+
+ if (off > i_size)
+ return 0;
+
+ if (off + len > i_size)
+ len = i_size - off;
+ toread = len;
+ while (toread > 0) {
+ tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
+repeat:
+ page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS);
+ if (IS_ERR(page)) {
+ if (PTR_ERR(page) == -ENOMEM) {
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto repeat;
+ }
+ set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
+ return PTR_ERR(page);
+ }
+
+ lock_page(page);
+
+ if (unlikely(page->mapping != mapping)) {
+ f2fs_put_page(page, 1);
+ goto repeat;
+ }
+ if (unlikely(!PageUptodate(page))) {
+ f2fs_put_page(page, 1);
+ set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
+ return -EIO;
+ }
+
+ kaddr = kmap_atomic(page);
+ memcpy(data, kaddr + offset, tocopy);
+ kunmap_atomic(kaddr);
+ f2fs_put_page(page, 1);
+
+ offset = 0;
+ toread -= tocopy;
+ data += tocopy;
+ blkidx++;
+ }
+ return len;
+}
+
+/* Write to quotafile */
+static ssize_t f2fs_quota_write(struct super_block *sb, int type,
+ const char *data, size_t len, loff_t off)
+{
+ struct inode *inode = sb_dqopt(sb)->files[type];
+ struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ int offset = off & (sb->s_blocksize - 1);
+ size_t towrite = len;
+ struct page *page;
+ char *kaddr;
+ int err = 0;
+ int tocopy;
+
+ while (towrite > 0) {
+ tocopy = min_t(unsigned long, sb->s_blocksize - offset,
+ towrite);
+retry:
+ err = a_ops->write_begin(NULL, mapping, off, tocopy, 0,
+ &page, NULL);
+ if (unlikely(err)) {
+ if (err == -ENOMEM) {
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
+ }
+ set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
+ break;
+ }
+
+ kaddr = kmap_atomic(page);
+ memcpy(kaddr + offset, data, tocopy);
+ kunmap_atomic(kaddr);
+ flush_dcache_page(page);
+
+ a_ops->write_end(NULL, mapping, off, tocopy, tocopy,
+ page, NULL);
+ offset = 0;
+ towrite -= tocopy;
+ off += tocopy;
+ data += tocopy;
+ cond_resched();
+ }
+
+ if (len == towrite)
+ return err;
+ inode->i_mtime = inode->i_ctime = current_time(inode);
+ f2fs_mark_inode_dirty_sync(inode, false);
+ return len - towrite;
+}
+
+static struct dquot **f2fs_get_dquots(struct inode *inode)
+{
+ return F2FS_I(inode)->i_dquot;
+}
+
+static qsize_t *f2fs_get_reserved_space(struct inode *inode)
+{
+ return &F2FS_I(inode)->i_reserved_quota;
+}
+
+static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type)
+{
+ if (is_set_ckpt_flags(sbi, CP_QUOTA_NEED_FSCK_FLAG)) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "quota sysfile may be corrupted, skip loading it");
+ return 0;
+ }
+
+ return dquot_quota_on_mount(sbi->sb, F2FS_OPTION(sbi).s_qf_names[type],
+ F2FS_OPTION(sbi).s_jquota_fmt, type);
+}
+
+int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly)
+{
+ int enabled = 0;
+ int i, err;
+
+ if (f2fs_sb_has_quota_ino(sbi) && rdonly) {
+ err = f2fs_enable_quotas(sbi->sb);
+ if (err) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Cannot turn on quota_ino: %d", err);
+ return 0;
+ }
+ return 1;
+ }
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (F2FS_OPTION(sbi).s_qf_names[i]) {
+ err = f2fs_quota_on_mount(sbi, i);
+ if (!err) {
+ enabled = 1;
+ continue;
+ }
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Cannot turn on quotas: %d on %d", err, i);
+ }
+ }
+ return enabled;
+}
+
+static int f2fs_quota_enable(struct super_block *sb, int type, int format_id,
+ unsigned int flags)
+{
+ struct inode *qf_inode;
+ unsigned long qf_inum;
+ int err;
+
+ BUG_ON(!f2fs_sb_has_quota_ino(F2FS_SB(sb)));
+
+ qf_inum = f2fs_qf_ino(sb, type);
+ if (!qf_inum)
+ return -EPERM;
+
+ qf_inode = f2fs_iget(sb, qf_inum);
+ if (IS_ERR(qf_inode)) {
+ f2fs_msg(sb, KERN_ERR,
+ "Bad quota inode %u:%lu", type, qf_inum);
+ return PTR_ERR(qf_inode);
+ }
+
+ /* Don't account quota for quota files to avoid recursion */
+ qf_inode->i_flags |= S_NOQUOTA;
+ err = dquot_enable(qf_inode, type, format_id, flags);
+ iput(qf_inode);
+ return err;
+}
+
+static int f2fs_enable_quotas(struct super_block *sb)
+{
+ int type, err = 0;
+ unsigned long qf_inum;
+ bool quota_mopt[MAXQUOTAS] = {
+ test_opt(F2FS_SB(sb), USRQUOTA),
+ test_opt(F2FS_SB(sb), GRPQUOTA),
+ test_opt(F2FS_SB(sb), PRJQUOTA),
+ };
+
+ if (is_set_ckpt_flags(F2FS_SB(sb), CP_QUOTA_NEED_FSCK_FLAG)) {
+ f2fs_msg(sb, KERN_ERR,
+ "quota file may be corrupted, skip loading it");
+ return 0;
+ }
+
+ sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
+
+ for (type = 0; type < MAXQUOTAS; type++) {
+ qf_inum = f2fs_qf_ino(sb, type);
+ if (qf_inum) {
+ err = f2fs_quota_enable(sb, type, QFMT_VFS_V1,
+ DQUOT_USAGE_ENABLED |
+ (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
+ if (err) {
+ f2fs_msg(sb, KERN_ERR,
+ "Failed to enable quota tracking "
+ "(type=%d, err=%d). Please run "
+ "fsck to fix.", type, err);
+ for (type--; type >= 0; type--)
+ dquot_quota_off(sb, type);
+ set_sbi_flag(F2FS_SB(sb),
+ SBI_QUOTA_NEED_REPAIR);
+ return err;
+ }
+ }
+ }
+ return 0;
+}
+
+int f2fs_quota_sync(struct super_block *sb, int type)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct quota_info *dqopt = sb_dqopt(sb);
+ int cnt;
+ int ret;
+
+ ret = dquot_writeback_dquots(sb, type);
+ if (ret)
+ goto out;
+
+ /*
+ * Now when everything is written we can discard the pagecache so
+ * that userspace sees the changes.
+ */
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ struct address_space *mapping;
+
+ if (type != -1 && cnt != type)
+ continue;
+ if (!sb_has_quota_active(sb, cnt))
+ continue;
+
+ mapping = dqopt->files[cnt]->i_mapping;
+
+ ret = filemap_fdatawrite(mapping);
+ if (ret)
+ goto out;
+
+ /* if we are using journalled quota */
+ if (is_journalled_quota(sbi))
+ continue;
+
+ ret = filemap_fdatawait(mapping);
+ if (ret)
+ set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
+
+ inode_lock(dqopt->files[cnt]);
+ truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
+ inode_unlock(dqopt->files[cnt]);
+ }
+out:
+ if (ret)
+ set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
+ return ret;
+}
+
+static int f2fs_quota_on(struct super_block *sb, int type, int format_id,
+ struct path *path)
+{
+ struct inode *inode;
+ int err;
+
+ err = f2fs_quota_sync(sb, type);
+ if (err)
+ return err;
+
+ err = dquot_quota_on(sb, type, format_id, path);
+ if (err)
+ return err;
+
+ inode = d_inode(path->dentry);
+
+ inode_lock(inode);
+ F2FS_I(inode)->i_flags |= F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL;
+ f2fs_set_inode_flags(inode);
+ inode_unlock(inode);
+ f2fs_mark_inode_dirty_sync(inode, false);
+
+ return 0;
+}
+
+static int f2fs_quota_off(struct super_block *sb, int type)
+{
+ struct inode *inode = sb_dqopt(sb)->files[type];
+ int err;
+
+ if (!inode || !igrab(inode))
+ return dquot_quota_off(sb, type);
+
+ err = f2fs_quota_sync(sb, type);
+ if (err)
+ goto out_put;
+
+ err = dquot_quota_off(sb, type);
+ if (err || f2fs_sb_has_quota_ino(F2FS_SB(sb)))
+ goto out_put;
+
+ inode_lock(inode);
+ F2FS_I(inode)->i_flags &= ~(F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL);
+ f2fs_set_inode_flags(inode);
+ inode_unlock(inode);
+ f2fs_mark_inode_dirty_sync(inode, false);
+out_put:
+ iput(inode);
return err;
}
-static struct super_operations f2fs_sops = {
+void f2fs_quota_off_umount(struct super_block *sb)
+{
+ int type;
+ int err;
+
+ for (type = 0; type < MAXQUOTAS; type++) {
+ err = f2fs_quota_off(sb, type);
+ if (err) {
+ int ret = dquot_quota_off(sb, type);
+
+ f2fs_msg(sb, KERN_ERR,
+ "Fail to turn off disk quota "
+ "(type: %d, err: %d, ret:%d), Please "
+ "run fsck to fix it.", type, err, ret);
+ set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
+ }
+ }
+}
+
+static void f2fs_truncate_quota_inode_pages(struct super_block *sb)
+{
+ struct quota_info *dqopt = sb_dqopt(sb);
+ int type;
+
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (!dqopt->files[type])
+ continue;
+ f2fs_inode_synced(dqopt->files[type]);
+ }
+}
+
+static int f2fs_dquot_commit(struct dquot *dquot)
+{
+ int ret;
+
+ ret = dquot_commit(dquot);
+ if (ret < 0)
+ set_sbi_flag(F2FS_SB(dquot->dq_sb), SBI_QUOTA_NEED_REPAIR);
+ return ret;
+}
+
+static int f2fs_dquot_acquire(struct dquot *dquot)
+{
+ int ret;
+
+ ret = dquot_acquire(dquot);
+ if (ret < 0)
+ set_sbi_flag(F2FS_SB(dquot->dq_sb), SBI_QUOTA_NEED_REPAIR);
+
+ return ret;
+}
+
+static int f2fs_dquot_release(struct dquot *dquot)
+{
+ int ret;
+
+ ret = dquot_release(dquot);
+ if (ret < 0)
+ set_sbi_flag(F2FS_SB(dquot->dq_sb), SBI_QUOTA_NEED_REPAIR);
+ return ret;
+}
+
+static int f2fs_dquot_mark_dquot_dirty(struct dquot *dquot)
+{
+ struct super_block *sb = dquot->dq_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ int ret;
+
+ ret = dquot_mark_dquot_dirty(dquot);
+
+ /* if we are using journalled quota */
+ if (is_journalled_quota(sbi))
+ set_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH);
+
+ return ret;
+}
+
+static int f2fs_dquot_commit_info(struct super_block *sb, int type)
+{
+ int ret;
+
+ ret = dquot_commit_info(sb, type);
+ if (ret < 0)
+ set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
+ return ret;
+}
+
+static int f2fs_get_projid(struct inode *inode, kprojid_t *projid)
+{
+ *projid = F2FS_I(inode)->i_projid;
+ return 0;
+}
+
+static const struct dquot_operations f2fs_quota_operations = {
+ .get_reserved_space = f2fs_get_reserved_space,
+ .write_dquot = f2fs_dquot_commit,
+ .acquire_dquot = f2fs_dquot_acquire,
+ .release_dquot = f2fs_dquot_release,
+ .mark_dirty = f2fs_dquot_mark_dquot_dirty,
+ .write_info = f2fs_dquot_commit_info,
+ .alloc_dquot = dquot_alloc,
+ .destroy_dquot = dquot_destroy,
+ .get_projid = f2fs_get_projid,
+ .get_next_id = dquot_get_next_id,
+};
+
+static const struct quotactl_ops f2fs_quotactl_ops = {
+ .quota_on = f2fs_quota_on,
+ .quota_off = f2fs_quota_off,
+ .quota_sync = f2fs_quota_sync,
+ .get_state = dquot_get_state,
+ .set_info = dquot_set_dqinfo,
+ .get_dqblk = dquot_get_dqblk,
+ .set_dqblk = dquot_set_dqblk,
+ .get_nextdqblk = dquot_get_next_dqblk,
+};
+#else
+int f2fs_quota_sync(struct super_block *sb, int type)
+{
+ return 0;
+}
+
+void f2fs_quota_off_umount(struct super_block *sb)
+{
+}
+#endif
+
+static const struct super_operations f2fs_sops = {
.alloc_inode = f2fs_alloc_inode,
.drop_inode = f2fs_drop_inode,
.destroy_inode = f2fs_destroy_inode,
.write_inode = f2fs_write_inode,
.dirty_inode = f2fs_dirty_inode,
.show_options = f2fs_show_options,
+#ifdef CONFIG_QUOTA
+ .quota_read = f2fs_quota_read,
+ .quota_write = f2fs_quota_write,
+ .get_dquots = f2fs_get_dquots,
+#endif
.evict_inode = f2fs_evict_inode,
.put_super = f2fs_put_super,
.sync_fs = f2fs_sync_fs,
@@ -1120,37 +2165,38 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len)
ctx, len, NULL);
}
-static int f2fs_key_prefix(struct inode *inode, u8 **key)
-{
- *key = F2FS_I_SB(inode)->key_prefix;
- return F2FS_I_SB(inode)->key_prefix_size;
-}
-
static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len,
void *fs_data)
{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ /*
+ * Encrypting the root directory is not allowed because fsck
+ * expects lost+found directory to exist and remain unencrypted
+ * if LOST_FOUND feature is enabled.
+ *
+ */
+ if (f2fs_sb_has_lost_found(sbi) &&
+ inode->i_ino == F2FS_ROOT_INO(sbi))
+ return -EPERM;
+
return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
ctx, len, fs_data, XATTR_CREATE);
}
-static unsigned f2fs_max_namelen(struct inode *inode)
+static bool f2fs_dummy_context(struct inode *inode)
{
- return S_ISLNK(inode->i_mode) ?
- inode->i_sb->s_blocksize : F2FS_NAME_LEN;
+ return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode));
}
-static struct fscrypt_operations f2fs_cryptops = {
+static const struct fscrypt_operations f2fs_cryptops = {
+ .key_prefix = "f2fs:",
.get_context = f2fs_get_context,
- .key_prefix = f2fs_key_prefix,
.set_context = f2fs_set_context,
- .is_encrypted = f2fs_encrypted_inode,
+ .dummy_context = f2fs_dummy_context,
.empty_dir = f2fs_empty_dir,
- .max_namelen = f2fs_max_namelen,
-};
-#else
-static struct fscrypt_operations f2fs_cryptops = {
- .is_encrypted = f2fs_encrypted_inode,
+ .max_namelen = F2FS_NAME_LEN,
};
#endif
@@ -1160,7 +2206,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct inode *inode;
- if (check_nid_range(sbi, ino))
+ if (f2fs_check_nid_range(sbi, ino))
return ERR_PTR(-ESTALE);
/*
@@ -1201,9 +2247,16 @@ static const struct export_operations f2fs_export_ops = {
static loff_t max_file_blocks(void)
{
- loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS);
+ loff_t result = 0;
loff_t leaf_count = ADDRS_PER_BLOCK;
+ /*
+ * note: previously, result is equal to (DEF_ADDRS_PER_INODE -
+ * DEFAULT_INLINE_XATTR_ADDRS), but now f2fs try to reserve more
+ * space in inode.i_addr, it will be more safe to reassign
+ * result as zero.
+ */
+
/* two direct node blocks */
result += (leaf_count * 2);
@@ -1224,12 +2277,11 @@ static int __f2fs_commit_super(struct buffer_head *bh,
lock_buffer(bh);
if (super)
memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super));
- set_buffer_uptodate(bh);
set_buffer_dirty(bh);
unlock_buffer(bh);
/* it's rare case, we can do fua all the time */
- return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA);
+ return __sync_dirty_buffer(bh, REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
}
static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi,
@@ -1343,6 +2395,26 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
(bh->b_data + F2FS_SUPER_OFFSET);
struct super_block *sb = sbi->sb;
unsigned int blocksize;
+ size_t crc_offset = 0;
+ __u32 crc = 0;
+
+ /* Check checksum_offset and crc in superblock */
+ if (__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_SB_CHKSUM)) {
+ crc_offset = le32_to_cpu(raw_super->checksum_offset);
+ if (crc_offset !=
+ offsetof(struct f2fs_super_block, crc)) {
+ f2fs_msg(sb, KERN_INFO,
+ "Invalid SB checksum offset: %zu",
+ crc_offset);
+ return 1;
+ }
+ crc = le32_to_cpu(raw_super->crc);
+ if (!f2fs_crc_valid(sbi, crc, raw_super, crc_offset)) {
+ f2fs_msg(sb, KERN_INFO,
+ "Invalid SB checksum value: %u", crc);
+ return 1;
+ }
+ }
if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) {
f2fs_msg(sb, KERN_INFO,
@@ -1440,10 +2512,14 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
secs_per_zone, total_sections);
return 1;
}
- if (le32_to_cpu(raw_super->extension_count) > F2FS_MAX_EXTENSION) {
+ if (le32_to_cpu(raw_super->extension_count) > F2FS_MAX_EXTENSION ||
+ raw_super->hot_ext_count > F2FS_MAX_EXTENSION ||
+ (le32_to_cpu(raw_super->extension_count) +
+ raw_super->hot_ext_count) > F2FS_MAX_EXTENSION) {
f2fs_msg(sb, KERN_INFO,
- "Corrupted extension count (%u > %u)",
+ "Corrupted extension count (%u + %u > %u)",
le32_to_cpu(raw_super->extension_count),
+ raw_super->hot_ext_count,
F2FS_MAX_EXTENSION);
return 1;
}
@@ -1476,7 +2552,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
return 0;
}
-int sanity_check_ckpt(struct f2fs_sb_info *sbi)
+int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi)
{
unsigned int total, fsmeta;
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
@@ -1489,7 +2565,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi)
unsigned int segment_count_main;
unsigned int cp_pack_start_sum, cp_payload;
block_t user_block_count;
- int i;
+ int i, j;
total = le32_to_cpu(raw_super->segment_count);
fsmeta = le32_to_cpu(raw_super->segment_count_ckpt);
@@ -1530,11 +2606,43 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi)
if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs ||
le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg)
return 1;
+ for (j = i + 1; j < NR_CURSEG_NODE_TYPE; j++) {
+ if (le32_to_cpu(ckpt->cur_node_segno[i]) ==
+ le32_to_cpu(ckpt->cur_node_segno[j])) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Node segment (%u, %u) has the same "
+ "segno: %u", i, j,
+ le32_to_cpu(ckpt->cur_node_segno[i]));
+ return 1;
+ }
+ }
}
for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) {
if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs ||
le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg)
return 1;
+ for (j = i + 1; j < NR_CURSEG_DATA_TYPE; j++) {
+ if (le32_to_cpu(ckpt->cur_data_segno[i]) ==
+ le32_to_cpu(ckpt->cur_data_segno[j])) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Data segment (%u, %u) has the same "
+ "segno: %u", i, j,
+ le32_to_cpu(ckpt->cur_data_segno[i]));
+ return 1;
+ }
+ }
+ }
+ for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
+ for (j = i; j < NR_CURSEG_DATA_TYPE; j++) {
+ if (le32_to_cpu(ckpt->cur_node_segno[i]) ==
+ le32_to_cpu(ckpt->cur_data_segno[j])) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Data segment (%u) and Data segment (%u)"
+ " has the same segno: %u", i, j,
+ le32_to_cpu(ckpt->cur_node_segno[i]));
+ return 1;
+ }
+ }
}
sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize);
@@ -1587,27 +2695,34 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->node_ino_num = le32_to_cpu(raw_super->node_ino);
sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino);
sbi->cur_victim_sec = NULL_SECNO;
+ sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
+ sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
+ sbi->migration_granularity = sbi->segs_per_sec;
sbi->dir_level = DEF_DIR_LEVEL;
sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL;
sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL;
+ sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL;
+ sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL;
+ sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL;
clear_sbi_flag(sbi, SBI_NEED_FSCK);
for (i = 0; i < NR_COUNT_TYPE; i++)
atomic_set(&sbi->nr_pages[i], 0);
+ for (i = 0; i < META; i++)
+ atomic_set(&sbi->wb_sync_req[i], 0);
+
INIT_LIST_HEAD(&sbi->s_list);
mutex_init(&sbi->umount_mutex);
- mutex_init(&sbi->wio_mutex[NODE]);
- mutex_init(&sbi->wio_mutex[DATA]);
+ init_rwsem(&sbi->io_order_lock);
spin_lock_init(&sbi->cp_lock);
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
- memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX,
- F2FS_KEY_DESC_PREFIX_SIZE);
- sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE;
-#endif
+ sbi->dirty_device = 0;
+ spin_lock_init(&sbi->dev_lock);
+
+ init_rwsem(&sbi->sb_lock);
}
static int init_percpu_info(struct f2fs_sb_info *sbi)
@@ -1618,10 +2733,82 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
if (err)
return err;
- return percpu_counter_init(&sbi->total_valid_inode_count, 0,
+ err = percpu_counter_init(&sbi->total_valid_inode_count, 0,
GFP_KERNEL);
+ if (err)
+ percpu_counter_destroy(&sbi->alloc_valid_block_count);
+
+ return err;
}
+#ifdef CONFIG_BLK_DEV_ZONED
+static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
+{
+ struct block_device *bdev = FDEV(devi).bdev;
+ sector_t nr_sectors = bdev->bd_part->nr_sects;
+ sector_t sector = 0;
+ struct blk_zone *zones;
+ unsigned int i, nr_zones;
+ unsigned int n = 0;
+ int err = -EIO;
+
+ if (!f2fs_sb_has_blkzoned(sbi))
+ return 0;
+
+ if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
+ SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)))
+ return -EINVAL;
+ sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev));
+ if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz !=
+ __ilog2_u32(sbi->blocks_per_blkz))
+ return -EINVAL;
+ sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz);
+ FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >>
+ sbi->log_blocks_per_blkz;
+ if (nr_sectors & (bdev_zone_sectors(bdev) - 1))
+ FDEV(devi).nr_blkz++;
+
+ FDEV(devi).blkz_type = f2fs_kmalloc(sbi, FDEV(devi).nr_blkz,
+ GFP_KERNEL);
+ if (!FDEV(devi).blkz_type)
+ return -ENOMEM;
+
+#define F2FS_REPORT_NR_ZONES 4096
+
+ zones = f2fs_kzalloc(sbi,
+ array_size(F2FS_REPORT_NR_ZONES,
+ sizeof(struct blk_zone)),
+ GFP_KERNEL);
+ if (!zones)
+ return -ENOMEM;
+
+ /* Get block zones type */
+ while (zones && sector < nr_sectors) {
+
+ nr_zones = F2FS_REPORT_NR_ZONES;
+ err = blkdev_report_zones(bdev, sector,
+ zones, &nr_zones,
+ GFP_KERNEL);
+ if (err)
+ break;
+ if (!nr_zones) {
+ err = -EIO;
+ break;
+ }
+
+ for (i = 0; i < nr_zones; i++) {
+ FDEV(devi).blkz_type[n] = zones[i].type;
+ sector += zones[i].len;
+ n++;
+ }
+ }
+
+ kvfree(zones);
+
+ return err;
+}
+#endif
+
/*
* Read f2fs raw super block.
* Because we have two copies of super block, so read both of them
@@ -1676,7 +2863,7 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi,
/* No valid superblock */
if (!*raw_super)
- kfree(super);
+ kvfree(super);
else
err = 0;
@@ -1686,6 +2873,7 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi,
int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
{
struct buffer_head *bh;
+ __u32 crc = 0;
int err;
if ((recover && f2fs_readonly(sbi->sb)) ||
@@ -1694,8 +2882,15 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
return -EROFS;
}
+ /* we should update superblock crc here */
+ if (!recover && f2fs_sb_has_sb_chksum(sbi)) {
+ crc = f2fs_crc32(sbi, F2FS_RAW_SUPER(sbi),
+ offsetof(struct f2fs_super_block, crc));
+ F2FS_RAW_SUPER(sbi)->crc = cpu_to_le32(crc);
+ }
+
/* write back-up superblock first */
- bh = sb_getblk(sbi->sb, sbi->valid_super_block ? 0: 1);
+ bh = sb_bread(sbi->sb, sbi->valid_super_block ? 0 : 1);
if (!bh)
return -EIO;
err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi));
@@ -1706,7 +2901,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
return err;
/* write current valid superblock */
- bh = sb_getblk(sbi->sb, sbi->valid_super_block);
+ bh = sb_bread(sbi->sb, sbi->valid_super_block);
if (!bh)
return -EIO;
err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi));
@@ -1714,6 +2909,120 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
return err;
}
+static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+ unsigned int max_devices = MAX_DEVICES;
+ int i;
+
+ /* Initialize single device information */
+ if (!RDEV(0).path[0]) {
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (!bdev_is_zoned(sbi->sb->s_bdev))
+ return 0;
+ max_devices = 1;
+#else
+ return 0;
+#endif
+ }
+
+ /*
+ * Initialize multiple devices information, or single
+ * zoned block device information.
+ */
+ sbi->devs = f2fs_kzalloc(sbi,
+ array_size(max_devices,
+ sizeof(struct f2fs_dev_info)),
+ GFP_KERNEL);
+ if (!sbi->devs)
+ return -ENOMEM;
+
+ for (i = 0; i < max_devices; i++) {
+
+ if (i > 0 && !RDEV(i).path[0])
+ break;
+
+ if (max_devices == 1) {
+ /* Single zoned block device mount */
+ FDEV(0).bdev =
+ blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev,
+ sbi->sb->s_mode, sbi->sb->s_type);
+ } else {
+ /* Multi-device mount */
+ memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN);
+ FDEV(i).total_segments =
+ le32_to_cpu(RDEV(i).total_segments);
+ if (i == 0) {
+ FDEV(i).start_blk = 0;
+ FDEV(i).end_blk = FDEV(i).start_blk +
+ (FDEV(i).total_segments <<
+ sbi->log_blocks_per_seg) - 1 +
+ le32_to_cpu(raw_super->segment0_blkaddr);
+ } else {
+ FDEV(i).start_blk = FDEV(i - 1).end_blk + 1;
+ FDEV(i).end_blk = FDEV(i).start_blk +
+ (FDEV(i).total_segments <<
+ sbi->log_blocks_per_seg) - 1;
+ }
+ FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path,
+ sbi->sb->s_mode, sbi->sb->s_type);
+ }
+ if (IS_ERR(FDEV(i).bdev))
+ return PTR_ERR(FDEV(i).bdev);
+
+ /* to release errored devices */
+ sbi->s_ndevs = i + 1;
+
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM &&
+ !f2fs_sb_has_blkzoned(sbi)) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Zoned block device feature not enabled\n");
+ return -EINVAL;
+ }
+ if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) {
+ if (init_blkz_info(sbi, i)) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Failed to initialize F2FS blkzone information");
+ return -EINVAL;
+ }
+ if (max_devices == 1)
+ break;
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)",
+ i, FDEV(i).path,
+ FDEV(i).total_segments,
+ FDEV(i).start_blk, FDEV(i).end_blk,
+ bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ?
+ "Host-aware" : "Host-managed");
+ continue;
+ }
+#endif
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "Mount Device [%2d]: %20s, %8u, %8x - %8x",
+ i, FDEV(i).path,
+ FDEV(i).total_segments,
+ FDEV(i).start_blk, FDEV(i).end_blk);
+ }
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "IO Block Size: %8d KB", F2FS_IO_SIZE_KB(sbi));
+ return 0;
+}
+
+static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_sm_info *sm_i = SM_I(sbi);
+
+ /* adjust parameters according to the volume size */
+ if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) {
+ F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
+ sm_i->dcc_info->discard_granularity = 1;
+ sm_i->ipu_policy = 1 << F2FS_IPU_FORCE;
+ }
+
+ sbi->readdir_ra = 1;
+}
+
static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
{
struct f2fs_sb_info *sbi;
@@ -1761,6 +3070,24 @@ try_onemore:
sb->s_fs_info = sbi;
sbi->raw_super = raw_super;
+ /* precompute checksum seed for metadata */
+ if (f2fs_sb_has_inode_chksum(sbi))
+ sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid,
+ sizeof(raw_super->uuid));
+
+ /*
+ * The BLKZONED feature indicates that the drive was formatted with
+ * zone alignment optimization. This is optional for host-aware
+ * devices, but mandatory for host-managed zoned block devices.
+ */
+#ifndef CONFIG_BLK_DEV_ZONED
+ if (f2fs_sb_has_blkzoned(sbi)) {
+ f2fs_msg(sb, KERN_ERR,
+ "Zoned block device support is not enabled\n");
+ err = -EOPNOTSUPP;
+ goto free_sb_buf;
+ }
+#endif
default_options(sbi);
/* parse mount options */
options = kstrdup((const char *)data, GFP_KERNEL);
@@ -1779,33 +3106,72 @@ try_onemore:
sb->s_max_links = F2FS_LINK_MAX;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+#ifdef CONFIG_QUOTA
+ sb->dq_op = &f2fs_quota_operations;
+ if (f2fs_sb_has_quota_ino(sbi))
+ sb->s_qcop = &dquot_quotactl_sysfile_ops;
+ else
+ sb->s_qcop = &f2fs_quotactl_ops;
+ sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
+
+ if (f2fs_sb_has_quota_ino(sbi)) {
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (f2fs_qf_ino(sbi->sb, i))
+ sbi->nquota_files++;
+ }
+ }
+#endif
+
sb->s_op = &f2fs_sops;
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
sb->s_cop = &f2fs_cryptops;
+#endif
sb->s_xattr = f2fs_xattr_handlers;
sb->s_export_op = &f2fs_export_ops;
sb->s_magic = F2FS_SUPER_MAGIC;
sb->s_time_gran = 1;
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
- memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
+ memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
+ sb->s_iflags |= SB_I_CGROUPWB;
/* init f2fs-specific super block info */
sbi->valid_super_block = valid_super_block;
mutex_init(&sbi->gc_mutex);
+ mutex_init(&sbi->writepages);
mutex_init(&sbi->cp_mutex);
init_rwsem(&sbi->node_write);
+ init_rwsem(&sbi->node_change);
/* disallow all the data/node/meta page writes */
set_sbi_flag(sbi, SBI_POR_DOING);
spin_lock_init(&sbi->stat_lock);
- init_rwsem(&sbi->read_io.io_rwsem);
- sbi->read_io.sbi = sbi;
- sbi->read_io.bio = NULL;
+ /* init iostat info */
+ spin_lock_init(&sbi->iostat_lock);
+ sbi->iostat_enable = false;
+
for (i = 0; i < NR_PAGE_TYPE; i++) {
- init_rwsem(&sbi->write_io[i].io_rwsem);
- sbi->write_io[i].sbi = sbi;
- sbi->write_io[i].bio = NULL;
+ int n = (i == META) ? 1: NR_TEMP_TYPE;
+ int j;
+
+ sbi->write_io[i] =
+ f2fs_kmalloc(sbi,
+ array_size(n,
+ sizeof(struct f2fs_bio_info)),
+ GFP_KERNEL);
+ if (!sbi->write_io[i]) {
+ err = -ENOMEM;
+ goto free_bio_info;
+ }
+
+ for (j = HOT; j < n; j++) {
+ init_rwsem(&sbi->write_io[i][j].io_rwsem);
+ sbi->write_io[i][j].sbi = sbi;
+ sbi->write_io[i][j].bio = NULL;
+ spin_lock_init(&sbi->write_io[i][j].io_lock);
+ INIT_LIST_HEAD(&sbi->write_io[i][j].io_list);
+ }
}
init_rwsem(&sbi->cp_rwsem);
@@ -1814,22 +3180,41 @@ try_onemore:
err = init_percpu_info(sbi);
if (err)
- goto free_options;
+ goto free_bio_info;
+
+ if (F2FS_IO_SIZE(sbi) > 1) {
+ sbi->write_io_dummy =
+ mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0);
+ if (!sbi->write_io_dummy) {
+ err = -ENOMEM;
+ goto free_percpu;
+ }
+ }
/* get an inode for meta space */
sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
if (IS_ERR(sbi->meta_inode)) {
f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode");
err = PTR_ERR(sbi->meta_inode);
- goto free_options;
+ goto free_io_dummy;
}
- err = get_valid_checkpoint(sbi);
+ err = f2fs_get_valid_checkpoint(sbi);
if (err) {
f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint");
goto free_meta_inode;
}
+ if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_QUOTA_NEED_FSCK_FLAG))
+ set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+
+ /* Initialize device list */
+ err = f2fs_scan_devices(sbi);
+ if (err) {
+ f2fs_msg(sb, KERN_ERR, "Failed to find devices");
+ goto free_devices;
+ }
+
sbi->total_valid_node_count =
le32_to_cpu(sbi->ckpt->valid_node_count);
percpu_counter_set(&sbi->total_valid_inode_count,
@@ -1838,24 +3223,29 @@ try_onemore:
sbi->total_valid_block_count =
le64_to_cpu(sbi->ckpt->valid_block_count);
sbi->last_valid_block_count = sbi->total_valid_block_count;
+ sbi->reserved_blocks = 0;
+ sbi->current_reserved_blocks = 0;
+ limit_reserve_root(sbi);
for (i = 0; i < NR_INODE_TYPE; i++) {
INIT_LIST_HEAD(&sbi->inode_list[i]);
spin_lock_init(&sbi->inode_lock[i]);
}
- init_extent_cache_info(sbi);
+ f2fs_init_extent_cache_info(sbi);
+
+ f2fs_init_ino_entry_info(sbi);
- init_ino_entry_info(sbi);
+ f2fs_init_fsync_node_info(sbi);
/* setup f2fs internal modules */
- err = build_segment_manager(sbi);
+ err = f2fs_build_segment_manager(sbi);
if (err) {
f2fs_msg(sb, KERN_ERR,
"Failed to initialize F2FS segment manager");
goto free_sm;
}
- err = build_node_manager(sbi);
+ err = f2fs_build_node_manager(sbi);
if (err) {
f2fs_msg(sb, KERN_ERR,
"Failed to initialize F2FS node manager");
@@ -1873,23 +3263,20 @@ try_onemore:
sbi->kbytes_written =
le64_to_cpu(seg_i->journal->info.kbytes_written);
- build_gc_manager(sbi);
+ f2fs_build_gc_manager(sbi);
+
+ err = f2fs_build_stats(sbi);
+ if (err)
+ goto free_nm;
/* get an inode for node space */
sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi));
if (IS_ERR(sbi->node_inode)) {
f2fs_msg(sb, KERN_ERR, "Failed to read node inode");
err = PTR_ERR(sbi->node_inode);
- goto free_nm;
+ goto free_stats;
}
- f2fs_join_shrinker(sbi);
-
- /* if there are nt orphan nodes free them */
- err = recover_orphan_inodes(sbi);
- if (err)
- goto free_node_inode;
-
/* read root inode and dentry */
root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
if (IS_ERR(root)) {
@@ -1897,7 +3284,8 @@ try_onemore:
err = PTR_ERR(root);
goto free_node_inode;
}
- if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
+ if (!S_ISDIR(root->i_mode) || !root->i_blocks ||
+ !root->i_size || !root->i_nlink) {
iput(root);
err = -EINVAL;
goto free_node_inode;
@@ -1909,26 +3297,26 @@ try_onemore:
goto free_root_inode;
}
- err = f2fs_build_stats(sbi);
+ err = f2fs_register_sysfs(sbi);
if (err)
goto free_root_inode;
- if (f2fs_proc_root)
- sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
-
- if (sbi->s_proc) {
- proc_create_data("segment_info", S_IRUGO, sbi->s_proc,
- &f2fs_seq_segment_info_fops, sb);
- proc_create_data("segment_bits", S_IRUGO, sbi->s_proc,
- &f2fs_seq_segment_bits_fops, sb);
+#ifdef CONFIG_QUOTA
+ /* Enable quota usage during mount */
+ if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) {
+ err = f2fs_enable_quotas(sb);
+ if (err)
+ f2fs_msg(sb, KERN_ERR,
+ "Cannot turn on quotas: error %d", err);
}
-
- sbi->s_kobj.kset = f2fs_kset;
- init_completion(&sbi->s_kobj_unregister);
- err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL,
- "%s", sb->s_id);
+#endif
+ /* if there are nt orphan nodes free them */
+ err = f2fs_recover_orphan_inodes(sbi);
if (err)
- goto free_proc;
+ goto free_meta;
+
+ if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)))
+ goto skip_recovery;
/* recover fsynced data */
if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
@@ -1939,7 +3327,7 @@ try_onemore:
if (bdev_read_only(sb->s_bdev) &&
!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
err = -EROFS;
- goto free_kobj;
+ goto free_meta;
}
if (need_fsck)
@@ -1948,38 +3336,46 @@ try_onemore:
if (!retry)
goto skip_recovery;
- err = recover_fsync_data(sbi, false);
+ err = f2fs_recover_fsync_data(sbi, false);
if (err < 0) {
need_fsck = true;
f2fs_msg(sb, KERN_ERR,
"Cannot recover all fsync data errno=%d", err);
- goto free_kobj;
+ goto free_meta;
}
} else {
- err = recover_fsync_data(sbi, true);
+ err = f2fs_recover_fsync_data(sbi, true);
if (!f2fs_readonly(sb) && err > 0) {
err = -EINVAL;
f2fs_msg(sb, KERN_ERR,
"Need to recover fsync data");
- goto free_kobj;
+ goto free_meta;
}
}
skip_recovery:
- /* recover_fsync_data() cleared this already */
+ /* f2fs_recover_fsync_data() cleared this already */
clear_sbi_flag(sbi, SBI_POR_DOING);
+ if (test_opt(sbi, DISABLE_CHECKPOINT)) {
+ err = f2fs_disable_checkpoint(sbi);
+ if (err)
+ goto free_meta;
+ } else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)) {
+ f2fs_enable_checkpoint(sbi);
+ }
+
/*
* If filesystem is not mounted as read-only then
* do start the gc_thread.
*/
if (test_opt(sbi, BG_GC) && !f2fs_readonly(sb)) {
/* After POR, we can run background GC thread.*/
- err = start_gc_thread(sbi);
+ err = f2fs_start_gc_thread(sbi);
if (err)
- goto free_kobj;
+ goto free_meta;
}
- kfree(options);
+ kvfree(options);
/* recover broken superblock */
if (recovery) {
@@ -1989,49 +3385,70 @@ skip_recovery:
sbi->valid_super_block ? 1 : 2, err);
}
+ f2fs_join_shrinker(sbi);
+
+ f2fs_tuning_parameters(sbi);
+
+ f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx",
+ cur_cp_version(F2FS_CKPT(sbi)));
f2fs_update_time(sbi, CP_TIME);
f2fs_update_time(sbi, REQ_TIME);
return 0;
-free_kobj:
- f2fs_sync_inode_meta(sbi);
- kobject_del(&sbi->s_kobj);
- kobject_put(&sbi->s_kobj);
- wait_for_completion(&sbi->s_kobj_unregister);
-free_proc:
- if (sbi->s_proc) {
- remove_proc_entry("segment_info", sbi->s_proc);
- remove_proc_entry("segment_bits", sbi->s_proc);
- remove_proc_entry(sb->s_id, f2fs_proc_root);
- }
- f2fs_destroy_stats(sbi);
+free_meta:
+#ifdef CONFIG_QUOTA
+ f2fs_truncate_quota_inode_pages(sb);
+ if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb))
+ f2fs_quota_off_umount(sbi->sb);
+#endif
+ /*
+ * Some dirty meta pages can be produced by f2fs_recover_orphan_inodes()
+ * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg()
+ * followed by f2fs_write_checkpoint() through f2fs_write_node_pages(), which
+ * falls into an infinite loop in f2fs_sync_meta_pages().
+ */
+ truncate_inode_pages_final(META_MAPPING(sbi));
+ f2fs_unregister_sysfs(sbi);
free_root_inode:
dput(sb->s_root);
sb->s_root = NULL;
free_node_inode:
+ f2fs_release_ino_entry(sbi, true);
truncate_inode_pages_final(NODE_MAPPING(sbi));
- mutex_lock(&sbi->umount_mutex);
- release_ino_entry(sbi, true);
- f2fs_leave_shrinker(sbi);
iput(sbi->node_inode);
- mutex_unlock(&sbi->umount_mutex);
+ sbi->node_inode = NULL;
+free_stats:
+ f2fs_destroy_stats(sbi);
free_nm:
- destroy_node_manager(sbi);
+ f2fs_destroy_node_manager(sbi);
free_sm:
- destroy_segment_manager(sbi);
- kfree(sbi->ckpt);
+ f2fs_destroy_segment_manager(sbi);
+free_devices:
+ destroy_device_list(sbi);
+ kvfree(sbi->ckpt);
free_meta_inode:
make_bad_inode(sbi->meta_inode);
iput(sbi->meta_inode);
-free_options:
+ sbi->meta_inode = NULL;
+free_io_dummy:
+ mempool_destroy(sbi->write_io_dummy);
+free_percpu:
destroy_percpu_info(sbi);
- kfree(options);
+free_bio_info:
+ for (i = 0; i < NR_PAGE_TYPE; i++)
+ kvfree(sbi->write_io[i]);
+free_options:
+#ifdef CONFIG_QUOTA
+ for (i = 0; i < MAXQUOTAS; i++)
+ kvfree(F2FS_OPTION(sbi).s_qf_names[i]);
+#endif
+ kvfree(options);
free_sb_buf:
- kfree(raw_super);
+ kvfree(raw_super);
free_sbi:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
- kfree(sbi);
+ kvfree(sbi);
/* give only one another chance */
if (retry) {
@@ -2050,8 +3467,24 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags,
static void kill_f2fs_super(struct super_block *sb)
{
- if (sb->s_root)
- set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE);
+ if (sb->s_root) {
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
+ set_sbi_flag(sbi, SBI_IS_CLOSE);
+ f2fs_stop_gc_thread(sbi);
+ f2fs_stop_discard_thread(sbi);
+
+ if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) ||
+ !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
+ struct cp_control cpc = {
+ .reason = CP_UMOUNT,
+ };
+ f2fs_write_checkpoint(sbi, &cpc);
+ }
+
+ if (is_sbi_flag_set(sbi, SBI_IS_RECOVERED) && f2fs_readonly(sb))
+ sb->s_flags &= ~MS_RDONLY;
+ }
kill_block_super(sb);
}
@@ -2099,50 +3532,51 @@ static int __init init_f2fs_fs(void)
err = init_inodecache();
if (err)
goto fail;
- err = create_node_manager_caches();
+ err = f2fs_create_node_manager_caches();
if (err)
goto free_inodecache;
- err = create_segment_manager_caches();
+ err = f2fs_create_segment_manager_caches();
if (err)
goto free_node_manager_caches;
- err = create_checkpoint_caches();
+ err = f2fs_create_checkpoint_caches();
if (err)
goto free_segment_manager_caches;
- err = create_extent_cache();
+ err = f2fs_create_extent_cache();
if (err)
goto free_checkpoint_caches;
- f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj);
- if (!f2fs_kset) {
- err = -ENOMEM;
+ err = f2fs_init_sysfs();
+ if (err)
goto free_extent_cache;
- }
err = register_shrinker(&f2fs_shrinker_info);
if (err)
- goto free_kset;
-
+ goto free_sysfs;
err = register_filesystem(&f2fs_fs_type);
if (err)
goto free_shrinker;
err = f2fs_create_root_stats();
if (err)
goto free_filesystem;
- f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
+ err = f2fs_init_post_read_processing();
+ if (err)
+ goto free_root_stats;
return 0;
+free_root_stats:
+ f2fs_destroy_root_stats();
free_filesystem:
unregister_filesystem(&f2fs_fs_type);
free_shrinker:
unregister_shrinker(&f2fs_shrinker_info);
-free_kset:
- kset_unregister(f2fs_kset);
+free_sysfs:
+ f2fs_exit_sysfs();
free_extent_cache:
- destroy_extent_cache();
+ f2fs_destroy_extent_cache();
free_checkpoint_caches:
- destroy_checkpoint_caches();
+ f2fs_destroy_checkpoint_caches();
free_segment_manager_caches:
- destroy_segment_manager_caches();
+ f2fs_destroy_segment_manager_caches();
free_node_manager_caches:
- destroy_node_manager_caches();
+ f2fs_destroy_node_manager_caches();
free_inodecache:
destroy_inodecache();
fail:
@@ -2151,15 +3585,15 @@ fail:
static void __exit exit_f2fs_fs(void)
{
- remove_proc_entry("fs/f2fs", NULL);
+ f2fs_destroy_post_read_processing();
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
unregister_shrinker(&f2fs_shrinker_info);
- kset_unregister(f2fs_kset);
- destroy_extent_cache();
- destroy_checkpoint_caches();
- destroy_segment_manager_caches();
- destroy_node_manager_caches();
+ f2fs_exit_sysfs();
+ f2fs_destroy_extent_cache();
+ f2fs_destroy_checkpoint_caches();
+ f2fs_destroy_segment_manager_caches();
+ f2fs_destroy_node_manager_caches();
destroy_inodecache();
f2fs_destroy_trace_ios();
}
@@ -2170,3 +3604,4 @@ module_exit(exit_f2fs_fs)
MODULE_AUTHOR("Samsung Electronics's Praesto Team");
MODULE_DESCRIPTION("Flash Friendly File System");
MODULE_LICENSE("GPL");
+
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
new file mode 100644
index 000000000000..948c1a211341
--- /dev/null
+++ b/fs/f2fs/sysfs.c
@@ -0,0 +1,742 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * f2fs sysfs interface
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ * Copyright (c) 2017 Chao Yu <chao@kernel.org>
+ */
+#include <linux/compiler.h>
+#include <linux/proc_fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/seq_file.h>
+
+#include "f2fs.h"
+#include "segment.h"
+#include "gc.h"
+
+static struct proc_dir_entry *f2fs_proc_root;
+
+/* Sysfs support for f2fs */
+enum {
+ GC_THREAD, /* struct f2fs_gc_thread */
+ SM_INFO, /* struct f2fs_sm_info */
+ DCC_INFO, /* struct discard_cmd_control */
+ NM_INFO, /* struct f2fs_nm_info */
+ F2FS_SBI, /* struct f2fs_sb_info */
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ FAULT_INFO_RATE, /* struct f2fs_fault_info */
+ FAULT_INFO_TYPE, /* struct f2fs_fault_info */
+#endif
+ RESERVED_BLOCKS, /* struct f2fs_sb_info */
+};
+
+struct f2fs_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *);
+ ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *,
+ const char *, size_t);
+ int struct_type;
+ int offset;
+ int id;
+};
+
+static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
+{
+ if (struct_type == GC_THREAD)
+ return (unsigned char *)sbi->gc_thread;
+ else if (struct_type == SM_INFO)
+ return (unsigned char *)SM_I(sbi);
+ else if (struct_type == DCC_INFO)
+ return (unsigned char *)SM_I(sbi)->dcc_info;
+ else if (struct_type == NM_INFO)
+ return (unsigned char *)NM_I(sbi);
+ else if (struct_type == F2FS_SBI || struct_type == RESERVED_BLOCKS)
+ return (unsigned char *)sbi;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ else if (struct_type == FAULT_INFO_RATE ||
+ struct_type == FAULT_INFO_TYPE)
+ return (unsigned char *)&F2FS_OPTION(sbi).fault_info;
+#endif
+ return NULL;
+}
+
+static ssize_t dirty_segments_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)(dirty_segments(sbi)));
+}
+
+static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ struct super_block *sb = sbi->sb;
+
+ if (!sb->s_bdev->bd_part)
+ return snprintf(buf, PAGE_SIZE, "0\n");
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)(sbi->kbytes_written +
+ BD_PART_WRITTEN(sbi)));
+}
+
+static ssize_t features_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ struct super_block *sb = sbi->sb;
+ int len = 0;
+
+ if (!sb->s_bdev->bd_part)
+ return snprintf(buf, PAGE_SIZE, "0\n");
+
+ if (f2fs_sb_has_encrypt(sbi))
+ len += snprintf(buf, PAGE_SIZE - len, "%s",
+ "encryption");
+ if (f2fs_sb_has_blkzoned(sbi))
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len ? ", " : "", "blkzoned");
+ if (f2fs_sb_has_extra_attr(sbi))
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len ? ", " : "", "extra_attr");
+ if (f2fs_sb_has_project_quota(sbi))
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len ? ", " : "", "projquota");
+ if (f2fs_sb_has_inode_chksum(sbi))
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len ? ", " : "", "inode_checksum");
+ if (f2fs_sb_has_flexible_inline_xattr(sbi))
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len ? ", " : "", "flexible_inline_xattr");
+ if (f2fs_sb_has_quota_ino(sbi))
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len ? ", " : "", "quota_ino");
+ if (f2fs_sb_has_inode_crtime(sbi))
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len ? ", " : "", "inode_crtime");
+ if (f2fs_sb_has_lost_found(sbi))
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len ? ", " : "", "lost_found");
+ if (f2fs_sb_has_sb_chksum(sbi))
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len ? ", " : "", "sb_checksum");
+ len += snprintf(buf + len, PAGE_SIZE - len, "\n");
+ return len;
+}
+
+static ssize_t current_reserved_blocks_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%u\n", sbi->current_reserved_blocks);
+}
+
+static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ unsigned char *ptr = NULL;
+ unsigned int *ui;
+
+ ptr = __struct_ptr(sbi, a->struct_type);
+ if (!ptr)
+ return -EINVAL;
+
+ if (!strcmp(a->attr.name, "extension_list")) {
+ __u8 (*extlist)[F2FS_EXTENSION_LEN] =
+ sbi->raw_super->extension_list;
+ int cold_count = le32_to_cpu(sbi->raw_super->extension_count);
+ int hot_count = sbi->raw_super->hot_ext_count;
+ int len = 0, i;
+
+ len += snprintf(buf + len, PAGE_SIZE - len,
+ "cold file extension:\n");
+ for (i = 0; i < cold_count; i++)
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s\n",
+ extlist[i]);
+
+ len += snprintf(buf + len, PAGE_SIZE - len,
+ "hot file extension:\n");
+ for (i = cold_count; i < cold_count + hot_count; i++)
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s\n",
+ extlist[i]);
+ return len;
+ }
+
+ ui = (unsigned int *)(ptr + a->offset);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+}
+
+static ssize_t __sbi_store(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ unsigned char *ptr;
+ unsigned long t;
+ unsigned int *ui;
+ ssize_t ret;
+
+ ptr = __struct_ptr(sbi, a->struct_type);
+ if (!ptr)
+ return -EINVAL;
+
+ if (!strcmp(a->attr.name, "extension_list")) {
+ const char *name = strim((char *)buf);
+ bool set = true, hot;
+
+ if (!strncmp(name, "[h]", 3))
+ hot = true;
+ else if (!strncmp(name, "[c]", 3))
+ hot = false;
+ else
+ return -EINVAL;
+
+ name += 3;
+
+ if (*name == '!') {
+ name++;
+ set = false;
+ }
+
+ if (strlen(name) >= F2FS_EXTENSION_LEN)
+ return -EINVAL;
+
+ down_write(&sbi->sb_lock);
+
+ ret = f2fs_update_extension_list(sbi, name, hot, set);
+ if (ret)
+ goto out;
+
+ ret = f2fs_commit_super(sbi, false);
+ if (ret)
+ f2fs_update_extension_list(sbi, name, hot, !set);
+out:
+ up_write(&sbi->sb_lock);
+ return ret ? ret : count;
+ }
+
+ ui = (unsigned int *)(ptr + a->offset);
+
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret < 0)
+ return ret;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX))
+ return -EINVAL;
+#endif
+ if (a->struct_type == RESERVED_BLOCKS) {
+ spin_lock(&sbi->stat_lock);
+ if (t > (unsigned long)(sbi->user_block_count -
+ F2FS_OPTION(sbi).root_reserved_blocks)) {
+ spin_unlock(&sbi->stat_lock);
+ return -EINVAL;
+ }
+ *ui = t;
+ sbi->current_reserved_blocks = min(sbi->reserved_blocks,
+ sbi->user_block_count - valid_user_blocks(sbi));
+ spin_unlock(&sbi->stat_lock);
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "discard_granularity")) {
+ if (t == 0 || t > MAX_PLIST_NUM)
+ return -EINVAL;
+ if (t == *ui)
+ return count;
+ *ui = t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "migration_granularity")) {
+ if (t == 0 || t > sbi->segs_per_sec)
+ return -EINVAL;
+ }
+
+ if (!strcmp(a->attr.name, "trim_sections"))
+ return -EINVAL;
+
+ if (!strcmp(a->attr.name, "gc_urgent")) {
+ if (t >= 1) {
+ sbi->gc_mode = GC_URGENT;
+ if (sbi->gc_thread) {
+ sbi->gc_thread->gc_wake = 1;
+ wake_up_interruptible_all(
+ &sbi->gc_thread->gc_wait_queue_head);
+ wake_up_discard_thread(sbi, true);
+ }
+ } else {
+ sbi->gc_mode = GC_NORMAL;
+ }
+ return count;
+ }
+ if (!strcmp(a->attr.name, "gc_idle")) {
+ if (t == GC_IDLE_CB)
+ sbi->gc_mode = GC_IDLE_CB;
+ else if (t == GC_IDLE_GREEDY)
+ sbi->gc_mode = GC_IDLE_GREEDY;
+ else
+ sbi->gc_mode = GC_NORMAL;
+ return count;
+ }
+
+ *ui = t;
+
+ if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0)
+ f2fs_reset_iostat(sbi);
+ return count;
+}
+
+static ssize_t f2fs_sbi_store(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ ssize_t ret;
+ bool gc_entry = (!strcmp(a->attr.name, "gc_urgent") ||
+ a->struct_type == GC_THREAD);
+
+ if (gc_entry) {
+ if (!down_read_trylock(&sbi->sb->s_umount))
+ return -EAGAIN;
+ }
+ ret = __sbi_store(a, sbi, buf, count);
+ if (gc_entry)
+ up_read(&sbi->sb->s_umount);
+
+ return ret;
+}
+
+static ssize_t f2fs_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+ s_kobj);
+ struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
+
+ return a->show ? a->show(a, sbi, buf) : 0;
+}
+
+static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+ s_kobj);
+ struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
+
+ return a->store ? a->store(a, sbi, buf, len) : 0;
+}
+
+static void f2fs_sb_release(struct kobject *kobj)
+{
+ struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+ s_kobj);
+ complete(&sbi->s_kobj_unregister);
+}
+
+enum feat_id {
+ FEAT_CRYPTO = 0,
+ FEAT_BLKZONED,
+ FEAT_ATOMIC_WRITE,
+ FEAT_EXTRA_ATTR,
+ FEAT_PROJECT_QUOTA,
+ FEAT_INODE_CHECKSUM,
+ FEAT_FLEXIBLE_INLINE_XATTR,
+ FEAT_QUOTA_INO,
+ FEAT_INODE_CRTIME,
+ FEAT_LOST_FOUND,
+ FEAT_SB_CHECKSUM,
+};
+
+static ssize_t f2fs_feature_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ switch (a->id) {
+ case FEAT_CRYPTO:
+ case FEAT_BLKZONED:
+ case FEAT_ATOMIC_WRITE:
+ case FEAT_EXTRA_ATTR:
+ case FEAT_PROJECT_QUOTA:
+ case FEAT_INODE_CHECKSUM:
+ case FEAT_FLEXIBLE_INLINE_XATTR:
+ case FEAT_QUOTA_INO:
+ case FEAT_INODE_CRTIME:
+ case FEAT_LOST_FOUND:
+ case FEAT_SB_CHECKSUM:
+ return snprintf(buf, PAGE_SIZE, "supported\n");
+ }
+ return 0;
+}
+
+#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \
+static struct f2fs_attr f2fs_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+ .struct_type = _struct_type, \
+ .offset = _offset \
+}
+
+#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \
+ F2FS_ATTR_OFFSET(struct_type, name, 0644, \
+ f2fs_sbi_show, f2fs_sbi_store, \
+ offsetof(struct struct_name, elname))
+
+#define F2FS_GENERAL_RO_ATTR(name) \
+static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL)
+
+#define F2FS_FEATURE_RO_ATTR(_name, _id) \
+static struct f2fs_attr f2fs_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = 0444 }, \
+ .show = f2fs_feature_show, \
+ .id = _id, \
+}
+
+F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time,
+ urgent_sleep_time);
+F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time);
+F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time);
+F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity);
+F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_seq_blocks, min_seq_blocks);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections);
+F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
+F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
+F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, migration_granularity, migration_granularity);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval,
+ interval_time[DISCARD_TIME]);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate);
+F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type);
+#endif
+F2FS_GENERAL_RO_ATTR(dirty_segments);
+F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
+F2FS_GENERAL_RO_ATTR(features);
+F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
+
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO);
+#endif
+#ifdef CONFIG_BLK_DEV_ZONED
+F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED);
+#endif
+F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE);
+F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR);
+F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA);
+F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM);
+F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR);
+F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO);
+F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME);
+F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND);
+F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM);
+
+#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
+static struct attribute *f2fs_attrs[] = {
+ ATTR_LIST(gc_urgent_sleep_time),
+ ATTR_LIST(gc_min_sleep_time),
+ ATTR_LIST(gc_max_sleep_time),
+ ATTR_LIST(gc_no_gc_sleep_time),
+ ATTR_LIST(gc_idle),
+ ATTR_LIST(gc_urgent),
+ ATTR_LIST(reclaim_segments),
+ ATTR_LIST(max_small_discards),
+ ATTR_LIST(discard_granularity),
+ ATTR_LIST(batched_trim_sections),
+ ATTR_LIST(ipu_policy),
+ ATTR_LIST(min_ipu_util),
+ ATTR_LIST(min_fsync_blocks),
+ ATTR_LIST(min_seq_blocks),
+ ATTR_LIST(min_hot_blocks),
+ ATTR_LIST(min_ssr_sections),
+ ATTR_LIST(max_victim_search),
+ ATTR_LIST(migration_granularity),
+ ATTR_LIST(dir_level),
+ ATTR_LIST(ram_thresh),
+ ATTR_LIST(ra_nid_pages),
+ ATTR_LIST(dirty_nats_ratio),
+ ATTR_LIST(cp_interval),
+ ATTR_LIST(idle_interval),
+ ATTR_LIST(discard_idle_interval),
+ ATTR_LIST(gc_idle_interval),
+ ATTR_LIST(iostat_enable),
+ ATTR_LIST(readdir_ra),
+ ATTR_LIST(gc_pin_file_thresh),
+ ATTR_LIST(extension_list),
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ ATTR_LIST(inject_rate),
+ ATTR_LIST(inject_type),
+#endif
+ ATTR_LIST(dirty_segments),
+ ATTR_LIST(lifetime_write_kbytes),
+ ATTR_LIST(features),
+ ATTR_LIST(reserved_blocks),
+ ATTR_LIST(current_reserved_blocks),
+ NULL,
+};
+
+static struct attribute *f2fs_feat_attrs[] = {
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ ATTR_LIST(encryption),
+#endif
+#ifdef CONFIG_BLK_DEV_ZONED
+ ATTR_LIST(block_zoned),
+#endif
+ ATTR_LIST(atomic_write),
+ ATTR_LIST(extra_attr),
+ ATTR_LIST(project_quota),
+ ATTR_LIST(inode_checksum),
+ ATTR_LIST(flexible_inline_xattr),
+ ATTR_LIST(quota_ino),
+ ATTR_LIST(inode_crtime),
+ ATTR_LIST(lost_found),
+ ATTR_LIST(sb_checksum),
+ NULL,
+};
+
+static const struct sysfs_ops f2fs_attr_ops = {
+ .show = f2fs_attr_show,
+ .store = f2fs_attr_store,
+};
+
+static struct kobj_type f2fs_sb_ktype = {
+ .default_attrs = f2fs_attrs,
+ .sysfs_ops = &f2fs_attr_ops,
+ .release = f2fs_sb_release,
+};
+
+static struct kobj_type f2fs_ktype = {
+ .sysfs_ops = &f2fs_attr_ops,
+};
+
+static struct kset f2fs_kset = {
+ .kobj = {.ktype = &f2fs_ktype},
+};
+
+static struct kobj_type f2fs_feat_ktype = {
+ .default_attrs = f2fs_feat_attrs,
+ .sysfs_ops = &f2fs_attr_ops,
+};
+
+static struct kobject f2fs_feat = {
+ .kset = &f2fs_kset,
+};
+
+static int __maybe_unused segment_info_seq_show(struct seq_file *seq,
+ void *offset)
+{
+ struct super_block *sb = seq->private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ unsigned int total_segs =
+ le32_to_cpu(sbi->raw_super->segment_count_main);
+ int i;
+
+ seq_puts(seq, "format: segment_type|valid_blocks\n"
+ "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n");
+
+ for (i = 0; i < total_segs; i++) {
+ struct seg_entry *se = get_seg_entry(sbi, i);
+
+ if ((i % 10) == 0)
+ seq_printf(seq, "%-10d", i);
+ seq_printf(seq, "%d|%-3u", se->type,
+ get_valid_blocks(sbi, i, false));
+ if ((i % 10) == 9 || i == (total_segs - 1))
+ seq_putc(seq, '\n');
+ else
+ seq_putc(seq, ' ');
+ }
+
+ return 0;
+}
+
+static int __maybe_unused segment_bits_seq_show(struct seq_file *seq,
+ void *offset)
+{
+ struct super_block *sb = seq->private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ unsigned int total_segs =
+ le32_to_cpu(sbi->raw_super->segment_count_main);
+ int i, j;
+
+ seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n"
+ "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n");
+
+ for (i = 0; i < total_segs; i++) {
+ struct seg_entry *se = get_seg_entry(sbi, i);
+
+ seq_printf(seq, "%-10d", i);
+ seq_printf(seq, "%d|%-3u|", se->type,
+ get_valid_blocks(sbi, i, false));
+ for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++)
+ seq_printf(seq, " %.2x", se->cur_valid_map[j]);
+ seq_putc(seq, '\n');
+ }
+ return 0;
+}
+
+static int __maybe_unused iostat_info_seq_show(struct seq_file *seq,
+ void *offset)
+{
+ struct super_block *sb = seq->private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ time64_t now = ktime_get_real_seconds();
+
+ if (!sbi->iostat_enable)
+ return 0;
+
+ seq_printf(seq, "time: %-16llu\n", now);
+
+ /* print app IOs */
+ seq_printf(seq, "app buffered: %-16llu\n",
+ sbi->write_iostat[APP_BUFFERED_IO]);
+ seq_printf(seq, "app direct: %-16llu\n",
+ sbi->write_iostat[APP_DIRECT_IO]);
+ seq_printf(seq, "app mapped: %-16llu\n",
+ sbi->write_iostat[APP_MAPPED_IO]);
+
+ /* print fs IOs */
+ seq_printf(seq, "fs data: %-16llu\n",
+ sbi->write_iostat[FS_DATA_IO]);
+ seq_printf(seq, "fs node: %-16llu\n",
+ sbi->write_iostat[FS_NODE_IO]);
+ seq_printf(seq, "fs meta: %-16llu\n",
+ sbi->write_iostat[FS_META_IO]);
+ seq_printf(seq, "fs gc data: %-16llu\n",
+ sbi->write_iostat[FS_GC_DATA_IO]);
+ seq_printf(seq, "fs gc node: %-16llu\n",
+ sbi->write_iostat[FS_GC_NODE_IO]);
+ seq_printf(seq, "fs cp data: %-16llu\n",
+ sbi->write_iostat[FS_CP_DATA_IO]);
+ seq_printf(seq, "fs cp node: %-16llu\n",
+ sbi->write_iostat[FS_CP_NODE_IO]);
+ seq_printf(seq, "fs cp meta: %-16llu\n",
+ sbi->write_iostat[FS_CP_META_IO]);
+ seq_printf(seq, "fs discard: %-16llu\n",
+ sbi->write_iostat[FS_DISCARD]);
+
+ return 0;
+}
+
+static int __maybe_unused victim_bits_seq_show(struct seq_file *seq,
+ void *offset)
+{
+ struct super_block *sb = seq->private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ int i;
+
+ seq_puts(seq, "format: victim_secmap bitmaps\n");
+
+ for (i = 0; i < MAIN_SECS(sbi); i++) {
+ if ((i % 10) == 0)
+ seq_printf(seq, "%-10d", i);
+ seq_printf(seq, "%d", test_bit(i, dirty_i->victim_secmap) ? 1 : 0);
+ if ((i % 10) == 9 || i == (MAIN_SECS(sbi) - 1))
+ seq_putc(seq, '\n');
+ else
+ seq_putc(seq, ' ');
+ }
+ return 0;
+}
+
+#define F2FS_PROC_FILE_DEF(_name) \
+static int _name##_open_fs(struct inode *inode, struct file *file) \
+{ \
+ return single_open(file, _name##_seq_show, PDE_DATA(inode)); \
+} \
+ \
+static const struct file_operations f2fs_seq_##_name##_fops = { \
+ .open = _name##_open_fs, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+};
+
+F2FS_PROC_FILE_DEF(segment_info);
+F2FS_PROC_FILE_DEF(segment_bits);
+F2FS_PROC_FILE_DEF(iostat_info);
+F2FS_PROC_FILE_DEF(victim_bits);
+
+int __init f2fs_init_sysfs(void)
+{
+ int ret;
+
+ kobject_set_name(&f2fs_kset.kobj, "f2fs");
+ f2fs_kset.kobj.parent = fs_kobj;
+ ret = kset_register(&f2fs_kset);
+ if (ret)
+ return ret;
+
+ ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype,
+ NULL, "features");
+ if (ret)
+ kset_unregister(&f2fs_kset);
+ else
+ f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
+ return ret;
+}
+
+void f2fs_exit_sysfs(void)
+{
+ kobject_put(&f2fs_feat);
+ kset_unregister(&f2fs_kset);
+ remove_proc_entry("fs/f2fs", NULL);
+ f2fs_proc_root = NULL;
+}
+
+int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
+{
+ struct super_block *sb = sbi->sb;
+ int err;
+
+ sbi->s_kobj.kset = &f2fs_kset;
+ init_completion(&sbi->s_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL,
+ "%s", sb->s_id);
+ if (err)
+ return err;
+
+ if (f2fs_proc_root)
+ sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
+
+ if (sbi->s_proc) {
+ proc_create_data("segment_info", S_IRUGO, sbi->s_proc,
+ &f2fs_seq_segment_info_fops, sb);
+ proc_create_data("segment_bits", S_IRUGO, sbi->s_proc,
+ &f2fs_seq_segment_bits_fops, sb);
+ proc_create_data("iostat_info", S_IRUGO, sbi->s_proc,
+ &f2fs_seq_iostat_info_fops, sb);
+ proc_create_data("victim_bits", S_IRUGO, sbi->s_proc,
+ &f2fs_seq_victim_bits_fops, sb);
+ }
+ return 0;
+}
+
+void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi)
+{
+ if (sbi->s_proc) {
+ remove_proc_entry("iostat_info", sbi->s_proc);
+ remove_proc_entry("segment_info", sbi->s_proc);
+ remove_proc_entry("segment_bits", sbi->s_proc);
+ remove_proc_entry("victim_bits", sbi->s_proc);
+ remove_proc_entry(sbi->sb->s_id, f2fs_proc_root);
+ }
+ kobject_del(&sbi->s_kobj);
+}
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
index 73b4e1d1912a..ce2a5eb210b6 100644
--- a/fs/f2fs/trace.c
+++ b/fs/f2fs/trace.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* f2fs IO tracer
*
* Copyright (c) 2014 Motorola Mobility
* Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
@@ -17,7 +14,7 @@
#include "trace.h"
static RADIX_TREE(pids, GFP_ATOMIC);
-static spinlock_t pids_lock;
+static struct mutex pids_lock;
static struct last_io_info last_io;
static inline void __print_last_io(void)
@@ -59,12 +56,12 @@ void f2fs_trace_pid(struct page *page)
pid_t pid = task_pid_nr(current);
void *p;
- page->private = pid;
+ set_page_private(page, (unsigned long)pid);
if (radix_tree_preload(GFP_NOFS))
return;
- spin_lock(&pids_lock);
+ mutex_lock(&pids_lock);
p = radix_tree_lookup(&pids, pid);
if (p == current)
goto out;
@@ -77,7 +74,7 @@ void f2fs_trace_pid(struct page *page)
MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
pid, current->comm);
out:
- spin_unlock(&pids_lock);
+ mutex_unlock(&pids_lock);
radix_tree_preload_end();
}
@@ -122,7 +119,7 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush)
void f2fs_build_trace_ios(void)
{
- spin_lock_init(&pids_lock);
+ mutex_init(&pids_lock);
}
#define PIDVEC_SIZE 128
@@ -138,7 +135,7 @@ static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index,
radix_tree_for_each_slot(slot, &pids, &iter, first_index) {
results[ret] = iter.index;
- if (++ret == PIDVEC_SIZE)
+ if (++ret == max_items)
break;
}
return ret;
@@ -150,7 +147,7 @@ void f2fs_destroy_trace_ios(void)
pid_t next_pid = 0;
unsigned int found;
- spin_lock(&pids_lock);
+ mutex_lock(&pids_lock);
while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) {
unsigned idx;
@@ -158,5 +155,5 @@ void f2fs_destroy_trace_ios(void)
for (idx = 0; idx < found; idx++)
radix_tree_delete(&pids, pid[idx]);
}
- spin_unlock(&pids_lock);
+ mutex_unlock(&pids_lock);
}
diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h
index 67db24ac1e85..e8075fc5b228 100644
--- a/fs/f2fs/trace.h
+++ b/fs/f2fs/trace.h
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* f2fs IO tracer
*
* Copyright (c) 2014 Motorola Mobility
* Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#ifndef __F2FS_TRACE_H__
#define __F2FS_TRACE_H__
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 3e1c0280f866..18d5ffbc5e8c 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/xattr.c
*
@@ -13,10 +14,6 @@
* suggestion of Luka Renko <luka.renko@hermes.si>.
* xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
* Red Hat Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/rwsem.h>
#include <linux/f2fs_fs.h>
@@ -37,9 +34,6 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
return -EOPNOTSUPP;
break;
case F2FS_XATTR_INDEX_TRUSTED:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- break;
case F2FS_XATTR_INDEX_SECURITY:
break;
default:
@@ -62,9 +56,6 @@ static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
return -EOPNOTSUPP;
break;
case F2FS_XATTR_INDEX_TRUSTED:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- break;
case F2FS_XATTR_INDEX_SECURITY:
break;
default:
@@ -100,13 +91,23 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
const char *name, const void *value,
size_t size, int flags)
{
+ unsigned char old_advise = F2FS_I(inode)->i_advise;
+ unsigned char new_advise;
+
if (!inode_owner_or_capable(inode))
return -EPERM;
if (value == NULL)
return -EINVAL;
- F2FS_I(inode)->i_advise |= *(char *)value;
- f2fs_mark_inode_dirty_sync(inode);
+ new_advise = *(char *)value;
+ if (new_advise & ~FADVISE_MODIFIABLE_BITS)
+ return -EINVAL;
+
+ new_advise = new_advise & FADVISE_MODIFIABLE_BITS;
+ new_advise |= old_advise & ~FADVISE_MODIFIABLE_BITS;
+
+ F2FS_I(inode)->i_advise = new_advise;
+ f2fs_mark_inode_dirty_sync(inode, true);
return 0;
}
@@ -217,55 +218,159 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index,
return entry;
}
-static int read_all_xattrs(struct inode *inode, struct page *ipage,
- void **base_addr)
+static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode,
+ void *base_addr, void **last_addr, int index,
+ size_t len, const char *name)
+{
+ struct f2fs_xattr_entry *entry;
+ unsigned int inline_size = inline_xattr_size(inode);
+
+ list_for_each_xattr(entry, base_addr) {
+ if ((void *)entry + sizeof(__u32) > base_addr + inline_size ||
+ (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) >
+ base_addr + inline_size) {
+ *last_addr = entry;
+ return NULL;
+ }
+ if (entry->e_name_index != index)
+ continue;
+ if (entry->e_name_len != len)
+ continue;
+ if (!memcmp(entry->e_name, name, len))
+ break;
+ }
+ return entry;
+}
+
+static int read_inline_xattr(struct inode *inode, struct page *ipage,
+ void *txattr_addr)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct f2fs_xattr_header *header;
- size_t size = PAGE_SIZE, inline_size = 0;
- void *txattr_addr;
- int err;
+ unsigned int inline_size = inline_xattr_size(inode);
+ struct page *page = NULL;
+ void *inline_addr;
+
+ if (ipage) {
+ inline_addr = inline_xattr_addr(inode, ipage);
+ } else {
+ page = f2fs_get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ inline_addr = inline_xattr_addr(inode, page);
+ }
+ memcpy(txattr_addr, inline_addr, inline_size);
+ f2fs_put_page(page, 1);
+
+ return 0;
+}
+
+static int read_xattr_block(struct inode *inode, void *txattr_addr)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ nid_t xnid = F2FS_I(inode)->i_xattr_nid;
+ unsigned int inline_size = inline_xattr_size(inode);
+ struct page *xpage;
+ void *xattr_addr;
+
+ /* The inode already has an extended attribute block. */
+ xpage = f2fs_get_node_page(sbi, xnid);
+ if (IS_ERR(xpage))
+ return PTR_ERR(xpage);
+
+ xattr_addr = page_address(xpage);
+ memcpy(txattr_addr + inline_size, xattr_addr, VALID_XATTR_BLOCK_SIZE);
+ f2fs_put_page(xpage, 1);
+
+ return 0;
+}
+
+static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
+ unsigned int index, unsigned int len,
+ const char *name, struct f2fs_xattr_entry **xe,
+ void **base_addr, int *base_size)
+{
+ void *cur_addr, *txattr_addr, *last_addr = NULL;
+ nid_t xnid = F2FS_I(inode)->i_xattr_nid;
+ unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0;
+ unsigned int inline_size = inline_xattr_size(inode);
+ int err = 0;
- inline_size = inline_xattr_size(inode);
+ if (!size && !inline_size)
+ return -ENODATA;
- txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO);
+ *base_size = inline_size + size + XATTR_PADDING_SIZE;
+ txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), *base_size, GFP_NOFS);
if (!txattr_addr)
return -ENOMEM;
/* read from inline xattr */
if (inline_size) {
- struct page *page = NULL;
- void *inline_addr;
-
- if (ipage) {
- inline_addr = inline_xattr_addr(ipage);
- } else {
- page = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
- goto fail;
- }
- inline_addr = inline_xattr_addr(page);
+ err = read_inline_xattr(inode, ipage, txattr_addr);
+ if (err)
+ goto out;
+
+ *xe = __find_inline_xattr(inode, txattr_addr, &last_addr,
+ index, len, name);
+ if (*xe) {
+ *base_size = inline_size;
+ goto check;
}
- memcpy(txattr_addr, inline_addr, inline_size);
- f2fs_put_page(page, 1);
}
/* read from xattr node block */
- if (F2FS_I(inode)->i_xattr_nid) {
- struct page *xpage;
- void *xattr_addr;
+ if (xnid) {
+ err = read_xattr_block(inode, txattr_addr);
+ if (err)
+ goto out;
+ }
- /* The inode already has an extended attribute block. */
- xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid);
- if (IS_ERR(xpage)) {
- err = PTR_ERR(xpage);
+ if (last_addr)
+ cur_addr = XATTR_HDR(last_addr) - 1;
+ else
+ cur_addr = txattr_addr;
+
+ *xe = __find_xattr(cur_addr, index, len, name);
+check:
+ if (IS_XATTR_LAST_ENTRY(*xe)) {
+ err = -ENODATA;
+ goto out;
+ }
+
+ *base_addr = txattr_addr;
+ return 0;
+out:
+ kzfree(txattr_addr);
+ return err;
+}
+
+static int read_all_xattrs(struct inode *inode, struct page *ipage,
+ void **base_addr)
+{
+ struct f2fs_xattr_header *header;
+ nid_t xnid = F2FS_I(inode)->i_xattr_nid;
+ unsigned int size = VALID_XATTR_BLOCK_SIZE;
+ unsigned int inline_size = inline_xattr_size(inode);
+ void *txattr_addr;
+ int err;
+
+ txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode),
+ inline_size + size + XATTR_PADDING_SIZE, GFP_NOFS);
+ if (!txattr_addr)
+ return -ENOMEM;
+
+ /* read from inline xattr */
+ if (inline_size) {
+ err = read_inline_xattr(inode, ipage, txattr_addr);
+ if (err)
goto fail;
- }
+ }
- xattr_addr = page_address(xpage);
- memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE);
- f2fs_put_page(xpage, 1);
+ /* read from xattr node block */
+ if (xnid) {
+ err = read_xattr_block(inode, txattr_addr);
+ if (err)
+ goto fail;
}
header = XATTR_HDR(txattr_addr);
@@ -286,85 +391,92 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
void *txattr_addr, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- size_t inline_size = 0;
+ size_t inline_size = inline_xattr_size(inode);
+ struct page *in_page = NULL;
void *xattr_addr;
+ void *inline_addr = NULL;
struct page *xpage;
nid_t new_nid = 0;
- int err;
-
- inline_size = inline_xattr_size(inode);
+ int err = 0;
if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid)
- if (!alloc_nid(sbi, &new_nid))
+ if (!f2fs_alloc_nid(sbi, &new_nid))
return -ENOSPC;
/* write to inline xattr */
if (inline_size) {
- struct page *page = NULL;
- void *inline_addr;
-
if (ipage) {
- inline_addr = inline_xattr_addr(ipage);
- f2fs_wait_on_page_writeback(ipage, NODE, true);
- set_page_dirty(ipage);
+ inline_addr = inline_xattr_addr(inode, ipage);
} else {
- page = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(page)) {
- alloc_nid_failed(sbi, new_nid);
- return PTR_ERR(page);
+ in_page = f2fs_get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(in_page)) {
+ f2fs_alloc_nid_failed(sbi, new_nid);
+ return PTR_ERR(in_page);
}
- inline_addr = inline_xattr_addr(page);
- f2fs_wait_on_page_writeback(page, NODE, true);
+ inline_addr = inline_xattr_addr(inode, in_page);
}
- memcpy(inline_addr, txattr_addr, inline_size);
- f2fs_put_page(page, 1);
+ f2fs_wait_on_page_writeback(ipage ? ipage : in_page,
+ NODE, true, true);
/* no need to use xattr node block */
if (hsize <= inline_size) {
- err = truncate_xattr_node(inode, ipage);
- alloc_nid_failed(sbi, new_nid);
- return err;
+ err = f2fs_truncate_xattr_node(inode);
+ f2fs_alloc_nid_failed(sbi, new_nid);
+ if (err) {
+ f2fs_put_page(in_page, 1);
+ return err;
+ }
+ memcpy(inline_addr, txattr_addr, inline_size);
+ set_page_dirty(ipage ? ipage : in_page);
+ goto in_page_out;
}
}
/* write to xattr node block */
if (F2FS_I(inode)->i_xattr_nid) {
- xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid);
+ xpage = f2fs_get_node_page(sbi, F2FS_I(inode)->i_xattr_nid);
if (IS_ERR(xpage)) {
- alloc_nid_failed(sbi, new_nid);
- return PTR_ERR(xpage);
+ err = PTR_ERR(xpage);
+ f2fs_alloc_nid_failed(sbi, new_nid);
+ goto in_page_out;
}
f2fs_bug_on(sbi, new_nid);
- f2fs_wait_on_page_writeback(xpage, NODE, true);
+ f2fs_wait_on_page_writeback(xpage, NODE, true, true);
} else {
struct dnode_of_data dn;
set_new_dnode(&dn, inode, NULL, NULL, new_nid);
- xpage = new_node_page(&dn, XATTR_NODE_OFFSET, ipage);
+ xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET);
if (IS_ERR(xpage)) {
- alloc_nid_failed(sbi, new_nid);
- return PTR_ERR(xpage);
+ err = PTR_ERR(xpage);
+ f2fs_alloc_nid_failed(sbi, new_nid);
+ goto in_page_out;
}
- alloc_nid_done(sbi, new_nid);
+ f2fs_alloc_nid_done(sbi, new_nid);
}
-
xattr_addr = page_address(xpage);
- memcpy(xattr_addr, txattr_addr + inline_size, PAGE_SIZE -
- sizeof(struct node_footer));
+
+ if (inline_size)
+ memcpy(inline_addr, txattr_addr, inline_size);
+ memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE);
+
+ if (inline_size)
+ set_page_dirty(ipage ? ipage : in_page);
set_page_dirty(xpage);
- f2fs_put_page(xpage, 1);
- /* need to checkpoint during fsync */
- F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi));
- return 0;
+ f2fs_put_page(xpage, 1);
+in_page_out:
+ f2fs_put_page(in_page, 1);
+ return err;
}
int f2fs_getxattr(struct inode *inode, int index, const char *name,
void *buffer, size_t buffer_size, struct page *ipage)
{
- struct f2fs_xattr_entry *entry;
- void *base_addr;
+ struct f2fs_xattr_entry *entry = NULL;
int error = 0;
- size_t size, len;
+ unsigned int size, len;
+ void *base_addr = NULL;
+ int base_size;
if (name == NULL)
return -EINVAL;
@@ -373,30 +485,31 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
if (len > F2FS_NAME_LEN)
return -ERANGE;
- error = read_all_xattrs(inode, ipage, &base_addr);
+ down_read(&F2FS_I(inode)->i_xattr_sem);
+ error = lookup_all_xattrs(inode, ipage, index, len, name,
+ &entry, &base_addr, &base_size);
+ up_read(&F2FS_I(inode)->i_xattr_sem);
if (error)
return error;
- entry = __find_xattr(base_addr, index, len, name);
- if (IS_XATTR_LAST_ENTRY(entry)) {
- error = -ENODATA;
- goto cleanup;
- }
-
size = le16_to_cpu(entry->e_value_size);
if (buffer && size > buffer_size) {
error = -ERANGE;
- goto cleanup;
+ goto out;
}
if (buffer) {
char *pval = entry->e_name + entry->e_name_len;
+
+ if (base_size - (pval - (char *)base_addr) < size) {
+ error = -ERANGE;
+ goto out;
+ }
memcpy(buffer, pval, size);
}
error = size;
-
-cleanup:
+out:
kzfree(base_addr);
return error;
}
@@ -409,7 +522,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
int error = 0;
size_t rest = buffer_size;
+ down_read(&F2FS_I(inode)->i_xattr_sem);
error = read_all_xattrs(inode, NULL, &base_addr);
+ up_read(&F2FS_I(inode)->i_xattr_sem);
if (error)
return error;
@@ -445,6 +560,15 @@ cleanup:
return error;
}
+static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry,
+ const void *value, size_t size)
+{
+ void *pval = entry->e_name + entry->e_name_len;
+
+ return (le16_to_cpu(entry->e_value_size) == size) &&
+ !memcmp(pval, value, size);
+}
+
static int __f2fs_setxattr(struct inode *inode, int index,
const char *name, const void *value, size_t size,
struct page *ipage, int flags)
@@ -479,12 +603,17 @@ static int __f2fs_setxattr(struct inode *inode, int index,
found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1;
- if ((flags & XATTR_REPLACE) && !found) {
+ if (found) {
+ if ((flags & XATTR_CREATE)) {
+ error = -EEXIST;
+ goto exit;
+ }
+
+ if (value && f2fs_xattr_value_same(here, value, size))
+ goto exit;
+ } else if ((flags & XATTR_REPLACE)) {
error = -ENODATA;
goto exit;
- } else if ((flags & XATTR_CREATE) && found) {
- error = -EEXIST;
- goto exit;
}
last = here;
@@ -554,7 +683,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (index == F2FS_XATTR_INDEX_ENCRYPTION &&
!strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT))
f2fs_set_encrypted_inode(inode);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
if (!error && S_ISDIR(inode->i_mode))
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP);
exit:
@@ -569,7 +698,11 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int err;
- /* this case is only from init_inode_metadata */
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
+
+ /* this case is only from f2fs_init_inode_metadata */
if (ipage)
return __f2fs_setxattr(inode, index, name, value,
size, ipage, flags);
@@ -578,7 +711,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
f2fs_lock_op(sbi);
/* protect xattr_ver */
down_write(&F2FS_I(inode)->i_sem);
+ down_write(&F2FS_I(inode)->i_xattr_sem);
err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags);
+ up_write(&F2FS_I(inode)->i_xattr_sem);
up_write(&F2FS_I(inode)->i_sem);
f2fs_unlock_op(sbi);
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index f990de20cdcd..67db134da0f5 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/xattr.h
*
@@ -9,10 +10,6 @@
* On-disk format of extended attributes for the ext2 filesystem.
*
* (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#ifndef __F2FS_XATTR_H__
#define __F2FS_XATTR_H__
@@ -58,10 +55,10 @@ struct f2fs_xattr_entry {
#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1))
#define XATTR_ROUND (3)
-#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND)
+#define XATTR_ALIGN(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND)
#define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \
- entry->e_name_len + le16_to_cpu(entry->e_value_size)))
+ (entry)->e_name_len + le16_to_cpu((entry)->e_value_size)))
#define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\
ENTRY_SIZE(entry)))
@@ -72,9 +69,10 @@ struct f2fs_xattr_entry {
for (entry = XATTR_FIRST_ENTRY(addr);\
!IS_XATTR_LAST_ENTRY(entry);\
entry = XATTR_NEXT_ENTRY(entry))
-
-#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + PAGE_SIZE - \
- sizeof(struct node_footer) - sizeof(__u32))
+#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer))
+#define XATTR_PADDING_SIZE (sizeof(__u32))
+#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \
+ VALID_XATTR_BLOCK_SIZE)
#define MAX_VALUE_LEN(i) (MIN_OFFSET(i) - \
sizeof(struct f2fs_xattr_header) - \
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f3aea1b8702c..17ad41d88bea 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -2112,7 +2112,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
(dirtytime && (inode->i_state & I_DIRTY_INODE)))
return;
- if (unlikely(block_dump))
+ if (unlikely(block_dump > 1))
block_dump___mark_inode_dirty(inode);
spin_lock(&inode->i_lock);
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 7dca743b2ce1..940c683561dd 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -44,6 +44,7 @@ void set_fs_pwd(struct fs_struct *fs, const struct path *path)
if (old_pwd.dentry)
path_put(&old_pwd);
}
+EXPORT_SYMBOL(set_fs_pwd);
static inline int replace_path(struct path *p, const struct path *old, const struct path *new)
{
@@ -89,6 +90,7 @@ void free_fs_struct(struct fs_struct *fs)
path_put(&fs->pwd);
kmem_cache_free(fs_cachep, fs);
}
+EXPORT_SYMBOL(free_fs_struct);
void exit_fs(struct task_struct *tsk)
{
@@ -127,6 +129,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
}
return fs;
}
+EXPORT_SYMBOL_GPL(copy_fs_struct);
int unshare_fs_struct(void)
{
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 37e0c31d284f..5eb2e24ce790 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -329,7 +329,7 @@ static void fscache_objlist_config(struct fscache_objlist_data *data)
config = 0;
rcu_read_lock();
- confkey = user_key_payload(key);
+ confkey = user_key_payload_rcu(key);
if (!confkey) {
/* key was revoked */
rcu_read_unlock();
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index b4253181b5d4..127f0d1dee62 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -13,12 +13,14 @@
#include <linux/poll.h>
#include <linux/uio.h>
#include <linux/miscdevice.h>
+#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/pipe_fs_i.h>
#include <linux/swap.h>
#include <linux/splice.h>
+#include <linux/freezer.h>
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
@@ -471,7 +473,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
* Either request is already in userspace, or it was forced.
* Wait it out.
*/
- wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags));
+ while (!test_bit(FR_FINISHED, &req->flags))
+ wait_event_freezable(req->waitq,
+ test_bit(FR_FINISHED, &req->flags));
}
static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
@@ -1906,6 +1910,12 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
cs->move_pages = 0;
err = copy_out_args(cs, &req->out, nbytes);
+ if (req->in.h.opcode == FUSE_CANONICAL_PATH) {
+ char *path = (char *)req->out.args[0].value;
+
+ path[req->out.args[0].size - 1] = 0;
+ req->out.h.error = kern_path(path, 0, req->canonical_path);
+ }
fuse_copy_finish(cs);
spin_lock(&fpq->lock);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 60dd2bc10776..547a324d45fd 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -262,6 +262,50 @@ invalid:
goto out;
}
+/*
+ * Get the canonical path. Since we must translate to a path, this must be done
+ * in the context of the userspace daemon, however, the userspace daemon cannot
+ * look up paths on its own. Instead, we handle the lookup as a special case
+ * inside of the write request.
+ */
+static void fuse_dentry_canonical_path(const struct path *path, struct path *canonical_path) {
+ struct inode *inode = path->dentry->d_inode;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_req *req;
+ int err;
+ char *path_name;
+
+ req = fuse_get_req(fc, 1);
+ err = PTR_ERR(req);
+ if (IS_ERR(req))
+ goto default_path;
+
+ path_name = (char*)__get_free_page(GFP_KERNEL);
+ if (!path_name) {
+ fuse_put_request(fc, req);
+ goto default_path;
+ }
+
+ req->in.h.opcode = FUSE_CANONICAL_PATH;
+ req->in.h.nodeid = get_node_id(inode);
+ req->in.numargs = 0;
+ req->out.numargs = 1;
+ req->out.args[0].size = PATH_MAX;
+ req->out.args[0].value = path_name;
+ req->canonical_path = canonical_path;
+ req->out.argvar = 1;
+ fuse_request_send(fc, req);
+ err = req->out.h.error;
+ fuse_put_request(fc, req);
+ free_page((unsigned long)path_name);
+ if (!err)
+ return;
+default_path:
+ canonical_path->dentry = path->dentry;
+ canonical_path->mnt = path->mnt;
+ path_get(canonical_path);
+}
+
static int invalid_nodeid(u64 nodeid)
{
return !nodeid || nodeid == FUSE_ROOT_ID;
@@ -284,11 +328,13 @@ const struct dentry_operations fuse_dentry_operations = {
.d_revalidate = fuse_dentry_revalidate,
.d_init = fuse_dentry_init,
.d_release = fuse_dentry_release,
+ .d_canonical_path = fuse_dentry_canonical_path,
};
const struct dentry_operations fuse_root_dentry_operations = {
.d_init = fuse_dentry_init,
.d_release = fuse_dentry_release,
+ .d_canonical_path = fuse_dentry_canonical_path,
};
int fuse_valid_type(int m)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 1cd46e667e3d..b6c76edffd87 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -839,9 +839,9 @@ struct fuse_fill_data {
unsigned nr_pages;
};
-static int fuse_readpages_fill(void *_data, struct page *page)
+static int fuse_readpages_fill(struct file *_data, struct page *page)
{
- struct fuse_fill_data *data = _data;
+ struct fuse_fill_data *data = (struct fuse_fill_data *)_data;
struct fuse_req *req = data->req;
struct inode *inode = data->inode;
struct fuse_conn *fc = get_fuse_conn(inode);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 1c905c7666de..a10c56e2147b 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -368,6 +368,9 @@ struct fuse_req {
/** Inode used in the request or NULL */
struct inode *inode;
+ /** Path used for completing d_canonical_path */
+ struct path *canonical_path;
+
/** AIO control block */
struct fuse_io_priv *io;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7a9b1069d267..c32fe644f929 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -30,7 +30,7 @@ static struct kmem_cache *fuse_inode_cachep;
struct list_head fuse_conn_list;
DEFINE_MUTEX(fuse_mutex);
-static int set_global_limit(const char *val, struct kernel_param *kp);
+static int set_global_limit(const char *val, const struct kernel_param *kp);
unsigned max_user_bgreq;
module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
@@ -830,7 +830,7 @@ static void sanitize_global_limit(unsigned *limit)
*limit = (1 << 16) - 1;
}
-static int set_global_limit(const char *val, struct kernel_param *kp)
+static int set_global_limit(const char *val, const struct kernel_param *kp)
{
int rv;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 5a6f52ea2722..0283ee057c54 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -268,22 +268,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
for(i = 0; i < nr_pages; i++) {
struct page *page = pvec->pages[i];
- /*
- * At this point, the page may be truncated or
- * invalidated (changing page->mapping to NULL), or
- * even swizzled back from swapper_space to tmpfs file
- * mapping. However, page->index will not change
- * because we have a reference on the page.
- */
- if (page->index > end) {
- /*
- * can't be range_cyclic (1st pass) because
- * end == -1 in that case.
- */
- ret = 1;
- break;
- }
-
*done_index = page->index;
lock_page(page);
@@ -401,8 +385,8 @@ retry:
tag_pages_for_writeback(mapping, index, end);
done_index = index;
while (!done && (index <= end)) {
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
+ tag);
if (nr_pages == 0)
break;
@@ -511,7 +495,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
*
*/
-static int __gfs2_readpage(void *file, struct page *page)
+static int __gfs2_readpage(struct file *file, struct page *page)
{
struct gfs2_inode *ip = GFS2_I(page->mapping->host);
struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
diff --git a/fs/inode.c b/fs/inode.c
index 2071ff5343c5..1d1a9573ca70 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1781,7 +1781,7 @@ int dentry_needs_remove_privs(struct dentry *dentry)
return mask;
}
-static int __remove_privs(struct dentry *dentry, int kill)
+static int __remove_privs(struct vfsmount *mnt, struct dentry *dentry, int kill)
{
struct iattr newattrs;
@@ -1790,7 +1790,7 @@ static int __remove_privs(struct dentry *dentry, int kill)
* Note we call this on write, so notify_change will not
* encounter any conflicting delegations:
*/
- return notify_change(dentry, &newattrs, NULL);
+ return notify_change2(mnt, dentry, &newattrs, NULL);
}
/*
@@ -1812,7 +1812,7 @@ int file_remove_privs(struct file *file)
if (kill < 0)
return kill;
if (kill)
- error = __remove_privs(dentry, kill);
+ error = __remove_privs(file->f_path.mnt, dentry, kill);
if (!error)
inode_has_no_xattr(inode);
diff --git a/fs/internal.h b/fs/internal.h
index 8b7143b0211c..3e58863de514 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -88,9 +88,11 @@ extern struct file *get_empty_filp(void);
* super.c
*/
extern int do_remount_sb(struct super_block *, int, void *, int);
+extern int do_remount_sb2(struct vfsmount *, struct super_block *, int,
+ void *, int);
extern bool trylock_super(struct super_block *sb);
extern struct dentry *mount_fs(struct file_system_type *,
- int, const char *, void *);
+ int, const char *, struct vfsmount *, void *);
extern struct super_block *user_get_super(dev_t);
/*
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index c19123dcd1a4..d484c63d0c03 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -600,7 +600,7 @@ static struct ctl_table nlm_sysctl_root[] = {
*/
#define param_set_min_max(name, type, which_strtol, min, max) \
-static int param_set_##name(const char *val, struct kernel_param *kp) \
+static int param_set_##name(const char *val, const struct kernel_param *kp) \
{ \
char *endp; \
__typeof__(type) num = which_strtol(val, &endp, 0); \
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index a8329cc47dec..fdf844f6d9f7 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -34,9 +34,9 @@ static int sync_request(struct page *page, struct block_device *bdev, int op)
return submit_bio_wait(&bio);
}
-static int bdev_readpage(void *_sb, struct page *page)
+static int bdev_readpage(struct file *_sb, struct page *page)
{
- struct super_block *sb = _sb;
+ struct super_block *sb = (struct super_block *)_sb;
struct block_device *bdev = logfs_super(sb)->s_bdev;
int err;
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index b76a62b1978f..9ec8e8f9d1c9 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -122,9 +122,9 @@ static void logfs_mtd_sync(struct super_block *sb)
mtd_sync(mtd);
}
-static int logfs_mtd_readpage(void *_sb, struct page *page)
+static int logfs_mtd_readpage(struct file *_sb, struct page *page)
{
- struct super_block *sb = _sb;
+ struct super_block *sb = (struct super_block *)_sb;
int err;
err = logfs_mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index c87ea52de3d9..8ddacc124804 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -174,7 +174,7 @@ static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)
if (!logfs_exist_block(dir, index))
continue;
page = read_cache_page(dir->i_mapping, index,
- (filler_t *)logfs_readpage, NULL);
+ logfs_readpage, NULL);
if (IS_ERR(page))
return page;
dd = kmap_atomic(page);
@@ -306,7 +306,7 @@ static int logfs_readdir(struct file *file, struct dir_context *ctx)
continue;
}
page = read_cache_page(dir->i_mapping, pos,
- (filler_t *)logfs_readpage, NULL);
+ logfs_readpage, NULL);
if (IS_ERR(page))
return PTR_ERR(page);
dd = kmap(page);
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 27d040e35faa..1a7c0b02efbc 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -151,7 +151,7 @@ struct logfs_device_ops {
struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs);
int (*write_sb)(struct super_block *sb, struct page *page);
- int (*readpage)(void *_sb, struct page *page);
+ int (*readpage)(struct file *_sb, struct page *page);
void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
int ensure_write);
diff --git a/fs/mpage.c b/fs/mpage.c
index e2ea442bb9e1..d4e17c88ce08 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -31,6 +31,14 @@
#include <linux/cleancache.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/android_fs.h>
+
+EXPORT_TRACEPOINT_SYMBOL(android_fs_datawrite_start);
+EXPORT_TRACEPOINT_SYMBOL(android_fs_datawrite_end);
+EXPORT_TRACEPOINT_SYMBOL(android_fs_dataread_start);
+EXPORT_TRACEPOINT_SYMBOL(android_fs_dataread_end);
+
/*
* I/O completion handler for multipage BIOs.
*
@@ -48,6 +56,16 @@ static void mpage_end_io(struct bio *bio)
struct bio_vec *bv;
int i;
+ if (trace_android_fs_dataread_end_enabled() &&
+ (bio_data_dir(bio) == READ)) {
+ struct page *first_page = bio->bi_io_vec[0].bv_page;
+
+ if (first_page != NULL)
+ trace_android_fs_dataread_end(first_page->mapping->host,
+ page_offset(first_page),
+ bio->bi_iter.bi_size);
+ }
+
bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
page_endio(page, op_is_write(bio_op(bio)), bio->bi_error);
@@ -58,6 +76,24 @@ static void mpage_end_io(struct bio *bio)
static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio)
{
+ if (trace_android_fs_dataread_start_enabled() && (op == REQ_OP_READ)) {
+ struct page *first_page = bio->bi_io_vec[0].bv_page;
+
+ if (first_page != NULL) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ first_page->mapping->host);
+ trace_android_fs_dataread_start(
+ first_page->mapping->host,
+ page_offset(first_page),
+ bio->bi_iter.bi_size,
+ current->pid,
+ path,
+ current->comm);
+ }
+ }
bio->bi_end_io = mpage_end_io;
bio_set_op_attrs(bio, op, op_flags);
guard_bio_eod(op, bio);
diff --git a/fs/namei.c b/fs/namei.c
index eb4626bad88a..88eb41cdd639 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -376,9 +376,11 @@ EXPORT_SYMBOL(generic_permission);
* flag in inode->i_opflags, that says "this has not special
* permission function, use the fast case".
*/
-static inline int do_inode_permission(struct inode *inode, int mask)
+static inline int do_inode_permission(struct vfsmount *mnt, struct inode *inode, int mask)
{
if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
+ if (likely(mnt && inode->i_op->permission2))
+ return inode->i_op->permission2(mnt, inode, mask);
if (likely(inode->i_op->permission))
return inode->i_op->permission(inode, mask);
@@ -402,7 +404,7 @@ static inline int do_inode_permission(struct inode *inode, int mask)
* This does not check for a read-only file system. You probably want
* inode_permission().
*/
-int __inode_permission(struct inode *inode, int mask)
+int __inode_permission2(struct vfsmount *mnt, struct inode *inode, int mask)
{
int retval;
@@ -422,7 +424,7 @@ int __inode_permission(struct inode *inode, int mask)
return -EACCES;
}
- retval = do_inode_permission(inode, mask);
+ retval = do_inode_permission(mnt, inode, mask);
if (retval)
return retval;
@@ -430,7 +432,14 @@ int __inode_permission(struct inode *inode, int mask)
if (retval)
return retval;
- return security_inode_permission(inode, mask);
+ retval = security_inode_permission(inode, mask);
+ return retval;
+}
+EXPORT_SYMBOL(__inode_permission2);
+
+int __inode_permission(struct inode *inode, int mask)
+{
+ return __inode_permission2(NULL, inode, mask);
}
EXPORT_SYMBOL(__inode_permission);
@@ -466,14 +475,20 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
*
* When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
*/
-int inode_permission(struct inode *inode, int mask)
+int inode_permission2(struct vfsmount *mnt, struct inode *inode, int mask)
{
int retval;
retval = sb_permission(inode->i_sb, inode, mask);
if (retval)
return retval;
- return __inode_permission(inode, mask);
+ return __inode_permission2(mnt, inode, mask);
+}
+EXPORT_SYMBOL(inode_permission2);
+
+int inode_permission(struct inode *inode, int mask)
+{
+ return inode_permission2(NULL, inode, mask);
}
EXPORT_SYMBOL(inode_permission);
@@ -1706,13 +1721,13 @@ out:
static inline int may_lookup(struct nameidata *nd)
{
if (nd->flags & LOOKUP_RCU) {
- int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
+ int err = inode_permission2(nd->path.mnt, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
if (err != -ECHILD)
return err;
if (unlazy_walk(nd, NULL, 0))
return -ECHILD;
}
- return inode_permission(nd->inode, MAY_EXEC);
+ return inode_permission2(nd->path.mnt, nd->inode, MAY_EXEC);
}
static inline int handle_dots(struct nameidata *nd, int type)
@@ -2186,11 +2201,12 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
nd->depth = 0;
if (flags & LOOKUP_ROOT) {
struct dentry *root = nd->root.dentry;
+ struct vfsmount *mnt = nd->root.mnt;
struct inode *inode = root->d_inode;
if (*s) {
if (!d_can_lookup(root))
return ERR_PTR(-ENOTDIR);
- retval = inode_permission(inode, MAY_EXEC);
+ retval = inode_permission2(mnt, inode, MAY_EXEC);
if (retval)
return ERR_PTR(retval);
}
@@ -2455,6 +2471,7 @@ EXPORT_SYMBOL(vfs_path_lookup);
/**
* lookup_one_len - filesystem helper to lookup single pathname component
* @name: pathname component to lookup
+ * @mnt: mount we are looking up on
* @base: base directory to lookup from
* @len: maximum length @len should be interpreted to
*
@@ -2463,7 +2480,7 @@ EXPORT_SYMBOL(vfs_path_lookup);
*
* The caller must hold base->i_mutex.
*/
-struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
+struct dentry *lookup_one_len2(const char *name, struct vfsmount *mnt, struct dentry *base, int len)
{
struct qstr this;
unsigned int c;
@@ -2497,12 +2514,18 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
return ERR_PTR(err);
}
- err = inode_permission(base->d_inode, MAY_EXEC);
+ err = inode_permission2(mnt, base->d_inode, MAY_EXEC);
if (err)
return ERR_PTR(err);
return __lookup_hash(&this, base, 0);
}
+EXPORT_SYMBOL(lookup_one_len2);
+
+struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
+{
+ return lookup_one_len2(name, NULL, base, len);
+}
EXPORT_SYMBOL(lookup_one_len);
/**
@@ -2805,7 +2828,7 @@ EXPORT_SYMBOL(__check_sticky);
* 11. We don't allow removal of NFS sillyrenamed files; it's handled by
* nfs_async_unlink().
*/
-static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
+static int may_delete(struct vfsmount *mnt, struct inode *dir, struct dentry *victim, bool isdir)
{
struct inode *inode = d_backing_inode(victim);
int error;
@@ -2817,7 +2840,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
BUG_ON(victim->d_parent->d_inode != dir);
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
- error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ error = inode_permission2(mnt, dir, MAY_WRITE | MAY_EXEC);
if (error)
return error;
if (IS_APPEND(dir))
@@ -2849,7 +2872,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
* 4. We should have write and exec permissions on dir
* 5. We can't do it if dir is immutable (done in permission())
*/
-static inline int may_create(struct inode *dir, struct dentry *child)
+static inline int may_create(struct vfsmount *mnt, struct inode *dir, struct dentry *child)
{
struct user_namespace *s_user_ns;
audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
@@ -2861,7 +2884,7 @@ static inline int may_create(struct inode *dir, struct dentry *child)
if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
!kgid_has_mapping(s_user_ns, current_fsgid()))
return -EOVERFLOW;
- return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ return inode_permission2(mnt, dir, MAY_WRITE | MAY_EXEC);
}
/*
@@ -2908,10 +2931,10 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
}
EXPORT_SYMBOL(unlock_rename);
-int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool want_excl)
+int vfs_create2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool want_excl)
{
- int error = may_create(dir, dentry);
+ int error = may_create(mnt, dir, dentry);
if (error)
return error;
@@ -2927,6 +2950,13 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
fsnotify_create(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_create2);
+
+int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool want_excl)
+{
+ return vfs_create2(NULL, dir, dentry, mode, want_excl);
+}
EXPORT_SYMBOL(vfs_create);
bool may_open_dev(const struct path *path)
@@ -2938,6 +2968,7 @@ bool may_open_dev(const struct path *path)
static int may_open(struct path *path, int acc_mode, int flag)
{
struct dentry *dentry = path->dentry;
+ struct vfsmount *mnt = path->mnt;
struct inode *inode = dentry->d_inode;
int error;
@@ -2962,7 +2993,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
break;
}
- error = inode_permission(inode, MAY_OPEN | acc_mode);
+ error = inode_permission2(mnt, inode, MAY_OPEN | acc_mode);
if (error)
return error;
@@ -2997,7 +3028,7 @@ static int handle_truncate(struct file *filp)
if (!error)
error = security_path_truncate(path);
if (!error) {
- error = do_truncate(path->dentry, 0,
+ error = do_truncate2(path->mnt, path->dentry, 0,
ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
filp);
}
@@ -3024,7 +3055,7 @@ static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t m
!kgid_has_mapping(s_user_ns, current_fsgid()))
return -EOVERFLOW;
- error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
+ error = inode_permission2(dir->mnt, dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
if (error)
return error;
@@ -3461,7 +3492,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
goto out;
dir = path.dentry->d_inode;
/* we want directory to be writable */
- error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ error = inode_permission2(nd->path.mnt, dir, MAY_WRITE | MAY_EXEC);
if (error)
goto out2;
if (!dir->i_op->tmpfile) {
@@ -3714,9 +3745,9 @@ inline struct dentry *user_path_create(int dfd, const char __user *pathname,
}
EXPORT_SYMBOL(user_path_create);
-int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+int vfs_mknod2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
- int error = may_create(dir, dentry);
+ int error = may_create(mnt, dir, dentry);
if (error)
return error;
@@ -3740,6 +3771,12 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
fsnotify_create(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_mknod2);
+
+int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+{
+ return vfs_mknod2(NULL, dir, dentry, mode, dev);
+}
EXPORT_SYMBOL(vfs_mknod);
static int may_mknod(umode_t mode)
@@ -3782,12 +3819,12 @@ retry:
goto out;
switch (mode & S_IFMT) {
case 0: case S_IFREG:
- error = vfs_create(path.dentry->d_inode,dentry,mode,true);
+ error = vfs_create2(path.mnt, path.dentry->d_inode,dentry,mode,true);
if (!error)
ima_post_path_mknod(dentry);
break;
case S_IFCHR: case S_IFBLK:
- error = vfs_mknod(path.dentry->d_inode,dentry,mode,
+ error = vfs_mknod2(path.mnt, path.dentry->d_inode,dentry,mode,
new_decode_dev(dev));
break;
case S_IFIFO: case S_IFSOCK:
@@ -3808,9 +3845,9 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
return sys_mknodat(AT_FDCWD, filename, mode, dev);
}
-int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+int vfs_mkdir2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, umode_t mode)
{
- int error = may_create(dir, dentry);
+ int error = may_create(mnt, dir, dentry);
unsigned max_links = dir->i_sb->s_max_links;
if (error)
@@ -3832,6 +3869,12 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
fsnotify_mkdir(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_mkdir2);
+
+int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ return vfs_mkdir2(NULL, dir, dentry, mode);
+}
EXPORT_SYMBOL(vfs_mkdir);
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
@@ -3850,7 +3893,7 @@ retry:
mode &= ~current_umask();
error = security_path_mkdir(&path, dentry, mode);
if (!error)
- error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
+ error = vfs_mkdir2(path.mnt, path.dentry->d_inode, dentry, mode);
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
@@ -3864,9 +3907,9 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
return sys_mkdirat(AT_FDCWD, pathname, mode);
}
-int vfs_rmdir(struct inode *dir, struct dentry *dentry)
+int vfs_rmdir2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry)
{
- int error = may_delete(dir, dentry, 1);
+ int error = may_delete(mnt, dir, dentry, 1);
if (error)
return error;
@@ -3901,6 +3944,12 @@ out:
d_delete(dentry);
return error;
}
+EXPORT_SYMBOL(vfs_rmdir2);
+
+int vfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ return vfs_rmdir2(NULL, dir, dentry);
+}
EXPORT_SYMBOL(vfs_rmdir);
static long do_rmdir(int dfd, const char __user *pathname)
@@ -3946,7 +3995,7 @@ retry:
error = security_path_rmdir(&path, dentry);
if (error)
goto exit3;
- error = vfs_rmdir(path.dentry->d_inode, dentry);
+ error = vfs_rmdir2(path.mnt, path.dentry->d_inode, dentry);
exit3:
dput(dentry);
exit2:
@@ -3985,10 +4034,10 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
* be appropriate for callers that expect the underlying filesystem not
* to be NFS exported.
*/
-int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
+int vfs_unlink2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
{
struct inode *target = dentry->d_inode;
- int error = may_delete(dir, dentry, 0);
+ int error = may_delete(mnt, dir, dentry, 0);
if (error)
return error;
@@ -4023,6 +4072,12 @@ out:
return error;
}
+EXPORT_SYMBOL(vfs_unlink2);
+
+int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
+{
+ return vfs_unlink2(NULL, dir, dentry, delegated_inode);
+}
EXPORT_SYMBOL(vfs_unlink);
/*
@@ -4070,7 +4125,7 @@ retry_deleg:
error = security_path_unlink(&path, dentry);
if (error)
goto exit2;
- error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
+ error = vfs_unlink2(path.mnt, path.dentry->d_inode, dentry, &delegated_inode);
exit2:
dput(dentry);
}
@@ -4120,9 +4175,9 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
return do_unlinkat(AT_FDCWD, pathname);
}
-int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
+int vfs_symlink2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, const char *oldname)
{
- int error = may_create(dir, dentry);
+ int error = may_create(mnt, dir, dentry);
if (error)
return error;
@@ -4139,6 +4194,12 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
fsnotify_create(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_symlink2);
+
+int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
+{
+ return vfs_symlink2(NULL, dir, dentry, oldname);
+}
EXPORT_SYMBOL(vfs_symlink);
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
@@ -4161,7 +4222,7 @@ retry:
error = security_path_symlink(&path, dentry, from->name);
if (!error)
- error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
+ error = vfs_symlink2(path.mnt, path.dentry->d_inode, dentry, from->name);
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
@@ -4196,7 +4257,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
* be appropriate for callers that expect the underlying filesystem not
* to be NFS exported.
*/
-int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
+int vfs_link2(struct vfsmount *mnt, struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
{
struct inode *inode = old_dentry->d_inode;
unsigned max_links = dir->i_sb->s_max_links;
@@ -4205,7 +4266,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
if (!inode)
return -ENOENT;
- error = may_create(dir, new_dentry);
+ error = may_create(mnt, dir, new_dentry);
if (error)
return error;
@@ -4255,6 +4316,12 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
fsnotify_link(dir, inode, new_dentry);
return error;
}
+EXPORT_SYMBOL(vfs_link2);
+
+int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
+{
+ return vfs_link2(NULL, old_dentry, dir, new_dentry, delegated_inode);
+}
EXPORT_SYMBOL(vfs_link);
/*
@@ -4310,7 +4377,7 @@ retry:
error = security_path_link(old_path.dentry, &new_path, new_dentry);
if (error)
goto out_dput;
- error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
+ error = vfs_link2(old_path.mnt, old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
out_dput:
done_path_create(&new_path, new_dentry);
if (delegated_inode) {
@@ -4385,7 +4452,8 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
* ->i_mutex on parents, which works but leads to some truly excessive
* locking].
*/
-int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+int vfs_rename2(struct vfsmount *mnt,
+ struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
struct inode **delegated_inode, unsigned int flags)
{
@@ -4404,19 +4472,19 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (d_real_inode(old_dentry) == d_real_inode(new_dentry))
return 0;
- error = may_delete(old_dir, old_dentry, is_dir);
+ error = may_delete(mnt, old_dir, old_dentry, is_dir);
if (error)
return error;
if (!target) {
- error = may_create(new_dir, new_dentry);
+ error = may_create(mnt, new_dir, new_dentry);
} else {
new_is_dir = d_is_dir(new_dentry);
if (!(flags & RENAME_EXCHANGE))
- error = may_delete(new_dir, new_dentry, is_dir);
+ error = may_delete(mnt, new_dir, new_dentry, is_dir);
else
- error = may_delete(new_dir, new_dentry, new_is_dir);
+ error = may_delete(mnt, new_dir, new_dentry, new_is_dir);
}
if (error)
return error;
@@ -4430,12 +4498,12 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
*/
if (new_dir != old_dir) {
if (is_dir) {
- error = inode_permission(source, MAY_WRITE);
+ error = inode_permission2(mnt, source, MAY_WRITE);
if (error)
return error;
}
if ((flags & RENAME_EXCHANGE) && new_is_dir) {
- error = inode_permission(target, MAY_WRITE);
+ error = inode_permission2(mnt, target, MAY_WRITE);
if (error)
return error;
}
@@ -4512,6 +4580,14 @@ out:
return error;
}
+EXPORT_SYMBOL(vfs_rename2);
+
+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ struct inode **delegated_inode, unsigned int flags)
+{
+ return vfs_rename2(NULL, old_dir, old_dentry, new_dir, new_dentry, delegated_inode, flags);
+}
EXPORT_SYMBOL(vfs_rename);
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
@@ -4625,7 +4701,7 @@ retry_deleg:
&new_path, new_dentry, flags);
if (error)
goto exit5;
- error = vfs_rename(old_path.dentry->d_inode, old_dentry,
+ error = vfs_rename2(old_path.mnt, old_path.dentry->d_inode, old_dentry,
new_path.dentry->d_inode, new_dentry,
&delegated_inode, flags);
exit5:
@@ -4670,7 +4746,7 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna
int vfs_whiteout(struct inode *dir, struct dentry *dentry)
{
- int error = may_create(dir, dentry);
+ int error = may_create(NULL, dir, dentry);
if (error)
return error;
diff --git a/fs/namespace.c b/fs/namespace.c
index 41f906a6f5d9..77b46bf40f09 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -227,6 +227,7 @@ static struct mount *alloc_vfsmnt(const char *name)
mnt->mnt_count = 1;
mnt->mnt_writers = 0;
#endif
+ mnt->mnt.data = NULL;
INIT_HLIST_NODE(&mnt->mnt_hash);
INIT_LIST_HEAD(&mnt->mnt_child);
@@ -581,6 +582,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
static void free_vfsmnt(struct mount *mnt)
{
+ kfree(mnt->mnt.data);
kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
free_percpu(mnt->mnt_pcp);
@@ -984,10 +986,18 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
if (!mnt)
return ERR_PTR(-ENOMEM);
+ if (type->alloc_mnt_data) {
+ mnt->mnt.data = type->alloc_mnt_data();
+ if (!mnt->mnt.data) {
+ mnt_free_id(mnt);
+ free_vfsmnt(mnt);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
if (flags & MS_KERNMOUNT)
mnt->mnt.mnt_flags = MNT_INTERNAL;
- root = mount_fs(type, flags, name, data);
+ root = mount_fs(type, flags, name, &mnt->mnt, data);
if (IS_ERR(root)) {
mnt_free_id(mnt);
free_vfsmnt(mnt);
@@ -1031,6 +1041,14 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
if (!mnt)
return ERR_PTR(-ENOMEM);
+ if (sb->s_op->clone_mnt_data) {
+ mnt->mnt.data = sb->s_op->clone_mnt_data(old->mnt.data);
+ if (!mnt->mnt.data) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ }
+
if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
mnt->mnt_group_id = 0; /* not a peer of original */
else
@@ -2342,8 +2360,14 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
err = change_mount_flags(path->mnt, flags);
else if (!capable(CAP_SYS_ADMIN))
err = -EPERM;
- else
- err = do_remount_sb(sb, flags, data, 0);
+ else {
+ err = do_remount_sb2(path->mnt, sb, flags, data, 0);
+ namespace_lock();
+ lock_mount_hash();
+ propagate_remount(mnt);
+ unlock_mount_hash();
+ namespace_unlock();
+ }
if (!err) {
lock_mount_hash();
mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 3d4602d66068..b5f02f156088 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -316,7 +316,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
if (ret < 0)
goto out_up;
- payload = user_key_payload(rkey);
+ payload = user_key_payload_rcu(rkey);
if (IS_ERR_OR_NULL(payload)) {
ret = PTR_ERR(payload);
goto out_up;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index defc9233e985..67b80004d3f1 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -346,7 +346,7 @@ struct nfs_readdesc {
};
static int
-readpage_async_filler(void *data, struct page *page)
+readpage_async_filler(struct file *data, struct page *page)
{
struct nfs_readdesc *desc = (struct nfs_readdesc *)data;
struct nfs_page *new;
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 2e315f9f2e51..ac1ec8f1e1db 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -2158,8 +2158,8 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree,
pagevec_init(&pvec, 0);
- while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY,
- PAGEVEC_SIZE)) {
+ while (pagevec_lookup_tag(&pvec, btcache, &index,
+ PAGECACHE_TAG_DIRTY)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
bh = head = page_buffers(pvec.pages[i]);
do {
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index f11a3ad2df0c..454ee5254b10 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -257,8 +257,7 @@ int nilfs_copy_dirty_pages(struct address_space *dmap,
pagevec_init(&pvec, 0);
repeat:
- if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY,
- PAGEVEC_SIZE))
+ if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY))
return 0;
for (i = 0; i < pagevec_count(&pvec); i++) {
@@ -377,8 +376,8 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent)
pagevec_init(&pvec, 0);
- while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
- PAGEVEC_SIZE)) {
+ while (pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 36362d4bc344..b0eb58ce6b4f 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -709,18 +709,14 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
pagevec_init(&pvec, 0);
repeat:
if (unlikely(index > last) ||
- !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
- min_t(pgoff_t, last - index,
- PAGEVEC_SIZE - 1) + 1))
+ !pagevec_lookup_range_tag(&pvec, mapping, &index, last,
+ PAGECACHE_TAG_DIRTY))
return ndirties;
for (i = 0; i < pagevec_count(&pvec); i++) {
struct buffer_head *bh, *head;
struct page *page = pvec.pages[i];
- if (unlikely(page->index > last))
- break;
-
lock_page(page);
if (!page_has_buffers(page))
create_empty_buffers(page, i_blocksize(inode), 0);
@@ -757,8 +753,8 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
pagevec_init(&pvec, 0);
- while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
- PAGEVEC_SIZE)) {
+ while (pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
bh = head = page_buffers(pvec.pages[i]);
do {
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 258e8f635148..cef9885de214 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -498,7 +498,7 @@ static int fanotify_find_path(int dfd, const char __user *filename,
}
/* you can only watch an inode if you have read permissions on it */
- ret = inode_permission(path->dentry->d_inode, MAY_READ);
+ ret = inode_permission2(path->mnt, path->dentry->d_inode, MAY_READ);
if (ret)
path_put(path);
out:
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 69d1ea3d292a..4da5c6a1134f 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -337,7 +337,7 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
if (error)
return error;
/* you can only watch an inode if you have read permissions on it */
- error = inode_permission(path->dentry->d_inode, MAY_READ);
+ error = inode_permission2(path->mnt, path->dentry->d_inode, MAY_READ);
if (error)
path_put(path);
return error;
@@ -702,6 +702,8 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
struct fsnotify_group *group;
struct inode *inode;
struct path path;
+ struct path alteredpath;
+ struct path *canonical_path = &path;
struct fd f;
int ret;
unsigned flags = 0;
@@ -741,13 +743,22 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
if (ret)
goto fput_and_out;
+ /* support stacked filesystems */
+ if(path.dentry && path.dentry->d_op) {
+ if (path.dentry->d_op->d_canonical_path) {
+ path.dentry->d_op->d_canonical_path(&path, &alteredpath);
+ canonical_path = &alteredpath;
+ path_put(&path);
+ }
+ }
+
/* inode held in place by reference to path; group by fget on fd */
- inode = path.dentry->d_inode;
+ inode = canonical_path->dentry->d_inode;
group = f.file->private_data;
/* create/update an inode mark */
ret = inotify_update_watch(group, inode, mask);
- path_put(&path);
+ path_put(canonical_path);
fput_and_out:
fdput(f);
return ret;
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 1079fae5aa12..47e08e4f6223 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -88,13 +88,13 @@ struct workqueue_struct *user_dlm_worker;
*/
#define DLMFS_CAPABILITIES "bast stackglue"
static int param_set_dlmfs_capabilities(const char *val,
- struct kernel_param *kp)
+ const struct kernel_param *kp)
{
printk(KERN_ERR "%s: readonly parameter\n", kp->name);
return -EINVAL;
}
static int param_get_dlmfs_capabilities(char *buffer,
- struct kernel_param *kp)
+ const struct kernel_param *kp)
{
return strlcpy(buffer, DLMFS_CAPABILITIES,
strlen(DLMFS_CAPABILITIES) + 1);
diff --git a/fs/open.c b/fs/open.c
index a6c6244f4993..73b7d19129a1 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -34,8 +34,8 @@
#include "internal.h"
-int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
- struct file *filp)
+int do_truncate2(struct vfsmount *mnt, struct dentry *dentry, loff_t length,
+ unsigned int time_attrs, struct file *filp)
{
int ret;
struct iattr newattrs;
@@ -60,18 +60,25 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
inode_lock(dentry->d_inode);
/* Note any delegations or leases have already been broken: */
- ret = notify_change(dentry, &newattrs, NULL);
+ ret = notify_change2(mnt, dentry, &newattrs, NULL);
inode_unlock(dentry->d_inode);
return ret;
}
+int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
+ struct file *filp)
+{
+ return do_truncate2(NULL, dentry, length, time_attrs, filp);
+}
long vfs_truncate(const struct path *path, loff_t length)
{
struct inode *inode;
+ struct vfsmount *mnt;
struct dentry *upperdentry;
long error;
inode = path->dentry->d_inode;
+ mnt = path->mnt;
/* For directories it's -EISDIR, for other non-regulars - -EINVAL */
if (S_ISDIR(inode->i_mode))
@@ -83,7 +90,7 @@ long vfs_truncate(const struct path *path, loff_t length)
if (error)
goto out;
- error = inode_permission(inode, MAY_WRITE);
+ error = inode_permission2(mnt, inode, MAY_WRITE);
if (error)
goto mnt_drop_write_and_out;
@@ -117,7 +124,7 @@ long vfs_truncate(const struct path *path, loff_t length)
if (!error)
error = security_path_truncate(path);
if (!error)
- error = do_truncate(path->dentry, length, 0, NULL);
+ error = do_truncate2(mnt, path->dentry, length, 0, NULL);
put_write_and_out:
put_write_access(upperdentry->d_inode);
@@ -166,6 +173,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
struct inode *inode;
struct dentry *dentry;
+ struct vfsmount *mnt;
struct fd f;
int error;
@@ -182,6 +190,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
small = 0;
dentry = f.file->f_path.dentry;
+ mnt = f.file->f_path.mnt;
inode = dentry->d_inode;
error = -EINVAL;
if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
@@ -201,7 +210,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
if (!error)
error = security_path_truncate(&f.file->f_path);
if (!error)
- error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
+ error = do_truncate2(mnt, dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
sb_end_write(inode->i_sb);
out_putf:
fdput(f);
@@ -357,6 +366,7 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
struct cred *override_cred;
struct path path;
struct inode *inode;
+ struct vfsmount *mnt;
int res;
unsigned int lookup_flags = LOOKUP_FOLLOW;
@@ -387,6 +397,7 @@ retry:
goto out;
inode = d_backing_inode(path.dentry);
+ mnt = path.mnt;
if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
/*
@@ -398,7 +409,7 @@ retry:
goto out_path_release;
}
- res = inode_permission(inode, mode | MAY_ACCESS);
+ res = inode_permission2(mnt, inode, mode | MAY_ACCESS);
/* SuS v2 requires we report a read only fs too */
if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
goto out_path_release;
@@ -442,7 +453,7 @@ retry:
if (error)
goto out;
- error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+ error = inode_permission2(path.mnt, path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
if (error)
goto dput_and_out;
@@ -462,6 +473,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
{
struct fd f = fdget_raw(fd);
struct inode *inode;
+ struct vfsmount *mnt;
int error = -EBADF;
error = -EBADF;
@@ -469,12 +481,13 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
goto out;
inode = file_inode(f.file);
+ mnt = f.file->f_path.mnt;
error = -ENOTDIR;
if (!S_ISDIR(inode->i_mode))
goto out_putf;
- error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
+ error = inode_permission2(mnt, inode, MAY_EXEC | MAY_CHDIR);
if (!error)
set_fs_pwd(current->fs, &f.file->f_path);
out_putf:
@@ -493,7 +506,7 @@ retry:
if (error)
goto out;
- error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+ error = inode_permission2(path.mnt, path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
if (error)
goto dput_and_out;
@@ -533,7 +546,7 @@ retry_deleg:
goto out_unlock;
newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
- error = notify_change(path->dentry, &newattrs, &delegated_inode);
+ error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode);
out_unlock:
inode_unlock(inode);
if (delegated_inode) {
@@ -613,7 +626,7 @@ retry_deleg:
inode_lock(inode);
error = security_path_chown(path, uid, gid);
if (!error)
- error = notify_change(path->dentry, &newattrs, &delegated_inode);
+ error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode);
inode_unlock(inode);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
diff --git a/fs/pnode.c b/fs/pnode.c
index d15c63e97ef1..ddb846f878b8 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -609,3 +609,37 @@ int propagate_umount(struct list_head *list)
return 0;
}
+
+/*
+ * Iterates over all slaves, and slaves of slaves.
+ */
+static struct mount *next_descendent(struct mount *root, struct mount *cur)
+{
+ if (!IS_MNT_NEW(cur) && !list_empty(&cur->mnt_slave_list))
+ return first_slave(cur);
+ do {
+ struct mount *master = cur->mnt_master;
+
+ if (!master || cur->mnt_slave.next != &master->mnt_slave_list) {
+ struct mount *next = next_slave(cur);
+
+ return (next == root) ? NULL : next;
+ }
+ cur = master;
+ } while (cur != root);
+ return NULL;
+}
+
+void propagate_remount(struct mount *mnt)
+{
+ struct mount *m = mnt;
+ struct super_block *sb = mnt->mnt.mnt_sb;
+
+ if (sb->s_op->copy_mnt_data) {
+ m = next_descendent(mnt, m);
+ while (m) {
+ sb->s_op->copy_mnt_data(m->mnt.data, mnt->mnt.data);
+ m = next_descendent(mnt, m);
+ }
+ }
+}
diff --git a/fs/pnode.h b/fs/pnode.h
index dc87e65becd2..a9a6576540ad 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -44,6 +44,7 @@ int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
int propagate_umount(struct list_head *);
int propagate_mount_busy(struct mount *, int);
void propagate_mount_unlock(struct mount *);
+void propagate_remount(struct mount *);
void mnt_release_group_id(struct mount *);
int get_dominating_id(struct mount *mnt, const struct path *root);
unsigned int mnt_get_count(struct mount *mnt);
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 1ade1206bb89..08dce22afec1 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -81,3 +81,10 @@ config PROC_CHILDREN
Say Y if you are running any user-space software which takes benefit from
this interface. For example, rkt is such a piece of software.
+
+config PROC_UID
+ bool "Include /proc/uid/ files"
+ default y
+ depends on PROC_FS && RT_MUTEXES
+ help
+ Provides aggregated per-uid information under /proc/uid.
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 12c6922c913c..dea53bac412e 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -25,6 +25,7 @@ proc-y += softirqs.o
proc-y += namespaces.o
proc-y += self.o
proc-y += thread_self.o
+proc-$(CONFIG_PROC_UID) += uid.o
proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
proc-$(CONFIG_NET) += proc_net.o
proc-$(CONFIG_PROC_KCORE) += kcore.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 79702d405ba7..d5d356d770ed 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -87,6 +87,7 @@
#include <linux/slab.h>
#include <linux/flex_array.h>
#include <linux/posix-timers.h>
+#include <linux/cpufreq_times.h>
#ifdef CONFIG_HARDWALL
#include <asm/hardwall.h>
#endif
@@ -2929,6 +2930,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
REG("smaps", S_IRUGO, proc_pid_smaps_operations),
+ REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
@@ -2981,6 +2983,9 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("timers", S_IRUGO, proc_timers_operations),
#endif
REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
+#ifdef CONFIG_CPU_FREQ_TIMES
+ ONE("time_in_state", 0444, proc_time_in_state_show),
+#endif
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3320,6 +3325,7 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
REG("smaps", S_IRUGO, proc_tid_smaps_operations),
+ REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
@@ -3365,6 +3371,9 @@ static const struct pid_entry tid_base_stuff[] = {
REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
+#ifdef CONFIG_CPU_FREQ_TIMES
+ ONE("time_in_state", 0444, proc_time_in_state_show),
+#endif
};
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index c2afe39f0b9e..4298a3970976 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -457,17 +457,12 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
return inode;
}
-int proc_fill_super(struct super_block *s, void *data, int silent)
+int proc_fill_super(struct super_block *s)
{
- struct pid_namespace *ns = get_pid_ns(s->s_fs_info);
struct inode *root_inode;
int ret;
- if (!proc_parse_options(data, ns))
- return -EINVAL;
-
- /* User space would break if executables or devices appear on proc */
- s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
+ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NODEV;
s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index c0bdeceaaeb6..fbe699e439b4 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -213,7 +213,7 @@ extern const struct inode_operations proc_pid_link_inode_operations;
extern void proc_init_inodecache(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
-extern int proc_fill_super(struct super_block *, void *data, int flags);
+extern int proc_fill_super(struct super_block *);
extern void proc_entry_rundown(struct proc_dir_entry *);
/*
@@ -259,6 +259,15 @@ static inline void proc_sys_evict_inode(struct inode *inode,
#endif
/*
+ * uid.c
+ */
+#ifdef CONFIG_PROC_UID
+extern int proc_uid_init(void);
+#else
+static inline void proc_uid_init(void) { }
+#endif
+
+/*
* proc_tty.c
*/
#ifdef CONFIG_TTY
@@ -271,7 +280,6 @@ static inline void proc_tty_init(void) {}
* root.c
*/
extern struct proc_dir_entry proc_root;
-extern int proc_parse_options(char *options, struct pid_namespace *pid);
extern void proc_self_init(void);
extern int proc_remount(struct super_block *, int *, char *);
@@ -279,10 +287,12 @@ extern int proc_remount(struct super_block *, int *, char *);
/*
* task_[no]mmu.c
*/
+struct mem_size_stats;
struct proc_maps_private {
struct inode *inode;
struct task_struct *task;
struct mm_struct *mm;
+ struct mem_size_stats *rollup;
#ifdef CONFIG_MMU
struct vm_area_struct *tail_vma;
#endif
@@ -298,6 +308,7 @@ extern const struct file_operations proc_tid_maps_operations;
extern const struct file_operations proc_pid_numa_maps_operations;
extern const struct file_operations proc_tid_numa_maps_operations;
extern const struct file_operations proc_pid_smaps_operations;
+extern const struct file_operations proc_pid_smaps_rollup_operations;
extern const struct file_operations proc_tid_smaps_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 8d3e484055a6..1d68fcd9313f 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -23,6 +23,21 @@
#include "internal.h"
+static int proc_test_super(struct super_block *sb, void *data)
+{
+ return sb->s_fs_info == data;
+}
+
+static int proc_set_super(struct super_block *sb, void *data)
+{
+ int err = set_anon_super(sb, NULL);
+ if (!err) {
+ struct pid_namespace *ns = (struct pid_namespace *)data;
+ sb->s_fs_info = get_pid_ns(ns);
+ }
+ return err;
+}
+
enum {
Opt_gid, Opt_hidepid, Opt_err,
};
@@ -33,7 +48,7 @@ static const match_table_t tokens = {
{Opt_err, NULL},
};
-int proc_parse_options(char *options, struct pid_namespace *pid)
+static int proc_parse_options(char *options, struct pid_namespace *pid)
{
char *p;
substring_t args[MAX_OPT_ARGS];
@@ -85,16 +100,45 @@ int proc_remount(struct super_block *sb, int *flags, char *data)
static struct dentry *proc_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
+ int err;
+ struct super_block *sb;
struct pid_namespace *ns;
+ char *options;
if (flags & MS_KERNMOUNT) {
- ns = data;
- data = NULL;
+ ns = (struct pid_namespace *)data;
+ options = NULL;
} else {
ns = task_active_pid_ns(current);
+ options = data;
+
+ /* Does the mounter have privilege over the pid namespace? */
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
}
- return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super);
+ sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns);
+ if (IS_ERR(sb))
+ return ERR_CAST(sb);
+
+ if (!proc_parse_options(options, ns)) {
+ deactivate_locked_super(sb);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (!sb->s_root) {
+ err = proc_fill_super(sb);
+ if (err) {
+ deactivate_locked_super(sb);
+ return ERR_PTR(err);
+ }
+
+ sb->s_flags |= MS_ACTIVE;
+ /* User space would break if executables appear on proc */
+ sb->s_iflags |= SB_I_NOEXEC;
+ }
+
+ return dget(sb->s_root);
}
static void proc_kill_sb(struct super_block *sb)
@@ -131,7 +175,7 @@ void __init proc_root_init(void)
proc_symlink("mounts", NULL, "self/mounts");
proc_net_init();
-
+ proc_uid_init();
#ifdef CONFIG_SYSVIPC
proc_mkdir("sysvipc", NULL);
#endif
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5138e781737a..f8ea3d6298cf 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -127,6 +127,56 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
}
#endif
+static void seq_print_vma_name(struct seq_file *m, struct vm_area_struct *vma)
+{
+ const char __user *name = vma_get_anon_name(vma);
+ struct mm_struct *mm = vma->vm_mm;
+
+ unsigned long page_start_vaddr;
+ unsigned long page_offset;
+ unsigned long num_pages;
+ unsigned long max_len = NAME_MAX;
+ int i;
+
+ page_start_vaddr = (unsigned long)name & PAGE_MASK;
+ page_offset = (unsigned long)name - page_start_vaddr;
+ num_pages = DIV_ROUND_UP(page_offset + max_len, PAGE_SIZE);
+
+ seq_puts(m, "[anon:");
+
+ for (i = 0; i < num_pages; i++) {
+ int len;
+ int write_len;
+ const char *kaddr;
+ long pages_pinned;
+ struct page *page;
+
+ pages_pinned = get_user_pages_remote(current, mm,
+ page_start_vaddr, 1, 0, &page, NULL);
+ if (pages_pinned < 1) {
+ seq_puts(m, "<fault>]");
+ return;
+ }
+
+ kaddr = (const char *)kmap(page);
+ len = min(max_len, PAGE_SIZE - page_offset);
+ write_len = strnlen(kaddr + page_offset, len);
+ seq_write(m, kaddr + page_offset, write_len);
+ kunmap(page);
+ put_page(page);
+
+ /* if strnlen hit a null terminator then we're done */
+ if (write_len != len)
+ break;
+
+ max_len -= len;
+ page_offset = 0;
+ page_start_vaddr += PAGE_SIZE;
+ }
+
+ seq_putc(m, ']');
+}
+
static void vma_stop(struct proc_maps_private *priv)
{
struct mm_struct *mm = priv->mm;
@@ -251,6 +301,7 @@ static int proc_map_release(struct inode *inode, struct file *file)
if (priv->mm)
mmdrop(priv->mm);
+ kfree(priv->rollup);
return seq_release_private(inode, file);
}
@@ -277,6 +328,23 @@ static int is_stack(struct proc_maps_private *priv,
vma->vm_end >= vma->vm_mm->start_stack;
}
+static void show_vma_header_prefix(struct seq_file *m,
+ unsigned long start, unsigned long end,
+ vm_flags_t flags, unsigned long long pgoff,
+ dev_t dev, unsigned long ino)
+{
+ seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
+ seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
+ start,
+ end,
+ flags & VM_READ ? 'r' : '-',
+ flags & VM_WRITE ? 'w' : '-',
+ flags & VM_EXEC ? 'x' : '-',
+ flags & VM_MAYSHARE ? 's' : 'p',
+ pgoff,
+ MAJOR(dev), MINOR(dev), ino);
+}
+
static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
{
@@ -300,17 +368,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
/* We don't show the stack guard page in /proc/maps */
start = vma->vm_start;
end = vma->vm_end;
-
- seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
- seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
- start,
- end,
- flags & VM_READ ? 'r' : '-',
- flags & VM_WRITE ? 'w' : '-',
- flags & VM_EXEC ? 'x' : '-',
- flags & VM_MAYSHARE ? 's' : 'p',
- pgoff,
- MAJOR(dev), MINOR(dev), ino);
+ show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
/*
* Print the dentry name for named mappings, and a
@@ -341,8 +399,15 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
goto done;
}
- if (is_stack(priv, vma))
+ if (is_stack(priv, vma)) {
name = "[stack]";
+ goto done;
+ }
+
+ if (vma_get_anon_name(vma)) {
+ seq_pad(m, ' ');
+ seq_print_vma_name(m, vma);
+ }
}
done:
@@ -429,6 +494,7 @@ const struct file_operations proc_tid_maps_operations = {
#ifdef CONFIG_PROC_PAGE_MONITOR
struct mem_size_stats {
+ bool first;
unsigned long resident;
unsigned long shared_clean;
unsigned long shared_dirty;
@@ -441,7 +507,9 @@ struct mem_size_stats {
unsigned long swap;
unsigned long shared_hugetlb;
unsigned long private_hugetlb;
+ unsigned long first_vma_start;
u64 pss;
+ u64 pss_locked;
u64 swap_pss;
bool check_shmem_swap;
};
@@ -714,20 +782,40 @@ void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
static int show_smap(struct seq_file *m, void *v, int is_pid)
{
+ struct proc_maps_private *priv = m->private;
struct vm_area_struct *vma = v;
- struct mem_size_stats mss;
+ struct mem_size_stats mss_stack;
+ struct mem_size_stats *mss;
struct mm_walk smaps_walk = {
.pmd_entry = smaps_pte_range,
#ifdef CONFIG_HUGETLB_PAGE
.hugetlb_entry = smaps_hugetlb_range,
#endif
.mm = vma->vm_mm,
- .private = &mss,
};
+ int ret = 0;
+ bool rollup_mode;
+ bool last_vma;
+
+ if (priv->rollup) {
+ rollup_mode = true;
+ mss = priv->rollup;
+ if (mss->first) {
+ mss->first_vma_start = vma->vm_start;
+ mss->first = false;
+ }
+ last_vma = !m_next_vma(priv, vma);
+ } else {
+ rollup_mode = false;
+ memset(&mss_stack, 0, sizeof(mss_stack));
+ mss = &mss_stack;
+ }
- memset(&mss, 0, sizeof mss);
+ smaps_walk.private = mss;
#ifdef CONFIG_SHMEM
+ /* In case of smaps_rollup, reset the value from previous vma */
+ mss->check_shmem_swap = false;
if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
/*
* For shared or readonly shmem mappings we know that all
@@ -743,9 +831,9 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
!(vma->vm_flags & VM_WRITE)) {
- mss.swap = shmem_swapped;
+ mss->swap += shmem_swapped;
} else {
- mss.check_shmem_swap = true;
+ mss->check_shmem_swap = true;
smaps_walk.pte_hole = smaps_pte_hole;
}
}
@@ -753,52 +841,76 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
/* mmap_sem is held in m_start */
walk_page_vma(vma, &smaps_walk);
+ if (vma->vm_flags & VM_LOCKED)
+ mss->pss_locked += mss->pss;
+
+ if (!rollup_mode) {
+ show_map_vma(m, vma, is_pid);
+ } else if (last_vma) {
+ show_vma_header_prefix(
+ m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0);
+ seq_pad(m, ' ');
+ seq_puts(m, "[rollup]\n");
+ } else {
+ ret = SEQ_SKIP;
+ }
- show_map_vma(m, vma, is_pid);
+ if (!rollup_mode && vma_get_anon_name(vma)) {
+ seq_puts(m, "Name: ");
+ seq_print_vma_name(m, vma);
+ seq_putc(m, '\n');
+ }
+
+ if (!rollup_mode)
+ seq_printf(m,
+ "Size: %8lu kB\n"
+ "KernelPageSize: %8lu kB\n"
+ "MMUPageSize: %8lu kB\n",
+ (vma->vm_end - vma->vm_start) >> 10,
+ vma_kernel_pagesize(vma) >> 10,
+ vma_mmu_pagesize(vma) >> 10);
+
+ if (!rollup_mode || last_vma)
+ seq_printf(m,
+ "Rss: %8lu kB\n"
+ "Pss: %8lu kB\n"
+ "Shared_Clean: %8lu kB\n"
+ "Shared_Dirty: %8lu kB\n"
+ "Private_Clean: %8lu kB\n"
+ "Private_Dirty: %8lu kB\n"
+ "Referenced: %8lu kB\n"
+ "Anonymous: %8lu kB\n"
+ "AnonHugePages: %8lu kB\n"
+ "ShmemPmdMapped: %8lu kB\n"
+ "Shared_Hugetlb: %8lu kB\n"
+ "Private_Hugetlb: %7lu kB\n"
+ "Swap: %8lu kB\n"
+ "SwapPss: %8lu kB\n"
+ "Locked: %8lu kB\n",
+ mss->resident >> 10,
+ (unsigned long)(mss->pss >> (10 + PSS_SHIFT)),
+ mss->shared_clean >> 10,
+ mss->shared_dirty >> 10,
+ mss->private_clean >> 10,
+ mss->private_dirty >> 10,
+ mss->referenced >> 10,
+ mss->anonymous >> 10,
+ mss->anonymous_thp >> 10,
+ mss->shmem_thp >> 10,
+ mss->shared_hugetlb >> 10,
+ mss->private_hugetlb >> 10,
+ mss->swap >> 10,
+ (unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)),
+ (unsigned long)(mss->pss_locked >> (10 + PSS_SHIFT)));
+
+
+ if (!rollup_mode) {
+ arch_show_smap(m, vma);
+ show_smap_vma_flags(m, vma);
+ }
- seq_printf(m,
- "Size: %8lu kB\n"
- "Rss: %8lu kB\n"
- "Pss: %8lu kB\n"
- "Shared_Clean: %8lu kB\n"
- "Shared_Dirty: %8lu kB\n"
- "Private_Clean: %8lu kB\n"
- "Private_Dirty: %8lu kB\n"
- "Referenced: %8lu kB\n"
- "Anonymous: %8lu kB\n"
- "AnonHugePages: %8lu kB\n"
- "ShmemPmdMapped: %8lu kB\n"
- "Shared_Hugetlb: %8lu kB\n"
- "Private_Hugetlb: %7lu kB\n"
- "Swap: %8lu kB\n"
- "SwapPss: %8lu kB\n"
- "KernelPageSize: %8lu kB\n"
- "MMUPageSize: %8lu kB\n"
- "Locked: %8lu kB\n",
- (vma->vm_end - vma->vm_start) >> 10,
- mss.resident >> 10,
- (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
- mss.shared_clean >> 10,
- mss.shared_dirty >> 10,
- mss.private_clean >> 10,
- mss.private_dirty >> 10,
- mss.referenced >> 10,
- mss.anonymous >> 10,
- mss.anonymous_thp >> 10,
- mss.shmem_thp >> 10,
- mss.shared_hugetlb >> 10,
- mss.private_hugetlb >> 10,
- mss.swap >> 10,
- (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
- vma_kernel_pagesize(vma) >> 10,
- vma_mmu_pagesize(vma) >> 10,
- (vma->vm_flags & VM_LOCKED) ?
- (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
-
- arch_show_smap(m, vma);
- show_smap_vma_flags(m, vma);
m_cache_vma(m, vma);
- return 0;
+ return ret;
}
static int show_pid_smap(struct seq_file *m, void *v)
@@ -830,6 +942,25 @@ static int pid_smaps_open(struct inode *inode, struct file *file)
return do_maps_open(inode, file, &proc_pid_smaps_op);
}
+static int pid_smaps_rollup_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ struct proc_maps_private *priv;
+ int ret = do_maps_open(inode, file, &proc_pid_smaps_op);
+
+ if (ret < 0)
+ return ret;
+ seq = file->private_data;
+ priv = seq->private;
+ priv->rollup = kzalloc(sizeof(*priv->rollup), GFP_KERNEL);
+ if (!priv->rollup) {
+ proc_map_release(inode, file);
+ return -ENOMEM;
+ }
+ priv->rollup->first = true;
+ return 0;
+}
+
static int tid_smaps_open(struct inode *inode, struct file *file)
{
return do_maps_open(inode, file, &proc_tid_smaps_op);
@@ -842,6 +973,13 @@ const struct file_operations proc_pid_smaps_operations = {
.release = proc_map_release,
};
+const struct file_operations proc_pid_smaps_rollup_operations = {
+ .open = pid_smaps_rollup_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_map_release,
+};
+
const struct file_operations proc_tid_smaps_operations = {
.open = tid_smaps_open,
.read = seq_read,
diff --git a/fs/proc/uid.c b/fs/proc/uid.c
new file mode 100644
index 000000000000..b2bb08509212
--- /dev/null
+++ b/fs/proc/uid.c
@@ -0,0 +1,295 @@
+/*
+ * /proc/uid support
+ */
+
+#include <linux/cpufreq_times.h>
+#include <linux/fs.h>
+#include <linux/hashtable.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/rtmutex.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+static struct proc_dir_entry *proc_uid;
+
+#define UID_HASH_BITS 10
+
+static DECLARE_HASHTABLE(proc_uid_hash_table, UID_HASH_BITS);
+
+/*
+ * use rt_mutex here to avoid priority inversion between high-priority readers
+ * of these files and tasks calling proc_register_uid().
+ */
+static DEFINE_RT_MUTEX(proc_uid_lock); /* proc_uid_hash_table */
+
+struct uid_hash_entry {
+ uid_t uid;
+ struct hlist_node hash;
+};
+
+/* Caller must hold proc_uid_lock */
+static bool uid_hash_entry_exists_locked(uid_t uid)
+{
+ struct uid_hash_entry *entry;
+
+ hash_for_each_possible(proc_uid_hash_table, entry, hash, uid) {
+ if (entry->uid == uid)
+ return true;
+ }
+ return false;
+}
+
+void proc_register_uid(kuid_t kuid)
+{
+ struct uid_hash_entry *entry;
+ bool exists;
+ uid_t uid = from_kuid_munged(current_user_ns(), kuid);
+
+ rt_mutex_lock(&proc_uid_lock);
+ exists = uid_hash_entry_exists_locked(uid);
+ rt_mutex_unlock(&proc_uid_lock);
+ if (exists)
+ return;
+
+ entry = kzalloc(sizeof(struct uid_hash_entry), GFP_KERNEL);
+ if (!entry)
+ return;
+ entry->uid = uid;
+
+ rt_mutex_lock(&proc_uid_lock);
+ if (uid_hash_entry_exists_locked(uid))
+ kfree(entry);
+ else
+ hash_add(proc_uid_hash_table, &entry->hash, uid);
+ rt_mutex_unlock(&proc_uid_lock);
+}
+
+struct uid_entry {
+ const char *name;
+ int len;
+ umode_t mode;
+ const struct inode_operations *iop;
+ const struct file_operations *fop;
+};
+
+#define NOD(NAME, MODE, IOP, FOP) { \
+ .name = (NAME), \
+ .len = sizeof(NAME) - 1, \
+ .mode = MODE, \
+ .iop = IOP, \
+ .fop = FOP, \
+}
+
+#ifdef CONFIG_CPU_FREQ_TIMES
+static const struct file_operations proc_uid_time_in_state_operations = {
+ .open = single_uid_time_in_state_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif
+
+static const struct uid_entry uid_base_stuff[] = {
+#ifdef CONFIG_CPU_FREQ_TIMES
+ NOD("time_in_state", 0444, NULL, &proc_uid_time_in_state_operations),
+#endif
+};
+
+static const struct inode_operations proc_uid_def_inode_operations = {
+ .setattr = proc_setattr,
+};
+
+static struct inode *proc_uid_make_inode(struct super_block *sb, kuid_t kuid)
+{
+ struct inode *inode;
+
+ inode = new_inode(sb);
+ if (!inode)
+ return NULL;
+
+ inode->i_ino = get_next_ino();
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_op = &proc_uid_def_inode_operations;
+ inode->i_uid = kuid;
+
+ return inode;
+}
+
+static int proc_uident_instantiate(struct inode *dir, struct dentry *dentry,
+ struct task_struct *unused, const void *ptr)
+{
+ const struct uid_entry *u = ptr;
+ struct inode *inode;
+
+ inode = proc_uid_make_inode(dir->i_sb, dir->i_uid);
+ if (!inode)
+ return -ENOENT;
+
+ inode->i_mode = u->mode;
+ if (S_ISDIR(inode->i_mode))
+ set_nlink(inode, 2);
+ if (u->iop)
+ inode->i_op = u->iop;
+ if (u->fop)
+ inode->i_fop = u->fop;
+ d_add(dentry, inode);
+ return 0;
+}
+
+static struct dentry *proc_uid_base_lookup(struct inode *dir,
+ struct dentry *dentry,
+ unsigned int flags)
+{
+ const struct uid_entry *u, *last;
+ unsigned int nents = ARRAY_SIZE(uid_base_stuff);
+
+ if (nents == 0)
+ return ERR_PTR(-ENOENT);
+
+ last = &uid_base_stuff[nents - 1];
+ for (u = uid_base_stuff; u <= last; u++) {
+ if (u->len != dentry->d_name.len)
+ continue;
+ if (!memcmp(dentry->d_name.name, u->name, u->len))
+ break;
+ }
+ if (u > last)
+ return ERR_PTR(-ENOENT);
+
+ return ERR_PTR(proc_uident_instantiate(dir, dentry, NULL, u));
+}
+
+static int proc_uid_base_readdir(struct file *file, struct dir_context *ctx)
+{
+ unsigned int nents = ARRAY_SIZE(uid_base_stuff);
+ const struct uid_entry *u;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ if (ctx->pos >= nents + 2)
+ return 0;
+
+ for (u = uid_base_stuff + (ctx->pos - 2);
+ u < uid_base_stuff + nents; u++) {
+ if (!proc_fill_cache(file, ctx, u->name, u->len,
+ proc_uident_instantiate, NULL, u))
+ break;
+ ctx->pos++;
+ }
+
+ return 0;
+}
+
+static const struct inode_operations proc_uid_base_inode_operations = {
+ .lookup = proc_uid_base_lookup,
+ .setattr = proc_setattr,
+};
+
+static const struct file_operations proc_uid_base_operations = {
+ .read = generic_read_dir,
+ .iterate = proc_uid_base_readdir,
+ .llseek = default_llseek,
+};
+
+static int proc_uid_instantiate(struct inode *dir, struct dentry *dentry,
+ struct task_struct *unused, const void *ptr)
+{
+ unsigned int i, len;
+ nlink_t nlinks;
+ kuid_t *kuid = (kuid_t *)ptr;
+ struct inode *inode = proc_uid_make_inode(dir->i_sb, *kuid);
+
+ if (!inode)
+ return -ENOENT;
+
+ inode->i_mode = S_IFDIR | 0555;
+ inode->i_op = &proc_uid_base_inode_operations;
+ inode->i_fop = &proc_uid_base_operations;
+ inode->i_flags |= S_IMMUTABLE;
+
+ nlinks = 2;
+ len = ARRAY_SIZE(uid_base_stuff);
+ for (i = 0; i < len; ++i) {
+ if (S_ISDIR(uid_base_stuff[i].mode))
+ ++nlinks;
+ }
+ set_nlink(inode, nlinks);
+
+ d_add(dentry, inode);
+
+ return 0;
+}
+
+static int proc_uid_readdir(struct file *file, struct dir_context *ctx)
+{
+ int last_shown, i;
+ unsigned long bkt;
+ struct uid_hash_entry *entry;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ i = 0;
+ last_shown = ctx->pos - 2;
+ rt_mutex_lock(&proc_uid_lock);
+ hash_for_each(proc_uid_hash_table, bkt, entry, hash) {
+ int len;
+ char buf[PROC_NUMBUF];
+
+ if (i < last_shown)
+ continue;
+ len = snprintf(buf, sizeof(buf), "%u", entry->uid);
+ if (!proc_fill_cache(file, ctx, buf, len,
+ proc_uid_instantiate, NULL, &entry->uid))
+ break;
+ i++;
+ ctx->pos++;
+ }
+ rt_mutex_unlock(&proc_uid_lock);
+ return 0;
+}
+
+static struct dentry *proc_uid_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ int result = -ENOENT;
+
+ uid_t uid = name_to_int(&dentry->d_name);
+ bool uid_exists;
+
+ rt_mutex_lock(&proc_uid_lock);
+ uid_exists = uid_hash_entry_exists_locked(uid);
+ rt_mutex_unlock(&proc_uid_lock);
+ if (uid_exists) {
+ kuid_t kuid = make_kuid(current_user_ns(), uid);
+
+ result = proc_uid_instantiate(dir, dentry, NULL, &kuid);
+ }
+ return ERR_PTR(result);
+}
+
+static const struct file_operations proc_uid_operations = {
+ .read = generic_read_dir,
+ .iterate = proc_uid_readdir,
+ .llseek = default_llseek,
+};
+
+static const struct inode_operations proc_uid_inode_operations = {
+ .lookup = proc_uid_lookup,
+ .setattr = proc_setattr,
+};
+
+int __init proc_uid_init(void)
+{
+ proc_uid = proc_mkdir("uid", NULL);
+ if (!proc_uid)
+ return -ENOMEM;
+ proc_uid->proc_iops = &proc_uid_inode_operations;
+ proc_uid->proc_fops = &proc_uid_operations;
+
+ return 0;
+}
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 3f1190d18991..6863773aff25 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -118,7 +118,9 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
if (err)
goto out;
show_mnt_opts(m, mnt);
- if (sb->s_op->show_options)
+ if (sb->s_op->show_options2)
+ err = sb->s_op->show_options2(mnt, m, mnt_path.dentry);
+ else if (sb->s_op->show_options)
err = sb->s_op->show_options(m, mnt_path.dentry);
seq_puts(m, " 0 0\n");
out:
@@ -180,7 +182,9 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
err = show_sb_opts(m, sb);
if (err)
goto out;
- if (sb->s_op->show_options)
+ if (sb->s_op->show_options2) {
+ err = sb->s_op->show_options2(mnt, m, mnt->mnt_root);
+ } else if (sb->s_op->show_options)
err = sb->s_op->show_options(m, mnt->mnt_root);
seq_putc(m, '\n');
out:
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 8b09271e5d66..8e151fb9bb76 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -550,6 +550,12 @@ static int ramoops_parse_dt(struct platform_device *pdev,
return 0;
}
+void notrace ramoops_console_write_buf(const char *buf, size_t size)
+{
+ struct ramoops_context *cxt = &oops_cxt;
+ persistent_ram_write(cxt->cprz, buf, size);
+}
+
static int ramoops_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
diff --git a/fs/read_write.c b/fs/read_write.c
index ba280596ec78..2f1027d708d2 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -23,9 +23,6 @@
#include <asm/uaccess.h>
#include <asm/unistd.h>
-typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
-typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *);
-
const struct file_operations generic_ro_fops = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
@@ -675,7 +672,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
EXPORT_SYMBOL(iov_shorten);
static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
- loff_t *ppos, iter_fn_t fn, int flags)
+ loff_t *ppos, int type, int flags)
{
struct kiocb kiocb;
ssize_t ret;
@@ -692,7 +689,10 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
kiocb.ki_pos = *ppos;
- ret = fn(&kiocb, iter);
+ if (type == READ)
+ ret = filp->f_op->read_iter(&kiocb, iter);
+ else
+ ret = filp->f_op->write_iter(&kiocb, iter);
BUG_ON(ret == -EIOCBQUEUED);
*ppos = kiocb.ki_pos;
return ret;
@@ -700,7 +700,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
/* Do it by hand, with file-ops */
static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
- loff_t *ppos, io_fn_t fn, int flags)
+ loff_t *ppos, int type, int flags)
{
ssize_t ret = 0;
@@ -711,7 +711,13 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
struct iovec iovec = iov_iter_iovec(iter);
ssize_t nr;
- nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos);
+ if (type == READ) {
+ nr = filp->f_op->read(filp, iovec.iov_base,
+ iovec.iov_len, ppos);
+ } else {
+ nr = filp->f_op->write(filp, iovec.iov_base,
+ iovec.iov_len, ppos);
+ }
if (nr < 0) {
if (!ret)
@@ -844,8 +850,6 @@ static ssize_t do_readv_writev(int type, struct file *file,
struct iovec *iov = iovstack;
struct iov_iter iter;
ssize_t ret;
- io_fn_t fn;
- iter_fn_t iter_fn;
ret = import_iovec(type, uvector, nr_segs,
ARRAY_SIZE(iovstack), &iov, &iter);
@@ -859,19 +863,14 @@ static ssize_t do_readv_writev(int type, struct file *file,
if (ret < 0)
goto out;
- if (type == READ) {
- fn = file->f_op->read;
- iter_fn = file->f_op->read_iter;
- } else {
- fn = (io_fn_t)file->f_op->write;
- iter_fn = file->f_op->write_iter;
+ if (type != READ)
file_start_write(file);
- }
- if (iter_fn)
- ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags);
+ if ((type == READ && file->f_op->read_iter) ||
+ (type == WRITE && file->f_op->write_iter))
+ ret = do_iter_readv_writev(file, &iter, pos, type, flags);
else
- ret = do_loop_readv_writev(file, &iter, pos, fn, flags);
+ ret = do_loop_readv_writev(file, &iter, pos, type, flags);
if (type != READ)
file_end_write(file);
@@ -1069,8 +1068,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
struct iovec *iov = iovstack;
struct iov_iter iter;
ssize_t ret;
- io_fn_t fn;
- iter_fn_t iter_fn;
ret = compat_import_iovec(type, uvector, nr_segs,
UIO_FASTIOV, &iov, &iter);
@@ -1084,19 +1081,14 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
if (ret < 0)
goto out;
- if (type == READ) {
- fn = file->f_op->read;
- iter_fn = file->f_op->read_iter;
- } else {
- fn = (io_fn_t)file->f_op->write;
- iter_fn = file->f_op->write_iter;
+ if (type != READ)
file_start_write(file);
- }
- if (iter_fn)
- ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags);
+ if ((type == READ && file->f_op->read_iter) ||
+ (type == WRITE && file->f_op->write_iter))
+ ret = do_iter_readv_writev(file, &iter, pos, type, flags);
else
- ret = do_loop_readv_writev(file, &iter, pos, fn, flags);
+ ret = do_loop_readv_writev(file, &iter, pos, type, flags);
if (type != READ)
file_end_write(file);
diff --git a/fs/sdcardfs/Kconfig b/fs/sdcardfs/Kconfig
new file mode 100644
index 000000000000..a1c103316ac7
--- /dev/null
+++ b/fs/sdcardfs/Kconfig
@@ -0,0 +1,13 @@
+config SDCARD_FS
+ tristate "sdcard file system"
+ depends on CONFIGFS_FS
+ default n
+ help
+ Sdcardfs is based on Wrapfs file system.
+
+config SDCARD_FS_FADV_NOACTIVE
+ bool "sdcardfs fadvise noactive support"
+ depends on FADV_NOACTIVE
+ default y
+ help
+ Sdcardfs supports fadvise noactive mode.
diff --git a/fs/sdcardfs/Makefile b/fs/sdcardfs/Makefile
new file mode 100644
index 000000000000..b84fbb2b45a4
--- /dev/null
+++ b/fs/sdcardfs/Makefile
@@ -0,0 +1,7 @@
+SDCARDFS_VERSION="0.1"
+
+EXTRA_CFLAGS += -DSDCARDFS_VERSION=\"$(SDCARDFS_VERSION)\"
+
+obj-$(CONFIG_SDCARD_FS) += sdcardfs.o
+
+sdcardfs-y := dentry.o file.o inode.o main.o super.o lookup.o mmap.o packagelist.o derived_perm.o
diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c
new file mode 100644
index 000000000000..cb573f1efbfc
--- /dev/null
+++ b/fs/sdcardfs/dentry.c
@@ -0,0 +1,196 @@
+/*
+ * fs/sdcardfs/dentry.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include "linux/ctype.h"
+
+/*
+ * returns: -ERRNO if error (returned to user)
+ * 0: tell VFS to invalidate dentry
+ * 1: dentry is valid
+ */
+static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ int err = 1;
+ struct path parent_lower_path, lower_path;
+ struct dentry *parent_dentry = NULL;
+ struct dentry *parent_lower_dentry = NULL;
+ struct dentry *lower_cur_parent_dentry = NULL;
+ struct dentry *lower_dentry = NULL;
+ struct inode *inode;
+ struct sdcardfs_inode_data *data;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ spin_lock(&dentry->d_lock);
+ if (IS_ROOT(dentry)) {
+ spin_unlock(&dentry->d_lock);
+ return 1;
+ }
+ spin_unlock(&dentry->d_lock);
+
+ /* check uninitialized obb_dentry and
+ * whether the base obbpath has been changed or not
+ */
+ if (is_obbpath_invalid(dentry)) {
+ return 0;
+ }
+
+ parent_dentry = dget_parent(dentry);
+ sdcardfs_get_lower_path(parent_dentry, &parent_lower_path);
+ sdcardfs_get_real_lower(dentry, &lower_path);
+ parent_lower_dentry = parent_lower_path.dentry;
+ lower_dentry = lower_path.dentry;
+ lower_cur_parent_dentry = dget_parent(lower_dentry);
+
+ if ((lower_dentry->d_flags & DCACHE_OP_REVALIDATE)) {
+ err = lower_dentry->d_op->d_revalidate(lower_dentry, flags);
+ if (err == 0) {
+ goto out;
+ }
+ }
+
+ spin_lock(&lower_dentry->d_lock);
+ if (d_unhashed(lower_dentry)) {
+ spin_unlock(&lower_dentry->d_lock);
+ err = 0;
+ goto out;
+ }
+ spin_unlock(&lower_dentry->d_lock);
+
+ if (parent_lower_dentry != lower_cur_parent_dentry) {
+ err = 0;
+ goto out;
+ }
+
+ if (dentry < lower_dentry) {
+ spin_lock(&dentry->d_lock);
+ spin_lock_nested(&lower_dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ } else {
+ spin_lock(&lower_dentry->d_lock);
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ }
+
+ if (!qstr_case_eq(&dentry->d_name, &lower_dentry->d_name)) {
+ err = 0;
+ }
+
+ if (dentry < lower_dentry) {
+ spin_unlock(&lower_dentry->d_lock);
+ spin_unlock(&dentry->d_lock);
+ } else {
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&lower_dentry->d_lock);
+ }
+ if (!err)
+ goto out;
+
+ /* If our top's inode is gone, we may be out of date */
+ inode = igrab(d_inode(dentry));
+ if (inode) {
+ data = top_data_get(SDCARDFS_I(inode));
+ if (!data || data->abandoned) {
+ err = 0;
+ }
+ if (data)
+ data_put(data);
+ iput(inode);
+ }
+
+out:
+ dput(parent_dentry);
+ dput(lower_cur_parent_dentry);
+ sdcardfs_put_lower_path(parent_dentry, &parent_lower_path);
+ sdcardfs_put_real_lower(dentry, &lower_path);
+ return err;
+}
+
+/* 1 = delete, 0 = cache */
+static int sdcardfs_d_delete(const struct dentry *d)
+{
+ return SDCARDFS_SB(d->d_sb)->options.nocache ? 1 : 0;
+}
+
+static void sdcardfs_d_release(struct dentry *dentry)
+{
+ if (!dentry || !dentry->d_fsdata)
+ return;
+ /* release and reset the lower paths */
+ if (has_graft_path(dentry))
+ sdcardfs_put_reset_orig_path(dentry);
+ sdcardfs_put_reset_lower_path(dentry);
+ free_dentry_private_data(dentry);
+}
+
+static int sdcardfs_hash_ci(const struct dentry *dentry,
+ struct qstr *qstr)
+{
+ /*
+ * This function is copy of vfat_hashi.
+ * FIXME Should we support national language?
+ * Refer to vfat_hashi()
+ * struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
+ */
+ const unsigned char *name;
+ unsigned int len;
+ unsigned long hash;
+
+ name = qstr->name;
+ len = qstr->len;
+
+ hash = init_name_hash(dentry);
+ while (len--)
+ hash = partial_name_hash(tolower(*name++), hash);
+ qstr->hash = end_name_hash(hash);
+
+ return 0;
+}
+
+/*
+ * Case insensitive compare of two vfat names.
+ */
+static int sdcardfs_cmp_ci(const struct dentry *dentry,
+ unsigned int len, const char *str, const struct qstr *name)
+{
+ /* FIXME Should we support national language? */
+
+ if (name->len == len) {
+ if (str_n_case_eq(name->name, str, len))
+ return 0;
+ }
+ return 1;
+}
+
+static void sdcardfs_canonical_path(const struct path *path,
+ struct path *actual_path)
+{
+ sdcardfs_get_real_lower(path->dentry, actual_path);
+}
+
+const struct dentry_operations sdcardfs_ci_dops = {
+ .d_revalidate = sdcardfs_d_revalidate,
+ .d_delete = sdcardfs_d_delete,
+ .d_release = sdcardfs_d_release,
+ .d_hash = sdcardfs_hash_ci,
+ .d_compare = sdcardfs_cmp_ci,
+ .d_canonical_path = sdcardfs_canonical_path,
+};
+
diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c
new file mode 100644
index 000000000000..78a669c8a4d6
--- /dev/null
+++ b/fs/sdcardfs/derived_perm.c
@@ -0,0 +1,477 @@
+/*
+ * fs/sdcardfs/derived_perm.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+
+/* copy derived state from parent inode */
+static void inherit_derived_state(struct inode *parent, struct inode *child)
+{
+ struct sdcardfs_inode_info *pi = SDCARDFS_I(parent);
+ struct sdcardfs_inode_info *ci = SDCARDFS_I(child);
+
+ ci->data->perm = PERM_INHERIT;
+ ci->data->userid = pi->data->userid;
+ ci->data->d_uid = pi->data->d_uid;
+ ci->data->under_android = pi->data->under_android;
+ ci->data->under_cache = pi->data->under_cache;
+ ci->data->under_obb = pi->data->under_obb;
+}
+
+/* helper function for derived state */
+void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid,
+ uid_t uid)
+{
+ struct sdcardfs_inode_info *info = SDCARDFS_I(inode);
+
+ info->data->perm = perm;
+ info->data->userid = userid;
+ info->data->d_uid = uid;
+ info->data->under_android = false;
+ info->data->under_cache = false;
+ info->data->under_obb = false;
+}
+
+/* While renaming, there is a point where we want the path from dentry,
+ * but the name from newdentry
+ */
+void get_derived_permission_new(struct dentry *parent, struct dentry *dentry,
+ const struct qstr *name)
+{
+ struct sdcardfs_inode_info *info = SDCARDFS_I(d_inode(dentry));
+ struct sdcardfs_inode_info *parent_info = SDCARDFS_I(d_inode(parent));
+ struct sdcardfs_inode_data *parent_data = parent_info->data;
+ appid_t appid;
+ unsigned long user_num;
+ int err;
+ struct qstr q_Android = QSTR_LITERAL("Android");
+ struct qstr q_data = QSTR_LITERAL("data");
+ struct qstr q_sandbox = QSTR_LITERAL("sandbox");
+ struct qstr q_obb = QSTR_LITERAL("obb");
+ struct qstr q_media = QSTR_LITERAL("media");
+ struct qstr q_cache = QSTR_LITERAL("cache");
+
+ /* By default, each inode inherits from its parent.
+ * the properties are maintained on its private fields
+ * because the inode attributes will be modified with that of
+ * its lower inode.
+ * These values are used by our custom permission call instead
+ * of using the inode permissions.
+ */
+
+ inherit_derived_state(d_inode(parent), d_inode(dentry));
+
+ /* Files don't get special labels */
+ if (!S_ISDIR(d_inode(dentry)->i_mode)) {
+ set_top(info, parent_info);
+ return;
+ }
+ /* Derive custom permissions based on parent and current node */
+ switch (parent_data->perm) {
+ case PERM_INHERIT:
+ case PERM_ANDROID_PACKAGE_CACHE:
+ set_top(info, parent_info);
+ break;
+ case PERM_PRE_ROOT:
+ /* Legacy internal layout places users at top level */
+ info->data->perm = PERM_ROOT;
+ err = kstrtoul(name->name, 10, &user_num);
+ if (err)
+ info->data->userid = 0;
+ else
+ info->data->userid = user_num;
+ break;
+ case PERM_ROOT:
+ /* Assume masked off by default. */
+ if (qstr_case_eq(name, &q_Android)) {
+ /* App-specific directories inside; let anyone traverse */
+ info->data->perm = PERM_ANDROID;
+ info->data->under_android = true;
+ } else {
+ set_top(info, parent_info);
+ }
+ break;
+ case PERM_ANDROID:
+ if (qstr_case_eq(name, &q_data)) {
+ /* App-specific directories inside; let anyone traverse */
+ info->data->perm = PERM_ANDROID_DATA;
+ } else if (qstr_case_eq(name, &q_sandbox)) {
+ /* App-specific directories inside; let anyone traverse */
+ info->data->perm = PERM_ANDROID_DATA;
+ } else if (qstr_case_eq(name, &q_obb)) {
+ /* App-specific directories inside; let anyone traverse */
+ info->data->perm = PERM_ANDROID_OBB;
+ info->data->under_obb = true;
+ /* Single OBB directory is always shared */
+ } else if (qstr_case_eq(name, &q_media)) {
+ /* App-specific directories inside; let anyone traverse */
+ info->data->perm = PERM_ANDROID_MEDIA;
+ } else {
+ set_top(info, parent_info);
+ }
+ break;
+ case PERM_ANDROID_OBB:
+ case PERM_ANDROID_DATA:
+ case PERM_ANDROID_MEDIA:
+ info->data->perm = PERM_ANDROID_PACKAGE;
+ appid = get_appid(name->name);
+ if (appid != 0 && !is_excluded(name->name, parent_data->userid))
+ info->data->d_uid =
+ multiuser_get_uid(parent_data->userid, appid);
+ break;
+ case PERM_ANDROID_PACKAGE:
+ if (qstr_case_eq(name, &q_cache)) {
+ info->data->perm = PERM_ANDROID_PACKAGE_CACHE;
+ info->data->under_cache = true;
+ }
+ set_top(info, parent_info);
+ break;
+ }
+}
+
+void get_derived_permission(struct dentry *parent, struct dentry *dentry)
+{
+ get_derived_permission_new(parent, dentry, &dentry->d_name);
+}
+
+static appid_t get_type(const char *name)
+{
+ const char *ext = strrchr(name, '.');
+ appid_t id;
+
+ if (ext && ext[0]) {
+ ext = &ext[1];
+ id = get_ext_gid(ext);
+ return id?:AID_MEDIA_RW;
+ }
+ return AID_MEDIA_RW;
+}
+
+void fixup_lower_ownership(struct dentry *dentry, const char *name)
+{
+ struct path path;
+ struct inode *inode;
+ struct inode *delegated_inode = NULL;
+ int error;
+ struct sdcardfs_inode_info *info;
+ struct sdcardfs_inode_data *info_d;
+ struct sdcardfs_inode_data *info_top;
+ perm_t perm;
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+ uid_t uid = sbi->options.fs_low_uid;
+ gid_t gid = sbi->options.fs_low_gid;
+ struct iattr newattrs;
+
+ if (!sbi->options.gid_derivation)
+ return;
+
+ info = SDCARDFS_I(d_inode(dentry));
+ info_d = info->data;
+ perm = info_d->perm;
+ if (info_d->under_obb) {
+ perm = PERM_ANDROID_OBB;
+ } else if (info_d->under_cache) {
+ perm = PERM_ANDROID_PACKAGE_CACHE;
+ } else if (perm == PERM_INHERIT) {
+ info_top = top_data_get(info);
+ perm = info_top->perm;
+ data_put(info_top);
+ }
+
+ switch (perm) {
+ case PERM_ROOT:
+ case PERM_ANDROID:
+ case PERM_ANDROID_DATA:
+ case PERM_ANDROID_MEDIA:
+ case PERM_ANDROID_PACKAGE:
+ case PERM_ANDROID_PACKAGE_CACHE:
+ uid = multiuser_get_uid(info_d->userid, uid);
+ break;
+ case PERM_ANDROID_OBB:
+ uid = AID_MEDIA_OBB;
+ break;
+ case PERM_PRE_ROOT:
+ default:
+ break;
+ }
+ switch (perm) {
+ case PERM_ROOT:
+ case PERM_ANDROID:
+ case PERM_ANDROID_DATA:
+ case PERM_ANDROID_MEDIA:
+ if (S_ISDIR(d_inode(dentry)->i_mode))
+ gid = multiuser_get_uid(info_d->userid, AID_MEDIA_RW);
+ else
+ gid = multiuser_get_uid(info_d->userid, get_type(name));
+ break;
+ case PERM_ANDROID_OBB:
+ gid = AID_MEDIA_OBB;
+ break;
+ case PERM_ANDROID_PACKAGE:
+ if (uid_is_app(info_d->d_uid))
+ gid = multiuser_get_ext_gid(info_d->d_uid);
+ else
+ gid = multiuser_get_uid(info_d->userid, AID_MEDIA_RW);
+ break;
+ case PERM_ANDROID_PACKAGE_CACHE:
+ if (uid_is_app(info_d->d_uid))
+ gid = multiuser_get_ext_cache_gid(info_d->d_uid);
+ else
+ gid = multiuser_get_uid(info_d->userid, AID_MEDIA_RW);
+ break;
+ case PERM_PRE_ROOT:
+ default:
+ break;
+ }
+
+ sdcardfs_get_lower_path(dentry, &path);
+ inode = d_inode(path.dentry);
+ if (d_inode(path.dentry)->i_gid.val != gid || d_inode(path.dentry)->i_uid.val != uid) {
+retry_deleg:
+ newattrs.ia_valid = ATTR_GID | ATTR_UID | ATTR_FORCE;
+ newattrs.ia_uid = make_kuid(current_user_ns(), uid);
+ newattrs.ia_gid = make_kgid(current_user_ns(), gid);
+ if (!S_ISDIR(inode->i_mode))
+ newattrs.ia_valid |=
+ ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
+ inode_lock(inode);
+ error = security_path_chown(&path, newattrs.ia_uid, newattrs.ia_gid);
+ if (!error)
+ error = notify_change2(path.mnt, path.dentry, &newattrs, &delegated_inode);
+ inode_unlock(inode);
+ if (delegated_inode) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error)
+ goto retry_deleg;
+ }
+ if (error)
+ pr_debug("sdcardfs: Failed to touch up lower fs gid/uid for %s\n", name);
+ }
+ sdcardfs_put_lower_path(dentry, &path);
+}
+
+static int descendant_may_need_fixup(struct sdcardfs_inode_data *data,
+ struct limit_search *limit)
+{
+ if (data->perm == PERM_ROOT)
+ return (limit->flags & BY_USERID) ?
+ data->userid == limit->userid : 1;
+ if (data->perm == PERM_PRE_ROOT || data->perm == PERM_ANDROID)
+ return 1;
+ return 0;
+}
+
+static int needs_fixup(perm_t perm)
+{
+ if (perm == PERM_ANDROID_DATA || perm == PERM_ANDROID_OBB
+ || perm == PERM_ANDROID_MEDIA)
+ return 1;
+ return 0;
+}
+
+static void __fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit, int depth)
+{
+ struct dentry *child;
+ struct sdcardfs_inode_info *info;
+
+ /*
+ * All paths will terminate their recursion on hitting PERM_ANDROID_OBB,
+ * PERM_ANDROID_MEDIA, or PERM_ANDROID_DATA. This happens at a depth of
+ * at most 3.
+ */
+ WARN(depth > 3, "%s: Max expected depth exceeded!\n", __func__);
+ spin_lock_nested(&dentry->d_lock, depth);
+ if (!d_inode(dentry)) {
+ spin_unlock(&dentry->d_lock);
+ return;
+ }
+ info = SDCARDFS_I(d_inode(dentry));
+
+ if (needs_fixup(info->data->perm)) {
+ list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+ spin_lock_nested(&child->d_lock, depth + 1);
+ if (!(limit->flags & BY_NAME) || qstr_case_eq(&child->d_name, &limit->name)) {
+ if (d_inode(child)) {
+ get_derived_permission(dentry, child);
+ fixup_tmp_permissions(d_inode(child));
+ spin_unlock(&child->d_lock);
+ break;
+ }
+ }
+ spin_unlock(&child->d_lock);
+ }
+ } else if (descendant_may_need_fixup(info->data, limit)) {
+ list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+ __fixup_perms_recursive(child, limit, depth + 1);
+ }
+ }
+ spin_unlock(&dentry->d_lock);
+}
+
+void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit)
+{
+ __fixup_perms_recursive(dentry, limit, 0);
+}
+
+/* main function for updating derived permission */
+inline void update_derived_permission_lock(struct dentry *dentry)
+{
+ struct dentry *parent;
+
+ if (!dentry || !d_inode(dentry)) {
+ pr_err("sdcardfs: %s: invalid dentry\n", __func__);
+ return;
+ }
+ /* FIXME:
+ * 1. need to check whether the dentry is updated or not
+ * 2. remove the root dentry update
+ */
+ if (!IS_ROOT(dentry)) {
+ parent = dget_parent(dentry);
+ if (parent) {
+ get_derived_permission(parent, dentry);
+ dput(parent);
+ }
+ }
+ fixup_tmp_permissions(d_inode(dentry));
+}
+
+int need_graft_path(struct dentry *dentry)
+{
+ int ret = 0;
+ struct dentry *parent = dget_parent(dentry);
+ struct sdcardfs_inode_info *parent_info = SDCARDFS_I(d_inode(parent));
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+ struct qstr obb = QSTR_LITERAL("obb");
+
+ if (!sbi->options.unshared_obb &&
+ parent_info->data->perm == PERM_ANDROID &&
+ qstr_case_eq(&dentry->d_name, &obb)) {
+
+ /* /Android/obb is the base obbpath of DERIVED_UNIFIED */
+ if (!(sbi->options.multiuser == false
+ && parent_info->data->userid == 0)) {
+ ret = 1;
+ }
+ }
+ dput(parent);
+ return ret;
+}
+
+int is_obbpath_invalid(struct dentry *dent)
+{
+ int ret = 0;
+ struct sdcardfs_dentry_info *di = SDCARDFS_D(dent);
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dent->d_sb);
+ char *path_buf, *obbpath_s;
+ int need_put = 0;
+ struct path lower_path;
+
+ /* check the base obbpath has been changed.
+ * this routine can check an uninitialized obb dentry as well.
+ * regarding the uninitialized obb, refer to the sdcardfs_mkdir()
+ */
+ spin_lock(&di->lock);
+ if (di->orig_path.dentry) {
+ if (!di->lower_path.dentry) {
+ ret = 1;
+ } else {
+ path_get(&di->lower_path);
+
+ path_buf = kmalloc(PATH_MAX, GFP_ATOMIC);
+ if (!path_buf) {
+ ret = 1;
+ pr_err("sdcardfs: fail to allocate path_buf in %s.\n", __func__);
+ } else {
+ obbpath_s = d_path(&di->lower_path, path_buf, PATH_MAX);
+ if (d_unhashed(di->lower_path.dentry) ||
+ !str_case_eq(sbi->obbpath_s, obbpath_s)) {
+ ret = 1;
+ }
+ kfree(path_buf);
+ }
+
+ pathcpy(&lower_path, &di->lower_path);
+ need_put = 1;
+ }
+ }
+ spin_unlock(&di->lock);
+ if (need_put)
+ path_put(&lower_path);
+ return ret;
+}
+
+int is_base_obbpath(struct dentry *dentry)
+{
+ int ret = 0;
+ struct dentry *parent = dget_parent(dentry);
+ struct sdcardfs_inode_info *parent_info = SDCARDFS_I(d_inode(parent));
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+ struct qstr q_obb = QSTR_LITERAL("obb");
+
+ spin_lock(&SDCARDFS_D(dentry)->lock);
+ if (sbi->options.multiuser) {
+ if (parent_info->data->perm == PERM_PRE_ROOT &&
+ qstr_case_eq(&dentry->d_name, &q_obb)) {
+ ret = 1;
+ }
+ } else if (parent_info->data->perm == PERM_ANDROID &&
+ qstr_case_eq(&dentry->d_name, &q_obb)) {
+ ret = 1;
+ }
+ spin_unlock(&SDCARDFS_D(dentry)->lock);
+ return ret;
+}
+
+/* The lower_path will be stored to the dentry's orig_path
+ * and the base obbpath will be copyed to the lower_path variable.
+ * if an error returned, there's no change in the lower_path
+ * returns: -ERRNO if error (0: no error)
+ */
+int setup_obb_dentry(struct dentry *dentry, struct path *lower_path)
+{
+ int err = 0;
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+ struct path obbpath;
+
+ /* A local obb dentry must have its own orig_path to support rmdir
+ * and mkdir of itself. Usually, we expect that the sbi->obbpath
+ * is avaiable on this stage.
+ */
+ sdcardfs_set_orig_path(dentry, lower_path);
+
+ err = kern_path(sbi->obbpath_s,
+ LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &obbpath);
+
+ if (!err) {
+ /* the obbpath base has been found */
+ pathcpy(lower_path, &obbpath);
+ } else {
+ /* if the sbi->obbpath is not available, we can optionally
+ * setup the lower_path with its orig_path.
+ * but, the current implementation just returns an error
+ * because the sdcard daemon also regards this case as
+ * a lookup fail.
+ */
+ pr_info("sdcardfs: the sbi->obbpath is not available\n");
+ }
+ return err;
+}
+
+
diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c
new file mode 100644
index 000000000000..271c4c4cb760
--- /dev/null
+++ b/fs/sdcardfs/file.c
@@ -0,0 +1,467 @@
+/*
+ * fs/sdcardfs/file.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#ifdef CONFIG_SDCARD_FS_FADV_NOACTIVE
+#include <linux/backing-dev.h>
+#endif
+
+static ssize_t sdcardfs_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int err;
+ struct file *lower_file;
+ struct dentry *dentry = file->f_path.dentry;
+#ifdef CONFIG_SDCARD_FS_FADV_NOACTIVE
+ struct backing_dev_info *bdi;
+#endif
+
+ lower_file = sdcardfs_lower_file(file);
+
+#ifdef CONFIG_SDCARD_FS_FADV_NOACTIVE
+ if (file->f_mode & FMODE_NOACTIVE) {
+ if (!(lower_file->f_mode & FMODE_NOACTIVE)) {
+ bdi = lower_file->f_mapping->backing_dev_info;
+ lower_file->f_ra.ra_pages = bdi->ra_pages * 2;
+ spin_lock(&lower_file->f_lock);
+ lower_file->f_mode |= FMODE_NOACTIVE;
+ spin_unlock(&lower_file->f_lock);
+ }
+ }
+#endif
+
+ err = vfs_read(lower_file, buf, count, ppos);
+ /* update our inode atime upon a successful lower read */
+ if (err >= 0)
+ fsstack_copy_attr_atime(d_inode(dentry),
+ file_inode(lower_file));
+
+ return err;
+}
+
+static ssize_t sdcardfs_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int err;
+ struct file *lower_file;
+ struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode = d_inode(dentry);
+
+ /* check disk space */
+ if (!check_min_free_space(dentry, count, 0)) {
+ pr_err("No minimum free space.\n");
+ return -ENOSPC;
+ }
+
+ lower_file = sdcardfs_lower_file(file);
+ err = vfs_write(lower_file, buf, count, ppos);
+ /* update our inode times+sizes upon a successful lower write */
+ if (err >= 0) {
+ if (sizeof(loff_t) > sizeof(long))
+ inode_lock(inode);
+ fsstack_copy_inode_size(inode, file_inode(lower_file));
+ fsstack_copy_attr_times(inode, file_inode(lower_file));
+ if (sizeof(loff_t) > sizeof(long))
+ inode_unlock(inode);
+ }
+
+ return err;
+}
+
+static int sdcardfs_readdir(struct file *file, struct dir_context *ctx)
+{
+ int err;
+ struct file *lower_file = NULL;
+ struct dentry *dentry = file->f_path.dentry;
+
+ lower_file = sdcardfs_lower_file(file);
+
+ lower_file->f_pos = file->f_pos;
+ err = iterate_dir(lower_file, ctx);
+ file->f_pos = lower_file->f_pos;
+ if (err >= 0) /* copy the atime */
+ fsstack_copy_attr_atime(d_inode(dentry),
+ file_inode(lower_file));
+ return err;
+}
+
+static long sdcardfs_unlocked_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ long err = -ENOTTY;
+ struct file *lower_file;
+ const struct cred *saved_cred = NULL;
+ struct dentry *dentry = file->f_path.dentry;
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+
+ lower_file = sdcardfs_lower_file(file);
+
+ /* XXX: use vfs_ioctl if/when VFS exports it */
+ if (!lower_file || !lower_file->f_op)
+ goto out;
+
+ /* save current_cred and override it */
+ saved_cred = override_fsids(sbi, SDCARDFS_I(file_inode(file))->data);
+ if (!saved_cred) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ if (lower_file->f_op->unlocked_ioctl)
+ err = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
+
+ /* some ioctls can change inode attributes (EXT2_IOC_SETFLAGS) */
+ if (!err)
+ sdcardfs_copy_and_fix_attrs(file_inode(file),
+ file_inode(lower_file));
+ revert_fsids(saved_cred);
+out:
+ return err;
+}
+
+#ifdef CONFIG_COMPAT
+static long sdcardfs_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ long err = -ENOTTY;
+ struct file *lower_file;
+ const struct cred *saved_cred = NULL;
+ struct dentry *dentry = file->f_path.dentry;
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+
+ lower_file = sdcardfs_lower_file(file);
+
+ /* XXX: use vfs_ioctl if/when VFS exports it */
+ if (!lower_file || !lower_file->f_op)
+ goto out;
+
+ /* save current_cred and override it */
+ saved_cred = override_fsids(sbi, SDCARDFS_I(file_inode(file))->data);
+ if (!saved_cred) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ if (lower_file->f_op->compat_ioctl)
+ err = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
+
+ revert_fsids(saved_cred);
+out:
+ return err;
+}
+#endif
+
+static int sdcardfs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ int err = 0;
+ bool willwrite;
+ struct file *lower_file;
+ const struct vm_operations_struct *saved_vm_ops = NULL;
+
+ /* this might be deferred to mmap's writepage */
+ willwrite = ((vma->vm_flags | VM_SHARED | VM_WRITE) == vma->vm_flags);
+
+ /*
+ * File systems which do not implement ->writepage may use
+ * generic_file_readonly_mmap as their ->mmap op. If you call
+ * generic_file_readonly_mmap with VM_WRITE, you'd get an -EINVAL.
+ * But we cannot call the lower ->mmap op, so we can't tell that
+ * writeable mappings won't work. Therefore, our only choice is to
+ * check if the lower file system supports the ->writepage, and if
+ * not, return EINVAL (the same error that
+ * generic_file_readonly_mmap returns in that case).
+ */
+ lower_file = sdcardfs_lower_file(file);
+ if (willwrite && !lower_file->f_mapping->a_ops->writepage) {
+ err = -EINVAL;
+ pr_err("sdcardfs: lower file system does not support writeable mmap\n");
+ goto out;
+ }
+
+ /*
+ * find and save lower vm_ops.
+ *
+ * XXX: the VFS should have a cleaner way of finding the lower vm_ops
+ */
+ if (!SDCARDFS_F(file)->lower_vm_ops) {
+ err = lower_file->f_op->mmap(lower_file, vma);
+ if (err) {
+ pr_err("sdcardfs: lower mmap failed %d\n", err);
+ goto out;
+ }
+ saved_vm_ops = vma->vm_ops; /* save: came from lower ->mmap */
+ }
+
+ /*
+ * Next 3 lines are all I need from generic_file_mmap. I definitely
+ * don't want its test for ->readpage which returns -ENOEXEC.
+ */
+ file_accessed(file);
+ vma->vm_ops = &sdcardfs_vm_ops;
+
+ file->f_mapping->a_ops = &sdcardfs_aops; /* set our aops */
+ if (!SDCARDFS_F(file)->lower_vm_ops) /* save for our ->fault */
+ SDCARDFS_F(file)->lower_vm_ops = saved_vm_ops;
+ vma->vm_private_data = file;
+ get_file(lower_file);
+ vma->vm_file = lower_file;
+
+out:
+ return err;
+}
+
+static int sdcardfs_open(struct inode *inode, struct file *file)
+{
+ int err = 0;
+ struct file *lower_file = NULL;
+ struct path lower_path;
+ struct dentry *dentry = file->f_path.dentry;
+ struct dentry *parent = dget_parent(dentry);
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+ const struct cred *saved_cred = NULL;
+
+ /* don't open unhashed/deleted files */
+ if (d_unhashed(dentry)) {
+ err = -ENOENT;
+ goto out_err;
+ }
+
+ if (!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) {
+ err = -EACCES;
+ goto out_err;
+ }
+
+ /* save current_cred and override it */
+ saved_cred = override_fsids(sbi, SDCARDFS_I(inode)->data);
+ if (!saved_cred) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ file->private_data =
+ kzalloc(sizeof(struct sdcardfs_file_info), GFP_KERNEL);
+ if (!SDCARDFS_F(file)) {
+ err = -ENOMEM;
+ goto out_revert_cred;
+ }
+
+ /* open lower object and link sdcardfs's file struct to lower's */
+ sdcardfs_get_lower_path(file->f_path.dentry, &lower_path);
+ lower_file = dentry_open(&lower_path, file->f_flags, current_cred());
+ path_put(&lower_path);
+ if (IS_ERR(lower_file)) {
+ err = PTR_ERR(lower_file);
+ lower_file = sdcardfs_lower_file(file);
+ if (lower_file) {
+ sdcardfs_set_lower_file(file, NULL);
+ fput(lower_file); /* fput calls dput for lower_dentry */
+ }
+ } else {
+ sdcardfs_set_lower_file(file, lower_file);
+ }
+
+ if (err)
+ kfree(SDCARDFS_F(file));
+ else
+ sdcardfs_copy_and_fix_attrs(inode, sdcardfs_lower_inode(inode));
+
+out_revert_cred:
+ revert_fsids(saved_cred);
+out_err:
+ dput(parent);
+ return err;
+}
+
+static int sdcardfs_flush(struct file *file, fl_owner_t id)
+{
+ int err = 0;
+ struct file *lower_file = NULL;
+
+ lower_file = sdcardfs_lower_file(file);
+ if (lower_file && lower_file->f_op && lower_file->f_op->flush) {
+ filemap_write_and_wait(file->f_mapping);
+ err = lower_file->f_op->flush(lower_file, id);
+ }
+
+ return err;
+}
+
+/* release all lower object references & free the file info structure */
+static int sdcardfs_file_release(struct inode *inode, struct file *file)
+{
+ struct file *lower_file;
+
+ lower_file = sdcardfs_lower_file(file);
+ if (lower_file) {
+ sdcardfs_set_lower_file(file, NULL);
+ fput(lower_file);
+ }
+
+ kfree(SDCARDFS_F(file));
+ return 0;
+}
+
+static int sdcardfs_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
+{
+ int err;
+ struct file *lower_file;
+ struct path lower_path;
+ struct dentry *dentry = file->f_path.dentry;
+
+ err = __generic_file_fsync(file, start, end, datasync);
+ if (err)
+ goto out;
+
+ lower_file = sdcardfs_lower_file(file);
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ err = vfs_fsync_range(lower_file, start, end, datasync);
+ sdcardfs_put_lower_path(dentry, &lower_path);
+out:
+ return err;
+}
+
+static int sdcardfs_fasync(int fd, struct file *file, int flag)
+{
+ int err = 0;
+ struct file *lower_file = NULL;
+
+ lower_file = sdcardfs_lower_file(file);
+ if (lower_file->f_op && lower_file->f_op->fasync)
+ err = lower_file->f_op->fasync(fd, lower_file, flag);
+
+ return err;
+}
+
+/*
+ * Sdcardfs cannot use generic_file_llseek as ->llseek, because it would
+ * only set the offset of the upper file. So we have to implement our
+ * own method to set both the upper and lower file offsets
+ * consistently.
+ */
+static loff_t sdcardfs_file_llseek(struct file *file, loff_t offset, int whence)
+{
+ int err;
+ struct file *lower_file;
+
+ err = generic_file_llseek(file, offset, whence);
+ if (err < 0)
+ goto out;
+
+ lower_file = sdcardfs_lower_file(file);
+ err = generic_file_llseek(lower_file, offset, whence);
+
+out:
+ return err;
+}
+
+/*
+ * Sdcardfs read_iter, redirect modified iocb to lower read_iter
+ */
+ssize_t sdcardfs_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ int err;
+ struct file *file = iocb->ki_filp, *lower_file;
+
+ lower_file = sdcardfs_lower_file(file);
+ if (!lower_file->f_op->read_iter) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ get_file(lower_file); /* prevent lower_file from being released */
+ iocb->ki_filp = lower_file;
+ err = lower_file->f_op->read_iter(iocb, iter);
+ iocb->ki_filp = file;
+ fput(lower_file);
+ /* update upper inode atime as needed */
+ if (err >= 0 || err == -EIOCBQUEUED)
+ fsstack_copy_attr_atime(file->f_path.dentry->d_inode,
+ file_inode(lower_file));
+out:
+ return err;
+}
+
+/*
+ * Sdcardfs write_iter, redirect modified iocb to lower write_iter
+ */
+ssize_t sdcardfs_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ int err;
+ struct file *file = iocb->ki_filp, *lower_file;
+ struct inode *inode = file->f_path.dentry->d_inode;
+
+ lower_file = sdcardfs_lower_file(file);
+ if (!lower_file->f_op->write_iter) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ get_file(lower_file); /* prevent lower_file from being released */
+ iocb->ki_filp = lower_file;
+ err = lower_file->f_op->write_iter(iocb, iter);
+ iocb->ki_filp = file;
+ fput(lower_file);
+ /* update upper inode times/sizes as needed */
+ if (err >= 0 || err == -EIOCBQUEUED) {
+ if (sizeof(loff_t) > sizeof(long))
+ inode_lock(inode);
+ fsstack_copy_inode_size(inode, file_inode(lower_file));
+ fsstack_copy_attr_times(inode, file_inode(lower_file));
+ if (sizeof(loff_t) > sizeof(long))
+ inode_unlock(inode);
+ }
+out:
+ return err;
+}
+
+const struct file_operations sdcardfs_main_fops = {
+ .llseek = generic_file_llseek,
+ .read = sdcardfs_read,
+ .write = sdcardfs_write,
+ .unlocked_ioctl = sdcardfs_unlocked_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = sdcardfs_compat_ioctl,
+#endif
+ .mmap = sdcardfs_mmap,
+ .open = sdcardfs_open,
+ .flush = sdcardfs_flush,
+ .release = sdcardfs_file_release,
+ .fsync = sdcardfs_fsync,
+ .fasync = sdcardfs_fasync,
+ .read_iter = sdcardfs_read_iter,
+ .write_iter = sdcardfs_write_iter,
+};
+
+/* trimmed directory options */
+const struct file_operations sdcardfs_dir_fops = {
+ .llseek = sdcardfs_file_llseek,
+ .read = generic_read_dir,
+ .iterate = sdcardfs_readdir,
+ .unlocked_ioctl = sdcardfs_unlocked_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = sdcardfs_compat_ioctl,
+#endif
+ .open = sdcardfs_open,
+ .release = sdcardfs_file_release,
+ .flush = sdcardfs_flush,
+ .fsync = sdcardfs_fsync,
+ .fasync = sdcardfs_fasync,
+};
diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c
new file mode 100644
index 000000000000..63d0736b1f68
--- /dev/null
+++ b/fs/sdcardfs/inode.c
@@ -0,0 +1,818 @@
+/*
+ * fs/sdcardfs/inode.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include <linux/fs_struct.h>
+#include <linux/ratelimit.h>
+
+const struct cred *override_fsids(struct sdcardfs_sb_info *sbi,
+ struct sdcardfs_inode_data *data)
+{
+ struct cred *cred;
+ const struct cred *old_cred;
+ uid_t uid;
+
+ cred = prepare_creds();
+ if (!cred)
+ return NULL;
+
+ if (sbi->options.gid_derivation) {
+ if (data->under_obb)
+ uid = AID_MEDIA_OBB;
+ else
+ uid = multiuser_get_uid(data->userid, sbi->options.fs_low_uid);
+ } else {
+ uid = sbi->options.fs_low_uid;
+ }
+ cred->fsuid = make_kuid(&init_user_ns, uid);
+ cred->fsgid = make_kgid(&init_user_ns, sbi->options.fs_low_gid);
+
+ old_cred = override_creds(cred);
+
+ return old_cred;
+}
+
+void revert_fsids(const struct cred *old_cred)
+{
+ const struct cred *cur_cred;
+
+ cur_cred = current->cred;
+ revert_creds(old_cred);
+ put_cred(cur_cred);
+}
+
+static int sdcardfs_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool want_excl)
+{
+ int err;
+ struct dentry *lower_dentry;
+ struct vfsmount *lower_dentry_mnt;
+ struct dentry *lower_parent_dentry = NULL;
+ struct path lower_path;
+ const struct cred *saved_cred = NULL;
+ struct fs_struct *saved_fs;
+ struct fs_struct *copied_fs;
+
+ if (!check_caller_access_to_name(dir, &dentry->d_name)) {
+ err = -EACCES;
+ goto out_eacces;
+ }
+
+ /* save current_cred and override it */
+ saved_cred = override_fsids(SDCARDFS_SB(dir->i_sb),
+ SDCARDFS_I(dir)->data);
+ if (!saved_cred)
+ return -ENOMEM;
+
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ lower_dentry = lower_path.dentry;
+ lower_dentry_mnt = lower_path.mnt;
+ lower_parent_dentry = lock_parent(lower_dentry);
+
+ /* set last 16bytes of mode field to 0664 */
+ mode = (mode & S_IFMT) | 00664;
+
+ /* temporarily change umask for lower fs write */
+ saved_fs = current->fs;
+ copied_fs = copy_fs_struct(current->fs);
+ if (!copied_fs) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+ copied_fs->umask = 0;
+ task_lock(current);
+ current->fs = copied_fs;
+ task_unlock(current);
+
+ err = vfs_create2(lower_dentry_mnt, d_inode(lower_parent_dentry), lower_dentry, mode, want_excl);
+ if (err)
+ goto out;
+
+ err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path,
+ SDCARDFS_I(dir)->data->userid);
+ if (err)
+ goto out;
+ fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir));
+ fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry));
+ fixup_lower_ownership(dentry, dentry->d_name.name);
+
+out:
+ task_lock(current);
+ current->fs = saved_fs;
+ task_unlock(current);
+ free_fs_struct(copied_fs);
+out_unlock:
+ unlock_dir(lower_parent_dentry);
+ sdcardfs_put_lower_path(dentry, &lower_path);
+ revert_fsids(saved_cred);
+out_eacces:
+ return err;
+}
+
+static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ int err;
+ struct dentry *lower_dentry;
+ struct vfsmount *lower_mnt;
+ struct inode *lower_dir_inode = sdcardfs_lower_inode(dir);
+ struct dentry *lower_dir_dentry;
+ struct path lower_path;
+ const struct cred *saved_cred = NULL;
+
+ if (!check_caller_access_to_name(dir, &dentry->d_name)) {
+ err = -EACCES;
+ goto out_eacces;
+ }
+
+ /* save current_cred and override it */
+ saved_cred = override_fsids(SDCARDFS_SB(dir->i_sb),
+ SDCARDFS_I(dir)->data);
+ if (!saved_cred)
+ return -ENOMEM;
+
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ lower_dentry = lower_path.dentry;
+ lower_mnt = lower_path.mnt;
+ dget(lower_dentry);
+ lower_dir_dentry = lock_parent(lower_dentry);
+
+ err = vfs_unlink2(lower_mnt, lower_dir_inode, lower_dentry, NULL);
+
+ /*
+ * Note: unlinking on top of NFS can cause silly-renamed files.
+ * Trying to delete such files results in EBUSY from NFS
+ * below. Silly-renamed files will get deleted by NFS later on, so
+ * we just need to detect them here and treat such EBUSY errors as
+ * if the upper file was successfully deleted.
+ */
+ if (err == -EBUSY && lower_dentry->d_flags & DCACHE_NFSFS_RENAMED)
+ err = 0;
+ if (err)
+ goto out;
+ fsstack_copy_attr_times(dir, lower_dir_inode);
+ fsstack_copy_inode_size(dir, lower_dir_inode);
+ set_nlink(d_inode(dentry),
+ sdcardfs_lower_inode(d_inode(dentry))->i_nlink);
+ d_inode(dentry)->i_ctime = dir->i_ctime;
+ d_drop(dentry); /* this is needed, else LTP fails (VFS won't do it) */
+out:
+ unlock_dir(lower_dir_dentry);
+ dput(lower_dentry);
+ sdcardfs_put_lower_path(dentry, &lower_path);
+ revert_fsids(saved_cred);
+out_eacces:
+ return err;
+}
+
+static int touch(char *abs_path, mode_t mode)
+{
+ struct file *filp = filp_open(abs_path, O_RDWR|O_CREAT|O_EXCL|O_NOFOLLOW, mode);
+
+ if (IS_ERR(filp)) {
+ if (PTR_ERR(filp) == -EEXIST) {
+ return 0;
+ } else {
+ pr_err("sdcardfs: failed to open(%s): %ld\n",
+ abs_path, PTR_ERR(filp));
+ return PTR_ERR(filp);
+ }
+ }
+ filp_close(filp, current->files);
+ return 0;
+}
+
+static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ int err;
+ int make_nomedia_in_obb = 0;
+ struct dentry *lower_dentry;
+ struct vfsmount *lower_mnt;
+ struct dentry *lower_parent_dentry = NULL;
+ struct dentry *parent_dentry = NULL;
+ struct path lower_path;
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+ const struct cred *saved_cred = NULL;
+ struct sdcardfs_inode_data *pd = SDCARDFS_I(dir)->data;
+ int touch_err = 0;
+ struct fs_struct *saved_fs;
+ struct fs_struct *copied_fs;
+ struct qstr q_obb = QSTR_LITERAL("obb");
+ struct qstr q_data = QSTR_LITERAL("data");
+
+ if (!check_caller_access_to_name(dir, &dentry->d_name)) {
+ err = -EACCES;
+ goto out_eacces;
+ }
+
+ /* save current_cred and override it */
+ saved_cred = override_fsids(SDCARDFS_SB(dir->i_sb),
+ SDCARDFS_I(dir)->data);
+ if (!saved_cred)
+ return -ENOMEM;
+
+ /* check disk space */
+ parent_dentry = dget_parent(dentry);
+ if (!check_min_free_space(parent_dentry, 0, 1)) {
+ pr_err("sdcardfs: No minimum free space.\n");
+ err = -ENOSPC;
+ dput(parent_dentry);
+ goto out_revert;
+ }
+ dput(parent_dentry);
+
+ /* the lower_dentry is negative here */
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ lower_dentry = lower_path.dentry;
+ lower_mnt = lower_path.mnt;
+ lower_parent_dentry = lock_parent(lower_dentry);
+
+ /* set last 16bytes of mode field to 0775 */
+ mode = (mode & S_IFMT) | 00775;
+
+ /* temporarily change umask for lower fs write */
+ saved_fs = current->fs;
+ copied_fs = copy_fs_struct(current->fs);
+ if (!copied_fs) {
+ err = -ENOMEM;
+ unlock_dir(lower_parent_dentry);
+ goto out_unlock;
+ }
+ copied_fs->umask = 0;
+ task_lock(current);
+ current->fs = copied_fs;
+ task_unlock(current);
+
+ err = vfs_mkdir2(lower_mnt, d_inode(lower_parent_dentry), lower_dentry, mode);
+
+ if (err) {
+ unlock_dir(lower_parent_dentry);
+ goto out;
+ }
+
+ /* if it is a local obb dentry, setup it with the base obbpath */
+ if (need_graft_path(dentry)) {
+
+ err = setup_obb_dentry(dentry, &lower_path);
+ if (err) {
+ /* if the sbi->obbpath is not available, the lower_path won't be
+ * changed by setup_obb_dentry() but the lower path is saved to
+ * its orig_path. this dentry will be revalidated later.
+ * but now, the lower_path should be NULL
+ */
+ sdcardfs_put_reset_lower_path(dentry);
+
+ /* the newly created lower path which saved to its orig_path or
+ * the lower_path is the base obbpath.
+ * therefore, an additional path_get is required
+ */
+ path_get(&lower_path);
+ } else
+ make_nomedia_in_obb = 1;
+ }
+
+ err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path, pd->userid);
+ if (err) {
+ unlock_dir(lower_parent_dentry);
+ goto out;
+ }
+
+ fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir));
+ fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry));
+ /* update number of links on parent directory */
+ set_nlink(dir, sdcardfs_lower_inode(dir)->i_nlink);
+ fixup_lower_ownership(dentry, dentry->d_name.name);
+ unlock_dir(lower_parent_dentry);
+ if ((!sbi->options.multiuser) && (qstr_case_eq(&dentry->d_name, &q_obb))
+ && (pd->perm == PERM_ANDROID) && (pd->userid == 0))
+ make_nomedia_in_obb = 1;
+
+ /* When creating /Android/data and /Android/obb, mark them as .nomedia */
+ if (make_nomedia_in_obb ||
+ ((pd->perm == PERM_ANDROID)
+ && (qstr_case_eq(&dentry->d_name, &q_data)))) {
+ revert_fsids(saved_cred);
+ saved_cred = override_fsids(sbi,
+ SDCARDFS_I(d_inode(dentry))->data);
+ if (!saved_cred) {
+ pr_err("sdcardfs: failed to set up .nomedia in %s: %d\n",
+ lower_path.dentry->d_name.name,
+ -ENOMEM);
+ goto out;
+ }
+ set_fs_pwd(current->fs, &lower_path);
+ touch_err = touch(".nomedia", 0664);
+ if (touch_err) {
+ pr_err("sdcardfs: failed to create .nomedia in %s: %d\n",
+ lower_path.dentry->d_name.name,
+ touch_err);
+ goto out;
+ }
+ }
+out:
+ task_lock(current);
+ current->fs = saved_fs;
+ task_unlock(current);
+
+ free_fs_struct(copied_fs);
+out_unlock:
+ sdcardfs_put_lower_path(dentry, &lower_path);
+out_revert:
+ revert_fsids(saved_cred);
+out_eacces:
+ return err;
+}
+
+static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct dentry *lower_dentry;
+ struct dentry *lower_dir_dentry;
+ struct vfsmount *lower_mnt;
+ int err;
+ struct path lower_path;
+ const struct cred *saved_cred = NULL;
+
+ if (!check_caller_access_to_name(dir, &dentry->d_name)) {
+ err = -EACCES;
+ goto out_eacces;
+ }
+
+ /* save current_cred and override it */
+ saved_cred = override_fsids(SDCARDFS_SB(dir->i_sb),
+ SDCARDFS_I(dir)->data);
+ if (!saved_cred)
+ return -ENOMEM;
+
+ /* sdcardfs_get_real_lower(): in case of remove an user's obb dentry
+ * the dentry on the original path should be deleted.
+ */
+ sdcardfs_get_real_lower(dentry, &lower_path);
+
+ lower_dentry = lower_path.dentry;
+ lower_mnt = lower_path.mnt;
+ lower_dir_dentry = lock_parent(lower_dentry);
+
+ err = vfs_rmdir2(lower_mnt, d_inode(lower_dir_dentry), lower_dentry);
+ if (err)
+ goto out;
+
+ d_drop(dentry); /* drop our dentry on success (why not VFS's job?) */
+ if (d_inode(dentry))
+ clear_nlink(d_inode(dentry));
+ fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
+ fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry));
+ set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink);
+
+out:
+ unlock_dir(lower_dir_dentry);
+ sdcardfs_put_real_lower(dentry, &lower_path);
+ revert_fsids(saved_cred);
+out_eacces:
+ return err;
+}
+
+/*
+ * The locking rules in sdcardfs_rename are complex. We could use a simpler
+ * superblock-level name-space lock for renames and copy-ups.
+ */
+static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ int err = 0;
+ struct dentry *lower_old_dentry = NULL;
+ struct dentry *lower_new_dentry = NULL;
+ struct dentry *lower_old_dir_dentry = NULL;
+ struct dentry *lower_new_dir_dentry = NULL;
+ struct vfsmount *lower_mnt = NULL;
+ struct dentry *trap = NULL;
+ struct path lower_old_path, lower_new_path;
+ const struct cred *saved_cred = NULL;
+
+ if (flags)
+ return -EINVAL;
+
+ if (!check_caller_access_to_name(old_dir, &old_dentry->d_name) ||
+ !check_caller_access_to_name(new_dir, &new_dentry->d_name)) {
+ err = -EACCES;
+ goto out_eacces;
+ }
+
+ /* save current_cred and override it */
+ saved_cred = override_fsids(SDCARDFS_SB(old_dir->i_sb),
+ SDCARDFS_I(new_dir)->data);
+ if (!saved_cred)
+ return -ENOMEM;
+
+ sdcardfs_get_real_lower(old_dentry, &lower_old_path);
+ sdcardfs_get_lower_path(new_dentry, &lower_new_path);
+ lower_old_dentry = lower_old_path.dentry;
+ lower_new_dentry = lower_new_path.dentry;
+ lower_mnt = lower_old_path.mnt;
+ lower_old_dir_dentry = dget_parent(lower_old_dentry);
+ lower_new_dir_dentry = dget_parent(lower_new_dentry);
+
+ trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+ /* source should not be ancestor of target */
+ if (trap == lower_old_dentry) {
+ err = -EINVAL;
+ goto out;
+ }
+ /* target should not be ancestor of source */
+ if (trap == lower_new_dentry) {
+ err = -ENOTEMPTY;
+ goto out;
+ }
+
+ err = vfs_rename2(lower_mnt,
+ d_inode(lower_old_dir_dentry), lower_old_dentry,
+ d_inode(lower_new_dir_dentry), lower_new_dentry,
+ NULL, 0);
+ if (err)
+ goto out;
+
+ /* Copy attrs from lower dir, but i_uid/i_gid */
+ sdcardfs_copy_and_fix_attrs(new_dir, d_inode(lower_new_dir_dentry));
+ fsstack_copy_inode_size(new_dir, d_inode(lower_new_dir_dentry));
+
+ if (new_dir != old_dir) {
+ sdcardfs_copy_and_fix_attrs(old_dir, d_inode(lower_old_dir_dentry));
+ fsstack_copy_inode_size(old_dir, d_inode(lower_old_dir_dentry));
+ }
+ get_derived_permission_new(new_dentry->d_parent, old_dentry, &new_dentry->d_name);
+ fixup_tmp_permissions(d_inode(old_dentry));
+ fixup_lower_ownership(old_dentry, new_dentry->d_name.name);
+ d_invalidate(old_dentry); /* Can't fixup ownership recursively :( */
+out:
+ unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+ dput(lower_old_dir_dentry);
+ dput(lower_new_dir_dentry);
+ sdcardfs_put_real_lower(old_dentry, &lower_old_path);
+ sdcardfs_put_lower_path(new_dentry, &lower_new_path);
+ revert_fsids(saved_cred);
+out_eacces:
+ return err;
+}
+
+#if 0
+static int sdcardfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+{
+ int err;
+ struct dentry *lower_dentry;
+ struct path lower_path;
+ /* XXX readlink does not requires overriding credential */
+
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ lower_dentry = lower_path.dentry;
+ if (!d_inode(lower_dentry)->i_op ||
+ !d_inode(lower_dentry)->i_op->readlink) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = d_inode(lower_dentry)->i_op->readlink(lower_dentry,
+ buf, bufsiz);
+ if (err < 0)
+ goto out;
+ fsstack_copy_attr_atime(d_inode(dentry), d_inode(lower_dentry));
+
+out:
+ sdcardfs_put_lower_path(dentry, &lower_path);
+ return err;
+}
+#endif
+
+#if 0
+static const char *sdcardfs_follow_link(struct dentry *dentry, void **cookie)
+{
+ char *buf;
+ int len = PAGE_SIZE, err;
+ mm_segment_t old_fs;
+
+ /* This is freed by the put_link method assuming a successful call. */
+ buf = kmalloc(len, GFP_KERNEL);
+ if (!buf) {
+ buf = ERR_PTR(-ENOMEM);
+ return buf;
+ }
+
+ /* read the symlink, and then we will follow it */
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sdcardfs_readlink(dentry, buf, len);
+ set_fs(old_fs);
+ if (err < 0) {
+ kfree(buf);
+ buf = ERR_PTR(err);
+ } else {
+ buf[err] = '\0';
+ }
+ return *cookie = buf;
+}
+#endif
+
+static int sdcardfs_permission_wrn(struct inode *inode, int mask)
+{
+ WARN_RATELIMIT(1, "sdcardfs does not support permission. Use permission2.\n");
+ return -EINVAL;
+}
+
+void copy_attrs(struct inode *dest, const struct inode *src)
+{
+ dest->i_mode = src->i_mode;
+ dest->i_uid = src->i_uid;
+ dest->i_gid = src->i_gid;
+ dest->i_rdev = src->i_rdev;
+ dest->i_atime = src->i_atime;
+ dest->i_mtime = src->i_mtime;
+ dest->i_ctime = src->i_ctime;
+ dest->i_blkbits = src->i_blkbits;
+ dest->i_flags = src->i_flags;
+#ifdef CONFIG_FS_POSIX_ACL
+ dest->i_acl = src->i_acl;
+#endif
+#ifdef CONFIG_SECURITY
+ dest->i_security = src->i_security;
+#endif
+}
+
+static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int mask)
+{
+ int err;
+ struct inode tmp;
+ struct sdcardfs_inode_data *top = top_data_get(SDCARDFS_I(inode));
+
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+ if (!top)
+ return -EINVAL;
+
+ /*
+ * Permission check on sdcardfs inode.
+ * Calling process should have AID_SDCARD_RW permission
+ * Since generic_permission only needs i_mode, i_uid,
+ * i_gid, and i_sb, we can create a fake inode to pass
+ * this information down in.
+ *
+ * The underlying code may attempt to take locks in some
+ * cases for features we're not using, but if that changes,
+ * locks must be dealt with to avoid undefined behavior.
+ */
+ copy_attrs(&tmp, inode);
+ tmp.i_uid = make_kuid(&init_user_ns, top->d_uid);
+ tmp.i_gid = make_kgid(&init_user_ns, get_gid(mnt, inode->i_sb, top));
+ tmp.i_mode = (inode->i_mode & S_IFMT)
+ | get_mode(mnt, SDCARDFS_I(inode), top);
+ data_put(top);
+ tmp.i_sb = inode->i_sb;
+ if (IS_POSIXACL(inode))
+ pr_warn("%s: This may be undefined behavior...\n", __func__);
+ err = generic_permission(&tmp, mask);
+ return err;
+}
+
+static int sdcardfs_setattr_wrn(struct dentry *dentry, struct iattr *ia)
+{
+ WARN_RATELIMIT(1, "sdcardfs does not support setattr. User setattr2.\n");
+ return -EINVAL;
+}
+
+static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct iattr *ia)
+{
+ int err;
+ struct dentry *lower_dentry;
+ struct vfsmount *lower_mnt;
+ struct inode *inode;
+ struct inode *lower_inode;
+ struct path lower_path;
+ struct iattr lower_ia;
+ struct dentry *parent;
+ struct inode tmp;
+ struct dentry tmp_d;
+ struct sdcardfs_inode_data *top;
+
+ const struct cred *saved_cred = NULL;
+
+ inode = d_inode(dentry);
+ top = top_data_get(SDCARDFS_I(inode));
+
+ if (!top)
+ return -EINVAL;
+
+ /*
+ * Permission check on sdcardfs inode.
+ * Calling process should have AID_SDCARD_RW permission
+ * Since generic_permission only needs i_mode, i_uid,
+ * i_gid, and i_sb, we can create a fake inode to pass
+ * this information down in.
+ *
+ * The underlying code may attempt to take locks in some
+ * cases for features we're not using, but if that changes,
+ * locks must be dealt with to avoid undefined behavior.
+ *
+ */
+ copy_attrs(&tmp, inode);
+ tmp.i_uid = make_kuid(&init_user_ns, top->d_uid);
+ tmp.i_gid = make_kgid(&init_user_ns, get_gid(mnt, dentry->d_sb, top));
+ tmp.i_mode = (inode->i_mode & S_IFMT)
+ | get_mode(mnt, SDCARDFS_I(inode), top);
+ tmp.i_size = i_size_read(inode);
+ data_put(top);
+ tmp.i_sb = inode->i_sb;
+ tmp_d.d_inode = &tmp;
+
+ /*
+ * Check if user has permission to change dentry. We don't check if
+ * this user can change the lower inode: that should happen when
+ * calling notify_change on the lower inode.
+ */
+ /* prepare our own lower struct iattr (with the lower file) */
+ memcpy(&lower_ia, ia, sizeof(lower_ia));
+ /* Allow touch updating timestamps. A previous permission check ensures
+ * we have write access. Changes to mode, owner, and group are ignored
+ */
+ ia->ia_valid |= ATTR_FORCE;
+ err = setattr_prepare(&tmp_d, ia);
+
+ if (!err) {
+ /* check the Android group ID */
+ parent = dget_parent(dentry);
+ if (!check_caller_access_to_name(d_inode(parent), &dentry->d_name))
+ err = -EACCES;
+ dput(parent);
+ }
+
+ if (err)
+ goto out_err;
+
+ /* save current_cred and override it */
+ saved_cred = override_fsids(SDCARDFS_SB(dentry->d_sb),
+ SDCARDFS_I(inode)->data);
+ if (!saved_cred)
+ return -ENOMEM;
+
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ lower_dentry = lower_path.dentry;
+ lower_mnt = lower_path.mnt;
+ lower_inode = sdcardfs_lower_inode(inode);
+
+ if (ia->ia_valid & ATTR_FILE)
+ lower_ia.ia_file = sdcardfs_lower_file(ia->ia_file);
+
+ lower_ia.ia_valid &= ~(ATTR_UID | ATTR_GID | ATTR_MODE);
+
+ /*
+ * If shrinking, first truncate upper level to cancel writing dirty
+ * pages beyond the new eof; and also if its' maxbytes is more
+ * limiting (fail with -EFBIG before making any change to the lower
+ * level). There is no need to vmtruncate the upper level
+ * afterwards in the other cases: we fsstack_copy_inode_size from
+ * the lower level.
+ */
+ if (ia->ia_valid & ATTR_SIZE) {
+ err = inode_newsize_ok(&tmp, ia->ia_size);
+ if (err) {
+ goto out;
+ }
+ truncate_setsize(inode, ia->ia_size);
+ }
+
+ /*
+ * mode change is for clearing setuid/setgid bits. Allow lower fs
+ * to interpret this in its own way.
+ */
+ if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
+ lower_ia.ia_valid &= ~ATTR_MODE;
+
+ /* notify the (possibly copied-up) lower inode */
+ /*
+ * Note: we use d_inode(lower_dentry), because lower_inode may be
+ * unlinked (no inode->i_sb and i_ino==0. This happens if someone
+ * tries to open(), unlink(), then ftruncate() a file.
+ */
+ inode_lock(d_inode(lower_dentry));
+ err = notify_change2(lower_mnt, lower_dentry, &lower_ia, /* note: lower_ia */
+ NULL);
+ inode_unlock(d_inode(lower_dentry));
+ if (err)
+ goto out;
+
+ /* get attributes from the lower inode and update derived permissions */
+ sdcardfs_copy_and_fix_attrs(inode, lower_inode);
+
+ /*
+ * Not running fsstack_copy_inode_size(inode, lower_inode), because
+ * VFS should update our inode size, and notify_change on
+ * lower_inode should update its size.
+ */
+
+out:
+ sdcardfs_put_lower_path(dentry, &lower_path);
+ revert_fsids(saved_cred);
+out_err:
+ return err;
+}
+
+static int sdcardfs_fillattr(struct vfsmount *mnt, struct inode *inode,
+ struct kstat *lower_stat, struct kstat *stat)
+{
+ struct sdcardfs_inode_info *info = SDCARDFS_I(inode);
+ struct sdcardfs_inode_data *top = top_data_get(info);
+ struct super_block *sb = inode->i_sb;
+
+ if (!top)
+ return -EINVAL;
+
+ stat->dev = inode->i_sb->s_dev;
+ stat->ino = inode->i_ino;
+ stat->mode = (inode->i_mode & S_IFMT) | get_mode(mnt, info, top);
+ stat->nlink = inode->i_nlink;
+ stat->uid = make_kuid(&init_user_ns, top->d_uid);
+ stat->gid = make_kgid(&init_user_ns, get_gid(mnt, sb, top));
+ stat->rdev = inode->i_rdev;
+ stat->size = lower_stat->size;
+ stat->atime = lower_stat->atime;
+ stat->mtime = lower_stat->mtime;
+ stat->ctime = lower_stat->ctime;
+ stat->blksize = lower_stat->blksize;
+ stat->blocks = lower_stat->blocks;
+ data_put(top);
+ return 0;
+}
+
+static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct kstat lower_stat;
+ struct path lower_path;
+ struct dentry *parent;
+ int err;
+
+ parent = dget_parent(dentry);
+ if (!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) {
+ dput(parent);
+ return -EACCES;
+ }
+ dput(parent);
+
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ err = vfs_getattr(&lower_path, &lower_stat);
+ if (err)
+ goto out;
+ sdcardfs_copy_and_fix_attrs(d_inode(dentry),
+ d_inode(lower_path.dentry));
+ err = sdcardfs_fillattr(mnt, d_inode(dentry), &lower_stat, stat);
+out:
+ sdcardfs_put_lower_path(dentry, &lower_path);
+ return err;
+}
+
+const struct inode_operations sdcardfs_symlink_iops = {
+ .permission2 = sdcardfs_permission,
+ .setattr2 = sdcardfs_setattr,
+ /* XXX Following operations are implemented,
+ * but FUSE(sdcard) or FAT does not support them
+ * These methods are *NOT* perfectly tested.
+ .readlink = sdcardfs_readlink,
+ .follow_link = sdcardfs_follow_link,
+ .put_link = kfree_put_link,
+ */
+};
+
+const struct inode_operations sdcardfs_dir_iops = {
+ .create = sdcardfs_create,
+ .lookup = sdcardfs_lookup,
+ .permission = sdcardfs_permission_wrn,
+ .permission2 = sdcardfs_permission,
+ .unlink = sdcardfs_unlink,
+ .mkdir = sdcardfs_mkdir,
+ .rmdir = sdcardfs_rmdir,
+ .rename = sdcardfs_rename,
+ .setattr = sdcardfs_setattr_wrn,
+ .setattr2 = sdcardfs_setattr,
+ .getattr = sdcardfs_getattr,
+};
+
+const struct inode_operations sdcardfs_main_iops = {
+ .permission = sdcardfs_permission_wrn,
+ .permission2 = sdcardfs_permission,
+ .setattr = sdcardfs_setattr_wrn,
+ .setattr2 = sdcardfs_setattr,
+ .getattr = sdcardfs_getattr,
+};
diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c
new file mode 100644
index 000000000000..beec63b107f4
--- /dev/null
+++ b/fs/sdcardfs/lookup.c
@@ -0,0 +1,469 @@
+/*
+ * fs/sdcardfs/lookup.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include "linux/delay.h"
+
+/* The dentry cache is just so we have properly sized dentries */
+static struct kmem_cache *sdcardfs_dentry_cachep;
+
+int sdcardfs_init_dentry_cache(void)
+{
+ sdcardfs_dentry_cachep =
+ kmem_cache_create("sdcardfs_dentry",
+ sizeof(struct sdcardfs_dentry_info),
+ 0, SLAB_RECLAIM_ACCOUNT, NULL);
+
+ return sdcardfs_dentry_cachep ? 0 : -ENOMEM;
+}
+
+void sdcardfs_destroy_dentry_cache(void)
+{
+ kmem_cache_destroy(sdcardfs_dentry_cachep);
+}
+
+void free_dentry_private_data(struct dentry *dentry)
+{
+ kmem_cache_free(sdcardfs_dentry_cachep, dentry->d_fsdata);
+ dentry->d_fsdata = NULL;
+}
+
+/* allocate new dentry private data */
+int new_dentry_private_data(struct dentry *dentry)
+{
+ struct sdcardfs_dentry_info *info = SDCARDFS_D(dentry);
+
+ /* use zalloc to init dentry_info.lower_path */
+ info = kmem_cache_zalloc(sdcardfs_dentry_cachep, GFP_ATOMIC);
+ if (!info)
+ return -ENOMEM;
+
+ spin_lock_init(&info->lock);
+ dentry->d_fsdata = info;
+
+ return 0;
+}
+
+struct inode_data {
+ struct inode *lower_inode;
+ userid_t id;
+};
+
+static int sdcardfs_inode_test(struct inode *inode, void *candidate_data/*void *candidate_lower_inode*/)
+{
+ struct inode *current_lower_inode = sdcardfs_lower_inode(inode);
+ userid_t current_userid = SDCARDFS_I(inode)->data->userid;
+
+ if (current_lower_inode == ((struct inode_data *)candidate_data)->lower_inode &&
+ current_userid == ((struct inode_data *)candidate_data)->id)
+ return 1; /* found a match */
+ else
+ return 0; /* no match */
+}
+
+static int sdcardfs_inode_set(struct inode *inode, void *lower_inode)
+{
+ /* we do actual inode initialization in sdcardfs_iget */
+ return 0;
+}
+
+struct inode *sdcardfs_iget(struct super_block *sb, struct inode *lower_inode, userid_t id)
+{
+ struct sdcardfs_inode_info *info;
+ struct inode_data data;
+ struct inode *inode; /* the new inode to return */
+
+ if (!igrab(lower_inode))
+ return ERR_PTR(-ESTALE);
+
+ data.id = id;
+ data.lower_inode = lower_inode;
+ inode = iget5_locked(sb, /* our superblock */
+ /*
+ * hashval: we use inode number, but we can
+ * also use "(unsigned long)lower_inode"
+ * instead.
+ */
+ lower_inode->i_ino, /* hashval */
+ sdcardfs_inode_test, /* inode comparison function */
+ sdcardfs_inode_set, /* inode init function */
+ &data); /* data passed to test+set fxns */
+ if (!inode) {
+ iput(lower_inode);
+ return ERR_PTR(-ENOMEM);
+ }
+ /* if found a cached inode, then just return it (after iput) */
+ if (!(inode->i_state & I_NEW)) {
+ iput(lower_inode);
+ return inode;
+ }
+
+ /* initialize new inode */
+ info = SDCARDFS_I(inode);
+
+ inode->i_ino = lower_inode->i_ino;
+ sdcardfs_set_lower_inode(inode, lower_inode);
+
+ inode->i_version++;
+
+ /* use different set of inode ops for symlinks & directories */
+ if (S_ISDIR(lower_inode->i_mode))
+ inode->i_op = &sdcardfs_dir_iops;
+ else if (S_ISLNK(lower_inode->i_mode))
+ inode->i_op = &sdcardfs_symlink_iops;
+ else
+ inode->i_op = &sdcardfs_main_iops;
+
+ /* use different set of file ops for directories */
+ if (S_ISDIR(lower_inode->i_mode))
+ inode->i_fop = &sdcardfs_dir_fops;
+ else
+ inode->i_fop = &sdcardfs_main_fops;
+
+ inode->i_mapping->a_ops = &sdcardfs_aops;
+
+ inode->i_atime.tv_sec = 0;
+ inode->i_atime.tv_nsec = 0;
+ inode->i_mtime.tv_sec = 0;
+ inode->i_mtime.tv_nsec = 0;
+ inode->i_ctime.tv_sec = 0;
+ inode->i_ctime.tv_nsec = 0;
+
+ /* properly initialize special inodes */
+ if (S_ISBLK(lower_inode->i_mode) || S_ISCHR(lower_inode->i_mode) ||
+ S_ISFIFO(lower_inode->i_mode) || S_ISSOCK(lower_inode->i_mode))
+ init_special_inode(inode, lower_inode->i_mode,
+ lower_inode->i_rdev);
+
+ /* all well, copy inode attributes */
+ sdcardfs_copy_and_fix_attrs(inode, lower_inode);
+ fsstack_copy_inode_size(inode, lower_inode);
+
+ unlock_new_inode(inode);
+ return inode;
+}
+
+/*
+ * Helper interpose routine, called directly by ->lookup to handle
+ * spliced dentries.
+ */
+static struct dentry *__sdcardfs_interpose(struct dentry *dentry,
+ struct super_block *sb,
+ struct path *lower_path,
+ userid_t id)
+{
+ struct inode *inode;
+ struct inode *lower_inode;
+ struct super_block *lower_sb;
+ struct dentry *ret_dentry;
+
+ lower_inode = d_inode(lower_path->dentry);
+ lower_sb = sdcardfs_lower_super(sb);
+
+ /* check that the lower file system didn't cross a mount point */
+ if (lower_inode->i_sb != lower_sb) {
+ ret_dentry = ERR_PTR(-EXDEV);
+ goto out;
+ }
+
+ /*
+ * We allocate our new inode below by calling sdcardfs_iget,
+ * which will initialize some of the new inode's fields
+ */
+
+ /* inherit lower inode number for sdcardfs's inode */
+ inode = sdcardfs_iget(sb, lower_inode, id);
+ if (IS_ERR(inode)) {
+ ret_dentry = ERR_CAST(inode);
+ goto out;
+ }
+
+ ret_dentry = d_splice_alias(inode, dentry);
+ dentry = ret_dentry ?: dentry;
+ if (!IS_ERR(dentry))
+ update_derived_permission_lock(dentry);
+out:
+ return ret_dentry;
+}
+
+/*
+ * Connect an sdcardfs inode dentry/inode with several lower ones. This is
+ * the classic stackable file system "vnode interposition" action.
+ *
+ * @dentry: sdcardfs's dentry which interposes on lower one
+ * @sb: sdcardfs's super_block
+ * @lower_path: the lower path (caller does path_get/put)
+ */
+int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb,
+ struct path *lower_path, userid_t id)
+{
+ struct dentry *ret_dentry;
+
+ ret_dentry = __sdcardfs_interpose(dentry, sb, lower_path, id);
+ return PTR_ERR(ret_dentry);
+}
+
+struct sdcardfs_name_data {
+ struct dir_context ctx;
+ const struct qstr *to_find;
+ char *name;
+ bool found;
+};
+
+static int sdcardfs_name_match(struct dir_context *ctx, const char *name,
+ int namelen, loff_t offset, u64 ino, unsigned int d_type)
+{
+ struct sdcardfs_name_data *buf = container_of(ctx, struct sdcardfs_name_data, ctx);
+ struct qstr candidate = QSTR_INIT(name, namelen);
+
+ if (qstr_case_eq(buf->to_find, &candidate)) {
+ memcpy(buf->name, name, namelen);
+ buf->name[namelen] = 0;
+ buf->found = true;
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Main driver function for sdcardfs's lookup.
+ *
+ * Returns: NULL (ok), ERR_PTR if an error occurred.
+ * Fills in lower_parent_path with <dentry,mnt> on success.
+ */
+static struct dentry *__sdcardfs_lookup(struct dentry *dentry,
+ unsigned int flags, struct path *lower_parent_path, userid_t id)
+{
+ int err = 0;
+ struct vfsmount *lower_dir_mnt;
+ struct dentry *lower_dir_dentry = NULL;
+ struct dentry *lower_dentry;
+ const struct qstr *name;
+ struct path lower_path;
+ struct qstr dname;
+ struct dentry *ret_dentry = NULL;
+ struct sdcardfs_sb_info *sbi;
+
+ sbi = SDCARDFS_SB(dentry->d_sb);
+ /* must initialize dentry operations */
+ d_set_d_op(dentry, &sdcardfs_ci_dops);
+
+ if (IS_ROOT(dentry))
+ goto out;
+
+ name = &dentry->d_name;
+
+ /* now start the actual lookup procedure */
+ lower_dir_dentry = lower_parent_path->dentry;
+ lower_dir_mnt = lower_parent_path->mnt;
+
+ /* Use vfs_path_lookup to check if the dentry exists or not */
+ err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name->name, 0,
+ &lower_path);
+ /* check for other cases */
+ if (err == -ENOENT) {
+ struct file *file;
+ const struct cred *cred = current_cred();
+
+ struct sdcardfs_name_data buffer = {
+ .ctx.actor = sdcardfs_name_match,
+ .to_find = name,
+ .name = __getname(),
+ .found = false,
+ };
+
+ if (!buffer.name) {
+ err = -ENOMEM;
+ goto out;
+ }
+ file = dentry_open(lower_parent_path, O_RDONLY, cred);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto put_name;
+ }
+ err = iterate_dir(file, &buffer.ctx);
+ fput(file);
+ if (err)
+ goto put_name;
+
+ if (buffer.found)
+ err = vfs_path_lookup(lower_dir_dentry,
+ lower_dir_mnt,
+ buffer.name, 0,
+ &lower_path);
+ else
+ err = -ENOENT;
+put_name:
+ __putname(buffer.name);
+ }
+
+ /* no error: handle positive dentries */
+ if (!err) {
+ /* check if the dentry is an obb dentry
+ * if true, the lower_inode must be replaced with
+ * the inode of the graft path
+ */
+
+ if (need_graft_path(dentry)) {
+
+ /* setup_obb_dentry()
+ * The lower_path will be stored to the dentry's orig_path
+ * and the base obbpath will be copyed to the lower_path variable.
+ * if an error returned, there's no change in the lower_path
+ * returns: -ERRNO if error (0: no error)
+ */
+ err = setup_obb_dentry(dentry, &lower_path);
+
+ if (err) {
+ /* if the sbi->obbpath is not available, we can optionally
+ * setup the lower_path with its orig_path.
+ * but, the current implementation just returns an error
+ * because the sdcard daemon also regards this case as
+ * a lookup fail.
+ */
+ pr_info("sdcardfs: base obbpath is not available\n");
+ sdcardfs_put_reset_orig_path(dentry);
+ goto out;
+ }
+ }
+
+ sdcardfs_set_lower_path(dentry, &lower_path);
+ ret_dentry =
+ __sdcardfs_interpose(dentry, dentry->d_sb, &lower_path, id);
+ if (IS_ERR(ret_dentry)) {
+ err = PTR_ERR(ret_dentry);
+ /* path_put underlying path on error */
+ sdcardfs_put_reset_lower_path(dentry);
+ }
+ goto out;
+ }
+
+ /*
+ * We don't consider ENOENT an error, and we want to return a
+ * negative dentry.
+ */
+ if (err && err != -ENOENT)
+ goto out;
+
+ /* instatiate a new negative dentry */
+ dname.name = name->name;
+ dname.len = name->len;
+
+ /* See if the low-level filesystem might want
+ * to use its own hash
+ */
+ lower_dentry = d_hash_and_lookup(lower_dir_dentry, &dname);
+ if (IS_ERR(lower_dentry))
+ return lower_dentry;
+ if (!lower_dentry) {
+ /* We called vfs_path_lookup earlier, and did not get a negative
+ * dentry then. Don't confuse the lower filesystem by forcing
+ * one on it now...
+ */
+ err = -ENOENT;
+ goto out;
+ }
+
+ lower_path.dentry = lower_dentry;
+ lower_path.mnt = mntget(lower_dir_mnt);
+ sdcardfs_set_lower_path(dentry, &lower_path);
+
+ /*
+ * If the intent is to create a file, then don't return an error, so
+ * the VFS will continue the process of making this negative dentry
+ * into a positive one.
+ */
+ if (flags & (LOOKUP_CREATE|LOOKUP_RENAME_TARGET))
+ err = 0;
+
+out:
+ if (err)
+ return ERR_PTR(err);
+ return ret_dentry;
+}
+
+/*
+ * On success:
+ * fills dentry object appropriate values and returns NULL.
+ * On fail (== error)
+ * returns error ptr
+ *
+ * @dir : Parent inode.
+ * @dentry : Target dentry to lookup. we should set each of fields.
+ * (dentry->d_name is initialized already)
+ * @nd : nameidata of parent inode
+ */
+struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct dentry *ret = NULL, *parent;
+ struct path lower_parent_path;
+ int err = 0;
+ const struct cred *saved_cred = NULL;
+
+ parent = dget_parent(dentry);
+
+ if (!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) {
+ ret = ERR_PTR(-EACCES);
+ goto out_err;
+ }
+
+ /* save current_cred and override it */
+ saved_cred = override_fsids(SDCARDFS_SB(dir->i_sb),
+ SDCARDFS_I(dir)->data);
+ if (!saved_cred) {
+ ret = ERR_PTR(-ENOMEM);
+ goto out_err;
+ }
+
+ sdcardfs_get_lower_path(parent, &lower_parent_path);
+
+ /* allocate dentry private data. We free it in ->d_release */
+ err = new_dentry_private_data(dentry);
+ if (err) {
+ ret = ERR_PTR(err);
+ goto out;
+ }
+
+ ret = __sdcardfs_lookup(dentry, flags, &lower_parent_path,
+ SDCARDFS_I(dir)->data->userid);
+ if (IS_ERR(ret))
+ goto out;
+ if (ret)
+ dentry = ret;
+ if (d_inode(dentry)) {
+ fsstack_copy_attr_times(d_inode(dentry),
+ sdcardfs_lower_inode(d_inode(dentry)));
+ /* get derived permission */
+ get_derived_permission(parent, dentry);
+ fixup_tmp_permissions(d_inode(dentry));
+ fixup_lower_ownership(dentry, dentry->d_name.name);
+ }
+ /* update parent directory's atime */
+ fsstack_copy_attr_atime(d_inode(parent),
+ sdcardfs_lower_inode(d_inode(parent)));
+
+out:
+ sdcardfs_put_lower_path(parent, &lower_parent_path);
+ revert_fsids(saved_cred);
+out_err:
+ dput(parent);
+ return ret;
+}
diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c
new file mode 100644
index 000000000000..d890c5711907
--- /dev/null
+++ b/fs/sdcardfs/main.c
@@ -0,0 +1,511 @@
+/*
+ * fs/sdcardfs/main.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/parser.h>
+
+enum {
+ Opt_fsuid,
+ Opt_fsgid,
+ Opt_gid,
+ Opt_debug,
+ Opt_mask,
+ Opt_multiuser,
+ Opt_userid,
+ Opt_reserved_mb,
+ Opt_gid_derivation,
+ Opt_default_normal,
+ Opt_nocache,
+ Opt_unshared_obb,
+ Opt_err,
+};
+
+static const match_table_t sdcardfs_tokens = {
+ {Opt_fsuid, "fsuid=%u"},
+ {Opt_fsgid, "fsgid=%u"},
+ {Opt_gid, "gid=%u"},
+ {Opt_debug, "debug"},
+ {Opt_mask, "mask=%u"},
+ {Opt_userid, "userid=%d"},
+ {Opt_multiuser, "multiuser"},
+ {Opt_gid_derivation, "derive_gid"},
+ {Opt_default_normal, "default_normal"},
+ {Opt_unshared_obb, "unshared_obb"},
+ {Opt_reserved_mb, "reserved_mb=%u"},
+ {Opt_nocache, "nocache"},
+ {Opt_err, NULL}
+};
+
+static int parse_options(struct super_block *sb, char *options, int silent,
+ int *debug, struct sdcardfs_vfsmount_options *vfsopts,
+ struct sdcardfs_mount_options *opts)
+{
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int option;
+
+ /* by default, we use AID_MEDIA_RW as uid, gid */
+ opts->fs_low_uid = AID_MEDIA_RW;
+ opts->fs_low_gid = AID_MEDIA_RW;
+ vfsopts->mask = 0;
+ opts->multiuser = false;
+ opts->fs_user_id = 0;
+ vfsopts->gid = 0;
+ /* by default, 0MB is reserved */
+ opts->reserved_mb = 0;
+ /* by default, gid derivation is off */
+ opts->gid_derivation = false;
+ opts->default_normal = false;
+ opts->nocache = false;
+
+ *debug = 0;
+
+ if (!options)
+ return 0;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+
+ if (!*p)
+ continue;
+
+ token = match_token(p, sdcardfs_tokens, args);
+
+ switch (token) {
+ case Opt_debug:
+ *debug = 1;
+ break;
+ case Opt_fsuid:
+ if (match_int(&args[0], &option))
+ return 0;
+ opts->fs_low_uid = option;
+ break;
+ case Opt_fsgid:
+ if (match_int(&args[0], &option))
+ return 0;
+ opts->fs_low_gid = option;
+ break;
+ case Opt_gid:
+ if (match_int(&args[0], &option))
+ return 0;
+ vfsopts->gid = option;
+ break;
+ case Opt_userid:
+ if (match_int(&args[0], &option))
+ return 0;
+ opts->fs_user_id = option;
+ break;
+ case Opt_mask:
+ if (match_int(&args[0], &option))
+ return 0;
+ vfsopts->mask = option;
+ break;
+ case Opt_multiuser:
+ opts->multiuser = true;
+ break;
+ case Opt_reserved_mb:
+ if (match_int(&args[0], &option))
+ return 0;
+ opts->reserved_mb = option;
+ break;
+ case Opt_gid_derivation:
+ opts->gid_derivation = true;
+ break;
+ case Opt_default_normal:
+ opts->default_normal = true;
+ break;
+ case Opt_nocache:
+ opts->nocache = true;
+ break;
+ case Opt_unshared_obb:
+ opts->unshared_obb = true;
+ break;
+ /* unknown option */
+ default:
+ if (!silent)
+ pr_err("Unrecognized mount option \"%s\" or missing value", p);
+ return -EINVAL;
+ }
+ }
+
+ if (*debug) {
+ pr_info("sdcardfs : options - debug:%d\n", *debug);
+ pr_info("sdcardfs : options - uid:%d\n",
+ opts->fs_low_uid);
+ pr_info("sdcardfs : options - gid:%d\n",
+ opts->fs_low_gid);
+ }
+
+ return 0;
+}
+
+int parse_options_remount(struct super_block *sb, char *options, int silent,
+ struct sdcardfs_vfsmount_options *vfsopts)
+{
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int option;
+ int debug;
+
+ if (!options)
+ return 0;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+
+ if (!*p)
+ continue;
+
+ token = match_token(p, sdcardfs_tokens, args);
+
+ switch (token) {
+ case Opt_debug:
+ debug = 1;
+ break;
+ case Opt_gid:
+ if (match_int(&args[0], &option))
+ return 0;
+ vfsopts->gid = option;
+
+ break;
+ case Opt_mask:
+ if (match_int(&args[0], &option))
+ return 0;
+ vfsopts->mask = option;
+ break;
+ case Opt_unshared_obb:
+ case Opt_default_normal:
+ case Opt_multiuser:
+ case Opt_userid:
+ case Opt_fsuid:
+ case Opt_fsgid:
+ case Opt_reserved_mb:
+ case Opt_gid_derivation:
+ if (!silent)
+ pr_warn("Option \"%s\" can't be changed during remount\n", p);
+ break;
+ /* unknown option */
+ default:
+ if (!silent)
+ pr_err("Unrecognized mount option \"%s\" or missing value", p);
+ return -EINVAL;
+ }
+ }
+
+ if (debug) {
+ pr_info("sdcardfs : options - debug:%d\n", debug);
+ pr_info("sdcardfs : options - gid:%d\n", vfsopts->gid);
+ pr_info("sdcardfs : options - mask:%d\n", vfsopts->mask);
+ }
+
+ return 0;
+}
+
+#if 0
+/*
+ * our custom d_alloc_root work-alike
+ *
+ * we can't use d_alloc_root if we want to use our own interpose function
+ * unchanged, so we simply call our own "fake" d_alloc_root
+ */
+static struct dentry *sdcardfs_d_alloc_root(struct super_block *sb)
+{
+ struct dentry *ret = NULL;
+
+ if (sb) {
+ static const struct qstr name = {
+ .name = "/",
+ .len = 1
+ };
+
+ ret = d_alloc(NULL, &name);
+ if (ret) {
+ d_set_d_op(ret, &sdcardfs_ci_dops);
+ ret->d_sb = sb;
+ ret->d_parent = ret;
+ }
+ }
+ return ret;
+}
+#endif
+
+DEFINE_MUTEX(sdcardfs_super_list_lock);
+EXPORT_SYMBOL_GPL(sdcardfs_super_list_lock);
+LIST_HEAD(sdcardfs_super_list);
+EXPORT_SYMBOL_GPL(sdcardfs_super_list);
+
+/*
+ * There is no need to lock the sdcardfs_super_info's rwsem as there is no
+ * way anyone can have a reference to the superblock at this point in time.
+ */
+static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb,
+ const char *dev_name, void *raw_data, int silent)
+{
+ int err = 0;
+ int debug;
+ struct super_block *lower_sb;
+ struct path lower_path;
+ struct sdcardfs_sb_info *sb_info;
+ struct sdcardfs_vfsmount_options *mnt_opt = mnt->data;
+ struct inode *inode;
+
+ pr_info("sdcardfs version 2.0\n");
+
+ if (!dev_name) {
+ pr_err("sdcardfs: read_super: missing dev_name argument\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ pr_info("sdcardfs: dev_name -> %s\n", dev_name);
+ pr_info("sdcardfs: options -> %s\n", (char *)raw_data);
+ pr_info("sdcardfs: mnt -> %p\n", mnt);
+
+ /* parse lower path */
+ err = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
+ &lower_path);
+ if (err) {
+ pr_err("sdcardfs: error accessing lower directory '%s'\n", dev_name);
+ goto out;
+ }
+
+ /* allocate superblock private data */
+ sb->s_fs_info = kzalloc(sizeof(struct sdcardfs_sb_info), GFP_KERNEL);
+ if (!SDCARDFS_SB(sb)) {
+ pr_crit("sdcardfs: read_super: out of memory\n");
+ err = -ENOMEM;
+ goto out_free;
+ }
+
+ sb_info = sb->s_fs_info;
+ /* parse options */
+ err = parse_options(sb, raw_data, silent, &debug, mnt_opt, &sb_info->options);
+ if (err) {
+ pr_err("sdcardfs: invalid options\n");
+ goto out_freesbi;
+ }
+
+ /* set the lower superblock field of upper superblock */
+ lower_sb = lower_path.dentry->d_sb;
+ atomic_inc(&lower_sb->s_active);
+ sdcardfs_set_lower_super(sb, lower_sb);
+
+ sb->s_stack_depth = lower_sb->s_stack_depth + 1;
+ if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+ pr_err("sdcardfs: maximum fs stacking depth exceeded\n");
+ err = -EINVAL;
+ goto out_sput;
+ }
+
+ /* inherit maxbytes from lower file system */
+ sb->s_maxbytes = lower_sb->s_maxbytes;
+
+ /*
+ * Our c/m/atime granularity is 1 ns because we may stack on file
+ * systems whose granularity is as good.
+ */
+ sb->s_time_gran = 1;
+
+ sb->s_magic = SDCARDFS_SUPER_MAGIC;
+ sb->s_op = &sdcardfs_sops;
+
+ /* get a new inode and allocate our root dentry */
+ inode = sdcardfs_iget(sb, d_inode(lower_path.dentry), 0);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_sput;
+ }
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root) {
+ err = -ENOMEM;
+ goto out_sput;
+ }
+ d_set_d_op(sb->s_root, &sdcardfs_ci_dops);
+
+ /* link the upper and lower dentries */
+ sb->s_root->d_fsdata = NULL;
+ err = new_dentry_private_data(sb->s_root);
+ if (err)
+ goto out_freeroot;
+
+ /* set the lower dentries for s_root */
+ sdcardfs_set_lower_path(sb->s_root, &lower_path);
+
+ /*
+ * No need to call interpose because we already have a positive
+ * dentry, which was instantiated by d_make_root. Just need to
+ * d_rehash it.
+ */
+ d_rehash(sb->s_root);
+
+ /* setup permission policy */
+ sb_info->obbpath_s = kzalloc(PATH_MAX, GFP_KERNEL);
+ mutex_lock(&sdcardfs_super_list_lock);
+ if (sb_info->options.multiuser) {
+ setup_derived_state(d_inode(sb->s_root), PERM_PRE_ROOT,
+ sb_info->options.fs_user_id, AID_ROOT);
+ snprintf(sb_info->obbpath_s, PATH_MAX, "%s/obb", dev_name);
+ } else {
+ setup_derived_state(d_inode(sb->s_root), PERM_ROOT,
+ sb_info->options.fs_user_id, AID_ROOT);
+ snprintf(sb_info->obbpath_s, PATH_MAX, "%s/Android/obb", dev_name);
+ }
+ fixup_tmp_permissions(d_inode(sb->s_root));
+ sb_info->sb = sb;
+ list_add(&sb_info->list, &sdcardfs_super_list);
+ mutex_unlock(&sdcardfs_super_list_lock);
+
+ if (!silent)
+ pr_info("sdcardfs: mounted on top of %s type %s\n",
+ dev_name, lower_sb->s_type->name);
+ goto out; /* all is well */
+
+ /* no longer needed: free_dentry_private_data(sb->s_root); */
+out_freeroot:
+ dput(sb->s_root);
+ sb->s_root = NULL;
+out_sput:
+ /* drop refs we took earlier */
+ atomic_dec(&lower_sb->s_active);
+out_freesbi:
+ kfree(SDCARDFS_SB(sb));
+ sb->s_fs_info = NULL;
+out_free:
+ path_put(&lower_path);
+
+out:
+ return err;
+}
+
+struct sdcardfs_mount_private {
+ struct vfsmount *mnt;
+ const char *dev_name;
+ void *raw_data;
+};
+
+static int __sdcardfs_fill_super(
+ struct super_block *sb,
+ void *_priv, int silent)
+{
+ struct sdcardfs_mount_private *priv = _priv;
+
+ return sdcardfs_read_super(priv->mnt,
+ sb, priv->dev_name, priv->raw_data, silent);
+}
+
+static struct dentry *sdcardfs_mount(struct vfsmount *mnt,
+ struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *raw_data)
+{
+ struct sdcardfs_mount_private priv = {
+ .mnt = mnt,
+ .dev_name = dev_name,
+ .raw_data = raw_data
+ };
+
+ return mount_nodev(fs_type, flags,
+ &priv, __sdcardfs_fill_super);
+}
+
+static struct dentry *sdcardfs_mount_wrn(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *raw_data)
+{
+ WARN(1, "sdcardfs does not support mount. Use mount2.\n");
+ return ERR_PTR(-EINVAL);
+}
+
+void *sdcardfs_alloc_mnt_data(void)
+{
+ return kmalloc(sizeof(struct sdcardfs_vfsmount_options), GFP_KERNEL);
+}
+
+void sdcardfs_kill_sb(struct super_block *sb)
+{
+ struct sdcardfs_sb_info *sbi;
+
+ if (sb->s_magic == SDCARDFS_SUPER_MAGIC && sb->s_fs_info) {
+ sbi = SDCARDFS_SB(sb);
+ mutex_lock(&sdcardfs_super_list_lock);
+ list_del(&sbi->list);
+ mutex_unlock(&sdcardfs_super_list_lock);
+ }
+ kill_anon_super(sb);
+}
+
+static struct file_system_type sdcardfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = SDCARDFS_NAME,
+ .mount = sdcardfs_mount_wrn,
+ .mount2 = sdcardfs_mount,
+ .alloc_mnt_data = sdcardfs_alloc_mnt_data,
+ .kill_sb = sdcardfs_kill_sb,
+ .fs_flags = 0,
+};
+MODULE_ALIAS_FS(SDCARDFS_NAME);
+
+static int __init init_sdcardfs_fs(void)
+{
+ int err;
+
+ pr_info("Registering sdcardfs " SDCARDFS_VERSION "\n");
+
+ err = sdcardfs_init_inode_cache();
+ if (err)
+ goto out;
+ err = sdcardfs_init_dentry_cache();
+ if (err)
+ goto out;
+ err = packagelist_init();
+ if (err)
+ goto out;
+ err = register_filesystem(&sdcardfs_fs_type);
+out:
+ if (err) {
+ sdcardfs_destroy_inode_cache();
+ sdcardfs_destroy_dentry_cache();
+ packagelist_exit();
+ }
+ return err;
+}
+
+static void __exit exit_sdcardfs_fs(void)
+{
+ sdcardfs_destroy_inode_cache();
+ sdcardfs_destroy_dentry_cache();
+ packagelist_exit();
+ unregister_filesystem(&sdcardfs_fs_type);
+ pr_info("Completed sdcardfs module unload\n");
+}
+
+/* Original wrapfs authors */
+MODULE_AUTHOR("Erez Zadok, Filesystems and Storage Lab, Stony Brook University (http://www.fsl.cs.sunysb.edu/)");
+
+/* Original sdcardfs authors */
+MODULE_AUTHOR("Woojoong Lee, Daeho Jeong, Kitae Lee, Yeongjin Gil System Memory Lab., Samsung Electronics");
+
+/* Current maintainer */
+MODULE_AUTHOR("Daniel Rosenberg, Google");
+MODULE_DESCRIPTION("Sdcardfs " SDCARDFS_VERSION);
+MODULE_LICENSE("GPL");
+
+module_init(init_sdcardfs_fs);
+module_exit(exit_sdcardfs_fs);
diff --git a/fs/sdcardfs/mmap.c b/fs/sdcardfs/mmap.c
new file mode 100644
index 000000000000..391d2a7d10e9
--- /dev/null
+++ b/fs/sdcardfs/mmap.c
@@ -0,0 +1,88 @@
+/*
+ * fs/sdcardfs/mmap.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+
+static int sdcardfs_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ int err;
+ struct file *file;
+ const struct vm_operations_struct *lower_vm_ops;
+
+ file = (struct file *)vma->vm_private_data;
+ lower_vm_ops = SDCARDFS_F(file)->lower_vm_ops;
+ BUG_ON(!lower_vm_ops);
+
+ err = lower_vm_ops->fault(vma, vmf);
+ return err;
+}
+
+static void sdcardfs_vm_open(struct vm_area_struct *vma)
+{
+ struct file *file = (struct file *)vma->vm_private_data;
+
+ get_file(file);
+}
+
+static void sdcardfs_vm_close(struct vm_area_struct *vma)
+{
+ struct file *file = (struct file *)vma->vm_private_data;
+
+ fput(file);
+}
+
+static int sdcardfs_page_mkwrite(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ int err = 0;
+ struct file *file;
+ const struct vm_operations_struct *lower_vm_ops;
+
+ file = (struct file *)vma->vm_private_data;
+ lower_vm_ops = SDCARDFS_F(file)->lower_vm_ops;
+ BUG_ON(!lower_vm_ops);
+ if (!lower_vm_ops->page_mkwrite)
+ goto out;
+
+ err = lower_vm_ops->page_mkwrite(vma, vmf);
+out:
+ return err;
+}
+
+static ssize_t sdcardfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+ /*
+ * This function should never be called directly. We need it
+ * to exist, to get past a check in open_check_o_direct(),
+ * which is called from do_last().
+ */
+ return -EINVAL;
+}
+
+const struct address_space_operations sdcardfs_aops = {
+ .direct_IO = sdcardfs_direct_IO,
+};
+
+const struct vm_operations_struct sdcardfs_vm_ops = {
+ .fault = sdcardfs_fault,
+ .page_mkwrite = sdcardfs_page_mkwrite,
+ .open = sdcardfs_vm_open,
+ .close = sdcardfs_vm_close,
+};
diff --git a/fs/sdcardfs/multiuser.h b/fs/sdcardfs/multiuser.h
new file mode 100644
index 000000000000..85341e753f8c
--- /dev/null
+++ b/fs/sdcardfs/multiuser.h
@@ -0,0 +1,53 @@
+/*
+ * fs/sdcardfs/multiuser.h
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#define AID_USER_OFFSET 100000 /* offset for uid ranges for each user */
+#define AID_APP_START 10000 /* first app user */
+#define AID_APP_END 19999 /* last app user */
+#define AID_CACHE_GID_START 20000 /* start of gids for apps to mark cached data */
+#define AID_EXT_GID_START 30000 /* start of gids for apps to mark external data */
+#define AID_EXT_CACHE_GID_START 40000 /* start of gids for apps to mark external cached data */
+#define AID_EXT_CACHE_GID_END 49999 /* end of gids for apps to mark external cached data */
+#define AID_SHARED_GID_START 50000 /* start of gids for apps in each user to share */
+
+typedef uid_t userid_t;
+typedef uid_t appid_t;
+
+static inline uid_t multiuser_get_uid(userid_t user_id, appid_t app_id)
+{
+ return (user_id * AID_USER_OFFSET) + (app_id % AID_USER_OFFSET);
+}
+
+static inline bool uid_is_app(uid_t uid)
+{
+ appid_t appid = uid % AID_USER_OFFSET;
+
+ return appid >= AID_APP_START && appid <= AID_APP_END;
+}
+
+static inline gid_t multiuser_get_ext_cache_gid(uid_t uid)
+{
+ return uid - AID_APP_START + AID_EXT_CACHE_GID_START;
+}
+
+static inline gid_t multiuser_get_ext_gid(uid_t uid)
+{
+ return uid - AID_APP_START + AID_EXT_GID_START;
+}
diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c
new file mode 100644
index 000000000000..4b9a5635f1e0
--- /dev/null
+++ b/fs/sdcardfs/packagelist.c
@@ -0,0 +1,882 @@
+/*
+ * fs/sdcardfs/packagelist.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include <linux/hashtable.h>
+#include <linux/ctype.h>
+#include <linux/delay.h>
+#include <linux/radix-tree.h>
+#include <linux/dcache.h>
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+struct hashtable_entry {
+ struct hlist_node hlist;
+ struct hlist_node dlist; /* for deletion cleanup */
+ struct qstr key;
+ atomic_t value;
+};
+
+static DEFINE_HASHTABLE(package_to_appid, 8);
+static DEFINE_HASHTABLE(package_to_userid, 8);
+static DEFINE_HASHTABLE(ext_to_groupid, 8);
+
+
+static struct kmem_cache *hashtable_entry_cachep;
+
+static unsigned int full_name_case_hash(const void *salt, const unsigned char *name, unsigned int len)
+{
+ unsigned long hash = init_name_hash(salt);
+
+ while (len--)
+ hash = partial_name_hash(tolower(*name++), hash);
+ return end_name_hash(hash);
+}
+
+static inline void qstr_init(struct qstr *q, const char *name)
+{
+ q->name = name;
+ q->len = strlen(q->name);
+ q->hash = full_name_case_hash(0, q->name, q->len);
+}
+
+static inline int qstr_copy(const struct qstr *src, struct qstr *dest)
+{
+ dest->name = kstrdup(src->name, GFP_KERNEL);
+ dest->hash_len = src->hash_len;
+ return !!dest->name;
+}
+
+
+static appid_t __get_appid(const struct qstr *key)
+{
+ struct hashtable_entry *hash_cur;
+ unsigned int hash = key->hash;
+ appid_t ret_id;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key)) {
+ ret_id = atomic_read(&hash_cur->value);
+ rcu_read_unlock();
+ return ret_id;
+ }
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+appid_t get_appid(const char *key)
+{
+ struct qstr q;
+
+ qstr_init(&q, key);
+ return __get_appid(&q);
+}
+
+static appid_t __get_ext_gid(const struct qstr *key)
+{
+ struct hashtable_entry *hash_cur;
+ unsigned int hash = key->hash;
+ appid_t ret_id;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key)) {
+ ret_id = atomic_read(&hash_cur->value);
+ rcu_read_unlock();
+ return ret_id;
+ }
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+appid_t get_ext_gid(const char *key)
+{
+ struct qstr q;
+
+ qstr_init(&q, key);
+ return __get_ext_gid(&q);
+}
+
+static appid_t __is_excluded(const struct qstr *app_name, userid_t user)
+{
+ struct hashtable_entry *hash_cur;
+ unsigned int hash = app_name->hash;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) {
+ if (atomic_read(&hash_cur->value) == user &&
+ qstr_case_eq(app_name, &hash_cur->key)) {
+ rcu_read_unlock();
+ return 1;
+ }
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+appid_t is_excluded(const char *key, userid_t user)
+{
+ struct qstr q;
+ qstr_init(&q, key);
+ return __is_excluded(&q, user);
+}
+
+/* Kernel has already enforced everything we returned through
+ * derive_permissions_locked(), so this is used to lock down access
+ * even further, such as enforcing that apps hold sdcard_rw.
+ */
+int check_caller_access_to_name(struct inode *parent_node, const struct qstr *name)
+{
+ struct qstr q_autorun = QSTR_LITERAL("autorun.inf");
+ struct qstr q__android_secure = QSTR_LITERAL(".android_secure");
+ struct qstr q_android_secure = QSTR_LITERAL("android_secure");
+
+ /* Always block security-sensitive files at root */
+ if (parent_node && SDCARDFS_I(parent_node)->data->perm == PERM_ROOT) {
+ if (qstr_case_eq(name, &q_autorun)
+ || qstr_case_eq(name, &q__android_secure)
+ || qstr_case_eq(name, &q_android_secure)) {
+ return 0;
+ }
+ }
+
+ /* Root always has access; access for any other UIDs should always
+ * be controlled through packages.list.
+ */
+ if (from_kuid(&init_user_ns, current_fsuid()) == 0)
+ return 1;
+
+ /* No extra permissions to enforce */
+ return 1;
+}
+
+static struct hashtable_entry *alloc_hashtable_entry(const struct qstr *key,
+ appid_t value)
+{
+ struct hashtable_entry *ret = kmem_cache_alloc(hashtable_entry_cachep,
+ GFP_KERNEL);
+ if (!ret)
+ return NULL;
+ INIT_HLIST_NODE(&ret->dlist);
+ INIT_HLIST_NODE(&ret->hlist);
+
+ if (!qstr_copy(key, &ret->key)) {
+ kmem_cache_free(hashtable_entry_cachep, ret);
+ return NULL;
+ }
+
+ atomic_set(&ret->value, value);
+ return ret;
+}
+
+static int insert_packagelist_appid_entry_locked(const struct qstr *key, appid_t value)
+{
+ struct hashtable_entry *hash_cur;
+ struct hashtable_entry *new_entry;
+ unsigned int hash = key->hash;
+
+ hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key)) {
+ atomic_set(&hash_cur->value, value);
+ return 0;
+ }
+ }
+ new_entry = alloc_hashtable_entry(key, value);
+ if (!new_entry)
+ return -ENOMEM;
+ hash_add_rcu(package_to_appid, &new_entry->hlist, hash);
+ return 0;
+}
+
+static int insert_ext_gid_entry_locked(const struct qstr *key, appid_t value)
+{
+ struct hashtable_entry *hash_cur;
+ struct hashtable_entry *new_entry;
+ unsigned int hash = key->hash;
+
+ /* An extension can only belong to one gid */
+ hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key))
+ return -EINVAL;
+ }
+ new_entry = alloc_hashtable_entry(key, value);
+ if (!new_entry)
+ return -ENOMEM;
+ hash_add_rcu(ext_to_groupid, &new_entry->hlist, hash);
+ return 0;
+}
+
+static int insert_userid_exclude_entry_locked(const struct qstr *key, userid_t value)
+{
+ struct hashtable_entry *hash_cur;
+ struct hashtable_entry *new_entry;
+ unsigned int hash = key->hash;
+
+ /* Only insert if not already present */
+ hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) {
+ if (atomic_read(&hash_cur->value) == value &&
+ qstr_case_eq(key, &hash_cur->key))
+ return 0;
+ }
+ new_entry = alloc_hashtable_entry(key, value);
+ if (!new_entry)
+ return -ENOMEM;
+ hash_add_rcu(package_to_userid, &new_entry->hlist, hash);
+ return 0;
+}
+
+static void fixup_all_perms_name(const struct qstr *key)
+{
+ struct sdcardfs_sb_info *sbinfo;
+ struct limit_search limit = {
+ .flags = BY_NAME,
+ .name = QSTR_INIT(key->name, key->len),
+ };
+ list_for_each_entry(sbinfo, &sdcardfs_super_list, list) {
+ if (sbinfo_has_sdcard_magic(sbinfo))
+ fixup_perms_recursive(sbinfo->sb->s_root, &limit);
+ }
+}
+
+static void fixup_all_perms_name_userid(const struct qstr *key, userid_t userid)
+{
+ struct sdcardfs_sb_info *sbinfo;
+ struct limit_search limit = {
+ .flags = BY_NAME | BY_USERID,
+ .name = QSTR_INIT(key->name, key->len),
+ .userid = userid,
+ };
+ list_for_each_entry(sbinfo, &sdcardfs_super_list, list) {
+ if (sbinfo_has_sdcard_magic(sbinfo))
+ fixup_perms_recursive(sbinfo->sb->s_root, &limit);
+ }
+}
+
+static void fixup_all_perms_userid(userid_t userid)
+{
+ struct sdcardfs_sb_info *sbinfo;
+ struct limit_search limit = {
+ .flags = BY_USERID,
+ .userid = userid,
+ };
+ list_for_each_entry(sbinfo, &sdcardfs_super_list, list) {
+ if (sbinfo_has_sdcard_magic(sbinfo))
+ fixup_perms_recursive(sbinfo->sb->s_root, &limit);
+ }
+}
+
+static int insert_packagelist_entry(const struct qstr *key, appid_t value)
+{
+ int err;
+
+ mutex_lock(&sdcardfs_super_list_lock);
+ err = insert_packagelist_appid_entry_locked(key, value);
+ if (!err)
+ fixup_all_perms_name(key);
+ mutex_unlock(&sdcardfs_super_list_lock);
+
+ return err;
+}
+
+static int insert_ext_gid_entry(const struct qstr *key, appid_t value)
+{
+ int err;
+
+ mutex_lock(&sdcardfs_super_list_lock);
+ err = insert_ext_gid_entry_locked(key, value);
+ mutex_unlock(&sdcardfs_super_list_lock);
+
+ return err;
+}
+
+static int insert_userid_exclude_entry(const struct qstr *key, userid_t value)
+{
+ int err;
+
+ mutex_lock(&sdcardfs_super_list_lock);
+ err = insert_userid_exclude_entry_locked(key, value);
+ if (!err)
+ fixup_all_perms_name_userid(key, value);
+ mutex_unlock(&sdcardfs_super_list_lock);
+
+ return err;
+}
+
+static void free_hashtable_entry(struct hashtable_entry *entry)
+{
+ kfree(entry->key.name);
+ kmem_cache_free(hashtable_entry_cachep, entry);
+}
+
+static void remove_packagelist_entry_locked(const struct qstr *key)
+{
+ struct hashtable_entry *hash_cur;
+ unsigned int hash = key->hash;
+ struct hlist_node *h_t;
+ HLIST_HEAD(free_list);
+
+ hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key)) {
+ hash_del_rcu(&hash_cur->hlist);
+ hlist_add_head(&hash_cur->dlist, &free_list);
+ }
+ }
+ hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key)) {
+ hash_del_rcu(&hash_cur->hlist);
+ hlist_add_head(&hash_cur->dlist, &free_list);
+ break;
+ }
+ }
+ synchronize_rcu();
+ hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist)
+ free_hashtable_entry(hash_cur);
+}
+
+static void remove_packagelist_entry(const struct qstr *key)
+{
+ mutex_lock(&sdcardfs_super_list_lock);
+ remove_packagelist_entry_locked(key);
+ fixup_all_perms_name(key);
+ mutex_unlock(&sdcardfs_super_list_lock);
+}
+
+static void remove_ext_gid_entry_locked(const struct qstr *key, gid_t group)
+{
+ struct hashtable_entry *hash_cur;
+ unsigned int hash = key->hash;
+
+ hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key) && atomic_read(&hash_cur->value) == group) {
+ hash_del_rcu(&hash_cur->hlist);
+ synchronize_rcu();
+ free_hashtable_entry(hash_cur);
+ break;
+ }
+ }
+}
+
+static void remove_ext_gid_entry(const struct qstr *key, gid_t group)
+{
+ mutex_lock(&sdcardfs_super_list_lock);
+ remove_ext_gid_entry_locked(key, group);
+ mutex_unlock(&sdcardfs_super_list_lock);
+}
+
+static void remove_userid_all_entry_locked(userid_t userid)
+{
+ struct hashtable_entry *hash_cur;
+ struct hlist_node *h_t;
+ HLIST_HEAD(free_list);
+ int i;
+
+ hash_for_each_rcu(package_to_userid, i, hash_cur, hlist) {
+ if (atomic_read(&hash_cur->value) == userid) {
+ hash_del_rcu(&hash_cur->hlist);
+ hlist_add_head(&hash_cur->dlist, &free_list);
+ }
+ }
+ synchronize_rcu();
+ hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist) {
+ free_hashtable_entry(hash_cur);
+ }
+}
+
+static void remove_userid_all_entry(userid_t userid)
+{
+ mutex_lock(&sdcardfs_super_list_lock);
+ remove_userid_all_entry_locked(userid);
+ fixup_all_perms_userid(userid);
+ mutex_unlock(&sdcardfs_super_list_lock);
+}
+
+static void remove_userid_exclude_entry_locked(const struct qstr *key, userid_t userid)
+{
+ struct hashtable_entry *hash_cur;
+ unsigned int hash = key->hash;
+
+ hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key) &&
+ atomic_read(&hash_cur->value) == userid) {
+ hash_del_rcu(&hash_cur->hlist);
+ synchronize_rcu();
+ free_hashtable_entry(hash_cur);
+ break;
+ }
+ }
+}
+
+static void remove_userid_exclude_entry(const struct qstr *key, userid_t userid)
+{
+ mutex_lock(&sdcardfs_super_list_lock);
+ remove_userid_exclude_entry_locked(key, userid);
+ fixup_all_perms_name_userid(key, userid);
+ mutex_unlock(&sdcardfs_super_list_lock);
+}
+
+static void packagelist_destroy(void)
+{
+ struct hashtable_entry *hash_cur;
+ struct hlist_node *h_t;
+ HLIST_HEAD(free_list);
+ int i;
+
+ mutex_lock(&sdcardfs_super_list_lock);
+ hash_for_each_rcu(package_to_appid, i, hash_cur, hlist) {
+ hash_del_rcu(&hash_cur->hlist);
+ hlist_add_head(&hash_cur->dlist, &free_list);
+ }
+ hash_for_each_rcu(package_to_userid, i, hash_cur, hlist) {
+ hash_del_rcu(&hash_cur->hlist);
+ hlist_add_head(&hash_cur->dlist, &free_list);
+ }
+ synchronize_rcu();
+ hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist)
+ free_hashtable_entry(hash_cur);
+ mutex_unlock(&sdcardfs_super_list_lock);
+ pr_info("sdcardfs: destroyed packagelist pkgld\n");
+}
+
+#define SDCARDFS_CONFIGFS_ATTR(_pfx, _name) \
+static struct configfs_attribute _pfx##attr_##_name = { \
+ .ca_name = __stringify(_name), \
+ .ca_mode = S_IRUGO | S_IWUGO, \
+ .ca_owner = THIS_MODULE, \
+ .show = _pfx##_name##_show, \
+ .store = _pfx##_name##_store, \
+}
+
+#define SDCARDFS_CONFIGFS_ATTR_RO(_pfx, _name) \
+static struct configfs_attribute _pfx##attr_##_name = { \
+ .ca_name = __stringify(_name), \
+ .ca_mode = S_IRUGO, \
+ .ca_owner = THIS_MODULE, \
+ .show = _pfx##_name##_show, \
+}
+
+#define SDCARDFS_CONFIGFS_ATTR_WO(_pfx, _name) \
+static struct configfs_attribute _pfx##attr_##_name = { \
+ .ca_name = __stringify(_name), \
+ .ca_mode = S_IWUGO, \
+ .ca_owner = THIS_MODULE, \
+ .store = _pfx##_name##_store, \
+}
+
+struct package_details {
+ struct config_item item;
+ struct qstr name;
+};
+
+static inline struct package_details *to_package_details(struct config_item *item)
+{
+ return item ? container_of(item, struct package_details, item) : NULL;
+}
+
+static ssize_t package_details_appid_show(struct config_item *item, char *page)
+{
+ return scnprintf(page, PAGE_SIZE, "%u\n", __get_appid(&to_package_details(item)->name));
+}
+
+static ssize_t package_details_appid_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ unsigned int tmp;
+ int ret;
+
+ ret = kstrtouint(page, 10, &tmp);
+ if (ret)
+ return ret;
+
+ ret = insert_packagelist_entry(&to_package_details(item)->name, tmp);
+
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static ssize_t package_details_excluded_userids_show(struct config_item *item,
+ char *page)
+{
+ struct package_details *package_details = to_package_details(item);
+ struct hashtable_entry *hash_cur;
+ unsigned int hash = package_details->name.hash;
+ int count = 0;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(&package_details->name, &hash_cur->key))
+ count += scnprintf(page + count, PAGE_SIZE - count,
+ "%d ", atomic_read(&hash_cur->value));
+ }
+ rcu_read_unlock();
+ if (count)
+ count--;
+ count += scnprintf(page + count, PAGE_SIZE - count, "\n");
+ return count;
+}
+
+static ssize_t package_details_excluded_userids_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ unsigned int tmp;
+ int ret;
+
+ ret = kstrtouint(page, 10, &tmp);
+ if (ret)
+ return ret;
+
+ ret = insert_userid_exclude_entry(&to_package_details(item)->name, tmp);
+
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static ssize_t package_details_clear_userid_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ unsigned int tmp;
+ int ret;
+
+ ret = kstrtouint(page, 10, &tmp);
+ if (ret)
+ return ret;
+ remove_userid_exclude_entry(&to_package_details(item)->name, tmp);
+ return count;
+}
+
+static void package_details_release(struct config_item *item)
+{
+ struct package_details *package_details = to_package_details(item);
+
+ pr_info("sdcardfs: removing %s\n", package_details->name.name);
+ remove_packagelist_entry(&package_details->name);
+ kfree(package_details->name.name);
+ kfree(package_details);
+}
+
+SDCARDFS_CONFIGFS_ATTR(package_details_, appid);
+SDCARDFS_CONFIGFS_ATTR(package_details_, excluded_userids);
+SDCARDFS_CONFIGFS_ATTR_WO(package_details_, clear_userid);
+
+static struct configfs_attribute *package_details_attrs[] = {
+ &package_details_attr_appid,
+ &package_details_attr_excluded_userids,
+ &package_details_attr_clear_userid,
+ NULL,
+};
+
+static struct configfs_item_operations package_details_item_ops = {
+ .release = package_details_release,
+};
+
+static struct config_item_type package_appid_type = {
+ .ct_item_ops = &package_details_item_ops,
+ .ct_attrs = package_details_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+struct extensions_value {
+ struct config_group group;
+ unsigned int num;
+};
+
+struct extension_details {
+ struct config_item item;
+ struct qstr name;
+ unsigned int num;
+};
+
+static inline struct extensions_value *to_extensions_value(struct config_item *item)
+{
+ return item ? container_of(to_config_group(item), struct extensions_value, group) : NULL;
+}
+
+static inline struct extension_details *to_extension_details(struct config_item *item)
+{
+ return item ? container_of(item, struct extension_details, item) : NULL;
+}
+
+static void extension_details_release(struct config_item *item)
+{
+ struct extension_details *extension_details = to_extension_details(item);
+
+ pr_info("sdcardfs: No longer mapping %s files to gid %d\n",
+ extension_details->name.name, extension_details->num);
+ remove_ext_gid_entry(&extension_details->name, extension_details->num);
+ kfree(extension_details->name.name);
+ kfree(extension_details);
+}
+
+static struct configfs_item_operations extension_details_item_ops = {
+ .release = extension_details_release,
+};
+
+static struct config_item_type extension_details_type = {
+ .ct_item_ops = &extension_details_item_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item *extension_details_make_item(struct config_group *group, const char *name)
+{
+ struct extensions_value *extensions_value = to_extensions_value(&group->cg_item);
+ struct extension_details *extension_details = kzalloc(sizeof(struct extension_details), GFP_KERNEL);
+ const char *tmp;
+ int ret;
+
+ if (!extension_details)
+ return ERR_PTR(-ENOMEM);
+
+ tmp = kstrdup(name, GFP_KERNEL);
+ if (!tmp) {
+ kfree(extension_details);
+ return ERR_PTR(-ENOMEM);
+ }
+ qstr_init(&extension_details->name, tmp);
+ extension_details->num = extensions_value->num;
+ ret = insert_ext_gid_entry(&extension_details->name, extensions_value->num);
+
+ if (ret) {
+ kfree(extension_details->name.name);
+ kfree(extension_details);
+ return ERR_PTR(ret);
+ }
+ config_item_init_type_name(&extension_details->item, name, &extension_details_type);
+
+ return &extension_details->item;
+}
+
+static struct configfs_group_operations extensions_value_group_ops = {
+ .make_item = extension_details_make_item,
+};
+
+static struct config_item_type extensions_name_type = {
+ .ct_group_ops = &extensions_value_group_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_group *extensions_make_group(struct config_group *group, const char *name)
+{
+ struct extensions_value *extensions_value;
+ unsigned int tmp;
+ int ret;
+
+ extensions_value = kzalloc(sizeof(struct extensions_value), GFP_KERNEL);
+ if (!extensions_value)
+ return ERR_PTR(-ENOMEM);
+ ret = kstrtouint(name, 10, &tmp);
+ if (ret) {
+ kfree(extensions_value);
+ return ERR_PTR(ret);
+ }
+
+ extensions_value->num = tmp;
+ config_group_init_type_name(&extensions_value->group, name,
+ &extensions_name_type);
+ return &extensions_value->group;
+}
+
+static void extensions_drop_group(struct config_group *group, struct config_item *item)
+{
+ struct extensions_value *value = to_extensions_value(item);
+
+ pr_info("sdcardfs: No longer mapping any files to gid %d\n", value->num);
+ kfree(value);
+}
+
+static struct configfs_group_operations extensions_group_ops = {
+ .make_group = extensions_make_group,
+ .drop_item = extensions_drop_group,
+};
+
+static struct config_item_type extensions_type = {
+ .ct_group_ops = &extensions_group_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+struct config_group extension_group = {
+ .cg_item = {
+ .ci_namebuf = "extensions",
+ .ci_type = &extensions_type,
+ },
+};
+
+static struct config_item *packages_make_item(struct config_group *group, const char *name)
+{
+ struct package_details *package_details;
+ const char *tmp;
+
+ package_details = kzalloc(sizeof(struct package_details), GFP_KERNEL);
+ if (!package_details)
+ return ERR_PTR(-ENOMEM);
+ tmp = kstrdup(name, GFP_KERNEL);
+ if (!tmp) {
+ kfree(package_details);
+ return ERR_PTR(-ENOMEM);
+ }
+ qstr_init(&package_details->name, tmp);
+ config_item_init_type_name(&package_details->item, name,
+ &package_appid_type);
+
+ return &package_details->item;
+}
+
+static ssize_t packages_list_show(struct config_item *item, char *page)
+{
+ struct hashtable_entry *hash_cur_app;
+ struct hashtable_entry *hash_cur_user;
+ int i;
+ int count = 0, written = 0;
+ const char errormsg[] = "<truncated>\n";
+ unsigned int hash;
+
+ rcu_read_lock();
+ hash_for_each_rcu(package_to_appid, i, hash_cur_app, hlist) {
+ written = scnprintf(page + count, PAGE_SIZE - sizeof(errormsg) - count, "%s %d\n",
+ hash_cur_app->key.name, atomic_read(&hash_cur_app->value));
+ hash = hash_cur_app->key.hash;
+ hash_for_each_possible_rcu(package_to_userid, hash_cur_user, hlist, hash) {
+ if (qstr_case_eq(&hash_cur_app->key, &hash_cur_user->key)) {
+ written += scnprintf(page + count + written - 1,
+ PAGE_SIZE - sizeof(errormsg) - count - written + 1,
+ " %d\n", atomic_read(&hash_cur_user->value)) - 1;
+ }
+ }
+ if (count + written == PAGE_SIZE - sizeof(errormsg) - 1) {
+ count += scnprintf(page + count, PAGE_SIZE - count, errormsg);
+ break;
+ }
+ count += written;
+ }
+ rcu_read_unlock();
+
+ return count;
+}
+
+static ssize_t packages_remove_userid_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ unsigned int tmp;
+ int ret;
+
+ ret = kstrtouint(page, 10, &tmp);
+ if (ret)
+ return ret;
+ remove_userid_all_entry(tmp);
+ return count;
+}
+
+static struct configfs_attribute packages_attr_packages_gid_list = {
+ .ca_name = "packages_gid.list",
+ .ca_mode = S_IRUGO,
+ .ca_owner = THIS_MODULE,
+ .show = packages_list_show,
+};
+
+SDCARDFS_CONFIGFS_ATTR_WO(packages_, remove_userid);
+
+static struct configfs_attribute *packages_attrs[] = {
+ &packages_attr_packages_gid_list,
+ &packages_attr_remove_userid,
+ NULL,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations packages_group_ops = {
+ .make_item = packages_make_item,
+};
+
+static struct config_item_type packages_type = {
+ .ct_group_ops = &packages_group_ops,
+ .ct_attrs = packages_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+struct config_group *sd_default_groups[] = {
+ &extension_group,
+ NULL,
+};
+
+static struct configfs_subsystem sdcardfs_packages = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "sdcardfs",
+ .ci_type = &packages_type,
+ },
+ },
+};
+
+static int configfs_sdcardfs_init(void)
+{
+ int ret, i;
+ struct configfs_subsystem *subsys = &sdcardfs_packages;
+
+ config_group_init(&subsys->su_group);
+ for (i = 0; sd_default_groups[i]; i++) {
+ config_group_init(sd_default_groups[i]);
+ configfs_add_default_group(sd_default_groups[i], &subsys->su_group);
+ }
+ mutex_init(&subsys->su_mutex);
+ ret = configfs_register_subsystem(subsys);
+ if (ret) {
+ pr_err("Error %d while registering subsystem %s\n",
+ ret,
+ subsys->su_group.cg_item.ci_namebuf);
+ }
+ return ret;
+}
+
+static void configfs_sdcardfs_exit(void)
+{
+ configfs_unregister_subsystem(&sdcardfs_packages);
+}
+
+int packagelist_init(void)
+{
+ hashtable_entry_cachep =
+ kmem_cache_create("packagelist_hashtable_entry",
+ sizeof(struct hashtable_entry), 0, 0, NULL);
+ if (!hashtable_entry_cachep) {
+ pr_err("sdcardfs: failed creating pkgl_hashtable entry slab cache\n");
+ return -ENOMEM;
+ }
+
+ configfs_sdcardfs_init();
+ return 0;
+}
+
+void packagelist_exit(void)
+{
+ configfs_sdcardfs_exit();
+ packagelist_destroy();
+ kmem_cache_destroy(hashtable_entry_cachep);
+}
diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h
new file mode 100644
index 000000000000..6219771ed71c
--- /dev/null
+++ b/fs/sdcardfs/sdcardfs.h
@@ -0,0 +1,654 @@
+/*
+ * fs/sdcardfs/sdcardfs.h
+ *
+ * The sdcardfs v2.0
+ * This file system replaces the sdcard daemon on Android
+ * On version 2.0, some of the daemon functions have been ported
+ * to support the multi-user concepts of Android 4.4
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#ifndef _SDCARDFS_H_
+#define _SDCARDFS_H_
+
+#include <linux/dcache.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/aio.h>
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include <linux/fs_stack.h>
+#include <linux/magic.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/security.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include "multiuser.h"
+
+/* the file system name */
+#define SDCARDFS_NAME "sdcardfs"
+
+/* sdcardfs root inode number */
+#define SDCARDFS_ROOT_INO 1
+
+/* useful for tracking code reachability */
+#define UDBG pr_default("DBG:%s:%s:%d\n", __FILE__, __func__, __LINE__)
+
+#define SDCARDFS_DIRENT_SIZE 256
+
+/* temporary static uid settings for development */
+#define AID_ROOT 0 /* uid for accessing /mnt/sdcard & extSdcard */
+#define AID_MEDIA_RW 1023 /* internal media storage write access */
+
+#define AID_SDCARD_RW 1015 /* external storage write access */
+#define AID_SDCARD_R 1028 /* external storage read access */
+#define AID_SDCARD_PICS 1033 /* external storage photos access */
+#define AID_SDCARD_AV 1034 /* external storage audio/video access */
+#define AID_SDCARD_ALL 1035 /* access all users external storage */
+#define AID_MEDIA_OBB 1059 /* obb files */
+
+#define AID_SDCARD_IMAGE 1057
+
+#define AID_PACKAGE_INFO 1027
+
+
+/*
+ * Permissions are handled by our permission function.
+ * We don't want anyone who happens to look at our inode value to prematurely
+ * block access, so store more permissive values. These are probably never
+ * used.
+ */
+#define fixup_tmp_permissions(x) \
+ do { \
+ (x)->i_uid = make_kuid(&init_user_ns, \
+ SDCARDFS_I(x)->data->d_uid); \
+ (x)->i_gid = make_kgid(&init_user_ns, AID_SDCARD_RW); \
+ (x)->i_mode = ((x)->i_mode & S_IFMT) | 0775;\
+ } while (0)
+
+/* Android 5.0 support */
+
+/* Permission mode for a specific node. Controls how file permissions
+ * are derived for children nodes.
+ */
+typedef enum {
+ /* Nothing special; this node should just inherit from its parent. */
+ PERM_INHERIT,
+ /* This node is one level above a normal root; used for legacy layouts
+ * which use the first level to represent user_id.
+ */
+ PERM_PRE_ROOT,
+ /* This node is "/" */
+ PERM_ROOT,
+ /* This node is "/Android" */
+ PERM_ANDROID,
+ /* This node is "/Android/data" */
+ PERM_ANDROID_DATA,
+ /* This node is "/Android/obb" */
+ PERM_ANDROID_OBB,
+ /* This node is "/Android/media" */
+ PERM_ANDROID_MEDIA,
+ /* This node is "/Android/[data|media|obb]/[package]" */
+ PERM_ANDROID_PACKAGE,
+ /* This node is "/Android/[data|media|obb]/[package]/cache" */
+ PERM_ANDROID_PACKAGE_CACHE,
+} perm_t;
+
+struct sdcardfs_sb_info;
+struct sdcardfs_mount_options;
+struct sdcardfs_inode_info;
+struct sdcardfs_inode_data;
+
+/* Do not directly use this function. Use OVERRIDE_CRED() instead. */
+const struct cred *override_fsids(struct sdcardfs_sb_info *sbi,
+ struct sdcardfs_inode_data *data);
+/* Do not directly use this function, use REVERT_CRED() instead. */
+void revert_fsids(const struct cred *old_cred);
+
+/* operations vectors defined in specific files */
+extern const struct file_operations sdcardfs_main_fops;
+extern const struct file_operations sdcardfs_dir_fops;
+extern const struct inode_operations sdcardfs_main_iops;
+extern const struct inode_operations sdcardfs_dir_iops;
+extern const struct inode_operations sdcardfs_symlink_iops;
+extern const struct super_operations sdcardfs_sops;
+extern const struct dentry_operations sdcardfs_ci_dops;
+extern const struct address_space_operations sdcardfs_aops, sdcardfs_dummy_aops;
+extern const struct vm_operations_struct sdcardfs_vm_ops;
+
+extern int sdcardfs_init_inode_cache(void);
+extern void sdcardfs_destroy_inode_cache(void);
+extern int sdcardfs_init_dentry_cache(void);
+extern void sdcardfs_destroy_dentry_cache(void);
+extern int new_dentry_private_data(struct dentry *dentry);
+extern void free_dentry_private_data(struct dentry *dentry);
+extern struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags);
+extern struct inode *sdcardfs_iget(struct super_block *sb,
+ struct inode *lower_inode, userid_t id);
+extern int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb,
+ struct path *lower_path, userid_t id);
+
+/* file private data */
+struct sdcardfs_file_info {
+ struct file *lower_file;
+ const struct vm_operations_struct *lower_vm_ops;
+};
+
+struct sdcardfs_inode_data {
+ struct kref refcount;
+ bool abandoned;
+
+ perm_t perm;
+ userid_t userid;
+ uid_t d_uid;
+ bool under_android;
+ bool under_cache;
+ bool under_obb;
+};
+
+/* sdcardfs inode data in memory */
+struct sdcardfs_inode_info {
+ struct inode *lower_inode;
+ /* state derived based on current position in hierarchy */
+ struct sdcardfs_inode_data *data;
+
+ /* top folder for ownership */
+ spinlock_t top_lock;
+ struct sdcardfs_inode_data *top_data;
+
+ struct inode vfs_inode;
+};
+
+
+/* sdcardfs dentry data in memory */
+struct sdcardfs_dentry_info {
+ spinlock_t lock; /* protects lower_path */
+ struct path lower_path;
+ struct path orig_path;
+};
+
+struct sdcardfs_mount_options {
+ uid_t fs_low_uid;
+ gid_t fs_low_gid;
+ userid_t fs_user_id;
+ bool multiuser;
+ bool gid_derivation;
+ bool default_normal;
+ bool unshared_obb;
+ unsigned int reserved_mb;
+ bool nocache;
+};
+
+struct sdcardfs_vfsmount_options {
+ gid_t gid;
+ mode_t mask;
+};
+
+extern int parse_options_remount(struct super_block *sb, char *options, int silent,
+ struct sdcardfs_vfsmount_options *vfsopts);
+
+/* sdcardfs super-block data in memory */
+struct sdcardfs_sb_info {
+ struct super_block *sb;
+ struct super_block *lower_sb;
+ /* derived perm policy : some of options have been added
+ * to sdcardfs_mount_options (Android 4.4 support)
+ */
+ struct sdcardfs_mount_options options;
+ spinlock_t lock; /* protects obbpath */
+ char *obbpath_s;
+ struct path obbpath;
+ void *pkgl_id;
+ struct list_head list;
+};
+
+/*
+ * inode to private data
+ *
+ * Since we use containers and the struct inode is _inside_ the
+ * sdcardfs_inode_info structure, SDCARDFS_I will always (given a non-NULL
+ * inode pointer), return a valid non-NULL pointer.
+ */
+static inline struct sdcardfs_inode_info *SDCARDFS_I(const struct inode *inode)
+{
+ return container_of(inode, struct sdcardfs_inode_info, vfs_inode);
+}
+
+/* dentry to private data */
+#define SDCARDFS_D(dent) ((struct sdcardfs_dentry_info *)(dent)->d_fsdata)
+
+/* superblock to private data */
+#define SDCARDFS_SB(super) ((struct sdcardfs_sb_info *)(super)->s_fs_info)
+
+/* file to private Data */
+#define SDCARDFS_F(file) ((struct sdcardfs_file_info *)((file)->private_data))
+
+/* file to lower file */
+static inline struct file *sdcardfs_lower_file(const struct file *f)
+{
+ return SDCARDFS_F(f)->lower_file;
+}
+
+static inline void sdcardfs_set_lower_file(struct file *f, struct file *val)
+{
+ SDCARDFS_F(f)->lower_file = val;
+}
+
+/* inode to lower inode. */
+static inline struct inode *sdcardfs_lower_inode(const struct inode *i)
+{
+ return SDCARDFS_I(i)->lower_inode;
+}
+
+static inline void sdcardfs_set_lower_inode(struct inode *i, struct inode *val)
+{
+ SDCARDFS_I(i)->lower_inode = val;
+}
+
+/* superblock to lower superblock */
+static inline struct super_block *sdcardfs_lower_super(
+ const struct super_block *sb)
+{
+ return SDCARDFS_SB(sb)->lower_sb;
+}
+
+static inline void sdcardfs_set_lower_super(struct super_block *sb,
+ struct super_block *val)
+{
+ SDCARDFS_SB(sb)->lower_sb = val;
+}
+
+/* path based (dentry/mnt) macros */
+static inline void pathcpy(struct path *dst, const struct path *src)
+{
+ dst->dentry = src->dentry;
+ dst->mnt = src->mnt;
+}
+
+/* sdcardfs_get_pname functions calls path_get()
+ * therefore, the caller must call "proper" path_put functions
+ */
+#define SDCARDFS_DENT_FUNC(pname) \
+static inline void sdcardfs_get_##pname(const struct dentry *dent, \
+ struct path *pname) \
+{ \
+ spin_lock(&SDCARDFS_D(dent)->lock); \
+ pathcpy(pname, &SDCARDFS_D(dent)->pname); \
+ path_get(pname); \
+ spin_unlock(&SDCARDFS_D(dent)->lock); \
+ return; \
+} \
+static inline void sdcardfs_put_##pname(const struct dentry *dent, \
+ struct path *pname) \
+{ \
+ path_put(pname); \
+ return; \
+} \
+static inline void sdcardfs_set_##pname(const struct dentry *dent, \
+ struct path *pname) \
+{ \
+ spin_lock(&SDCARDFS_D(dent)->lock); \
+ pathcpy(&SDCARDFS_D(dent)->pname, pname); \
+ spin_unlock(&SDCARDFS_D(dent)->lock); \
+ return; \
+} \
+static inline void sdcardfs_reset_##pname(const struct dentry *dent) \
+{ \
+ spin_lock(&SDCARDFS_D(dent)->lock); \
+ SDCARDFS_D(dent)->pname.dentry = NULL; \
+ SDCARDFS_D(dent)->pname.mnt = NULL; \
+ spin_unlock(&SDCARDFS_D(dent)->lock); \
+ return; \
+} \
+static inline void sdcardfs_put_reset_##pname(const struct dentry *dent) \
+{ \
+ struct path pname; \
+ spin_lock(&SDCARDFS_D(dent)->lock); \
+ if (SDCARDFS_D(dent)->pname.dentry) { \
+ pathcpy(&pname, &SDCARDFS_D(dent)->pname); \
+ SDCARDFS_D(dent)->pname.dentry = NULL; \
+ SDCARDFS_D(dent)->pname.mnt = NULL; \
+ spin_unlock(&SDCARDFS_D(dent)->lock); \
+ path_put(&pname); \
+ } else \
+ spin_unlock(&SDCARDFS_D(dent)->lock); \
+ return; \
+}
+
+SDCARDFS_DENT_FUNC(lower_path)
+SDCARDFS_DENT_FUNC(orig_path)
+
+static inline bool sbinfo_has_sdcard_magic(struct sdcardfs_sb_info *sbinfo)
+{
+ return sbinfo && sbinfo->sb
+ && sbinfo->sb->s_magic == SDCARDFS_SUPER_MAGIC;
+}
+
+static inline struct sdcardfs_inode_data *data_get(
+ struct sdcardfs_inode_data *data)
+{
+ if (data)
+ kref_get(&data->refcount);
+ return data;
+}
+
+static inline struct sdcardfs_inode_data *top_data_get(
+ struct sdcardfs_inode_info *info)
+{
+ struct sdcardfs_inode_data *top_data;
+
+ spin_lock(&info->top_lock);
+ top_data = data_get(info->top_data);
+ spin_unlock(&info->top_lock);
+ return top_data;
+}
+
+extern void data_release(struct kref *ref);
+
+static inline void data_put(struct sdcardfs_inode_data *data)
+{
+ kref_put(&data->refcount, data_release);
+}
+
+static inline void release_own_data(struct sdcardfs_inode_info *info)
+{
+ /*
+ * This happens exactly once per inode. At this point, the inode that
+ * originally held this data is about to be freed, and all references
+ * to it are held as a top value, and will likely be released soon.
+ */
+ info->data->abandoned = true;
+ data_put(info->data);
+}
+
+static inline void set_top(struct sdcardfs_inode_info *info,
+ struct sdcardfs_inode_info *top_owner)
+{
+ struct sdcardfs_inode_data *old_top;
+ struct sdcardfs_inode_data *new_top = NULL;
+
+ if (top_owner)
+ new_top = top_data_get(top_owner);
+
+ spin_lock(&info->top_lock);
+ old_top = info->top_data;
+ info->top_data = new_top;
+ if (old_top)
+ data_put(old_top);
+ spin_unlock(&info->top_lock);
+}
+
+static inline int get_gid(struct vfsmount *mnt,
+ struct super_block *sb,
+ struct sdcardfs_inode_data *data)
+{
+ struct sdcardfs_vfsmount_options *vfsopts = mnt->data;
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(sb);
+
+ if (vfsopts->gid == AID_SDCARD_RW && !sbi->options.default_normal)
+ /* As an optimization, certain trusted system components only run
+ * as owner but operate across all users. Since we're now handing
+ * out the sdcard_rw GID only to trusted apps, we're okay relaxing
+ * the user boundary enforcement for the default view. The UIDs
+ * assigned to app directories are still multiuser aware.
+ */
+ return AID_SDCARD_RW;
+ else
+ return multiuser_get_uid(data->userid, vfsopts->gid);
+}
+
+static inline int get_mode(struct vfsmount *mnt,
+ struct sdcardfs_inode_info *info,
+ struct sdcardfs_inode_data *data)
+{
+ int owner_mode;
+ int filtered_mode;
+ struct sdcardfs_vfsmount_options *opts = mnt->data;
+ int visible_mode = 0775 & ~opts->mask;
+
+
+ if (data->perm == PERM_PRE_ROOT) {
+ /* Top of multi-user view should always be visible to ensure
+ * secondary users can traverse inside.
+ */
+ visible_mode = 0711;
+ } else if (data->under_android) {
+ /* Block "other" access to Android directories, since only apps
+ * belonging to a specific user should be in there; we still
+ * leave +x open for the default view.
+ */
+ if (opts->gid == AID_SDCARD_RW)
+ visible_mode = visible_mode & ~0006;
+ else
+ visible_mode = visible_mode & ~0007;
+ }
+ owner_mode = info->lower_inode->i_mode & 0700;
+ filtered_mode = visible_mode & (owner_mode | (owner_mode >> 3) | (owner_mode >> 6));
+ return filtered_mode;
+}
+
+static inline int has_graft_path(const struct dentry *dent)
+{
+ int ret = 0;
+
+ spin_lock(&SDCARDFS_D(dent)->lock);
+ if (SDCARDFS_D(dent)->orig_path.dentry != NULL)
+ ret = 1;
+ spin_unlock(&SDCARDFS_D(dent)->lock);
+
+ return ret;
+}
+
+static inline void sdcardfs_get_real_lower(const struct dentry *dent,
+ struct path *real_lower)
+{
+ /* in case of a local obb dentry
+ * the orig_path should be returned
+ */
+ if (has_graft_path(dent))
+ sdcardfs_get_orig_path(dent, real_lower);
+ else
+ sdcardfs_get_lower_path(dent, real_lower);
+}
+
+static inline void sdcardfs_put_real_lower(const struct dentry *dent,
+ struct path *real_lower)
+{
+ if (has_graft_path(dent))
+ sdcardfs_put_orig_path(dent, real_lower);
+ else
+ sdcardfs_put_lower_path(dent, real_lower);
+}
+
+extern struct mutex sdcardfs_super_list_lock;
+extern struct list_head sdcardfs_super_list;
+
+/* for packagelist.c */
+extern appid_t get_appid(const char *app_name);
+extern appid_t get_ext_gid(const char *app_name);
+extern appid_t is_excluded(const char *app_name, userid_t userid);
+extern int check_caller_access_to_name(struct inode *parent_node, const struct qstr *name);
+extern int packagelist_init(void);
+extern void packagelist_exit(void);
+
+/* for derived_perm.c */
+#define BY_NAME (1 << 0)
+#define BY_USERID (1 << 1)
+struct limit_search {
+ unsigned int flags;
+ struct qstr name;
+ userid_t userid;
+};
+
+extern void setup_derived_state(struct inode *inode, perm_t perm,
+ userid_t userid, uid_t uid);
+extern void get_derived_permission(struct dentry *parent, struct dentry *dentry);
+extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const struct qstr *name);
+extern void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit);
+
+extern void update_derived_permission_lock(struct dentry *dentry);
+void fixup_lower_ownership(struct dentry *dentry, const char *name);
+extern int need_graft_path(struct dentry *dentry);
+extern int is_base_obbpath(struct dentry *dentry);
+extern int is_obbpath_invalid(struct dentry *dentry);
+extern int setup_obb_dentry(struct dentry *dentry, struct path *lower_path);
+
+/* locking helpers */
+static inline struct dentry *lock_parent(struct dentry *dentry)
+{
+ struct dentry *dir = dget_parent(dentry);
+
+ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
+ return dir;
+}
+
+static inline void unlock_dir(struct dentry *dir)
+{
+ inode_unlock(d_inode(dir));
+ dput(dir);
+}
+
+static inline int prepare_dir(const char *path_s, uid_t uid, gid_t gid, mode_t mode)
+{
+ int err;
+ struct dentry *dent;
+ struct iattr attrs;
+ struct path parent;
+
+ dent = kern_path_locked(path_s, &parent);
+ if (IS_ERR(dent)) {
+ err = PTR_ERR(dent);
+ if (err == -EEXIST)
+ err = 0;
+ goto out_unlock;
+ }
+
+ err = vfs_mkdir2(parent.mnt, d_inode(parent.dentry), dent, mode);
+ if (err) {
+ if (err == -EEXIST)
+ err = 0;
+ goto out_dput;
+ }
+
+ attrs.ia_uid = make_kuid(&init_user_ns, uid);
+ attrs.ia_gid = make_kgid(&init_user_ns, gid);
+ attrs.ia_valid = ATTR_UID | ATTR_GID;
+ inode_lock(d_inode(dent));
+ notify_change2(parent.mnt, dent, &attrs, NULL);
+ inode_unlock(d_inode(dent));
+
+out_dput:
+ dput(dent);
+
+out_unlock:
+ /* parent dentry locked by lookup_create */
+ inode_unlock(d_inode(parent.dentry));
+ path_put(&parent);
+ return err;
+}
+
+/*
+ * Return 1, if a disk has enough free space, otherwise 0.
+ * We assume that any files can not be overwritten.
+ */
+static inline int check_min_free_space(struct dentry *dentry, size_t size, int dir)
+{
+ int err;
+ struct path lower_path;
+ struct kstatfs statfs;
+ u64 avail;
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+
+ if (sbi->options.reserved_mb) {
+ /* Get fs stat of lower filesystem. */
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ err = vfs_statfs(&lower_path, &statfs);
+ sdcardfs_put_lower_path(dentry, &lower_path);
+
+ if (unlikely(err))
+ return 0;
+
+ /* Invalid statfs informations. */
+ if (unlikely(statfs.f_bsize == 0))
+ return 0;
+
+ /* if you are checking directory, set size to f_bsize. */
+ if (unlikely(dir))
+ size = statfs.f_bsize;
+
+ /* available size */
+ avail = statfs.f_bavail * statfs.f_bsize;
+
+ /* not enough space */
+ if ((u64)size > avail)
+ return 0;
+
+ /* enough space */
+ if ((avail - size) > (sbi->options.reserved_mb * 1024 * 1024))
+ return 1;
+
+ return 0;
+ } else
+ return 1;
+}
+
+/*
+ * Copies attrs and maintains sdcardfs managed attrs
+ * Since our permission check handles all special permissions, set those to be open
+ */
+static inline void sdcardfs_copy_and_fix_attrs(struct inode *dest, const struct inode *src)
+{
+ dest->i_mode = (src->i_mode & S_IFMT) | S_IRWXU | S_IRWXG |
+ S_IROTH | S_IXOTH; /* 0775 */
+ dest->i_uid = make_kuid(&init_user_ns, SDCARDFS_I(dest)->data->d_uid);
+ dest->i_gid = make_kgid(&init_user_ns, AID_SDCARD_RW);
+ dest->i_rdev = src->i_rdev;
+ dest->i_atime = src->i_atime;
+ dest->i_mtime = src->i_mtime;
+ dest->i_ctime = src->i_ctime;
+ dest->i_blkbits = src->i_blkbits;
+ dest->i_flags = src->i_flags;
+ set_nlink(dest, src->i_nlink);
+}
+
+static inline bool str_case_eq(const char *s1, const char *s2)
+{
+ return !strcasecmp(s1, s2);
+}
+
+static inline bool str_n_case_eq(const char *s1, const char *s2, size_t len)
+{
+ return !strncasecmp(s1, s2, len);
+}
+
+static inline bool qstr_case_eq(const struct qstr *q1, const struct qstr *q2)
+{
+ return q1->len == q2->len && str_n_case_eq(q1->name, q2->name, q2->len);
+}
+
+#define QSTR_LITERAL(string) QSTR_INIT(string, sizeof(string)-1)
+
+#endif /* not _SDCARDFS_H_ */
diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c
new file mode 100644
index 000000000000..140696ed3ed3
--- /dev/null
+++ b/fs/sdcardfs/super.c
@@ -0,0 +1,333 @@
+/*
+ * fs/sdcardfs/super.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+
+/*
+ * The inode cache is used with alloc_inode for both our inode info and the
+ * vfs inode.
+ */
+static struct kmem_cache *sdcardfs_inode_cachep;
+
+/*
+ * To support the top references, we must track some data separately.
+ * An sdcardfs_inode_info always has a reference to its data, and once set up,
+ * also has a reference to its top. The top may be itself, in which case it
+ * holds two references to its data. When top is changed, it takes a ref to the
+ * new data and then drops the ref to the old data.
+ */
+static struct kmem_cache *sdcardfs_inode_data_cachep;
+
+void data_release(struct kref *ref)
+{
+ struct sdcardfs_inode_data *data =
+ container_of(ref, struct sdcardfs_inode_data, refcount);
+
+ kmem_cache_free(sdcardfs_inode_data_cachep, data);
+}
+
+/* final actions when unmounting a file system */
+static void sdcardfs_put_super(struct super_block *sb)
+{
+ struct sdcardfs_sb_info *spd;
+ struct super_block *s;
+
+ spd = SDCARDFS_SB(sb);
+ if (!spd)
+ return;
+
+ if (spd->obbpath_s) {
+ kfree(spd->obbpath_s);
+ path_put(&spd->obbpath);
+ }
+
+ /* decrement lower super references */
+ s = sdcardfs_lower_super(sb);
+ sdcardfs_set_lower_super(sb, NULL);
+ atomic_dec(&s->s_active);
+
+ kfree(spd);
+ sb->s_fs_info = NULL;
+}
+
+static int sdcardfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ int err;
+ struct path lower_path;
+ u32 min_blocks;
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ err = vfs_statfs(&lower_path, buf);
+ sdcardfs_put_lower_path(dentry, &lower_path);
+
+ if (sbi->options.reserved_mb) {
+ /* Invalid statfs informations. */
+ if (buf->f_bsize == 0) {
+ pr_err("Returned block size is zero.\n");
+ return -EINVAL;
+ }
+
+ min_blocks = ((sbi->options.reserved_mb * 1024 * 1024)/buf->f_bsize);
+ buf->f_blocks -= min_blocks;
+
+ if (buf->f_bavail > min_blocks)
+ buf->f_bavail -= min_blocks;
+ else
+ buf->f_bavail = 0;
+
+ /* Make reserved blocks invisiable to media storage */
+ buf->f_bfree = buf->f_bavail;
+ }
+
+ /* set return buf to our f/s to avoid confusing user-level utils */
+ buf->f_type = SDCARDFS_SUPER_MAGIC;
+
+ return err;
+}
+
+/*
+ * @flags: numeric mount options
+ * @options: mount options string
+ */
+static int sdcardfs_remount_fs(struct super_block *sb, int *flags, char *options)
+{
+ int err = 0;
+
+ /*
+ * The VFS will take care of "ro" and "rw" flags among others. We
+ * can safely accept a few flags (RDONLY, MANDLOCK), and honor
+ * SILENT, but anything else left over is an error.
+ */
+ if ((*flags & ~(MS_RDONLY | MS_MANDLOCK | MS_SILENT)) != 0) {
+ pr_err("sdcardfs: remount flags 0x%x unsupported\n", *flags);
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+/*
+ * @mnt: mount point we are remounting
+ * @sb: superblock we are remounting
+ * @flags: numeric mount options
+ * @options: mount options string
+ */
+static int sdcardfs_remount_fs2(struct vfsmount *mnt, struct super_block *sb,
+ int *flags, char *options)
+{
+ int err = 0;
+
+ /*
+ * The VFS will take care of "ro" and "rw" flags among others. We
+ * can safely accept a few flags (RDONLY, MANDLOCK), and honor
+ * SILENT, but anything else left over is an error.
+ */
+ if ((*flags & ~(MS_RDONLY | MS_MANDLOCK | MS_SILENT | MS_REMOUNT)) != 0) {
+ pr_err("sdcardfs: remount flags 0x%x unsupported\n", *flags);
+ err = -EINVAL;
+ }
+ pr_info("Remount options were %s for vfsmnt %p.\n", options, mnt);
+ err = parse_options_remount(sb, options, *flags & ~MS_SILENT, mnt->data);
+
+
+ return err;
+}
+
+static void *sdcardfs_clone_mnt_data(void *data)
+{
+ struct sdcardfs_vfsmount_options *opt = kmalloc(sizeof(struct sdcardfs_vfsmount_options), GFP_KERNEL);
+ struct sdcardfs_vfsmount_options *old = data;
+
+ if (!opt)
+ return NULL;
+ opt->gid = old->gid;
+ opt->mask = old->mask;
+ return opt;
+}
+
+static void sdcardfs_copy_mnt_data(void *data, void *newdata)
+{
+ struct sdcardfs_vfsmount_options *old = data;
+ struct sdcardfs_vfsmount_options *new = newdata;
+
+ old->gid = new->gid;
+ old->mask = new->mask;
+}
+
+/*
+ * Called by iput() when the inode reference count reached zero
+ * and the inode is not hashed anywhere. Used to clear anything
+ * that needs to be, before the inode is completely destroyed and put
+ * on the inode free list.
+ */
+static void sdcardfs_evict_inode(struct inode *inode)
+{
+ struct inode *lower_inode;
+
+ truncate_inode_pages(&inode->i_data, 0);
+ set_top(SDCARDFS_I(inode), NULL);
+ clear_inode(inode);
+ /*
+ * Decrement a reference to a lower_inode, which was incremented
+ * by our read_inode when it was created initially.
+ */
+ lower_inode = sdcardfs_lower_inode(inode);
+ sdcardfs_set_lower_inode(inode, NULL);
+ iput(lower_inode);
+}
+
+static struct inode *sdcardfs_alloc_inode(struct super_block *sb)
+{
+ struct sdcardfs_inode_info *i;
+ struct sdcardfs_inode_data *d;
+
+ i = kmem_cache_alloc(sdcardfs_inode_cachep, GFP_KERNEL);
+ if (!i)
+ return NULL;
+
+ /* memset everything up to the inode to 0 */
+ memset(i, 0, offsetof(struct sdcardfs_inode_info, vfs_inode));
+
+ d = kmem_cache_alloc(sdcardfs_inode_data_cachep,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!d) {
+ kmem_cache_free(sdcardfs_inode_cachep, i);
+ return NULL;
+ }
+
+ i->data = d;
+ kref_init(&d->refcount);
+ i->top_data = d;
+ spin_lock_init(&i->top_lock);
+ kref_get(&d->refcount);
+
+ i->vfs_inode.i_version = 1;
+ return &i->vfs_inode;
+}
+
+static void i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+
+ release_own_data(SDCARDFS_I(inode));
+ kmem_cache_free(sdcardfs_inode_cachep, SDCARDFS_I(inode));
+}
+
+static void sdcardfs_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, i_callback);
+}
+
+/* sdcardfs inode cache constructor */
+static void init_once(void *obj)
+{
+ struct sdcardfs_inode_info *i = obj;
+
+ inode_init_once(&i->vfs_inode);
+}
+
+int sdcardfs_init_inode_cache(void)
+{
+ sdcardfs_inode_cachep =
+ kmem_cache_create("sdcardfs_inode_cache",
+ sizeof(struct sdcardfs_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT, init_once);
+
+ if (!sdcardfs_inode_cachep)
+ return -ENOMEM;
+
+ sdcardfs_inode_data_cachep =
+ kmem_cache_create("sdcardfs_inode_data_cache",
+ sizeof(struct sdcardfs_inode_data), 0,
+ SLAB_RECLAIM_ACCOUNT, NULL);
+ if (!sdcardfs_inode_data_cachep) {
+ kmem_cache_destroy(sdcardfs_inode_cachep);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/* sdcardfs inode cache destructor */
+void sdcardfs_destroy_inode_cache(void)
+{
+ kmem_cache_destroy(sdcardfs_inode_data_cachep);
+ kmem_cache_destroy(sdcardfs_inode_cachep);
+}
+
+/*
+ * Used only in nfs, to kill any pending RPC tasks, so that subsequent
+ * code can actually succeed and won't leave tasks that need handling.
+ */
+static void sdcardfs_umount_begin(struct super_block *sb)
+{
+ struct super_block *lower_sb;
+
+ lower_sb = sdcardfs_lower_super(sb);
+ if (lower_sb && lower_sb->s_op && lower_sb->s_op->umount_begin)
+ lower_sb->s_op->umount_begin(lower_sb);
+}
+
+static int sdcardfs_show_options(struct vfsmount *mnt, struct seq_file *m,
+ struct dentry *root)
+{
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(root->d_sb);
+ struct sdcardfs_mount_options *opts = &sbi->options;
+ struct sdcardfs_vfsmount_options *vfsopts = mnt->data;
+
+ if (opts->fs_low_uid != 0)
+ seq_printf(m, ",fsuid=%u", opts->fs_low_uid);
+ if (opts->fs_low_gid != 0)
+ seq_printf(m, ",fsgid=%u", opts->fs_low_gid);
+ if (vfsopts->gid != 0)
+ seq_printf(m, ",gid=%u", vfsopts->gid);
+ if (opts->multiuser)
+ seq_puts(m, ",multiuser");
+ if (vfsopts->mask)
+ seq_printf(m, ",mask=%u", vfsopts->mask);
+ if (opts->fs_user_id)
+ seq_printf(m, ",userid=%u", opts->fs_user_id);
+ if (opts->gid_derivation)
+ seq_puts(m, ",derive_gid");
+ if (opts->default_normal)
+ seq_puts(m, ",default_normal");
+ if (opts->reserved_mb != 0)
+ seq_printf(m, ",reserved=%uMB", opts->reserved_mb);
+ if (opts->nocache)
+ seq_printf(m, ",nocache");
+
+ return 0;
+};
+
+const struct super_operations sdcardfs_sops = {
+ .put_super = sdcardfs_put_super,
+ .statfs = sdcardfs_statfs,
+ .remount_fs = sdcardfs_remount_fs,
+ .remount_fs2 = sdcardfs_remount_fs2,
+ .clone_mnt_data = sdcardfs_clone_mnt_data,
+ .copy_mnt_data = sdcardfs_copy_mnt_data,
+ .evict_inode = sdcardfs_evict_inode,
+ .umount_begin = sdcardfs_umount_begin,
+ .show_options2 = sdcardfs_show_options,
+ .alloc_inode = sdcardfs_alloc_inode,
+ .destroy_inode = sdcardfs_destroy_inode,
+ .drop_inode = generic_delete_inode,
+};
diff --git a/fs/super.c b/fs/super.c
index 7e9beab77259..d80a1d2d491b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -761,7 +761,8 @@ rescan:
}
/**
- * do_remount_sb - asks filesystem to change mount options.
+ * do_remount_sb2 - asks filesystem to change mount options.
+ * @mnt: mount we are looking at
* @sb: superblock in question
* @flags: numeric part of options
* @data: the rest of options
@@ -769,7 +770,7 @@ rescan:
*
* Alters the mount options of a mounted file system.
*/
-int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
+int do_remount_sb2(struct vfsmount *mnt, struct super_block *sb, int flags, void *data, int force)
{
int retval;
int remount_ro;
@@ -811,7 +812,16 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
}
}
- if (sb->s_op->remount_fs) {
+ if (mnt && sb->s_op->remount_fs2) {
+ retval = sb->s_op->remount_fs2(mnt, sb, &flags, data);
+ if (retval) {
+ if (!force)
+ goto cancel_readonly;
+ /* If forced remount, go ahead despite any errors */
+ WARN(1, "forced remount of a %s fs returned %i\n",
+ sb->s_type->name, retval);
+ }
+ } else if (sb->s_op->remount_fs) {
retval = sb->s_op->remount_fs(sb, &flags, data);
if (retval) {
if (!force)
@@ -843,12 +853,17 @@ cancel_readonly:
return retval;
}
+int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
+{
+ return do_remount_sb2(NULL, sb, flags, data, force);
+}
+
static void do_emergency_remount(struct work_struct *work)
{
struct super_block *sb, *p = NULL;
spin_lock(&sb_lock);
- list_for_each_entry(sb, &super_blocks, s_list) {
+ list_for_each_entry_reverse(sb, &super_blocks, s_list) {
if (hlist_unhashed(&sb->s_instances))
continue;
sb->s_count++;
@@ -1168,7 +1183,7 @@ struct dentry *mount_single(struct file_system_type *fs_type,
EXPORT_SYMBOL(mount_single);
struct dentry *
-mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
+mount_fs(struct file_system_type *type, int flags, const char *name, struct vfsmount *mnt, void *data)
{
struct dentry *root;
struct super_block *sb;
@@ -1185,7 +1200,10 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
goto out_free_secdata;
}
- root = type->mount(type, flags, name, data);
+ if (type->mount2)
+ root = type->mount2(mnt, type, flags, name, data);
+ else
+ root = type->mount(type, flags, name, data);
if (IS_ERR(root)) {
error = PTR_ERR(root);
goto out_free_secdata;
diff --git a/fs/sync.c b/fs/sync.c
index 2a54c1f22035..5c2420c3eb17 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -218,6 +218,7 @@ static int do_fsync(unsigned int fd, int datasync)
if (f.file) {
ret = vfs_fsync(f.file, datasync);
fdput(f);
+ inc_syscfs(current);
}
return ret;
}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 784d667475ae..9d9c032d5fe5 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -493,7 +493,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
new_flags, vma->anon_vma,
vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- NULL_VM_UFFD_CTX);
+ NULL_VM_UFFD_CTX,
+ vma_get_anon_name(vma));
if (prev)
vma = prev;
else
@@ -872,7 +873,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- ((struct vm_userfaultfd_ctx){ ctx }));
+ ((struct vm_userfaultfd_ctx){ ctx }),
+ vma_get_anon_name(vma));
if (prev) {
vma = prev;
goto next;
@@ -1009,7 +1011,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- NULL_VM_UFFD_CTX);
+ NULL_VM_UFFD_CTX,
+ vma_get_anon_name(vma));
if (prev) {
vma = prev;
goto next;
diff --git a/fs/utimes.c b/fs/utimes.c
index 22307cdf7014..87ce37bcaa84 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -91,7 +91,7 @@ static int utimes_common(struct path *path, struct timespec *times)
}
retry_deleg:
inode_lock(inode);
- error = notify_change(path->dentry, &newattrs, &delegated_inode);
+ error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode);
inode_unlock(inode);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
diff --git a/fs/xattr.c b/fs/xattr.c
index 2f6423182301..1c91835ac431 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -130,7 +130,7 @@ xattr_permission(struct inode *inode, const char *name, int mask)
return -EPERM;
}
- return inode_permission(inode, mask);
+ return inode_permission2(ERR_PTR(-EOPNOTSUPP), inode, mask);
}
int
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 4df64a1fc09e..e02a3d968f12 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -27,6 +27,8 @@
* __kprobes_text_start, __kprobes_text_end
* __entry_text_start, __entry_text_end
* __ctors_start, __ctors_end
+ * __irqentry_text_start, __irqentry_text_end
+ * __softirqentry_text_start, __softirqentry_text_end
*/
extern char _text[], _stext[], _etext[];
extern char _data[], _sdata[], _edata[];
@@ -39,6 +41,8 @@ extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
extern char __kprobes_text_start[], __kprobes_text_end[];
extern char __entry_text_start[], __entry_text_end[];
extern char __start_rodata[], __end_rodata[];
+extern char __irqentry_text_start[], __irqentry_text_end[];
+extern char __softirqentry_text_start[], __softirqentry_text_end[];
/* Start and end of .ctors section - used for constructor calls. */
extern char __ctors_start[], __ctors_end[];
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 1462071a19bf..3c3519b0fce5 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -67,10 +67,12 @@
*/
#ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
#define TEXT_MAIN .text .text.[0-9a-zA-Z_]*
+#define TEXT_CFI_MAIN .text.cfi .text.[0-9a-zA-Z_]*.cfi
#define DATA_MAIN .data .data.[0-9a-zA-Z_]*
#define BSS_MAIN .bss .bss.[0-9a-zA-Z_]*
#else
#define TEXT_MAIN .text
+#define TEXT_CFI_MAIN .text.cfi
#define DATA_MAIN .data
#define BSS_MAIN .bss
#endif
@@ -105,7 +107,7 @@
#ifdef CONFIG_FTRACE_MCOUNT_RECORD
#define MCOUNT_REC() . = ALIGN(8); \
VMLINUX_SYMBOL(__start_mcount_loc) = .; \
- *(__mcount_loc) \
+ KEEP(*(__mcount_loc)) \
VMLINUX_SYMBOL(__stop_mcount_loc) = .;
#else
#define MCOUNT_REC()
@@ -139,10 +141,10 @@
#ifdef CONFIG_EVENT_TRACING
#define FTRACE_EVENTS() . = ALIGN(8); \
VMLINUX_SYMBOL(__start_ftrace_events) = .; \
- *(_ftrace_events) \
+ KEEP(*(_ftrace_events)) \
VMLINUX_SYMBOL(__stop_ftrace_events) = .; \
VMLINUX_SYMBOL(__start_ftrace_enum_maps) = .; \
- *(_ftrace_enum_map) \
+ KEEP(*(_ftrace_enum_map)) \
VMLINUX_SYMBOL(__stop_ftrace_enum_maps) = .;
#else
#define FTRACE_EVENTS()
@@ -185,8 +187,8 @@
#define _OF_TABLE_1(name) \
. = ALIGN(8); \
VMLINUX_SYMBOL(__##name##_of_table) = .; \
- *(__##name##_of_table) \
- *(__##name##_of_table_end)
+ KEEP(*(__##name##_of_table)) \
+ KEEP(*(__##name##_of_table_end))
#define CLKSRC_OF_TABLES() OF_TABLE(CONFIG_CLKSRC_OF, clksrc)
#define IRQCHIP_OF_MATCH_TABLE() OF_TABLE(CONFIG_IRQCHIP, irqchip)
@@ -304,28 +306,28 @@
/* PCI quirks */ \
.pci_fixup : AT(ADDR(.pci_fixup) - LOAD_OFFSET) { \
VMLINUX_SYMBOL(__start_pci_fixups_early) = .; \
- *(.pci_fixup_early) \
+ KEEP(*(.pci_fixup_early)) \
VMLINUX_SYMBOL(__end_pci_fixups_early) = .; \
VMLINUX_SYMBOL(__start_pci_fixups_header) = .; \
- *(.pci_fixup_header) \
+ KEEP(*(.pci_fixup_header)) \
VMLINUX_SYMBOL(__end_pci_fixups_header) = .; \
VMLINUX_SYMBOL(__start_pci_fixups_final) = .; \
- *(.pci_fixup_final) \
+ KEEP(*(.pci_fixup_final)) \
VMLINUX_SYMBOL(__end_pci_fixups_final) = .; \
VMLINUX_SYMBOL(__start_pci_fixups_enable) = .; \
- *(.pci_fixup_enable) \
+ KEEP(*(.pci_fixup_enable)) \
VMLINUX_SYMBOL(__end_pci_fixups_enable) = .; \
VMLINUX_SYMBOL(__start_pci_fixups_resume) = .; \
- *(.pci_fixup_resume) \
+ KEEP(*(.pci_fixup_resume)) \
VMLINUX_SYMBOL(__end_pci_fixups_resume) = .; \
VMLINUX_SYMBOL(__start_pci_fixups_resume_early) = .; \
- *(.pci_fixup_resume_early) \
+ KEEP(*(.pci_fixup_resume_early)) \
VMLINUX_SYMBOL(__end_pci_fixups_resume_early) = .; \
VMLINUX_SYMBOL(__start_pci_fixups_suspend) = .; \
- *(.pci_fixup_suspend) \
+ KEEP(*(.pci_fixup_suspend)) \
VMLINUX_SYMBOL(__end_pci_fixups_suspend) = .; \
VMLINUX_SYMBOL(__start_pci_fixups_suspend_late) = .; \
- *(.pci_fixup_suspend_late) \
+ KEEP(*(.pci_fixup_suspend_late)) \
VMLINUX_SYMBOL(__end_pci_fixups_suspend_late) = .; \
} \
\
@@ -423,7 +425,7 @@
/* Built-in module parameters. */ \
__param : AT(ADDR(__param) - LOAD_OFFSET) { \
VMLINUX_SYMBOL(__start___param) = .; \
- *(__param) \
+ KEEP(*(__param)) \
VMLINUX_SYMBOL(__stop___param) = .; \
} \
\
@@ -460,6 +462,8 @@
#define TEXT_TEXT \
ALIGN_FUNCTION(); \
*(.text.hot TEXT_MAIN .text.fixup .text.unlikely) \
+ *(.text..ftrace) \
+ *(TEXT_CFI_MAIN) \
*(.ref.text) \
MEM_KEEP(init.text) \
MEM_KEEP(exit.text) \
@@ -499,28 +503,20 @@
*(.entry.text) \
VMLINUX_SYMBOL(__entry_text_end) = .;
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
#define IRQENTRY_TEXT \
ALIGN_FUNCTION(); \
VMLINUX_SYMBOL(__irqentry_text_start) = .; \
*(.irqentry.text) \
VMLINUX_SYMBOL(__irqentry_text_end) = .;
-#else
-#define IRQENTRY_TEXT
-#endif
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
#define SOFTIRQENTRY_TEXT \
ALIGN_FUNCTION(); \
VMLINUX_SYMBOL(__softirqentry_text_start) = .; \
*(.softirqentry.text) \
VMLINUX_SYMBOL(__softirqentry_text_end) = .;
-#else
-#define SOFTIRQENTRY_TEXT
-#endif
/* Section used for early init (in .S files) */
-#define HEAD_TEXT *(.head.text)
+#define HEAD_TEXT KEEP(*(.head.text))
#define HEAD_TEXT_SECTION \
.head.text : AT(ADDR(.head.text) - LOAD_OFFSET) { \
@@ -534,7 +530,7 @@
. = ALIGN(align); \
__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { \
VMLINUX_SYMBOL(__start___ex_table) = .; \
- *(__ex_table) \
+ KEEP(*(__ex_table)) \
VMLINUX_SYMBOL(__stop___ex_table) = .; \
}
@@ -565,7 +561,7 @@
MEM_DISCARD(init.data) \
KERNEL_CTORS() \
MCOUNT_REC() \
- *(.init.rodata) \
+ *(.init.rodata .init.rodata.*) \
FTRACE_EVENTS() \
TRACE_SYSCALLS() \
KPROBE_BLACKLIST() \
@@ -583,7 +579,7 @@
EARLYCON_TABLE()
#define INIT_TEXT \
- *(.init.text) \
+ *(.init.text .init.text.*) \
*(.text.startup) \
MEM_DISCARD(init.text)
@@ -600,7 +596,7 @@
MEM_DISCARD(exit.text)
#define EXIT_CALL \
- *(.exitcall.exit)
+ KEEP(*(.exitcall.exit))
/*
* bss (Block Started by Symbol) - uninitialized data
@@ -676,7 +672,7 @@
. = ALIGN(8); \
__bug_table : AT(ADDR(__bug_table) - LOAD_OFFSET) { \
VMLINUX_SYMBOL(__start___bug_table) = .; \
- *(__bug_table) \
+ KEEP(*(__bug_table)) \
VMLINUX_SYMBOL(__stop___bug_table) = .; \
}
#else
@@ -705,7 +701,7 @@
#define INIT_SETUP(initsetup_align) \
. = ALIGN(initsetup_align); \
VMLINUX_SYMBOL(__setup_start) = .; \
- *(.init.setup) \
+ KEEP(*(.init.setup)) \
VMLINUX_SYMBOL(__setup_end) = .;
#define INIT_CALLS_LEVEL(level) \
diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h
new file mode 100644
index 000000000000..d000aa2120b0
--- /dev/null
+++ b/include/crypto/chacha.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common values and helper functions for the ChaCha and XChaCha stream ciphers.
+ *
+ * XChaCha extends ChaCha's nonce to 192 bits, while provably retaining ChaCha's
+ * security. Here they share the same key size, tfm context, and setkey
+ * function; only their IV size and encrypt/decrypt function differ.
+ *
+ * The ChaCha paper specifies 20, 12, and 8-round variants. In general, it is
+ * recommended to use the 20-round variant ChaCha20. However, the other
+ * variants can be needed in some performance-sensitive scenarios. The generic
+ * ChaCha code currently allows only the 20 and 12-round variants.
+ */
+
+#ifndef _CRYPTO_CHACHA_H
+#define _CRYPTO_CHACHA_H
+
+#include <linux/types.h>
+#include <linux/crypto.h>
+
+/* 32-bit stream position, then 96-bit nonce (RFC7539 convention) */
+#define CHACHA_IV_SIZE 16
+
+#define CHACHA_KEY_SIZE 32
+#define CHACHA_BLOCK_SIZE 64
+
+/* 192-bit nonce, then 64-bit stream position */
+#define XCHACHA_IV_SIZE 32
+
+struct chacha_ctx {
+ u32 key[8];
+ int nrounds;
+};
+
+void chacha_block(u32 *state, u8 *stream, int nrounds);
+static inline void chacha20_block(u32 *state, u8 *stream)
+{
+ chacha_block(state, stream, 20);
+}
+void hchacha_block(const u32 *in, u32 *out, int nrounds);
+
+void crypto_chacha_init(u32 *state, struct chacha_ctx *ctx, u8 *iv);
+
+int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keysize);
+int crypto_chacha12_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keysize);
+
+int crypto_chacha_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes);
+int crypto_xchacha_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes);
+
+#endif /* _CRYPTO_CHACHA_H */
diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h
deleted file mode 100644
index 20d20f681a72..000000000000
--- a/include/crypto/chacha20.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Common values for the ChaCha20 algorithm
- */
-
-#ifndef _CRYPTO_CHACHA20_H
-#define _CRYPTO_CHACHA20_H
-
-#include <linux/types.h>
-#include <linux/crypto.h>
-
-#define CHACHA20_IV_SIZE 16
-#define CHACHA20_KEY_SIZE 32
-#define CHACHA20_BLOCK_SIZE 64
-
-struct chacha20_ctx {
- u32 key[8];
-};
-
-void chacha20_block(u32 *state, void *stream);
-void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv);
-int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
- unsigned int keysize);
-int crypto_chacha20_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
- struct scatterlist *src, unsigned int nbytes);
-
-#endif
diff --git a/include/crypto/nhpoly1305.h b/include/crypto/nhpoly1305.h
new file mode 100644
index 000000000000..53c04423c582
--- /dev/null
+++ b/include/crypto/nhpoly1305.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common values and helper functions for the NHPoly1305 hash function.
+ */
+
+#ifndef _NHPOLY1305_H
+#define _NHPOLY1305_H
+
+#include <crypto/hash.h>
+#include <crypto/poly1305.h>
+
+/* NH parameterization: */
+
+/* Endianness: little */
+/* Word size: 32 bits (works well on NEON, SSE2, AVX2) */
+
+/* Stride: 2 words (optimal on ARM32 NEON; works okay on other CPUs too) */
+#define NH_PAIR_STRIDE 2
+#define NH_MESSAGE_UNIT (NH_PAIR_STRIDE * 2 * sizeof(u32))
+
+/* Num passes (Toeplitz iteration count): 4, to give ε = 2^{-128} */
+#define NH_NUM_PASSES 4
+#define NH_HASH_BYTES (NH_NUM_PASSES * sizeof(u64))
+
+/* Max message size: 1024 bytes (32x compression factor) */
+#define NH_NUM_STRIDES 64
+#define NH_MESSAGE_WORDS (NH_PAIR_STRIDE * 2 * NH_NUM_STRIDES)
+#define NH_MESSAGE_BYTES (NH_MESSAGE_WORDS * sizeof(u32))
+#define NH_KEY_WORDS (NH_MESSAGE_WORDS + \
+ NH_PAIR_STRIDE * 2 * (NH_NUM_PASSES - 1))
+#define NH_KEY_BYTES (NH_KEY_WORDS * sizeof(u32))
+
+#define NHPOLY1305_KEY_SIZE (POLY1305_BLOCK_SIZE + NH_KEY_BYTES)
+
+struct nhpoly1305_key {
+ struct poly1305_key poly_key;
+ u32 nh_key[NH_KEY_WORDS];
+};
+
+struct nhpoly1305_state {
+
+ /* Running total of polynomial evaluation */
+ struct poly1305_state poly_state;
+
+ /* Partial block buffer */
+ u8 buffer[NH_MESSAGE_UNIT];
+ unsigned int buflen;
+
+ /*
+ * Number of bytes remaining until the current NH message reaches
+ * NH_MESSAGE_BYTES. When nonzero, 'nh_hash' holds the partial NH hash.
+ */
+ unsigned int nh_remaining;
+
+ __le64 nh_hash[NH_NUM_PASSES];
+};
+
+typedef void (*nh_t)(const u32 *key, const u8 *message, size_t message_len,
+ __le64 hash[NH_NUM_PASSES]);
+
+int crypto_nhpoly1305_setkey(struct crypto_shash *tfm,
+ const u8 *key, unsigned int keylen);
+
+int crypto_nhpoly1305_init(struct shash_desc *desc);
+int crypto_nhpoly1305_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen);
+int crypto_nhpoly1305_update_helper(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen,
+ nh_t nh_fn);
+int crypto_nhpoly1305_final(struct shash_desc *desc, u8 *dst);
+int crypto_nhpoly1305_final_helper(struct shash_desc *desc, u8 *dst,
+ nh_t nh_fn);
+
+#endif /* _NHPOLY1305_H */
diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h
index d586f741cab5..0c0b83a91b91 100644
--- a/include/crypto/poly1305.h
+++ b/include/crypto/poly1305.h
@@ -12,13 +12,21 @@
#define POLY1305_KEY_SIZE 32
#define POLY1305_DIGEST_SIZE 16
+struct poly1305_key {
+ u32 r[5]; /* key, base 2^26 */
+};
+
+struct poly1305_state {
+ u32 h[5]; /* accumulator, base 2^26 */
+};
+
struct poly1305_desc_ctx {
/* key */
- u32 r[5];
+ struct poly1305_key r;
/* finalize key */
u32 s[4];
/* accumulator */
- u32 h[5];
+ struct poly1305_state h;
/* partial buffer */
u8 buf[POLY1305_BLOCK_SIZE];
/* bytes used in partial buffer */
@@ -29,6 +37,22 @@ struct poly1305_desc_ctx {
bool sset;
};
+/*
+ * Poly1305 core functions. These implement the ε-almost-∆-universal hash
+ * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce
+ * ("s key") at the end. They also only support block-aligned inputs.
+ */
+void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key);
+static inline void poly1305_core_init(struct poly1305_state *state)
+{
+ memset(state->h, 0, sizeof(state->h));
+}
+void poly1305_core_blocks(struct poly1305_state *state,
+ const struct poly1305_key *key,
+ const void *src, unsigned int nblocks);
+void poly1305_core_emit(const struct poly1305_state *state, void *dst);
+
+/* Crypto API helper functions for the Poly1305 MAC */
int crypto_poly1305_init(struct shash_desc *desc);
unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
const u8 *src, unsigned int srclen);
diff --git a/include/drm/drm_atomic.h b/include/drm/drm_atomic.h
index 9701f2dfb784..a5696c1806c9 100644
--- a/include/drm/drm_atomic.h
+++ b/include/drm/drm_atomic.h
@@ -144,6 +144,7 @@ struct __drm_crtcs_state {
struct drm_crtc *ptr;
struct drm_crtc_state *state;
struct drm_crtc_commit *commit;
+ s32 __user *out_fence_ptr;
};
struct __drm_connnectors_state {
@@ -316,6 +317,8 @@ drm_atomic_set_crtc_for_plane(struct drm_plane_state *plane_state,
struct drm_crtc *crtc);
void drm_atomic_set_fb_for_plane(struct drm_plane_state *plane_state,
struct drm_framebuffer *fb);
+void drm_atomic_set_fence_for_plane(struct drm_plane_state *plane_state,
+ struct fence *fence);
int __must_check
drm_atomic_set_crtc_for_connector(struct drm_connector_state *conn_state,
struct drm_crtc *crtc);
diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h
index 0aa292526567..f3d58c7eb97e 100644
--- a/include/drm/drm_crtc.h
+++ b/include/drm/drm_crtc.h
@@ -680,6 +680,35 @@ struct drm_crtc {
* context.
*/
struct drm_modeset_acquire_ctx *acquire_ctx;
+
+ /**
+ * @fence_context:
+ *
+ * timeline context used for fence operations.
+ */
+ unsigned int fence_context;
+
+ /**
+ * @fence_lock:
+ *
+ * spinlock to protect the fences in the fence_context.
+ */
+
+ spinlock_t fence_lock;
+ /**
+ * @fence_seqno:
+ *
+ * Seqno variable used as monotonic counter for the fences
+ * created on the CRTC's timeline.
+ */
+ unsigned long fence_seqno;
+
+ /**
+ * @timeline_name:
+ *
+ * The name of the CRTC's fence timeline.
+ */
+ char timeline_name[32];
};
/**
@@ -1160,6 +1189,17 @@ struct drm_mode_config {
*/
struct drm_property *prop_fb_id;
/**
+ * @prop_in_fence_fd: Sync File fd representing the incoming fences
+ * for a Plane.
+ */
+ struct drm_property *prop_in_fence_fd;
+ /**
+ * @prop_out_fence_ptr: Sync File fd pointer representing the
+ * outgoing fences for a CRTC. Userspace should provide a pointer to a
+ * value of type s32, and then cast that pointer to u64.
+ */
+ struct drm_property *prop_out_fence_ptr;
+ /**
* @prop_crtc_id: Default atomic plane property to specify the
* &drm_crtc.
*/
diff --git a/include/drm/drm_fb_cma_helper.h b/include/drm/drm_fb_cma_helper.h
index f313211f8ed5..3b00f6480b83 100644
--- a/include/drm/drm_fb_cma_helper.h
+++ b/include/drm/drm_fb_cma_helper.h
@@ -12,6 +12,8 @@ struct drm_fb_helper;
struct drm_device;
struct drm_file;
struct drm_mode_fb_cmd2;
+struct drm_plane;
+struct drm_plane_state;
struct drm_fbdev_cma *drm_fbdev_cma_init_with_funcs(struct drm_device *dev,
unsigned int preferred_bpp, unsigned int num_crtc,
@@ -41,6 +43,9 @@ struct drm_framebuffer *drm_fb_cma_create(struct drm_device *dev,
struct drm_gem_cma_object *drm_fb_cma_get_gem_obj(struct drm_framebuffer *fb,
unsigned int plane);
+int drm_fb_cma_prepare_fb(struct drm_plane *plane,
+ struct drm_plane_state *state);
+
#ifdef CONFIG_DEBUG_FS
struct seq_file;
diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h
index 8b4dc62470ff..952ef84dc046 100644
--- a/include/drm/drm_plane.h
+++ b/include/drm/drm_plane.h
@@ -65,7 +65,7 @@ struct drm_plane_state {
struct drm_crtc *crtc; /* do not write directly, use drm_atomic_set_crtc_for_plane() */
struct drm_framebuffer *fb; /* do not write directly, use drm_atomic_set_fb_for_plane() */
- struct fence *fence;
+ struct fence *fence; /* do not write directly, use drm_atomic_set_fence_for_plane() */
/* Signed dest location allows it to be partially off screen */
int32_t crtc_x, crtc_y;
diff --git a/include/keys/user-type.h b/include/keys/user-type.h
index c56fef40f53e..e098cbe27db5 100644
--- a/include/keys/user-type.h
+++ b/include/keys/user-type.h
@@ -48,9 +48,14 @@ extern void user_describe(const struct key *user, struct seq_file *m);
extern long user_read(const struct key *key,
char __user *buffer, size_t buflen);
-static inline const struct user_key_payload *user_key_payload(const struct key *key)
+static inline const struct user_key_payload *user_key_payload_rcu(const struct key *key)
{
- return (struct user_key_payload *)rcu_dereference_key(key);
+ return (struct user_key_payload *)dereference_key_rcu(key);
+}
+
+static inline struct user_key_payload *user_key_payload_locked(const struct key *key)
+{
+ return (struct user_key_payload *)dereference_key_locked((struct key *)key);
}
#endif /* CONFIG_KEYS */
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
new file mode 100644
index 000000000000..a4608897a5f3
--- /dev/null
+++ b/include/linux/Kbuild
@@ -0,0 +1,2 @@
+header-y += if_pppolac.h
+header-y += if_pppopns.h
diff --git a/include/linux/amba/mmci.h b/include/linux/amba/mmci.h
index 8c98113069ce..eff56cb0016a 100644
--- a/include/linux/amba/mmci.h
+++ b/include/linux/amba/mmci.h
@@ -5,6 +5,15 @@
#define AMBA_MMCI_H
#include <linux/mmc/host.h>
+#include <linux/mmc/card.h>
+#include <linux/mmc/sdio_func.h>
+
+struct embedded_sdio_data {
+ struct sdio_cis cis;
+ struct sdio_cccr cccr;
+ struct sdio_embedded_func *funcs;
+ int num_funcs;
+};
/**
* struct mmci_platform_data - platform configuration for the MMCI
@@ -31,6 +40,7 @@ struct mmci_platform_data {
int gpio_wp;
int gpio_cd;
bool cd_invert;
+ struct embedded_sdio_data *embedded_sdio;
};
#endif
diff --git a/include/linux/android_aid.h b/include/linux/android_aid.h
new file mode 100644
index 000000000000..6f1fa1792dfc
--- /dev/null
+++ b/include/linux/android_aid.h
@@ -0,0 +1,28 @@
+/* include/linux/android_aid.h
+ *
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_ANDROID_AID_H
+#define _LINUX_ANDROID_AID_H
+
+/* AIDs that the kernel treats differently */
+#define AID_OBSOLETE_000 KGIDT_INIT(3001) /* was NET_BT_ADMIN */
+#define AID_OBSOLETE_001 KGIDT_INIT(3002) /* was NET_BT */
+#define AID_INET KGIDT_INIT(3003)
+#define AID_NET_RAW KGIDT_INIT(3004)
+#define AID_NET_ADMIN KGIDT_INIT(3005)
+#define AID_NET_BW_STATS KGIDT_INIT(3006) /* read bandwidth statistics */
+#define AID_NET_BW_ACCT KGIDT_INIT(3007) /* change bandwidth statistics accounting */
+
+#endif
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 18863d56273c..9f721ee6690d 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -160,6 +160,7 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
#define SMCCC_SMC_INST "smc #0"
#define SMCCC_HVC_INST "hvc #0"
+#define SMCCC_REG(n) asm("x" # n)
#elif defined(CONFIG_ARM)
#include <asm/opcodes-sec.h>
@@ -167,6 +168,7 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
#define SMCCC_SMC_INST __SMC(0)
#define SMCCC_HVC_INST __HVC(0)
+#define SMCCC_REG(n) asm("r" # n)
#endif
@@ -199,57 +201,57 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
#define __declare_arg_0(a0, res) \
struct arm_smccc_res *___res = res; \
- register unsigned long r0 asm("r0") = (u32)a0; \
- register unsigned long r1 asm("r1"); \
- register unsigned long r2 asm("r2"); \
- register unsigned long r3 asm("r3")
+ register unsigned long r0 SMCCC_REG(0) = (u32)a0; \
+ register unsigned long r1 SMCCC_REG(1); \
+ register unsigned long r2 SMCCC_REG(2); \
+ register unsigned long r3 SMCCC_REG(3)
#define __declare_arg_1(a0, a1, res) \
typeof(a1) __a1 = a1; \
struct arm_smccc_res *___res = res; \
- register unsigned long r0 asm("r0") = (u32)a0; \
- register unsigned long r1 asm("r1") = __a1; \
- register unsigned long r2 asm("r2"); \
- register unsigned long r3 asm("r3")
+ register unsigned long r0 SMCCC_REG(0) = (u32)a0; \
+ register unsigned long r1 SMCCC_REG(1) = __a1; \
+ register unsigned long r2 SMCCC_REG(2); \
+ register unsigned long r3 SMCCC_REG(3)
#define __declare_arg_2(a0, a1, a2, res) \
typeof(a1) __a1 = a1; \
typeof(a2) __a2 = a2; \
struct arm_smccc_res *___res = res; \
- register unsigned long r0 asm("r0") = (u32)a0; \
- register unsigned long r1 asm("r1") = __a1; \
- register unsigned long r2 asm("r2") = __a2; \
- register unsigned long r3 asm("r3")
+ register unsigned long r0 SMCCC_REG(0) = (u32)a0; \
+ register unsigned long r1 SMCCC_REG(1) = __a1; \
+ register unsigned long r2 SMCCC_REG(2) = __a2; \
+ register unsigned long r3 SMCCC_REG(3)
#define __declare_arg_3(a0, a1, a2, a3, res) \
typeof(a1) __a1 = a1; \
typeof(a2) __a2 = a2; \
typeof(a3) __a3 = a3; \
struct arm_smccc_res *___res = res; \
- register unsigned long r0 asm("r0") = (u32)a0; \
- register unsigned long r1 asm("r1") = __a1; \
- register unsigned long r2 asm("r2") = __a2; \
- register unsigned long r3 asm("r3") = __a3
+ register unsigned long r0 SMCCC_REG(0) = (u32)a0; \
+ register unsigned long r1 SMCCC_REG(1) = __a1; \
+ register unsigned long r2 SMCCC_REG(2) = __a2; \
+ register unsigned long r3 SMCCC_REG(3) = __a3
#define __declare_arg_4(a0, a1, a2, a3, a4, res) \
typeof(a4) __a4 = a4; \
__declare_arg_3(a0, a1, a2, a3, res); \
- register unsigned long r4 asm("r4") = __a4
+ register unsigned long r4 SMCCC_REG(4) = __a4
#define __declare_arg_5(a0, a1, a2, a3, a4, a5, res) \
typeof(a5) __a5 = a5; \
__declare_arg_4(a0, a1, a2, a3, a4, res); \
- register unsigned long r5 asm("r5") = __a5
+ register unsigned long r5 SMCCC_REG(5) = __a5
#define __declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res) \
typeof(a6) __a6 = a6; \
__declare_arg_5(a0, a1, a2, a3, a4, a5, res); \
- register unsigned long r6 asm("r6") = __a6
+ register unsigned long r6 SMCCC_REG(6) = __a6
#define __declare_arg_7(a0, a1, a2, a3, a4, a5, a6, a7, res) \
typeof(a7) __a7 = a7; \
__declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res); \
- register unsigned long r7 asm("r7") = __a7
+ register unsigned long r7 SMCCC_REG(7) = __a7
#define ___declare_args(count, ...) __declare_arg_ ## count(__VA_ARGS__)
#define __declare_args(count, ...) ___declare_args(count, __VA_ARGS__)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index cd395ecec99d..99ad6101320b 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -25,6 +25,7 @@ typedef void (bio_end_io_t) (struct bio *);
struct bio {
struct bio *bi_next; /* request queue link */
struct block_device *bi_bdev;
+ unsigned short bi_write_hint;
int bi_error;
unsigned int bi_opf; /* bottom bits req flags,
* top bits REQ_OP. Use
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bd738aafd432..657689db8a1a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -196,6 +196,9 @@ struct request {
/* for bidi */
struct request *next_rq;
+
+ ktime_t lat_hist_io_start;
+ int lat_hist_enabled;
};
#define REQ_OP_SHIFT (8 * sizeof(u64) - REQ_OP_BITS)
@@ -1700,6 +1703,62 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *,
extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *);
extern int bdev_dax_supported(struct super_block *, int);
extern bool bdev_dax_capable(struct block_device *);
+
+/*
+ * X-axis for IO latency histogram support.
+ */
+static const u_int64_t latency_x_axis_us[] = {
+ 100,
+ 200,
+ 300,
+ 400,
+ 500,
+ 600,
+ 700,
+ 800,
+ 900,
+ 1000,
+ 1200,
+ 1400,
+ 1600,
+ 1800,
+ 2000,
+ 2500,
+ 3000,
+ 4000,
+ 5000,
+ 6000,
+ 7000,
+ 9000,
+ 10000
+};
+
+#define BLK_IO_LAT_HIST_DISABLE 0
+#define BLK_IO_LAT_HIST_ENABLE 1
+#define BLK_IO_LAT_HIST_ZERO 2
+
+struct io_latency_state {
+ u_int64_t latency_y_axis[ARRAY_SIZE(latency_x_axis_us) + 1];
+ u_int64_t latency_elems;
+ u_int64_t latency_sum;
+};
+
+static inline void
+blk_update_latency_hist(struct io_latency_state *s, u_int64_t delta_us)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(latency_x_axis_us); i++)
+ if (delta_us < (u_int64_t)latency_x_axis_us[i])
+ break;
+ s->latency_y_axis[i]++;
+ s->latency_elems++;
+ s->latency_sum += delta_us;
+}
+
+ssize_t blk_latency_hist_show(char* name, struct io_latency_state *s,
+ char *buf, int buf_size);
+
#else /* CONFIG_BLOCK */
struct block_device;
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
new file mode 100644
index 000000000000..ace92fce296d
--- /dev/null
+++ b/include/linux/bpf-cgroup.h
@@ -0,0 +1,77 @@
+#ifndef _BPF_CGROUP_H
+#define _BPF_CGROUP_H
+
+#include <linux/jump_label.h>
+#include <uapi/linux/bpf.h>
+
+struct sock;
+struct cgroup;
+struct sk_buff;
+
+#ifdef CONFIG_CGROUP_BPF
+
+extern struct static_key_false cgroup_bpf_enabled_key;
+#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
+
+struct cgroup_bpf {
+ /*
+ * Store two sets of bpf_prog pointers, one for programs that are
+ * pinned directly to this cgroup, and one for those that are effective
+ * when this cgroup is accessed.
+ */
+ struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE];
+ struct bpf_prog *effective[MAX_BPF_ATTACH_TYPE];
+ bool disallow_override[MAX_BPF_ATTACH_TYPE];
+};
+
+void cgroup_bpf_put(struct cgroup *cgrp);
+void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent);
+
+int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
+ struct bpf_prog *prog, enum bpf_attach_type type,
+ bool overridable);
+
+/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */
+int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, bool overridable);
+
+int __cgroup_bpf_run_filter(struct sock *sk,
+ struct sk_buff *skb,
+ enum bpf_attach_type type);
+
+/* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. */
+#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) \
+({ \
+ int __ret = 0; \
+ if (cgroup_bpf_enabled) \
+ __ret = __cgroup_bpf_run_filter(sk, skb, \
+ BPF_CGROUP_INET_INGRESS); \
+ \
+ __ret; \
+})
+
+#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) \
+({ \
+ int __ret = 0; \
+ if (cgroup_bpf_enabled && sk && sk == skb->sk) { \
+ typeof(sk) __sk = sk_to_full_sk(sk); \
+ if (sk_fullsock(__sk)) \
+ __ret = __cgroup_bpf_run_filter(__sk, skb, \
+ BPF_CGROUP_INET_EGRESS); \
+ } \
+ __ret; \
+})
+
+#else
+
+struct cgroup_bpf {};
+static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
+static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
+ struct cgroup *parent) {}
+
+#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
+
+#endif /* CONFIG_CGROUP_BPF */
+
+#endif /* _BPF_CGROUP_H */
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7995940d4187..cd6aaf08324d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -56,6 +56,9 @@ struct bpf_map {
atomic_t refcnt;
atomic_t usercnt;
struct work_struct work;
+#ifdef CONFIG_SECURITY
+ void *security;
+#endif
};
struct bpf_map_type_list {
@@ -189,6 +192,9 @@ struct bpf_prog_aux {
struct bpf_map **used_maps;
struct bpf_prog *prog;
struct user_struct *user;
+#ifdef CONFIG_SECURITY
+ void *security;
+#endif
union {
struct work_struct work;
struct rcu_head rcu;
@@ -241,6 +247,9 @@ DECLARE_PER_CPU(int, bpf_prog_active);
void bpf_register_prog_type(struct bpf_prog_type_list *tl);
void bpf_register_map_type(struct bpf_map_type_list *tl);
+extern const struct file_operations bpf_map_fops;
+extern const struct file_operations bpf_prog_fops;
+
struct bpf_prog *bpf_prog_get(u32 ufd);
struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i);
@@ -258,11 +267,11 @@ void bpf_map_area_free(void *base);
extern int sysctl_unprivileged_bpf_disabled;
-int bpf_map_new_fd(struct bpf_map *map);
+int bpf_map_new_fd(struct bpf_map *map, int flags);
int bpf_prog_new_fd(struct bpf_prog *prog);
int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
-int bpf_obj_get_user(const char __user *pathname);
+int bpf_obj_get_user(const char __user *pathname, int flags);
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
@@ -277,6 +286,8 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags);
void bpf_fd_array_map_clear(struct bpf_map *map);
+int bpf_get_file_flag(int flags);
+
/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
* forced to use 'long' read/writes to try to atomically copy long counters.
* Best-effort only. No barriers here, since it _will_ race with concurrent
@@ -295,6 +306,9 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
/* verify correctness of eBPF program */
int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
+
+struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type);
+
#else
static inline void bpf_register_prog_type(struct bpf_prog_type_list *tl)
{
@@ -322,6 +336,16 @@ static inline struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
{
return ERR_PTR(-EOPNOTSUPP);
}
+static inline int bpf_obj_get_user(const char __user *pathname)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline struct bpf_prog *bpf_prog_get_type_path(const char *name,
+ enum bpf_prog_type type)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
#endif /* CONFIG_BPF_SYSCALL */
/* verifier prototypes for helper functions called from eBPF programs */
diff --git a/include/linux/bug.h b/include/linux/bug.h
index 292d6a10b0c2..5828489309bb 100644
--- a/include/linux/bug.h
+++ b/include/linux/bug.h
@@ -121,4 +121,23 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr,
}
#endif /* CONFIG_GENERIC_BUG */
+
+/*
+ * Since detected data corruption should stop operation on the affected
+ * structures. Return value must be checked and sanely acted on by caller.
+ */
+static inline __must_check bool check_data_corruption(bool v) { return v; }
+#define CHECK_DATA_CORRUPTION(condition, fmt, ...) \
+ check_data_corruption(({ \
+ bool corruption = unlikely(condition); \
+ if (corruption) { \
+ if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \
+ pr_err(fmt, ##__VA_ARGS__); \
+ BUG(); \
+ } else \
+ WARN(1, fmt, ##__VA_ARGS__); \
+ } \
+ corruption; \
+ }))
+
#endif /* _LINUX_BUG_H */
diff --git a/include/linux/cfi.h b/include/linux/cfi.h
new file mode 100644
index 000000000000..e27033d5dd53
--- /dev/null
+++ b/include/linux/cfi.h
@@ -0,0 +1,38 @@
+#ifndef _LINUX_CFI_H
+#define _LINUX_CFI_H
+
+#include <linux/stringify.h>
+
+#ifdef CONFIG_CFI_CLANG
+#ifdef CONFIG_MODULES
+
+typedef void (*cfi_check_fn)(uint64_t, void *, void *);
+
+/* Compiler-generated function in each module, and the kernel */
+#define CFI_CHECK_FN __cfi_check
+#define CFI_CHECK_FN_NAME __stringify(CFI_CHECK_FN)
+
+extern void CFI_CHECK_FN(uint64_t, void *, void *);
+
+#ifdef CONFIG_CFI_CLANG_SHADOW
+extern void cfi_module_add(struct module *mod, unsigned long min_addr,
+ unsigned long max_addr);
+
+extern void cfi_module_remove(struct module *mod, unsigned long min_addr,
+ unsigned long max_addr);
+#else
+static inline void cfi_module_add(struct module *mod, unsigned long min_addr,
+ unsigned long max_addr)
+{
+}
+
+static inline void cfi_module_remove(struct module *mod, unsigned long min_addr,
+ unsigned long max_addr)
+{
+}
+#endif /* CONFIG_CFI_CLANG_SHADOW */
+
+#endif /* CONFIG_MODULES */
+#endif /* CONFIG_CFI_CLANG */
+
+#endif /* _LINUX_CFI_H */
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1619a3213af5..9c72b21d4b39 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -16,6 +16,7 @@
#include <linux/percpu-refcount.h>
#include <linux/percpu-rwsem.h>
#include <linux/workqueue.h>
+#include <linux/bpf-cgroup.h>
#ifdef CONFIG_CGROUPS
@@ -301,6 +302,9 @@ struct cgroup {
/* used to schedule release agent */
struct work_struct release_agent_work;
+ /* used to store eBPF programs */
+ struct cgroup_bpf bpf;
+
/* ids of the ancestors at each level including self */
int ancestor_ids[];
};
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 0df0336acee9..7f4a2a5a2a77 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -20,6 +20,10 @@ SUBSYS(cpu)
SUBSYS(cpuacct)
#endif
+#if IS_ENABLED(CONFIG_CGROUP_SCHEDTUNE)
+SUBSYS(schedtune)
+#endif
+
#if IS_ENABLED(CONFIG_BLK_CGROUP)
SUBSYS(io)
#endif
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 08398182f56e..36dc52067377 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -117,7 +117,7 @@ struct clocksource {
#define CLOCK_SOURCE_RESELECT 0x100
/* simplify initialization of mask field */
-#define CLOCKSOURCE_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1)
+#define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0)
static inline u32 clocksource_freq2mult(u32 freq, u32 shift_constant, u64 from)
{
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index 21c88a7ac23b..eeef7d65a6de 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -23,3 +23,20 @@
#ifdef __noretpoline
#undef __noretpoline
#endif
+
+#ifdef CONFIG_LTO_CLANG
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+#define __norecordmcount \
+ __attribute__((__section__(".text..ftrace")))
+#endif
+
+#define __nocfi __attribute__((no_sanitize("cfi")))
+#endif
+
+/* all clang versions usable with the kernel support KASAN ABI version 5 */
+#define KASAN_ABI_VERSION 5
+
+/* emulate gcc's __SANITIZE_ADDRESS__ flag */
+#if __has_feature(address_sanitizer)
+#define __SANITIZE_ADDRESS__
+#endif
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 4f3dfabb680f..7385c7f1b60d 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -451,6 +451,14 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
#define __visible
#endif
+#ifndef __norecordmcount
+#define __norecordmcount
+#endif
+
+#ifndef __nocfi
+#define __nocfi
+#endif
+
/*
* Assume alignment of return value.
*/
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index ae5ac89324df..fdf5be472eff 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -276,4 +276,11 @@ static inline void cpu_smt_check_topology_early(void) { }
static inline void cpu_smt_check_topology(void) { }
#endif
+#define IDLE_START 1
+#define IDLE_END 2
+
+void idle_notifier_register(struct notifier_block *n);
+void idle_notifier_unregister(struct notifier_block *n);
+void idle_notifier_call_chain(unsigned long val);
+
#endif /* _LINUX_CPU_H_ */
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 32dc0cbd51ca..61f405d70556 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -120,6 +120,14 @@ struct cpufreq_policy {
bool fast_switch_possible;
bool fast_switch_enabled;
+ /*
+ * Preferred average time interval between consecutive invocations of
+ * the driver to set the frequency for this policy. To be set by the
+ * scaling driver (0, which is the default, means no preference).
+ */
+ unsigned int up_transition_delay_us;
+ unsigned int down_transition_delay_us;
+
/* Cached frequency lookup from cpufreq_driver_resolve_freq. */
unsigned int cached_target_freq;
int cached_resolved_idx;
@@ -177,6 +185,7 @@ u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy);
int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu);
int cpufreq_update_policy(unsigned int cpu);
bool have_governor_per_policy(void);
+bool cpufreq_driver_is_slow(void);
struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
void cpufreq_disable_fast_switch(struct cpufreq_policy *policy);
@@ -359,6 +368,14 @@ struct cpufreq_driver {
*/
#define CPUFREQ_NEED_INITIAL_FREQ_CHECK (1 << 5)
+/*
+ * Indicates that it is safe to call cpufreq_driver_target from
+ * non-interruptable context in scheduler hot paths. Drivers must
+ * opt-in to this flag, as the safe default is that they might sleep
+ * or be too slow for hot path use.
+ */
+#define CPUFREQ_DRIVER_FAST (1 << 6)
+
int cpufreq_register_driver(struct cpufreq_driver *driver_data);
int cpufreq_unregister_driver(struct cpufreq_driver *driver_data);
@@ -553,6 +570,32 @@ struct governor_attr {
ssize_t (*store)(struct gov_attr_set *attr_set, const char *buf,
size_t count);
};
+/* CPUFREQ DEFAULT GOVERNOR */
+/*
+ * Performance governor is fallback governor if any other gov failed to auto
+ * load due latency restrictions
+ */
+#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE
+extern struct cpufreq_governor cpufreq_gov_performance;
+#endif
+#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE
+#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_performance)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE)
+extern struct cpufreq_governor cpufreq_gov_powersave;
+#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_powersave)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE)
+extern struct cpufreq_governor cpufreq_gov_userspace;
+#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_userspace)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND)
+extern struct cpufreq_governor cpufreq_gov_ondemand;
+#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_ondemand)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
+extern struct cpufreq_governor cpufreq_gov_conservative;
+#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED)
+extern struct cpufreq_governor cpufreq_gov_sched;
+#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_sched)
+#endif
/*********************************************************************
* FREQUENCY TABLE HELPERS *
@@ -886,4 +929,9 @@ unsigned int cpufreq_generic_get(unsigned int cpu);
int cpufreq_generic_init(struct cpufreq_policy *policy,
struct cpufreq_frequency_table *table,
unsigned int transition_latency);
+
+struct sched_domain;
+unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu);
+unsigned long cpufreq_scale_max_freq_capacity(struct sched_domain *sd, int cpu);
+unsigned long cpufreq_scale_min_freq_capacity(struct sched_domain *sd, int cpu);
#endif /* _LINUX_CPUFREQ_H */
diff --git a/include/linux/cpufreq_times.h b/include/linux/cpufreq_times.h
new file mode 100644
index 000000000000..356a3fad03c9
--- /dev/null
+++ b/include/linux/cpufreq_times.h
@@ -0,0 +1,46 @@
+/* drivers/cpufreq/cpufreq_times.c
+ *
+ * Copyright (C) 2018 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_CPUFREQ_TIMES_H
+#define _LINUX_CPUFREQ_TIMES_H
+
+#include <linux/cpufreq.h>
+#include <linux/cputime.h>
+#include <linux/pid.h>
+
+#ifdef CONFIG_CPU_FREQ_TIMES
+void cpufreq_task_times_init(struct task_struct *p);
+void cpufreq_task_times_alloc(struct task_struct *p);
+void cpufreq_task_times_exit(struct task_struct *p);
+int proc_time_in_state_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *p);
+void cpufreq_acct_update_power(struct task_struct *p, cputime_t cputime);
+void cpufreq_times_create_policy(struct cpufreq_policy *policy);
+void cpufreq_times_record_transition(struct cpufreq_freqs *freq);
+void cpufreq_task_times_remove_uids(uid_t uid_start, uid_t uid_end);
+int single_uid_time_in_state_open(struct inode *inode, struct file *file);
+#else
+static inline void cpufreq_task_times_init(struct task_struct *p) {}
+static inline void cpufreq_task_times_alloc(struct task_struct *p) {}
+static inline void cpufreq_task_times_exit(struct task_struct *p) {}
+static inline void cpufreq_acct_update_power(struct task_struct *p,
+ u64 cputime) {}
+static inline void cpufreq_times_create_policy(struct cpufreq_policy *policy) {}
+static inline void cpufreq_times_record_transition(
+ struct cpufreq_freqs *freq) {}
+static inline void cpufreq_task_times_remove_uids(uid_t uid_start,
+ uid_t uid_end) {}
+#endif /* CONFIG_CPU_FREQ_TIMES */
+#endif /* _LINUX_CPUFREQ_TIMES_H */
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index bb31373c3478..9a8eec9e59b2 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -207,7 +207,7 @@ static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv,
#endif
/* kernel/sched/idle.c */
-extern void sched_idle_set_state(struct cpuidle_state *idle_state);
+extern void sched_idle_set_state(struct cpuidle_state *idle_state, int index);
extern void default_idle_call(void);
#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 9de26962338e..81339e184ae7 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -24,6 +24,7 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/uaccess.h>
+#include <linux/completion.h>
/*
* Autoloaded crypto modules should only use a prefixed name to avoid allowing
@@ -478,6 +479,45 @@ struct crypto_alg {
} CRYPTO_MINALIGN_ATTR;
/*
+ * A helper struct for waiting for completion of async crypto ops
+ */
+struct crypto_wait {
+ struct completion completion;
+ int err;
+};
+
+/*
+ * Macro for declaring a crypto op async wait object on stack
+ */
+#define DECLARE_CRYPTO_WAIT(_wait) \
+ struct crypto_wait _wait = { \
+ COMPLETION_INITIALIZER_ONSTACK((_wait).completion), 0 }
+
+/*
+ * Async ops completion helper functioons
+ */
+void crypto_req_done(struct crypto_async_request *req, int err);
+
+static inline int crypto_wait_req(int err, struct crypto_wait *wait)
+{
+ switch (err) {
+ case -EINPROGRESS:
+ case -EBUSY:
+ wait_for_completion(&wait->completion);
+ reinit_completion(&wait->completion);
+ err = wait->err;
+ break;
+ };
+
+ return err;
+}
+
+static inline void crypto_init_wait(struct crypto_wait *wait)
+{
+ init_completion(&wait->completion);
+}
+
+/*
* Algorithm registration interface.
*/
int crypto_register_alg(struct crypto_alg *alg);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index b757ee42bc63..014d7f9ac615 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -142,6 +142,7 @@ struct dentry_operations {
int (*d_manage)(struct dentry *, bool);
struct dentry *(*d_real)(struct dentry *, const struct inode *,
unsigned int);
+ void (*d_canonical_path)(const struct path *, struct path *);
} ____cacheline_aligned;
/*
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index ef7962e84444..cf86f528e615 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -408,6 +408,12 @@ void dm_set_mdptr(struct mapped_device *md, void *ptr);
void *dm_get_mdptr(struct mapped_device *md);
/*
+ * Export the device via the ioctl interface (uses mdptr).
+ */
+int dm_ioctl_export(struct mapped_device *md, const char *name,
+ const char *uuid);
+
+/*
* A device can still be used while suspended, but I/O is deferred.
*/
int dm_suspend(struct mapped_device *md, unsigned suspend_flags);
@@ -434,6 +440,13 @@ union map_info *dm_get_rq_mapinfo(struct request *rq);
struct queue_limits *dm_get_queue_limits(struct mapped_device *md);
+void dm_lock_md_type(struct mapped_device *md);
+void dm_unlock_md_type(struct mapped_device *md);
+void dm_set_md_type(struct mapped_device *md, unsigned type);
+unsigned dm_get_md_type(struct mapped_device *md);
+int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
+unsigned dm_table_get_type(struct dm_table *t);
+
/*
* Geometry functions.
*/
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 80b1b8faf503..2877ccb3cdcc 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1427,7 +1427,7 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg,
unsigned long *load_addr,
unsigned long *load_size);
-efi_status_t efi_parse_options(char *cmdline);
+efi_status_t efi_parse_options(char const *cmdline);
efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg,
struct screen_info *si, efi_guid_t *proto,
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index e46e7d10312b..d7711048ef93 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/**
* include/linux/f2fs_fs.h
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#ifndef _LINUX_F2FS_FS_H
#define _LINUX_F2FS_FS_H
@@ -21,6 +18,7 @@
#define F2FS_BLKSIZE 4096 /* support only 4KB block */
#define F2FS_BLKSIZE_BITS 12 /* bits for F2FS_BLKSIZE */
#define F2FS_MAX_EXTENSION 64 /* # of extension entries */
+#define F2FS_EXTENSION_LEN 8 /* max size of extension */
#define F2FS_BLK_ALIGN(x) (((x) + F2FS_BLKSIZE - 1) >> F2FS_BLKSIZE_BITS)
#define NULL_ADDR ((block_t)0) /* used as block_t addresses */
@@ -32,13 +30,20 @@
/* 0, 1(node nid), 2(meta nid) are reserved node id */
#define F2FS_RESERVED_NODE_NUM 3
-#define F2FS_ROOT_INO(sbi) (sbi->root_ino_num)
-#define F2FS_NODE_INO(sbi) (sbi->node_ino_num)
-#define F2FS_META_INO(sbi) (sbi->meta_ino_num)
+#define F2FS_ROOT_INO(sbi) ((sbi)->root_ino_num)
+#define F2FS_NODE_INO(sbi) ((sbi)->node_ino_num)
+#define F2FS_META_INO(sbi) ((sbi)->meta_ino_num)
+
+#define F2FS_MAX_QUOTAS 3
+
+#define F2FS_IO_SIZE(sbi) (1 << F2FS_OPTION(sbi).write_io_size_bits) /* Blocks */
+#define F2FS_IO_SIZE_KB(sbi) (1 << (F2FS_OPTION(sbi).write_io_size_bits + 2)) /* KB */
+#define F2FS_IO_SIZE_BYTES(sbi) (1 << (F2FS_OPTION(sbi).write_io_size_bits + 12)) /* B */
+#define F2FS_IO_SIZE_BITS(sbi) (F2FS_OPTION(sbi).write_io_size_bits) /* power of 2 */
+#define F2FS_IO_SIZE_MASK(sbi) (F2FS_IO_SIZE(sbi) - 1)
/* This flag is used by node and meta inodes, and by recovery */
#define GFP_F2FS_ZERO (GFP_NOFS | __GFP_ZERO)
-#define GFP_F2FS_HIGH_ZERO (GFP_NOFS | __GFP_ZERO | __GFP_HIGHMEM)
/*
* For further optimization on multi-head logs, on-disk layout supports maximum
@@ -52,10 +57,17 @@
#define VERSION_LEN 256
#define MAX_VOLUME_NAME 512
+#define MAX_PATH_LEN 64
+#define MAX_DEVICES 8
/*
* For superblock
*/
+struct f2fs_device {
+ __u8 path[MAX_PATH_LEN];
+ __le32 total_segments;
+} __packed;
+
struct f2fs_super_block {
__le32 magic; /* Magic Number */
__le16 major_ver; /* Major Version */
@@ -87,19 +99,29 @@ struct f2fs_super_block {
__u8 uuid[16]; /* 128-bit uuid for volume */
__le16 volume_name[MAX_VOLUME_NAME]; /* volume name */
__le32 extension_count; /* # of extensions below */
- __u8 extension_list[F2FS_MAX_EXTENSION][8]; /* extension array */
+ __u8 extension_list[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN];/* extension array */
__le32 cp_payload;
__u8 version[VERSION_LEN]; /* the kernel version */
__u8 init_version[VERSION_LEN]; /* the initial kernel version */
__le32 feature; /* defined features */
__u8 encryption_level; /* versioning level for encryption */
__u8 encrypt_pw_salt[16]; /* Salt used for string2key algorithm */
- __u8 reserved[871]; /* valid reserved region */
+ struct f2fs_device devs[MAX_DEVICES]; /* device list */
+ __le32 qf_ino[F2FS_MAX_QUOTAS]; /* quota inode numbers */
+ __u8 hot_ext_count; /* # of hot file extension */
+ __u8 reserved[310]; /* valid reserved region */
+ __le32 crc; /* checksum of superblock */
} __packed;
/*
* For checkpoint
*/
+#define CP_DISABLED_FLAG 0x00001000
+#define CP_QUOTA_NEED_FSCK_FLAG 0x00000800
+#define CP_LARGE_NAT_BITMAP_FLAG 0x00000400
+#define CP_NOCRC_RECOVERY_FLAG 0x00000200
+#define CP_TRIMMED_FLAG 0x00000100
+#define CP_NAT_BITS_FLAG 0x00000080
#define CP_CRC_RECOVERY_FLAG 0x00000040
#define CP_FASTBOOT_FLAG 0x00000020
#define CP_FSCK_FLAG 0x00000010
@@ -146,7 +168,7 @@ struct f2fs_checkpoint {
*/
#define F2FS_ORPHANS_PER_BLOCK 1020
-#define GET_ORPHAN_BLOCKS(n) ((n + F2FS_ORPHANS_PER_BLOCK - 1) / \
+#define GET_ORPHAN_BLOCKS(n) (((n) + F2FS_ORPHANS_PER_BLOCK - 1) / \
F2FS_ORPHANS_PER_BLOCK)
struct f2fs_orphan_block {
@@ -168,8 +190,11 @@ struct f2fs_extent {
} __packed;
#define F2FS_NAME_LEN 255
-#define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */
+/* 200 bytes for inline xattrs by default */
+#define DEFAULT_INLINE_XATTR_ADDRS 50
#define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */
+#define CUR_ADDRS_PER_INODE(inode) (DEF_ADDRS_PER_INODE - \
+ get_extra_isize(inode))
#define DEF_NIDS_PER_INODE 5 /* Node IDs in an Inode */
#define ADDRS_PER_INODE(inode) addrs_per_inode(inode)
#define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */
@@ -189,9 +214,8 @@ struct f2fs_extent {
#define F2FS_INLINE_DENTRY 0x04 /* file inline dentry flag */
#define F2FS_DATA_EXIST 0x08 /* file inline data exist flag */
#define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries */
-
-#define MAX_INLINE_DATA (sizeof(__le32) * (DEF_ADDRS_PER_INODE - \
- F2FS_INLINE_XATTR_ADDRS - 1))
+#define F2FS_EXTRA_ATTR 0x20 /* file having extra attribute */
+#define F2FS_PIN_FILE 0x40 /* file should not be gced */
struct f2fs_inode {
__le16 i_mode; /* file mode */
@@ -209,7 +233,13 @@ struct f2fs_inode {
__le32 i_ctime_nsec; /* change time in nano scale */
__le32 i_mtime_nsec; /* modification time in nano scale */
__le32 i_generation; /* file version (for NFS) */
- __le32 i_current_depth; /* only for directory depth */
+ union {
+ __le32 i_current_depth; /* only for directory depth */
+ __le16 i_gc_failures; /*
+ * # of gc failures on pinned file.
+ * only for regular files.
+ */
+ };
__le32 i_xattr_nid; /* nid to save xattr */
__le32 i_flags; /* file attributes */
__le32 i_pino; /* parent inode number */
@@ -219,8 +249,18 @@ struct f2fs_inode {
struct f2fs_extent i_ext; /* caching a largest extent */
- __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */
-
+ union {
+ struct {
+ __le16 i_extra_isize; /* extra inode attribute size */
+ __le16 i_inline_xattr_size; /* inline xattr size, unit: 4 bytes */
+ __le32 i_projid; /* project id */
+ __le32 i_inode_checksum;/* inode meta checksum */
+ __le64 i_crtime; /* creation time */
+ __le32 i_crtime_nsec; /* creation time in nano scale */
+ __le32 i_extra_end[0]; /* for attribute size calculation */
+ } __packed;
+ __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */
+ };
__le32 i_nid[DEF_NIDS_PER_INODE]; /* direct(2), indirect(2),
double_indirect(1) node id */
} __packed;
@@ -439,7 +479,7 @@ typedef __le32 f2fs_hash_t;
#define F2FS_SLOT_LEN 8
#define F2FS_SLOT_LEN_BITS 3
-#define GET_DENTRY_SLOTS(x) ((x + F2FS_SLOT_LEN - 1) >> F2FS_SLOT_LEN_BITS)
+#define GET_DENTRY_SLOTS(x) (((x) + F2FS_SLOT_LEN - 1) >> F2FS_SLOT_LEN_BITS)
/* MAX level for dir lookup */
#define MAX_DIR_HASH_DEPTH 63
@@ -448,7 +488,7 @@ typedef __le32 f2fs_hash_t;
#define MAX_DIR_BUCKETS (1 << ((MAX_DIR_HASH_DEPTH / 2) - 1))
/*
- * space utilization of regular dentry and inline dentry
+ * space utilization of regular dentry and inline dentry (w/o extra reservation)
* regular dentry inline dentry
* bitmap 1 * 27 = 27 1 * 23 = 23
* reserved 1 * 3 = 3 1 * 7 = 7
@@ -484,24 +524,6 @@ struct f2fs_dentry_block {
__u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN];
} __packed;
-/* for inline dir */
-#define NR_INLINE_DENTRY (MAX_INLINE_DATA * BITS_PER_BYTE / \
- ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \
- BITS_PER_BYTE + 1))
-#define INLINE_DENTRY_BITMAP_SIZE ((NR_INLINE_DENTRY + \
- BITS_PER_BYTE - 1) / BITS_PER_BYTE)
-#define INLINE_RESERVED_SIZE (MAX_INLINE_DATA - \
- ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \
- NR_INLINE_DENTRY + INLINE_DENTRY_BITMAP_SIZE))
-
-/* inline directory entry structure */
-struct f2fs_inline_dentry {
- __u8 dentry_bitmap[INLINE_DENTRY_BITMAP_SIZE];
- __u8 reserved[INLINE_RESERVED_SIZE];
- struct f2fs_dir_entry dentry[NR_INLINE_DENTRY];
- __u8 filename[NR_INLINE_DENTRY][F2FS_SLOT_LEN];
-} __packed;
-
/* file types used in inode_info->flags */
enum {
F2FS_FT_UNKNOWN,
@@ -517,4 +539,6 @@ enum {
#define S_SHIFT 12
+#define F2FS_DEF_PROJID 0 /* default project ID */
+
#endif /* _LINUX_F2FS_FS_H */
diff --git a/include/linux/fence.h b/include/linux/fence.h
index 9bb2c0c97a21..7c9b78cf2a7e 100644
--- a/include/linux/fence.h
+++ b/include/linux/fence.h
@@ -108,6 +108,7 @@ struct fence_cb {
* @get_driver_name: returns the driver name.
* @get_timeline_name: return the name of the context this fence belongs to.
* @enable_signaling: enable software signaling of fence.
+ * @disable_signaling: disable software signaling of fence (optional).
* @signaled: [optional] peek whether the fence is signaled, can be null.
* @wait: custom wait implementation, or fence_default_wait.
* @release: [optional] called on destruction of fence, can be null
@@ -167,6 +168,7 @@ struct fence_ops {
const char * (*get_driver_name)(struct fence *fence);
const char * (*get_timeline_name)(struct fence *fence);
bool (*enable_signaling)(struct fence *fence);
+ void (*disable_signaling)(struct fence *fence);
bool (*signaled)(struct fence *fence);
signed long (*wait)(struct fence *fence, bool intr, signed long timeout);
void (*release)(struct fence *fence);
@@ -183,6 +185,16 @@ void fence_release(struct kref *kref);
void fence_free(struct fence *fence);
/**
+ * fence_put - decreases refcount of the fence
+ * @fence: [in] fence to reduce refcount of
+ */
+static inline void fence_put(struct fence *fence)
+{
+ if (fence)
+ kref_put(&fence->refcount, fence_release);
+}
+
+/**
* fence_get - increases refcount of the fence
* @fence: [in] fence to increase refcount of
*
@@ -210,13 +222,49 @@ static inline struct fence *fence_get_rcu(struct fence *fence)
}
/**
- * fence_put - decreases refcount of the fence
- * @fence: [in] fence to reduce refcount of
+ * fence_get_rcu_safe - acquire a reference to an RCU tracked fence
+ * @fence: [in] pointer to fence to increase refcount of
+ *
+ * Function returns NULL if no refcount could be obtained, or the fence.
+ * This function handles acquiring a reference to a fence that may be
+ * reallocated within the RCU grace period (such as with SLAB_DESTROY_BY_RCU),
+ * so long as the caller is using RCU on the pointer to the fence.
+ *
+ * An alternative mechanism is to employ a seqlock to protect a bunch of
+ * fences, such as used by struct reservation_object. When using a seqlock,
+ * the seqlock must be taken before and checked after a reference to the
+ * fence is acquired (as shown here).
+ *
+ * The caller is required to hold the RCU read lock.
*/
-static inline void fence_put(struct fence *fence)
+static inline struct fence *fence_get_rcu_safe(struct fence * __rcu *fencep)
{
- if (fence)
- kref_put(&fence->refcount, fence_release);
+ do {
+ struct fence *fence;
+
+ fence = rcu_dereference(*fencep);
+ if (!fence || !fence_get_rcu(fence))
+ return NULL;
+
+ /* The atomic_inc_not_zero() inside fence_get_rcu()
+ * provides a full memory barrier upon success (such as now).
+ * This is paired with the write barrier from assigning
+ * to the __rcu protected fence pointer so that if that
+ * pointer still matches the current fence, we know we
+ * have successfully acquire a reference to it. If it no
+ * longer matches, we are holding a reference to some other
+ * reallocated pointer. This is possible if the allocator
+ * is using a freelist like SLAB_DESTROY_BY_RCU where the
+ * fence remains valid for the RCU grace period, but it
+ * may be reallocated. When using such allocators, we are
+ * responsible for ensuring the reference we get is to
+ * the right fence, as below.
+ */
+ if (fence == rcu_access_pointer(*fencep))
+ return rcu_pointer_handoff(fence);
+
+ fence_put(fence);
+ } while (1);
}
int fence_signal(struct fence *fence);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bcad2b963296..a36a8a0fe6a9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -21,6 +21,7 @@
#include <linux/mm_types.h>
#include <linux/capability.h>
#include <linux/semaphore.h>
+#include <linux/fcntl.h>
#include <linux/fiemap.h>
#include <linux/rculist_bl.h>
#include <linux/atomic.h>
@@ -146,6 +147,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
+/* File is capable of returning -EAGAIN if I/O will block */
+#define FMODE_NOWAIT ((__force fmode_t)0x8000000)
+
/*
* Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
* that indicates that they should check the contents of the iovec are
@@ -318,6 +322,18 @@ struct page;
struct address_space;
struct writeback_control;
+/*
+ * Write life time hint values.
+ */
+enum rw_hint {
+ WRITE_LIFE_NOT_SET = 0,
+ WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE,
+ WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT,
+ WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM,
+ WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
+ WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME,
+};
+
#define IOCB_EVENTFD (1 << 0)
#define IOCB_APPEND (1 << 1)
#define IOCB_DIRECT (1 << 2)
@@ -325,6 +341,7 @@ struct writeback_control;
#define IOCB_DSYNC (1 << 4)
#define IOCB_SYNC (1 << 5)
#define IOCB_WRITE (1 << 6)
+#define IOCB_NOWAIT (1 << 7)
struct kiocb {
struct file *ki_filp;
@@ -332,6 +349,7 @@ struct kiocb {
void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
void *private;
int ki_flags;
+ enum rw_hint ki_hint;
};
static inline bool is_sync_kiocb(struct kiocb *kiocb)
@@ -645,6 +663,7 @@ struct inode {
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
unsigned short i_bytes;
unsigned int i_blkbits;
+ enum rw_hint i_write_hint;
blkcnt_t i_blocks;
#ifdef __NEED_I_SIZE_ORDERED
@@ -1072,8 +1091,6 @@ struct file_lock_context {
#define OFFT_OFFSET_MAX INT_LIMIT(off_t)
#endif
-#include <linux/fcntl.h>
-
extern void send_sigio(struct fown_struct *fown, int fd, int band);
/*
@@ -1612,13 +1629,21 @@ extern bool inode_owner_or_capable(const struct inode *inode);
* VFS helper functions..
*/
extern int vfs_create(struct inode *, struct dentry *, umode_t, bool);
+extern int vfs_create2(struct vfsmount *, struct inode *, struct dentry *, umode_t, bool);
extern int vfs_mkdir(struct inode *, struct dentry *, umode_t);
+extern int vfs_mkdir2(struct vfsmount *, struct inode *, struct dentry *, umode_t);
extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
+extern int vfs_mknod2(struct vfsmount *, struct inode *, struct dentry *, umode_t, dev_t);
extern int vfs_symlink(struct inode *, struct dentry *, const char *);
+extern int vfs_symlink2(struct vfsmount *, struct inode *, struct dentry *, const char *);
extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **);
+extern int vfs_link2(struct vfsmount *, struct dentry *, struct inode *, struct dentry *, struct inode **);
extern int vfs_rmdir(struct inode *, struct dentry *);
+extern int vfs_rmdir2(struct vfsmount *, struct inode *, struct dentry *);
extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
+extern int vfs_unlink2(struct vfsmount *, struct inode *, struct dentry *, struct inode **);
extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
+extern int vfs_rename2(struct vfsmount *, struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
extern int vfs_whiteout(struct inode *, struct dentry *);
/*
@@ -1746,6 +1771,7 @@ struct inode_operations {
struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
int (*permission) (struct inode *, int);
+ int (*permission2) (struct vfsmount *, struct inode *, int);
struct posix_acl * (*get_acl)(struct inode *, int);
int (*readlink) (struct dentry *, char __user *,int);
@@ -1760,6 +1786,7 @@ struct inode_operations {
int (*rename) (struct inode *, struct dentry *,
struct inode *, struct dentry *, unsigned int);
int (*setattr) (struct dentry *, struct iattr *);
+ int (*setattr2) (struct vfsmount *, struct dentry *, struct iattr *);
int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
@@ -1808,9 +1835,13 @@ struct super_operations {
int (*unfreeze_fs) (struct super_block *);
int (*statfs) (struct dentry *, struct kstatfs *);
int (*remount_fs) (struct super_block *, int *, char *);
+ int (*remount_fs2) (struct vfsmount *, struct super_block *, int *, char *);
+ void *(*clone_mnt_data) (void *);
+ void (*copy_mnt_data) (void *, void *);
void (*umount_begin) (struct super_block *);
int (*show_options)(struct seq_file *, struct dentry *);
+ int (*show_options2)(struct vfsmount *,struct seq_file *, struct dentry *);
int (*show_devname)(struct seq_file *, struct dentry *);
int (*show_path)(struct seq_file *, struct dentry *);
int (*show_stats)(struct seq_file *, struct dentry *);
@@ -1847,6 +1878,7 @@ struct super_operations {
#else
#define S_DAX 0 /* Make all the DAX code disappear */
#endif
+#define S_ENCRYPTED 16384 /* Encrypted file (using fs/crypto/) */
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
@@ -1885,6 +1917,7 @@ struct super_operations {
#define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT)
#define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC)
#define IS_DAX(inode) ((inode)->i_flags & S_DAX)
+#define IS_ENCRYPTED(inode) ((inode)->i_flags & S_ENCRYPTED)
#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
(inode)->i_rdev == WHITEOUT_DEV)
@@ -2044,6 +2077,9 @@ struct file_system_type {
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
struct dentry *(*mount) (struct file_system_type *, int,
const char *, void *);
+ struct dentry *(*mount2) (struct vfsmount *, struct file_system_type *, int,
+ const char *, void *);
+ void *(*alloc_mnt_data) (void);
void (*kill_sb) (struct super_block *);
struct module *owner;
struct file_system_type * next;
@@ -2342,6 +2378,8 @@ struct filename {
extern long vfs_truncate(const struct path *, loff_t);
extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
struct file *filp);
+extern int do_truncate2(struct vfsmount *, struct dentry *, loff_t start,
+ unsigned int time_attrs, struct file *filp);
extern int vfs_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
extern long do_sys_open(int dfd, const char __user *filename, int flags,
@@ -2584,8 +2622,11 @@ extern void emergency_remount(void);
extern sector_t bmap(struct inode *, sector_t);
#endif
extern int notify_change(struct dentry *, struct iattr *, struct inode **);
+extern int notify_change2(struct vfsmount *, struct dentry *, struct iattr *, struct inode **);
extern int inode_permission(struct inode *, int);
+extern int inode_permission2(struct vfsmount *, struct inode *, int);
extern int __inode_permission(struct inode *, int);
+extern int __inode_permission2(struct vfsmount *, struct inode *, int);
extern int generic_permission(struct inode *, int);
extern int __check_sticky(struct inode *dir, struct inode *inode);
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
new file mode 100644
index 000000000000..9e535af579e8
--- /dev/null
+++ b/include/linux/fscrypt.h
@@ -0,0 +1,250 @@
+/*
+ * fscrypt.h: declarations for per-file encryption
+ *
+ * Filesystems that implement per-file encryption include this header
+ * file with the __FS_HAS_ENCRYPTION set according to whether that filesystem
+ * is being built with encryption support or not.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * Written by Michael Halcrow, 2015.
+ * Modified by Jaegeuk Kim, 2015.
+ */
+#ifndef _LINUX_FSCRYPT_H
+#define _LINUX_FSCRYPT_H
+
+#include <linux/fs.h>
+
+#define FS_CRYPTO_BLOCK_SIZE 16
+
+struct fscrypt_ctx;
+struct fscrypt_info;
+
+struct fscrypt_str {
+ unsigned char *name;
+ u32 len;
+};
+
+struct fscrypt_name {
+ const struct qstr *usr_fname;
+ struct fscrypt_str disk_name;
+ u32 hash;
+ u32 minor_hash;
+ struct fscrypt_str crypto_buf;
+};
+
+#define FSTR_INIT(n, l) { .name = n, .len = l }
+#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len)
+#define fname_name(p) ((p)->disk_name.name)
+#define fname_len(p) ((p)->disk_name.len)
+
+#if __FS_HAS_ENCRYPTION
+#include <linux/fscrypt_supp.h>
+#else
+#include <linux/fscrypt_notsupp.h>
+#endif
+
+/**
+ * fscrypt_require_key - require an inode's encryption key
+ * @inode: the inode we need the key for
+ *
+ * If the inode is encrypted, set up its encryption key if not already done.
+ * Then require that the key be present and return -ENOKEY otherwise.
+ *
+ * No locks are needed, and the key will live as long as the struct inode --- so
+ * it won't go away from under you.
+ *
+ * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code
+ * if a problem occurred while setting up the encryption key.
+ */
+static inline int fscrypt_require_key(struct inode *inode)
+{
+ if (IS_ENCRYPTED(inode)) {
+ int err = fscrypt_get_encryption_info(inode);
+
+ if (err)
+ return err;
+ if (!fscrypt_has_encryption_key(inode))
+ return -ENOKEY;
+ }
+ return 0;
+}
+
+/**
+ * fscrypt_prepare_link - prepare to link an inode into a possibly-encrypted directory
+ * @old_dentry: an existing dentry for the inode being linked
+ * @dir: the target directory
+ * @dentry: negative dentry for the target filename
+ *
+ * A new link can only be added to an encrypted directory if the directory's
+ * encryption key is available --- since otherwise we'd have no way to encrypt
+ * the filename. Therefore, we first set up the directory's encryption key (if
+ * not already done) and return an error if it's unavailable.
+ *
+ * We also verify that the link will not violate the constraint that all files
+ * in an encrypted directory tree use the same encryption policy.
+ *
+ * Return: 0 on success, -ENOKEY if the directory's encryption key is missing,
+ * -EPERM if the link would result in an inconsistent encryption policy, or
+ * another -errno code.
+ */
+static inline int fscrypt_prepare_link(struct dentry *old_dentry,
+ struct inode *dir,
+ struct dentry *dentry)
+{
+ if (IS_ENCRYPTED(dir))
+ return __fscrypt_prepare_link(d_inode(old_dentry), dir);
+ return 0;
+}
+
+/**
+ * fscrypt_prepare_rename - prepare for a rename between possibly-encrypted directories
+ * @old_dir: source directory
+ * @old_dentry: dentry for source file
+ * @new_dir: target directory
+ * @new_dentry: dentry for target location (may be negative unless exchanging)
+ * @flags: rename flags (we care at least about %RENAME_EXCHANGE)
+ *
+ * Prepare for ->rename() where the source and/or target directories may be
+ * encrypted. A new link can only be added to an encrypted directory if the
+ * directory's encryption key is available --- since otherwise we'd have no way
+ * to encrypt the filename. A rename to an existing name, on the other hand,
+ * *is* cryptographically possible without the key. However, we take the more
+ * conservative approach and just forbid all no-key renames.
+ *
+ * We also verify that the rename will not violate the constraint that all files
+ * in an encrypted directory tree use the same encryption policy.
+ *
+ * Return: 0 on success, -ENOKEY if an encryption key is missing, -EPERM if the
+ * rename would cause inconsistent encryption policies, or another -errno code.
+ */
+static inline int fscrypt_prepare_rename(struct inode *old_dir,
+ struct dentry *old_dentry,
+ struct inode *new_dir,
+ struct dentry *new_dentry,
+ unsigned int flags)
+{
+ if (IS_ENCRYPTED(old_dir) || IS_ENCRYPTED(new_dir))
+ return __fscrypt_prepare_rename(old_dir, old_dentry,
+ new_dir, new_dentry, flags);
+ return 0;
+}
+
+/**
+ * fscrypt_prepare_lookup - prepare to lookup a name in a possibly-encrypted directory
+ * @dir: directory being searched
+ * @dentry: filename being looked up
+ * @flags: lookup flags
+ *
+ * Prepare for ->lookup() in a directory which may be encrypted. Lookups can be
+ * done with or without the directory's encryption key; without the key,
+ * filenames are presented in encrypted form. Therefore, we'll try to set up
+ * the directory's encryption key, but even without it the lookup can continue.
+ *
+ * To allow invalidating stale dentries if the directory's encryption key is
+ * added later, we also install a custom ->d_revalidate() method and use the
+ * DCACHE_ENCRYPTED_WITH_KEY flag to indicate whether a given dentry is a
+ * plaintext name (flag set) or a ciphertext name (flag cleared).
+ *
+ * Return: 0 on success, -errno if a problem occurred while setting up the
+ * encryption key
+ */
+static inline int fscrypt_prepare_lookup(struct inode *dir,
+ struct dentry *dentry,
+ unsigned int flags)
+{
+ if (IS_ENCRYPTED(dir))
+ return __fscrypt_prepare_lookup(dir, dentry);
+ return 0;
+}
+
+/**
+ * fscrypt_prepare_setattr - prepare to change a possibly-encrypted inode's attributes
+ * @dentry: dentry through which the inode is being changed
+ * @attr: attributes to change
+ *
+ * Prepare for ->setattr() on a possibly-encrypted inode. On an encrypted file,
+ * most attribute changes are allowed even without the encryption key. However,
+ * without the encryption key we do have to forbid truncates. This is needed
+ * because the size being truncated to may not be a multiple of the filesystem
+ * block size, and in that case we'd have to decrypt the final block, zero the
+ * portion past i_size, and re-encrypt it. (We *could* allow truncating to a
+ * filesystem block boundary, but it's simpler to just forbid all truncates ---
+ * and we already forbid all other contents modifications without the key.)
+ *
+ * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code
+ * if a problem occurred while setting up the encryption key.
+ */
+static inline int fscrypt_prepare_setattr(struct dentry *dentry,
+ struct iattr *attr)
+{
+ if (attr->ia_valid & ATTR_SIZE)
+ return fscrypt_require_key(d_inode(dentry));
+ return 0;
+}
+
+/**
+ * fscrypt_prepare_symlink - prepare to create a possibly-encrypted symlink
+ * @dir: directory in which the symlink is being created
+ * @target: plaintext symlink target
+ * @len: length of @target excluding null terminator
+ * @max_len: space the filesystem has available to store the symlink target
+ * @disk_link: (out) the on-disk symlink target being prepared
+ *
+ * This function computes the size the symlink target will require on-disk,
+ * stores it in @disk_link->len, and validates it against @max_len. An
+ * encrypted symlink may be longer than the original.
+ *
+ * Additionally, @disk_link->name is set to @target if the symlink will be
+ * unencrypted, but left NULL if the symlink will be encrypted. For encrypted
+ * symlinks, the filesystem must call fscrypt_encrypt_symlink() to create the
+ * on-disk target later. (The reason for the two-step process is that some
+ * filesystems need to know the size of the symlink target before creating the
+ * inode, e.g. to determine whether it will be a "fast" or "slow" symlink.)
+ *
+ * Return: 0 on success, -ENAMETOOLONG if the symlink target is too long,
+ * -ENOKEY if the encryption key is missing, or another -errno code if a problem
+ * occurred while setting up the encryption key.
+ */
+static inline int fscrypt_prepare_symlink(struct inode *dir,
+ const char *target,
+ unsigned int len,
+ unsigned int max_len,
+ struct fscrypt_str *disk_link)
+{
+ if (IS_ENCRYPTED(dir) || fscrypt_dummy_context_enabled(dir))
+ return __fscrypt_prepare_symlink(dir, len, max_len, disk_link);
+
+ disk_link->name = (unsigned char *)target;
+ disk_link->len = len + 1;
+ if (disk_link->len > max_len)
+ return -ENAMETOOLONG;
+ return 0;
+}
+
+/**
+ * fscrypt_encrypt_symlink - encrypt the symlink target if needed
+ * @inode: symlink inode
+ * @target: plaintext symlink target
+ * @len: length of @target excluding null terminator
+ * @disk_link: (in/out) the on-disk symlink target being prepared
+ *
+ * If the symlink target needs to be encrypted, then this function encrypts it
+ * into @disk_link->name. fscrypt_prepare_symlink() must have been called
+ * previously to compute @disk_link->len. If the filesystem did not allocate a
+ * buffer for @disk_link->name after calling fscrypt_prepare_link(), then one
+ * will be kmalloc()'ed and the filesystem will be responsible for freeing it.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+static inline int fscrypt_encrypt_symlink(struct inode *inode,
+ const char *target,
+ unsigned int len,
+ struct fscrypt_str *disk_link)
+{
+ if (IS_ENCRYPTED(inode))
+ return __fscrypt_encrypt_symlink(inode, target, len, disk_link);
+ return 0;
+}
+
+#endif /* _LINUX_FSCRYPT_H */
diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h
new file mode 100644
index 000000000000..cbec0864d37f
--- /dev/null
+++ b/include/linux/fscrypt_notsupp.h
@@ -0,0 +1,230 @@
+/*
+ * fscrypt_notsupp.h
+ *
+ * This stubs out the fscrypt functions for filesystems configured without
+ * encryption support.
+ *
+ * Do not include this file directly. Use fscrypt.h instead!
+ */
+#ifndef _LINUX_FSCRYPT_H
+#error "Incorrect include of linux/fscrypt_notsupp.h!"
+#endif
+
+#ifndef _LINUX_FSCRYPT_NOTSUPP_H
+#define _LINUX_FSCRYPT_NOTSUPP_H
+
+static inline bool fscrypt_has_encryption_key(const struct inode *inode)
+{
+ return false;
+}
+
+static inline bool fscrypt_dummy_context_enabled(struct inode *inode)
+{
+ return false;
+}
+
+/* crypto.c */
+static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work)
+{
+}
+
+static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode,
+ gfp_t gfp_flags)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
+{
+ return;
+}
+
+static inline struct page *fscrypt_encrypt_page(const struct inode *inode,
+ struct page *page,
+ unsigned int len,
+ unsigned int offs,
+ u64 lblk_num, gfp_t gfp_flags)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline int fscrypt_decrypt_page(const struct inode *inode,
+ struct page *page,
+ unsigned int len, unsigned int offs,
+ u64 lblk_num)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline struct page *fscrypt_control_page(struct page *page)
+{
+ WARN_ON_ONCE(1);
+ return ERR_PTR(-EINVAL);
+}
+
+static inline void fscrypt_restore_control_page(struct page *page)
+{
+ return;
+}
+
+/* policy.c */
+static inline int fscrypt_ioctl_set_policy(struct file *filp,
+ const void __user *arg)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int fscrypt_has_permitted_context(struct inode *parent,
+ struct inode *child)
+{
+ return 0;
+}
+
+static inline int fscrypt_inherit_context(struct inode *parent,
+ struct inode *child,
+ void *fs_data, bool preload)
+{
+ return -EOPNOTSUPP;
+}
+
+/* keyinfo.c */
+static inline int fscrypt_get_encryption_info(struct inode *inode)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void fscrypt_put_encryption_info(struct inode *inode)
+{
+ return;
+}
+
+ /* fname.c */
+static inline int fscrypt_setup_filename(struct inode *dir,
+ const struct qstr *iname,
+ int lookup, struct fscrypt_name *fname)
+{
+ if (IS_ENCRYPTED(dir))
+ return -EOPNOTSUPP;
+
+ memset(fname, 0, sizeof(struct fscrypt_name));
+ fname->usr_fname = iname;
+ fname->disk_name.name = (unsigned char *)iname->name;
+ fname->disk_name.len = iname->len;
+ return 0;
+}
+
+static inline void fscrypt_free_filename(struct fscrypt_name *fname)
+{
+ return;
+}
+
+static inline int fscrypt_fname_alloc_buffer(const struct inode *inode,
+ u32 max_encrypted_len,
+ struct fscrypt_str *crypto_str)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
+{
+ return;
+}
+
+static inline int fscrypt_fname_disk_to_usr(struct inode *inode,
+ u32 hash, u32 minor_hash,
+ const struct fscrypt_str *iname,
+ struct fscrypt_str *oname)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool fscrypt_match_name(const struct fscrypt_name *fname,
+ const u8 *de_name, u32 de_name_len)
+{
+ /* Encryption support disabled; use standard comparison */
+ if (de_name_len != fname->disk_name.len)
+ return false;
+ return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len);
+}
+
+/* bio.c */
+static inline void fscrypt_decrypt_bio(struct bio *bio)
+{
+}
+
+static inline void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx,
+ struct bio *bio)
+{
+}
+
+static inline void fscrypt_pullback_bio_page(struct page **page, bool restore)
+{
+ return;
+}
+
+static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
+ sector_t pblk, unsigned int len)
+{
+ return -EOPNOTSUPP;
+}
+
+/* hooks.c */
+
+static inline int fscrypt_file_open(struct inode *inode, struct file *filp)
+{
+ if (IS_ENCRYPTED(inode))
+ return -EOPNOTSUPP;
+ return 0;
+}
+
+static inline int __fscrypt_prepare_link(struct inode *inode,
+ struct inode *dir)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int __fscrypt_prepare_rename(struct inode *old_dir,
+ struct dentry *old_dentry,
+ struct inode *new_dir,
+ struct dentry *new_dentry,
+ unsigned int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int __fscrypt_prepare_lookup(struct inode *dir,
+ struct dentry *dentry)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int __fscrypt_prepare_symlink(struct inode *dir,
+ unsigned int len,
+ unsigned int max_len,
+ struct fscrypt_str *disk_link)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int __fscrypt_encrypt_symlink(struct inode *inode,
+ const char *target,
+ unsigned int len,
+ struct fscrypt_str *disk_link)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline const char *fscrypt_get_symlink(struct inode *inode,
+ const void *caddr,
+ unsigned int max_size,
+ struct delayed_call *done)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+#endif /* _LINUX_FSCRYPT_NOTSUPP_H */
diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h
new file mode 100644
index 000000000000..99b6c52a305c
--- /dev/null
+++ b/include/linux/fscrypt_supp.h
@@ -0,0 +1,203 @@
+/*
+ * fscrypt_supp.h
+ *
+ * Do not include this file directly. Use fscrypt.h instead!
+ */
+#ifndef _LINUX_FSCRYPT_H
+#error "Incorrect include of linux/fscrypt_supp.h!"
+#endif
+
+#ifndef _LINUX_FSCRYPT_SUPP_H
+#define _LINUX_FSCRYPT_SUPP_H
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+/*
+ * fscrypt superblock flags
+ */
+#define FS_CFLG_OWN_PAGES (1U << 1)
+
+/*
+ * crypto operations for filesystems
+ */
+struct fscrypt_operations {
+ unsigned int flags;
+ const char *key_prefix;
+ int (*get_context)(struct inode *, void *, size_t);
+ int (*set_context)(struct inode *, const void *, size_t, void *);
+ bool (*dummy_context)(struct inode *);
+ bool (*empty_dir)(struct inode *);
+ unsigned int max_namelen;
+};
+
+struct fscrypt_ctx {
+ union {
+ struct {
+ struct page *bounce_page; /* Ciphertext page */
+ struct page *control_page; /* Original page */
+ } w;
+ struct {
+ struct bio *bio;
+ struct work_struct work;
+ } r;
+ struct list_head free_list; /* Free list */
+ };
+ u8 flags; /* Flags */
+};
+
+static inline bool fscrypt_has_encryption_key(const struct inode *inode)
+{
+ return (inode->i_crypt_info != NULL);
+}
+
+static inline bool fscrypt_dummy_context_enabled(struct inode *inode)
+{
+ return inode->i_sb->s_cop->dummy_context &&
+ inode->i_sb->s_cop->dummy_context(inode);
+}
+
+/* crypto.c */
+extern void fscrypt_enqueue_decrypt_work(struct work_struct *);
+extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t);
+extern void fscrypt_release_ctx(struct fscrypt_ctx *);
+extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *,
+ unsigned int, unsigned int,
+ u64, gfp_t);
+extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int,
+ unsigned int, u64);
+
+static inline struct page *fscrypt_control_page(struct page *page)
+{
+ return ((struct fscrypt_ctx *)page_private(page))->w.control_page;
+}
+
+extern void fscrypt_restore_control_page(struct page *);
+
+/* policy.c */
+extern int fscrypt_ioctl_set_policy(struct file *, const void __user *);
+extern int fscrypt_ioctl_get_policy(struct file *, void __user *);
+extern int fscrypt_has_permitted_context(struct inode *, struct inode *);
+extern int fscrypt_inherit_context(struct inode *, struct inode *,
+ void *, bool);
+/* keyinfo.c */
+extern int fscrypt_get_encryption_info(struct inode *);
+extern void fscrypt_put_encryption_info(struct inode *);
+
+/* fname.c */
+extern int fscrypt_setup_filename(struct inode *, const struct qstr *,
+ int lookup, struct fscrypt_name *);
+
+static inline void fscrypt_free_filename(struct fscrypt_name *fname)
+{
+ kfree(fname->crypto_buf.name);
+}
+
+extern int fscrypt_fname_alloc_buffer(const struct inode *, u32,
+ struct fscrypt_str *);
+extern void fscrypt_fname_free_buffer(struct fscrypt_str *);
+extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32,
+ const struct fscrypt_str *, struct fscrypt_str *);
+
+#define FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE 32
+
+/* Extracts the second-to-last ciphertext block; see explanation below */
+#define FSCRYPT_FNAME_DIGEST(name, len) \
+ ((name) + round_down((len) - FS_CRYPTO_BLOCK_SIZE - 1, \
+ FS_CRYPTO_BLOCK_SIZE))
+
+#define FSCRYPT_FNAME_DIGEST_SIZE FS_CRYPTO_BLOCK_SIZE
+
+/**
+ * fscrypt_digested_name - alternate identifier for an on-disk filename
+ *
+ * When userspace lists an encrypted directory without access to the key,
+ * filenames whose ciphertext is longer than FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE
+ * bytes are shown in this abbreviated form (base64-encoded) rather than as the
+ * full ciphertext (base64-encoded). This is necessary to allow supporting
+ * filenames up to NAME_MAX bytes, since base64 encoding expands the length.
+ *
+ * To make it possible for filesystems to still find the correct directory entry
+ * despite not knowing the full on-disk name, we encode any filesystem-specific
+ * 'hash' and/or 'minor_hash' which the filesystem may need for its lookups,
+ * followed by the second-to-last ciphertext block of the filename. Due to the
+ * use of the CBC-CTS encryption mode, the second-to-last ciphertext block
+ * depends on the full plaintext. (Note that ciphertext stealing causes the
+ * last two blocks to appear "flipped".) This makes accidental collisions very
+ * unlikely: just a 1 in 2^128 chance for two filenames to collide even if they
+ * share the same filesystem-specific hashes.
+ *
+ * However, this scheme isn't immune to intentional collisions, which can be
+ * created by anyone able to create arbitrary plaintext filenames and view them
+ * without the key. Making the "digest" be a real cryptographic hash like
+ * SHA-256 over the full ciphertext would prevent this, although it would be
+ * less efficient and harder to implement, especially since the filesystem would
+ * need to calculate it for each directory entry examined during a search.
+ */
+struct fscrypt_digested_name {
+ u32 hash;
+ u32 minor_hash;
+ u8 digest[FSCRYPT_FNAME_DIGEST_SIZE];
+};
+
+/**
+ * fscrypt_match_name() - test whether the given name matches a directory entry
+ * @fname: the name being searched for
+ * @de_name: the name from the directory entry
+ * @de_name_len: the length of @de_name in bytes
+ *
+ * Normally @fname->disk_name will be set, and in that case we simply compare
+ * that to the name stored in the directory entry. The only exception is that
+ * if we don't have the key for an encrypted directory and a filename in it is
+ * very long, then we won't have the full disk_name and we'll instead need to
+ * match against the fscrypt_digested_name.
+ *
+ * Return: %true if the name matches, otherwise %false.
+ */
+static inline bool fscrypt_match_name(const struct fscrypt_name *fname,
+ const u8 *de_name, u32 de_name_len)
+{
+ if (unlikely(!fname->disk_name.name)) {
+ const struct fscrypt_digested_name *n =
+ (const void *)fname->crypto_buf.name;
+ if (WARN_ON_ONCE(fname->usr_fname->name[0] != '_'))
+ return false;
+ if (de_name_len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE)
+ return false;
+ return !memcmp(FSCRYPT_FNAME_DIGEST(de_name, de_name_len),
+ n->digest, FSCRYPT_FNAME_DIGEST_SIZE);
+ }
+
+ if (de_name_len != fname->disk_name.len)
+ return false;
+ return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len);
+}
+
+/* bio.c */
+extern void fscrypt_decrypt_bio(struct bio *);
+extern void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx,
+ struct bio *bio);
+extern void fscrypt_pullback_bio_page(struct page **, bool);
+extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t,
+ unsigned int);
+
+/* hooks.c */
+extern int fscrypt_file_open(struct inode *inode, struct file *filp);
+extern int __fscrypt_prepare_link(struct inode *inode, struct inode *dir);
+extern int __fscrypt_prepare_rename(struct inode *old_dir,
+ struct dentry *old_dentry,
+ struct inode *new_dir,
+ struct dentry *new_dentry,
+ unsigned int flags);
+extern int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry);
+extern int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len,
+ unsigned int max_len,
+ struct fscrypt_str *disk_link);
+extern int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
+ unsigned int len,
+ struct fscrypt_str *disk_link);
+extern const char *fscrypt_get_symlink(struct inode *inode, const void *caddr,
+ unsigned int max_size,
+ struct delayed_call *done);
+
+#endif /* _LINUX_FSCRYPT_SUPP_H */
diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
deleted file mode 100644
index f6dfc2950f76..000000000000
--- a/include/linux/fscrypto.h
+++ /dev/null
@@ -1,409 +0,0 @@
-/*
- * General per-file encryption definition
- *
- * Copyright (C) 2015, Google, Inc.
- *
- * Written by Michael Halcrow, 2015.
- * Modified by Jaegeuk Kim, 2015.
- */
-
-#ifndef _LINUX_FSCRYPTO_H
-#define _LINUX_FSCRYPTO_H
-
-#include <linux/key.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/bio.h>
-#include <linux/dcache.h>
-#include <crypto/skcipher.h>
-#include <uapi/linux/fs.h>
-
-#define FS_KEY_DERIVATION_NONCE_SIZE 16
-#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1
-
-#define FS_POLICY_FLAGS_PAD_4 0x00
-#define FS_POLICY_FLAGS_PAD_8 0x01
-#define FS_POLICY_FLAGS_PAD_16 0x02
-#define FS_POLICY_FLAGS_PAD_32 0x03
-#define FS_POLICY_FLAGS_PAD_MASK 0x03
-#define FS_POLICY_FLAGS_VALID 0x03
-
-/* Encryption algorithms */
-#define FS_ENCRYPTION_MODE_INVALID 0
-#define FS_ENCRYPTION_MODE_AES_256_XTS 1
-#define FS_ENCRYPTION_MODE_AES_256_GCM 2
-#define FS_ENCRYPTION_MODE_AES_256_CBC 3
-#define FS_ENCRYPTION_MODE_AES_256_CTS 4
-
-/**
- * Encryption context for inode
- *
- * Protector format:
- * 1 byte: Protector format (1 = this version)
- * 1 byte: File contents encryption mode
- * 1 byte: File names encryption mode
- * 1 byte: Flags
- * 8 bytes: Master Key descriptor
- * 16 bytes: Encryption Key derivation nonce
- */
-struct fscrypt_context {
- u8 format;
- u8 contents_encryption_mode;
- u8 filenames_encryption_mode;
- u8 flags;
- u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
- u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
-} __packed;
-
-/* Encryption parameters */
-#define FS_XTS_TWEAK_SIZE 16
-#define FS_AES_128_ECB_KEY_SIZE 16
-#define FS_AES_256_GCM_KEY_SIZE 32
-#define FS_AES_256_CBC_KEY_SIZE 32
-#define FS_AES_256_CTS_KEY_SIZE 32
-#define FS_AES_256_XTS_KEY_SIZE 64
-#define FS_MAX_KEY_SIZE 64
-
-#define FS_KEY_DESC_PREFIX "fscrypt:"
-#define FS_KEY_DESC_PREFIX_SIZE 8
-
-/* This is passed in from userspace into the kernel keyring */
-struct fscrypt_key {
- u32 mode;
- u8 raw[FS_MAX_KEY_SIZE];
- u32 size;
-} __packed;
-
-struct fscrypt_info {
- u8 ci_data_mode;
- u8 ci_filename_mode;
- u8 ci_flags;
- struct crypto_skcipher *ci_ctfm;
- u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE];
-};
-
-#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001
-#define FS_WRITE_PATH_FL 0x00000002
-
-struct fscrypt_ctx {
- union {
- struct {
- struct page *bounce_page; /* Ciphertext page */
- struct page *control_page; /* Original page */
- } w;
- struct {
- struct bio *bio;
- struct work_struct work;
- } r;
- struct list_head free_list; /* Free list */
- };
- u8 flags; /* Flags */
- u8 mode; /* Encryption mode for tfm */
-};
-
-struct fscrypt_completion_result {
- struct completion completion;
- int res;
-};
-
-#define DECLARE_FS_COMPLETION_RESULT(ecr) \
- struct fscrypt_completion_result ecr = { \
- COMPLETION_INITIALIZER((ecr).completion), 0 }
-
-#define FS_FNAME_NUM_SCATTER_ENTRIES 4
-#define FS_CRYPTO_BLOCK_SIZE 16
-#define FS_FNAME_CRYPTO_DIGEST_SIZE 32
-
-/**
- * For encrypted symlinks, the ciphertext length is stored at the beginning
- * of the string in little-endian format.
- */
-struct fscrypt_symlink_data {
- __le16 len;
- char encrypted_path[1];
-} __packed;
-
-/**
- * This function is used to calculate the disk space required to
- * store a filename of length l in encrypted symlink format.
- */
-static inline u32 fscrypt_symlink_data_len(u32 l)
-{
- if (l < FS_CRYPTO_BLOCK_SIZE)
- l = FS_CRYPTO_BLOCK_SIZE;
- return (l + sizeof(struct fscrypt_symlink_data) - 1);
-}
-
-struct fscrypt_str {
- unsigned char *name;
- u32 len;
-};
-
-struct fscrypt_name {
- const struct qstr *usr_fname;
- struct fscrypt_str disk_name;
- u32 hash;
- u32 minor_hash;
- struct fscrypt_str crypto_buf;
-};
-
-#define FSTR_INIT(n, l) { .name = n, .len = l }
-#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len)
-#define fname_name(p) ((p)->disk_name.name)
-#define fname_len(p) ((p)->disk_name.len)
-
-/*
- * crypto opertions for filesystems
- */
-struct fscrypt_operations {
- int (*get_context)(struct inode *, void *, size_t);
- int (*key_prefix)(struct inode *, u8 **);
- int (*prepare_context)(struct inode *);
- int (*set_context)(struct inode *, const void *, size_t, void *);
- int (*dummy_context)(struct inode *);
- bool (*is_encrypted)(struct inode *);
- bool (*empty_dir)(struct inode *);
- unsigned (*max_namelen)(struct inode *);
-};
-
-static inline bool fscrypt_dummy_context_enabled(struct inode *inode)
-{
- if (inode->i_sb->s_cop->dummy_context &&
- inode->i_sb->s_cop->dummy_context(inode))
- return true;
- return false;
-}
-
-static inline bool fscrypt_valid_contents_enc_mode(u32 mode)
-{
- return (mode == FS_ENCRYPTION_MODE_AES_256_XTS);
-}
-
-static inline bool fscrypt_valid_filenames_enc_mode(u32 mode)
-{
- return (mode == FS_ENCRYPTION_MODE_AES_256_CTS);
-}
-
-static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
-{
- if (str->len == 1 && str->name[0] == '.')
- return true;
-
- if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.')
- return true;
-
- return false;
-}
-
-static inline struct page *fscrypt_control_page(struct page *page)
-{
-#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
- return ((struct fscrypt_ctx *)page_private(page))->w.control_page;
-#else
- WARN_ON_ONCE(1);
- return ERR_PTR(-EINVAL);
-#endif
-}
-
-static inline int fscrypt_has_encryption_key(struct inode *inode)
-{
-#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
- return (inode->i_crypt_info != NULL);
-#else
- return 0;
-#endif
-}
-
-static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry)
-{
-#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
- spin_lock(&dentry->d_lock);
- dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY;
- spin_unlock(&dentry->d_lock);
-#endif
-}
-
-#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
-extern const struct dentry_operations fscrypt_d_ops;
-#endif
-
-static inline void fscrypt_set_d_op(struct dentry *dentry)
-{
-#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
- d_set_d_op(dentry, &fscrypt_d_ops);
-#endif
-}
-
-#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
-/* crypto.c */
-extern struct kmem_cache *fscrypt_info_cachep;
-int fscrypt_initialize(void);
-
-extern struct fscrypt_ctx *fscrypt_get_ctx(struct inode *, gfp_t);
-extern void fscrypt_release_ctx(struct fscrypt_ctx *);
-extern struct page *fscrypt_encrypt_page(struct inode *, struct page *, gfp_t);
-extern int fscrypt_decrypt_page(struct page *);
-extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *);
-extern void fscrypt_pullback_bio_page(struct page **, bool);
-extern void fscrypt_restore_control_page(struct page *);
-extern int fscrypt_zeroout_range(struct inode *, pgoff_t, sector_t,
- unsigned int);
-/* policy.c */
-extern int fscrypt_process_policy(struct file *, const struct fscrypt_policy *);
-extern int fscrypt_get_policy(struct inode *, struct fscrypt_policy *);
-extern int fscrypt_has_permitted_context(struct inode *, struct inode *);
-extern int fscrypt_inherit_context(struct inode *, struct inode *,
- void *, bool);
-/* keyinfo.c */
-extern int fscrypt_get_encryption_info(struct inode *);
-extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *);
-
-/* fname.c */
-extern int fscrypt_setup_filename(struct inode *, const struct qstr *,
- int lookup, struct fscrypt_name *);
-extern void fscrypt_free_filename(struct fscrypt_name *);
-extern u32 fscrypt_fname_encrypted_size(struct inode *, u32);
-extern int fscrypt_fname_alloc_buffer(struct inode *, u32,
- struct fscrypt_str *);
-extern void fscrypt_fname_free_buffer(struct fscrypt_str *);
-extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32,
- const struct fscrypt_str *, struct fscrypt_str *);
-extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *,
- struct fscrypt_str *);
-#endif
-
-/* crypto.c */
-static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(struct inode *i,
- gfp_t f)
-{
- return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void fscrypt_notsupp_release_ctx(struct fscrypt_ctx *c)
-{
- return;
-}
-
-static inline struct page *fscrypt_notsupp_encrypt_page(struct inode *i,
- struct page *p, gfp_t f)
-{
- return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline int fscrypt_notsupp_decrypt_page(struct page *p)
-{
- return -EOPNOTSUPP;
-}
-
-static inline void fscrypt_notsupp_decrypt_bio_pages(struct fscrypt_ctx *c,
- struct bio *b)
-{
- return;
-}
-
-static inline void fscrypt_notsupp_pullback_bio_page(struct page **p, bool b)
-{
- return;
-}
-
-static inline void fscrypt_notsupp_restore_control_page(struct page *p)
-{
- return;
-}
-
-static inline int fscrypt_notsupp_zeroout_range(struct inode *i, pgoff_t p,
- sector_t s, unsigned int f)
-{
- return -EOPNOTSUPP;
-}
-
-/* policy.c */
-static inline int fscrypt_notsupp_process_policy(struct file *f,
- const struct fscrypt_policy *p)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int fscrypt_notsupp_get_policy(struct inode *i,
- struct fscrypt_policy *p)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int fscrypt_notsupp_has_permitted_context(struct inode *p,
- struct inode *i)
-{
- return 0;
-}
-
-static inline int fscrypt_notsupp_inherit_context(struct inode *p,
- struct inode *i, void *v, bool b)
-{
- return -EOPNOTSUPP;
-}
-
-/* keyinfo.c */
-static inline int fscrypt_notsupp_get_encryption_info(struct inode *i)
-{
- return -EOPNOTSUPP;
-}
-
-static inline void fscrypt_notsupp_put_encryption_info(struct inode *i,
- struct fscrypt_info *f)
-{
- return;
-}
-
- /* fname.c */
-static inline int fscrypt_notsupp_setup_filename(struct inode *dir,
- const struct qstr *iname,
- int lookup, struct fscrypt_name *fname)
-{
- if (dir->i_sb->s_cop->is_encrypted(dir))
- return -EOPNOTSUPP;
-
- memset(fname, 0, sizeof(struct fscrypt_name));
- fname->usr_fname = iname;
- fname->disk_name.name = (unsigned char *)iname->name;
- fname->disk_name.len = iname->len;
- return 0;
-}
-
-static inline void fscrypt_notsupp_free_filename(struct fscrypt_name *fname)
-{
- return;
-}
-
-static inline u32 fscrypt_notsupp_fname_encrypted_size(struct inode *i, u32 s)
-{
- /* never happens */
- WARN_ON(1);
- return 0;
-}
-
-static inline int fscrypt_notsupp_fname_alloc_buffer(struct inode *inode,
- u32 ilen, struct fscrypt_str *crypto_str)
-{
- return -EOPNOTSUPP;
-}
-
-static inline void fscrypt_notsupp_fname_free_buffer(struct fscrypt_str *c)
-{
- return;
-}
-
-static inline int fscrypt_notsupp_fname_disk_to_usr(struct inode *inode,
- u32 hash, u32 minor_hash,
- const struct fscrypt_str *iname,
- struct fscrypt_str *oname)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int fscrypt_notsupp_fname_usr_to_disk(struct inode *inode,
- const struct qstr *iname,
- struct fscrypt_str *oname)
-{
- return -EOPNOTSUPP;
-}
-#endif /* _LINUX_FSCRYPTO_H */
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index e5f03a4d8430..ff51592c4792 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -213,12 +213,20 @@ static inline void fsnotify_modify(struct file *file)
static inline void fsnotify_open(struct file *file)
{
struct path *path = &file->f_path;
+ struct path lower_path;
struct inode *inode = path->dentry->d_inode;
+
__u32 mask = FS_OPEN;
if (S_ISDIR(inode->i_mode))
mask |= FS_ISDIR;
+ if (path->dentry->d_op && path->dentry->d_op->d_canonical_path) {
+ path->dentry->d_op->d_canonical_path(path, &lower_path);
+ fsnotify_parent(&lower_path, NULL, mask);
+ fsnotify(lower_path.dentry->d_inode, mask, &lower_path, FSNOTIFY_EVENT_PATH, NULL, 0);
+ path_put(&lower_path);
+ }
fsnotify_parent(path, NULL, mask);
fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
}
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index b3d34d3e0e7e..ab7938a58ca0 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -244,8 +244,16 @@ static inline int ftrace_function_local_disabled(struct ftrace_ops *ops)
return *this_cpu_ptr(ops->disabled);
}
+#ifdef CONFIG_CFI_CLANG
+/* Use a C stub with the correct type for CFI */
+static inline void ftrace_stub(unsigned long a0, unsigned long a1,
+ struct ftrace_ops *op, struct pt_regs *regs)
+{
+}
+#else
extern void ftrace_stub(unsigned long a0, unsigned long a1,
struct ftrace_ops *op, struct pt_regs *regs);
+#endif
#else /* !CONFIG_FUNCTION_TRACER */
/*
@@ -734,7 +742,8 @@ static inline unsigned long get_lock_parent_ip(void)
static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { }
#endif
-#ifdef CONFIG_PREEMPT_TRACER
+#if defined(CONFIG_PREEMPT_TRACER) || \
+ (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
extern void trace_preempt_on(unsigned long a0, unsigned long a1);
extern void trace_preempt_off(unsigned long a0, unsigned long a1);
#else
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f8041f9de31e..e47dbc1933a2 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -516,7 +516,7 @@ extern void __free_page_frag(void *addr);
void page_alloc_init(void);
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(struct zone *zone);
-void drain_local_pages(struct zone *zone);
+void drain_local_pages(void *zone);
void page_alloc_init_late(void);
diff --git a/include/linux/gpio_event.h b/include/linux/gpio_event.h
new file mode 100644
index 000000000000..2613fc5e4a93
--- /dev/null
+++ b/include/linux/gpio_event.h
@@ -0,0 +1,170 @@
+/* include/linux/gpio_event.h
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_GPIO_EVENT_H
+#define _LINUX_GPIO_EVENT_H
+
+#include <linux/input.h>
+
+struct gpio_event_input_devs {
+ int count;
+ struct input_dev *dev[];
+};
+enum {
+ GPIO_EVENT_FUNC_UNINIT = 0x0,
+ GPIO_EVENT_FUNC_INIT = 0x1,
+ GPIO_EVENT_FUNC_SUSPEND = 0x2,
+ GPIO_EVENT_FUNC_RESUME = 0x3,
+};
+struct gpio_event_info {
+ int (*func)(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info,
+ void **data, int func);
+ int (*event)(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info,
+ void **data, unsigned int dev, unsigned int type,
+ unsigned int code, int value); /* out events */
+ bool no_suspend;
+};
+
+struct gpio_event_platform_data {
+ const char *name;
+ struct gpio_event_info **info;
+ size_t info_count;
+ int (*power)(const struct gpio_event_platform_data *pdata, bool on);
+ const char *names[]; /* If name is NULL, names contain a NULL */
+ /* terminated list of input devices to create */
+};
+
+#define GPIO_EVENT_DEV_NAME "gpio-event"
+
+/* Key matrix */
+
+enum gpio_event_matrix_flags {
+ /* unset: drive active output low, set: drive active output high */
+ GPIOKPF_ACTIVE_HIGH = 1U << 0,
+ GPIOKPF_DEBOUNCE = 1U << 1,
+ GPIOKPF_REMOVE_SOME_PHANTOM_KEYS = 1U << 2,
+ GPIOKPF_REMOVE_PHANTOM_KEYS = GPIOKPF_REMOVE_SOME_PHANTOM_KEYS |
+ GPIOKPF_DEBOUNCE,
+ GPIOKPF_DRIVE_INACTIVE = 1U << 3,
+ GPIOKPF_LEVEL_TRIGGERED_IRQ = 1U << 4,
+ GPIOKPF_PRINT_UNMAPPED_KEYS = 1U << 16,
+ GPIOKPF_PRINT_MAPPED_KEYS = 1U << 17,
+ GPIOKPF_PRINT_PHANTOM_KEYS = 1U << 18,
+};
+
+#define MATRIX_CODE_BITS (10)
+#define MATRIX_KEY_MASK ((1U << MATRIX_CODE_BITS) - 1)
+#define MATRIX_KEY(dev, code) \
+ (((dev) << MATRIX_CODE_BITS) | (code & MATRIX_KEY_MASK))
+
+extern int gpio_event_matrix_func(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data, int func);
+struct gpio_event_matrix_info {
+ /* initialize to gpio_event_matrix_func */
+ struct gpio_event_info info;
+ /* size must be ninputs * noutputs */
+ const unsigned short *keymap;
+ unsigned int *input_gpios;
+ unsigned int *output_gpios;
+ unsigned int ninputs;
+ unsigned int noutputs;
+ /* time to wait before reading inputs after driving each output */
+ ktime_t settle_time;
+ /* time to wait before scanning the keypad a second time */
+ ktime_t debounce_delay;
+ ktime_t poll_time;
+ unsigned flags;
+};
+
+/* Directly connected inputs and outputs */
+
+enum gpio_event_direct_flags {
+ GPIOEDF_ACTIVE_HIGH = 1U << 0,
+/* GPIOEDF_USE_DOWN_IRQ = 1U << 1, */
+/* GPIOEDF_USE_IRQ = (1U << 2) | GPIOIDF_USE_DOWN_IRQ, */
+ GPIOEDF_PRINT_KEYS = 1U << 8,
+ GPIOEDF_PRINT_KEY_DEBOUNCE = 1U << 9,
+ GPIOEDF_PRINT_KEY_UNSTABLE = 1U << 10,
+};
+
+struct gpio_event_direct_entry {
+ uint32_t gpio:16;
+ uint32_t code:10;
+ uint32_t dev:6;
+};
+
+/* inputs */
+extern int gpio_event_input_func(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data, int func);
+struct gpio_event_input_info {
+ /* initialize to gpio_event_input_func */
+ struct gpio_event_info info;
+ ktime_t debounce_time;
+ ktime_t poll_time;
+ uint16_t flags;
+ uint16_t type;
+ const struct gpio_event_direct_entry *keymap;
+ size_t keymap_size;
+};
+
+/* outputs */
+extern int gpio_event_output_func(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data, int func);
+extern int gpio_event_output_event(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data,
+ unsigned int dev, unsigned int type,
+ unsigned int code, int value);
+struct gpio_event_output_info {
+ /* initialize to gpio_event_output_func and gpio_event_output_event */
+ struct gpio_event_info info;
+ uint16_t flags;
+ uint16_t type;
+ const struct gpio_event_direct_entry *keymap;
+ size_t keymap_size;
+};
+
+
+/* axes */
+
+enum gpio_event_axis_flags {
+ GPIOEAF_PRINT_UNKNOWN_DIRECTION = 1U << 16,
+ GPIOEAF_PRINT_RAW = 1U << 17,
+ GPIOEAF_PRINT_EVENT = 1U << 18,
+};
+
+extern int gpio_event_axis_func(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data, int func);
+struct gpio_event_axis_info {
+ /* initialize to gpio_event_axis_func */
+ struct gpio_event_info info;
+ uint8_t count; /* number of gpios for this axis */
+ uint8_t dev; /* device index when using multiple input devices */
+ uint8_t type; /* EV_REL or EV_ABS */
+ uint16_t code;
+ uint16_t decoded_size;
+ uint16_t (*map)(struct gpio_event_axis_info *info, uint16_t in);
+ uint32_t *gpio;
+ uint32_t flags;
+};
+#define gpio_axis_2bit_gray_map gpio_axis_4bit_gray_map
+#define gpio_axis_3bit_gray_map gpio_axis_4bit_gray_map
+uint16_t gpio_axis_4bit_gray_map(
+ struct gpio_event_axis_info *info, uint16_t in);
+uint16_t gpio_axis_5bit_singletrack_map(
+ struct gpio_event_axis_info *info, uint16_t in);
+
+#endif
diff --git a/include/linux/if_pppolac.h b/include/linux/if_pppolac.h
new file mode 100644
index 000000000000..e40aa1075a30
--- /dev/null
+++ b/include/linux/if_pppolac.h
@@ -0,0 +1,23 @@
+/* include/linux/if_pppolac.h
+ *
+ * Header for PPP on L2TP Access Concentrator / PPPoLAC Socket (RFC 2661)
+ *
+ * Copyright (C) 2009 Google, Inc.
+ * Author: Chia-chi Yeh <chiachi@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_IF_PPPOLAC_H
+#define __LINUX_IF_PPPOLAC_H
+
+#include <uapi/linux/if_pppolac.h>
+
+#endif /* __LINUX_IF_PPPOLAC_H */
diff --git a/include/linux/if_pppopns.h b/include/linux/if_pppopns.h
new file mode 100644
index 000000000000..4ac621a9ce7c
--- /dev/null
+++ b/include/linux/if_pppopns.h
@@ -0,0 +1,23 @@
+/* include/linux/if_pppopns.h
+ *
+ * Header for PPP on PPTP Network Server / PPPoPNS Socket (RFC 2637)
+ *
+ * Copyright (C) 2009 Google, Inc.
+ * Author: Chia-chi Yeh <chiachi@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_IF_PPPOPNS_H
+#define __LINUX_IF_PPPOPNS_H
+
+#include <uapi/linux/if_pppopns.h>
+
+#endif /* __LINUX_IF_PPPOPNS_H */
diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h
index ba7a9b0c7c57..325727a7096a 100644
--- a/include/linux/if_pppox.h
+++ b/include/linux/if_pppox.h
@@ -43,6 +43,25 @@ struct pptp_opt {
u32 seq_sent, seq_recv;
int ppp_flags;
};
+
+struct pppolac_opt {
+ __u32 local;
+ __u32 remote;
+ __u32 recv_sequence;
+ __u32 xmit_sequence;
+ atomic_t sequencing;
+ int (*backlog_rcv)(struct sock *sk_udp, struct sk_buff *skb);
+};
+
+struct pppopns_opt {
+ __u16 local;
+ __u16 remote;
+ __u32 recv_sequence;
+ __u32 xmit_sequence;
+ void (*data_ready)(struct sock *sk_raw);
+ int (*backlog_rcv)(struct sock *sk_raw, struct sk_buff *skb);
+};
+
#include <net/sock.h>
struct pppox_sock {
@@ -53,6 +72,8 @@ struct pppox_sock {
union {
struct pppoe_opt pppoe;
struct pptp_opt pptp;
+ struct pppolac_opt lac;
+ struct pppopns_opt pns;
} proto;
__be16 num;
};
diff --git a/include/linux/init.h b/include/linux/init.h
index 0cca4142987f..d4b13a4e04e6 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -46,7 +46,7 @@
/* These are for everybody (although not all archs will actually
discard it in modules) */
-#define __init __section(.init.text) __cold notrace __latent_entropy __noinitretpoline
+#define __init __section(.init.text) __cold notrace __latent_entropy __noinitretpoline __nocfi
#define __initdata __section(.init.data)
#define __initconst __section(.init.rodata)
#define __exitdata __section(.exit.data)
@@ -150,6 +150,15 @@ extern bool initcall_debug;
#ifndef __ASSEMBLY__
+#ifdef CONFIG_LTO_CLANG
+ /* prepend the variable name with __COUNTER__ to ensure correct ordering */
+ #define ___initcall_name2(c, fn, id) __initcall_##c##_##fn##id
+ #define ___initcall_name1(c, fn, id) ___initcall_name2(c, fn, id)
+ #define __initcall_name(fn, id) ___initcall_name1(__COUNTER__, fn, id)
+#else
+ #define __initcall_name(fn, id) __initcall_##fn##id
+#endif
+
/*
* initcalls are now grouped by functionality into separate
* subsections. Ordering inside the subsections is determined
@@ -167,7 +176,7 @@ extern bool initcall_debug;
*/
#define __define_initcall(fn, id) \
- static initcall_t __initcall_##fn##id __used \
+ static initcall_t __initcall_name(fn, id) __used \
__attribute__((__section__(".initcall" #id ".init"))) = fn;
/*
diff --git a/include/linux/initramfs.h b/include/linux/initramfs.h
new file mode 100644
index 000000000000..fc7da63b125b
--- /dev/null
+++ b/include/linux/initramfs.h
@@ -0,0 +1,32 @@
+/*
+ * include/linux/initramfs.h
+ *
+ * Copyright (C) 2015, Google
+ * Rom Lemarchand <romlem@android.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _LINUX_INITRAMFS_H
+#define _LINUX_INITRAMFS_H
+
+#include <linux/kconfig.h>
+
+#if IS_BUILTIN(CONFIG_BLK_DEV_INITRD)
+
+int __init default_rootfs(void);
+
+#endif
+
+#endif /* _LINUX_INITRAMFS_H */
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 72f0721f75e7..999b7c374645 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -18,6 +18,7 @@
#include <linux/atomic.h>
#include <asm/ptrace.h>
#include <asm/irq.h>
+#include <asm/sections.h>
/*
* These correspond to the IORESOURCE_IRQ_* defines in
@@ -699,7 +700,6 @@ extern int early_irq_init(void);
extern int arch_probe_nr_irqs(void);
extern int arch_early_irq_init(void);
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
/*
* We want to know which function is an entrypoint of a hardirq or a softirq.
*/
@@ -707,16 +707,4 @@ extern int arch_early_irq_init(void);
#define __softirq_entry \
__attribute__((__section__(".softirqentry.text")))
-/* Limits of hardirq entrypoints */
-extern char __irqentry_text_start[];
-extern char __irqentry_text_end[];
-/* Limits of softirq entrypoints */
-extern char __softirqentry_text_start[];
-extern char __softirqentry_text_end[];
-
-#else
-#define __irq_entry
-#define __softirq_entry
-#endif
-
#endif
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index b9dfca557a6c..11ff75140361 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -37,9 +37,11 @@ struct ipv6_devconf {
__s32 accept_ra_rtr_pref;
__s32 rtr_probe_interval;
#ifdef CONFIG_IPV6_ROUTE_INFO
+ __s32 accept_ra_rt_info_min_plen;
__s32 accept_ra_rt_info_max_plen;
#endif
#endif
+ __s32 accept_ra_rt_table;
__s32 proxy_ndp;
__s32 accept_source_route;
__s32 accept_ra_from_local;
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 820c0ad54a01..b37afd197eed 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -30,16 +30,10 @@ static inline void *kasan_mem_to_shadow(const void *addr)
}
/* Enable reporting bugs after kasan_disable_current() */
-static inline void kasan_enable_current(void)
-{
- current->kasan_depth++;
-}
+extern void kasan_enable_current(void);
/* Disable reporting bugs for current task */
-static inline void kasan_disable_current(void)
-{
- current->kasan_depth--;
-}
+extern void kasan_disable_current(void);
void kasan_unpoison_shadow(const void *address, size_t size);
@@ -52,7 +46,7 @@ void kasan_free_pages(struct page *page, unsigned int order);
void kasan_cache_create(struct kmem_cache *cache, size_t *size,
unsigned long *flags);
void kasan_cache_shrink(struct kmem_cache *cache);
-void kasan_cache_destroy(struct kmem_cache *cache);
+void kasan_cache_shutdown(struct kmem_cache *cache);
void kasan_poison_slab(struct page *page);
void kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
@@ -81,6 +75,9 @@ size_t ksize(const void *);
static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); }
size_t kasan_metadata_size(struct kmem_cache *cache);
+bool kasan_save_enable_multi_shot(void);
+void kasan_restore_multi_shot(bool enabled);
+
#else /* CONFIG_KASAN */
static inline void kasan_unpoison_shadow(const void *address, size_t size) {}
@@ -98,7 +95,7 @@ static inline void kasan_cache_create(struct kmem_cache *cache,
size_t *size,
unsigned long *flags) {}
static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
-static inline void kasan_cache_destroy(struct kmem_cache *cache) {}
+static inline void kasan_cache_shutdown(struct kmem_cache *cache) {}
static inline void kasan_poison_slab(struct page *page) {}
static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
diff --git a/include/linux/kcov.h b/include/linux/kcov.h
index 2883ac98c280..87e2a44f1bab 100644
--- a/include/linux/kcov.h
+++ b/include/linux/kcov.h
@@ -7,19 +7,23 @@ struct task_struct;
#ifdef CONFIG_KCOV
-void kcov_task_init(struct task_struct *t);
-void kcov_task_exit(struct task_struct *t);
-
enum kcov_mode {
/* Coverage collection is not enabled yet. */
KCOV_MODE_DISABLED = 0,
+ /* KCOV was initialized, but tracing mode hasn't been chosen yet. */
+ KCOV_MODE_INIT = 1,
/*
* Tracing coverage collection mode.
* Covered PCs are collected in a per-task buffer.
*/
- KCOV_MODE_TRACE = 1,
+ KCOV_MODE_TRACE_PC = 2,
+ /* Collecting comparison operands mode. */
+ KCOV_MODE_TRACE_CMP = 3,
};
+void kcov_task_init(struct task_struct *t);
+void kcov_task_exit(struct task_struct *t);
+
#else
static inline void kcov_task_init(struct task_struct *t) {}
diff --git a/include/linux/key.h b/include/linux/key.h
index ed9b44fd9580..7e2d14379f49 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -369,7 +369,10 @@ static inline bool key_is_negative(const struct key *key)
return key_read_state(key) < 0;
}
-#define rcu_dereference_key(KEY) \
+#define dereference_key_rcu(KEY) \
+ (rcu_dereference((KEY)->payload.rcu_data0))
+
+#define dereference_key_locked(KEY) \
(rcu_dereference_protected((KEY)->payload.rcu_data0, \
rwsem_is_locked(&((struct key *)(KEY))->sem)))
diff --git a/include/linux/keychord.h b/include/linux/keychord.h
new file mode 100644
index 000000000000..08cf5402102c
--- /dev/null
+++ b/include/linux/keychord.h
@@ -0,0 +1,23 @@
+/*
+ * Key chord input driver
+ *
+ * Copyright (C) 2008 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+*/
+
+#ifndef __LINUX_KEYCHORD_H_
+#define __LINUX_KEYCHORD_H_
+
+#include <uapi/linux/keychord.h>
+
+#endif /* __LINUX_KEYCHORD_H_ */
diff --git a/include/linux/keycombo.h b/include/linux/keycombo.h
new file mode 100644
index 000000000000..c6db2626b0d3
--- /dev/null
+++ b/include/linux/keycombo.h
@@ -0,0 +1,36 @@
+/*
+ * include/linux/keycombo.h - platform data structure for keycombo driver
+ *
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_KEYCOMBO_H
+#define _LINUX_KEYCOMBO_H
+
+#define KEYCOMBO_NAME "keycombo"
+
+/*
+ * if key_down_fn and key_up_fn are both present, you are guaranteed that
+ * key_down_fn will return before key_up_fn is called, and that key_up_fn
+ * is called iff key_down_fn is called.
+ */
+struct keycombo_platform_data {
+ void (*key_down_fn)(void *);
+ void (*key_up_fn)(void *);
+ void *priv;
+ int key_down_delay; /* Time in ms */
+ int *keys_up;
+ int keys_down[]; /* 0 terminated */
+};
+
+#endif /* _LINUX_KEYCOMBO_H */
diff --git a/include/linux/keyreset.h b/include/linux/keyreset.h
new file mode 100644
index 000000000000..2e34afab65e4
--- /dev/null
+++ b/include/linux/keyreset.h
@@ -0,0 +1,29 @@
+/*
+ * include/linux/keyreset.h - platform data structure for resetkeys driver
+ *
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_KEYRESET_H
+#define _LINUX_KEYRESET_H
+
+#define KEYRESET_NAME "keyreset"
+
+struct keyreset_platform_data {
+ int (*reset_fn)(void);
+ int key_down_delay;
+ int *keys_up;
+ int keys_down[]; /* 0 terminated */
+};
+
+#endif /* _LINUX_KEYRESET_H */
diff --git a/include/linux/list.h b/include/linux/list.h
index 5809e9a2de5b..d1039ecaf94f 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -28,27 +28,42 @@ static inline void INIT_LIST_HEAD(struct list_head *list)
list->prev = list;
}
+#ifdef CONFIG_DEBUG_LIST
+extern bool __list_add_valid(struct list_head *new,
+ struct list_head *prev,
+ struct list_head *next);
+extern bool __list_del_entry_valid(struct list_head *entry);
+#else
+static inline bool __list_add_valid(struct list_head *new,
+ struct list_head *prev,
+ struct list_head *next)
+{
+ return true;
+}
+static inline bool __list_del_entry_valid(struct list_head *entry)
+{
+ return true;
+}
+#endif
+
/*
* Insert a new entry between two known consecutive entries.
*
* This is only for internal list manipulation where we know
* the prev/next entries already!
*/
-#ifndef CONFIG_DEBUG_LIST
static inline void __list_add(struct list_head *new,
struct list_head *prev,
struct list_head *next)
{
+ if (!__list_add_valid(new, prev, next))
+ return;
+
next->prev = new;
new->next = next;
new->prev = prev;
WRITE_ONCE(prev->next, new);
}
-#else
-extern void __list_add(struct list_head *new,
- struct list_head *prev,
- struct list_head *next);
-#endif
/**
* list_add - add a new entry
@@ -96,22 +111,20 @@ static inline void __list_del(struct list_head * prev, struct list_head * next)
* Note: list_empty() on entry does not return true after this, the entry is
* in an undefined state.
*/
-#ifndef CONFIG_DEBUG_LIST
static inline void __list_del_entry(struct list_head *entry)
{
+ if (!__list_del_entry_valid(entry))
+ return;
+
__list_del(entry->prev, entry->next);
}
static inline void list_del(struct list_head *entry)
{
- __list_del(entry->prev, entry->next);
+ __list_del_entry(entry);
entry->next = LIST_POISON1;
entry->prev = LIST_POISON2;
}
-#else
-extern void __list_del_entry(struct list_head *entry);
-extern void list_del(struct list_head *entry);
-#endif
/**
* list_replace - replace old entry by new one
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 558adfa5c8a8..121f2f26b037 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1328,7 +1328,40 @@
* @inode we wish to get the security context of.
* @ctx is a pointer in which to place the allocated security context.
* @ctxlen points to the place to put the length of @ctx.
- * This is the main security structure.
+ *
+ * Security hooks for using the eBPF maps and programs functionalities through
+ * eBPF syscalls.
+ *
+ * @bpf:
+ * Do a initial check for all bpf syscalls after the attribute is copied
+ * into the kernel. The actual security module can implement their own
+ * rules to check the specific cmd they need.
+ *
+ * @bpf_map:
+ * Do a check when the kernel generate and return a file descriptor for
+ * eBPF maps.
+ *
+ * @map: bpf map that we want to access
+ * @mask: the access flags
+ *
+ * @bpf_prog:
+ * Do a check when the kernel generate and return a file descriptor for
+ * eBPF programs.
+ *
+ * @prog: bpf prog that userspace want to use.
+ *
+ * @bpf_map_alloc_security:
+ * Initialize the security field inside bpf map.
+ *
+ * @bpf_map_free_security:
+ * Clean up the security information stored inside bpf map.
+ *
+ * @bpf_prog_alloc_security:
+ * Initialize the security field inside bpf program.
+ *
+ * @bpf_prog_free_security:
+ * Clean up the security information stored inside bpf prog.
+ *
*/
union security_list_options {
@@ -1652,6 +1685,17 @@ union security_list_options {
struct audit_context *actx);
void (*audit_rule_free)(void *lsmrule);
#endif /* CONFIG_AUDIT */
+
+#ifdef CONFIG_BPF_SYSCALL
+ int (*bpf)(int cmd, union bpf_attr *attr,
+ unsigned int size);
+ int (*bpf_map)(struct bpf_map *map, fmode_t fmode);
+ int (*bpf_prog)(struct bpf_prog *prog);
+ int (*bpf_map_alloc_security)(struct bpf_map *map);
+ void (*bpf_map_free_security)(struct bpf_map *map);
+ int (*bpf_prog_alloc_security)(struct bpf_prog_aux *aux);
+ void (*bpf_prog_free_security)(struct bpf_prog_aux *aux);
+#endif /* CONFIG_BPF_SYSCALL */
};
struct security_hook_heads {
@@ -1866,6 +1910,15 @@ struct security_hook_heads {
struct list_head audit_rule_match;
struct list_head audit_rule_free;
#endif /* CONFIG_AUDIT */
+#ifdef CONFIG_BPF_SYSCALL
+ struct list_head bpf;
+ struct list_head bpf_map;
+ struct list_head bpf_prog;
+ struct list_head bpf_map_alloc_security;
+ struct list_head bpf_map_free_security;
+ struct list_head bpf_prog_alloc_security;
+ struct list_head bpf_prog_free_security;
+#endif /* CONFIG_BPF_SYSCALL */
};
/*
diff --git a/include/linux/memory-state-time.h b/include/linux/memory-state-time.h
new file mode 100644
index 000000000000..d2212b027866
--- /dev/null
+++ b/include/linux/memory-state-time.h
@@ -0,0 +1,42 @@
+/* include/linux/memory-state-time.h
+ *
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/workqueue.h>
+
+#define UPDATE_MEMORY_STATE(BLOCK, VALUE) BLOCK->update_call(BLOCK, VALUE)
+
+struct memory_state_update_block;
+
+typedef void (*memory_state_update_fn_t)(struct memory_state_update_block *ub,
+ int value);
+
+/* This struct is populated when you pass it to a memory_state_register*
+ * function. The update_call function is used for an update and defined in the
+ * typedef memory_state_update_fn_t
+ */
+struct memory_state_update_block {
+ memory_state_update_fn_t update_call;
+ int id;
+};
+
+/* Register a frequency struct memory_state_update_block to provide updates to
+ * memory_state_time about frequency changes using its update_call function.
+ */
+struct memory_state_update_block *memory_state_register_frequency_source(void);
+
+/* Register a bandwidth struct memory_state_update_block to provide updates to
+ * memory_state_time about bandwidth changes using its update_call function.
+ */
+struct memory_state_update_block *memory_state_register_bandwidth_source(void);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 11a5a46ce72b..5013b3a7ebcb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1143,6 +1143,7 @@ extern void pagefault_out_of_memory(void);
extern void show_free_areas(unsigned int flags);
extern bool skip_free_areas_node(unsigned int flags, int nid);
+void shmem_set_file(struct vm_area_struct *vma, struct file *file);
int shmem_zero_setup(struct vm_area_struct *);
#ifdef CONFIG_SHMEM
bool shmem_mapping(struct address_space *mapping);
@@ -1966,7 +1967,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
extern struct vm_area_struct *vma_merge(struct mm_struct *,
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
- struct mempolicy *, struct vm_userfaultfd_ctx);
+ struct mempolicy *, struct vm_userfaultfd_ctx, const char __user *);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int split_vma(struct mm_struct *,
struct vm_area_struct *, unsigned long addr, int new_below);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8d6decd50220..35daed7743ee 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -326,11 +326,18 @@ struct vm_area_struct {
/*
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap interval tree.
+ *
+ * For private anonymous mappings, a pointer to a null terminated string
+ * in the user process containing the name given to the vma, or NULL
+ * if unnamed.
*/
- struct {
- struct rb_node rb;
- unsigned long rb_subtree_last;
- } shared;
+ union {
+ struct {
+ struct rb_node rb;
+ unsigned long rb_subtree_last;
+ } shared;
+ const char __user *anon_name;
+ };
/*
* A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
@@ -620,4 +627,13 @@ typedef struct {
unsigned long val;
} swp_entry_t;
+/* Return the name for an anonymous mapping or NULL for a file-backed mapping */
+static inline const char __user *vma_get_anon_name(struct vm_area_struct *vma)
+{
+ if (vma->vm_file)
+ return NULL;
+
+ return vma->anon_name;
+}
+
#endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 73fad83acbcb..510a73a7a3d2 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -119,6 +119,9 @@ struct mmc_ext_csd {
u8 raw_pwr_cl_ddr_200_360; /* 253 */
u8 raw_bkops_status; /* 246 */
u8 raw_sectors[4]; /* 212 - 4 bytes */
+ u8 pre_eol_info; /* 267 */
+ u8 device_life_time_est_typ_a; /* 268 */
+ u8 device_life_time_est_typ_b; /* 269 */
unsigned int feature_support;
#define MMC_DISCARD_FEATURE BIT(0) /* CMD38 feature */
diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index 2b953eb8ceae..46a4b798c7cf 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -142,6 +142,10 @@ struct mmc_request {
/* Allow other commands during this ongoing data transfer or busy wait */
bool cap_cmd_during_tfr;
+ ktime_t io_start;
+#ifdef CONFIG_BLOCK
+ int lat_hist_enabled;
+#endif
};
struct mmc_card;
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 0b2439441cc8..1808b7895dac 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -16,6 +16,7 @@
#include <linux/sched.h>
#include <linux/device.h>
#include <linux/fault-inject.h>
+#include <linux/blkdev.h>
#include <linux/mmc/core.h>
#include <linux/mmc/card.h>
@@ -397,6 +398,21 @@ struct mmc_host {
int dsr_req; /* DSR value is valid */
u32 dsr; /* optional driver stage (DSR) value */
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ struct {
+ struct sdio_cis *cis;
+ struct sdio_cccr *cccr;
+ struct sdio_embedded_func *funcs;
+ int num_funcs;
+ } embedded_sdio_data;
+#endif
+
+#ifdef CONFIG_BLOCK
+ int latency_hist_enabled;
+ struct io_latency_state io_lat_read;
+ struct io_latency_state io_lat_write;
+#endif
+
unsigned long private[0] ____cacheline_aligned;
};
@@ -406,6 +422,14 @@ void mmc_remove_host(struct mmc_host *);
void mmc_free_host(struct mmc_host *);
int mmc_of_parse(struct mmc_host *host);
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+extern void mmc_set_embedded_sdio_data(struct mmc_host *host,
+ struct sdio_cis *cis,
+ struct sdio_cccr *cccr,
+ struct sdio_embedded_func *funcs,
+ int num_funcs);
+#endif
+
static inline void *mmc_priv(struct mmc_host *host)
{
return (void *)host->private;
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index c376209c70ef..a034d07c218d 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -331,6 +331,9 @@ struct _mmc_csd {
#define EXT_CSD_CACHE_SIZE 249 /* RO, 4 bytes */
#define EXT_CSD_PWR_CL_DDR_200_360 253 /* RO */
#define EXT_CSD_FIRMWARE_VERSION 254 /* RO, 8 bytes */
+#define EXT_CSD_PRE_EOL_INFO 267 /* RO */
+#define EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_A 268 /* RO */
+#define EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_B 269 /* RO */
#define EXT_CSD_SUPPORTED_MODE 493 /* RO */
#define EXT_CSD_TAG_UNIT_SIZE 498 /* RO */
#define EXT_CSD_DATA_TAG_SUPPORT 499 /* RO */
diff --git a/include/linux/mmc/pm.h b/include/linux/mmc/pm.h
index 4a139204c20c..6e2d6a135c7e 100644
--- a/include/linux/mmc/pm.h
+++ b/include/linux/mmc/pm.h
@@ -26,5 +26,6 @@ typedef unsigned int mmc_pm_flag_t;
#define MMC_PM_KEEP_POWER (1 << 0) /* preserve card power during suspend */
#define MMC_PM_WAKE_SDIO_IRQ (1 << 1) /* wake up host system on SDIO IRQ assertion */
+#define MMC_PM_IGNORE_PM_NOTIFY (1 << 2) /* ignore mmc pm notify */
#endif /* LINUX_MMC_PM_H */
diff --git a/include/linux/mmc/sdio_func.h b/include/linux/mmc/sdio_func.h
index 97ca105347a6..f563bcf55e7f 100644
--- a/include/linux/mmc/sdio_func.h
+++ b/include/linux/mmc/sdio_func.h
@@ -23,6 +23,14 @@ struct sdio_func;
typedef void (sdio_irq_handler_t)(struct sdio_func *);
/*
+ * Structure used to hold embedded SDIO device data from platform layer
+ */
+struct sdio_embedded_func {
+ uint8_t f_class;
+ uint32_t f_maxblksize;
+};
+
+/*
* SDIO function CIS tuple (unknown to the core)
*/
struct sdio_func_tuple {
@@ -128,6 +136,8 @@ extern int sdio_release_irq(struct sdio_func *func);
extern unsigned int sdio_align_size(struct sdio_func *func, unsigned int sz);
extern u8 sdio_readb(struct sdio_func *func, unsigned int addr, int *err_ret);
+extern u8 sdio_readb_ext(struct sdio_func *func, unsigned int addr, int *err_ret,
+ unsigned in);
extern u16 sdio_readw(struct sdio_func *func, unsigned int addr, int *err_ret);
extern u32 sdio_readl(struct sdio_func *func, unsigned int addr, int *err_ret);
diff --git a/include/linux/module.h b/include/linux/module.h
index fd9e121c7b3f..9d6fd1d17edb 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -20,6 +20,7 @@
#include <linux/export.h>
#include <linux/extable.h> /* only as arch move module.h -> extable.h */
#include <linux/rbtree_latch.h>
+#include <linux/cfi.h>
#include <linux/percpu.h>
#include <asm/module.h>
@@ -349,6 +350,10 @@ struct module {
const unsigned long *crcs;
unsigned int num_syms;
+#ifdef CONFIG_CFI_CLANG
+ cfi_check_fn cfi_check;
+#endif
+
/* Kernel parameters. */
#ifdef CONFIG_SYSFS
struct mutex param_lock;
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 52666d90ca94..060db5c327be 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -225,19 +225,11 @@ struct kparam_array
VERIFY_OCTAL_PERMISSIONS(perm), level, flags, { arg } }
/* Obsolete - use module_param_cb() */
-#define module_param_call(name, set, get, arg, perm) \
- static const struct kernel_param_ops __param_ops_##name = \
- { .flags = 0, (void *)set, (void *)get }; \
+#define module_param_call(name, _set, _get, arg, perm) \
+ static const struct kernel_param_ops __param_ops_##name = \
+ { .flags = 0, .set = _set, .get = _get }; \
__module_param_call(MODULE_PARAM_PREFIX, \
- name, &__param_ops_##name, arg, \
- (perm) + sizeof(__check_old_set_param(set))*0, -1, 0)
-
-/* We don't get oldget: it's often a new-style param_get_uint, etc. */
-static inline int
-__check_old_set_param(int (*oldset)(const char *, struct kernel_param *))
-{
- return 0;
-}
+ name, &__param_ops_##name, arg, perm, -1, 0)
#ifdef CONFIG_SYSFS
extern void kernel_param_lock(struct module *mod);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index e0f3a82eee6d..5615a9eb59db 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -67,6 +67,7 @@ struct vfsmount {
struct dentry *mnt_root; /* root of the mounted tree */
struct super_block *mnt_sb; /* pointer to superblock */
int mnt_flags;
+ void *data;
};
struct file; /* forward dec */
diff --git a/include/linux/namei.h b/include/linux/namei.h
index f29abda31e6d..cf437f56baf4 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -78,8 +78,11 @@ extern struct dentry *user_path_create(int, const char __user *, struct path *,
extern void done_path_create(struct path *, struct dentry *);
extern struct dentry *kern_path_locked(const char *, struct path *);
extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
+extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
+ const char *, unsigned int, struct path *);
extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len2(const char *, struct vfsmount *mnt, struct dentry *, int);
extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
extern int follow_down_one(struct path *);
diff --git a/include/linux/netfilter/xt_qtaguid.h b/include/linux/netfilter/xt_qtaguid.h
new file mode 100644
index 000000000000..1c671552ec37
--- /dev/null
+++ b/include/linux/netfilter/xt_qtaguid.h
@@ -0,0 +1,14 @@
+#ifndef _XT_QTAGUID_MATCH_H
+#define _XT_QTAGUID_MATCH_H
+
+/* For now we just replace the xt_owner.
+ * FIXME: make iptables aware of qtaguid. */
+#include <linux/netfilter/xt_owner.h>
+
+#define XT_QTAGUID_UID XT_OWNER_UID
+#define XT_QTAGUID_GID XT_OWNER_GID
+#define XT_QTAGUID_SOCKET XT_OWNER_SOCKET
+#define xt_qtaguid_match_info xt_owner_match_info
+
+int qtaguid_untag(struct socket *sock, bool kernel);
+#endif /* _XT_QTAGUID_MATCH_H */
diff --git a/include/linux/netfilter/xt_quota2.h b/include/linux/netfilter/xt_quota2.h
new file mode 100644
index 000000000000..eadc6903314e
--- /dev/null
+++ b/include/linux/netfilter/xt_quota2.h
@@ -0,0 +1,25 @@
+#ifndef _XT_QUOTA_H
+#define _XT_QUOTA_H
+
+enum xt_quota_flags {
+ XT_QUOTA_INVERT = 1 << 0,
+ XT_QUOTA_GROW = 1 << 1,
+ XT_QUOTA_PACKET = 1 << 2,
+ XT_QUOTA_NO_CHANGE = 1 << 3,
+ XT_QUOTA_MASK = 0x0F,
+};
+
+struct xt_quota_counter;
+
+struct xt_quota_mtinfo2 {
+ char name[15];
+ u_int8_t flags;
+
+ /* Comparison-invariant */
+ aligned_u64 quota;
+
+ /* Used internally by the kernel */
+ struct xt_quota_counter *master __attribute__((aligned(8)));
+};
+
+#endif /* _XT_QUOTA_H */
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index 4341f32516d8..501d461a6a1d 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -63,6 +63,27 @@ extern int of_flat_dt_match(unsigned long node, const char *const *matches);
extern unsigned long of_get_flat_dt_root(void);
extern int of_get_flat_dt_size(void);
+/*
+ * early_init_dt_scan_chosen - scan the device tree for ramdisk and bootargs
+ *
+ * The boot arguments will be placed into the memory pointed to by @data.
+ * That memory should be COMMAND_LINE_SIZE big and initialized to be a valid
+ * (possibly empty) string. Logic for what will be in @data after this
+ * function finishes:
+ *
+ * - CONFIG_CMDLINE_FORCE=true
+ * CONFIG_CMDLINE
+ * - CONFIG_CMDLINE_EXTEND=true, @data is non-empty string
+ * @data + dt bootargs (even if dt bootargs are empty)
+ * - CONFIG_CMDLINE_EXTEND=true, @data is empty string
+ * CONFIG_CMDLINE + dt bootargs (even if dt bootargs are empty)
+ * - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=non-empty:
+ * dt bootargs
+ * - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=empty, @data is non-empty string
+ * @data is left unchanged
+ * - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=empty, @data is empty string
+ * CONFIG_CMDLINE (or "" if that's not defined)
+ */
extern int early_init_dt_scan_chosen(unsigned long node, const char *uname,
int depth, void *data);
extern int early_init_dt_scan_memory(unsigned long node, const char *uname,
diff --git a/include/linux/overflow.h b/include/linux/overflow.h
new file mode 100644
index 000000000000..8712ff70995f
--- /dev/null
+++ b/include/linux/overflow.h
@@ -0,0 +1,278 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+#ifndef __LINUX_OVERFLOW_H
+#define __LINUX_OVERFLOW_H
+
+#include <linux/compiler.h>
+
+/*
+ * In the fallback code below, we need to compute the minimum and
+ * maximum values representable in a given type. These macros may also
+ * be useful elsewhere, so we provide them outside the
+ * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block.
+ *
+ * It would seem more obvious to do something like
+ *
+ * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0)
+ * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0)
+ *
+ * Unfortunately, the middle expressions, strictly speaking, have
+ * undefined behaviour, and at least some versions of gcc warn about
+ * the type_max expression (but not if -fsanitize=undefined is in
+ * effect; in that case, the warning is deferred to runtime...).
+ *
+ * The slightly excessive casting in type_min is to make sure the
+ * macros also produce sensible values for the exotic type _Bool. [The
+ * overflow checkers only almost work for _Bool, but that's
+ * a-feature-not-a-bug, since people shouldn't be doing arithmetic on
+ * _Bools. Besides, the gcc builtins don't allow _Bool* as third
+ * argument.]
+ *
+ * Idea stolen from
+ * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html -
+ * credit to Christian Biere.
+ */
+#define is_signed_type(type) (((type)(-1)) < (type)1)
+#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type)))
+#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T)))
+#define type_min(T) ((T)((T)-type_max(T)-(T)1))
+
+
+#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW
+/*
+ * For simplicity and code hygiene, the fallback code below insists on
+ * a, b and *d having the same type (similar to the min() and max()
+ * macros), whereas gcc's type-generic overflow checkers accept
+ * different types. Hence we don't just make check_add_overflow an
+ * alias for __builtin_add_overflow, but add type checks similar to
+ * below.
+ */
+#define check_add_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ __builtin_add_overflow(__a, __b, __d); \
+})
+
+#define check_sub_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ __builtin_sub_overflow(__a, __b, __d); \
+})
+
+#define check_mul_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ __builtin_mul_overflow(__a, __b, __d); \
+})
+
+#else
+
+
+/* Checking for unsigned overflow is relatively easy without causing UB. */
+#define __unsigned_add_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = __a + __b; \
+ *__d < __a; \
+})
+#define __unsigned_sub_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = __a - __b; \
+ __a < __b; \
+})
+/*
+ * If one of a or b is a compile-time constant, this avoids a division.
+ */
+#define __unsigned_mul_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = __a * __b; \
+ __builtin_constant_p(__b) ? \
+ __b > 0 && __a > type_max(typeof(__a)) / __b : \
+ __a > 0 && __b > type_max(typeof(__b)) / __a; \
+})
+
+/*
+ * For signed types, detecting overflow is much harder, especially if
+ * we want to avoid UB. But the interface of these macros is such that
+ * we must provide a result in *d, and in fact we must produce the
+ * result promised by gcc's builtins, which is simply the possibly
+ * wrapped-around value. Fortunately, we can just formally do the
+ * operations in the widest relevant unsigned type (u64) and then
+ * truncate the result - gcc is smart enough to generate the same code
+ * with and without the (u64) casts.
+ */
+
+/*
+ * Adding two signed integers can overflow only if they have the same
+ * sign, and overflow has happened iff the result has the opposite
+ * sign.
+ */
+#define __signed_add_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = (u64)__a + (u64)__b; \
+ (((~(__a ^ __b)) & (*__d ^ __a)) \
+ & type_min(typeof(__a))) != 0; \
+})
+
+/*
+ * Subtraction is similar, except that overflow can now happen only
+ * when the signs are opposite. In this case, overflow has happened if
+ * the result has the opposite sign of a.
+ */
+#define __signed_sub_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = (u64)__a - (u64)__b; \
+ ((((__a ^ __b)) & (*__d ^ __a)) \
+ & type_min(typeof(__a))) != 0; \
+})
+
+/*
+ * Signed multiplication is rather hard. gcc always follows C99, so
+ * division is truncated towards 0. This means that we can write the
+ * overflow check like this:
+ *
+ * (a > 0 && (b > MAX/a || b < MIN/a)) ||
+ * (a < -1 && (b > MIN/a || b < MAX/a) ||
+ * (a == -1 && b == MIN)
+ *
+ * The redundant casts of -1 are to silence an annoying -Wtype-limits
+ * (included in -Wextra) warning: When the type is u8 or u16, the
+ * __b_c_e in check_mul_overflow obviously selects
+ * __unsigned_mul_overflow, but unfortunately gcc still parses this
+ * code and warns about the limited range of __b.
+ */
+
+#define __signed_mul_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ typeof(a) __tmax = type_max(typeof(a)); \
+ typeof(a) __tmin = type_min(typeof(a)); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = (u64)__a * (u64)__b; \
+ (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \
+ (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \
+ (__b == (typeof(__b))-1 && __a == __tmin); \
+})
+
+
+#define check_add_overflow(a, b, d) \
+ __builtin_choose_expr(is_signed_type(typeof(a)), \
+ __signed_add_overflow(a, b, d), \
+ __unsigned_add_overflow(a, b, d))
+
+#define check_sub_overflow(a, b, d) \
+ __builtin_choose_expr(is_signed_type(typeof(a)), \
+ __signed_sub_overflow(a, b, d), \
+ __unsigned_sub_overflow(a, b, d))
+
+#define check_mul_overflow(a, b, d) \
+ __builtin_choose_expr(is_signed_type(typeof(a)), \
+ __signed_mul_overflow(a, b, d), \
+ __unsigned_mul_overflow(a, b, d))
+
+
+#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */
+
+/**
+ * array_size() - Calculate size of 2-dimensional array.
+ *
+ * @a: dimension one
+ * @b: dimension two
+ *
+ * Calculates size of 2-dimensional array: @a * @b.
+ *
+ * Returns: number of bytes needed to represent the array or SIZE_MAX on
+ * overflow.
+ */
+static inline __must_check size_t array_size(size_t a, size_t b)
+{
+ size_t bytes;
+
+ if (check_mul_overflow(a, b, &bytes))
+ return SIZE_MAX;
+
+ return bytes;
+}
+
+/**
+ * array3_size() - Calculate size of 3-dimensional array.
+ *
+ * @a: dimension one
+ * @b: dimension two
+ * @c: dimension three
+ *
+ * Calculates size of 3-dimensional array: @a * @b * @c.
+ *
+ * Returns: number of bytes needed to represent the array or SIZE_MAX on
+ * overflow.
+ */
+static inline __must_check size_t array3_size(size_t a, size_t b, size_t c)
+{
+ size_t bytes;
+
+ if (check_mul_overflow(a, b, &bytes))
+ return SIZE_MAX;
+ if (check_mul_overflow(bytes, c, &bytes))
+ return SIZE_MAX;
+
+ return bytes;
+}
+
+static inline __must_check size_t __ab_c_size(size_t n, size_t size, size_t c)
+{
+ size_t bytes;
+
+ if (check_mul_overflow(n, size, &bytes))
+ return SIZE_MAX;
+ if (check_add_overflow(bytes, c, &bytes))
+ return SIZE_MAX;
+
+ return bytes;
+}
+
+/**
+ * struct_size() - Calculate size of structure with trailing array.
+ * @p: Pointer to the structure.
+ * @member: Name of the array member.
+ * @n: Number of elements in the array.
+ *
+ * Calculates size of memory needed for structure @p followed by an
+ * array of @n @member elements.
+ *
+ * Return: number of bytes needed or SIZE_MAX on overflow.
+ */
+#define struct_size(p, member, n) \
+ __ab_c_size(n, \
+ sizeof(*(p)->member) + __must_be_array((p)->member),\
+ sizeof(*(p)))
+
+#endif /* __LINUX_OVERFLOW_H */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 35f4c4d9c405..6e755d92c942 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -225,7 +225,7 @@ static inline gfp_t readahead_gfp_mask(struct address_space *x)
__GFP_COLD | __GFP_NORETRY | __GFP_NOWARN;
}
-typedef int filler_t(void *, struct page *);
+typedef int filler_t(struct file *, struct page *);
pgoff_t page_cache_next_hole(struct address_space *mapping,
pgoff_t index, unsigned long max_scan);
@@ -341,8 +341,16 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
unsigned int nr_pages, struct page **pages);
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
unsigned int nr_pages, struct page **pages);
-unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
- int tag, unsigned int nr_pages, struct page **pages);
+unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
+ pgoff_t end, int tag, unsigned int nr_pages,
+ struct page **pages);
+static inline unsigned find_get_pages_tag(struct address_space *mapping,
+ pgoff_t *index, int tag, unsigned int nr_pages,
+ struct page **pages)
+{
+ return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag,
+ nr_pages, pages);
+}
unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
int tag, unsigned int nr_entries,
struct page **entries, pgoff_t *indices);
@@ -369,7 +377,7 @@ extern int read_cache_pages(struct address_space *mapping,
static inline struct page *read_mapping_page(struct address_space *mapping,
pgoff_t index, void *data)
{
- filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+ filler_t *filler = mapping->a_ops->readpage;
return read_cache_page(mapping, index, filler, data);
}
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index b45d391b4540..cead4419f933 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -29,9 +29,17 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec,
void pagevec_remove_exceptionals(struct pagevec *pvec);
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
pgoff_t start, unsigned nr_pages);
-unsigned pagevec_lookup_tag(struct pagevec *pvec,
- struct address_space *mapping, pgoff_t *index, int tag,
- unsigned nr_pages);
+unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
+ struct address_space *mapping, pgoff_t *index, pgoff_t end,
+ int tag);
+unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
+ struct address_space *mapping, pgoff_t *index, pgoff_t end,
+ int tag, unsigned max_pages);
+static inline unsigned pagevec_lookup_tag(struct pagevec *pvec,
+ struct address_space *mapping, pgoff_t *index, int tag)
+{
+ return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag);
+}
static inline void pagevec_init(struct pagevec *pvec, int cold)
{
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 78ed8105e64d..61ab566856e6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1168,6 +1168,11 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
int perf_event_max_stack_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
+static inline bool perf_paranoid_any(void)
+{
+ return sysctl_perf_event_paranoid > 2;
+}
+
static inline bool perf_paranoid_tracepoint_raw(void)
{
return sysctl_perf_event_paranoid > -1;
diff --git a/include/linux/platform_data/ds2482.h b/include/linux/platform_data/ds2482.h
new file mode 100644
index 000000000000..5a6879e2a09a
--- /dev/null
+++ b/include/linux/platform_data/ds2482.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C) 2012 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __PLATFORM_DATA_DS2482__
+#define __PLATFORM_DATA_DS2482__
+
+struct ds2482_platform_data {
+ int slpz_gpio;
+};
+
+#endif /* __PLATFORM_DATA_DS2482__ */
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index ad97baf7b8de..29fc01af3b37 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -18,6 +18,7 @@
#include <linux/leds.h>
#include <linux/spinlock.h>
#include <linux/notifier.h>
+#include <linux/types.h>
/*
* All voltages, currents, charges, energies, time and temperatures in uV,
@@ -148,6 +149,12 @@ enum power_supply_property {
POWER_SUPPLY_PROP_SCOPE,
POWER_SUPPLY_PROP_CHARGE_TERM_CURRENT,
POWER_SUPPLY_PROP_CALIBRATE,
+ /* Local extensions */
+ POWER_SUPPLY_PROP_USB_HC,
+ POWER_SUPPLY_PROP_USB_OTG,
+ POWER_SUPPLY_PROP_CHARGE_ENABLED,
+ /* Local extensions of type int64_t */
+ POWER_SUPPLY_PROP_CHARGE_COUNTER_EXT,
/* Properties of type `const char *' */
POWER_SUPPLY_PROP_MODEL_NAME,
POWER_SUPPLY_PROP_MANUFACTURER,
@@ -175,6 +182,7 @@ enum power_supply_notifier_events {
union power_supply_propval {
int intval;
const char *strval;
+ int64_t int64val;
};
struct device_node;
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index b97bf2ef996e..b326d0a0cace 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -74,6 +74,12 @@ static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *p
#endif /* CONFIG_PROC_FS */
+#ifdef CONFIG_PROC_UID
+extern void proc_register_uid(kuid_t uid);
+#else
+static inline void proc_register_uid(kuid_t uid) {}
+#endif
+
struct net;
static inline struct proc_dir_entry *proc_net_mkdir(
diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h
index 4058bf991868..cb5edd64a3d3 100644
--- a/include/linux/pstore_ram.h
+++ b/include/linux/pstore_ram.h
@@ -80,6 +80,8 @@ void persistent_ram_free_old(struct persistent_ram_zone *prz);
ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
char *str, size_t len);
+void ramoops_console_write_buf(const char *buf, size_t size);
+
/*
* Ramoops platform data
* @mem_size memory size for ramoops
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 8beb98dcf14f..4f7a9561b8c4 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -45,19 +45,17 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
* This is only for internal list manipulation where we know
* the prev/next entries already!
*/
-#ifndef CONFIG_DEBUG_LIST
static inline void __list_add_rcu(struct list_head *new,
struct list_head *prev, struct list_head *next)
{
+ if (!__list_add_valid(new, prev, next))
+ return;
+
new->next = next;
new->prev = prev;
rcu_assign_pointer(list_next_rcu(prev), new);
next->prev = new;
}
-#else
-void __list_add_rcu(struct list_head *new,
- struct list_head *prev, struct list_head *next);
-#endif
/**
* list_add_rcu - add a new entry to rcu-protected list
diff --git a/include/linux/reservation.h b/include/linux/reservation.h
index b0f305e77b7f..bad7710866af 100644
--- a/include/linux/reservation.h
+++ b/include/linux/reservation.h
@@ -177,17 +177,14 @@ static inline struct fence *
reservation_object_get_excl_rcu(struct reservation_object *obj)
{
struct fence *fence;
- unsigned seq;
-retry:
- seq = read_seqcount_begin(&obj->seq);
+
+ if (!rcu_access_pointer(obj->fence_excl))
+ return NULL;
+
rcu_read_lock();
- fence = rcu_dereference(obj->fence_excl);
- if (read_seqcount_retry(&obj->seq, seq)) {
- rcu_read_unlock();
- goto retry;
- }
- fence = fence_get(fence);
+ fence = fence_get_rcu_safe(&obj->fence_excl);
rcu_read_unlock();
+
return fence;
}
diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h
new file mode 100644
index 000000000000..0d905d8ec553
--- /dev/null
+++ b/include/linux/restart_block.h
@@ -0,0 +1,51 @@
+/*
+ * Common syscall restarting data
+ */
+#ifndef __LINUX_RESTART_BLOCK_H
+#define __LINUX_RESTART_BLOCK_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+struct timespec;
+struct compat_timespec;
+struct pollfd;
+
+/*
+ * System call restart block.
+ */
+struct restart_block {
+ long (*fn)(struct restart_block *);
+ union {
+ /* For futex_wait and futex_wait_requeue_pi */
+ struct {
+ u32 __user *uaddr;
+ u32 val;
+ u32 flags;
+ u32 bitset;
+ u64 time;
+ u32 __user *uaddr2;
+ } futex;
+ /* For nanosleep */
+ struct {
+ clockid_t clockid;
+ struct timespec __user *rmtp;
+#ifdef CONFIG_COMPAT
+ struct compat_timespec __user *compat_rmtp;
+#endif
+ u64 expires;
+ } nanosleep;
+ /* For poll */
+ struct {
+ struct pollfd __user *ufds;
+ int nfds;
+ int has_timeout;
+ unsigned long tv_sec;
+ unsigned long tv_nsec;
+ } poll;
+ };
+};
+
+extern long do_no_restart_syscall(struct restart_block *parm);
+
+#endif /* __LINUX_RESTART_BLOCK_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f4a551a5482c..e94a3ec4ce56 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -173,6 +173,9 @@ extern bool single_task_running(void);
extern unsigned long nr_iowait(void);
extern unsigned long nr_iowait_cpu(int cpu);
extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
+#ifdef CONFIG_CPU_QUIET
+extern u64 nr_running_integral(unsigned int cpu);
+#endif
extern void calc_global_load(unsigned long ticks);
@@ -315,6 +318,15 @@ extern char ___assert_task_state[1 - 2*!!(
/* Task command name length */
#define TASK_COMM_LEN 16
+enum task_event {
+ PUT_PREV_TASK = 0,
+ PICK_NEXT_TASK = 1,
+ TASK_WAKE = 2,
+ TASK_MIGRATE = 3,
+ TASK_UPDATE = 4,
+ IRQ_UPDATE = 5,
+};
+
#include <linux/spinlock.h>
/*
@@ -982,6 +994,14 @@ enum cpu_idle_type {
#define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT
#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
+struct sched_capacity_reqs {
+ unsigned long cfs;
+ unsigned long rt;
+ unsigned long dl;
+
+ unsigned long total;
+};
+
/*
* Wake-queues are lists of tasks with a pending wakeup, whose
* callers have already marked the task as woken internally,
@@ -1045,6 +1065,7 @@ extern void wake_up_q(struct wake_q_head *head);
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
#define SD_NUMA 0x4000 /* cross-node balancing */
+#define SD_SHARE_CAP_STATES 0x8000 /* Domain members share capacity state */
#ifdef CONFIG_SCHED_SMT
static inline int cpu_smt_flags(void)
@@ -1077,8 +1098,57 @@ struct sched_domain_attr {
extern int sched_domain_level_max;
+struct capacity_state {
+ unsigned long cap; /* compute capacity */
+ unsigned long power; /* power consumption at this compute capacity */
+};
+
+struct idle_state {
+ unsigned long power; /* power consumption in this idle state */
+};
+
+struct sched_group_energy {
+ unsigned int nr_idle_states; /* number of idle states */
+ struct idle_state *idle_states; /* ptr to idle state array */
+ unsigned int nr_cap_states; /* number of capacity states */
+ struct capacity_state *cap_states; /* ptr to capacity state array */
+};
+
+unsigned long capacity_curr_of(int cpu);
+
struct sched_group;
+struct eas_stats {
+ /* select_idle_sibling() stats */
+ u64 sis_attempts;
+ u64 sis_idle;
+ u64 sis_cache_affine;
+ u64 sis_suff_cap;
+ u64 sis_idle_cpu;
+ u64 sis_count;
+
+ /* select_energy_cpu_brute() stats */
+ u64 secb_attempts;
+ u64 secb_sync;
+ u64 secb_idle_bt;
+ u64 secb_insuff_cap;
+ u64 secb_no_nrg_sav;
+ u64 secb_nrg_sav;
+ u64 secb_count;
+
+ /* find_best_target() stats */
+ u64 fbt_attempts;
+ u64 fbt_no_cpu;
+ u64 fbt_no_sd;
+ u64 fbt_pref_idle;
+ u64 fbt_count;
+
+ /* cas */
+ /* select_task_rq_fair() stats */
+ u64 cas_attempts;
+ u64 cas_count;
+};
+
struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
@@ -1147,6 +1217,8 @@ struct sched_domain {
unsigned int ttwu_wake_remote;
unsigned int ttwu_move_affine;
unsigned int ttwu_move_balance;
+
+ struct eas_stats eas_stats;
#endif
#ifdef CONFIG_SCHED_DEBUG
char *name;
@@ -1184,6 +1256,8 @@ bool cpus_share_cache(int this_cpu, int that_cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
typedef int (*sched_domain_flags_f)(void);
+typedef
+const struct sched_group_energy * const(*sched_domain_energy_f)(int cpu);
#define SDTL_OVERLAP 0x01
@@ -1197,6 +1271,7 @@ struct sd_data {
struct sched_domain_topology_level {
sched_domain_mask_f mask;
sched_domain_flags_f sd_flags;
+ sched_domain_energy_f energy;
int flags;
int numa_level;
struct sd_data data;
@@ -1342,6 +1417,70 @@ struct sched_statistics {
u64 nr_wakeups_affine_attempts;
u64 nr_wakeups_passive;
u64 nr_wakeups_idle;
+
+ /* select_idle_sibling() */
+ u64 nr_wakeups_sis_attempts;
+ u64 nr_wakeups_sis_idle;
+ u64 nr_wakeups_sis_cache_affine;
+ u64 nr_wakeups_sis_suff_cap;
+ u64 nr_wakeups_sis_idle_cpu;
+ u64 nr_wakeups_sis_count;
+
+ /* energy_aware_wake_cpu() */
+ u64 nr_wakeups_secb_attempts;
+ u64 nr_wakeups_secb_sync;
+ u64 nr_wakeups_secb_idle_bt;
+ u64 nr_wakeups_secb_insuff_cap;
+ u64 nr_wakeups_secb_no_nrg_sav;
+ u64 nr_wakeups_secb_nrg_sav;
+ u64 nr_wakeups_secb_count;
+
+ /* find_best_target() */
+ u64 nr_wakeups_fbt_attempts;
+ u64 nr_wakeups_fbt_no_cpu;
+ u64 nr_wakeups_fbt_no_sd;
+ u64 nr_wakeups_fbt_pref_idle;
+ u64 nr_wakeups_fbt_count;
+
+ /* cas */
+ /* select_task_rq_fair() */
+ u64 nr_wakeups_cas_attempts;
+ u64 nr_wakeups_cas_count;
+};
+#endif
+
+#ifdef CONFIG_SCHED_WALT
+#define RAVG_HIST_SIZE_MAX 5
+
+/* ravg represents frequency scaled cpu-demand of tasks */
+struct ravg {
+ /*
+ * 'mark_start' marks the beginning of an event (task waking up, task
+ * starting to execute, task being preempted) within a window
+ *
+ * 'sum' represents how runnable a task has been within current
+ * window. It incorporates both running time and wait time and is
+ * frequency scaled.
+ *
+ * 'sum_history' keeps track of history of 'sum' seen over previous
+ * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are
+ * ignored.
+ *
+ * 'demand' represents maximum sum seen over previous
+ * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
+ * demand for tasks.
+ *
+ * 'curr_window' represents task's contribution to cpu busy time
+ * statistics (rq->curr_runnable_sum) in current window
+ *
+ * 'prev_window' represents task's contribution to cpu busy time
+ * statistics (rq->prev_runnable_sum) in previous window
+ */
+ u64 mark_start;
+ u32 sum, demand;
+ u32 sum_history[RAVG_HIST_SIZE_MAX];
+ u32 curr_window, prev_window;
+ u16 active_windows;
};
#endif
@@ -1516,6 +1655,16 @@ struct task_struct {
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
+#ifdef CONFIG_SCHED_WALT
+ struct ravg ravg;
+ /*
+ * 'init_load_pct' represents the initial task load assigned to children
+ * of this task
+ */
+ u32 init_load_pct;
+ u64 last_sleep_ts;
+#endif
+
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
@@ -1644,6 +1793,10 @@ struct task_struct {
cputime_t utime, stime, utimescaled, stimescaled;
cputime_t gtime;
+#ifdef CONFIG_CPU_FREQ_TIMES
+ u64 *time_in_state;
+ unsigned int max_state;
+#endif
struct prev_cputime prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
seqcount_t vtime_seqcount;
@@ -3571,6 +3724,11 @@ static inline void inc_syscw(struct task_struct *tsk)
{
tsk->ioac.syscw++;
}
+
+static inline void inc_syscfs(struct task_struct *tsk)
+{
+ tsk->ioac.syscfs++;
+}
#else
static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
{
@@ -3587,6 +3745,9 @@ static inline void inc_syscr(struct task_struct *tsk)
static inline void inc_syscw(struct task_struct *tsk)
{
}
+static inline void inc_syscfs(struct task_struct *tsk)
+{
+}
#endif
#ifndef TASK_SIZE_OF
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 22db1e63707e..da3a02d351b4 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -18,6 +18,14 @@ extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
extern unsigned int sysctl_sched_child_runs_first;
+extern unsigned int sysctl_sched_sync_hint_enable;
+extern unsigned int sysctl_sched_cstate_aware;
+#ifdef CONFIG_SCHED_WALT
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int sysctl_sched_use_walt_task_util;
+extern unsigned int sysctl_sched_walt_init_task_load_pct;
+extern unsigned int sysctl_sched_walt_cpu_high_irqload;
+#endif
enum sched_tunable_scaling {
SCHED_TUNABLESCALING_NONE,
@@ -56,6 +64,22 @@ extern int sysctl_sched_rt_runtime;
extern unsigned int sysctl_sched_cfs_bandwidth_slice;
#endif
+#ifdef CONFIG_SCHED_TUNE
+extern unsigned int sysctl_sched_cfs_boost;
+int sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length,
+ loff_t *ppos);
+static inline unsigned int get_sysctl_sched_cfs_boost(void)
+{
+ return sysctl_sched_cfs_boost;
+}
+#else
+static inline unsigned int get_sysctl_sched_cfs_boost(void)
+{
+ return 0;
+}
+#endif
+
#ifdef CONFIG_SCHED_AUTOGROUP
extern unsigned int sysctl_sched_autogroup_enabled;
#endif
diff --git a/include/linux/sched_energy.h b/include/linux/sched_energy.h
new file mode 100644
index 000000000000..1daf3e1f98a7
--- /dev/null
+++ b/include/linux/sched_energy.h
@@ -0,0 +1,44 @@
+#ifndef _LINUX_SCHED_ENERGY_H
+#define _LINUX_SCHED_ENERGY_H
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+/*
+ * There doesn't seem to be an NR_CPUS style max number of sched domain
+ * levels so here's an arbitrary constant one for the moment.
+ *
+ * The levels alluded to here correspond to entries in struct
+ * sched_domain_topology_level that are meant to be populated by arch
+ * specific code (topology.c).
+ */
+#define NR_SD_LEVELS 8
+
+#define SD_LEVEL0 0
+#define SD_LEVEL1 1
+#define SD_LEVEL2 2
+#define SD_LEVEL3 3
+#define SD_LEVEL4 4
+#define SD_LEVEL5 5
+#define SD_LEVEL6 6
+#define SD_LEVEL7 7
+
+/*
+ * Convenience macro for iterating through said sd levels.
+ */
+#define for_each_possible_sd_level(level) \
+ for (level = 0; level < NR_SD_LEVELS; level++)
+
+#ifdef CONFIG_SMP
+
+extern struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
+
+void init_sched_energy_costs(void);
+
+#else
+
+#define init_sched_energy_costs() do { } while (0)
+
+#endif /* CONFIG_SMP */
+
+#endif
diff --git a/include/linux/security.h b/include/linux/security.h
index c2125e9093e8..363242871b66 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1662,6 +1662,54 @@ static inline void securityfs_remove(struct dentry *dentry)
#endif
+#ifdef CONFIG_BPF_SYSCALL
+union bpf_attr;
+struct bpf_map;
+struct bpf_prog;
+struct bpf_prog_aux;
+#ifdef CONFIG_SECURITY
+extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size);
+extern int security_bpf_map(struct bpf_map *map, fmode_t fmode);
+extern int security_bpf_prog(struct bpf_prog *prog);
+extern int security_bpf_map_alloc(struct bpf_map *map);
+extern void security_bpf_map_free(struct bpf_map *map);
+extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux);
+extern void security_bpf_prog_free(struct bpf_prog_aux *aux);
+#else
+static inline int security_bpf(int cmd, union bpf_attr *attr,
+ unsigned int size)
+{
+ return 0;
+}
+
+static inline int security_bpf_map(struct bpf_map *map, fmode_t fmode)
+{
+ return 0;
+}
+
+static inline int security_bpf_prog(struct bpf_prog *prog)
+{
+ return 0;
+}
+
+static inline int security_bpf_map_alloc(struct bpf_map *map)
+{
+ return 0;
+}
+
+static inline void security_bpf_map_free(struct bpf_map *map)
+{ }
+
+static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
+{
+ return 0;
+}
+
+static inline void security_bpf_prog_free(struct bpf_prog_aux *aux)
+{ }
+#endif /* CONFIG_SECURITY */
+#endif /* CONFIG_BPF_SYSCALL */
+
#ifdef CONFIG_SECURITY
static inline char *alloc_secdata(void)
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index eb4f6456521e..c7dff69a4103 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -66,6 +66,7 @@ struct uart_ops {
void (*set_ldisc)(struct uart_port *, struct ktermios *);
void (*pm)(struct uart_port *, unsigned int state,
unsigned int oldstate);
+ void (*wake_peer)(struct uart_port *);
/*
* Return a string describing the type of the port
diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h
index a0596ca0e80a..a2f8109bb215 100644
--- a/include/linux/sock_diag.h
+++ b/include/linux/sock_diag.h
@@ -24,6 +24,7 @@ void sock_diag_unregister(const struct sock_diag_handler *h);
void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
+u64 sock_gen_cookie(struct sock *sk);
int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie);
void sock_diag_save_cookie(struct sock *sk, __u32 *cookie);
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 249dafce2788..90d85690ab7b 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -436,6 +436,7 @@ extern bool pm_get_wakeup_count(unsigned int *count, bool block);
extern bool pm_save_wakeup_count(unsigned int count);
extern void pm_wakep_autosleep_enabled(bool set);
extern void pm_print_active_wakeup_sources(void);
+extern void pm_get_active_wakeup_sources(char *pending_sources, size_t max);
static inline void lock_system_sleep(void)
{
diff --git a/include/linux/sync_file.h b/include/linux/sync_file.h
index aa17ccfc2f57..35ec6c45e190 100644
--- a/include/linux/sync_file.h
+++ b/include/linux/sync_file.h
@@ -40,12 +40,13 @@ struct sync_file {
#endif
wait_queue_head_t wq;
+ unsigned long flags;
struct fence *fence;
struct fence_cb cb;
};
-#define POLL_ENABLED FENCE_FLAG_USER_BITS
+#define POLL_ENABLED 0
struct sync_file *sync_file_create(struct fence *fence);
struct fence *sync_file_get_fence(int fd);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 91a740f6b884..ef4bc88540ef 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -205,6 +205,26 @@ extern struct trace_event_functions exit_syscall_print_funcs;
} \
static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
+/*
+ * Called before coming back to user-mode. Returning to user-mode with an
+ * address limit different than USER_DS can allow to overwrite kernel memory.
+ */
+static inline void addr_limit_user_check(void)
+{
+#ifdef TIF_FSCHECK
+ if (!test_thread_flag(TIF_FSCHECK))
+ return;
+#endif
+
+ if (CHECK_DATA_CORRUPTION(!segment_eq(get_fs(), USER_DS),
+ "Invalid address limit on user-mode return"))
+ force_sig(SIGKILL, current);
+
+#ifdef TIF_FSCHECK
+ clear_thread_flag(TIF_FSCHECK);
+#endif
+}
+
asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
qid_t id, void __user *addr);
asmlinkage long sys_time(time_t __user *tloc);
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index d3c19f8c4564..2839d624d5ee 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -532,7 +532,7 @@ static inline void sysfs_notify_dirent(struct kernfs_node *kn)
}
static inline struct kernfs_node *sysfs_get_dirent(struct kernfs_node *parent,
- const unsigned char *name)
+ const char *name)
{
return kernfs_find_and_get(parent, name);
}
diff --git a/include/linux/task_io_accounting.h b/include/linux/task_io_accounting.h
index bdf855c2856f..2dd338fdf881 100644
--- a/include/linux/task_io_accounting.h
+++ b/include/linux/task_io_accounting.h
@@ -18,6 +18,8 @@ struct task_io_accounting {
u64 syscr;
/* # of write syscalls */
u64 syscw;
+ /* # of fsync syscalls */
+ u64 syscfs;
#endif /* CONFIG_TASK_XACCT */
#ifdef CONFIG_TASK_IO_ACCOUNTING
diff --git a/include/linux/task_io_accounting_ops.h b/include/linux/task_io_accounting_ops.h
index 4d090f9ee608..1b505c804af3 100644
--- a/include/linux/task_io_accounting_ops.h
+++ b/include/linux/task_io_accounting_ops.h
@@ -96,6 +96,7 @@ static inline void task_chr_io_accounting_add(struct task_io_accounting *dst,
dst->wchar += src->wchar;
dst->syscr += src->syscr;
dst->syscw += src->syscw;
+ dst->syscfs += src->syscfs;
}
#else
static inline void task_chr_io_accounting_add(struct task_io_accounting *dst,
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index d0c3615f9050..b8ea15ad3790 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -219,8 +219,9 @@ struct tcp_sock {
} rack;
u16 advmss; /* Advertised MSS */
u8 rate_app_limited:1, /* rate_{delivered,interval_us} limited? */
+ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
is_sack_reneg:1, /* in recovery from loss with SACK reneg? */
- unused:6;
+ unused:5;
u8 nonagle : 4,/* Disable Nagle algorithm? */
thin_lto : 1,/* Use linear timeouts for thin streams */
thin_dupack : 1,/* Fast retransmit on first dupack */
diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h
new file mode 100644
index 000000000000..a2b3dfcee0b5
--- /dev/null
+++ b/include/linux/tee_drv.h
@@ -0,0 +1,468 @@
+/*
+ * Copyright (c) 2015-2016, Linaro Limited
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __TEE_DRV_H
+#define __TEE_DRV_H
+
+#include <linux/types.h>
+#include <linux/idr.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/tee.h>
+
+/*
+ * The file describes the API provided by the generic TEE driver to the
+ * specific TEE driver.
+ */
+
+#define TEE_SHM_MAPPED BIT(0) /* Memory mapped by the kernel */
+#define TEE_SHM_DMA_BUF BIT(1) /* Memory with dma-buf handle */
+#define TEE_SHM_EXT_DMA_BUF BIT(2) /* Memory with dma-buf handle */
+#define TEE_SHM_REGISTER BIT(3) /* Memory registered in secure world */
+#define TEE_SHM_USER_MAPPED BIT(4) /* Memory mapped in user space */
+#define TEE_SHM_POOL BIT(5) /* Memory allocated from pool */
+
+struct device;
+struct tee_device;
+struct tee_shm;
+struct tee_shm_pool;
+
+/**
+ * struct tee_context - driver specific context on file pointer data
+ * @teedev: pointer to this drivers struct tee_device
+ * @list_shm: List of shared memory object owned by this context
+ * @data: driver specific context data, managed by the driver
+ * @refcount: reference counter for this structure
+ * @releasing: flag that indicates if context is being released right now.
+ * It is needed to break circular dependency on context during
+ * shared memory release.
+ */
+struct tee_context {
+ struct tee_device *teedev;
+ struct list_head list_shm;
+ void *data;
+ struct kref refcount;
+ bool releasing;
+};
+
+struct tee_param_memref {
+ size_t shm_offs;
+ size_t size;
+ struct tee_shm *shm;
+};
+
+struct tee_param_value {
+ u64 a;
+ u64 b;
+ u64 c;
+};
+
+struct tee_param {
+ u64 attr;
+ union {
+ struct tee_param_memref memref;
+ struct tee_param_value value;
+ } u;
+};
+
+/**
+ * struct tee_driver_ops - driver operations vtable
+ * @get_version: returns version of driver
+ * @open: called when the device file is opened
+ * @release: release this open file
+ * @open_session: open a new session
+ * @close_session: close a session
+ * @invoke_func: invoke a trusted function
+ * @cancel_req: request cancel of an ongoing invoke or open
+ * @supp_revc: called for supplicant to get a command
+ * @supp_send: called for supplicant to send a response
+ * @shm_register: register shared memory buffer in TEE
+ * @shm_unregister: unregister shared memory buffer in TEE
+ */
+struct tee_driver_ops {
+ void (*get_version)(struct tee_device *teedev,
+ struct tee_ioctl_version_data *vers);
+ int (*open)(struct tee_context *ctx);
+ void (*release)(struct tee_context *ctx);
+ int (*open_session)(struct tee_context *ctx,
+ struct tee_ioctl_open_session_arg *arg,
+ struct tee_param *param);
+ int (*close_session)(struct tee_context *ctx, u32 session);
+ int (*invoke_func)(struct tee_context *ctx,
+ struct tee_ioctl_invoke_arg *arg,
+ struct tee_param *param);
+ int (*cancel_req)(struct tee_context *ctx, u32 cancel_id, u32 session);
+ int (*supp_recv)(struct tee_context *ctx, u32 *func, u32 *num_params,
+ struct tee_param *param);
+ int (*supp_send)(struct tee_context *ctx, u32 ret, u32 num_params,
+ struct tee_param *param);
+ int (*shm_register)(struct tee_context *ctx, struct tee_shm *shm,
+ struct page **pages, size_t num_pages,
+ unsigned long start);
+ int (*shm_unregister)(struct tee_context *ctx, struct tee_shm *shm);
+};
+
+/**
+ * struct tee_desc - Describes the TEE driver to the subsystem
+ * @name: name of driver
+ * @ops: driver operations vtable
+ * @owner: module providing the driver
+ * @flags: Extra properties of driver, defined by TEE_DESC_* below
+ */
+#define TEE_DESC_PRIVILEGED 0x1
+struct tee_desc {
+ const char *name;
+ const struct tee_driver_ops *ops;
+ struct module *owner;
+ u32 flags;
+};
+
+/**
+ * tee_device_alloc() - Allocate a new struct tee_device instance
+ * @teedesc: Descriptor for this driver
+ * @dev: Parent device for this device
+ * @pool: Shared memory pool, NULL if not used
+ * @driver_data: Private driver data for this device
+ *
+ * Allocates a new struct tee_device instance. The device is
+ * removed by tee_device_unregister().
+ *
+ * @returns a pointer to a 'struct tee_device' or an ERR_PTR on failure
+ */
+struct tee_device *tee_device_alloc(const struct tee_desc *teedesc,
+ struct device *dev,
+ struct tee_shm_pool *pool,
+ void *driver_data);
+
+/**
+ * tee_device_register() - Registers a TEE device
+ * @teedev: Device to register
+ *
+ * tee_device_unregister() need to be called to remove the @teedev if
+ * this function fails.
+ *
+ * @returns < 0 on failure
+ */
+int tee_device_register(struct tee_device *teedev);
+
+/**
+ * tee_device_unregister() - Removes a TEE device
+ * @teedev: Device to unregister
+ *
+ * This function should be called to remove the @teedev even if
+ * tee_device_register() hasn't been called yet. Does nothing if
+ * @teedev is NULL.
+ */
+void tee_device_unregister(struct tee_device *teedev);
+
+/**
+ * struct tee_shm - shared memory object
+ * @teedev: device used to allocate the object
+ * @ctx: context using the object, if NULL the context is gone
+ * @link link element
+ * @paddr: physical address of the shared memory
+ * @kaddr: virtual address of the shared memory
+ * @size: size of shared memory
+ * @offset: offset of buffer in user space
+ * @pages: locked pages from userspace
+ * @num_pages: number of locked pages
+ * @dmabuf: dmabuf used to for exporting to user space
+ * @flags: defined by TEE_SHM_* in tee_drv.h
+ * @id: unique id of a shared memory object on this device
+ *
+ * This pool is only supposed to be accessed directly from the TEE
+ * subsystem and from drivers that implements their own shm pool manager.
+ */
+struct tee_shm {
+ struct tee_device *teedev;
+ struct tee_context *ctx;
+ struct list_head link;
+ phys_addr_t paddr;
+ void *kaddr;
+ size_t size;
+ unsigned int offset;
+ struct page **pages;
+ size_t num_pages;
+ struct dma_buf *dmabuf;
+ u32 flags;
+ int id;
+};
+
+/**
+ * struct tee_shm_pool_mgr - shared memory manager
+ * @ops: operations
+ * @private_data: private data for the shared memory manager
+ */
+struct tee_shm_pool_mgr {
+ const struct tee_shm_pool_mgr_ops *ops;
+ void *private_data;
+};
+
+/**
+ * struct tee_shm_pool_mgr_ops - shared memory pool manager operations
+ * @alloc: called when allocating shared memory
+ * @free: called when freeing shared memory
+ * @destroy_poolmgr: called when destroying the pool manager
+ */
+struct tee_shm_pool_mgr_ops {
+ int (*alloc)(struct tee_shm_pool_mgr *poolmgr, struct tee_shm *shm,
+ size_t size);
+ void (*free)(struct tee_shm_pool_mgr *poolmgr, struct tee_shm *shm);
+ void (*destroy_poolmgr)(struct tee_shm_pool_mgr *poolmgr);
+};
+
+/**
+ * tee_shm_pool_alloc() - Create a shared memory pool from shm managers
+ * @priv_mgr: manager for driver private shared memory allocations
+ * @dmabuf_mgr: manager for dma-buf shared memory allocations
+ *
+ * Allocation with the flag TEE_SHM_DMA_BUF set will use the range supplied
+ * in @dmabuf, others will use the range provided by @priv.
+ *
+ * @returns pointer to a 'struct tee_shm_pool' or an ERR_PTR on failure.
+ */
+struct tee_shm_pool *tee_shm_pool_alloc(struct tee_shm_pool_mgr *priv_mgr,
+ struct tee_shm_pool_mgr *dmabuf_mgr);
+
+/*
+ * tee_shm_pool_mgr_alloc_res_mem() - Create a shm manager for reserved
+ * memory
+ * @vaddr: Virtual address of start of pool
+ * @paddr: Physical address of start of pool
+ * @size: Size in bytes of the pool
+ *
+ * @returns pointer to a 'struct tee_shm_pool_mgr' or an ERR_PTR on failure.
+ */
+struct tee_shm_pool_mgr *tee_shm_pool_mgr_alloc_res_mem(unsigned long vaddr,
+ phys_addr_t paddr,
+ size_t size,
+ int min_alloc_order);
+
+/**
+ * tee_shm_pool_mgr_destroy() - Free a shared memory manager
+ */
+static inline void tee_shm_pool_mgr_destroy(struct tee_shm_pool_mgr *poolm)
+{
+ poolm->ops->destroy_poolmgr(poolm);
+}
+
+/**
+ * struct tee_shm_pool_mem_info - holds information needed to create a shared
+ * memory pool
+ * @vaddr: Virtual address of start of pool
+ * @paddr: Physical address of start of pool
+ * @size: Size in bytes of the pool
+ */
+struct tee_shm_pool_mem_info {
+ unsigned long vaddr;
+ phys_addr_t paddr;
+ size_t size;
+};
+
+/**
+ * tee_shm_pool_alloc_res_mem() - Create a shared memory pool from reserved
+ * memory range
+ * @priv_info: Information for driver private shared memory pool
+ * @dmabuf_info: Information for dma-buf shared memory pool
+ *
+ * Start and end of pools will must be page aligned.
+ *
+ * Allocation with the flag TEE_SHM_DMA_BUF set will use the range supplied
+ * in @dmabuf, others will use the range provided by @priv.
+ *
+ * @returns pointer to a 'struct tee_shm_pool' or an ERR_PTR on failure.
+ */
+struct tee_shm_pool *
+tee_shm_pool_alloc_res_mem(struct tee_shm_pool_mem_info *priv_info,
+ struct tee_shm_pool_mem_info *dmabuf_info);
+
+/**
+ * tee_shm_pool_free() - Free a shared memory pool
+ * @pool: The shared memory pool to free
+ *
+ * The must be no remaining shared memory allocated from this pool when
+ * this function is called.
+ */
+void tee_shm_pool_free(struct tee_shm_pool *pool);
+
+/**
+ * tee_get_drvdata() - Return driver_data pointer
+ * @returns the driver_data pointer supplied to tee_register().
+ */
+void *tee_get_drvdata(struct tee_device *teedev);
+
+/**
+ * tee_shm_alloc() - Allocate shared memory
+ * @ctx: Context that allocates the shared memory
+ * @size: Requested size of shared memory
+ * @flags: Flags setting properties for the requested shared memory.
+ *
+ * Memory allocated as global shared memory is automatically freed when the
+ * TEE file pointer is closed. The @flags field uses the bits defined by
+ * TEE_SHM_* above. TEE_SHM_MAPPED must currently always be set. If
+ * TEE_SHM_DMA_BUF global shared memory will be allocated and associated
+ * with a dma-buf handle, else driver private memory.
+ *
+ * @returns a pointer to 'struct tee_shm'
+ */
+struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags);
+
+/**
+ * tee_shm_priv_alloc() - Allocate shared memory privately
+ * @dev: Device that allocates the shared memory
+ * @size: Requested size of shared memory
+ *
+ * Allocates shared memory buffer that is not associated with any client
+ * context. Such buffers are owned by TEE driver and used for internal calls.
+ *
+ * @returns a pointer to 'struct tee_shm'
+ */
+struct tee_shm *tee_shm_priv_alloc(struct tee_device *teedev, size_t size);
+
+/**
+ * tee_shm_register() - Register shared memory buffer
+ * @ctx: Context that registers the shared memory
+ * @addr: Address is userspace of the shared buffer
+ * @length: Length of the shared buffer
+ * @flags: Flags setting properties for the requested shared memory.
+ *
+ * @returns a pointer to 'struct tee_shm'
+ */
+struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr,
+ size_t length, u32 flags);
+
+/**
+ * tee_shm_is_registered() - Check if shared memory object in registered in TEE
+ * @shm: Shared memory handle
+ * @returns true if object is registered in TEE
+ */
+static inline bool tee_shm_is_registered(struct tee_shm *shm)
+{
+ return shm && (shm->flags & TEE_SHM_REGISTER);
+}
+
+/**
+ * tee_shm_free() - Free shared memory
+ * @shm: Handle to shared memory to free
+ */
+void tee_shm_free(struct tee_shm *shm);
+
+/**
+ * tee_shm_put() - Decrease reference count on a shared memory handle
+ * @shm: Shared memory handle
+ */
+void tee_shm_put(struct tee_shm *shm);
+
+/**
+ * tee_shm_va2pa() - Get physical address of a virtual address
+ * @shm: Shared memory handle
+ * @va: Virtual address to tranlsate
+ * @pa: Returned physical address
+ * @returns 0 on success and < 0 on failure
+ */
+int tee_shm_va2pa(struct tee_shm *shm, void *va, phys_addr_t *pa);
+
+/**
+ * tee_shm_pa2va() - Get virtual address of a physical address
+ * @shm: Shared memory handle
+ * @pa: Physical address to tranlsate
+ * @va: Returned virtual address
+ * @returns 0 on success and < 0 on failure
+ */
+int tee_shm_pa2va(struct tee_shm *shm, phys_addr_t pa, void **va);
+
+/**
+ * tee_shm_get_va() - Get virtual address of a shared memory plus an offset
+ * @shm: Shared memory handle
+ * @offs: Offset from start of this shared memory
+ * @returns virtual address of the shared memory + offs if offs is within
+ * the bounds of this shared memory, else an ERR_PTR
+ */
+void *tee_shm_get_va(struct tee_shm *shm, size_t offs);
+
+/**
+ * tee_shm_get_pa() - Get physical address of a shared memory plus an offset
+ * @shm: Shared memory handle
+ * @offs: Offset from start of this shared memory
+ * @pa: Physical address to return
+ * @returns 0 if offs is within the bounds of this shared memory, else an
+ * error code.
+ */
+int tee_shm_get_pa(struct tee_shm *shm, size_t offs, phys_addr_t *pa);
+
+/**
+ * tee_shm_get_size() - Get size of shared memory buffer
+ * @shm: Shared memory handle
+ * @returns size of shared memory
+ */
+static inline size_t tee_shm_get_size(struct tee_shm *shm)
+{
+ return shm->size;
+}
+
+/**
+ * tee_shm_get_pages() - Get list of pages that hold shared buffer
+ * @shm: Shared memory handle
+ * @num_pages: Number of pages will be stored there
+ * @returns pointer to pages array
+ */
+static inline struct page **tee_shm_get_pages(struct tee_shm *shm,
+ size_t *num_pages)
+{
+ *num_pages = shm->num_pages;
+ return shm->pages;
+}
+
+/**
+ * tee_shm_get_page_offset() - Get shared buffer offset from page start
+ * @shm: Shared memory handle
+ * @returns page offset of shared buffer
+ */
+static inline size_t tee_shm_get_page_offset(struct tee_shm *shm)
+{
+ return shm->offset;
+}
+
+/**
+ * tee_shm_get_id() - Get id of a shared memory object
+ * @shm: Shared memory handle
+ * @returns id
+ */
+static inline int tee_shm_get_id(struct tee_shm *shm)
+{
+ return shm->id;
+}
+
+/**
+ * tee_shm_get_from_id() - Find shared memory object and increase reference
+ * count
+ * @ctx: Context owning the shared memory
+ * @id: Id of shared memory object
+ * @returns a pointer to 'struct tee_shm' on success or an ERR_PTR on failure
+ */
+struct tee_shm *tee_shm_get_from_id(struct tee_context *ctx, int id);
+
+static inline bool tee_param_is_memref(struct tee_param *param)
+{
+ switch (param->attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK) {
+ case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT:
+ case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT:
+ case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+#endif /*__TEE_DRV_H*/
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 5e6436741f96..ce98a8e7ef78 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -9,50 +9,17 @@
#include <linux/types.h>
#include <linux/bug.h>
-
-struct timespec;
-struct compat_timespec;
+#include <linux/restart_block.h>
#ifdef CONFIG_THREAD_INFO_IN_TASK
-#define current_thread_info() ((struct thread_info *)current)
-#endif
-
/*
- * System call restart block.
+ * For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the
+ * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels,
+ * including <asm/current.h> can cause a circular dependency on some platforms.
*/
-struct restart_block {
- long (*fn)(struct restart_block *);
- union {
- /* For futex_wait and futex_wait_requeue_pi */
- struct {
- u32 __user *uaddr;
- u32 val;
- u32 flags;
- u32 bitset;
- u64 time;
- u32 __user *uaddr2;
- } futex;
- /* For nanosleep */
- struct {
- clockid_t clockid;
- struct timespec __user *rmtp;
-#ifdef CONFIG_COMPAT
- struct compat_timespec __user *compat_rmtp;
+#include <asm/current.h>
+#define current_thread_info() ((struct thread_info *)current)
#endif
- u64 expires;
- } nanosleep;
- /* For poll */
- struct {
- struct pollfd __user *ufds;
- int nfds;
- int has_timeout;
- unsigned long tv_sec;
- unsigned long tv_nsec;
- } poll;
- };
-};
-
-extern long do_no_restart_syscall(struct restart_block *parm);
#include <linux/bitops.h>
#include <asm/thread_info.h>
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 62be0786d6d0..78ec2eb4d340 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -117,6 +117,7 @@ extern void tick_nohz_idle_enter(void);
extern void tick_nohz_idle_exit(void);
extern void tick_nohz_irq_exit(void);
extern ktime_t tick_nohz_get_sleep_length(void);
+extern unsigned long tick_nohz_get_idle_calls(void);
extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
#else /* !CONFIG_NO_HZ_COMMON */
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 2c225d46a428..dfc80b72c32a 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -51,7 +51,7 @@ struct tk_read_base {
* @clock_was_set_seq: The sequence number of clock was set events
* @cs_was_changed_seq: The sequence number of clocksource change events
* @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second
- * @raw_time: Monotonic raw base time in timespec64 format
+ * @raw_sec: CLOCK_MONOTONIC_RAW time in seconds
* @cycle_interval: Number of clock cycles in one NTP interval
* @xtime_interval: Number of clock shifted nano seconds in one NTP
* interval.
@@ -93,7 +93,7 @@ struct timekeeper {
unsigned int clock_was_set_seq;
u8 cs_was_changed_seq;
ktime_t next_leap_ktime;
- struct timespec64 raw_time;
+ u64 raw_sec;
/* The following members are for timekeeping internal use */
cycle_t cycle_interval;
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 09168c52ab64..361f8bf1429d 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -249,6 +249,7 @@ static inline u64 ktime_get_raw_ns(void)
extern u64 ktime_get_mono_fast_ns(void);
extern u64 ktime_get_raw_fast_ns(void);
+extern u64 ktime_get_boot_fast_ns(void);
/*
* Timespec interfaces utilizing the ktime based ones
diff --git a/include/linux/usb/class-dual-role.h b/include/linux/usb/class-dual-role.h
new file mode 100644
index 000000000000..c6df2238012e
--- /dev/null
+++ b/include/linux/usb/class-dual-role.h
@@ -0,0 +1,129 @@
+#ifndef __LINUX_CLASS_DUAL_ROLE_H__
+#define __LINUX_CLASS_DUAL_ROLE_H__
+
+#include <linux/workqueue.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+
+struct device;
+
+enum dual_role_supported_modes {
+ DUAL_ROLE_SUPPORTED_MODES_DFP_AND_UFP = 0,
+ DUAL_ROLE_SUPPORTED_MODES_DFP,
+ DUAL_ROLE_SUPPORTED_MODES_UFP,
+/*The following should be the last element*/
+ DUAL_ROLE_PROP_SUPPORTED_MODES_TOTAL,
+};
+
+enum {
+ DUAL_ROLE_PROP_MODE_UFP = 0,
+ DUAL_ROLE_PROP_MODE_DFP,
+ DUAL_ROLE_PROP_MODE_NONE,
+/*The following should be the last element*/
+ DUAL_ROLE_PROP_MODE_TOTAL,
+};
+
+enum {
+ DUAL_ROLE_PROP_PR_SRC = 0,
+ DUAL_ROLE_PROP_PR_SNK,
+ DUAL_ROLE_PROP_PR_NONE,
+/*The following should be the last element*/
+ DUAL_ROLE_PROP_PR_TOTAL,
+
+};
+
+enum {
+ DUAL_ROLE_PROP_DR_HOST = 0,
+ DUAL_ROLE_PROP_DR_DEVICE,
+ DUAL_ROLE_PROP_DR_NONE,
+/*The following should be the last element*/
+ DUAL_ROLE_PROP_DR_TOTAL,
+};
+
+enum {
+ DUAL_ROLE_PROP_VCONN_SUPPLY_NO = 0,
+ DUAL_ROLE_PROP_VCONN_SUPPLY_YES,
+/*The following should be the last element*/
+ DUAL_ROLE_PROP_VCONN_SUPPLY_TOTAL,
+};
+
+enum dual_role_property {
+ DUAL_ROLE_PROP_SUPPORTED_MODES = 0,
+ DUAL_ROLE_PROP_MODE,
+ DUAL_ROLE_PROP_PR,
+ DUAL_ROLE_PROP_DR,
+ DUAL_ROLE_PROP_VCONN_SUPPLY,
+};
+
+struct dual_role_phy_instance;
+
+/* Description of typec port */
+struct dual_role_phy_desc {
+ /* /sys/class/dual_role_usb/<name>/ */
+ const char *name;
+ enum dual_role_supported_modes supported_modes;
+ enum dual_role_property *properties;
+ size_t num_properties;
+
+ /* Callback for "cat /sys/class/dual_role_usb/<name>/<property>" */
+ int (*get_property)(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop,
+ unsigned int *val);
+ /* Callback for "echo <value> >
+ * /sys/class/dual_role_usb/<name>/<property>" */
+ int (*set_property)(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop,
+ const unsigned int *val);
+ /* Decides whether userspace can change a specific property */
+ int (*property_is_writeable)(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop);
+};
+
+struct dual_role_phy_instance {
+ const struct dual_role_phy_desc *desc;
+
+ /* Driver private data */
+ void *drv_data;
+
+ struct device dev;
+ struct work_struct changed_work;
+};
+
+#if IS_ENABLED(CONFIG_DUAL_ROLE_USB_INTF)
+extern void dual_role_instance_changed(struct dual_role_phy_instance
+ *dual_role);
+extern struct dual_role_phy_instance *__must_check
+devm_dual_role_instance_register(struct device *parent,
+ const struct dual_role_phy_desc *desc);
+extern void devm_dual_role_instance_unregister(struct device *dev,
+ struct dual_role_phy_instance
+ *dual_role);
+extern int dual_role_get_property(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop,
+ unsigned int *val);
+extern int dual_role_set_property(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop,
+ const unsigned int *val);
+extern int dual_role_property_is_writeable(struct dual_role_phy_instance
+ *dual_role,
+ enum dual_role_property prop);
+extern void *dual_role_get_drvdata(struct dual_role_phy_instance *dual_role);
+#else /* CONFIG_DUAL_ROLE_USB_INTF */
+static inline void dual_role_instance_changed(struct dual_role_phy_instance
+ *dual_role){}
+static inline struct dual_role_phy_instance *__must_check
+devm_dual_role_instance_register(struct device *parent,
+ const struct dual_role_phy_desc *desc)
+{
+ return ERR_PTR(-ENOSYS);
+}
+static inline void devm_dual_role_instance_unregister(struct device *dev,
+ struct dual_role_phy_instance
+ *dual_role){}
+static inline void *dual_role_get_drvdata(struct dual_role_phy_instance
+ *dual_role)
+{
+ return ERR_PTR(-ENOSYS);
+}
+#endif /* CONFIG_DUAL_ROLE_USB_INTF */
+#endif /* __LINUX_CLASS_DUAL_ROLE_H__ */
diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h
index 667d20454a21..74f97cee3051 100644
--- a/include/linux/usb/composite.h
+++ b/include/linux/usb/composite.h
@@ -584,6 +584,7 @@ struct usb_function_instance {
struct config_group group;
struct list_head cfs_list;
struct usb_function_driver *fd;
+ struct usb_function *f;
int (*set_inst_name)(struct usb_function_instance *inst,
const char *name);
void (*free_func_inst)(struct usb_function_instance *inst);
diff --git a/include/linux/usb/f_accessory.h b/include/linux/usb/f_accessory.h
new file mode 100644
index 000000000000..ebe3c4d59309
--- /dev/null
+++ b/include/linux/usb/f_accessory.h
@@ -0,0 +1,23 @@
+/*
+ * Gadget Function Driver for Android USB accessories
+ *
+ * Copyright (C) 2011 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __LINUX_USB_F_ACCESSORY_H
+#define __LINUX_USB_F_ACCESSORY_H
+
+#include <uapi/linux/usb/f_accessory.h>
+
+#endif /* __LINUX_USB_F_ACCESSORY_H */
diff --git a/include/linux/usb/f_mtp.h b/include/linux/usb/f_mtp.h
new file mode 100644
index 000000000000..4e8417791bea
--- /dev/null
+++ b/include/linux/usb/f_mtp.h
@@ -0,0 +1,23 @@
+/*
+ * Gadget Function Driver for MTP
+ *
+ * Copyright (C) 2010 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __LINUX_USB_F_MTP_H
+#define __LINUX_USB_F_MTP_H
+
+#include <uapi/linux/usb/f_mtp.h>
+
+#endif /* __LINUX_USB_F_MTP_H */
diff --git a/include/linux/verification.h b/include/linux/verification.h
index cfa4730d607a..60ea906b603f 100644
--- a/include/linux/verification.h
+++ b/include/linux/verification.h
@@ -32,9 +32,13 @@ enum key_being_used_for {
};
extern const char *const key_being_used_for[NR__KEY_BEING_USED_FOR];
-#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
-
struct key;
+struct public_key_signature;
+
+extern int verify_signature_one(const struct public_key_signature *sig,
+ struct key *trusted_keys, const char *keyid);
+
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
extern int verify_pkcs7_signature(const void *data, size_t len,
const void *raw_pkcs7, size_t pkcs7_len,
diff --git a/include/linux/wakeup_reason.h b/include/linux/wakeup_reason.h
new file mode 100644
index 000000000000..d84d8c301546
--- /dev/null
+++ b/include/linux/wakeup_reason.h
@@ -0,0 +1,32 @@
+/*
+ * include/linux/wakeup_reason.h
+ *
+ * Logs the reason which caused the kernel to resume
+ * from the suspend mode.
+ *
+ * Copyright (C) 2014 Google, Inc.
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _LINUX_WAKEUP_REASON_H
+#define _LINUX_WAKEUP_REASON_H
+
+#define MAX_SUSPEND_ABORT_LEN 256
+
+void log_wakeup_reason(int irq);
+int check_wakeup_reason(int irq);
+
+#ifdef CONFIG_SUSPEND
+void log_suspend_abort_reason(const char *fmt, ...);
+#else
+static inline void log_suspend_abort_reason(const char *fmt, ...) { }
+#endif
+
+#endif /* _LINUX_WAKEUP_REASON_H */
diff --git a/include/linux/wlan_plat.h b/include/linux/wlan_plat.h
new file mode 100644
index 000000000000..8e8b06f1ba4a
--- /dev/null
+++ b/include/linux/wlan_plat.h
@@ -0,0 +1,30 @@
+/* include/linux/wlan_plat.h
+ *
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#ifndef _LINUX_WLAN_PLAT_H_
+#define _LINUX_WLAN_PLAT_H_
+
+#define WLAN_PLAT_NODFS_FLAG 0x01
+
+struct wifi_platform_data {
+ int (*set_power)(int val);
+ int (*set_reset)(int val);
+ int (*set_carddetect)(int val);
+ void *(*mem_prealloc)(int section, unsigned long size);
+ int (*get_mac_addr)(unsigned char *buf);
+ int (*get_wake_irq)(void);
+ void *(*get_country_code)(char *ccode, u32 flags);
+};
+
+#endif
diff --git a/include/linux/xxhash.h b/include/linux/xxhash.h
new file mode 100644
index 000000000000..9e1f42cb57e9
--- /dev/null
+++ b/include/linux/xxhash.h
@@ -0,0 +1,236 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Copyright (C) 2012-2016, Yann Collet.
+ *
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ *
+ * You can contact the author at:
+ * - xxHash homepage: http://cyan4973.github.io/xxHash/
+ * - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+/*
+ * Notice extracted from xxHash homepage:
+ *
+ * xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+ * It also successfully passes all tests from the SMHasher suite.
+ *
+ * Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2
+ * Duo @3GHz)
+ *
+ * Name Speed Q.Score Author
+ * xxHash 5.4 GB/s 10
+ * CrapWow 3.2 GB/s 2 Andrew
+ * MumurHash 3a 2.7 GB/s 10 Austin Appleby
+ * SpookyHash 2.0 GB/s 10 Bob Jenkins
+ * SBox 1.4 GB/s 9 Bret Mulvey
+ * Lookup3 1.2 GB/s 9 Bob Jenkins
+ * SuperFastHash 1.2 GB/s 1 Paul Hsieh
+ * CityHash64 1.05 GB/s 10 Pike & Alakuijala
+ * FNV 0.55 GB/s 5 Fowler, Noll, Vo
+ * CRC32 0.43 GB/s 9
+ * MD5-32 0.33 GB/s 10 Ronald L. Rivest
+ * SHA1-32 0.28 GB/s 10
+ *
+ * Q.Score is a measure of quality of the hash function.
+ * It depends on successfully passing SMHasher test set.
+ * 10 is a perfect score.
+ *
+ * A 64-bits version, named xxh64 offers much better speed,
+ * but for 64-bits applications only.
+ * Name Speed on 64 bits Speed on 32 bits
+ * xxh64 13.8 GB/s 1.9 GB/s
+ * xxh32 6.8 GB/s 6.0 GB/s
+ */
+
+#ifndef XXHASH_H
+#define XXHASH_H
+
+#include <linux/types.h>
+
+/*-****************************
+ * Simple Hash Functions
+ *****************************/
+
+/**
+ * xxh32() - calculate the 32-bit hash of the input with a given seed.
+ *
+ * @input: The data to hash.
+ * @length: The length of the data to hash.
+ * @seed: The seed can be used to alter the result predictably.
+ *
+ * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+ *
+ * Return: The 32-bit hash of the data.
+ */
+uint32_t xxh32(const void *input, size_t length, uint32_t seed);
+
+/**
+ * xxh64() - calculate the 64-bit hash of the input with a given seed.
+ *
+ * @input: The data to hash.
+ * @length: The length of the data to hash.
+ * @seed: The seed can be used to alter the result predictably.
+ *
+ * This function runs 2x faster on 64-bit systems, but slower on 32-bit systems.
+ *
+ * Return: The 64-bit hash of the data.
+ */
+uint64_t xxh64(const void *input, size_t length, uint64_t seed);
+
+/*-****************************
+ * Streaming Hash Functions
+ *****************************/
+
+/*
+ * These definitions are only meant to allow allocation of XXH state
+ * statically, on stack, or in a struct for example.
+ * Do not use members directly.
+ */
+
+/**
+ * struct xxh32_state - private xxh32 state, do not use members directly
+ */
+struct xxh32_state {
+ uint32_t total_len_32;
+ uint32_t large_len;
+ uint32_t v1;
+ uint32_t v2;
+ uint32_t v3;
+ uint32_t v4;
+ uint32_t mem32[4];
+ uint32_t memsize;
+};
+
+/**
+ * struct xxh32_state - private xxh64 state, do not use members directly
+ */
+struct xxh64_state {
+ uint64_t total_len;
+ uint64_t v1;
+ uint64_t v2;
+ uint64_t v3;
+ uint64_t v4;
+ uint64_t mem64[4];
+ uint32_t memsize;
+};
+
+/**
+ * xxh32_reset() - reset the xxh32 state to start a new hashing operation
+ *
+ * @state: The xxh32 state to reset.
+ * @seed: Initialize the hash state with this seed.
+ *
+ * Call this function on any xxh32_state to prepare for a new hashing operation.
+ */
+void xxh32_reset(struct xxh32_state *state, uint32_t seed);
+
+/**
+ * xxh32_update() - hash the data given and update the xxh32 state
+ *
+ * @state: The xxh32 state to update.
+ * @input: The data to hash.
+ * @length: The length of the data to hash.
+ *
+ * After calling xxh32_reset() call xxh32_update() as many times as necessary.
+ *
+ * Return: Zero on success, otherwise an error code.
+ */
+int xxh32_update(struct xxh32_state *state, const void *input, size_t length);
+
+/**
+ * xxh32_digest() - produce the current xxh32 hash
+ *
+ * @state: Produce the current xxh32 hash of this state.
+ *
+ * A hash value can be produced at any time. It is still possible to continue
+ * inserting input into the hash state after a call to xxh32_digest(), and
+ * generate new hashes later on, by calling xxh32_digest() again.
+ *
+ * Return: The xxh32 hash stored in the state.
+ */
+uint32_t xxh32_digest(const struct xxh32_state *state);
+
+/**
+ * xxh64_reset() - reset the xxh64 state to start a new hashing operation
+ *
+ * @state: The xxh64 state to reset.
+ * @seed: Initialize the hash state with this seed.
+ */
+void xxh64_reset(struct xxh64_state *state, uint64_t seed);
+
+/**
+ * xxh64_update() - hash the data given and update the xxh64 state
+ * @state: The xxh64 state to update.
+ * @input: The data to hash.
+ * @length: The length of the data to hash.
+ *
+ * After calling xxh64_reset() call xxh64_update() as many times as necessary.
+ *
+ * Return: Zero on success, otherwise an error code.
+ */
+int xxh64_update(struct xxh64_state *state, const void *input, size_t length);
+
+/**
+ * xxh64_digest() - produce the current xxh64 hash
+ *
+ * @state: Produce the current xxh64 hash of this state.
+ *
+ * A hash value can be produced at any time. It is still possible to continue
+ * inserting input into the hash state after a call to xxh64_digest(), and
+ * generate new hashes later on, by calling xxh64_digest() again.
+ *
+ * Return: The xxh64 hash stored in the state.
+ */
+uint64_t xxh64_digest(const struct xxh64_state *state);
+
+/*-**************************
+ * Utils
+ ***************************/
+
+/**
+ * xxh32_copy_state() - copy the source state into the destination state
+ *
+ * @src: The source xxh32 state.
+ * @dst: The destination xxh32 state.
+ */
+void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src);
+
+/**
+ * xxh64_copy_state() - copy the source state into the destination state
+ *
+ * @src: The source xxh64 state.
+ * @dst: The destination xxh64 state.
+ */
+void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src);
+
+#endif /* XXHASH_H */
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 57a8e98f2708..2219cce81ca4 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -47,6 +47,8 @@ void zs_destroy_pool(struct zs_pool *pool);
unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags);
void zs_free(struct zs_pool *pool, unsigned long obj);
+size_t zs_huge_class_size(struct zs_pool *pool);
+
void *zs_map_object(struct zs_pool *pool, unsigned long handle,
enum zs_mapmode mm);
void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
diff --git a/include/linux/zstd.h b/include/linux/zstd.h
new file mode 100644
index 000000000000..249575e2485f
--- /dev/null
+++ b/include/linux/zstd.h
@@ -0,0 +1,1157 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of https://github.com/facebook/zstd.
+ * An additional grant of patent rights can be found in the PATENTS file in the
+ * same directory.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ */
+
+#ifndef ZSTD_H
+#define ZSTD_H
+
+/* ====== Dependency ======*/
+#include <linux/types.h> /* size_t */
+
+
+/*-*****************************************************************************
+ * Introduction
+ *
+ * zstd, short for Zstandard, is a fast lossless compression algorithm,
+ * targeting real-time compression scenarios at zlib-level and better
+ * compression ratios. The zstd compression library provides in-memory
+ * compression and decompression functions. The library supports compression
+ * levels from 1 up to ZSTD_maxCLevel() which is 22. Levels >= 20, labeled
+ * ultra, should be used with caution, as they require more memory.
+ * Compression can be done in:
+ * - a single step, reusing a context (described as Explicit memory management)
+ * - unbounded multiple steps (described as Streaming compression)
+ * The compression ratio achievable on small data can be highly improved using
+ * compression with a dictionary in:
+ * - a single step (described as Simple dictionary API)
+ * - a single step, reusing a dictionary (described as Fast dictionary API)
+ ******************************************************************************/
+
+/*====== Helper functions ======*/
+
+/**
+ * enum ZSTD_ErrorCode - zstd error codes
+ *
+ * Functions that return size_t can be checked for errors using ZSTD_isError()
+ * and the ZSTD_ErrorCode can be extracted using ZSTD_getErrorCode().
+ */
+typedef enum {
+ ZSTD_error_no_error,
+ ZSTD_error_GENERIC,
+ ZSTD_error_prefix_unknown,
+ ZSTD_error_version_unsupported,
+ ZSTD_error_parameter_unknown,
+ ZSTD_error_frameParameter_unsupported,
+ ZSTD_error_frameParameter_unsupportedBy32bits,
+ ZSTD_error_frameParameter_windowTooLarge,
+ ZSTD_error_compressionParameter_unsupported,
+ ZSTD_error_init_missing,
+ ZSTD_error_memory_allocation,
+ ZSTD_error_stage_wrong,
+ ZSTD_error_dstSize_tooSmall,
+ ZSTD_error_srcSize_wrong,
+ ZSTD_error_corruption_detected,
+ ZSTD_error_checksum_wrong,
+ ZSTD_error_tableLog_tooLarge,
+ ZSTD_error_maxSymbolValue_tooLarge,
+ ZSTD_error_maxSymbolValue_tooSmall,
+ ZSTD_error_dictionary_corrupted,
+ ZSTD_error_dictionary_wrong,
+ ZSTD_error_dictionaryCreation_failed,
+ ZSTD_error_maxCode
+} ZSTD_ErrorCode;
+
+/**
+ * ZSTD_maxCLevel() - maximum compression level available
+ *
+ * Return: Maximum compression level available.
+ */
+int ZSTD_maxCLevel(void);
+/**
+ * ZSTD_compressBound() - maximum compressed size in worst case scenario
+ * @srcSize: The size of the data to compress.
+ *
+ * Return: The maximum compressed size in the worst case scenario.
+ */
+size_t ZSTD_compressBound(size_t srcSize);
+/**
+ * ZSTD_isError() - tells if a size_t function result is an error code
+ * @code: The function result to check for error.
+ *
+ * Return: Non-zero iff the code is an error.
+ */
+static __attribute__((unused)) unsigned int ZSTD_isError(size_t code)
+{
+ return code > (size_t)-ZSTD_error_maxCode;
+}
+/**
+ * ZSTD_getErrorCode() - translates an error function result to a ZSTD_ErrorCode
+ * @functionResult: The result of a function for which ZSTD_isError() is true.
+ *
+ * Return: The ZSTD_ErrorCode corresponding to the functionResult or 0
+ * if the functionResult isn't an error.
+ */
+static __attribute__((unused)) ZSTD_ErrorCode ZSTD_getErrorCode(
+ size_t functionResult)
+{
+ if (!ZSTD_isError(functionResult))
+ return (ZSTD_ErrorCode)0;
+ return (ZSTD_ErrorCode)(0 - functionResult);
+}
+
+/**
+ * enum ZSTD_strategy - zstd compression search strategy
+ *
+ * From faster to stronger.
+ */
+typedef enum {
+ ZSTD_fast,
+ ZSTD_dfast,
+ ZSTD_greedy,
+ ZSTD_lazy,
+ ZSTD_lazy2,
+ ZSTD_btlazy2,
+ ZSTD_btopt,
+ ZSTD_btopt2
+} ZSTD_strategy;
+
+/**
+ * struct ZSTD_compressionParameters - zstd compression parameters
+ * @windowLog: Log of the largest match distance. Larger means more
+ * compression, and more memory needed during decompression.
+ * @chainLog: Fully searched segment. Larger means more compression, slower,
+ * and more memory (useless for fast).
+ * @hashLog: Dispatch table. Larger means more compression,
+ * slower, and more memory.
+ * @searchLog: Number of searches. Larger means more compression and slower.
+ * @searchLength: Match length searched. Larger means faster decompression,
+ * sometimes less compression.
+ * @targetLength: Acceptable match size for optimal parser (only). Larger means
+ * more compression, and slower.
+ * @strategy: The zstd compression strategy.
+ */
+typedef struct {
+ unsigned int windowLog;
+ unsigned int chainLog;
+ unsigned int hashLog;
+ unsigned int searchLog;
+ unsigned int searchLength;
+ unsigned int targetLength;
+ ZSTD_strategy strategy;
+} ZSTD_compressionParameters;
+
+/**
+ * struct ZSTD_frameParameters - zstd frame parameters
+ * @contentSizeFlag: Controls whether content size will be present in the frame
+ * header (when known).
+ * @checksumFlag: Controls whether a 32-bit checksum is generated at the end
+ * of the frame for error detection.
+ * @noDictIDFlag: Controls whether dictID will be saved into the frame header
+ * when using dictionary compression.
+ *
+ * The default value is all fields set to 0.
+ */
+typedef struct {
+ unsigned int contentSizeFlag;
+ unsigned int checksumFlag;
+ unsigned int noDictIDFlag;
+} ZSTD_frameParameters;
+
+/**
+ * struct ZSTD_parameters - zstd parameters
+ * @cParams: The compression parameters.
+ * @fParams: The frame parameters.
+ */
+typedef struct {
+ ZSTD_compressionParameters cParams;
+ ZSTD_frameParameters fParams;
+} ZSTD_parameters;
+
+/**
+ * ZSTD_getCParams() - returns ZSTD_compressionParameters for selected level
+ * @compressionLevel: The compression level from 1 to ZSTD_maxCLevel().
+ * @estimatedSrcSize: The estimated source size to compress or 0 if unknown.
+ * @dictSize: The dictionary size or 0 if a dictionary isn't being used.
+ *
+ * Return: The selected ZSTD_compressionParameters.
+ */
+ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel,
+ unsigned long long estimatedSrcSize, size_t dictSize);
+
+/**
+ * ZSTD_getParams() - returns ZSTD_parameters for selected level
+ * @compressionLevel: The compression level from 1 to ZSTD_maxCLevel().
+ * @estimatedSrcSize: The estimated source size to compress or 0 if unknown.
+ * @dictSize: The dictionary size or 0 if a dictionary isn't being used.
+ *
+ * The same as ZSTD_getCParams() except also selects the default frame
+ * parameters (all zero).
+ *
+ * Return: The selected ZSTD_parameters.
+ */
+ZSTD_parameters ZSTD_getParams(int compressionLevel,
+ unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*-*************************************
+ * Explicit memory management
+ **************************************/
+
+/**
+ * ZSTD_CCtxWorkspaceBound() - amount of memory needed to initialize a ZSTD_CCtx
+ * @cParams: The compression parameters to be used for compression.
+ *
+ * If multiple compression parameters might be used, the caller must call
+ * ZSTD_CCtxWorkspaceBound() for each set of parameters and use the maximum
+ * size.
+ *
+ * Return: A lower bound on the size of the workspace that is passed to
+ * ZSTD_initCCtx().
+ */
+size_t ZSTD_CCtxWorkspaceBound(ZSTD_compressionParameters cParams);
+
+/**
+ * struct ZSTD_CCtx - the zstd compression context
+ *
+ * When compressing many times it is recommended to allocate a context just once
+ * and reuse it for each successive compression operation.
+ */
+typedef struct ZSTD_CCtx_s ZSTD_CCtx;
+/**
+ * ZSTD_initCCtx() - initialize a zstd compression context
+ * @workspace: The workspace to emplace the context into. It must outlive
+ * the returned context.
+ * @workspaceSize: The size of workspace. Use ZSTD_CCtxWorkspaceBound() to
+ * determine how large the workspace must be.
+ *
+ * Return: A compression context emplaced into workspace.
+ */
+ZSTD_CCtx *ZSTD_initCCtx(void *workspace, size_t workspaceSize);
+
+/**
+ * ZSTD_compressCCtx() - compress src into dst
+ * @ctx: The context. Must have been initialized with a workspace at
+ * least as large as ZSTD_CCtxWorkspaceBound(params.cParams).
+ * @dst: The buffer to compress src into.
+ * @dstCapacity: The size of the destination buffer. May be any size, but
+ * ZSTD_compressBound(srcSize) is guaranteed to be large enough.
+ * @src: The data to compress.
+ * @srcSize: The size of the data to compress.
+ * @params: The parameters to use for compression. See ZSTD_getParams().
+ *
+ * Return: The compressed size or an error, which can be checked using
+ * ZSTD_isError().
+ */
+size_t ZSTD_compressCCtx(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity,
+ const void *src, size_t srcSize, ZSTD_parameters params);
+
+/**
+ * ZSTD_DCtxWorkspaceBound() - amount of memory needed to initialize a ZSTD_DCtx
+ *
+ * Return: A lower bound on the size of the workspace that is passed to
+ * ZSTD_initDCtx().
+ */
+size_t ZSTD_DCtxWorkspaceBound(void);
+
+/**
+ * struct ZSTD_DCtx - the zstd decompression context
+ *
+ * When decompressing many times it is recommended to allocate a context just
+ * once and reuse it for each successive decompression operation.
+ */
+typedef struct ZSTD_DCtx_s ZSTD_DCtx;
+/**
+ * ZSTD_initDCtx() - initialize a zstd decompression context
+ * @workspace: The workspace to emplace the context into. It must outlive
+ * the returned context.
+ * @workspaceSize: The size of workspace. Use ZSTD_DCtxWorkspaceBound() to
+ * determine how large the workspace must be.
+ *
+ * Return: A decompression context emplaced into workspace.
+ */
+ZSTD_DCtx *ZSTD_initDCtx(void *workspace, size_t workspaceSize);
+
+/**
+ * ZSTD_decompressDCtx() - decompress zstd compressed src into dst
+ * @ctx: The decompression context.
+ * @dst: The buffer to decompress src into.
+ * @dstCapacity: The size of the destination buffer. Must be at least as large
+ * as the decompressed size. If the caller cannot upper bound the
+ * decompressed size, then it's better to use the streaming API.
+ * @src: The zstd compressed data to decompress. Multiple concatenated
+ * frames and skippable frames are allowed.
+ * @srcSize: The exact size of the data to decompress.
+ *
+ * Return: The decompressed size or an error, which can be checked using
+ * ZSTD_isError().
+ */
+size_t ZSTD_decompressDCtx(ZSTD_DCtx *ctx, void *dst, size_t dstCapacity,
+ const void *src, size_t srcSize);
+
+/*-************************
+ * Simple dictionary API
+ **************************/
+
+/**
+ * ZSTD_compress_usingDict() - compress src into dst using a dictionary
+ * @ctx: The context. Must have been initialized with a workspace at
+ * least as large as ZSTD_CCtxWorkspaceBound(params.cParams).
+ * @dst: The buffer to compress src into.
+ * @dstCapacity: The size of the destination buffer. May be any size, but
+ * ZSTD_compressBound(srcSize) is guaranteed to be large enough.
+ * @src: The data to compress.
+ * @srcSize: The size of the data to compress.
+ * @dict: The dictionary to use for compression.
+ * @dictSize: The size of the dictionary.
+ * @params: The parameters to use for compression. See ZSTD_getParams().
+ *
+ * Compression using a predefined dictionary. The same dictionary must be used
+ * during decompression.
+ *
+ * Return: The compressed size or an error, which can be checked using
+ * ZSTD_isError().
+ */
+size_t ZSTD_compress_usingDict(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity,
+ const void *src, size_t srcSize, const void *dict, size_t dictSize,
+ ZSTD_parameters params);
+
+/**
+ * ZSTD_decompress_usingDict() - decompress src into dst using a dictionary
+ * @ctx: The decompression context.
+ * @dst: The buffer to decompress src into.
+ * @dstCapacity: The size of the destination buffer. Must be at least as large
+ * as the decompressed size. If the caller cannot upper bound the
+ * decompressed size, then it's better to use the streaming API.
+ * @src: The zstd compressed data to decompress. Multiple concatenated
+ * frames and skippable frames are allowed.
+ * @srcSize: The exact size of the data to decompress.
+ * @dict: The dictionary to use for decompression. The same dictionary
+ * must've been used to compress the data.
+ * @dictSize: The size of the dictionary.
+ *
+ * Return: The decompressed size or an error, which can be checked using
+ * ZSTD_isError().
+ */
+size_t ZSTD_decompress_usingDict(ZSTD_DCtx *ctx, void *dst, size_t dstCapacity,
+ const void *src, size_t srcSize, const void *dict, size_t dictSize);
+
+/*-**************************
+ * Fast dictionary API
+ ***************************/
+
+/**
+ * ZSTD_CDictWorkspaceBound() - memory needed to initialize a ZSTD_CDict
+ * @cParams: The compression parameters to be used for compression.
+ *
+ * Return: A lower bound on the size of the workspace that is passed to
+ * ZSTD_initCDict().
+ */
+size_t ZSTD_CDictWorkspaceBound(ZSTD_compressionParameters cParams);
+
+/**
+ * struct ZSTD_CDict - a digested dictionary to be used for compression
+ */
+typedef struct ZSTD_CDict_s ZSTD_CDict;
+
+/**
+ * ZSTD_initCDict() - initialize a digested dictionary for compression
+ * @dictBuffer: The dictionary to digest. The buffer is referenced by the
+ * ZSTD_CDict so it must outlive the returned ZSTD_CDict.
+ * @dictSize: The size of the dictionary.
+ * @params: The parameters to use for compression. See ZSTD_getParams().
+ * @workspace: The workspace. It must outlive the returned ZSTD_CDict.
+ * @workspaceSize: The workspace size. Must be at least
+ * ZSTD_CDictWorkspaceBound(params.cParams).
+ *
+ * When compressing multiple messages / blocks with the same dictionary it is
+ * recommended to load it just once. The ZSTD_CDict merely references the
+ * dictBuffer, so it must outlive the returned ZSTD_CDict.
+ *
+ * Return: The digested dictionary emplaced into workspace.
+ */
+ZSTD_CDict *ZSTD_initCDict(const void *dictBuffer, size_t dictSize,
+ ZSTD_parameters params, void *workspace, size_t workspaceSize);
+
+/**
+ * ZSTD_compress_usingCDict() - compress src into dst using a ZSTD_CDict
+ * @ctx: The context. Must have been initialized with a workspace at
+ * least as large as ZSTD_CCtxWorkspaceBound(cParams) where
+ * cParams are the compression parameters used to initialize the
+ * cdict.
+ * @dst: The buffer to compress src into.
+ * @dstCapacity: The size of the destination buffer. May be any size, but
+ * ZSTD_compressBound(srcSize) is guaranteed to be large enough.
+ * @src: The data to compress.
+ * @srcSize: The size of the data to compress.
+ * @cdict: The digested dictionary to use for compression.
+ * @params: The parameters to use for compression. See ZSTD_getParams().
+ *
+ * Compression using a digested dictionary. The same dictionary must be used
+ * during decompression.
+ *
+ * Return: The compressed size or an error, which can be checked using
+ * ZSTD_isError().
+ */
+size_t ZSTD_compress_usingCDict(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity,
+ const void *src, size_t srcSize, const ZSTD_CDict *cdict);
+
+
+/**
+ * ZSTD_DDictWorkspaceBound() - memory needed to initialize a ZSTD_DDict
+ *
+ * Return: A lower bound on the size of the workspace that is passed to
+ * ZSTD_initDDict().
+ */
+size_t ZSTD_DDictWorkspaceBound(void);
+
+/**
+ * struct ZSTD_DDict - a digested dictionary to be used for decompression
+ */
+typedef struct ZSTD_DDict_s ZSTD_DDict;
+
+/**
+ * ZSTD_initDDict() - initialize a digested dictionary for decompression
+ * @dictBuffer: The dictionary to digest. The buffer is referenced by the
+ * ZSTD_DDict so it must outlive the returned ZSTD_DDict.
+ * @dictSize: The size of the dictionary.
+ * @workspace: The workspace. It must outlive the returned ZSTD_DDict.
+ * @workspaceSize: The workspace size. Must be at least
+ * ZSTD_DDictWorkspaceBound().
+ *
+ * When decompressing multiple messages / blocks with the same dictionary it is
+ * recommended to load it just once. The ZSTD_DDict merely references the
+ * dictBuffer, so it must outlive the returned ZSTD_DDict.
+ *
+ * Return: The digested dictionary emplaced into workspace.
+ */
+ZSTD_DDict *ZSTD_initDDict(const void *dictBuffer, size_t dictSize,
+ void *workspace, size_t workspaceSize);
+
+/**
+ * ZSTD_decompress_usingDDict() - decompress src into dst using a ZSTD_DDict
+ * @ctx: The decompression context.
+ * @dst: The buffer to decompress src into.
+ * @dstCapacity: The size of the destination buffer. Must be at least as large
+ * as the decompressed size. If the caller cannot upper bound the
+ * decompressed size, then it's better to use the streaming API.
+ * @src: The zstd compressed data to decompress. Multiple concatenated
+ * frames and skippable frames are allowed.
+ * @srcSize: The exact size of the data to decompress.
+ * @ddict: The digested dictionary to use for decompression. The same
+ * dictionary must've been used to compress the data.
+ *
+ * Return: The decompressed size or an error, which can be checked using
+ * ZSTD_isError().
+ */
+size_t ZSTD_decompress_usingDDict(ZSTD_DCtx *dctx, void *dst,
+ size_t dstCapacity, const void *src, size_t srcSize,
+ const ZSTD_DDict *ddict);
+
+
+/*-**************************
+ * Streaming
+ ***************************/
+
+/**
+ * struct ZSTD_inBuffer - input buffer for streaming
+ * @src: Start of the input buffer.
+ * @size: Size of the input buffer.
+ * @pos: Position where reading stopped. Will be updated.
+ * Necessarily 0 <= pos <= size.
+ */
+typedef struct ZSTD_inBuffer_s {
+ const void *src;
+ size_t size;
+ size_t pos;
+} ZSTD_inBuffer;
+
+/**
+ * struct ZSTD_outBuffer - output buffer for streaming
+ * @dst: Start of the output buffer.
+ * @size: Size of the output buffer.
+ * @pos: Position where writing stopped. Will be updated.
+ * Necessarily 0 <= pos <= size.
+ */
+typedef struct ZSTD_outBuffer_s {
+ void *dst;
+ size_t size;
+ size_t pos;
+} ZSTD_outBuffer;
+
+
+
+/*-*****************************************************************************
+ * Streaming compression - HowTo
+ *
+ * A ZSTD_CStream object is required to track streaming operation.
+ * Use ZSTD_initCStream() to initialize a ZSTD_CStream object.
+ * ZSTD_CStream objects can be reused multiple times on consecutive compression
+ * operations. It is recommended to re-use ZSTD_CStream in situations where many
+ * streaming operations will be achieved consecutively. Use one separate
+ * ZSTD_CStream per thread for parallel execution.
+ *
+ * Use ZSTD_compressStream() repetitively to consume input stream.
+ * The function will automatically update both `pos` fields.
+ * Note that it may not consume the entire input, in which case `pos < size`,
+ * and it's up to the caller to present again remaining data.
+ * It returns a hint for the preferred number of bytes to use as an input for
+ * the next function call.
+ *
+ * At any moment, it's possible to flush whatever data remains within internal
+ * buffer, using ZSTD_flushStream(). `output->pos` will be updated. There might
+ * still be some content left within the internal buffer if `output->size` is
+ * too small. It returns the number of bytes left in the internal buffer and
+ * must be called until it returns 0.
+ *
+ * ZSTD_endStream() instructs to finish a frame. It will perform a flush and
+ * write frame epilogue. The epilogue is required for decoders to consider a
+ * frame completed. Similar to ZSTD_flushStream(), it may not be able to flush
+ * the full content if `output->size` is too small. In which case, call again
+ * ZSTD_endStream() to complete the flush. It returns the number of bytes left
+ * in the internal buffer and must be called until it returns 0.
+ ******************************************************************************/
+
+/**
+ * ZSTD_CStreamWorkspaceBound() - memory needed to initialize a ZSTD_CStream
+ * @cParams: The compression parameters to be used for compression.
+ *
+ * Return: A lower bound on the size of the workspace that is passed to
+ * ZSTD_initCStream() and ZSTD_initCStream_usingCDict().
+ */
+size_t ZSTD_CStreamWorkspaceBound(ZSTD_compressionParameters cParams);
+
+/**
+ * struct ZSTD_CStream - the zstd streaming compression context
+ */
+typedef struct ZSTD_CStream_s ZSTD_CStream;
+
+/*===== ZSTD_CStream management functions =====*/
+/**
+ * ZSTD_initCStream() - initialize a zstd streaming compression context
+ * @params: The zstd compression parameters.
+ * @pledgedSrcSize: If params.fParams.contentSizeFlag == 1 then the caller must
+ * pass the source size (zero means empty source). Otherwise,
+ * the caller may optionally pass the source size, or zero if
+ * unknown.
+ * @workspace: The workspace to emplace the context into. It must outlive
+ * the returned context.
+ * @workspaceSize: The size of workspace.
+ * Use ZSTD_CStreamWorkspaceBound(params.cParams) to determine
+ * how large the workspace must be.
+ *
+ * Return: The zstd streaming compression context.
+ */
+ZSTD_CStream *ZSTD_initCStream(ZSTD_parameters params,
+ unsigned long long pledgedSrcSize, void *workspace,
+ size_t workspaceSize);
+
+/**
+ * ZSTD_initCStream_usingCDict() - initialize a streaming compression context
+ * @cdict: The digested dictionary to use for compression.
+ * @pledgedSrcSize: Optionally the source size, or zero if unknown.
+ * @workspace: The workspace to emplace the context into. It must outlive
+ * the returned context.
+ * @workspaceSize: The size of workspace. Call ZSTD_CStreamWorkspaceBound()
+ * with the cParams used to initialize the cdict to determine
+ * how large the workspace must be.
+ *
+ * Return: The zstd streaming compression context.
+ */
+ZSTD_CStream *ZSTD_initCStream_usingCDict(const ZSTD_CDict *cdict,
+ unsigned long long pledgedSrcSize, void *workspace,
+ size_t workspaceSize);
+
+/*===== Streaming compression functions =====*/
+/**
+ * ZSTD_resetCStream() - reset the context using parameters from creation
+ * @zcs: The zstd streaming compression context to reset.
+ * @pledgedSrcSize: Optionally the source size, or zero if unknown.
+ *
+ * Resets the context using the parameters from creation. Skips dictionary
+ * loading, since it can be reused. If `pledgedSrcSize` is non-zero the frame
+ * content size is always written into the frame header.
+ *
+ * Return: Zero or an error, which can be checked using ZSTD_isError().
+ */
+size_t ZSTD_resetCStream(ZSTD_CStream *zcs, unsigned long long pledgedSrcSize);
+/**
+ * ZSTD_compressStream() - streaming compress some of input into output
+ * @zcs: The zstd streaming compression context.
+ * @output: Destination buffer. `output->pos` is updated to indicate how much
+ * compressed data was written.
+ * @input: Source buffer. `input->pos` is updated to indicate how much data was
+ * read. Note that it may not consume the entire input, in which case
+ * `input->pos < input->size`, and it's up to the caller to present
+ * remaining data again.
+ *
+ * The `input` and `output` buffers may be any size. Guaranteed to make some
+ * forward progress if `input` and `output` are not empty.
+ *
+ * Return: A hint for the number of bytes to use as the input for the next
+ * function call or an error, which can be checked using
+ * ZSTD_isError().
+ */
+size_t ZSTD_compressStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output,
+ ZSTD_inBuffer *input);
+/**
+ * ZSTD_flushStream() - flush internal buffers into output
+ * @zcs: The zstd streaming compression context.
+ * @output: Destination buffer. `output->pos` is updated to indicate how much
+ * compressed data was written.
+ *
+ * ZSTD_flushStream() must be called until it returns 0, meaning all the data
+ * has been flushed. Since ZSTD_flushStream() causes a block to be ended,
+ * calling it too often will degrade the compression ratio.
+ *
+ * Return: The number of bytes still present within internal buffers or an
+ * error, which can be checked using ZSTD_isError().
+ */
+size_t ZSTD_flushStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output);
+/**
+ * ZSTD_endStream() - flush internal buffers into output and end the frame
+ * @zcs: The zstd streaming compression context.
+ * @output: Destination buffer. `output->pos` is updated to indicate how much
+ * compressed data was written.
+ *
+ * ZSTD_endStream() must be called until it returns 0, meaning all the data has
+ * been flushed and the frame epilogue has been written.
+ *
+ * Return: The number of bytes still present within internal buffers or an
+ * error, which can be checked using ZSTD_isError().
+ */
+size_t ZSTD_endStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output);
+
+/**
+ * ZSTD_CStreamInSize() - recommended size for the input buffer
+ *
+ * Return: The recommended size for the input buffer.
+ */
+size_t ZSTD_CStreamInSize(void);
+/**
+ * ZSTD_CStreamOutSize() - recommended size for the output buffer
+ *
+ * When the output buffer is at least this large, it is guaranteed to be large
+ * enough to flush at least one complete compressed block.
+ *
+ * Return: The recommended size for the output buffer.
+ */
+size_t ZSTD_CStreamOutSize(void);
+
+
+
+/*-*****************************************************************************
+ * Streaming decompression - HowTo
+ *
+ * A ZSTD_DStream object is required to track streaming operations.
+ * Use ZSTD_initDStream() to initialize a ZSTD_DStream object.
+ * ZSTD_DStream objects can be re-used multiple times.
+ *
+ * Use ZSTD_decompressStream() repetitively to consume your input.
+ * The function will update both `pos` fields.
+ * If `input->pos < input->size`, some input has not been consumed.
+ * It's up to the caller to present again remaining data.
+ * If `output->pos < output->size`, decoder has flushed everything it could.
+ * Returns 0 iff a frame is completely decoded and fully flushed.
+ * Otherwise it returns a suggested next input size that will never load more
+ * than the current frame.
+ ******************************************************************************/
+
+/**
+ * ZSTD_DStreamWorkspaceBound() - memory needed to initialize a ZSTD_DStream
+ * @maxWindowSize: The maximum window size allowed for compressed frames.
+ *
+ * Return: A lower bound on the size of the workspace that is passed to
+ * ZSTD_initDStream() and ZSTD_initDStream_usingDDict().
+ */
+size_t ZSTD_DStreamWorkspaceBound(size_t maxWindowSize);
+
+/**
+ * struct ZSTD_DStream - the zstd streaming decompression context
+ */
+typedef struct ZSTD_DStream_s ZSTD_DStream;
+/*===== ZSTD_DStream management functions =====*/
+/**
+ * ZSTD_initDStream() - initialize a zstd streaming decompression context
+ * @maxWindowSize: The maximum window size allowed for compressed frames.
+ * @workspace: The workspace to emplace the context into. It must outlive
+ * the returned context.
+ * @workspaceSize: The size of workspace.
+ * Use ZSTD_DStreamWorkspaceBound(maxWindowSize) to determine
+ * how large the workspace must be.
+ *
+ * Return: The zstd streaming decompression context.
+ */
+ZSTD_DStream *ZSTD_initDStream(size_t maxWindowSize, void *workspace,
+ size_t workspaceSize);
+/**
+ * ZSTD_initDStream_usingDDict() - initialize streaming decompression context
+ * @maxWindowSize: The maximum window size allowed for compressed frames.
+ * @ddict: The digested dictionary to use for decompression.
+ * @workspace: The workspace to emplace the context into. It must outlive
+ * the returned context.
+ * @workspaceSize: The size of workspace.
+ * Use ZSTD_DStreamWorkspaceBound(maxWindowSize) to determine
+ * how large the workspace must be.
+ *
+ * Return: The zstd streaming decompression context.
+ */
+ZSTD_DStream *ZSTD_initDStream_usingDDict(size_t maxWindowSize,
+ const ZSTD_DDict *ddict, void *workspace, size_t workspaceSize);
+
+/*===== Streaming decompression functions =====*/
+/**
+ * ZSTD_resetDStream() - reset the context using parameters from creation
+ * @zds: The zstd streaming decompression context to reset.
+ *
+ * Resets the context using the parameters from creation. Skips dictionary
+ * loading, since it can be reused.
+ *
+ * Return: Zero or an error, which can be checked using ZSTD_isError().
+ */
+size_t ZSTD_resetDStream(ZSTD_DStream *zds);
+/**
+ * ZSTD_decompressStream() - streaming decompress some of input into output
+ * @zds: The zstd streaming decompression context.
+ * @output: Destination buffer. `output.pos` is updated to indicate how much
+ * decompressed data was written.
+ * @input: Source buffer. `input.pos` is updated to indicate how much data was
+ * read. Note that it may not consume the entire input, in which case
+ * `input.pos < input.size`, and it's up to the caller to present
+ * remaining data again.
+ *
+ * The `input` and `output` buffers may be any size. Guaranteed to make some
+ * forward progress if `input` and `output` are not empty.
+ * ZSTD_decompressStream() will not consume the last byte of the frame until
+ * the entire frame is flushed.
+ *
+ * Return: Returns 0 iff a frame is completely decoded and fully flushed.
+ * Otherwise returns a hint for the number of bytes to use as the input
+ * for the next function call or an error, which can be checked using
+ * ZSTD_isError(). The size hint will never load more than the frame.
+ */
+size_t ZSTD_decompressStream(ZSTD_DStream *zds, ZSTD_outBuffer *output,
+ ZSTD_inBuffer *input);
+
+/**
+ * ZSTD_DStreamInSize() - recommended size for the input buffer
+ *
+ * Return: The recommended size for the input buffer.
+ */
+size_t ZSTD_DStreamInSize(void);
+/**
+ * ZSTD_DStreamOutSize() - recommended size for the output buffer
+ *
+ * When the output buffer is at least this large, it is guaranteed to be large
+ * enough to flush at least one complete decompressed block.
+ *
+ * Return: The recommended size for the output buffer.
+ */
+size_t ZSTD_DStreamOutSize(void);
+
+
+/* --- Constants ---*/
+#define ZSTD_MAGICNUMBER 0xFD2FB528 /* >= v0.8.0 */
+#define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50U
+
+#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1)
+#define ZSTD_CONTENTSIZE_ERROR (0ULL - 2)
+
+#define ZSTD_WINDOWLOG_MAX_32 27
+#define ZSTD_WINDOWLOG_MAX_64 27
+#define ZSTD_WINDOWLOG_MAX \
+ ((unsigned int)(sizeof(size_t) == 4 \
+ ? ZSTD_WINDOWLOG_MAX_32 \
+ : ZSTD_WINDOWLOG_MAX_64))
+#define ZSTD_WINDOWLOG_MIN 10
+#define ZSTD_HASHLOG_MAX ZSTD_WINDOWLOG_MAX
+#define ZSTD_HASHLOG_MIN 6
+#define ZSTD_CHAINLOG_MAX (ZSTD_WINDOWLOG_MAX+1)
+#define ZSTD_CHAINLOG_MIN ZSTD_HASHLOG_MIN
+#define ZSTD_HASHLOG3_MAX 17
+#define ZSTD_SEARCHLOG_MAX (ZSTD_WINDOWLOG_MAX-1)
+#define ZSTD_SEARCHLOG_MIN 1
+/* only for ZSTD_fast, other strategies are limited to 6 */
+#define ZSTD_SEARCHLENGTH_MAX 7
+/* only for ZSTD_btopt, other strategies are limited to 4 */
+#define ZSTD_SEARCHLENGTH_MIN 3
+#define ZSTD_TARGETLENGTH_MIN 4
+#define ZSTD_TARGETLENGTH_MAX 999
+
+/* for static allocation */
+#define ZSTD_FRAMEHEADERSIZE_MAX 18
+#define ZSTD_FRAMEHEADERSIZE_MIN 6
+static const size_t ZSTD_frameHeaderSize_prefix = 5;
+static const size_t ZSTD_frameHeaderSize_min = ZSTD_FRAMEHEADERSIZE_MIN;
+static const size_t ZSTD_frameHeaderSize_max = ZSTD_FRAMEHEADERSIZE_MAX;
+/* magic number + skippable frame length */
+static const size_t ZSTD_skippableHeaderSize = 8;
+
+
+/*-*************************************
+ * Compressed size functions
+ **************************************/
+
+/**
+ * ZSTD_findFrameCompressedSize() - returns the size of a compressed frame
+ * @src: Source buffer. It should point to the start of a zstd encoded frame
+ * or a skippable frame.
+ * @srcSize: The size of the source buffer. It must be at least as large as the
+ * size of the frame.
+ *
+ * Return: The compressed size of the frame pointed to by `src` or an error,
+ * which can be check with ZSTD_isError().
+ * Suitable to pass to ZSTD_decompress() or similar functions.
+ */
+size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize);
+
+/*-*************************************
+ * Decompressed size functions
+ **************************************/
+/**
+ * ZSTD_getFrameContentSize() - returns the content size in a zstd frame header
+ * @src: It should point to the start of a zstd encoded frame.
+ * @srcSize: The size of the source buffer. It must be at least as large as the
+ * frame header. `ZSTD_frameHeaderSize_max` is always large enough.
+ *
+ * Return: The frame content size stored in the frame header if known.
+ * `ZSTD_CONTENTSIZE_UNKNOWN` if the content size isn't stored in the
+ * frame header. `ZSTD_CONTENTSIZE_ERROR` on invalid input.
+ */
+unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
+
+/**
+ * ZSTD_findDecompressedSize() - returns decompressed size of a series of frames
+ * @src: It should point to the start of a series of zstd encoded and/or
+ * skippable frames.
+ * @srcSize: The exact size of the series of frames.
+ *
+ * If any zstd encoded frame in the series doesn't have the frame content size
+ * set, `ZSTD_CONTENTSIZE_UNKNOWN` is returned. But frame content size is always
+ * set when using ZSTD_compress(). The decompressed size can be very large.
+ * If the source is untrusted, the decompressed size could be wrong or
+ * intentionally modified. Always ensure the result fits within the
+ * application's authorized limits. ZSTD_findDecompressedSize() handles multiple
+ * frames, and so it must traverse the input to read each frame header. This is
+ * efficient as most of the data is skipped, however it does mean that all frame
+ * data must be present and valid.
+ *
+ * Return: Decompressed size of all the data contained in the frames if known.
+ * `ZSTD_CONTENTSIZE_UNKNOWN` if the decompressed size is unknown.
+ * `ZSTD_CONTENTSIZE_ERROR` if an error occurred.
+ */
+unsigned long long ZSTD_findDecompressedSize(const void *src, size_t srcSize);
+
+/*-*************************************
+ * Advanced compression functions
+ **************************************/
+/**
+ * ZSTD_checkCParams() - ensure parameter values remain within authorized range
+ * @cParams: The zstd compression parameters.
+ *
+ * Return: Zero or an error, which can be checked using ZSTD_isError().
+ */
+size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams);
+
+/**
+ * ZSTD_adjustCParams() - optimize parameters for a given srcSize and dictSize
+ * @srcSize: Optionally the estimated source size, or zero if unknown.
+ * @dictSize: Optionally the estimated dictionary size, or zero if unknown.
+ *
+ * Return: The optimized parameters.
+ */
+ZSTD_compressionParameters ZSTD_adjustCParams(
+ ZSTD_compressionParameters cParams, unsigned long long srcSize,
+ size_t dictSize);
+
+/*--- Advanced decompression functions ---*/
+
+/**
+ * ZSTD_isFrame() - returns true iff the buffer starts with a valid frame
+ * @buffer: The source buffer to check.
+ * @size: The size of the source buffer, must be at least 4 bytes.
+ *
+ * Return: True iff the buffer starts with a zstd or skippable frame identifier.
+ */
+unsigned int ZSTD_isFrame(const void *buffer, size_t size);
+
+/**
+ * ZSTD_getDictID_fromDict() - returns the dictionary id stored in a dictionary
+ * @dict: The dictionary buffer.
+ * @dictSize: The size of the dictionary buffer.
+ *
+ * Return: The dictionary id stored within the dictionary or 0 if the
+ * dictionary is not a zstd dictionary. If it returns 0 the
+ * dictionary can still be loaded as a content-only dictionary.
+ */
+unsigned int ZSTD_getDictID_fromDict(const void *dict, size_t dictSize);
+
+/**
+ * ZSTD_getDictID_fromDDict() - returns the dictionary id stored in a ZSTD_DDict
+ * @ddict: The ddict to find the id of.
+ *
+ * Return: The dictionary id stored within `ddict` or 0 if the dictionary is not
+ * a zstd dictionary. If it returns 0 `ddict` will be loaded as a
+ * content-only dictionary.
+ */
+unsigned int ZSTD_getDictID_fromDDict(const ZSTD_DDict *ddict);
+
+/**
+ * ZSTD_getDictID_fromFrame() - returns the dictionary id stored in a zstd frame
+ * @src: Source buffer. It must be a zstd encoded frame.
+ * @srcSize: The size of the source buffer. It must be at least as large as the
+ * frame header. `ZSTD_frameHeaderSize_max` is always large enough.
+ *
+ * Return: The dictionary id required to decompress the frame stored within
+ * `src` or 0 if the dictionary id could not be decoded. It can return
+ * 0 if the frame does not require a dictionary, the dictionary id
+ * wasn't stored in the frame, `src` is not a zstd frame, or `srcSize`
+ * is too small.
+ */
+unsigned int ZSTD_getDictID_fromFrame(const void *src, size_t srcSize);
+
+/**
+ * struct ZSTD_frameParams - zstd frame parameters stored in the frame header
+ * @frameContentSize: The frame content size, or 0 if not present.
+ * @windowSize: The window size, or 0 if the frame is a skippable frame.
+ * @dictID: The dictionary id, or 0 if not present.
+ * @checksumFlag: Whether a checksum was used.
+ */
+typedef struct {
+ unsigned long long frameContentSize;
+ unsigned int windowSize;
+ unsigned int dictID;
+ unsigned int checksumFlag;
+} ZSTD_frameParams;
+
+/**
+ * ZSTD_getFrameParams() - extracts parameters from a zstd or skippable frame
+ * @fparamsPtr: On success the frame parameters are written here.
+ * @src: The source buffer. It must point to a zstd or skippable frame.
+ * @srcSize: The size of the source buffer. `ZSTD_frameHeaderSize_max` is
+ * always large enough to succeed.
+ *
+ * Return: 0 on success. If more data is required it returns how many bytes
+ * must be provided to make forward progress. Otherwise it returns
+ * an error, which can be checked using ZSTD_isError().
+ */
+size_t ZSTD_getFrameParams(ZSTD_frameParams *fparamsPtr, const void *src,
+ size_t srcSize);
+
+/*-*****************************************************************************
+ * Buffer-less and synchronous inner streaming functions
+ *
+ * This is an advanced API, giving full control over buffer management, for
+ * users which need direct control over memory.
+ * But it's also a complex one, with many restrictions (documented below).
+ * Prefer using normal streaming API for an easier experience
+ ******************************************************************************/
+
+/*-*****************************************************************************
+ * Buffer-less streaming compression (synchronous mode)
+ *
+ * A ZSTD_CCtx object is required to track streaming operations.
+ * Use ZSTD_initCCtx() to initialize a context.
+ * ZSTD_CCtx object can be re-used multiple times within successive compression
+ * operations.
+ *
+ * Start by initializing a context.
+ * Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary
+ * compression,
+ * or ZSTD_compressBegin_advanced(), for finer parameter control.
+ * It's also possible to duplicate a reference context which has already been
+ * initialized, using ZSTD_copyCCtx()
+ *
+ * Then, consume your input using ZSTD_compressContinue().
+ * There are some important considerations to keep in mind when using this
+ * advanced function :
+ * - ZSTD_compressContinue() has no internal buffer. It uses externally provided
+ * buffer only.
+ * - Interface is synchronous : input is consumed entirely and produce 1+
+ * (or more) compressed blocks.
+ * - Caller must ensure there is enough space in `dst` to store compressed data
+ * under worst case scenario. Worst case evaluation is provided by
+ * ZSTD_compressBound().
+ * ZSTD_compressContinue() doesn't guarantee recover after a failed
+ * compression.
+ * - ZSTD_compressContinue() presumes prior input ***is still accessible and
+ * unmodified*** (up to maximum distance size, see WindowLog).
+ * It remembers all previous contiguous blocks, plus one separated memory
+ * segment (which can itself consists of multiple contiguous blocks)
+ * - ZSTD_compressContinue() detects that prior input has been overwritten when
+ * `src` buffer overlaps. In which case, it will "discard" the relevant memory
+ * section from its history.
+ *
+ * Finish a frame with ZSTD_compressEnd(), which will write the last block(s)
+ * and optional checksum. It's possible to use srcSize==0, in which case, it
+ * will write a final empty block to end the frame. Without last block mark,
+ * frames will be considered unfinished (corrupted) by decoders.
+ *
+ * `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress some new
+ * frame.
+ ******************************************************************************/
+
+/*===== Buffer-less streaming compression functions =====*/
+size_t ZSTD_compressBegin(ZSTD_CCtx *cctx, int compressionLevel);
+size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx *cctx, const void *dict,
+ size_t dictSize, int compressionLevel);
+size_t ZSTD_compressBegin_advanced(ZSTD_CCtx *cctx, const void *dict,
+ size_t dictSize, ZSTD_parameters params,
+ unsigned long long pledgedSrcSize);
+size_t ZSTD_copyCCtx(ZSTD_CCtx *cctx, const ZSTD_CCtx *preparedCCtx,
+ unsigned long long pledgedSrcSize);
+size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx *cctx, const ZSTD_CDict *cdict,
+ unsigned long long pledgedSrcSize);
+size_t ZSTD_compressContinue(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity,
+ const void *src, size_t srcSize);
+size_t ZSTD_compressEnd(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity,
+ const void *src, size_t srcSize);
+
+
+
+/*-*****************************************************************************
+ * Buffer-less streaming decompression (synchronous mode)
+ *
+ * A ZSTD_DCtx object is required to track streaming operations.
+ * Use ZSTD_initDCtx() to initialize a context.
+ * A ZSTD_DCtx object can be re-used multiple times.
+ *
+ * First typical operation is to retrieve frame parameters, using
+ * ZSTD_getFrameParams(). It fills a ZSTD_frameParams structure which provide
+ * important information to correctly decode the frame, such as the minimum
+ * rolling buffer size to allocate to decompress data (`windowSize`), and the
+ * dictionary ID used.
+ * Note: content size is optional, it may not be present. 0 means unknown.
+ * Note that these values could be wrong, either because of data malformation,
+ * or because an attacker is spoofing deliberate false information. As a
+ * consequence, check that values remain within valid application range,
+ * especially `windowSize`, before allocation. Each application can set its own
+ * limit, depending on local restrictions. For extended interoperability, it is
+ * recommended to support at least 8 MB.
+ * Frame parameters are extracted from the beginning of the compressed frame.
+ * Data fragment must be large enough to ensure successful decoding, typically
+ * `ZSTD_frameHeaderSize_max` bytes.
+ * Result: 0: successful decoding, the `ZSTD_frameParams` structure is filled.
+ * >0: `srcSize` is too small, provide at least this many bytes.
+ * errorCode, which can be tested using ZSTD_isError().
+ *
+ * Start decompression, with ZSTD_decompressBegin() or
+ * ZSTD_decompressBegin_usingDict(). Alternatively, you can copy a prepared
+ * context, using ZSTD_copyDCtx().
+ *
+ * Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue()
+ * alternatively.
+ * ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize'
+ * to ZSTD_decompressContinue().
+ * ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will
+ * fail.
+ *
+ * The result of ZSTD_decompressContinue() is the number of bytes regenerated
+ * within 'dst' (necessarily <= dstCapacity). It can be zero, which is not an
+ * error; it just means ZSTD_decompressContinue() has decoded some metadata
+ * item. It can also be an error code, which can be tested with ZSTD_isError().
+ *
+ * ZSTD_decompressContinue() needs previous data blocks during decompression, up
+ * to `windowSize`. They should preferably be located contiguously, prior to
+ * current block. Alternatively, a round buffer of sufficient size is also
+ * possible. Sufficient size is determined by frame parameters.
+ * ZSTD_decompressContinue() is very sensitive to contiguity, if 2 blocks don't
+ * follow each other, make sure that either the compressor breaks contiguity at
+ * the same place, or that previous contiguous segment is large enough to
+ * properly handle maximum back-reference.
+ *
+ * A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero.
+ * Context can then be reset to start a new decompression.
+ *
+ * Note: it's possible to know if next input to present is a header or a block,
+ * using ZSTD_nextInputType(). This information is not required to properly
+ * decode a frame.
+ *
+ * == Special case: skippable frames ==
+ *
+ * Skippable frames allow integration of user-defined data into a flow of
+ * concatenated frames. Skippable frames will be ignored (skipped) by a
+ * decompressor. The format of skippable frames is as follows:
+ * a) Skippable frame ID - 4 Bytes, Little endian format, any value from
+ * 0x184D2A50 to 0x184D2A5F
+ * b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits
+ * c) Frame Content - any content (User Data) of length equal to Frame Size
+ * For skippable frames ZSTD_decompressContinue() always returns 0.
+ * For skippable frames ZSTD_getFrameParams() returns fparamsPtr->windowLog==0
+ * what means that a frame is skippable.
+ * Note: If fparamsPtr->frameContentSize==0, it is ambiguous: the frame might
+ * actually be a zstd encoded frame with no content. For purposes of
+ * decompression, it is valid in both cases to skip the frame using
+ * ZSTD_findFrameCompressedSize() to find its size in bytes.
+ * It also returns frame size as fparamsPtr->frameContentSize.
+ ******************************************************************************/
+
+/*===== Buffer-less streaming decompression functions =====*/
+size_t ZSTD_decompressBegin(ZSTD_DCtx *dctx);
+size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx *dctx, const void *dict,
+ size_t dictSize);
+void ZSTD_copyDCtx(ZSTD_DCtx *dctx, const ZSTD_DCtx *preparedDCtx);
+size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx *dctx);
+size_t ZSTD_decompressContinue(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity,
+ const void *src, size_t srcSize);
+typedef enum {
+ ZSTDnit_frameHeader,
+ ZSTDnit_blockHeader,
+ ZSTDnit_block,
+ ZSTDnit_lastBlock,
+ ZSTDnit_checksum,
+ ZSTDnit_skippableFrame
+} ZSTD_nextInputType_e;
+ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx *dctx);
+
+/*-*****************************************************************************
+ * Block functions
+ *
+ * Block functions produce and decode raw zstd blocks, without frame metadata.
+ * Frame metadata cost is typically ~18 bytes, which can be non-negligible for
+ * very small blocks (< 100 bytes). User will have to take in charge required
+ * information to regenerate data, such as compressed and content sizes.
+ *
+ * A few rules to respect:
+ * - Compressing and decompressing require a context structure
+ * + Use ZSTD_initCCtx() and ZSTD_initDCtx()
+ * - It is necessary to init context before starting
+ * + compression : ZSTD_compressBegin()
+ * + decompression : ZSTD_decompressBegin()
+ * + variants _usingDict() are also allowed
+ * + copyCCtx() and copyDCtx() work too
+ * - Block size is limited, it must be <= ZSTD_getBlockSizeMax()
+ * + If you need to compress more, cut data into multiple blocks
+ * + Consider using the regular ZSTD_compress() instead, as frame metadata
+ * costs become negligible when source size is large.
+ * - When a block is considered not compressible enough, ZSTD_compressBlock()
+ * result will be zero. In which case, nothing is produced into `dst`.
+ * + User must test for such outcome and deal directly with uncompressed data
+ * + ZSTD_decompressBlock() doesn't accept uncompressed data as input!!!
+ * + In case of multiple successive blocks, decoder must be informed of
+ * uncompressed block existence to follow proper history. Use
+ * ZSTD_insertBlock() in such a case.
+ ******************************************************************************/
+
+/* Define for static allocation */
+#define ZSTD_BLOCKSIZE_ABSOLUTEMAX (128 * 1024)
+/*===== Raw zstd block functions =====*/
+size_t ZSTD_getBlockSizeMax(ZSTD_CCtx *cctx);
+size_t ZSTD_compressBlock(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity,
+ const void *src, size_t srcSize);
+size_t ZSTD_decompressBlock(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity,
+ const void *src, size_t srcSize);
+size_t ZSTD_insertBlock(ZSTD_DCtx *dctx, const void *blockStart,
+ size_t blockSize);
+
+#endif /* ZSTD_H */
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index b8ee8a113e32..858f308d69ca 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -244,6 +244,8 @@ static inline bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset)
void addrconf_prefix_rcv(struct net_device *dev,
u8 *opt, int len, bool sllao);
+u32 addrconf_rt_table(const struct net_device *dev, u32 default_table);
+
/*
* anycast prototypes (anycast.c)
*/
diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 456e4a6006ab..8dbfdf728cd8 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -8,6 +8,11 @@
#include <net/flow.h>
#include <net/rtnetlink.h>
+struct fib_kuid_range {
+ kuid_t start;
+ kuid_t end;
+};
+
struct fib_rule {
struct list_head list;
int iifindex;
@@ -30,6 +35,7 @@ struct fib_rule {
int suppress_prefixlen;
char iifname[IFNAMSIZ];
char oifname[IFNAMSIZ];
+ struct fib_kuid_range uid_range;
struct rcu_head rcu;
};
@@ -92,7 +98,8 @@ struct fib_rules_ops {
[FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \
[FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \
[FRA_GOTO] = { .type = NLA_U32 }, \
- [FRA_L3MDEV] = { .type = NLA_U8 }
+ [FRA_L3MDEV] = { .type = NLA_U8 }, \
+ [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }
static inline void fib_rule_get(struct fib_rule *rule)
{
diff --git a/include/net/flow.h b/include/net/flow.h
index 035aa7716967..6fc3e131237d 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -11,6 +11,7 @@
#include <linux/in6.h>
#include <linux/atomic.h>
#include <net/flow_dissector.h>
+#include <linux/uidgid.h>
/*
* ifindex generation is per-net namespace, and loopback is
@@ -24,6 +25,10 @@ struct flowi_tunnel {
__be64 tun_id;
};
+struct flowi_xfrm {
+ __u32 if_id;
+};
+
struct flowi_common {
int flowic_oif;
int flowic_iif;
@@ -37,6 +42,8 @@ struct flowi_common {
#define FLOWI_FLAG_SKIP_NH_OIF 0x04
__u32 flowic_secid;
struct flowi_tunnel flowic_tun_key;
+ struct flowi_xfrm xfrm;
+ kuid_t flowic_uid;
};
union flowi_uli {
@@ -74,6 +81,8 @@ struct flowi4 {
#define flowi4_flags __fl_common.flowic_flags
#define flowi4_secid __fl_common.flowic_secid
#define flowi4_tun_key __fl_common.flowic_tun_key
+#define flowi4_uid __fl_common.flowic_uid
+#define flowi4_xfrm __fl_common.xfrm
/* (saddr,daddr) must be grouped, same order as in IP header */
__be32 saddr;
@@ -93,7 +102,8 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
__u32 mark, __u8 tos, __u8 scope,
__u8 proto, __u8 flags,
__be32 daddr, __be32 saddr,
- __be16 dport, __be16 sport)
+ __be16 dport, __be16 sport,
+ kuid_t uid)
{
fl4->flowi4_oif = oif;
fl4->flowi4_iif = LOOPBACK_IFINDEX;
@@ -104,6 +114,8 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
fl4->flowi4_flags = flags;
fl4->flowi4_secid = 0;
fl4->flowi4_tun_key.tun_id = 0;
+ fl4->flowi4_xfrm.if_id = 0;
+ fl4->flowi4_uid = uid;
fl4->daddr = daddr;
fl4->saddr = saddr;
fl4->fl4_dport = dport;
@@ -131,6 +143,8 @@ struct flowi6 {
#define flowi6_flags __fl_common.flowic_flags
#define flowi6_secid __fl_common.flowic_secid
#define flowi6_tun_key __fl_common.flowic_tun_key
+#define flowi6_uid __fl_common.flowic_uid
+#define flowi6_xfrm __fl_common.xfrm
struct in6_addr daddr;
struct in6_addr saddr;
/* Note: flowi6_tos is encoded in flowlabel, too. */
@@ -176,6 +190,8 @@ struct flowi {
#define flowi_flags u.__fl_common.flowic_flags
#define flowi_secid u.__fl_common.flowic_secid
#define flowi_tun_key u.__fl_common.flowic_tun_key
+#define flowi_uid u.__fl_common.flowic_uid
+#define flowi_xfrm u.__fl_common.xfrm
} __attribute__((__aligned__(BITS_PER_LONG/8)));
static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4)
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 6213a90a8cec..c2865f5febbc 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -206,7 +206,11 @@ struct inet_sock {
transparent:1,
mc_all:1,
nodefrag:1;
- __u8 bind_address_no_port:1;
+ __u8 bind_address_no_port:1,
+ defer_connect:1; /* Indicates that fastopen_connect is set
+ * and cookie exists so we defer connect
+ * until first data frame is written
+ */
__u8 rcv_tos;
__u8 convert_csum;
int uc_index;
diff --git a/include/net/ip.h b/include/net/ip.h
index 8646da034851..eacff8e14f29 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -180,6 +180,7 @@ struct ip_reply_arg {
/* -1 if not needed */
int bound_dev_if;
u8 tos;
+ kuid_t uid;
};
#define IP_REPLY_ARG_NOSRCCHECK 1
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 2c43993e079c..4341731f39a5 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -142,9 +142,10 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
const struct in6_addr *gwaddr);
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif,
- u32 mark);
+ u32 mark, kuid_t uid);
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu);
-void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark);
+void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
+ kuid_t uid);
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
u32 mark);
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk);
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 9ae819e27940..29f0d4765bfc 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -306,7 +306,7 @@ static inline bool nf_ct_should_gc(const struct nf_conn *ct)
struct kernel_param;
-int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp);
+int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp);
int nf_conntrack_hash_resize(unsigned int hashsize);
extern struct hlist_nulls_head *nf_conntrack_hash;
diff --git a/include/net/route.h b/include/net/route.h
index b8488efef920..2702b7ac9a29 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -154,7 +154,7 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi
flowi4_init_output(fl4, oif, sk ? sk->sk_mark : 0, tos,
RT_SCOPE_UNIVERSE, proto,
sk ? inet_sk_flowi_flags(sk) : 0,
- daddr, saddr, dport, sport);
+ daddr, saddr, dport, sport, sock_net_uid(net, sk));
if (sk)
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
return ip_route_output_flow(net, fl4, sk);
@@ -270,7 +270,8 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32
flow_flags |= FLOWI_FLAG_ANYSRC;
flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
- protocol, flow_flags, dst, src, dport, sport);
+ protocol, flow_flags, dst, src, dport, sport,
+ sk->sk_uid);
}
static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
diff --git a/include/net/sock.h b/include/net/sock.h
index 15bb04dec40e..292c05d7f191 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -420,6 +420,7 @@ struct sock {
u32 sk_max_ack_backlog;
__u32 sk_priority;
__u32 sk_mark;
+ kuid_t sk_uid;
struct pid *sk_peer_pid;
const struct cred *sk_peer_cred;
long sk_rcvtimeo;
@@ -1651,6 +1652,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
sk->sk_wq = parent->wq;
parent->sk = sk;
sk_set_socket(sk, parent);
+ sk->sk_uid = SOCK_INODE(parent)->i_uid;
security_sock_graft(sk, parent);
write_unlock_bh(&sk->sk_callback_lock);
}
@@ -1658,6 +1660,11 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
kuid_t sock_i_uid(struct sock *sk);
unsigned long sock_i_ino(struct sock *sk);
+static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
+{
+ return sk ? sk->sk_uid : make_kuid(net->user_ns, 0);
+}
+
static inline u32 net_tx_rndhash(void)
{
u32 v = prandom_u32();
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c3f4f6a9e6c3..01611c199276 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -271,6 +271,7 @@ extern int sysctl_tcp_autocorking;
extern int sysctl_tcp_invalid_ratelimit;
extern int sysctl_tcp_pacing_ss_ratio;
extern int sysctl_tcp_pacing_ca_ratio;
+extern int sysctl_tcp_default_init_rwnd;
extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
@@ -1508,6 +1509,9 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
struct tcp_fastopen_cookie *foc,
struct dst_entry *dst);
void tcp_fastopen_init_key_once(bool publish);
+bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
+ struct tcp_fastopen_cookie *cookie);
+bool tcp_fastopen_defer_connect(struct sock *sk, int *err);
#define TCP_FASTOPEN_KEY_LENGTH 16
/* Fastopen key context */
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 835c30e491c8..1de03977212a 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -21,6 +21,7 @@
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/flow.h>
+#include <net/gro_cells.h>
#include <linux/interrupt.h>
@@ -136,6 +137,7 @@ struct xfrm_state {
struct xfrm_id id;
struct xfrm_selector sel;
struct xfrm_mark mark;
+ u32 if_id;
u32 tfcpad;
u32 genid;
@@ -155,6 +157,7 @@ struct xfrm_state {
int header_len;
int trailer_len;
u32 extra_flags;
+ struct xfrm_mark smark;
} props;
struct xfrm_lifetime_cfg lft;
@@ -274,6 +277,13 @@ struct xfrm_replay {
int (*overflow)(struct xfrm_state *x, struct sk_buff *skb);
};
+struct xfrm_if_cb {
+ struct xfrm_if *(*decode_session)(struct sk_buff *skb);
+};
+
+void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb);
+void xfrm_if_unregister_cb(void);
+
struct net_device;
struct xfrm_type;
struct xfrm_dst;
@@ -284,10 +294,12 @@ struct xfrm_policy_afinfo {
struct dst_entry *(*dst_lookup)(struct net *net,
int tos, int oif,
const xfrm_address_t *saddr,
- const xfrm_address_t *daddr);
+ const xfrm_address_t *daddr,
+ u32 mark);
int (*get_saddr)(struct net *net, int oif,
xfrm_address_t *saddr,
- xfrm_address_t *daddr);
+ xfrm_address_t *daddr,
+ u32 mark);
void (*decode_session)(struct sk_buff *skb,
struct flowi *fl,
int reverse);
@@ -531,6 +543,7 @@ struct xfrm_policy {
atomic_t genid;
u32 priority;
u32 index;
+ u32 if_id;
struct xfrm_mark mark;
struct xfrm_selector selector;
struct xfrm_lifetime_cfg lft;
@@ -964,6 +977,22 @@ static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev);
+struct xfrm_if_parms {
+ char name[IFNAMSIZ]; /* name of XFRM device */
+ int link; /* ifindex of underlying L2 interface */
+ u32 if_id; /* interface identifyer */
+};
+
+struct xfrm_if {
+ struct xfrm_if __rcu *next; /* next interface in list */
+ struct net_device *dev; /* virtual device associated with interface */
+ struct net_device *phydev; /* physical device */
+ struct net *net; /* netns for packet i/o */
+ struct xfrm_if_parms p; /* interface parms */
+
+ struct gro_cells gro_cells;
+};
+
struct sec_path {
atomic_t refcnt;
int len;
@@ -1163,12 +1192,12 @@ void xfrm_garbage_collect(struct net *net);
static inline void xfrm_sk_free_policy(struct sock *sk) {}
static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { return 0; }
-static inline int xfrm6_route_forward(struct sk_buff *skb) { return 1; }
-static inline int xfrm4_route_forward(struct sk_buff *skb) { return 1; }
+static inline int xfrm6_route_forward(struct sk_buff *skb) { return 1; }
+static inline int xfrm4_route_forward(struct sk_buff *skb) { return 1; }
static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
-{
- return 1;
-}
+{
+ return 1;
+}
static inline int xfrm4_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
{
return 1;
@@ -1255,7 +1284,7 @@ __xfrm6_state_addr_check(const struct xfrm_state *x,
{
if (ipv6_addr_equal((struct in6_addr *)daddr, (struct in6_addr *)&x->id.daddr) &&
(ipv6_addr_equal((struct in6_addr *)saddr, (struct in6_addr *)&x->props.saddr) ||
- ipv6_addr_any((struct in6_addr *)saddr) ||
+ ipv6_addr_any((struct in6_addr *)saddr) ||
ipv6_addr_any((struct in6_addr *)&x->props.saddr)))
return 1;
return 0;
@@ -1431,7 +1460,7 @@ struct xfrm_state *xfrm_state_find(const xfrm_address_t *daddr,
struct xfrm_tmpl *tmpl,
struct xfrm_policy *pol, int *err,
unsigned short family);
-struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark,
+struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id,
xfrm_address_t *daddr,
xfrm_address_t *saddr,
unsigned short family,
@@ -1560,7 +1589,7 @@ int xfrm_user_policy(struct sock *sk, int optname,
static inline int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen)
{
return -ENOPROTOOPT;
-}
+}
static inline int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
{
@@ -1578,20 +1607,20 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
void *);
void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net);
int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl);
-struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark,
+struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id,
u8 type, int dir,
struct xfrm_selector *sel,
struct xfrm_sec_ctx *ctx, int delete,
int *err);
-struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8, int dir,
- u32 id, int delete, int *err);
+struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u32 if_id, u8,
+ int dir, u32 id, int delete, int *err);
int xfrm_policy_flush(struct net *net, u8 type, bool task_valid);
void xfrm_policy_hash_rebuild(struct net *net);
u32 xfrm_get_acqseq(void);
int verify_spi_info(u8 proto, u32 min, u32 max);
int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi);
struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark,
- u8 mode, u32 reqid, u8 proto,
+ u8 mode, u32 reqid, u32 if_id, u8 proto,
const xfrm_address_t *daddr,
const xfrm_address_t *saddr, int create,
unsigned short family);
@@ -1785,6 +1814,22 @@ static inline int xfrm_mark_put(struct sk_buff *skb, const struct xfrm_mark *m)
return ret;
}
+static inline __u32 xfrm_smark_get(__u32 mark, struct xfrm_state *x)
+{
+ struct xfrm_mark *m = &x->props.smark;
+
+ return (m->v & m->m) | (mark & ~m->m);
+}
+
+static inline int xfrm_if_id_put(struct sk_buff *skb, __u32 if_id)
+{
+ int ret = 0;
+
+ if (if_id)
+ ret = nla_put_u32(skb, XFRMA_IF_ID, if_id);
+ return ret;
+}
+
static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x,
unsigned int family)
{
diff --git a/include/trace/events/android_fs.h b/include/trace/events/android_fs.h
new file mode 100644
index 000000000000..49509533d3fa
--- /dev/null
+++ b/include/trace/events/android_fs.h
@@ -0,0 +1,65 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM android_fs
+
+#if !defined(_TRACE_ANDROID_FS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ANDROID_FS_H
+
+#include <linux/tracepoint.h>
+#include <trace/events/android_fs_template.h>
+
+DEFINE_EVENT(android_fs_data_start_template, android_fs_dataread_start,
+ TP_PROTO(struct inode *inode, loff_t offset, int bytes,
+ pid_t pid, char *pathname, char *command),
+ TP_ARGS(inode, offset, bytes, pid, pathname, command));
+
+DEFINE_EVENT(android_fs_data_end_template, android_fs_dataread_end,
+ TP_PROTO(struct inode *inode, loff_t offset, int bytes),
+ TP_ARGS(inode, offset, bytes));
+
+DEFINE_EVENT(android_fs_data_start_template, android_fs_datawrite_start,
+ TP_PROTO(struct inode *inode, loff_t offset, int bytes,
+ pid_t pid, char *pathname, char *command),
+ TP_ARGS(inode, offset, bytes, pid, pathname, command));
+
+DEFINE_EVENT(android_fs_data_end_template, android_fs_datawrite_end,
+ TP_PROTO(struct inode *inode, loff_t offset, int bytes),
+ TP_ARGS(inode, offset, bytes));
+
+#endif /* _TRACE_ANDROID_FS_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
+
+#ifndef ANDROID_FSTRACE_GET_PATHNAME
+#define ANDROID_FSTRACE_GET_PATHNAME
+
+/* Sizes an on-stack array, so careful if sizing this up ! */
+#define MAX_TRACE_PATHBUF_LEN 256
+
+static inline char *
+android_fstrace_get_pathname(char *buf, int buflen, struct inode *inode)
+{
+ char *path;
+ struct dentry *d;
+
+ /*
+ * d_obtain_alias() will either iput() if it locates an existing
+ * dentry or transfer the reference to the new dentry created.
+ * So get an extra reference here.
+ */
+ ihold(inode);
+ d = d_obtain_alias(inode);
+ if (likely(!IS_ERR(d))) {
+ path = dentry_path_raw(d, buf, buflen);
+ if (unlikely(IS_ERR(path))) {
+ strcpy(buf, "ERROR");
+ path = buf;
+ }
+ dput(d);
+ } else {
+ strcpy(buf, "ERROR");
+ path = buf;
+ }
+ return path;
+}
+#endif
diff --git a/include/trace/events/android_fs_template.h b/include/trace/events/android_fs_template.h
new file mode 100644
index 000000000000..b23d17b56c63
--- /dev/null
+++ b/include/trace/events/android_fs_template.h
@@ -0,0 +1,64 @@
+#if !defined(_TRACE_ANDROID_FS_TEMPLATE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ANDROID_FS_TEMPLATE_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(android_fs_data_start_template,
+ TP_PROTO(struct inode *inode, loff_t offset, int bytes,
+ pid_t pid, char *pathname, char *command),
+ TP_ARGS(inode, offset, bytes, pid, pathname, command),
+ TP_STRUCT__entry(
+ __string(pathbuf, pathname);
+ __field(loff_t, offset);
+ __field(int, bytes);
+ __field(loff_t, i_size);
+ __string(cmdline, command);
+ __field(pid_t, pid);
+ __field(ino_t, ino);
+ ),
+ TP_fast_assign(
+ {
+ /*
+ * Replace the spaces in filenames and cmdlines
+ * because this screws up the tooling that parses
+ * the traces.
+ */
+ __assign_str(pathbuf, pathname);
+ (void)strreplace(__get_str(pathbuf), ' ', '_');
+ __entry->offset = offset;
+ __entry->bytes = bytes;
+ __entry->i_size = i_size_read(inode);
+ __assign_str(cmdline, command);
+ (void)strreplace(__get_str(cmdline), ' ', '_');
+ __entry->pid = pid;
+ __entry->ino = inode->i_ino;
+ }
+ ),
+ TP_printk("entry_name %s, offset %llu, bytes %d, cmdline %s,"
+ " pid %d, i_size %llu, ino %lu",
+ __get_str(pathbuf), __entry->offset, __entry->bytes,
+ __get_str(cmdline), __entry->pid, __entry->i_size,
+ (unsigned long) __entry->ino)
+);
+
+DECLARE_EVENT_CLASS(android_fs_data_end_template,
+ TP_PROTO(struct inode *inode, loff_t offset, int bytes),
+ TP_ARGS(inode, offset, bytes),
+ TP_STRUCT__entry(
+ __field(ino_t, ino);
+ __field(loff_t, offset);
+ __field(int, bytes);
+ ),
+ TP_fast_assign(
+ {
+ __entry->ino = inode->i_ino;
+ __entry->offset = offset;
+ __entry->bytes = bytes;
+ }
+ ),
+ TP_printk("ino %lu, offset %llu, bytes %d",
+ (unsigned long) __entry->ino,
+ __entry->offset, __entry->bytes)
+);
+
+#endif /* _TRACE_ANDROID_FS_TEMPLATE_H */
diff --git a/include/trace/events/cpufreq_interactive.h b/include/trace/events/cpufreq_interactive.h
new file mode 100644
index 000000000000..faecc0bfdeff
--- /dev/null
+++ b/include/trace/events/cpufreq_interactive.h
@@ -0,0 +1,112 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cpufreq_interactive
+
+#if !defined(_TRACE_CPUFREQ_INTERACTIVE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_CPUFREQ_INTERACTIVE_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(set,
+ TP_PROTO(u32 cpu_id, unsigned long targfreq,
+ unsigned long actualfreq),
+ TP_ARGS(cpu_id, targfreq, actualfreq),
+
+ TP_STRUCT__entry(
+ __field(u32, cpu_id)
+ __field(unsigned long, targfreq)
+ __field(unsigned long, actualfreq)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu_id = (u32)cpu_id;
+ __entry->targfreq = targfreq;
+ __entry->actualfreq = actualfreq;
+ ),
+
+ TP_printk("cpu=%u targ=%lu actual=%lu",
+ __entry->cpu_id, __entry->targfreq,
+ __entry->actualfreq)
+);
+
+DEFINE_EVENT(set, cpufreq_interactive_setspeed,
+ TP_PROTO(u32 cpu_id, unsigned long targfreq,
+ unsigned long actualfreq),
+ TP_ARGS(cpu_id, targfreq, actualfreq)
+);
+
+DECLARE_EVENT_CLASS(loadeval,
+ TP_PROTO(unsigned long cpu_id, unsigned long load,
+ unsigned long curtarg, unsigned long curactual,
+ unsigned long newtarg),
+ TP_ARGS(cpu_id, load, curtarg, curactual, newtarg),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, cpu_id)
+ __field(unsigned long, load)
+ __field(unsigned long, curtarg)
+ __field(unsigned long, curactual)
+ __field(unsigned long, newtarg)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu_id = cpu_id;
+ __entry->load = load;
+ __entry->curtarg = curtarg;
+ __entry->curactual = curactual;
+ __entry->newtarg = newtarg;
+ ),
+
+ TP_printk("cpu=%lu load=%lu cur=%lu actual=%lu targ=%lu",
+ __entry->cpu_id, __entry->load, __entry->curtarg,
+ __entry->curactual, __entry->newtarg)
+);
+
+DEFINE_EVENT(loadeval, cpufreq_interactive_target,
+ TP_PROTO(unsigned long cpu_id, unsigned long load,
+ unsigned long curtarg, unsigned long curactual,
+ unsigned long newtarg),
+ TP_ARGS(cpu_id, load, curtarg, curactual, newtarg)
+);
+
+DEFINE_EVENT(loadeval, cpufreq_interactive_already,
+ TP_PROTO(unsigned long cpu_id, unsigned long load,
+ unsigned long curtarg, unsigned long curactual,
+ unsigned long newtarg),
+ TP_ARGS(cpu_id, load, curtarg, curactual, newtarg)
+);
+
+DEFINE_EVENT(loadeval, cpufreq_interactive_notyet,
+ TP_PROTO(unsigned long cpu_id, unsigned long load,
+ unsigned long curtarg, unsigned long curactual,
+ unsigned long newtarg),
+ TP_ARGS(cpu_id, load, curtarg, curactual, newtarg)
+);
+
+TRACE_EVENT(cpufreq_interactive_boost,
+ TP_PROTO(const char *s),
+ TP_ARGS(s),
+ TP_STRUCT__entry(
+ __string(s, s)
+ ),
+ TP_fast_assign(
+ __assign_str(s, s);
+ ),
+ TP_printk("%s", __get_str(s))
+);
+
+TRACE_EVENT(cpufreq_interactive_unboost,
+ TP_PROTO(const char *s),
+ TP_ARGS(s),
+ TP_STRUCT__entry(
+ __string(s, s)
+ ),
+ TP_fast_assign(
+ __assign_str(s, s);
+ ),
+ TP_printk("%s", __get_str(s))
+);
+
+#endif /* _TRACE_CPUFREQ_INTERACTIVE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/cpufreq_sched.h b/include/trace/events/cpufreq_sched.h
new file mode 100644
index 000000000000..a46cd088e969
--- /dev/null
+++ b/include/trace/events/cpufreq_sched.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2015 Steve Muckle <smuckle@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cpufreq_sched
+
+#if !defined(_TRACE_CPUFREQ_SCHED_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_CPUFREQ_SCHED_H
+
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(cpufreq_sched_throttled,
+ TP_PROTO(unsigned int rem),
+ TP_ARGS(rem),
+ TP_STRUCT__entry(
+ __field( unsigned int, rem)
+ ),
+ TP_fast_assign(
+ __entry->rem = rem;
+ ),
+ TP_printk("throttled - %d usec remaining", __entry->rem)
+);
+
+TRACE_EVENT(cpufreq_sched_request_opp,
+ TP_PROTO(int cpu,
+ unsigned long capacity,
+ unsigned int freq_new,
+ unsigned int requested_freq),
+ TP_ARGS(cpu, capacity, freq_new, requested_freq),
+ TP_STRUCT__entry(
+ __field( int, cpu)
+ __field( unsigned long, capacity)
+ __field( unsigned int, freq_new)
+ __field( unsigned int, requested_freq)
+ ),
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->capacity = capacity;
+ __entry->freq_new = freq_new;
+ __entry->requested_freq = requested_freq;
+ ),
+ TP_printk("cpu %d cap change, cluster cap request %ld => OPP %d "
+ "(cur %d)",
+ __entry->cpu, __entry->capacity, __entry->freq_new,
+ __entry->requested_freq)
+);
+
+TRACE_EVENT(cpufreq_sched_update_capacity,
+ TP_PROTO(int cpu,
+ bool request,
+ struct sched_capacity_reqs *scr,
+ unsigned long new_capacity),
+ TP_ARGS(cpu, request, scr, new_capacity),
+ TP_STRUCT__entry(
+ __field( int, cpu)
+ __field( bool, request)
+ __field( unsigned long, cfs)
+ __field( unsigned long, rt)
+ __field( unsigned long, dl)
+ __field( unsigned long, total)
+ __field( unsigned long, new_total)
+ ),
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->request = request;
+ __entry->cfs = scr->cfs;
+ __entry->rt = scr->rt;
+ __entry->dl = scr->dl;
+ __entry->total = scr->total;
+ __entry->new_total = new_capacity;
+ ),
+ TP_printk("cpu=%d set_cap=%d cfs=%ld rt=%ld dl=%ld old_tot=%ld "
+ "new_tot=%ld",
+ __entry->cpu, __entry->request, __entry->cfs, __entry->rt,
+ __entry->dl, __entry->total, __entry->new_total)
+);
+
+#endif /* _TRACE_CPUFREQ_SCHED_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 903a09165bb1..e88465ab47e1 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -6,8 +6,8 @@
#include <linux/tracepoint.h>
-#define show_dev(entry) MAJOR(entry->dev), MINOR(entry->dev)
-#define show_dev_ino(entry) show_dev(entry), (unsigned long)entry->ino
+#define show_dev(dev) MAJOR(dev), MINOR(dev)
+#define show_dev_ino(entry) show_dev(entry->dev), (unsigned long)entry->ino
TRACE_DEFINE_ENUM(NODE);
TRACE_DEFINE_ENUM(DATA);
@@ -15,6 +15,8 @@ TRACE_DEFINE_ENUM(META);
TRACE_DEFINE_ENUM(META_FLUSH);
TRACE_DEFINE_ENUM(INMEM);
TRACE_DEFINE_ENUM(INMEM_DROP);
+TRACE_DEFINE_ENUM(INMEM_INVALIDATE);
+TRACE_DEFINE_ENUM(INMEM_REVOKE);
TRACE_DEFINE_ENUM(IPU);
TRACE_DEFINE_ENUM(OPU);
TRACE_DEFINE_ENUM(CURSEG_HOT_DATA);
@@ -42,6 +44,7 @@ TRACE_DEFINE_ENUM(CP_FASTBOOT);
TRACE_DEFINE_ENUM(CP_SYNC);
TRACE_DEFINE_ENUM(CP_RECOVERY);
TRACE_DEFINE_ENUM(CP_DISCARD);
+TRACE_DEFINE_ENUM(CP_TRIMMED);
#define show_block_type(type) \
__print_symbolic(type, \
@@ -51,32 +54,40 @@ TRACE_DEFINE_ENUM(CP_DISCARD);
{ META_FLUSH, "META_FLUSH" }, \
{ INMEM, "INMEM" }, \
{ INMEM_DROP, "INMEM_DROP" }, \
+ { INMEM_INVALIDATE, "INMEM_INVALIDATE" }, \
{ INMEM_REVOKE, "INMEM_REVOKE" }, \
{ IPU, "IN-PLACE" }, \
{ OPU, "OUT-OF-PLACE" })
-#define F2FS_BIO_FLAG_MASK(t) (t & (REQ_RAHEAD | WRITE_FLUSH_FUA))
-#define F2FS_BIO_EXTRA_MASK(t) (t & (REQ_META | REQ_PRIO))
+#define F2FS_OP_FLAGS (REQ_RAHEAD | REQ_SYNC | REQ_META | REQ_PRIO | \
+ REQ_PREFLUSH | REQ_FUA)
+#define F2FS_BIO_FLAG_MASK(t) (t & F2FS_OP_FLAGS)
-#define show_bio_type(op_flags) show_bio_op_flags(op_flags), \
- show_bio_extra(op_flags)
+#define show_bio_type(op,op_flags) show_bio_op(op), \
+ show_bio_op_flags(op_flags)
+
+#define show_bio_op(op) \
+ __print_symbolic(op, \
+ { REQ_OP_READ, "READ" }, \
+ { REQ_OP_WRITE, "WRITE" }, \
+ { REQ_OP_DISCARD, "DISCARD" }, \
+ { REQ_OP_SECURE_ERASE, "SECURE_ERASE" }, \
+ { REQ_OP_WRITE_SAME, "WRITE_SAME" })
#define show_bio_op_flags(flags) \
- __print_symbolic(F2FS_BIO_FLAG_MASK(flags), \
- { 0, "WRITE" }, \
- { REQ_RAHEAD, "READAHEAD" }, \
- { READ_SYNC, "READ_SYNC" }, \
- { WRITE_SYNC, "WRITE_SYNC" }, \
- { WRITE_FLUSH, "WRITE_FLUSH" }, \
- { WRITE_FUA, "WRITE_FUA" }, \
- { WRITE_FLUSH_FUA, "WRITE_FLUSH_FUA" })
-
-#define show_bio_extra(type) \
- __print_symbolic(F2FS_BIO_EXTRA_MASK(type), \
- { REQ_META, "(M)" }, \
- { REQ_PRIO, "(P)" }, \
- { REQ_META | REQ_PRIO, "(MP)" }, \
- { 0, " \b" })
+ __print_flags(F2FS_BIO_FLAG_MASK(flags), "|", \
+ { REQ_RAHEAD, "R" }, \
+ { REQ_SYNC, "S" }, \
+ { REQ_META, "M" }, \
+ { REQ_PRIO, "P" }, \
+ { REQ_PREFLUSH, "PF" }, \
+ { REQ_FUA, "FUA" })
+
+#define show_block_temp(temp) \
+ __print_symbolic(temp, \
+ { HOT, "HOT" }, \
+ { WARM, "WARM" }, \
+ { COLD, "COLD" })
#define show_data_type(type) \
__print_symbolic(type, \
@@ -109,12 +120,27 @@ TRACE_DEFINE_ENUM(CP_DISCARD);
{ GC_CB, "Cost-Benefit" })
#define show_cpreason(type) \
- __print_symbolic(type, \
+ __print_flags(type, "|", \
{ CP_UMOUNT, "Umount" }, \
{ CP_FASTBOOT, "Fastboot" }, \
{ CP_SYNC, "Sync" }, \
{ CP_RECOVERY, "Recovery" }, \
- { CP_DISCARD, "Discard" })
+ { CP_DISCARD, "Discard" }, \
+ { CP_UMOUNT, "Umount" }, \
+ { CP_TRIMMED, "Trimmed" })
+
+#define show_fsync_cpreason(type) \
+ __print_symbolic(type, \
+ { CP_NO_NEEDED, "no needed" }, \
+ { CP_NON_REGULAR, "non regular" }, \
+ { CP_HARDLINK, "hardlink" }, \
+ { CP_SB_NEED_CP, "sb needs cp" }, \
+ { CP_WRONG_PINO, "wrong pino" }, \
+ { CP_NO_SPC_ROLL, "no space roll forward" }, \
+ { CP_NODE_NEED_CP, "node needs cp" }, \
+ { CP_FASTBOOT_MODE, "fastboot mode" }, \
+ { CP_SPEC_LOG_NUM, "log type is 2" }, \
+ { CP_RECOVER_DIR, "dir needs recovery" })
struct victim_sel_policy;
struct f2fs_map_blocks;
@@ -190,14 +216,14 @@ DEFINE_EVENT(f2fs__inode, f2fs_sync_file_enter,
TRACE_EVENT(f2fs_sync_file_exit,
- TP_PROTO(struct inode *inode, int need_cp, int datasync, int ret),
+ TP_PROTO(struct inode *inode, int cp_reason, int datasync, int ret),
- TP_ARGS(inode, need_cp, datasync, ret),
+ TP_ARGS(inode, cp_reason, datasync, ret),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(ino_t, ino)
- __field(int, need_cp)
+ __field(int, cp_reason)
__field(int, datasync)
__field(int, ret)
),
@@ -205,15 +231,15 @@ TRACE_EVENT(f2fs_sync_file_exit,
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
- __entry->need_cp = need_cp;
+ __entry->cp_reason = cp_reason;
__entry->datasync = datasync;
__entry->ret = ret;
),
- TP_printk("dev = (%d,%d), ino = %lu, checkpoint is %s, "
+ TP_printk("dev = (%d,%d), ino = %lu, cp_reason: %s, "
"datasync = %d, ret = %d",
show_dev_ino(__entry),
- __entry->need_cp ? "needed" : "not needed",
+ show_fsync_cpreason(__entry->cp_reason),
__entry->datasync,
__entry->ret)
);
@@ -237,7 +263,7 @@ TRACE_EVENT(f2fs_sync_fs,
),
TP_printk("dev = (%d,%d), superblock is %s, wait = %d",
- show_dev(__entry),
+ show_dev(__entry->dev),
__entry->dirty ? "dirty" : "not dirty",
__entry->wait)
);
@@ -307,6 +333,13 @@ DEFINE_EVENT(f2fs__inode_exit, f2fs_unlink_exit,
TP_ARGS(inode, ret)
);
+DEFINE_EVENT(f2fs__inode_exit, f2fs_drop_inode,
+
+ TP_PROTO(struct inode *inode, int ret),
+
+ TP_ARGS(inode, ret)
+);
+
DEFINE_EVENT(f2fs__inode, f2fs_truncate,
TP_PROTO(struct inode *inode),
@@ -516,14 +549,14 @@ TRACE_EVENT(f2fs_map_blocks,
TRACE_EVENT(f2fs_background_gc,
- TP_PROTO(struct super_block *sb, long wait_ms,
+ TP_PROTO(struct super_block *sb, unsigned int wait_ms,
unsigned int prefree, unsigned int free),
TP_ARGS(sb, wait_ms, prefree, free),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(long, wait_ms)
+ __field(unsigned int, wait_ms)
__field(unsigned int, prefree)
__field(unsigned int, free)
),
@@ -535,13 +568,120 @@ TRACE_EVENT(f2fs_background_gc,
__entry->free = free;
),
- TP_printk("dev = (%d,%d), wait_ms = %ld, prefree = %u, free = %u",
- show_dev(__entry),
+ TP_printk("dev = (%d,%d), wait_ms = %u, prefree = %u, free = %u",
+ show_dev(__entry->dev),
__entry->wait_ms,
__entry->prefree,
__entry->free)
);
+TRACE_EVENT(f2fs_gc_begin,
+
+ TP_PROTO(struct super_block *sb, bool sync, bool background,
+ long long dirty_nodes, long long dirty_dents,
+ long long dirty_imeta, unsigned int free_sec,
+ unsigned int free_seg, int reserved_seg,
+ unsigned int prefree_seg),
+
+ TP_ARGS(sb, sync, background, dirty_nodes, dirty_dents, dirty_imeta,
+ free_sec, free_seg, reserved_seg, prefree_seg),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(bool, sync)
+ __field(bool, background)
+ __field(long long, dirty_nodes)
+ __field(long long, dirty_dents)
+ __field(long long, dirty_imeta)
+ __field(unsigned int, free_sec)
+ __field(unsigned int, free_seg)
+ __field(int, reserved_seg)
+ __field(unsigned int, prefree_seg)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = sb->s_dev;
+ __entry->sync = sync;
+ __entry->background = background;
+ __entry->dirty_nodes = dirty_nodes;
+ __entry->dirty_dents = dirty_dents;
+ __entry->dirty_imeta = dirty_imeta;
+ __entry->free_sec = free_sec;
+ __entry->free_seg = free_seg;
+ __entry->reserved_seg = reserved_seg;
+ __entry->prefree_seg = prefree_seg;
+ ),
+
+ TP_printk("dev = (%d,%d), sync = %d, background = %d, nodes = %lld, "
+ "dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, "
+ "rsv_seg:%d, prefree_seg:%u",
+ show_dev(__entry->dev),
+ __entry->sync,
+ __entry->background,
+ __entry->dirty_nodes,
+ __entry->dirty_dents,
+ __entry->dirty_imeta,
+ __entry->free_sec,
+ __entry->free_seg,
+ __entry->reserved_seg,
+ __entry->prefree_seg)
+);
+
+TRACE_EVENT(f2fs_gc_end,
+
+ TP_PROTO(struct super_block *sb, int ret, int seg_freed,
+ int sec_freed, long long dirty_nodes,
+ long long dirty_dents, long long dirty_imeta,
+ unsigned int free_sec, unsigned int free_seg,
+ int reserved_seg, unsigned int prefree_seg),
+
+ TP_ARGS(sb, ret, seg_freed, sec_freed, dirty_nodes, dirty_dents,
+ dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, ret)
+ __field(int, seg_freed)
+ __field(int, sec_freed)
+ __field(long long, dirty_nodes)
+ __field(long long, dirty_dents)
+ __field(long long, dirty_imeta)
+ __field(unsigned int, free_sec)
+ __field(unsigned int, free_seg)
+ __field(int, reserved_seg)
+ __field(unsigned int, prefree_seg)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = sb->s_dev;
+ __entry->ret = ret;
+ __entry->seg_freed = seg_freed;
+ __entry->sec_freed = sec_freed;
+ __entry->dirty_nodes = dirty_nodes;
+ __entry->dirty_dents = dirty_dents;
+ __entry->dirty_imeta = dirty_imeta;
+ __entry->free_sec = free_sec;
+ __entry->free_seg = free_seg;
+ __entry->reserved_seg = reserved_seg;
+ __entry->prefree_seg = prefree_seg;
+ ),
+
+ TP_printk("dev = (%d,%d), ret = %d, seg_freed = %d, sec_freed = %d, "
+ "nodes = %lld, dents = %lld, imeta = %lld, free_sec:%u, "
+ "free_seg:%u, rsv_seg:%d, prefree_seg:%u",
+ show_dev(__entry->dev),
+ __entry->ret,
+ __entry->seg_freed,
+ __entry->sec_freed,
+ __entry->dirty_nodes,
+ __entry->dirty_dents,
+ __entry->dirty_imeta,
+ __entry->free_sec,
+ __entry->free_seg,
+ __entry->reserved_seg,
+ __entry->prefree_seg)
+);
+
TRACE_EVENT(f2fs_get_victim,
TP_PROTO(struct super_block *sb, int type, int gc_type,
@@ -557,6 +697,7 @@ TRACE_EVENT(f2fs_get_victim,
__field(int, alloc_mode)
__field(int, gc_mode)
__field(unsigned int, victim)
+ __field(unsigned int, cost)
__field(unsigned int, ofs_unit)
__field(unsigned int, pre_victim)
__field(unsigned int, prefree)
@@ -570,26 +711,114 @@ TRACE_EVENT(f2fs_get_victim,
__entry->alloc_mode = p->alloc_mode;
__entry->gc_mode = p->gc_mode;
__entry->victim = p->min_segno;
+ __entry->cost = p->min_cost;
__entry->ofs_unit = p->ofs_unit;
__entry->pre_victim = pre_victim;
__entry->prefree = prefree;
__entry->free = free;
),
- TP_printk("dev = (%d,%d), type = %s, policy = (%s, %s, %s), victim = %u "
- "ofs_unit = %u, pre_victim_secno = %d, prefree = %u, free = %u",
- show_dev(__entry),
+ TP_printk("dev = (%d,%d), type = %s, policy = (%s, %s, %s), "
+ "victim = %u, cost = %u, ofs_unit = %u, "
+ "pre_victim_secno = %d, prefree = %u, free = %u",
+ show_dev(__entry->dev),
show_data_type(__entry->type),
show_gc_type(__entry->gc_type),
show_alloc_mode(__entry->alloc_mode),
show_victim_policy(__entry->gc_mode),
__entry->victim,
+ __entry->cost,
__entry->ofs_unit,
(int)__entry->pre_victim,
__entry->prefree,
__entry->free)
);
+TRACE_EVENT(f2fs_lookup_start,
+
+ TP_PROTO(struct inode *dir, struct dentry *dentry, unsigned int flags),
+
+ TP_ARGS(dir, dentry, flags),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(const char *, name)
+ __field(unsigned int, flags)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->ino = dir->i_ino;
+ __entry->name = dentry->d_name.name;
+ __entry->flags = flags;
+ ),
+
+ TP_printk("dev = (%d,%d), pino = %lu, name:%s, flags:%u",
+ show_dev_ino(__entry),
+ __entry->name,
+ __entry->flags)
+);
+
+TRACE_EVENT(f2fs_lookup_end,
+
+ TP_PROTO(struct inode *dir, struct dentry *dentry, nid_t ino,
+ int err),
+
+ TP_ARGS(dir, dentry, ino, err),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(const char *, name)
+ __field(nid_t, cino)
+ __field(int, err)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->ino = dir->i_ino;
+ __entry->name = dentry->d_name.name;
+ __entry->cino = ino;
+ __entry->err = err;
+ ),
+
+ TP_printk("dev = (%d,%d), pino = %lu, name:%s, ino:%u, err:%d",
+ show_dev_ino(__entry),
+ __entry->name,
+ __entry->cino,
+ __entry->err)
+);
+
+TRACE_EVENT(f2fs_readdir,
+
+ TP_PROTO(struct inode *dir, loff_t start_pos, loff_t end_pos, int err),
+
+ TP_ARGS(dir, start_pos, end_pos, err),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(loff_t, start)
+ __field(loff_t, end)
+ __field(int, err)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->ino = dir->i_ino;
+ __entry->start = start_pos;
+ __entry->end = end_pos;
+ __entry->err = err;
+ ),
+
+ TP_printk("dev = (%d,%d), ino = %lu, start_pos:%llu, end_pos:%llu, err:%d",
+ show_dev_ino(__entry),
+ __entry->start,
+ __entry->end,
+ __entry->err)
+);
+
TRACE_EVENT(f2fs_fallocate,
TP_PROTO(struct inode *inode, int mode,
@@ -715,7 +944,7 @@ TRACE_EVENT(f2fs_reserve_new_blocks,
),
TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u, count = %llu",
- show_dev(__entry),
+ show_dev(__entry->dev),
(unsigned int)__entry->nid,
__entry->ofs_in_node,
(unsigned long long)__entry->count)
@@ -735,6 +964,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio,
__field(block_t, new_blkaddr)
__field(int, op)
__field(int, op_flags)
+ __field(int, temp)
__field(int, type)
),
@@ -746,16 +976,18 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio,
__entry->new_blkaddr = fio->new_blkaddr;
__entry->op = fio->op;
__entry->op_flags = fio->op_flags;
+ __entry->temp = fio->temp;
__entry->type = fio->type;
),
TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, "
- "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s%s, type = %s",
+ "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s(%s), type = %s_%s",
show_dev_ino(__entry),
(unsigned long)__entry->index,
(unsigned long long)__entry->old_blkaddr,
(unsigned long long)__entry->new_blkaddr,
- show_bio_type(__entry->op_flags),
+ show_bio_type(__entry->op, __entry->op_flags),
+ show_block_temp(__entry->temp),
show_block_type(__entry->type))
);
@@ -768,7 +1000,7 @@ DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_bio,
TP_CONDITION(page->mapping)
);
-DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_mbio,
+DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_write,
TP_PROTO(struct page *page, struct f2fs_io_info *fio),
@@ -777,15 +1009,15 @@ DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_mbio,
TP_CONDITION(page->mapping)
);
-DECLARE_EVENT_CLASS(f2fs__submit_bio,
+DECLARE_EVENT_CLASS(f2fs__bio,
- TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio,
- struct bio *bio),
+ TP_PROTO(struct super_block *sb, int type, struct bio *bio),
- TP_ARGS(sb, fio, bio),
+ TP_ARGS(sb, type, bio),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(dev_t, target)
__field(int, op)
__field(int, op_flags)
__field(int, type)
@@ -795,37 +1027,55 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio,
TP_fast_assign(
__entry->dev = sb->s_dev;
- __entry->op = fio->op;
- __entry->op_flags = fio->op_flags;
- __entry->type = fio->type;
+ __entry->target = bio->bi_bdev->bd_dev;
+ __entry->op = bio_op(bio);
+ __entry->op_flags = bio->bi_opf;
+ __entry->type = type;
__entry->sector = bio->bi_iter.bi_sector;
__entry->size = bio->bi_iter.bi_size;
),
- TP_printk("dev = (%d,%d), rw = %s%s, %s, sector = %lld, size = %u",
- show_dev(__entry),
- show_bio_type(__entry->op_flags),
+ TP_printk("dev = (%d,%d)/(%d,%d), rw = %s(%s), %s, sector = %lld, size = %u",
+ show_dev(__entry->target),
+ show_dev(__entry->dev),
+ show_bio_type(__entry->op, __entry->op_flags),
show_block_type(__entry->type),
(unsigned long long)__entry->sector,
__entry->size)
);
-DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_write_bio,
+DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_write_bio,
+
+ TP_PROTO(struct super_block *sb, int type, struct bio *bio),
+
+ TP_ARGS(sb, type, bio),
+
+ TP_CONDITION(bio)
+);
+
+DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_read_bio,
- TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio,
- struct bio *bio),
+ TP_PROTO(struct super_block *sb, int type, struct bio *bio),
- TP_ARGS(sb, fio, bio),
+ TP_ARGS(sb, type, bio),
TP_CONDITION(bio)
);
-DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_read_bio,
+DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_read_bio,
- TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio,
- struct bio *bio),
+ TP_PROTO(struct super_block *sb, int type, struct bio *bio),
- TP_ARGS(sb, fio, bio),
+ TP_ARGS(sb, type, bio),
+
+ TP_CONDITION(bio)
+);
+
+DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_write_bio,
+
+ TP_PROTO(struct super_block *sb, int type, struct bio *bio),
+
+ TP_ARGS(sb, type, bio),
TP_CONDITION(bio)
);
@@ -1084,16 +1334,16 @@ TRACE_EVENT(f2fs_write_checkpoint,
),
TP_printk("dev = (%d,%d), checkpoint for %s, state = %s",
- show_dev(__entry),
+ show_dev(__entry->dev),
show_cpreason(__entry->reason),
__entry->msg)
);
-TRACE_EVENT(f2fs_issue_discard,
+DECLARE_EVENT_CLASS(f2fs_discard,
- TP_PROTO(struct super_block *sb, block_t blkstart, block_t blklen),
+ TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen),
- TP_ARGS(sb, blkstart, blklen),
+ TP_ARGS(dev, blkstart, blklen),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -1102,40 +1352,85 @@ TRACE_EVENT(f2fs_issue_discard,
),
TP_fast_assign(
- __entry->dev = sb->s_dev;
+ __entry->dev = dev->bd_dev;
__entry->blkstart = blkstart;
__entry->blklen = blklen;
),
TP_printk("dev = (%d,%d), blkstart = 0x%llx, blklen = 0x%llx",
- show_dev(__entry),
+ show_dev(__entry->dev),
(unsigned long long)__entry->blkstart,
(unsigned long long)__entry->blklen)
);
+DEFINE_EVENT(f2fs_discard, f2fs_queue_discard,
+
+ TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen),
+
+ TP_ARGS(dev, blkstart, blklen)
+);
+
+DEFINE_EVENT(f2fs_discard, f2fs_issue_discard,
+
+ TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen),
+
+ TP_ARGS(dev, blkstart, blklen)
+);
+
+DEFINE_EVENT(f2fs_discard, f2fs_remove_discard,
+
+ TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen),
+
+ TP_ARGS(dev, blkstart, blklen)
+);
+
+TRACE_EVENT(f2fs_issue_reset_zone,
+
+ TP_PROTO(struct block_device *dev, block_t blkstart),
+
+ TP_ARGS(dev, blkstart),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(block_t, blkstart)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dev->bd_dev;
+ __entry->blkstart = blkstart;
+ ),
+
+ TP_printk("dev = (%d,%d), reset zone at block = 0x%llx",
+ show_dev(__entry->dev),
+ (unsigned long long)__entry->blkstart)
+);
+
TRACE_EVENT(f2fs_issue_flush,
- TP_PROTO(struct super_block *sb, unsigned int nobarrier,
- unsigned int flush_merge),
+ TP_PROTO(struct block_device *dev, unsigned int nobarrier,
+ unsigned int flush_merge, int ret),
- TP_ARGS(sb, nobarrier, flush_merge),
+ TP_ARGS(dev, nobarrier, flush_merge, ret),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned int, nobarrier)
__field(unsigned int, flush_merge)
+ __field(int, ret)
),
TP_fast_assign(
- __entry->dev = sb->s_dev;
+ __entry->dev = dev->bd_dev;
__entry->nobarrier = nobarrier;
__entry->flush_merge = flush_merge;
+ __entry->ret = ret;
),
- TP_printk("dev = (%d,%d), %s %s",
- show_dev(__entry),
+ TP_printk("dev = (%d,%d), %s %s, ret = %d",
+ show_dev(__entry->dev),
__entry->nobarrier ? "skip (nobarrier)" : "issue",
- __entry->flush_merge ? " with flush_merge" : "")
+ __entry->flush_merge ? " with flush_merge" : "",
+ __entry->ret)
);
TRACE_EVENT(f2fs_lookup_extent_tree_start,
@@ -1248,7 +1543,7 @@ TRACE_EVENT(f2fs_shrink_extent_tree,
),
TP_printk("dev = (%d,%d), shrunk: node_cnt = %u, tree_cnt = %u",
- show_dev(__entry),
+ show_dev(__entry->dev),
__entry->node_cnt,
__entry->tree_cnt)
);
@@ -1295,7 +1590,7 @@ DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes,
),
TP_printk("dev = (%d,%d), %s, dirty count = %lld",
- show_dev(__entry),
+ show_dev(__entry->dev),
show_file_type(__entry->type),
__entry->count)
);
diff --git a/include/trace/events/gpu.h b/include/trace/events/gpu.h
new file mode 100644
index 000000000000..7e15cdfafe5a
--- /dev/null
+++ b/include/trace/events/gpu.h
@@ -0,0 +1,143 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM gpu
+
+#if !defined(_TRACE_GPU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_GPU_H
+
+#include <linux/tracepoint.h>
+#include <linux/time.h>
+
+#define show_secs_from_ns(ns) \
+ ({ \
+ u64 t = ns + (NSEC_PER_USEC / 2); \
+ do_div(t, NSEC_PER_SEC); \
+ t; \
+ })
+
+#define show_usecs_from_ns(ns) \
+ ({ \
+ u64 t = ns + (NSEC_PER_USEC / 2) ; \
+ u32 rem; \
+ do_div(t, NSEC_PER_USEC); \
+ rem = do_div(t, USEC_PER_SEC); \
+ })
+
+/*
+ * The gpu_sched_switch event indicates that a switch from one GPU context to
+ * another occurred on one of the GPU hardware blocks.
+ *
+ * The gpu_name argument identifies the GPU hardware block. Each independently
+ * scheduled GPU hardware block should have a different name. This may be used
+ * in different ways for different GPUs. For example, if a GPU includes
+ * multiple processing cores it may use names "GPU 0", "GPU 1", etc. If a GPU
+ * includes a separately scheduled 2D and 3D hardware block, it might use the
+ * names "2D" and "3D".
+ *
+ * The timestamp argument is the timestamp at which the switch occurred on the
+ * GPU. These timestamps are in units of nanoseconds and must use
+ * approximately the same time as sched_clock, though they need not come from
+ * any CPU clock. The timestamps for a single hardware block must be
+ * monotonically nondecreasing. This means that if a variable compensation
+ * offset is used to translate from some other clock to the sched_clock, then
+ * care must be taken when increasing that offset, and doing so may result in
+ * multiple events with the same timestamp.
+ *
+ * The next_ctx_id argument identifies the next context that was running on
+ * the GPU hardware block. A value of 0 indicates that the hardware block
+ * will be idle.
+ *
+ * The next_prio argument indicates the priority of the next context at the
+ * time of the event. The exact numeric values may mean different things for
+ * different GPUs, but they should follow the rule that lower values indicate a
+ * higher priority.
+ *
+ * The next_job_id argument identifies the batch of work that the GPU will be
+ * working on. This should correspond to a job_id that was previously traced
+ * as a gpu_job_enqueue event when the batch of work was created.
+ */
+TRACE_EVENT(gpu_sched_switch,
+
+ TP_PROTO(const char *gpu_name, u64 timestamp,
+ u32 next_ctx_id, s32 next_prio, u32 next_job_id),
+
+ TP_ARGS(gpu_name, timestamp, next_ctx_id, next_prio, next_job_id),
+
+ TP_STRUCT__entry(
+ __string( gpu_name, gpu_name )
+ __field( u64, timestamp )
+ __field( u32, next_ctx_id )
+ __field( s32, next_prio )
+ __field( u32, next_job_id )
+ ),
+
+ TP_fast_assign(
+ __assign_str(gpu_name, gpu_name);
+ __entry->timestamp = timestamp;
+ __entry->next_ctx_id = next_ctx_id;
+ __entry->next_prio = next_prio;
+ __entry->next_job_id = next_job_id;
+ ),
+
+ TP_printk("gpu_name=%s ts=%llu.%06lu next_ctx_id=%lu next_prio=%ld "
+ "next_job_id=%lu",
+ __get_str(gpu_name),
+ (unsigned long long)show_secs_from_ns(__entry->timestamp),
+ (unsigned long)show_usecs_from_ns(__entry->timestamp),
+ (unsigned long)__entry->next_ctx_id,
+ (long)__entry->next_prio,
+ (unsigned long)__entry->next_job_id)
+);
+
+/*
+ * The gpu_job_enqueue event indicates that a batch of work has been queued up
+ * to be processed by the GPU. This event is not intended to indicate that
+ * the batch of work has been submitted to the GPU hardware, but rather that
+ * it has been submitted to the GPU kernel driver.
+ *
+ * This event should be traced on the thread that initiated the work being
+ * queued. For example, if a batch of work is submitted to the kernel by a
+ * userland thread, the event should be traced on that thread.
+ *
+ * The ctx_id field identifies the GPU context in which the batch of work
+ * being queued is to be run.
+ *
+ * The job_id field identifies the batch of work being queued within the given
+ * GPU context. The first batch of work submitted for a given GPU context
+ * should have a job_id of 0, and each subsequent batch of work should
+ * increment the job_id by 1.
+ *
+ * The type field identifies the type of the job being enqueued. The job
+ * types may be different for different GPU hardware. For example, a GPU may
+ * differentiate between "2D", "3D", and "compute" jobs.
+ */
+TRACE_EVENT(gpu_job_enqueue,
+
+ TP_PROTO(u32 ctx_id, u32 job_id, const char *type),
+
+ TP_ARGS(ctx_id, job_id, type),
+
+ TP_STRUCT__entry(
+ __field( u32, ctx_id )
+ __field( u32, job_id )
+ __string( type, type )
+ ),
+
+ TP_fast_assign(
+ __entry->ctx_id = ctx_id;
+ __entry->job_id = job_id;
+ __assign_str(type, type);
+ ),
+
+ TP_printk("ctx_id=%lu job_id=%lu type=%s",
+ (unsigned long)__entry->ctx_id,
+ (unsigned long)__entry->job_id,
+ __get_str(type))
+);
+
+#undef show_secs_from_ns
+#undef show_usecs_from_ns
+
+#endif /* _TRACE_GPU_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index 49cc7c3de252..89d009e10938 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -57,7 +57,7 @@ TRACE_EVENT(net_dev_start_xmit,
__entry->gso_type = skb_shinfo(skb)->gso_type;
),
- TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x",
+ TP_printk("dev=%s queue_mapping=%u skbaddr=%pK vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x",
__get_str(name), __entry->queue_mapping, __entry->skbaddr,
__entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci,
__entry->protocol, __entry->ip_summed, __entry->len,
@@ -90,7 +90,7 @@ TRACE_EVENT(net_dev_xmit,
__assign_str(name, dev->name);
),
- TP_printk("dev=%s skbaddr=%p len=%u rc=%d",
+ TP_printk("dev=%s skbaddr=%pK len=%u rc=%d",
__get_str(name), __entry->skbaddr, __entry->len, __entry->rc)
);
@@ -112,7 +112,7 @@ DECLARE_EVENT_CLASS(net_dev_template,
__assign_str(name, skb->dev->name);
),
- TP_printk("dev=%s skbaddr=%p len=%u",
+ TP_printk("dev=%s skbaddr=%pK len=%u",
__get_str(name), __entry->skbaddr, __entry->len)
)
@@ -191,7 +191,7 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,
__entry->gso_type = skb_shinfo(skb)->gso_type;
),
- TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x",
+ TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%pK vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x",
__get_str(name), __entry->napi_id, __entry->queue_mapping,
__entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto,
__entry->vlan_tci, __entry->protocol, __entry->ip_summed,
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 54e3aad32806..ec6f81561558 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -147,6 +147,38 @@ DEFINE_EVENT(cpu, cpu_frequency,
TP_ARGS(frequency, cpu_id)
);
+TRACE_EVENT(cpu_frequency_limits,
+
+ TP_PROTO(unsigned int max_freq, unsigned int min_freq,
+ unsigned int cpu_id),
+
+ TP_ARGS(max_freq, min_freq, cpu_id),
+
+ TP_STRUCT__entry(
+ __field( u32, min_freq )
+ __field( u32, max_freq )
+ __field( u32, cpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->min_freq = min_freq;
+ __entry->max_freq = max_freq;
+ __entry->cpu_id = cpu_id;
+ ),
+
+ TP_printk("min=%lu max=%lu cpu_id=%lu",
+ (unsigned long)__entry->min_freq,
+ (unsigned long)__entry->max_freq,
+ (unsigned long)__entry->cpu_id)
+);
+
+DEFINE_EVENT(cpu, cpu_capacity,
+
+ TP_PROTO(unsigned int capacity, unsigned int cpu_id),
+
+ TP_ARGS(capacity, cpu_id)
+);
+
TRACE_EVENT(device_pm_callback_start,
TP_PROTO(struct device *dev, const char *pm_ops, int event),
@@ -300,6 +332,25 @@ DEFINE_EVENT(clock, clock_set_rate,
TP_ARGS(name, state, cpu_id)
);
+TRACE_EVENT(clock_set_parent,
+
+ TP_PROTO(const char *name, const char *parent_name),
+
+ TP_ARGS(name, parent_name),
+
+ TP_STRUCT__entry(
+ __string( name, name )
+ __string( parent_name, parent_name )
+ ),
+
+ TP_fast_assign(
+ __assign_str(name, name);
+ __assign_str(parent_name, parent_name);
+ ),
+
+ TP_printk("%s parent=%s", __get_str(name), __get_str(parent_name))
+);
+
/*
* The power domain events are used for power domains transitions
*/
diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h
new file mode 100644
index 000000000000..9c4eb33c5a1d
--- /dev/null
+++ b/include/trace/events/preemptirq.h
@@ -0,0 +1,73 @@
+#ifdef CONFIG_PREEMPTIRQ_EVENTS
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM preemptirq
+
+#if !defined(_TRACE_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PREEMPTIRQ_H
+
+#include <linux/ktime.h>
+#include <linux/tracepoint.h>
+#include <linux/string.h>
+#include <asm/sections.h>
+
+DECLARE_EVENT_CLASS(preemptirq_template,
+
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+
+ TP_ARGS(ip, parent_ip),
+
+ TP_STRUCT__entry(
+ __field(u32, caller_offs)
+ __field(u32, parent_offs)
+ ),
+
+ TP_fast_assign(
+ __entry->caller_offs = (u32)(ip - (unsigned long)_stext);
+ __entry->parent_offs = (u32)(parent_ip - (unsigned long)_stext);
+ ),
+
+ TP_printk("caller=%pF parent=%pF",
+ (void *)((unsigned long)(_stext) + __entry->caller_offs),
+ (void *)((unsigned long)(_stext) + __entry->parent_offs))
+);
+
+#ifndef CONFIG_PROVE_LOCKING
+DEFINE_EVENT(preemptirq_template, irq_disable,
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+ TP_ARGS(ip, parent_ip));
+
+DEFINE_EVENT(preemptirq_template, irq_enable,
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+ TP_ARGS(ip, parent_ip));
+#endif
+
+#ifdef CONFIG_DEBUG_PREEMPT
+DEFINE_EVENT(preemptirq_template, preempt_disable,
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+ TP_ARGS(ip, parent_ip));
+
+DEFINE_EVENT(preemptirq_template, preempt_enable,
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+ TP_ARGS(ip, parent_ip));
+#endif
+
+#endif /* _TRACE_PREEMPTIRQ_H */
+
+#include <trace/define_trace.h>
+
+#endif /* !CONFIG_PREEMPTIRQ_EVENTS */
+
+#if !defined(CONFIG_PREEMPTIRQ_EVENTS) || defined(CONFIG_PROVE_LOCKING)
+#define trace_irq_enable(...)
+#define trace_irq_disable(...)
+#define trace_irq_enable_rcuidle(...)
+#define trace_irq_disable_rcuidle(...)
+#endif
+
+#if !defined(CONFIG_PREEMPTIRQ_EVENTS) || !defined(CONFIG_DEBUG_PREEMPT)
+#define trace_preempt_enable(...)
+#define trace_preempt_disable(...)
+#define trace_preempt_enable_rcuidle(...)
+#define trace_preempt_disable_rcuidle(...)
+#endif
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9b90c57517a9..f5780473a5c1 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -219,7 +219,7 @@ DECLARE_EVENT_CLASS(sched_process_template,
DEFINE_EVENT(sched_process_template, sched_process_free,
TP_PROTO(struct task_struct *p),
TP_ARGS(p));
-
+
/*
* Tracepoint for a task exiting:
@@ -374,6 +374,30 @@ DEFINE_EVENT(sched_stat_template, sched_stat_blocked,
TP_ARGS(tsk, delay));
/*
+ * Tracepoint for recording the cause of uninterruptible sleep.
+ */
+TRACE_EVENT(sched_blocked_reason,
+
+ TP_PROTO(struct task_struct *tsk),
+
+ TP_ARGS(tsk),
+
+ TP_STRUCT__entry(
+ __field( pid_t, pid )
+ __field( void*, caller )
+ __field( bool, io_wait )
+ ),
+
+ TP_fast_assign(
+ __entry->pid = tsk->pid;
+ __entry->caller = (void*)get_wchan(tsk);
+ __entry->io_wait = tsk->in_iowait;
+ ),
+
+ TP_printk("pid=%d iowait=%d caller=%pS", __entry->pid, __entry->io_wait, __entry->caller)
+);
+
+/*
* Tracepoint for accounting runtime (time the task is executing
* on a CPU).
*/
@@ -562,6 +586,520 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
TP_printk("cpu=%d", __entry->cpu)
);
+
+TRACE_EVENT(sched_contrib_scale_f,
+
+ TP_PROTO(int cpu, unsigned long freq_scale_factor,
+ unsigned long cpu_scale_factor),
+
+ TP_ARGS(cpu, freq_scale_factor, cpu_scale_factor),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(unsigned long, freq_scale_factor)
+ __field(unsigned long, cpu_scale_factor)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->freq_scale_factor = freq_scale_factor;
+ __entry->cpu_scale_factor = cpu_scale_factor;
+ ),
+
+ TP_printk("cpu=%d freq_scale_factor=%lu cpu_scale_factor=%lu",
+ __entry->cpu, __entry->freq_scale_factor,
+ __entry->cpu_scale_factor)
+);
+
+#ifdef CONFIG_SMP
+
+#ifdef CONFIG_SCHED_WALT
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int sysctl_sched_use_walt_task_util;
+extern unsigned int walt_ravg_window;
+extern bool walt_disabled;
+#endif
+
+/*
+ * Tracepoint for accounting sched averages for tasks.
+ */
+TRACE_EVENT(sched_load_avg_task,
+
+ TP_PROTO(struct task_struct *tsk, struct sched_avg *avg, void *_ravg),
+
+ TP_ARGS(tsk, avg, _ravg),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( int, cpu )
+ __field( unsigned long, load_avg )
+ __field( unsigned long, util_avg )
+ __field( unsigned long, util_avg_pelt )
+ __field( u32, util_avg_walt )
+ __field( u64, load_sum )
+ __field( u32, util_sum )
+ __field( u32, period_contrib )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->cpu = task_cpu(tsk);
+ __entry->load_avg = avg->load_avg;
+ __entry->util_avg = avg->util_avg;
+ __entry->load_sum = avg->load_sum;
+ __entry->util_sum = avg->util_sum;
+ __entry->period_contrib = avg->period_contrib;
+ __entry->util_avg_pelt = avg->util_avg;
+ __entry->util_avg_walt = 0;
+#ifdef CONFIG_SCHED_WALT
+ __entry->util_avg_walt = ((struct ravg*)_ravg)->demand /
+ (walt_ravg_window >> SCHED_CAPACITY_SHIFT);
+ if (!walt_disabled && sysctl_sched_use_walt_task_util)
+ __entry->util_avg = __entry->util_avg_walt;
+#endif
+ ),
+ TP_printk("comm=%s pid=%d cpu=%d load_avg=%lu util_avg=%lu "
+ "util_avg_pelt=%lu util_avg_walt=%u load_sum=%llu"
+ " util_sum=%u period_contrib=%u",
+ __entry->comm,
+ __entry->pid,
+ __entry->cpu,
+ __entry->load_avg,
+ __entry->util_avg,
+ __entry->util_avg_pelt,
+ __entry->util_avg_walt,
+ (u64)__entry->load_sum,
+ (u32)__entry->util_sum,
+ (u32)__entry->period_contrib)
+);
+
+/*
+ * Tracepoint for accounting sched averages for cpus.
+ */
+TRACE_EVENT(sched_load_avg_cpu,
+
+ TP_PROTO(int cpu, struct cfs_rq *cfs_rq),
+
+ TP_ARGS(cpu, cfs_rq),
+
+ TP_STRUCT__entry(
+ __field( int, cpu )
+ __field( unsigned long, load_avg )
+ __field( unsigned long, util_avg )
+ __field( unsigned long, util_avg_pelt )
+ __field( u32, util_avg_walt )
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->load_avg = cfs_rq->avg.load_avg;
+ __entry->util_avg = cfs_rq->avg.util_avg;
+ __entry->util_avg_pelt = cfs_rq->avg.util_avg;
+ __entry->util_avg_walt = 0;
+#ifdef CONFIG_SCHED_WALT
+ __entry->util_avg_walt = div64_ul(cpu_rq(cpu)->prev_runnable_sum,
+ walt_ravg_window >> SCHED_CAPACITY_SHIFT);
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
+ __entry->util_avg = __entry->util_avg_walt;
+#endif
+ ),
+
+ TP_printk("cpu=%d load_avg=%lu util_avg=%lu "
+ "util_avg_pelt=%lu util_avg_walt=%u",
+ __entry->cpu, __entry->load_avg, __entry->util_avg,
+ __entry->util_avg_pelt, __entry->util_avg_walt)
+);
+
+/*
+ * Tracepoint for sched_tune_config settings
+ */
+TRACE_EVENT(sched_tune_config,
+
+ TP_PROTO(int boost),
+
+ TP_ARGS(boost),
+
+ TP_STRUCT__entry(
+ __field( int, boost )
+ ),
+
+ TP_fast_assign(
+ __entry->boost = boost;
+ ),
+
+ TP_printk("boost=%d ", __entry->boost)
+);
+
+/*
+ * Tracepoint for accounting CPU boosted utilization
+ */
+TRACE_EVENT(sched_boost_cpu,
+
+ TP_PROTO(int cpu, unsigned long util, long margin),
+
+ TP_ARGS(cpu, util, margin),
+
+ TP_STRUCT__entry(
+ __field( int, cpu )
+ __field( unsigned long, util )
+ __field(long, margin )
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->util = util;
+ __entry->margin = margin;
+ ),
+
+ TP_printk("cpu=%d util=%lu margin=%ld",
+ __entry->cpu,
+ __entry->util,
+ __entry->margin)
+);
+
+/*
+ * Tracepoint for schedtune_tasks_update
+ */
+TRACE_EVENT(sched_tune_tasks_update,
+
+ TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx,
+ int boost, int max_boost),
+
+ TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( int, cpu )
+ __field( int, tasks )
+ __field( int, idx )
+ __field( int, boost )
+ __field( int, max_boost )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->cpu = cpu;
+ __entry->tasks = tasks;
+ __entry->idx = idx;
+ __entry->boost = boost;
+ __entry->max_boost = max_boost;
+ ),
+
+ TP_printk("pid=%d comm=%s "
+ "cpu=%d tasks=%d idx=%d boost=%d max_boost=%d",
+ __entry->pid, __entry->comm,
+ __entry->cpu, __entry->tasks, __entry->idx,
+ __entry->boost, __entry->max_boost)
+);
+
+/*
+ * Tracepoint for schedtune_boostgroup_update
+ */
+TRACE_EVENT(sched_tune_boostgroup_update,
+
+ TP_PROTO(int cpu, int variation, int max_boost),
+
+ TP_ARGS(cpu, variation, max_boost),
+
+ TP_STRUCT__entry(
+ __field( int, cpu )
+ __field( int, variation )
+ __field( int, max_boost )
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->variation = variation;
+ __entry->max_boost = max_boost;
+ ),
+
+ TP_printk("cpu=%d variation=%d max_boost=%d",
+ __entry->cpu, __entry->variation, __entry->max_boost)
+);
+
+/*
+ * Tracepoint for accounting task boosted utilization
+ */
+TRACE_EVENT(sched_boost_task,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long util, long margin),
+
+ TP_ARGS(tsk, util, margin),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( unsigned long, util )
+ __field( long, margin )
+
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->util = util;
+ __entry->margin = margin;
+ ),
+
+ TP_printk("comm=%s pid=%d util=%lu margin=%ld",
+ __entry->comm, __entry->pid,
+ __entry->util,
+ __entry->margin)
+);
+
+/*
+ * Tracepoint for find_best_target
+ */
+TRACE_EVENT(sched_find_best_target,
+
+ TP_PROTO(struct task_struct *tsk, bool prefer_idle,
+ unsigned long min_util, int start_cpu,
+ int best_idle, int best_active, int target),
+
+ TP_ARGS(tsk, prefer_idle, min_util, start_cpu,
+ best_idle, best_active, target),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( unsigned long, min_util )
+ __field( bool, prefer_idle )
+ __field( int, start_cpu )
+ __field( int, best_idle )
+ __field( int, best_active )
+ __field( int, target )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->min_util = min_util;
+ __entry->prefer_idle = prefer_idle;
+ __entry->start_cpu = start_cpu;
+ __entry->best_idle = best_idle;
+ __entry->best_active = best_active;
+ __entry->target = target;
+ ),
+
+ TP_printk("pid=%d comm=%s prefer_idle=%d start_cpu=%d "
+ "best_idle=%d best_active=%d target=%d",
+ __entry->pid, __entry->comm,
+ __entry->prefer_idle, __entry->start_cpu,
+ __entry->best_idle, __entry->best_active,
+ __entry->target)
+);
+
+/*
+ * Tracepoint for schedtune_tasks_update
+ */
+TRACE_EVENT(sched_tune_filter,
+
+ TP_PROTO(int nrg_delta, int cap_delta,
+ int nrg_gain, int cap_gain,
+ int payoff, int region),
+
+ TP_ARGS(nrg_delta, cap_delta, nrg_gain, cap_gain, payoff, region),
+
+ TP_STRUCT__entry(
+ __field( int, nrg_delta )
+ __field( int, cap_delta )
+ __field( int, nrg_gain )
+ __field( int, cap_gain )
+ __field( int, payoff )
+ __field( int, region )
+ ),
+
+ TP_fast_assign(
+ __entry->nrg_delta = nrg_delta;
+ __entry->cap_delta = cap_delta;
+ __entry->nrg_gain = nrg_gain;
+ __entry->cap_gain = cap_gain;
+ __entry->payoff = payoff;
+ __entry->region = region;
+ ),
+
+ TP_printk("nrg_delta=%d cap_delta=%d nrg_gain=%d cap_gain=%d payoff=%d region=%d",
+ __entry->nrg_delta, __entry->cap_delta,
+ __entry->nrg_gain, __entry->cap_gain,
+ __entry->payoff, __entry->region)
+);
+
+/*
+ * Tracepoint for system overutilized flag
+ */
+TRACE_EVENT(sched_overutilized,
+
+ TP_PROTO(bool overutilized),
+
+ TP_ARGS(overutilized),
+
+ TP_STRUCT__entry(
+ __field( bool, overutilized )
+ ),
+
+ TP_fast_assign(
+ __entry->overutilized = overutilized;
+ ),
+
+ TP_printk("overutilized=%d",
+ __entry->overutilized ? 1 : 0)
+);
+#ifdef CONFIG_SCHED_WALT
+struct rq;
+
+TRACE_EVENT(walt_update_task_ravg,
+
+ TP_PROTO(struct task_struct *p, struct rq *rq, int evt,
+ u64 wallclock, u64 irqtime),
+
+ TP_ARGS(p, rq, evt, wallclock, irqtime),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( pid_t, cur_pid )
+ __field( u64, wallclock )
+ __field( u64, mark_start )
+ __field( u64, delta_m )
+ __field( u64, win_start )
+ __field( u64, delta )
+ __field( u64, irqtime )
+ __field( int, evt )
+ __field(unsigned int, demand )
+ __field(unsigned int, sum )
+ __field( int, cpu )
+ __field( u64, cs )
+ __field( u64, ps )
+ __field(unsigned long, util )
+ __field( u32, curr_window )
+ __field( u32, prev_window )
+ __field( u64, nt_cs )
+ __field( u64, nt_ps )
+ __field( u32, active_windows )
+ ),
+
+ TP_fast_assign(
+ __entry->wallclock = wallclock;
+ __entry->win_start = rq->window_start;
+ __entry->delta = (wallclock - rq->window_start);
+ __entry->evt = evt;
+ __entry->cpu = rq->cpu;
+ __entry->cur_pid = rq->curr->pid;
+ memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+ __entry->pid = p->pid;
+ __entry->mark_start = p->ravg.mark_start;
+ __entry->delta_m = (wallclock - p->ravg.mark_start);
+ __entry->demand = p->ravg.demand;
+ __entry->sum = p->ravg.sum;
+ __entry->irqtime = irqtime;
+ __entry->cs = rq->curr_runnable_sum;
+ __entry->ps = rq->prev_runnable_sum;
+ __entry->util = rq->prev_runnable_sum << SCHED_CAPACITY_SHIFT;
+ do_div(__entry->util, walt_ravg_window);
+ __entry->curr_window = p->ravg.curr_window;
+ __entry->prev_window = p->ravg.prev_window;
+ __entry->nt_cs = rq->nt_curr_runnable_sum;
+ __entry->nt_ps = rq->nt_prev_runnable_sum;
+ __entry->active_windows = p->ravg.active_windows;
+ ),
+
+ TP_printk("wc %llu ws %llu delta %llu event %d cpu %d cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu"
+ " cs %llu ps %llu util %lu cur_window %u prev_window %u active_wins %u"
+ , __entry->wallclock, __entry->win_start, __entry->delta,
+ __entry->evt, __entry->cpu, __entry->cur_pid,
+ __entry->pid, __entry->comm, __entry->mark_start,
+ __entry->delta_m, __entry->demand,
+ __entry->sum, __entry->irqtime,
+ __entry->cs, __entry->ps, __entry->util,
+ __entry->curr_window, __entry->prev_window,
+ __entry->active_windows
+ )
+);
+
+TRACE_EVENT(walt_update_history,
+
+ TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples,
+ int evt),
+
+ TP_ARGS(rq, p, runtime, samples, evt),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field(unsigned int, runtime )
+ __field( int, samples )
+ __field( int, evt )
+ __field( u64, demand )
+ __field( u64, walt_avg )
+ __field(unsigned int, pelt_avg )
+ __array( u32, hist, RAVG_HIST_SIZE_MAX)
+ __field( int, cpu )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+ __entry->pid = p->pid;
+ __entry->runtime = runtime;
+ __entry->samples = samples;
+ __entry->evt = evt;
+ __entry->demand = p->ravg.demand;
+ __entry->walt_avg = (__entry->demand << SCHED_CAPACITY_SHIFT);
+ __entry->walt_avg = div_u64(__entry->walt_avg,
+ walt_ravg_window);
+ __entry->pelt_avg = p->se.avg.util_avg;
+ memcpy(__entry->hist, p->ravg.sum_history,
+ RAVG_HIST_SIZE_MAX * sizeof(u32));
+ __entry->cpu = rq->cpu;
+ ),
+
+ TP_printk("%d (%s): runtime %u samples %d event %d demand %llu"
+ " walt %llu pelt %u (hist: %u %u %u %u %u) cpu %d",
+ __entry->pid, __entry->comm,
+ __entry->runtime, __entry->samples, __entry->evt,
+ __entry->demand,
+ __entry->walt_avg,
+ __entry->pelt_avg,
+ __entry->hist[0], __entry->hist[1],
+ __entry->hist[2], __entry->hist[3],
+ __entry->hist[4], __entry->cpu)
+);
+
+TRACE_EVENT(walt_migration_update_sum,
+
+ TP_PROTO(struct rq *rq, struct task_struct *p),
+
+ TP_ARGS(rq, p),
+
+ TP_STRUCT__entry(
+ __field(int, cpu )
+ __field(int, pid )
+ __field( u64, cs )
+ __field( u64, ps )
+ __field( s64, nt_cs )
+ __field( s64, nt_ps )
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu_of(rq);
+ __entry->cs = rq->curr_runnable_sum;
+ __entry->ps = rq->prev_runnable_sum;
+ __entry->nt_cs = (s64)rq->nt_curr_runnable_sum;
+ __entry->nt_ps = (s64)rq->nt_prev_runnable_sum;
+ __entry->pid = p->pid;
+ ),
+
+ TP_printk("cpu %d: cs %llu ps %llu nt_cs %lld nt_ps %lld pid %d",
+ __entry->cpu, __entry->cs, __entry->ps,
+ __entry->nt_cs, __entry->nt_ps, __entry->pid)
+);
+#endif /* CONFIG_SCHED_WALT */
+
+#endif /* CONFIG_SMP */
+
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 67d632f1743d..2d078c20abcb 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -92,4 +92,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h
index 41420e341e75..bd0da0e992b8 100644
--- a/include/uapi/linux/android/binder.h
+++ b/include/uapi/linux/android/binder.h
@@ -33,11 +33,60 @@ enum {
BINDER_TYPE_HANDLE = B_PACK_CHARS('s', 'h', '*', B_TYPE_LARGE),
BINDER_TYPE_WEAK_HANDLE = B_PACK_CHARS('w', 'h', '*', B_TYPE_LARGE),
BINDER_TYPE_FD = B_PACK_CHARS('f', 'd', '*', B_TYPE_LARGE),
+ BINDER_TYPE_FDA = B_PACK_CHARS('f', 'd', 'a', B_TYPE_LARGE),
+ BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE),
};
-enum {
+/**
+ * enum flat_binder_object_shifts: shift values for flat_binder_object_flags
+ * @FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT: shift for getting scheduler policy.
+ *
+ */
+enum flat_binder_object_shifts {
+ FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT = 9,
+};
+
+/**
+ * enum flat_binder_object_flags - flags for use in flat_binder_object.flags
+ */
+enum flat_binder_object_flags {
+ /**
+ * @FLAT_BINDER_FLAG_PRIORITY_MASK: bit-mask for min scheduler priority
+ *
+ * These bits can be used to set the minimum scheduler priority
+ * at which transactions into this node should run. Valid values
+ * in these bits depend on the scheduler policy encoded in
+ * @FLAT_BINDER_FLAG_SCHED_POLICY_MASK.
+ *
+ * For SCHED_NORMAL/SCHED_BATCH, the valid range is between [-20..19]
+ * For SCHED_FIFO/SCHED_RR, the value can run between [1..99]
+ */
FLAT_BINDER_FLAG_PRIORITY_MASK = 0xff,
+ /**
+ * @FLAT_BINDER_FLAG_ACCEPTS_FDS: whether the node accepts fds.
+ */
FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100,
+ /**
+ * @FLAT_BINDER_FLAG_SCHED_POLICY_MASK: bit-mask for scheduling policy
+ *
+ * These two bits can be used to set the min scheduling policy at which
+ * transactions on this node should run. These match the UAPI
+ * scheduler policy values, eg:
+ * 00b: SCHED_NORMAL
+ * 01b: SCHED_FIFO
+ * 10b: SCHED_RR
+ * 11b: SCHED_BATCH
+ */
+ FLAT_BINDER_FLAG_SCHED_POLICY_MASK =
+ 3U << FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT,
+
+ /**
+ * @FLAT_BINDER_FLAG_INHERIT_RT: whether the node inherits RT policy
+ *
+ * Only when set, calls into this node will inherit a real-time
+ * scheduling policy from the caller (for synchronous transactions).
+ */
+ FLAT_BINDER_FLAG_INHERIT_RT = 0x800,
};
#ifdef BINDER_IPC_32BIT
@@ -48,6 +97,14 @@ typedef __u64 binder_size_t;
typedef __u64 binder_uintptr_t;
#endif
+/**
+ * struct binder_object_header - header shared by all binder metadata objects.
+ * @type: type of the object
+ */
+struct binder_object_header {
+ __u32 type;
+};
+
/*
* This is the flattened representation of a Binder object for transfer
* between processes. The 'offsets' supplied as part of a binder transaction
@@ -56,9 +113,8 @@ typedef __u64 binder_uintptr_t;
* between processes.
*/
struct flat_binder_object {
- /* 8 bytes for large_flat_header. */
- __u32 type;
- __u32 flags;
+ struct binder_object_header hdr;
+ __u32 flags;
/* 8 bytes of data. */
union {
@@ -70,6 +126,86 @@ struct flat_binder_object {
binder_uintptr_t cookie;
};
+/**
+ * struct binder_fd_object - describes a filedescriptor to be fixed up.
+ * @hdr: common header structure
+ * @pad_flags: padding to remain compatible with old userspace code
+ * @pad_binder: padding to remain compatible with old userspace code
+ * @fd: file descriptor
+ * @cookie: opaque data, used by user-space
+ */
+struct binder_fd_object {
+ struct binder_object_header hdr;
+ __u32 pad_flags;
+ union {
+ binder_uintptr_t pad_binder;
+ __u32 fd;
+ };
+
+ binder_uintptr_t cookie;
+};
+
+/* struct binder_buffer_object - object describing a userspace buffer
+ * @hdr: common header structure
+ * @flags: one or more BINDER_BUFFER_* flags
+ * @buffer: address of the buffer
+ * @length: length of the buffer
+ * @parent: index in offset array pointing to parent buffer
+ * @parent_offset: offset in @parent pointing to this buffer
+ *
+ * A binder_buffer object represents an object that the
+ * binder kernel driver can copy verbatim to the target
+ * address space. A buffer itself may be pointed to from
+ * within another buffer, meaning that the pointer inside
+ * that other buffer needs to be fixed up as well. This
+ * can be done by setting the BINDER_BUFFER_FLAG_HAS_PARENT
+ * flag in @flags, by setting @parent buffer to the index
+ * in the offset array pointing to the parent binder_buffer_object,
+ * and by setting @parent_offset to the offset in the parent buffer
+ * at which the pointer to this buffer is located.
+ */
+struct binder_buffer_object {
+ struct binder_object_header hdr;
+ __u32 flags;
+ binder_uintptr_t buffer;
+ binder_size_t length;
+ binder_size_t parent;
+ binder_size_t parent_offset;
+};
+
+enum {
+ BINDER_BUFFER_FLAG_HAS_PARENT = 0x01,
+};
+
+/* struct binder_fd_array_object - object describing an array of fds in a buffer
+ * @hdr: common header structure
+ * @pad: padding to ensure correct alignment
+ * @num_fds: number of file descriptors in the buffer
+ * @parent: index in offset array to buffer holding the fd array
+ * @parent_offset: start offset of fd array in the buffer
+ *
+ * A binder_fd_array object represents an array of file
+ * descriptors embedded in a binder_buffer_object. It is
+ * different from a regular binder_buffer_object because it
+ * describes a list of file descriptors to fix up, not an opaque
+ * blob of memory, and hence the kernel needs to treat it differently.
+ *
+ * An example of how this would be used is with Android's
+ * native_handle_t object, which is a struct with a list of integers
+ * and a list of file descriptors. The native_handle_t struct itself
+ * will be represented by a struct binder_buffer_objct, whereas the
+ * embedded list of file descriptors is represented by a
+ * struct binder_fd_array_object with that binder_buffer_object as
+ * a parent.
+ */
+struct binder_fd_array_object {
+ struct binder_object_header hdr;
+ __u32 pad;
+ binder_size_t num_fds;
+ binder_size_t parent;
+ binder_size_t parent_offset;
+};
+
/*
* On 64-bit platforms where user code may run in 32-bits the driver must
* translate the buffer (and local binder) addresses appropriately.
@@ -97,6 +233,28 @@ struct binder_version {
#define BINDER_CURRENT_PROTOCOL_VERSION 8
#endif
+/*
+ * Use with BINDER_GET_NODE_DEBUG_INFO, driver reads ptr, writes to all fields.
+ * Set ptr to NULL for the first call to get the info for the first node, and
+ * then repeat the call passing the previously returned value to get the next
+ * nodes. ptr will be 0 when there are no more nodes.
+ */
+struct binder_node_debug_info {
+ binder_uintptr_t ptr;
+ binder_uintptr_t cookie;
+ __u32 has_strong_ref;
+ __u32 has_weak_ref;
+};
+
+struct binder_node_info_for_ref {
+ __u32 handle;
+ __u32 strong_count;
+ __u32 weak_count;
+ __u32 reserved1;
+ __u32 reserved2;
+ __u32 reserved3;
+};
+
#define BINDER_WRITE_READ _IOWR('b', 1, struct binder_write_read)
#define BINDER_SET_IDLE_TIMEOUT _IOW('b', 3, __s64)
#define BINDER_SET_MAX_THREADS _IOW('b', 5, __u32)
@@ -104,6 +262,8 @@ struct binder_version {
#define BINDER_SET_CONTEXT_MGR _IOW('b', 7, __s32)
#define BINDER_THREAD_EXIT _IOW('b', 8, __s32)
#define BINDER_VERSION _IOWR('b', 9, struct binder_version)
+#define BINDER_GET_NODE_DEBUG_INFO _IOWR('b', 11, struct binder_node_debug_info)
+#define BINDER_GET_NODE_INFO_FOR_REF _IOWR('b', 12, struct binder_node_info_for_ref)
/*
* NOTE: Two special error codes you should check for when calling
@@ -162,6 +322,11 @@ struct binder_transaction_data {
} data;
};
+struct binder_transaction_data_sg {
+ struct binder_transaction_data transaction_data;
+ binder_size_t buffers_size;
+};
+
struct binder_ptr_cookie {
binder_uintptr_t ptr;
binder_uintptr_t cookie;
@@ -346,6 +511,12 @@ enum binder_driver_command_protocol {
/*
* void *: cookie
*/
+
+ BC_TRANSACTION_SG = _IOW('c', 17, struct binder_transaction_data_sg),
+ BC_REPLY_SG = _IOW('c', 18, struct binder_transaction_data_sg),
+ /*
+ * binder_transaction_data_sg: the sent command.
+ */
};
#endif /* _UAPI_LINUX_BINDER_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f09c70b97eca..a339bea1f4c8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -73,6 +73,8 @@ enum bpf_cmd {
BPF_PROG_LOAD,
BPF_OBJ_PIN,
BPF_OBJ_GET,
+ BPF_PROG_ATTACH,
+ BPF_PROG_DETACH,
};
enum bpf_map_type {
@@ -96,8 +98,23 @@ enum bpf_prog_type {
BPF_PROG_TYPE_TRACEPOINT,
BPF_PROG_TYPE_XDP,
BPF_PROG_TYPE_PERF_EVENT,
+ BPF_PROG_TYPE_CGROUP_SKB,
};
+enum bpf_attach_type {
+ BPF_CGROUP_INET_INGRESS,
+ BPF_CGROUP_INET_EGRESS,
+ __MAX_BPF_ATTACH_TYPE
+};
+
+#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
+
+/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command
+ * to the given target_fd cgroup the descendent cgroup will be able to
+ * override effective bpf program that was inherited from this cgroup
+ */
+#define BPF_F_ALLOW_OVERRIDE (1U << 0)
+
#define BPF_PSEUDO_MAP_FD 1
/* flags for BPF_MAP_UPDATE_ELEM command */
@@ -107,6 +124,10 @@ enum bpf_prog_type {
#define BPF_F_NO_PREALLOC (1U << 0)
+/* Flags for accessing BPF object */
+#define BPF_F_RDONLY (1U << 3)
+#define BPF_F_WRONLY (1U << 4)
+
union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */
__u32 map_type; /* one of enum bpf_map_type */
@@ -140,6 +161,14 @@ union bpf_attr {
struct { /* anonymous struct used by BPF_OBJ_* commands */
__aligned_u64 pathname;
__u32 bpf_fd;
+ __u32 file_flags;
+ };
+
+ struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
+ __u32 target_fd; /* container object to attach to */
+ __u32 attach_bpf_fd; /* eBPF program to attach */
+ __u32 attach_type;
+ __u32 attach_flags;
};
} __attribute__((aligned(8)));
@@ -426,6 +455,67 @@ enum bpf_func_id {
*/
BPF_FUNC_set_hash_invalid,
+ /**
+ * int bpf_get_numa_node_id()
+ * Return: Id of current NUMA node.
+ */
+ BPF_FUNC_get_numa_node_id,
+
+ /**
+ * int bpf_skb_change_head()
+ * Grows headroom of skb and adjusts MAC header offset accordingly.
+ * Will extends/reallocae as required automatically.
+ * May change skb data pointer and will thus invalidate any check
+ * performed for direct packet access.
+ * @skb: pointer to skb
+ * @len: length of header to be pushed in front
+ * @flags: Flags (unused for now)
+ * Return: 0 on success or negative error
+ */
+ BPF_FUNC_skb_change_head,
+
+ /**
+ * int bpf_xdp_adjust_head(xdp_md, delta)
+ * Adjust the xdp_md.data by delta
+ * @xdp_md: pointer to xdp_md
+ * @delta: An positive/negative integer to be added to xdp_md.data
+ * Return: 0 on success or negative on error
+ */
+ BPF_FUNC_xdp_adjust_head,
+
+ /**
+ * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
+ * Copy a NUL terminated string from unsafe address. In case the string
+ * length is smaller than size, the target is not padded with further NUL
+ * bytes. In case the string length is larger than size, just count-1
+ * bytes are copied and the last byte is set to NUL.
+ * @dst: destination address
+ * @size: maximum number of bytes to copy, including the trailing NUL
+ * @unsafe_ptr: unsafe address
+ * Return:
+ * > 0 length of the string including the trailing NUL on success
+ * < 0 error
+ */
+ BPF_FUNC_probe_read_str,
+
+ /**
+ * u64 bpf_bpf_get_socket_cookie(skb)
+ * Get the cookie for the socket stored inside sk_buff.
+ * @skb: pointer to skb
+ * Return: 8 Bytes non-decreasing number on success or 0 if the socket
+ * field is missing inside sk_buff
+ */
+ BPF_FUNC_get_socket_cookie,
+
+ /**
+ * u32 bpf_get_socket_uid(skb)
+ * Get the owner uid of the socket stored inside sk_buff.
+ * @skb: pointer to skb
+ * Return: uid of the socket owner on success or 0 if the socket pointer
+ * inside sk_buff is NULL
+ */
+ BPF_FUNC_get_socket_uid,
+
__BPF_FUNC_MAX_ID,
};
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index beed138bd359..f85ed3a5ef4d 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -43,6 +43,27 @@
/* (1U << 31) is reserved for signed error codes */
/*
+ * Set/Get write life time hints. {GET,SET}_RW_HINT operate on the
+ * underlying inode, while {GET,SET}_FILE_RW_HINT operate only on
+ * the specific file.
+ */
+#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11)
+#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
+#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13)
+#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14)
+
+/*
+ * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
+ * used to clear any hints previously set.
+ */
+#define RWF_WRITE_LIFE_NOT_SET 0
+#define RWH_WRITE_LIFE_NONE 1
+#define RWH_WRITE_LIFE_SHORT 2
+#define RWH_WRITE_LIFE_MEDIUM 3
+#define RWH_WRITE_LIFE_LONG 4
+#define RWH_WRITE_LIFE_EXTREME 5
+
+/*
* Types of directory notifications that may be requested.
*/
#define DN_ACCESS 0x00000001 /* File accessed */
diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h
index 14404b3ebb89..bbf02a63a011 100644
--- a/include/uapi/linux/fib_rules.h
+++ b/include/uapi/linux/fib_rules.h
@@ -29,6 +29,11 @@ struct fib_rule_hdr {
__u32 flags;
};
+struct fib_rule_uid_range {
+ __u32 start;
+ __u32 end;
+};
+
enum {
FRA_UNSPEC,
FRA_DST, /* destination address */
@@ -51,6 +56,7 @@ enum {
FRA_OIFNAME,
FRA_PAD,
FRA_L3MDEV, /* iif or oif is l3mdev goto its table */
+ FRA_UID_RANGE, /* UID range */
__FRA_MAX
};
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 474995568f35..af8da91455da 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -237,6 +237,8 @@ struct fsxattr {
#define FICLONERANGE _IOW(0x94, 13, struct file_clone_range)
#define FIDEDUPERANGE _IOWR(0x94, 54, struct file_dedupe_range)
+#define FIDTRIM _IOWR('f', 128, struct fstrim_range) /* Deep discard trim */
+
#define FS_IOC_GETFLAGS _IOR('f', 1, long)
#define FS_IOC_SETFLAGS _IOW('f', 2, long)
#define FS_IOC_GETVERSION _IOR('v', 1, long)
@@ -255,18 +257,51 @@ struct fsxattr {
/* Policy provided via an ioctl on the topmost directory */
#define FS_KEY_DESCRIPTOR_SIZE 8
+#define FS_POLICY_FLAGS_PAD_4 0x00
+#define FS_POLICY_FLAGS_PAD_8 0x01
+#define FS_POLICY_FLAGS_PAD_16 0x02
+#define FS_POLICY_FLAGS_PAD_32 0x03
+#define FS_POLICY_FLAGS_PAD_MASK 0x03
+#define FS_POLICY_FLAG_DIRECT_KEY 0x04 /* use master key directly */
+#define FS_POLICY_FLAGS_VALID 0x07
+
+/* Encryption algorithms */
+#define FS_ENCRYPTION_MODE_INVALID 0
+#define FS_ENCRYPTION_MODE_AES_256_XTS 1
+#define FS_ENCRYPTION_MODE_AES_256_GCM 2
+#define FS_ENCRYPTION_MODE_AES_256_CBC 3
+#define FS_ENCRYPTION_MODE_AES_256_CTS 4
+#define FS_ENCRYPTION_MODE_AES_128_CBC 5
+#define FS_ENCRYPTION_MODE_AES_128_CTS 6
+#define FS_ENCRYPTION_MODE_SPECK128_256_XTS 7 /* Removed, do not use. */
+#define FS_ENCRYPTION_MODE_SPECK128_256_CTS 8 /* Removed, do not use. */
+#define FS_ENCRYPTION_MODE_ADIANTUM 9
+
struct fscrypt_policy {
__u8 version;
__u8 contents_encryption_mode;
__u8 filenames_encryption_mode;
__u8 flags;
__u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
-} __packed;
+};
#define FS_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct fscrypt_policy)
#define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16])
#define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy)
+/* Parameters for passing an encryption key into the kernel keyring */
+#define FS_KEY_DESC_PREFIX "fscrypt:"
+#define FS_KEY_DESC_PREFIX_SIZE 8
+
+/* Structure that userspace passes to the kernel keyring */
+#define FS_MAX_KEY_SIZE 64
+
+struct fscrypt_key {
+ __u32 mode;
+ __u8 raw[FS_MAX_KEY_SIZE];
+ __u32 size;
+};
+
/*
* Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
*
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 42fa977e3b14..093237817ed0 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -375,6 +375,7 @@ enum fuse_opcode {
FUSE_READDIRPLUS = 44,
FUSE_RENAME2 = 45,
FUSE_LSEEK = 46,
+ FUSE_CANONICAL_PATH= 2016,
/* CUSE specific operations */
CUSE_INIT = 4096,
diff --git a/include/uapi/linux/hw_breakpoint.h b/include/uapi/linux/hw_breakpoint.h
index b04000a2296a..2b65efd19a46 100644
--- a/include/uapi/linux/hw_breakpoint.h
+++ b/include/uapi/linux/hw_breakpoint.h
@@ -4,7 +4,11 @@
enum {
HW_BREAKPOINT_LEN_1 = 1,
HW_BREAKPOINT_LEN_2 = 2,
+ HW_BREAKPOINT_LEN_3 = 3,
HW_BREAKPOINT_LEN_4 = 4,
+ HW_BREAKPOINT_LEN_5 = 5,
+ HW_BREAKPOINT_LEN_6 = 6,
+ HW_BREAKPOINT_LEN_7 = 7,
HW_BREAKPOINT_LEN_8 = 8,
};
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index b4fba662cd32..a39a43b7c4e8 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -444,6 +444,16 @@ enum {
#define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1)
+/* XFRM section */
+enum {
+ IFLA_XFRM_UNSPEC,
+ IFLA_XFRM_LINK,
+ IFLA_XFRM_IF_ID,
+ __IFLA_XFRM_MAX
+};
+
+#define IFLA_XFRM_MAX (__IFLA_XFRM_MAX - 1)
+
enum macsec_validation_type {
MACSEC_VALIDATE_DISABLED = 0,
MACSEC_VALIDATE_CHECK = 1,
diff --git a/include/uapi/linux/if_pppolac.h b/include/uapi/linux/if_pppolac.h
new file mode 100644
index 000000000000..b7eb8153ef66
--- /dev/null
+++ b/include/uapi/linux/if_pppolac.h
@@ -0,0 +1,33 @@
+/* include/uapi/linux/if_pppolac.h
+ *
+ * Header for PPP on L2TP Access Concentrator / PPPoLAC Socket (RFC 2661)
+ *
+ * Copyright (C) 2009 Google, Inc.
+ * Author: Chia-chi Yeh <chiachi@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _UAPI_LINUX_IF_PPPOLAC_H
+#define _UAPI_LINUX_IF_PPPOLAC_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+struct sockaddr_pppolac {
+ sa_family_t sa_family; /* AF_PPPOX */
+ unsigned int sa_protocol; /* PX_PROTO_OLAC */
+ int udp_socket;
+ struct __attribute__((packed)) {
+ __u16 tunnel, session;
+ } local, remote;
+} __attribute__((packed));
+
+#endif /* _UAPI_LINUX_IF_PPPOLAC_H */
diff --git a/include/uapi/linux/if_pppopns.h b/include/uapi/linux/if_pppopns.h
new file mode 100644
index 000000000000..a392b52ea6ec
--- /dev/null
+++ b/include/uapi/linux/if_pppopns.h
@@ -0,0 +1,32 @@
+/* include/uapi/linux/if_pppopns.h
+ *
+ * Header for PPP on PPTP Network Server / PPPoPNS Socket (RFC 2637)
+ *
+ * Copyright (C) 2009 Google, Inc.
+ * Author: Chia-chi Yeh <chiachi@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _UAPI_LINUX_IF_PPPOPNS_H
+#define _UAPI_LINUX_IF_PPPOPNS_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+struct sockaddr_pppopns {
+ sa_family_t sa_family; /* AF_PPPOX */
+ unsigned int sa_protocol; /* PX_PROTO_OPNS */
+ int tcp_socket;
+ __u16 local;
+ __u16 remote;
+} __attribute__((packed));
+
+#endif /* _UAPI_LINUX_IF_PPPOPNS_H */
diff --git a/include/uapi/linux/if_pppox.h b/include/uapi/linux/if_pppox.h
index d37bbb17a007..6aad18a517d3 100644
--- a/include/uapi/linux/if_pppox.h
+++ b/include/uapi/linux/if_pppox.h
@@ -24,6 +24,8 @@
#include <linux/if.h>
#include <linux/if_ether.h>
#include <linux/if_pppol2tp.h>
+#include <linux/if_pppolac.h>
+#include <linux/if_pppopns.h>
#include <linux/in.h>
#include <linux/in6.h>
@@ -59,7 +61,9 @@ struct pptp_addr {
#define PX_PROTO_OE 0 /* Currently just PPPoE */
#define PX_PROTO_OL2TP 1 /* Now L2TP also */
#define PX_PROTO_PPTP 2
-#define PX_MAX_PROTO 3
+#define PX_PROTO_OLAC 3
+#define PX_PROTO_OPNS 4
+#define PX_MAX_PROTO 5
struct sockaddr_pppox {
__kernel_sa_family_t sa_family; /* address family, AF_PPPOX */
diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h
index e794f7bee22f..f561c0eb7d63 100644
--- a/include/uapi/linux/input.h
+++ b/include/uapi/linux/input.h
@@ -61,9 +61,14 @@ struct input_id {
* Note that input core does not clamp reported values to the
* [minimum, maximum] limits, such task is left to userspace.
*
- * Resolution for main axes (ABS_X, ABS_Y, ABS_Z) is reported in
- * units per millimeter (units/mm), resolution for rotational axes
- * (ABS_RX, ABS_RY, ABS_RZ) is reported in units per radian.
+ * The default resolution for main axes (ABS_X, ABS_Y, ABS_Z)
+ * is reported in units per millimeter (units/mm), resolution
+ * for rotational axes (ABS_RX, ABS_RY, ABS_RZ) is reported
+ * in units per radian.
+ * When INPUT_PROP_ACCELEROMETER is set the resolution changes.
+ * The main axes (ABS_X, ABS_Y, ABS_Z) are then reported in
+ * in units per g (units/g) and in units per degree per second
+ * (units/deg/s) for rotational axes (ABS_RX, ABS_RY, ABS_RZ).
*/
struct input_absinfo {
__s32 value;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 8c2772340c3f..c462f1dc175e 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -164,6 +164,7 @@ enum {
DEVCONF_ACCEPT_DAD,
DEVCONF_FORCE_TLLAO,
DEVCONF_NDISC_NOTIFY,
+ DEVCONF_ACCEPT_RA_RT_TABLE,
DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL,
DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL,
DEVCONF_SUPPRESS_FRAG_NDISC,
@@ -178,6 +179,12 @@ enum {
DEVCONF_DROP_UNSOLICITED_NA,
DEVCONF_KEEP_ADDR_ON_DOWN,
DEVCONF_RTR_SOLICIT_MAX_INTERVAL,
+ DEVCONF_SEG6_ENABLED,
+ DEVCONF_SEG6_REQUIRE_HMAC,
+ DEVCONF_ENHANCED_DAD,
+ DEVCONF_ADDR_GEN_MODE,
+ DEVCONF_DISABLE_POLICY,
+ DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN,
DEVCONF_MAX
};
diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h
index 574e22ec640d..33b826b9946e 100644
--- a/include/uapi/linux/kcov.h
+++ b/include/uapi/linux/kcov.h
@@ -7,4 +7,28 @@
#define KCOV_ENABLE _IO('c', 100)
#define KCOV_DISABLE _IO('c', 101)
+enum {
+ /*
+ * Tracing coverage collection mode.
+ * Covered PCs are collected in a per-task buffer.
+ * In new KCOV version the mode is chosen by calling
+ * ioctl(fd, KCOV_ENABLE, mode). In older versions the mode argument
+ * was supposed to be 0 in such a call. So, for reasons of backward
+ * compatibility, we have chosen the value KCOV_TRACE_PC to be 0.
+ */
+ KCOV_TRACE_PC = 0,
+ /* Collecting comparison operands mode. */
+ KCOV_TRACE_CMP = 1,
+};
+
+/*
+ * The format for the types of collected comparisons.
+ *
+ * Bit 0 shows whether one of the arguments is a compile-time constant.
+ * Bits 1 & 2 contain log2 of the argument size, up to 8 bytes.
+ */
+#define KCOV_CMP_CONST (1 << 0)
+#define KCOV_CMP_SIZE(n) ((n) << 1)
+#define KCOV_CMP_MASK KCOV_CMP_SIZE(3)
+
#endif /* _LINUX_KCOV_IOCTLS_H */
diff --git a/include/uapi/linux/keychord.h b/include/uapi/linux/keychord.h
new file mode 100644
index 000000000000..ea7cf4d27bbd
--- /dev/null
+++ b/include/uapi/linux/keychord.h
@@ -0,0 +1,52 @@
+/*
+ * Key chord input driver
+ *
+ * Copyright (C) 2008 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+*/
+
+#ifndef _UAPI_LINUX_KEYCHORD_H_
+#define _UAPI_LINUX_KEYCHORD_H_
+
+#include <linux/input.h>
+
+#define KEYCHORD_VERSION 1
+
+/*
+ * One or more input_keychord structs are written to /dev/keychord
+ * at once to specify the list of keychords to monitor.
+ * Reading /dev/keychord returns the id of a keychord when the
+ * keychord combination is pressed. A keychord is signalled when
+ * all of the keys in the keycode list are in the pressed state.
+ * The order in which the keys are pressed does not matter.
+ * The keychord will not be signalled if keys not in the keycode
+ * list are pressed.
+ * Keychords will not be signalled on key release events.
+ */
+struct input_keychord {
+ /* should be KEYCHORD_VERSION */
+ __u16 version;
+ /*
+ * client specified ID, returned from read()
+ * when this keychord is pressed.
+ */
+ __u16 id;
+
+ /* number of keycodes in this keychord */
+ __u16 count;
+
+ /* variable length array of keycodes */
+ __u16 keycodes[];
+};
+
+#endif /* _UAPI_LINUX_KEYCHORD_H_ */
diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h
index c8125ec1f4f2..23158dbe2424 100644
--- a/include/uapi/linux/loop.h
+++ b/include/uapi/linux/loop.h
@@ -88,6 +88,7 @@ struct loop_info64 {
#define LOOP_CHANGE_FD 0x4C06
#define LOOP_SET_CAPACITY 0x4C07
#define LOOP_SET_DIRECT_IO 0x4C08
+#define LOOP_SET_BLOCK_SIZE 0x4C09
/* /dev/loop-control interface */
#define LOOP_CTL_ADD 0x4C80
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 9bd559472c92..bd017699420e 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -53,6 +53,8 @@
#define REISER2FS_SUPER_MAGIC_STRING "ReIsEr2Fs"
#define REISER2FS_JR_SUPER_MAGIC_STRING "ReIsEr3Fs"
+#define SDCARDFS_SUPER_MAGIC 0x5dca2df5
+
#define SMB_SUPER_MAGIC 0x517B
#define CGROUP_SUPER_MAGIC 0x27e0eb
#define CGROUP2_SUPER_MAGIC 0x63677270
diff --git a/include/uapi/linux/netfilter/xt_IDLETIMER.h b/include/uapi/linux/netfilter/xt_IDLETIMER.h
index 208ae9387331..faaa28b3d061 100644
--- a/include/uapi/linux/netfilter/xt_IDLETIMER.h
+++ b/include/uapi/linux/netfilter/xt_IDLETIMER.h
@@ -4,6 +4,7 @@
* Header file for Xtables timer target module.
*
* Copyright (C) 2004, 2010 Nokia Corporation
+ *
* Written by Timo Teras <ext-timo.teras@nokia.com>
*
* Converted to x_tables and forward-ported to 2.6.34
@@ -32,12 +33,19 @@
#include <linux/types.h>
#define MAX_IDLETIMER_LABEL_SIZE 28
+#define NLMSG_MAX_SIZE 64
+
+#define NL_EVENT_TYPE_INACTIVE 0
+#define NL_EVENT_TYPE_ACTIVE 1
struct idletimer_tg_info {
__u32 timeout;
char label[MAX_IDLETIMER_LABEL_SIZE];
+ /* Use netlink messages for notification in addition to sysfs */
+ __u8 send_nl_msg;
+
/* for kernel module internal use only */
struct idletimer_tg *timer __attribute__((aligned(8)));
};
diff --git a/include/uapi/linux/netfilter/xt_bpf.h b/include/uapi/linux/netfilter/xt_bpf.h
index 1fad2c27ac32..da161b56c79e 100644
--- a/include/uapi/linux/netfilter/xt_bpf.h
+++ b/include/uapi/linux/netfilter/xt_bpf.h
@@ -2,9 +2,11 @@
#define _XT_BPF_H
#include <linux/filter.h>
+#include <linux/limits.h>
#include <linux/types.h>
#define XT_BPF_MAX_NUM_INSTR 64
+#define XT_BPF_PATH_MAX (XT_BPF_MAX_NUM_INSTR * sizeof(struct sock_filter))
struct bpf_prog;
@@ -16,4 +18,24 @@ struct xt_bpf_info {
struct bpf_prog *filter __attribute__((aligned(8)));
};
+enum xt_bpf_modes {
+ XT_BPF_MODE_BYTECODE,
+ XT_BPF_MODE_FD_PINNED,
+ XT_BPF_MODE_FD_ELF,
+};
+#define XT_BPF_MODE_PATH_PINNED XT_BPF_MODE_FD_PINNED
+
+struct xt_bpf_info_v1 {
+ __u16 mode;
+ __u16 bpf_program_num_elem;
+ __s32 fd;
+ union {
+ struct sock_filter bpf_program[XT_BPF_MAX_NUM_INSTR];
+ char path[XT_BPF_PATH_MAX];
+ };
+
+ /* only used in the kernel */
+ struct bpf_prog *filter __attribute__((aligned(8)));
+};
+
#endif /*_XT_BPF_H */
diff --git a/include/uapi/linux/netfilter/xt_socket.h b/include/uapi/linux/netfilter/xt_socket.h
index 87644f832494..7f00df6cd897 100644
--- a/include/uapi/linux/netfilter/xt_socket.h
+++ b/include/uapi/linux/netfilter/xt_socket.h
@@ -26,4 +26,11 @@ struct xt_socket_mtinfo3 {
| XT_SOCKET_NOWILDCARD \
| XT_SOCKET_RESTORESKMARK)
+struct sock *xt_socket_lookup_slow_v4(struct net *net,
+ const struct sk_buff *skb,
+ const struct net_device *indev);
+struct sock *xt_socket_lookup_slow_v6(struct net *net,
+ const struct sk_buff *skb,
+ const struct net_device *indev);
+
#endif /* _XT_SOCKET_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 64776b72e1eb..8b8a5e9934af 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -209,4 +209,7 @@ struct prctl_mm_map {
# define PR_SPEC_DISABLE (1UL << 2)
# define PR_SPEC_FORCE_DISABLE (1UL << 3)
+#define PR_SET_VMA 0x53564d41
+# define PR_SET_VMA_ANON_NAME 0
+
#endif /* _LINUX_PRCTL_H */
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 5a78be518101..e14377f2ec27 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -318,6 +318,7 @@ enum rtattr_type_t {
RTA_ENCAP,
RTA_EXPIRES,
RTA_PAD,
+ RTA_UID,
__RTA_MAX
};
diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h
index d2b12152e358..e13d48058b8d 100644
--- a/include/uapi/linux/sysctl.h
+++ b/include/uapi/linux/sysctl.h
@@ -568,6 +568,7 @@ enum {
NET_IPV6_PROXY_NDP=23,
NET_IPV6_ACCEPT_SOURCE_ROUTE=25,
NET_IPV6_ACCEPT_RA_FROM_LOCAL=26,
+ NET_IPV6_ACCEPT_RA_RT_INFO_MIN_PLEN=27,
__NET_IPV6_MAX
};
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 73ac0db487f8..84a3eb28265a 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -116,6 +116,7 @@ enum {
#define TCP_SAVE_SYN 27 /* Record SYN headers for new connections */
#define TCP_SAVED_SYN 28 /* Get SYN headers recorded for connection */
#define TCP_REPAIR_WINDOW 29 /* Get/set window parameters */
+#define TCP_FASTOPEN_CONNECT 30 /* Attempt FastOpen with connect */
struct tcp_repair_opt {
__u32 opt_code;
diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h
new file mode 100644
index 000000000000..4b9eb064d7e7
--- /dev/null
+++ b/include/uapi/linux/tee.h
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2015-2016, Linaro Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __TEE_H
+#define __TEE_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/*
+ * This file describes the API provided by a TEE driver to user space.
+ *
+ * Each TEE driver defines a TEE specific protocol which is used for the
+ * data passed back and forth using TEE_IOC_CMD.
+ */
+
+/* Helpers to make the ioctl defines */
+#define TEE_IOC_MAGIC 0xa4
+#define TEE_IOC_BASE 0
+
+/* Flags relating to shared memory */
+#define TEE_IOCTL_SHM_MAPPED 0x1 /* memory mapped in normal world */
+#define TEE_IOCTL_SHM_DMA_BUF 0x2 /* dma-buf handle on shared memory */
+
+#define TEE_MAX_ARG_SIZE 1024
+
+#define TEE_GEN_CAP_GP (1 << 0)/* GlobalPlatform compliant TEE */
+#define TEE_GEN_CAP_PRIVILEGED (1 << 1)/* Privileged device (for supplicant) */
+#define TEE_GEN_CAP_REG_MEM (1 << 2)/* Supports registering shared memory */
+
+/*
+ * TEE Implementation ID
+ */
+#define TEE_IMPL_ID_OPTEE 1
+
+/*
+ * OP-TEE specific capabilities
+ */
+#define TEE_OPTEE_CAP_TZ (1 << 0)
+
+/**
+ * struct tee_ioctl_version_data - TEE version
+ * @impl_id: [out] TEE implementation id
+ * @impl_caps: [out] Implementation specific capabilities
+ * @gen_caps: [out] Generic capabilities, defined by TEE_GEN_CAPS_* above
+ *
+ * Identifies the TEE implementation, @impl_id is one of TEE_IMPL_ID_* above.
+ * @impl_caps is implementation specific, for example TEE_OPTEE_CAP_*
+ * is valid when @impl_id == TEE_IMPL_ID_OPTEE.
+ */
+struct tee_ioctl_version_data {
+ __u32 impl_id;
+ __u32 impl_caps;
+ __u32 gen_caps;
+};
+
+/**
+ * TEE_IOC_VERSION - query version of TEE
+ *
+ * Takes a tee_ioctl_version_data struct and returns with the TEE version
+ * data filled in.
+ */
+#define TEE_IOC_VERSION _IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 0, \
+ struct tee_ioctl_version_data)
+
+/**
+ * struct tee_ioctl_shm_alloc_data - Shared memory allocate argument
+ * @size: [in/out] Size of shared memory to allocate
+ * @flags: [in/out] Flags to/from allocation.
+ * @id: [out] Identifier of the shared memory
+ *
+ * The flags field should currently be zero as input. Updated by the call
+ * with actual flags as defined by TEE_IOCTL_SHM_* above.
+ * This structure is used as argument for TEE_IOC_SHM_ALLOC below.
+ */
+struct tee_ioctl_shm_alloc_data {
+ __u64 size;
+ __u32 flags;
+ __s32 id;
+};
+
+/**
+ * TEE_IOC_SHM_ALLOC - allocate shared memory
+ *
+ * Allocates shared memory between the user space process and secure OS.
+ *
+ * Returns a file descriptor on success or < 0 on failure
+ *
+ * The returned file descriptor is used to map the shared memory into user
+ * space. The shared memory is freed when the descriptor is closed and the
+ * memory is unmapped.
+ */
+#define TEE_IOC_SHM_ALLOC _IOWR(TEE_IOC_MAGIC, TEE_IOC_BASE + 1, \
+ struct tee_ioctl_shm_alloc_data)
+
+/**
+ * struct tee_ioctl_buf_data - Variable sized buffer
+ * @buf_ptr: [in] A __user pointer to a buffer
+ * @buf_len: [in] Length of the buffer above
+ *
+ * Used as argument for TEE_IOC_OPEN_SESSION, TEE_IOC_INVOKE,
+ * TEE_IOC_SUPPL_RECV, and TEE_IOC_SUPPL_SEND below.
+ */
+struct tee_ioctl_buf_data {
+ __u64 buf_ptr;
+ __u64 buf_len;
+};
+
+/*
+ * Attributes for struct tee_ioctl_param, selects field in the union
+ */
+#define TEE_IOCTL_PARAM_ATTR_TYPE_NONE 0 /* parameter not used */
+
+/*
+ * These defines value parameters (struct tee_ioctl_param_value)
+ */
+#define TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INPUT 1
+#define TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT 2
+#define TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT 3 /* input and output */
+
+/*
+ * These defines shared memory reference parameters (struct
+ * tee_ioctl_param_memref)
+ */
+#define TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT 5
+#define TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT 6
+#define TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT 7 /* input and output */
+
+/*
+ * Mask for the type part of the attribute, leaves room for more types
+ */
+#define TEE_IOCTL_PARAM_ATTR_TYPE_MASK 0xff
+
+/* Meta parameter carrying extra information about the message. */
+#define TEE_IOCTL_PARAM_ATTR_META 0x100
+
+/* Mask of all known attr bits */
+#define TEE_IOCTL_PARAM_ATTR_MASK \
+ (TEE_IOCTL_PARAM_ATTR_TYPE_MASK | TEE_IOCTL_PARAM_ATTR_META)
+
+/*
+ * Matches TEEC_LOGIN_* in GP TEE Client API
+ * Are only defined for GP compliant TEEs
+ */
+#define TEE_IOCTL_LOGIN_PUBLIC 0
+#define TEE_IOCTL_LOGIN_USER 1
+#define TEE_IOCTL_LOGIN_GROUP 2
+#define TEE_IOCTL_LOGIN_APPLICATION 4
+#define TEE_IOCTL_LOGIN_USER_APPLICATION 5
+#define TEE_IOCTL_LOGIN_GROUP_APPLICATION 6
+
+/**
+ * struct tee_ioctl_param - parameter
+ * @attr: attributes
+ * @a: if a memref, offset into the shared memory object, else a value parameter
+ * @b: if a memref, size of the buffer, else a value parameter
+ * @c: if a memref, shared memory identifier, else a value parameter
+ *
+ * @attr & TEE_PARAM_ATTR_TYPE_MASK indicates if memref or value is used in
+ * the union. TEE_PARAM_ATTR_TYPE_VALUE_* indicates value and
+ * TEE_PARAM_ATTR_TYPE_MEMREF_* indicates memref. TEE_PARAM_ATTR_TYPE_NONE
+ * indicates that none of the members are used.
+ *
+ * Shared memory is allocated with TEE_IOC_SHM_ALLOC which returns an
+ * identifier representing the shared memory object. A memref can reference
+ * a part of a shared memory by specifying an offset (@a) and size (@b) of
+ * the object. To supply the entire shared memory object set the offset
+ * (@a) to 0 and size (@b) to the previously returned size of the object.
+ */
+struct tee_ioctl_param {
+ __u64 attr;
+ __u64 a;
+ __u64 b;
+ __u64 c;
+};
+
+#define TEE_IOCTL_UUID_LEN 16
+
+/**
+ * struct tee_ioctl_open_session_arg - Open session argument
+ * @uuid: [in] UUID of the Trusted Application
+ * @clnt_uuid: [in] UUID of client
+ * @clnt_login: [in] Login class of client, TEE_IOCTL_LOGIN_* above
+ * @cancel_id: [in] Cancellation id, a unique value to identify this request
+ * @session: [out] Session id
+ * @ret: [out] return value
+ * @ret_origin [out] origin of the return value
+ * @num_params [in] number of parameters following this struct
+ */
+struct tee_ioctl_open_session_arg {
+ __u8 uuid[TEE_IOCTL_UUID_LEN];
+ __u8 clnt_uuid[TEE_IOCTL_UUID_LEN];
+ __u32 clnt_login;
+ __u32 cancel_id;
+ __u32 session;
+ __u32 ret;
+ __u32 ret_origin;
+ __u32 num_params;
+ /* num_params tells the actual number of element in params */
+ struct tee_ioctl_param params[];
+};
+
+/**
+ * TEE_IOC_OPEN_SESSION - opens a session to a Trusted Application
+ *
+ * Takes a struct tee_ioctl_buf_data which contains a struct
+ * tee_ioctl_open_session_arg followed by any array of struct
+ * tee_ioctl_param
+ */
+#define TEE_IOC_OPEN_SESSION _IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 2, \
+ struct tee_ioctl_buf_data)
+
+/**
+ * struct tee_ioctl_invoke_func_arg - Invokes a function in a Trusted
+ * Application
+ * @func: [in] Trusted Application function, specific to the TA
+ * @session: [in] Session id
+ * @cancel_id: [in] Cancellation id, a unique value to identify this request
+ * @ret: [out] return value
+ * @ret_origin [out] origin of the return value
+ * @num_params [in] number of parameters following this struct
+ */
+struct tee_ioctl_invoke_arg {
+ __u32 func;
+ __u32 session;
+ __u32 cancel_id;
+ __u32 ret;
+ __u32 ret_origin;
+ __u32 num_params;
+ /* num_params tells the actual number of element in params */
+ struct tee_ioctl_param params[];
+};
+
+/**
+ * TEE_IOC_INVOKE - Invokes a function in a Trusted Application
+ *
+ * Takes a struct tee_ioctl_buf_data which contains a struct
+ * tee_invoke_func_arg followed by any array of struct tee_param
+ */
+#define TEE_IOC_INVOKE _IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 3, \
+ struct tee_ioctl_buf_data)
+
+/**
+ * struct tee_ioctl_cancel_arg - Cancels an open session or invoke ioctl
+ * @cancel_id: [in] Cancellation id, a unique value to identify this request
+ * @session: [in] Session id, if the session is opened, else set to 0
+ */
+struct tee_ioctl_cancel_arg {
+ __u32 cancel_id;
+ __u32 session;
+};
+
+/**
+ * TEE_IOC_CANCEL - Cancels an open session or invoke
+ */
+#define TEE_IOC_CANCEL _IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 4, \
+ struct tee_ioctl_cancel_arg)
+
+/**
+ * struct tee_ioctl_close_session_arg - Closes an open session
+ * @session: [in] Session id
+ */
+struct tee_ioctl_close_session_arg {
+ __u32 session;
+};
+
+/**
+ * TEE_IOC_CLOSE_SESSION - Closes a session
+ */
+#define TEE_IOC_CLOSE_SESSION _IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 5, \
+ struct tee_ioctl_close_session_arg)
+
+/**
+ * struct tee_iocl_supp_recv_arg - Receive a request for a supplicant function
+ * @func: [in] supplicant function
+ * @num_params [in/out] number of parameters following this struct
+ *
+ * @num_params is the number of params that tee-supplicant has room to
+ * receive when input, @num_params is the number of actual params
+ * tee-supplicant receives when output.
+ */
+struct tee_iocl_supp_recv_arg {
+ __u32 func;
+ __u32 num_params;
+ /* num_params tells the actual number of element in params */
+ struct tee_ioctl_param params[];
+};
+
+/**
+ * TEE_IOC_SUPPL_RECV - Receive a request for a supplicant function
+ *
+ * Takes a struct tee_ioctl_buf_data which contains a struct
+ * tee_iocl_supp_recv_arg followed by any array of struct tee_param
+ */
+#define TEE_IOC_SUPPL_RECV _IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 6, \
+ struct tee_ioctl_buf_data)
+
+/**
+ * struct tee_iocl_supp_send_arg - Send a response to a received request
+ * @ret: [out] return value
+ * @num_params [in] number of parameters following this struct
+ */
+struct tee_iocl_supp_send_arg {
+ __u32 ret;
+ __u32 num_params;
+ /* num_params tells the actual number of element in params */
+ struct tee_ioctl_param params[];
+};
+
+/**
+ * TEE_IOC_SUPPL_SEND - Receive a request for a supplicant function
+ *
+ * Takes a struct tee_ioctl_buf_data which contains a struct
+ * tee_iocl_supp_send_arg followed by any array of struct tee_param
+ */
+#define TEE_IOC_SUPPL_SEND _IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 7, \
+ struct tee_ioctl_buf_data)
+
+/**
+ * struct tee_ioctl_shm_register_data - Shared memory register argument
+ * @addr: [in] Start address of shared memory to register
+ * @length: [in/out] Length of shared memory to register
+ * @flags: [in/out] Flags to/from registration.
+ * @id: [out] Identifier of the shared memory
+ *
+ * The flags field should currently be zero as input. Updated by the call
+ * with actual flags as defined by TEE_IOCTL_SHM_* above.
+ * This structure is used as argument for TEE_IOC_SHM_REGISTER below.
+ */
+struct tee_ioctl_shm_register_data {
+ __u64 addr;
+ __u64 length;
+ __u32 flags;
+ __s32 id;
+};
+
+/**
+ * TEE_IOC_SHM_REGISTER - Register shared memory argument
+ *
+ * Registers shared memory between the user space process and secure OS.
+ *
+ * Returns a file descriptor on success or < 0 on failure
+ *
+ * The shared memory is unregisterred when the descriptor is closed.
+ */
+#define TEE_IOC_SHM_REGISTER _IOWR(TEE_IOC_MAGIC, TEE_IOC_BASE + 9, \
+ struct tee_ioctl_shm_register_data)
+/*
+ * Five syscalls are used when communicating with the TEE driver.
+ * open(): opens the device associated with the driver
+ * ioctl(): as described above operating on the file descriptor from open()
+ * close(): two cases
+ * - closes the device file descriptor
+ * - closes a file descriptor connected to allocated shared memory
+ * mmap(): maps shared memory into user space using information from struct
+ * tee_ioctl_shm_alloc_data
+ * munmap(): unmaps previously shared memory
+ */
+
+#endif /*__TEE_H*/
diff --git a/include/uapi/linux/usb/f_accessory.h b/include/uapi/linux/usb/f_accessory.h
new file mode 100644
index 000000000000..0baeb7d0d74c
--- /dev/null
+++ b/include/uapi/linux/usb/f_accessory.h
@@ -0,0 +1,146 @@
+/*
+ * Gadget Function Driver for Android USB accessories
+ *
+ * Copyright (C) 2011 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _UAPI_LINUX_USB_F_ACCESSORY_H
+#define _UAPI_LINUX_USB_F_ACCESSORY_H
+
+/* Use Google Vendor ID when in accessory mode */
+#define USB_ACCESSORY_VENDOR_ID 0x18D1
+
+
+/* Product ID to use when in accessory mode */
+#define USB_ACCESSORY_PRODUCT_ID 0x2D00
+
+/* Product ID to use when in accessory mode and adb is enabled */
+#define USB_ACCESSORY_ADB_PRODUCT_ID 0x2D01
+
+/* Indexes for strings sent by the host via ACCESSORY_SEND_STRING */
+#define ACCESSORY_STRING_MANUFACTURER 0
+#define ACCESSORY_STRING_MODEL 1
+#define ACCESSORY_STRING_DESCRIPTION 2
+#define ACCESSORY_STRING_VERSION 3
+#define ACCESSORY_STRING_URI 4
+#define ACCESSORY_STRING_SERIAL 5
+
+/* Control request for retrieving device's protocol version
+ *
+ * requestType: USB_DIR_IN | USB_TYPE_VENDOR
+ * request: ACCESSORY_GET_PROTOCOL
+ * value: 0
+ * index: 0
+ * data version number (16 bits little endian)
+ * 1 for original accessory support
+ * 2 adds HID and device to host audio support
+ */
+#define ACCESSORY_GET_PROTOCOL 51
+
+/* Control request for host to send a string to the device
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_SEND_STRING
+ * value: 0
+ * index: string ID
+ * data zero terminated UTF8 string
+ *
+ * The device can later retrieve these strings via the
+ * ACCESSORY_GET_STRING_* ioctls
+ */
+#define ACCESSORY_SEND_STRING 52
+
+/* Control request for starting device in accessory mode.
+ * The host sends this after setting all its strings to the device.
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_START
+ * value: 0
+ * index: 0
+ * data none
+ */
+#define ACCESSORY_START 53
+
+/* Control request for registering a HID device.
+ * Upon registering, a unique ID is sent by the accessory in the
+ * value parameter. This ID will be used for future commands for
+ * the device
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_REGISTER_HID_DEVICE
+ * value: Accessory assigned ID for the HID device
+ * index: total length of the HID report descriptor
+ * data none
+ */
+#define ACCESSORY_REGISTER_HID 54
+
+/* Control request for unregistering a HID device.
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_REGISTER_HID
+ * value: Accessory assigned ID for the HID device
+ * index: 0
+ * data none
+ */
+#define ACCESSORY_UNREGISTER_HID 55
+
+/* Control request for sending the HID report descriptor.
+ * If the HID descriptor is longer than the endpoint zero max packet size,
+ * the descriptor will be sent in multiple ACCESSORY_SET_HID_REPORT_DESC
+ * commands. The data for the descriptor must be sent sequentially
+ * if multiple packets are needed.
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_SET_HID_REPORT_DESC
+ * value: Accessory assigned ID for the HID device
+ * index: offset of data in descriptor
+ * (needed when HID descriptor is too big for one packet)
+ * data the HID report descriptor
+ */
+#define ACCESSORY_SET_HID_REPORT_DESC 56
+
+/* Control request for sending HID events.
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_SEND_HID_EVENT
+ * value: Accessory assigned ID for the HID device
+ * index: 0
+ * data the HID report for the event
+ */
+#define ACCESSORY_SEND_HID_EVENT 57
+
+/* Control request for setting the audio mode.
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_SET_AUDIO_MODE
+ * value: 0 - no audio
+ * 1 - device to host, 44100 16-bit stereo PCM
+ * index: 0
+ * data none
+ */
+#define ACCESSORY_SET_AUDIO_MODE 58
+
+/* ioctls for retrieving strings set by the host */
+#define ACCESSORY_GET_STRING_MANUFACTURER _IOW('M', 1, char[256])
+#define ACCESSORY_GET_STRING_MODEL _IOW('M', 2, char[256])
+#define ACCESSORY_GET_STRING_DESCRIPTION _IOW('M', 3, char[256])
+#define ACCESSORY_GET_STRING_VERSION _IOW('M', 4, char[256])
+#define ACCESSORY_GET_STRING_URI _IOW('M', 5, char[256])
+#define ACCESSORY_GET_STRING_SERIAL _IOW('M', 6, char[256])
+/* returns 1 if there is a start request pending */
+#define ACCESSORY_IS_START_REQUESTED _IO('M', 7)
+/* returns audio mode (set via the ACCESSORY_SET_AUDIO_MODE control request) */
+#define ACCESSORY_GET_AUDIO_MODE _IO('M', 8)
+
+#endif /* _UAPI_LINUX_USB_F_ACCESSORY_H */
diff --git a/include/uapi/linux/usb/f_mtp.h b/include/uapi/linux/usb/f_mtp.h
new file mode 100644
index 000000000000..503291855abd
--- /dev/null
+++ b/include/uapi/linux/usb/f_mtp.h
@@ -0,0 +1,61 @@
+/*
+ * Gadget Function Driver for MTP
+ *
+ * Copyright (C) 2010 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _UAPI_LINUX_USB_F_MTP_H
+#define _UAPI_LINUX_USB_F_MTP_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+struct mtp_file_range {
+ /* file descriptor for file to transfer */
+ int fd;
+ /* offset in file for start of transfer */
+ loff_t offset;
+ /* number of bytes to transfer */
+ int64_t length;
+ /* MTP command ID for data header,
+ * used only for MTP_SEND_FILE_WITH_HEADER
+ */
+ uint16_t command;
+ /* MTP transaction ID for data header,
+ * used only for MTP_SEND_FILE_WITH_HEADER
+ */
+ uint32_t transaction_id;
+};
+
+struct mtp_event {
+ /* size of the event */
+ size_t length;
+ /* event data to send */
+ void *data;
+};
+
+/* Sends the specified file range to the host */
+#define MTP_SEND_FILE _IOW('M', 0, struct mtp_file_range)
+/* Receives data from the host and writes it to a file.
+ * The file is created if it does not exist.
+ */
+#define MTP_RECEIVE_FILE _IOW('M', 1, struct mtp_file_range)
+/* Sends an event to the host via the interrupt endpoint */
+#define MTP_SEND_EVENT _IOW('M', 3, struct mtp_event)
+/* Sends the specified file range to the host,
+ * with a 12 byte MTP data packet header at the beginning.
+ */
+#define MTP_SEND_FILE_WITH_HEADER _IOW('M', 4, struct mtp_file_range)
+
+#endif /* _UAPI_LINUX_USB_F_MTP_H */
diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index 1fc62b239f1b..4215bbdb1f87 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -303,8 +303,13 @@ enum xfrm_attr_type_t {
XFRMA_PROTO, /* __u8 */
XFRMA_ADDRESS_FILTER, /* struct xfrm_address_filter */
XFRMA_PAD,
+ XFRMA_OFFLOAD_DEV, /* struct xfrm_state_offload */
+ XFRMA_SET_MARK, /* __u32 */
+ XFRMA_SET_MARK_MASK, /* __u32 */
+ XFRMA_IF_ID, /* __u32 */
__XFRMA_MAX
+#define XFRMA_OUTPUT_MARK XFRMA_SET_MARK /* Compatibility */
#define XFRMA_MAX (__XFRMA_MAX - 1)
};
diff --git a/init/Kconfig b/init/Kconfig
index b331feeabda4..4ca610d9e692 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -400,6 +400,15 @@ config IRQ_TIME_ACCOUNTING
If in doubt, say N here.
+config SCHED_WALT
+ bool "Support window based load tracking"
+ depends on SMP
+ help
+ This feature will allow the scheduler to maintain a tunable window
+ based set of metrics for tasks and runqueues. These metrics can be
+ used to guide task placement as well as task frequency requirements
+ for cpufreq governors.
+
config BSD_PROCESS_ACCT
bool "BSD Process Accounting"
depends on MULTIUSER
@@ -971,6 +980,82 @@ menuconfig CGROUPS
if CGROUPS
+config CGROUP_DEBUG
+ bool "Example debug cgroup subsystem"
+ default n
+ help
+ This option enables a simple cgroup subsystem that
+ exports useful debugging information about the cgroups
+ framework.
+
+ Say N if unsure.
+
+config CGROUP_FREEZER
+ bool "Freezer cgroup subsystem"
+ help
+ Provides a way to freeze and unfreeze all tasks in a
+ cgroup.
+
+config CGROUP_PIDS
+ bool "PIDs cgroup subsystem"
+ help
+ Provides enforcement of process number limits in the scope of a
+ cgroup. Any attempt to fork more processes than is allowed in the
+ cgroup will fail. PIDs are fundamentally a global resource because it
+ is fairly trivial to reach PID exhaustion before you reach even a
+ conservative kmemcg limit. As a result, it is possible to grind a
+ system to halt without being limited by other cgroup policies. The
+ PIDs cgroup subsystem is designed to stop this from happening.
+
+ It should be noted that organisational operations (such as attaching
+ to a cgroup hierarchy will *not* be blocked by the PIDs subsystem),
+ since the PIDs limit only affects a process's ability to fork, not to
+ attach to a cgroup.
+
+config CGROUP_DEVICE
+ bool "Device controller for cgroups"
+ help
+ Provides a cgroup implementing whitelists for devices which
+ a process in the cgroup can mknod or open.
+
+config CPUSETS
+ bool "Cpuset support"
+ help
+ This option will let you create and manage CPUSETs which
+ allow dynamically partitioning a system into sets of CPUs and
+ Memory Nodes and assigning tasks to run only within those sets.
+ This is primarily useful on large SMP or NUMA systems.
+
+ Say N if unsure.
+
+config PROC_PID_CPUSET
+ bool "Include legacy /proc/<pid>/cpuset file"
+ depends on CPUSETS
+ default y
+
+config CGROUP_CPUACCT
+ bool "Simple CPU accounting cgroup subsystem"
+ help
+ Provides a simple Resource Controller for monitoring the
+ total CPU consumed by the tasks in a cgroup.
+
+config CGROUP_SCHEDTUNE
+ bool "CFS tasks boosting cgroup subsystem (EXPERIMENTAL)"
+ depends on SCHED_TUNE
+ help
+ This option provides the "schedtune" controller which improves the
+ flexibility of the task boosting mechanism by introducing the support
+ to define "per task" boost values.
+
+ This new controller:
+ 1. allows only a two layers hierarchy, where the root defines the
+ system-wide boost value and its direct childrens define each one a
+ different "class of tasks" to be boosted with a different value
+ 2. supports up to 16 different task classes, each one which could be
+ configured with a different boost value
+
+ Say N if unsure.
+
config PAGE_COUNTER
bool
@@ -1154,6 +1239,19 @@ config CGROUP_PERF
Say N if unsure.
+config CGROUP_BPF
+ bool "Support for eBPF programs attached to cgroups"
+ depends on BPF_SYSCALL
+ select SOCK_CGROUP_DATA
+ help
+ Allow attaching eBPF programs to a cgroup using the bpf(2)
+ syscall command BPF_PROG_ATTACH.
+
+ In which context these programs are accessed depends on the type
+ of attachment. For instance, programs that are attached using
+ BPF_CGROUP_INET_INGRESS will be executed on the ingress path of
+ inet sockets.
+
config CGROUP_DEBUG
bool "Example controller"
default n
@@ -1163,6 +1261,10 @@ config CGROUP_DEBUG
Say N.
+config SOCK_CGROUP_DATA
+ bool
+ default n
+
endif # CGROUPS
config CHECKPOINT_RESTORE
@@ -1248,6 +1350,43 @@ config SCHED_AUTOGROUP
desktop applications. Task group autogeneration is currently based
upon task session.
+config SCHED_TUNE
+ bool "Boosting for CFS tasks (EXPERIMENTAL)"
+ depends on SMP
+ help
+ This option enables the system-wide support for task boosting.
+ When this support is enabled a new sysctl interface is exposed to
+ userspace via:
+ /proc/sys/kernel/sched_cfs_boost
+ which allows to set a system-wide boost value in range [0..100].
+
+ The currently boosting strategy is implemented in such a way that:
+ - a 0% boost value requires to operate in "standard" mode by
+ scheduling all tasks at the minimum capacities required by their
+ workload demand
+ - a 100% boost value requires to push at maximum the task
+ performances, "regardless" of the incurred energy consumption
+
+ A boost value in between these two boundaries is used to bias the
+ power/performance trade-off, the higher the boost value the more the
+ scheduler is biased toward performance boosting instead of energy
+ efficiency.
+
+ Since this support exposes a single system-wide knob, the specified
+ boost value is applied to all (CFS) tasks in the system.
+
+ If unsure, say N.
+
+config DEFAULT_USE_ENERGY_AWARE
+ bool "Default to enabling the Energy Aware Scheduler feature"
+ default n
+ help
+ This option defaults the ENERGY_AWARE scheduling feature to true,
+ as without SCHED_DEBUG set this feature can't be enabled or disabled
+ via sysctl.
+
+ Say N if unsure.
+
config SYSFS_DEPRECATED
bool "Enable deprecated sysfs features to support old userspace tools"
depends on SYSFS
@@ -1759,6 +1898,20 @@ config SLUB_DEBUG
SLUB sysfs support. /sys/slab will not exist and there will be
no support for cache validation etc.
+config SLUB_MEMCG_SYSFS_ON
+ default n
+ bool "Enable memcg SLUB sysfs support by default" if EXPERT
+ depends on SLUB && SYSFS && MEMCG
+ help
+ SLUB creates a directory under /sys/kernel/slab for each
+ allocation cache to host info and debug files. If memory
+ cgroup is enabled, each cache can have per memory cgroup
+ caches. SLUB can create the same sysfs directories for these
+ caches under /sys/kernel/slab/CACHE/cgroup but it can lead
+ to a very high number of debug files being created. This is
+ controlled by slub_memcg_sysfs boot parameter and this
+ config option determines the parameter's default value.
+
config COMPAT_BRK
bool "Disable heap randomization"
default y
@@ -2106,7 +2259,7 @@ endif # MODULES
config MODULES_TREE_LOOKUP
def_bool y
- depends on PERF_EVENTS || TRACING
+ depends on PERF_EVENTS || TRACING || CFI_CLANG
config INIT_ALL_POSSIBLE
bool
diff --git a/init/Makefile b/init/Makefile
index c4fb45525d08..d210b235c5d7 100644
--- a/init/Makefile
+++ b/init/Makefile
@@ -5,11 +5,8 @@
ccflags-y := -fno-function-sections -fno-data-sections
obj-y := main.o version.o mounts.o
-ifneq ($(CONFIG_BLK_DEV_INITRD),y)
obj-y += noinitramfs.o
-else
obj-$(CONFIG_BLK_DEV_INITRD) += initramfs.o
-endif
obj-$(CONFIG_GENERIC_CALIBRATE_DELAY) += calibrate.o
ifneq ($(CONFIG_ARCH_INIT_TASK),y)
@@ -20,6 +17,7 @@ mounts-y := do_mounts.o
mounts-$(CONFIG_BLK_DEV_RAM) += do_mounts_rd.o
mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o
mounts-$(CONFIG_BLK_DEV_MD) += do_mounts_md.o
+mounts-$(CONFIG_BLK_DEV_DM) += do_mounts_dm.o
# dependencies on generated files need to be listed explicitly
$(obj)/version.o: include/generated/compile.h
diff --git a/init/do_mounts.c b/init/do_mounts.c
index dea5de95c2dd..1902a1c80831 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -566,6 +566,7 @@ void __init prepare_namespace(void)
wait_for_device_probe();
md_run_setup();
+ dm_run_setup();
if (saved_root_name[0]) {
root_device_name = saved_root_name;
diff --git a/init/do_mounts.h b/init/do_mounts.h
index 067af1d9e8b6..ecb275782c03 100644
--- a/init/do_mounts.h
+++ b/init/do_mounts.h
@@ -74,3 +74,13 @@ void md_run_setup(void);
static inline void md_run_setup(void) {}
#endif
+
+#ifdef CONFIG_BLK_DEV_DM
+
+void dm_run_setup(void);
+
+#else
+
+static inline void dm_run_setup(void) {}
+
+#endif
diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c
new file mode 100644
index 000000000000..af84b01ccfbc
--- /dev/null
+++ b/init/do_mounts_dm.c
@@ -0,0 +1,470 @@
+/* do_mounts_dm.c
+ * Copyright (C) 2010 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * All Rights Reserved.
+ * Based on do_mounts_md.c
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/async.h>
+#include <linux/ctype.h>
+#include <linux/device-mapper.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+
+#include "do_mounts.h"
+
+#define DM_MAX_DEVICES 256
+#define DM_MAX_TARGETS 256
+#define DM_MAX_NAME 32
+#define DM_MAX_UUID 129
+#define DM_NO_UUID "none"
+
+#define DM_MSG_PREFIX "init"
+
+/* Separators used for parsing the dm= argument. */
+#define DM_FIELD_SEP " "
+#define DM_LINE_SEP ","
+#define DM_ANY_SEP DM_FIELD_SEP DM_LINE_SEP
+
+/*
+ * When the device-mapper and any targets are compiled into the kernel
+ * (not a module), one or more device-mappers may be created and used
+ * as the root device at boot time with the parameters given with the
+ * boot line dm=...
+ *
+ * Multiple device-mappers can be stacked specifing the number of
+ * devices. A device can have multiple targets if the the number of
+ * targets is specified.
+ *
+ * TODO(taysom:defect 32847)
+ * In the future, the <num> field will be mandatory.
+ *
+ * <device> ::= [<num>] <device-mapper>+
+ * <device-mapper> ::= <head> "," <target>+
+ * <head> ::= <name> <uuid> <mode> [<num>]
+ * <target> ::= <start> <length> <type> <options> ","
+ * <mode> ::= "ro" | "rw"
+ * <uuid> ::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | "none"
+ * <type> ::= "verity" | "bootcache" | ...
+ *
+ * Example:
+ * 2 vboot none ro 1,
+ * 0 1768000 bootcache
+ * device=aa55b119-2a47-8c45-946a-5ac57765011f+1
+ * signature=76e9be054b15884a9fa85973e9cb274c93afadb6
+ * cache_start=1768000 max_blocks=100000 size_limit=23 max_trace=20000,
+ * vroot none ro 1,
+ * 0 1740800 verity payload=254:0 hashtree=254:0 hashstart=1740800 alg=sha1
+ * root_hexdigest=76e9be054b15884a9fa85973e9cb274c93afadb6
+ * salt=5b3549d54d6c7a3837b9b81ed72e49463a64c03680c47835bef94d768e5646fe
+ *
+ * Notes:
+ * 1. uuid is a label for the device and we set it to "none".
+ * 2. The <num> field will be optional initially and assumed to be 1.
+ * Once all the scripts that set these fields have been set, it will
+ * be made mandatory.
+ */
+
+struct dm_setup_target {
+ sector_t begin;
+ sector_t length;
+ char *type;
+ char *params;
+ /* simple singly linked list */
+ struct dm_setup_target *next;
+};
+
+struct dm_device {
+ int minor;
+ int ro;
+ char name[DM_MAX_NAME];
+ char uuid[DM_MAX_UUID];
+ unsigned long num_targets;
+ struct dm_setup_target *target;
+ int target_count;
+ struct dm_device *next;
+};
+
+struct dm_option {
+ char *start;
+ char *next;
+ size_t len;
+ char delim;
+};
+
+static struct {
+ unsigned long num_devices;
+ char *str;
+} dm_setup_args __initdata;
+
+static __initdata int dm_early_setup;
+
+static int __init get_dm_option(struct dm_option *opt, const char *accept)
+{
+ char *str = opt->next;
+ char *endp;
+
+ if (!str)
+ return 0;
+
+ str = skip_spaces(str);
+ opt->start = str;
+ endp = strpbrk(str, accept);
+ if (!endp) { /* act like strchrnul */
+ opt->len = strlen(str);
+ endp = str + opt->len;
+ } else {
+ opt->len = endp - str;
+ }
+ opt->delim = *endp;
+ if (*endp == 0) {
+ /* Don't advance past the nul. */
+ opt->next = endp;
+ } else {
+ opt->next = endp + 1;
+ }
+ return opt->len != 0;
+}
+
+static int __init dm_setup_cleanup(struct dm_device *devices)
+{
+ struct dm_device *dev = devices;
+
+ while (dev) {
+ struct dm_device *old_dev = dev;
+ struct dm_setup_target *target = dev->target;
+ while (target) {
+ struct dm_setup_target *old_target = target;
+ kfree(target->type);
+ kfree(target->params);
+ target = target->next;
+ kfree(old_target);
+ dev->target_count--;
+ }
+ BUG_ON(dev->target_count);
+ dev = dev->next;
+ kfree(old_dev);
+ }
+ return 0;
+}
+
+static char * __init dm_parse_device(struct dm_device *dev, char *str)
+{
+ struct dm_option opt;
+ size_t len;
+
+ /* Grab the logical name of the device to be exported to udev */
+ opt.next = str;
+ if (!get_dm_option(&opt, DM_FIELD_SEP)) {
+ DMERR("failed to parse device name");
+ goto parse_fail;
+ }
+ len = min(opt.len + 1, sizeof(dev->name));
+ strlcpy(dev->name, opt.start, len); /* includes nul */
+
+ /* Grab the UUID value or "none" */
+ if (!get_dm_option(&opt, DM_FIELD_SEP)) {
+ DMERR("failed to parse device uuid");
+ goto parse_fail;
+ }
+ len = min(opt.len + 1, sizeof(dev->uuid));
+ strlcpy(dev->uuid, opt.start, len);
+
+ /* Determine if the table/device will be read only or read-write */
+ get_dm_option(&opt, DM_ANY_SEP);
+ if (!strncmp("ro", opt.start, opt.len)) {
+ dev->ro = 1;
+ } else if (!strncmp("rw", opt.start, opt.len)) {
+ dev->ro = 0;
+ } else {
+ DMERR("failed to parse table mode");
+ goto parse_fail;
+ }
+
+ /* Optional number field */
+ /* XXX: The <num> field will be mandatory in the next round */
+ if (opt.delim == DM_FIELD_SEP[0]) {
+ if (!get_dm_option(&opt, DM_LINE_SEP))
+ return NULL;
+ dev->num_targets = simple_strtoul(opt.start, NULL, 10);
+ } else {
+ dev->num_targets = 1;
+ }
+ if (dev->num_targets > DM_MAX_TARGETS) {
+ DMERR("too many targets %lu > %d",
+ dev->num_targets, DM_MAX_TARGETS);
+ }
+ return opt.next;
+
+parse_fail:
+ return NULL;
+}
+
+static char * __init dm_parse_targets(struct dm_device *dev, char *str)
+{
+ struct dm_option opt;
+ struct dm_setup_target **target = &dev->target;
+ unsigned long num_targets = dev->num_targets;
+ unsigned long i;
+
+ /* Targets are defined as per the table format but with a
+ * comma as a newline separator. */
+ opt.next = str;
+ for (i = 0; i < num_targets; i++) {
+ *target = kzalloc(sizeof(struct dm_setup_target), GFP_KERNEL);
+ if (!*target) {
+ DMERR("failed to allocate memory for target %s<%ld>",
+ dev->name, i);
+ goto parse_fail;
+ }
+ dev->target_count++;
+
+ if (!get_dm_option(&opt, DM_FIELD_SEP)) {
+ DMERR("failed to parse starting sector"
+ " for target %s<%ld>", dev->name, i);
+ goto parse_fail;
+ }
+ (*target)->begin = simple_strtoull(opt.start, NULL, 10);
+
+ if (!get_dm_option(&opt, DM_FIELD_SEP)) {
+ DMERR("failed to parse length for target %s<%ld>",
+ dev->name, i);
+ goto parse_fail;
+ }
+ (*target)->length = simple_strtoull(opt.start, NULL, 10);
+
+ if (get_dm_option(&opt, DM_FIELD_SEP))
+ (*target)->type = kstrndup(opt.start, opt.len,
+ GFP_KERNEL);
+ if (!((*target)->type)) {
+ DMERR("failed to parse type for target %s<%ld>",
+ dev->name, i);
+ goto parse_fail;
+ }
+ if (get_dm_option(&opt, DM_LINE_SEP))
+ (*target)->params = kstrndup(opt.start, opt.len,
+ GFP_KERNEL);
+ if (!((*target)->params)) {
+ DMERR("failed to parse params for target %s<%ld>",
+ dev->name, i);
+ goto parse_fail;
+ }
+ target = &((*target)->next);
+ }
+ DMDEBUG("parsed %d targets", dev->target_count);
+
+ return opt.next;
+
+parse_fail:
+ return NULL;
+}
+
+static struct dm_device * __init dm_parse_args(void)
+{
+ struct dm_device *devices = NULL;
+ struct dm_device **tail = &devices;
+ struct dm_device *dev;
+ char *str = dm_setup_args.str;
+ unsigned long num_devices = dm_setup_args.num_devices;
+ unsigned long i;
+
+ if (!str)
+ return NULL;
+ for (i = 0; i < num_devices; i++) {
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev) {
+ DMERR("failed to allocated memory for dev");
+ goto error;
+ }
+ *tail = dev;
+ tail = &dev->next;
+ /*
+ * devices are given minor numbers 0 - n-1
+ * in the order they are found in the arg
+ * string.
+ */
+ dev->minor = i;
+ str = dm_parse_device(dev, str);
+ if (!str) /* NULL indicates error in parsing, bail */
+ goto error;
+
+ str = dm_parse_targets(dev, str);
+ if (!str)
+ goto error;
+ }
+ return devices;
+error:
+ dm_setup_cleanup(devices);
+ return NULL;
+}
+
+/*
+ * Parse the command-line parameters given our kernel, but do not
+ * actually try to invoke the DM device now; that is handled by
+ * dm_setup_drives after the low-level disk drivers have initialised.
+ * dm format is described at the top of the file.
+ *
+ * Because dm minor numbers are assigned in assending order starting with 0,
+ * You can assume the first device is /dev/dm-0, the next device is /dev/dm-1,
+ * and so forth.
+ */
+static int __init dm_setup(char *str)
+{
+ struct dm_option opt;
+ unsigned long num_devices;
+
+ if (!str) {
+ DMDEBUG("str is NULL");
+ goto parse_fail;
+ }
+ opt.next = str;
+ if (!get_dm_option(&opt, DM_FIELD_SEP))
+ goto parse_fail;
+ if (isdigit(opt.start[0])) { /* XXX: Optional number field */
+ num_devices = simple_strtoul(opt.start, NULL, 10);
+ str = opt.next;
+ } else {
+ num_devices = 1;
+ /* Don't advance str */
+ }
+ if (num_devices > DM_MAX_DEVICES) {
+ DMDEBUG("too many devices %lu > %d",
+ num_devices, DM_MAX_DEVICES);
+ }
+ dm_setup_args.str = str;
+ dm_setup_args.num_devices = num_devices;
+ DMINFO("will configure %lu devices", num_devices);
+ dm_early_setup = 1;
+ return 1;
+
+parse_fail:
+ DMWARN("Invalid arguments supplied to dm=.");
+ return 0;
+}
+
+static void __init dm_setup_drives(void)
+{
+ struct mapped_device *md = NULL;
+ struct dm_table *table = NULL;
+ struct dm_setup_target *target;
+ struct dm_device *dev;
+ char *uuid;
+ fmode_t fmode = FMODE_READ;
+ struct dm_device *devices;
+
+ devices = dm_parse_args();
+
+ for (dev = devices; dev; dev = dev->next) {
+ if (dm_create(dev->minor, &md)) {
+ DMDEBUG("failed to create the device");
+ goto dm_create_fail;
+ }
+ DMDEBUG("created device '%s'", dm_device_name(md));
+
+ /*
+ * In addition to flagging the table below, the disk must be
+ * set explicitly ro/rw.
+ */
+ set_disk_ro(dm_disk(md), dev->ro);
+
+ if (!dev->ro)
+ fmode |= FMODE_WRITE;
+ if (dm_table_create(&table, fmode, dev->target_count, md)) {
+ DMDEBUG("failed to create the table");
+ goto dm_table_create_fail;
+ }
+
+ dm_lock_md_type(md);
+
+ for (target = dev->target; target; target = target->next) {
+ DMINFO("adding target '%llu %llu %s %s'",
+ (unsigned long long) target->begin,
+ (unsigned long long) target->length,
+ target->type, target->params);
+ if (dm_table_add_target(table, target->type,
+ target->begin,
+ target->length,
+ target->params)) {
+ DMDEBUG("failed to add the target"
+ " to the table");
+ goto add_target_fail;
+ }
+ }
+ if (dm_table_complete(table)) {
+ DMDEBUG("failed to complete the table");
+ goto table_complete_fail;
+ }
+
+ /* Suspend the device so that we can bind it to the table. */
+ if (dm_suspend(md, 0)) {
+ DMDEBUG("failed to suspend the device pre-bind");
+ goto suspend_fail;
+ }
+
+ /* Initial table load: acquire type of table. */
+ dm_set_md_type(md, dm_table_get_type(table));
+
+ /* Setup md->queue to reflect md's type. */
+ if (dm_setup_md_queue(md, table)) {
+ DMWARN("unable to set up device queue for new table.");
+ goto setup_md_queue_fail;
+ }
+
+ /*
+ * Bind the table to the device. This is the only way
+ * to associate md->map with the table and set the disk
+ * capacity directly.
+ */
+ if (dm_swap_table(md, table)) { /* should return NULL. */
+ DMDEBUG("failed to bind the device to the table");
+ goto table_bind_fail;
+ }
+
+ /* Finally, resume and the device should be ready. */
+ if (dm_resume(md)) {
+ DMDEBUG("failed to resume the device");
+ goto resume_fail;
+ }
+
+ /* Export the dm device via the ioctl interface */
+ if (!strcmp(DM_NO_UUID, dev->uuid))
+ uuid = NULL;
+ if (dm_ioctl_export(md, dev->name, uuid)) {
+ DMDEBUG("failed to export device with given"
+ " name and uuid");
+ goto export_fail;
+ }
+
+ dm_unlock_md_type(md);
+
+ DMINFO("dm-%d is ready", dev->minor);
+ }
+ dm_setup_cleanup(devices);
+ return;
+
+export_fail:
+resume_fail:
+table_bind_fail:
+setup_md_queue_fail:
+suspend_fail:
+table_complete_fail:
+add_target_fail:
+ dm_unlock_md_type(md);
+dm_table_create_fail:
+ dm_put(md);
+dm_create_fail:
+ DMWARN("starting dm-%d (%s) failed",
+ dev->minor, dev->name);
+ dm_setup_cleanup(devices);
+}
+
+__setup("dm=", dm_setup);
+
+void __init dm_run_setup(void)
+{
+ if (!dm_early_setup)
+ return;
+ DMINFO("attempting early device configuration.");
+ dm_setup_drives();
+}
diff --git a/init/initramfs.c b/init/initramfs.c
index 981f286c1d16..bf3af10c500a 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -18,6 +18,7 @@
#include <linux/dirent.h>
#include <linux/syscalls.h>
#include <linux/utime.h>
+#include <linux/initramfs.h>
#include <linux/file.h>
static ssize_t __init xwrite(int fd, const char *p, size_t count)
@@ -606,9 +607,28 @@ static void __init clean_rootfs(void)
}
#endif
+static int __initdata do_skip_initramfs;
+
+static int __init skip_initramfs_param(char *str)
+{
+ if (*str)
+ return 0;
+ do_skip_initramfs = 1;
+ return 1;
+}
+__setup("skip_initramfs", skip_initramfs_param);
+
static int __init populate_rootfs(void)
{
- char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
+ char *err;
+
+ if (do_skip_initramfs) {
+ if (initrd_start)
+ free_initrd();
+ return default_rootfs();
+ }
+
+ err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
if (err)
panic("%s", err); /* Failed to decompress INTERNAL initramfs */
if (initrd_start) {
diff --git a/init/noinitramfs.c b/init/noinitramfs.c
index 267739d85179..bcc8bcb053ee 100644
--- a/init/noinitramfs.c
+++ b/init/noinitramfs.c
@@ -21,11 +21,16 @@
#include <linux/stat.h>
#include <linux/kdev_t.h>
#include <linux/syscalls.h>
+#include <linux/kconfig.h>
+#include <linux/initramfs.h>
/*
* Create a simple rootfs that is similar to the default initramfs
*/
-static int __init default_rootfs(void)
+#if !IS_BUILTIN(CONFIG_BLK_DEV_INITRD)
+static
+#endif
+int __init default_rootfs(void)
{
int err;
@@ -49,4 +54,6 @@ out:
printk(KERN_WARNING "Failed to create a rootfs\n");
return err;
}
+#if !IS_BUILTIN(CONFIG_BLK_DEV_INITRD)
rootfs_initcall(default_rootfs);
+#endif
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 28a142f1be36..02fb438fd3af 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -745,7 +745,7 @@ static struct file *do_create(struct ipc_namespace *ipc_ns, struct inode *dir,
}
mode &= ~current_umask();
- ret = vfs_create(dir, path->dentry, mode, true);
+ ret = vfs_create2(path->mnt, dir, path->dentry, mode, true);
path->dentry->d_fsdata = NULL;
if (ret)
return ERR_PTR(ret);
@@ -761,7 +761,7 @@ static struct file *do_open(struct path *path, int oflag)
if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY))
return ERR_PTR(-EINVAL);
acc = oflag2acc[oflag & O_ACCMODE];
- if (inode_permission(d_inode(path->dentry), acc))
+ if (inode_permission2(path->mnt, d_inode(path->dentry), acc))
return ERR_PTR(-EACCES);
return dentry_open(path, oflag, current_cred());
}
@@ -794,7 +794,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
ro = mnt_want_write(mnt); /* we'll drop it in any case */
error = 0;
inode_lock(d_inode(root));
- path.dentry = lookup_one_len(name->name, root, strlen(name->name));
+ path.dentry = lookup_one_len2(name->name, mnt, root, strlen(name->name));
if (IS_ERR(path.dentry)) {
error = PTR_ERR(path.dentry);
goto out_putfd;
@@ -865,7 +865,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
if (err)
goto out_name;
inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT);
- dentry = lookup_one_len(name->name, mnt->mnt_root,
+ dentry = lookup_one_len2(name->name, mnt, mnt->mnt_root,
strlen(name->name));
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
@@ -877,7 +877,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
err = -ENOENT;
} else {
ihold(inode);
- err = vfs_unlink(d_inode(dentry->d_parent), dentry, NULL);
+ err = vfs_unlink2(mnt, d_inode(dentry->d_parent), dentry, NULL);
}
dput(dentry);
diff --git a/kernel/Makefile b/kernel/Makefile
index 314e7d62f5f0..108f5bee0111 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -32,6 +32,9 @@ KASAN_SANITIZE_kcov.o := n
# cond_syscall is currently not LTO compatible
CFLAGS_sys_ni.o = $(DISABLE_LTO)
+# Don't instrument error handlers
+CFLAGS_cfi.o = $(DISABLE_CFI_CLANG)
+
obj-y += sched/
obj-y += locking/
obj-y += power/
@@ -101,6 +104,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_CPU_PM) += cpu_pm.o
obj-$(CONFIG_BPF) += bpf/
+obj-$(CONFIG_CFI_CLANG) += cfi.o
obj-$(CONFIG_PERF_EVENTS) += events/
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index eed911d091da..b22256b3893d 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -5,3 +5,4 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
+obj-$(CONFIG_CGROUP_BPF) += cgroup.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index eb43f7e219f9..b30ca0f2fa41 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -16,6 +16,9 @@
#include <linux/filter.h>
#include <linux/perf_event.h>
+#define ARRAY_CREATE_FLAG_MASK \
+ (BPF_F_RDONLY | BPF_F_WRONLY)
+
static void bpf_array_free_percpu(struct bpf_array *array)
{
int i;
@@ -57,7 +60,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size == 0 || attr->map_flags)
+ attr->value_size == 0 ||
+ attr->map_flags & ~ARRAY_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1))
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
new file mode 100644
index 000000000000..a44a7e4c23d2
--- /dev/null
+++ b/kernel/bpf/cgroup.c
@@ -0,0 +1,205 @@
+/*
+ * Functions to manage eBPF programs attached to cgroups
+ *
+ * Copyright (c) 2016 Daniel Mack
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
+#include <net/sock.h>
+
+DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
+EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+
+/**
+ * cgroup_bpf_put() - put references of all bpf programs
+ * @cgrp: the cgroup to modify
+ */
+void cgroup_bpf_put(struct cgroup *cgrp)
+{
+ unsigned int type;
+
+ for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
+ struct bpf_prog *prog = cgrp->bpf.prog[type];
+
+ if (prog) {
+ bpf_prog_put(prog);
+ static_branch_dec(&cgroup_bpf_enabled_key);
+ }
+ }
+}
+
+/**
+ * cgroup_bpf_inherit() - inherit effective programs from parent
+ * @cgrp: the cgroup to modify
+ * @parent: the parent to inherit from
+ */
+void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+{
+ unsigned int type;
+
+ for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
+ struct bpf_prog *e;
+
+ e = rcu_dereference_protected(parent->bpf.effective[type],
+ lockdep_is_held(&cgroup_mutex));
+ rcu_assign_pointer(cgrp->bpf.effective[type], e);
+ cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type];
+ }
+}
+
+/**
+ * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ * propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
+ * @prog: A new program to pin
+ * @type: Type of pinning operation (ingress/egress)
+ *
+ * Each cgroup has a set of two pointers for bpf programs; one for eBPF
+ * programs it owns, and which is effective for execution.
+ *
+ * If @prog is not %NULL, this function attaches a new program to the cgroup
+ * and releases the one that is currently attached, if any. @prog is then made
+ * the effective program of type @type in that cgroup.
+ *
+ * If @prog is %NULL, the currently attached program of type @type is released,
+ * and the effective program of the parent cgroup (if any) is inherited to
+ * @cgrp.
+ *
+ * Then, the descendants of @cgrp are walked and the effective program for
+ * each of them is set to the effective program of @cgrp unless the
+ * descendant has its own program attached, in which case the subbranch is
+ * skipped. This ensures that delegated subcgroups with own programs are left
+ * untouched.
+ *
+ * Must be called with cgroup_mutex held.
+ */
+int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
+ struct bpf_prog *prog, enum bpf_attach_type type,
+ bool new_overridable)
+{
+ struct bpf_prog *old_prog, *effective = NULL;
+ struct cgroup_subsys_state *pos;
+ bool overridable = true;
+
+ if (parent) {
+ overridable = !parent->bpf.disallow_override[type];
+ effective = rcu_dereference_protected(parent->bpf.effective[type],
+ lockdep_is_held(&cgroup_mutex));
+ }
+
+ if (prog && effective && !overridable)
+ /* if parent has non-overridable prog attached, disallow
+ * attaching new programs to descendent cgroup
+ */
+ return -EPERM;
+
+ if (prog && effective && overridable != new_overridable)
+ /* if parent has overridable prog attached, only
+ * allow overridable programs in descendent cgroup
+ */
+ return -EPERM;
+
+ old_prog = cgrp->bpf.prog[type];
+
+ if (prog) {
+ overridable = new_overridable;
+ effective = prog;
+ if (old_prog &&
+ cgrp->bpf.disallow_override[type] == new_overridable)
+ /* disallow attaching non-overridable on top
+ * of existing overridable in this cgroup
+ * and vice versa
+ */
+ return -EPERM;
+ }
+
+ if (!prog && !old_prog)
+ /* report error when trying to detach and nothing is attached */
+ return -ENOENT;
+
+ cgrp->bpf.prog[type] = prog;
+
+ css_for_each_descendant_pre(pos, &cgrp->self) {
+ struct cgroup *desc = container_of(pos, struct cgroup, self);
+
+ /* skip the subtree if the descendant has its own program */
+ if (desc->bpf.prog[type] && desc != cgrp) {
+ pos = css_rightmost_descendant(pos);
+ } else {
+ rcu_assign_pointer(desc->bpf.effective[type],
+ effective);
+ desc->bpf.disallow_override[type] = !overridable;
+ }
+ }
+
+ if (prog)
+ static_branch_inc(&cgroup_bpf_enabled_key);
+
+ if (old_prog) {
+ bpf_prog_put(old_prog);
+ static_branch_dec(&cgroup_bpf_enabled_key);
+ }
+ return 0;
+}
+
+/**
+ * __cgroup_bpf_run_filter() - Run a program for packet filtering
+ * @sk: The socket sending or receiving traffic
+ * @skb: The skb that is being sent or received
+ * @type: The type of program to be exectuted
+ *
+ * If no socket is passed, or the socket is not of type INET or INET6,
+ * this function does nothing and returns 0.
+ *
+ * The program type passed in via @type must be suitable for network
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if any if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter(struct sock *sk,
+ struct sk_buff *skb,
+ enum bpf_attach_type type)
+{
+ struct bpf_prog *prog;
+ struct cgroup *cgrp;
+ int ret = 0;
+
+ if (!sk || !sk_fullsock(sk))
+ return 0;
+
+ if (sk->sk_family != AF_INET &&
+ sk->sk_family != AF_INET6)
+ return 0;
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+
+ rcu_read_lock();
+
+ prog = rcu_dereference(cgrp->bpf.effective[type]);
+ if (prog) {
+ unsigned int offset = skb->data - skb_network_header(skb);
+ struct sock *save_sk = skb->sk;
+
+ skb->sk = sk;
+ __skb_push(skb, offset);
+ ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
+ __skb_pull(skb, offset);
+ skb->sk = save_sk;
+ }
+
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 879ca844ba1d..66d64db4f8c7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -466,7 +466,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
*
* Decode and execute eBPF instructions.
*/
-static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
+static unsigned int __bpf_prog_run(const struct sk_buff *ctx, const struct bpf_insn *insn)
{
u64 stack[MAX_BPF_STACK / sizeof(u64)];
u64 regs[MAX_BPF_REG], tmp;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index a36a532c056d..9c86d5d180bb 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -14,6 +14,8 @@
#include <linux/jhash.h>
#include <linux/filter.h>
#include "percpu_freelist.h"
+#define HTAB_CREATE_FLAG_MASK \
+ (BPF_F_NO_PREALLOC | BPF_F_RDONLY | BPF_F_WRONLY)
struct bucket {
struct hlist_head head;
@@ -148,7 +150,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
int err, i;
u64 cost;
- if (attr->map_flags & ~BPF_F_NO_PREALLOC)
+ if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
/* reserved bits should not be used */
return ERR_PTR(-EINVAL);
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 1ed8473ec537..7e48fa609579 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -255,7 +255,7 @@ out:
}
static void *bpf_obj_do_get(const struct filename *pathname,
- enum bpf_type *type)
+ enum bpf_type *type, int flags)
{
struct inode *inode;
struct path path;
@@ -267,7 +267,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,
return ERR_PTR(ret);
inode = d_backing_inode(path.dentry);
- ret = inode_permission(inode, MAY_WRITE);
+ ret = inode_permission(inode, ACC_MODE(flags));
if (ret)
goto out;
@@ -286,18 +286,23 @@ out:
return ERR_PTR(ret);
}
-int bpf_obj_get_user(const char __user *pathname)
+int bpf_obj_get_user(const char __user *pathname, int flags)
{
enum bpf_type type = BPF_TYPE_UNSPEC;
struct filename *pname;
int ret = -ENOENT;
+ int f_flags;
void *raw;
+ f_flags = bpf_get_file_flag(flags);
+ if (f_flags < 0)
+ return f_flags;
+
pname = getname(pathname);
if (IS_ERR(pname))
return PTR_ERR(pname);
- raw = bpf_obj_do_get(pname, &type);
+ raw = bpf_obj_do_get(pname, &type, f_flags);
if (IS_ERR(raw)) {
ret = PTR_ERR(raw);
goto out;
@@ -306,7 +311,7 @@ int bpf_obj_get_user(const char __user *pathname)
if (type == BPF_TYPE_PROG)
ret = bpf_prog_new_fd(raw);
else if (type == BPF_TYPE_MAP)
- ret = bpf_map_new_fd(raw);
+ ret = bpf_map_new_fd(raw, f_flags);
else
goto out;
@@ -317,6 +322,42 @@ out:
return ret;
}
+static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
+{
+ struct bpf_prog *prog;
+ int ret = inode_permission(inode, MAY_READ | MAY_WRITE);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (inode->i_op == &bpf_map_iops)
+ return ERR_PTR(-EINVAL);
+ if (inode->i_op != &bpf_prog_iops)
+ return ERR_PTR(-EACCES);
+
+ prog = inode->i_private;
+
+ ret = security_bpf_prog(prog);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ return bpf_prog_inc(prog);
+}
+
+struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type)
+{
+ struct bpf_prog *prog;
+ struct path path;
+ int ret = kern_path(name, LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ERR_PTR(ret);
+ prog = __get_prog_inode(d_backing_inode(path.dentry), type);
+ if (!IS_ERR(prog))
+ touch_atime(&path);
+ path_put(&path);
+ return prog;
+}
+EXPORT_SYMBOL(bpf_prog_get_type_path);
+
static void bpf_evict_inode(struct inode *inode)
{
enum bpf_type type;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index a2a232dec236..6039db3dcf02 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -11,6 +11,9 @@
#include <linux/perf_event.h>
#include "percpu_freelist.h"
+#define STACK_CREATE_FLAG_MASK \
+ (BPF_F_RDONLY | BPF_F_WRONLY)
+
struct stack_map_bucket {
struct pcpu_freelist_node fnode;
u32 hash;
@@ -59,7 +62,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
- if (attr->map_flags)
+ if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
/* check sanity of attributes */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ca7e277e8b5f..85ea5983d875 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -20,6 +20,8 @@
#include <linux/filter.h>
#include <linux/version.h>
+#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY)
+
DEFINE_PER_CPU(int, bpf_prog_active);
int sysctl_unprivileged_bpf_disabled __read_mostly;
@@ -119,6 +121,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
struct bpf_map *map = container_of(work, struct bpf_map, work);
bpf_map_uncharge_memlock(map);
+ security_bpf_map_free(map);
/* implementation dependent freeing */
map->ops->map_free(map);
}
@@ -178,17 +181,54 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
}
#endif
-static const struct file_operations bpf_map_fops = {
+static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
+ loff_t *ppos)
+{
+ /* We need this handler such that alloc_file() enables
+ * f_mode with FMODE_CAN_READ.
+ */
+ return -EINVAL;
+}
+
+static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
+ size_t siz, loff_t *ppos)
+{
+ /* We need this handler such that alloc_file() enables
+ * f_mode with FMODE_CAN_WRITE.
+ */
+ return -EINVAL;
+}
+
+const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_map_show_fdinfo,
#endif
.release = bpf_map_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
};
-int bpf_map_new_fd(struct bpf_map *map)
+int bpf_map_new_fd(struct bpf_map *map, int flags)
{
+ int ret;
+
+ ret = security_bpf_map(map, OPEN_FMODE(flags));
+ if (ret < 0)
+ return ret;
+
return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
- O_RDWR | O_CLOEXEC);
+ flags | O_CLOEXEC);
+}
+
+int bpf_get_file_flag(int flags)
+{
+ if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
+ return -EINVAL;
+ if (flags & BPF_F_RDONLY)
+ return O_RDONLY;
+ if (flags & BPF_F_WRONLY)
+ return O_WRONLY;
+ return O_RDWR;
}
/* helper macro to check that unused fields 'union bpf_attr' are zero */
@@ -204,12 +244,17 @@ int bpf_map_new_fd(struct bpf_map *map)
static int map_create(union bpf_attr *attr)
{
struct bpf_map *map;
+ int f_flags;
int err;
err = CHECK_ATTR(BPF_MAP_CREATE);
if (err)
return -EINVAL;
+ f_flags = bpf_get_file_flag(attr->map_flags);
+ if (f_flags < 0)
+ return f_flags;
+
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
map = find_and_alloc_map(attr);
if (IS_ERR(map))
@@ -218,11 +263,15 @@ static int map_create(union bpf_attr *attr)
atomic_set(&map->refcnt, 1);
atomic_set(&map->usercnt, 1);
- err = bpf_map_charge_memlock(map);
+ err = security_bpf_map_alloc(map);
if (err)
goto free_map_nouncharge;
- err = bpf_map_new_fd(map);
+ err = bpf_map_charge_memlock(map);
+ if (err)
+ goto free_map_sec;
+
+ err = bpf_map_new_fd(map, f_flags);
if (err < 0)
/* failed to allocate fd */
goto free_map;
@@ -231,6 +280,8 @@ static int map_create(union bpf_attr *attr)
free_map:
bpf_map_uncharge_memlock(map);
+free_map_sec:
+ security_bpf_map_free(map);
free_map_nouncharge:
map->ops->map_free(map);
return err;
@@ -313,6 +364,11 @@ static int map_lookup_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
err = -ENOMEM;
key = kmalloc(map->key_size, GFP_USER);
if (!key)
@@ -387,6 +443,11 @@ static int map_update_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
err = -ENOMEM;
key = kmalloc(map->key_size, GFP_USER);
if (!key)
@@ -463,6 +524,11 @@ static int map_delete_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
err = -ENOMEM;
key = kmalloc(map->key_size, GFP_USER);
if (!key)
@@ -508,6 +574,11 @@ static int map_get_next_key(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
if (ukey) {
err = -ENOMEM;
key = kmalloc(map->key_size, GFP_USER);
@@ -611,6 +682,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
free_used_maps(aux);
bpf_prog_uncharge_memlock(aux->prog);
+ security_bpf_prog_free(aux);
bpf_prog_free(aux->prog);
}
@@ -629,12 +701,20 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
return 0;
}
-static const struct file_operations bpf_prog_fops = {
+const struct file_operations bpf_prog_fops = {
.release = bpf_prog_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
};
int bpf_prog_new_fd(struct bpf_prog *prog)
{
+ int ret;
+
+ ret = security_bpf_prog(prog);
+ if (ret < 0)
+ return ret;
+
return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
O_RDWR | O_CLOEXEC);
}
@@ -726,7 +806,9 @@ static int bpf_prog_load(union bpf_attr *attr)
attr->kern_version != LINUX_VERSION_CODE)
return -EINVAL;
- if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
+ if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
+ type != BPF_PROG_TYPE_CGROUP_SKB &&
+ !capable(CAP_SYS_ADMIN))
return -EPERM;
/* plain bpf_prog allocation */
@@ -734,10 +816,14 @@ static int bpf_prog_load(union bpf_attr *attr)
if (!prog)
return -ENOMEM;
- err = bpf_prog_charge_memlock(prog);
+ err = security_bpf_prog_alloc(prog->aux);
if (err)
goto free_prog_nouncharge;
+ err = bpf_prog_charge_memlock(prog);
+ if (err)
+ goto free_prog_sec;
+
prog->len = attr->insn_cnt;
err = -EFAULT;
@@ -777,16 +863,18 @@ free_used_maps:
free_used_maps(prog->aux);
free_prog:
bpf_prog_uncharge_memlock(prog);
+free_prog_sec:
+ security_bpf_prog_free(prog->aux);
free_prog_nouncharge:
bpf_prog_free(prog);
return err;
}
-#define BPF_OBJ_LAST_FIELD bpf_fd
+#define BPF_OBJ_LAST_FIELD file_flags
static int bpf_obj_pin(const union bpf_attr *attr)
{
- if (CHECK_ATTR(BPF_OBJ))
+ if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
return -EINVAL;
return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname));
@@ -794,12 +882,93 @@ static int bpf_obj_pin(const union bpf_attr *attr)
static int bpf_obj_get(const union bpf_attr *attr)
{
- if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
+ if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
+ attr->file_flags & ~BPF_OBJ_FLAG_MASK)
return -EINVAL;
- return bpf_obj_get_user(u64_to_ptr(attr->pathname));
+ return bpf_obj_get_user(u64_to_ptr(attr->pathname),
+ attr->file_flags);
}
+#ifdef CONFIG_CGROUP_BPF
+
+#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
+
+static int bpf_prog_attach(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ struct cgroup *cgrp;
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (CHECK_ATTR(BPF_PROG_ATTACH))
+ return -EINVAL;
+
+ if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE)
+ return -EINVAL;
+
+ switch (attr->attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ prog = bpf_prog_get_type(attr->attach_bpf_fd,
+ BPF_PROG_TYPE_CGROUP_SKB);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp)) {
+ bpf_prog_put(prog);
+ return PTR_ERR(cgrp);
+ }
+
+ ret = cgroup_bpf_update(cgrp, prog, attr->attach_type,
+ attr->attach_flags & BPF_F_ALLOW_OVERRIDE);
+ if (ret)
+ bpf_prog_put(prog);
+ cgroup_put(cgrp);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return ret;
+}
+
+#define BPF_PROG_DETACH_LAST_FIELD attach_type
+
+static int bpf_prog_detach(const union bpf_attr *attr)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (CHECK_ATTR(BPF_PROG_DETACH))
+ return -EINVAL;
+
+ switch (attr->attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false);
+ cgroup_put(cgrp);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return ret;
+}
+#endif /* CONFIG_CGROUP_BPF */
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
@@ -841,6 +1010,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
if (copy_from_user(&attr, uattr, size) != 0)
return -EFAULT;
+ err = security_bpf(cmd, &attr, size);
+ if (err < 0)
+ return err;
+
switch (cmd) {
case BPF_MAP_CREATE:
err = map_create(&attr);
@@ -866,6 +1039,16 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_OBJ_GET:
err = bpf_obj_get(&attr);
break;
+
+#ifdef CONFIG_CGROUP_BPF
+ case BPF_PROG_ATTACH:
+ err = bpf_prog_attach(&attr);
+ break;
+ case BPF_PROG_DETACH:
+ err = bpf_prog_detach(&attr);
+ break;
+#endif
+
default:
err = -EINVAL;
break;
diff --git a/kernel/cfi.c b/kernel/cfi.c
new file mode 100644
index 000000000000..6951c25d311b
--- /dev/null
+++ b/kernel/cfi.c
@@ -0,0 +1,299 @@
+/*
+ * CFI (Control Flow Integrity) error and slowpath handling
+ *
+ * Copyright (C) 2017 Google, Inc.
+ */
+
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/ratelimit.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <asm/bug.h>
+#include <asm/cacheflush.h>
+#include <asm/memory.h>
+
+/* Compiler-defined handler names */
+#ifdef CONFIG_CFI_PERMISSIVE
+#define cfi_failure_handler __ubsan_handle_cfi_check_fail
+#define cfi_slowpath_handler __cfi_slowpath_diag
+#else /* enforcing */
+#define cfi_failure_handler __ubsan_handle_cfi_check_fail_abort
+#define cfi_slowpath_handler __cfi_slowpath
+#endif /* CONFIG_CFI_PERMISSIVE */
+
+static inline void handle_cfi_failure(void *ptr)
+{
+#ifdef CONFIG_CFI_PERMISSIVE
+ WARN_RATELIMIT(1, "CFI failure (target: [<%px>] %pF):\n", ptr, ptr);
+#else
+ pr_err("CFI failure (target: [<%px>] %pF):\n", ptr, ptr);
+ BUG();
+#endif
+}
+
+#ifdef CONFIG_MODULES
+#ifdef CONFIG_CFI_CLANG_SHADOW
+struct shadow_range {
+ /* Module address range */
+ unsigned long mod_min_addr;
+ unsigned long mod_max_addr;
+ /* Module page range */
+ unsigned long min_page;
+ unsigned long max_page;
+};
+
+#define SHADOW_ORDER 1
+#define SHADOW_PAGES (1 << SHADOW_ORDER)
+#define SHADOW_SIZE \
+ ((SHADOW_PAGES * PAGE_SIZE - sizeof(struct shadow_range)) / sizeof(u16))
+#define SHADOW_INVALID 0xFFFF
+
+struct cfi_shadow {
+ /* Page range covered by the shadow */
+ struct shadow_range r;
+ /* Page offsets to __cfi_check functions in modules */
+ u16 shadow[SHADOW_SIZE];
+};
+
+static DEFINE_SPINLOCK(shadow_update_lock);
+static struct cfi_shadow __rcu *cfi_shadow __read_mostly = NULL;
+
+static inline int ptr_to_shadow(const struct cfi_shadow *s, unsigned long ptr)
+{
+ unsigned long index;
+ unsigned long page = ptr >> PAGE_SHIFT;
+
+ if (unlikely(page < s->r.min_page))
+ return -1; /* Outside of module area */
+
+ index = page - s->r.min_page;
+
+ if (index >= SHADOW_SIZE)
+ return -1; /* Cannot be addressed with shadow */
+
+ return (int)index;
+}
+
+static inline unsigned long shadow_to_ptr(const struct cfi_shadow *s,
+ int index)
+{
+ BUG_ON(index < 0 || index >= SHADOW_SIZE);
+
+ if (unlikely(s->shadow[index] == SHADOW_INVALID))
+ return 0;
+
+ return (s->r.min_page + s->shadow[index]) << PAGE_SHIFT;
+}
+
+static void prepare_next_shadow(const struct cfi_shadow __rcu *prev,
+ struct cfi_shadow *next)
+{
+ int i, index, check;
+
+ /* Mark everything invalid */
+ memset(next->shadow, 0xFF, sizeof(next->shadow));
+
+ if (!prev)
+ return; /* No previous shadow */
+
+ /* If the base address didn't change, update is not needed */
+ if (prev->r.min_page == next->r.min_page) {
+ memcpy(next->shadow, prev->shadow, sizeof(next->shadow));
+ return;
+ }
+
+ /* Convert the previous shadow to the new address range */
+ for (i = 0; i < SHADOW_SIZE; ++i) {
+ if (prev->shadow[i] == SHADOW_INVALID)
+ continue;
+
+ index = ptr_to_shadow(next, shadow_to_ptr(prev, i));
+ if (index < 0)
+ continue;
+
+ check = ptr_to_shadow(next,
+ shadow_to_ptr(prev, prev->shadow[i]));
+ if (check < 0)
+ continue;
+
+ next->shadow[index] = (u16)check;
+ }
+}
+
+static void add_module_to_shadow(struct cfi_shadow *s, struct module *mod)
+{
+ unsigned long ptr;
+ unsigned long min_page_addr;
+ unsigned long max_page_addr;
+ unsigned long check = (unsigned long)mod->cfi_check;
+ int check_index = ptr_to_shadow(s, check);
+
+ BUG_ON((check & PAGE_MASK) != check); /* Must be page aligned */
+
+ if (check_index < 0)
+ return; /* Module not addressable with shadow */
+
+ min_page_addr = (unsigned long)mod->core_layout.base & PAGE_MASK;
+ max_page_addr = (unsigned long)mod->core_layout.base +
+ mod->core_layout.text_size;
+ max_page_addr &= PAGE_MASK;
+
+ /* For each page, store the check function index in the shadow */
+ for (ptr = min_page_addr; ptr <= max_page_addr; ptr += PAGE_SIZE) {
+ int index = ptr_to_shadow(s, ptr);
+ if (index >= 0) {
+ /* Assume a page only contains code for one module */
+ BUG_ON(s->shadow[index] != SHADOW_INVALID);
+ s->shadow[index] = (u16)check_index;
+ }
+ }
+}
+
+static void remove_module_from_shadow(struct cfi_shadow *s, struct module *mod)
+{
+ unsigned long ptr;
+ unsigned long min_page_addr;
+ unsigned long max_page_addr;
+
+ min_page_addr = (unsigned long)mod->core_layout.base & PAGE_MASK;
+ max_page_addr = (unsigned long)mod->core_layout.base +
+ mod->core_layout.text_size;
+ max_page_addr &= PAGE_MASK;
+
+ for (ptr = min_page_addr; ptr <= max_page_addr; ptr += PAGE_SIZE) {
+ int index = ptr_to_shadow(s, ptr);
+ if (index >= 0)
+ s->shadow[index] = SHADOW_INVALID;
+ }
+}
+
+typedef void (*update_shadow_fn)(struct cfi_shadow *, struct module *);
+
+static void update_shadow(struct module *mod, unsigned long min_addr,
+ unsigned long max_addr, update_shadow_fn fn)
+{
+ struct cfi_shadow *prev;
+ struct cfi_shadow *next = (struct cfi_shadow *)
+ __get_free_pages(GFP_KERNEL, SHADOW_ORDER);
+
+ BUG_ON(!next);
+
+ next->r.mod_min_addr = min_addr;
+ next->r.mod_max_addr = max_addr;
+ next->r.min_page = min_addr >> PAGE_SHIFT;
+ next->r.max_page = max_addr >> PAGE_SHIFT;
+
+ spin_lock(&shadow_update_lock);
+ prev = rcu_dereference_protected(cfi_shadow, 1);
+ prepare_next_shadow(prev, next);
+
+ fn(next, mod);
+ set_memory_ro((unsigned long)next, SHADOW_PAGES);
+ rcu_assign_pointer(cfi_shadow, next);
+
+ spin_unlock(&shadow_update_lock);
+ synchronize_rcu();
+
+ if (prev) {
+ set_memory_rw((unsigned long)prev, SHADOW_PAGES);
+ free_pages((unsigned long)prev, SHADOW_ORDER);
+ }
+}
+
+void cfi_module_add(struct module *mod, unsigned long min_addr,
+ unsigned long max_addr)
+{
+ update_shadow(mod, min_addr, max_addr, add_module_to_shadow);
+}
+EXPORT_SYMBOL(cfi_module_add);
+
+void cfi_module_remove(struct module *mod, unsigned long min_addr,
+ unsigned long max_addr)
+{
+ update_shadow(mod, min_addr, max_addr, remove_module_from_shadow);
+}
+EXPORT_SYMBOL(cfi_module_remove);
+
+static inline cfi_check_fn ptr_to_check_fn(const struct cfi_shadow __rcu *s,
+ unsigned long ptr)
+{
+ int index;
+ unsigned long check;
+
+ if (unlikely(!s))
+ return NULL; /* No shadow available */
+
+ if (ptr < s->r.mod_min_addr || ptr > s->r.mod_max_addr)
+ return NULL; /* Not in a mapped module */
+
+ index = ptr_to_shadow(s, ptr);
+ if (index < 0)
+ return NULL; /* Cannot be addressed with shadow */
+
+ return (cfi_check_fn)shadow_to_ptr(s, index);
+}
+#endif /* CONFIG_CFI_CLANG_SHADOW */
+
+static inline cfi_check_fn find_module_cfi_check(void *ptr)
+{
+ struct module *mod;
+
+ preempt_disable();
+ mod = __module_address((unsigned long)ptr);
+ preempt_enable();
+
+ if (mod)
+ return mod->cfi_check;
+
+ return CFI_CHECK_FN;
+}
+
+static inline cfi_check_fn find_cfi_check(void *ptr)
+{
+#ifdef CONFIG_CFI_CLANG_SHADOW
+ cfi_check_fn f;
+
+ if (!rcu_access_pointer(cfi_shadow))
+ return CFI_CHECK_FN; /* No loaded modules */
+
+ /* Look up the __cfi_check function to use */
+ rcu_read_lock();
+ f = ptr_to_check_fn(rcu_dereference(cfi_shadow), (unsigned long)ptr);
+ rcu_read_unlock();
+
+ if (f)
+ return f;
+
+ /*
+ * Fall back to find_module_cfi_check, which works also for a larger
+ * module address space, but is slower.
+ */
+#endif /* CONFIG_CFI_CLANG_SHADOW */
+
+ return find_module_cfi_check(ptr);
+}
+
+void cfi_slowpath_handler(uint64_t id, void *ptr, void *diag)
+{
+ cfi_check_fn check = find_cfi_check(ptr);
+
+ if (likely(check))
+ check(id, ptr, diag);
+ else /* Don't allow unchecked modules */
+ handle_cfi_failure(ptr);
+}
+EXPORT_SYMBOL(cfi_slowpath_handler);
+#endif /* CONFIG_MODULES */
+
+void cfi_failure_handler(void *data, void *ptr, void *vtable)
+{
+ handle_cfi_failure(ptr);
+}
+EXPORT_SYMBOL(cfi_failure_handler);
+
+void __cfi_check_fail(void *data, void *ptr)
+{
+ handle_cfi_failure(ptr);
+}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bb0cf1caf1cd..2ebd03d49bff 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2856,7 +2856,8 @@ static int cgroup_procs_write_permission(struct task_struct *task,
*/
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
!uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid))
+ !uid_eq(cred->euid, tcred->suid) &&
+ !ns_capable(tcred->user_ns, CAP_SYS_NICE))
ret = -EACCES;
if (!ret && cgroup_on_dfl(dst_cgrp)) {
@@ -5079,6 +5080,8 @@ static void css_release_work_fn(struct work_struct *work)
if (cgrp->kn)
RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
NULL);
+
+ cgroup_bpf_put(cgrp);
}
mutex_unlock(&cgroup_mutex);
@@ -5291,6 +5294,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
if (!cgroup_on_dfl(cgrp))
cgrp->subtree_control = cgroup_control(cgrp);
+ if (parent)
+ cgroup_bpf_inherit(cgrp, parent);
+
cgroup_propagate_control(cgrp);
return cgrp;
@@ -6506,6 +6512,20 @@ static __init int cgroup_namespaces_init(void)
}
subsys_initcall(cgroup_namespaces_init);
+#ifdef CONFIG_CGROUP_BPF
+int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, bool overridable)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+#endif /* CONFIG_CGROUP_BPF */
+
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
deleted file mode 100644
index 1a8f34f63601..000000000000
--- a/kernel/configs/android-base.config
+++ /dev/null
@@ -1,149 +0,0 @@
-# KEEP ALPHABETICALLY SORTED
-# CONFIG_DEVKMEM is not set
-# CONFIG_DEVMEM is not set
-# CONFIG_INET_LRO is not set
-# CONFIG_MODULES is not set
-# CONFIG_OABI_COMPAT is not set
-# CONFIG_SYSVIPC is not set
-CONFIG_ANDROID=y
-CONFIG_ANDROID_BINDER_IPC=y
-CONFIG_ANDROID_LOW_MEMORY_KILLER=y
-CONFIG_ARMV8_DEPRECATED=y
-CONFIG_ASHMEM=y
-CONFIG_AUDIT=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CGROUPS=y
-CONFIG_CGROUP_CPUACCT=y
-CONFIG_CGROUP_DEBUG=y
-CONFIG_CGROUP_FREEZER=y
-CONFIG_CGROUP_SCHED=y
-CONFIG_CP15_BARRIER_EMULATION=y
-CONFIG_DEFAULT_SECURITY_SELINUX=y
-CONFIG_EMBEDDED=y
-CONFIG_FB=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_INET6_AH=y
-CONFIG_INET6_ESP=y
-CONFIG_INET6_IPCOMP=y
-CONFIG_INET=y
-CONFIG_INET_DIAG_DESTROY=y
-CONFIG_INET_ESP=y
-CONFIG_INET_XFRM_MODE_TUNNEL=y
-CONFIG_IP6_NF_FILTER=y
-CONFIG_IP6_NF_IPTABLES=y
-CONFIG_IP6_NF_MANGLE=y
-CONFIG_IP6_NF_RAW=y
-CONFIG_IP6_NF_TARGET_REJECT=y
-CONFIG_IPV6=y
-CONFIG_IPV6_MIP6=y
-CONFIG_IPV6_MULTIPLE_TABLES=y
-CONFIG_IPV6_OPTIMISTIC_DAD=y
-CONFIG_IPV6_ROUTER_PREF=y
-CONFIG_IPV6_ROUTE_INFO=y
-CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_MULTIPLE_TABLES=y
-CONFIG_IP_NF_ARPFILTER=y
-CONFIG_IP_NF_ARPTABLES=y
-CONFIG_IP_NF_ARP_MANGLE=y
-CONFIG_IP_NF_FILTER=y
-CONFIG_IP_NF_IPTABLES=y
-CONFIG_IP_NF_MANGLE=y
-CONFIG_IP_NF_MATCH_AH=y
-CONFIG_IP_NF_MATCH_ECN=y
-CONFIG_IP_NF_MATCH_TTL=y
-CONFIG_IP_NF_NAT=y
-CONFIG_IP_NF_RAW=y
-CONFIG_IP_NF_SECURITY=y
-CONFIG_IP_NF_TARGET_MASQUERADE=y
-CONFIG_IP_NF_TARGET_NETMAP=y
-CONFIG_IP_NF_TARGET_REDIRECT=y
-CONFIG_IP_NF_TARGET_REJECT=y
-CONFIG_NET=y
-CONFIG_NETDEVICES=y
-CONFIG_NETFILTER=y
-CONFIG_NETFILTER_TPROXY=y
-CONFIG_NETFILTER_XT_MATCH_COMMENT=y
-CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
-CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
-CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
-CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
-CONFIG_NETFILTER_XT_MATCH_HELPER=y
-CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
-CONFIG_NETFILTER_XT_MATCH_LENGTH=y
-CONFIG_NETFILTER_XT_MATCH_LIMIT=y
-CONFIG_NETFILTER_XT_MATCH_MAC=y
-CONFIG_NETFILTER_XT_MATCH_MARK=y
-CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
-CONFIG_NETFILTER_XT_MATCH_POLICY=y
-CONFIG_NETFILTER_XT_MATCH_QUOTA=y
-CONFIG_NETFILTER_XT_MATCH_SOCKET=y
-CONFIG_NETFILTER_XT_MATCH_STATE=y
-CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
-CONFIG_NETFILTER_XT_MATCH_STRING=y
-CONFIG_NETFILTER_XT_MATCH_TIME=y
-CONFIG_NETFILTER_XT_MATCH_U32=y
-CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
-CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
-CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
-CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
-CONFIG_NETFILTER_XT_TARGET_MARK=y
-CONFIG_NETFILTER_XT_TARGET_NFLOG=y
-CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
-CONFIG_NETFILTER_XT_TARGET_SECMARK=y
-CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
-CONFIG_NETFILTER_XT_TARGET_TPROXY=y
-CONFIG_NETFILTER_XT_TARGET_TRACE=y
-CONFIG_NET_CLS_ACT=y
-CONFIG_NET_CLS_U32=y
-CONFIG_NET_EMATCH=y
-CONFIG_NET_EMATCH_U32=y
-CONFIG_NET_KEY=y
-CONFIG_NET_SCHED=y
-CONFIG_NET_SCH_HTB=y
-CONFIG_NF_CONNTRACK=y
-CONFIG_NF_CONNTRACK_AMANDA=y
-CONFIG_NF_CONNTRACK_EVENTS=y
-CONFIG_NF_CONNTRACK_FTP=y
-CONFIG_NF_CONNTRACK_H323=y
-CONFIG_NF_CONNTRACK_IPV4=y
-CONFIG_NF_CONNTRACK_IPV6=y
-CONFIG_NF_CONNTRACK_IRC=y
-CONFIG_NF_CONNTRACK_NETBIOS_NS=y
-CONFIG_NF_CONNTRACK_PPTP=y
-CONFIG_NF_CONNTRACK_SANE=y
-CONFIG_NF_CONNTRACK_SECMARK=y
-CONFIG_NF_CONNTRACK_TFTP=y
-CONFIG_NF_CT_NETLINK=y
-CONFIG_NF_CT_PROTO_DCCP=y
-CONFIG_NF_CT_PROTO_SCTP=y
-CONFIG_NF_CT_PROTO_UDPLITE=y
-CONFIG_NF_NAT=y
-CONFIG_NO_HZ=y
-CONFIG_PACKET=y
-CONFIG_PM_AUTOSLEEP=y
-CONFIG_PM_WAKELOCKS=y
-CONFIG_PPP=y
-CONFIG_PPP_BSDCOMP=y
-CONFIG_PPP_DEFLATE=y
-CONFIG_PPP_MPPE=y
-CONFIG_PREEMPT=y
-CONFIG_QUOTA=y
-CONFIG_RTC_CLASS=y
-CONFIG_RT_GROUP_SCHED=y
-CONFIG_SECCOMP=y
-CONFIG_SECURITY=y
-CONFIG_SECURITY_NETWORK=y
-CONFIG_SECURITY_SELINUX=y
-CONFIG_SETEND_EMULATION=y
-CONFIG_STAGING=y
-CONFIG_SWP_EMULATION=y
-CONFIG_SYNC=y
-CONFIG_TUN=y
-CONFIG_UNIX=y
-CONFIG_USB_GADGET=y
-CONFIG_USB_CONFIGFS=y
-CONFIG_USB_CONFIGFS_F_FS=y
-CONFIG_USB_CONFIGFS_F_MIDI=y
-CONFIG_USB_OTG_WAKELOCK=y
-CONFIG_XFRM_USER=y
diff --git a/kernel/configs/android-fetch-configs.sh b/kernel/configs/android-fetch-configs.sh
new file mode 100755
index 000000000000..a5b56d4908dc
--- /dev/null
+++ b/kernel/configs/android-fetch-configs.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+curl https://android.googlesource.com/kernel/configs/+archive/master/android-4.9.tar.gz | tar xzv
+
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
deleted file mode 100644
index 297756be369c..000000000000
--- a/kernel/configs/android-recommended.config
+++ /dev/null
@@ -1,125 +0,0 @@
-# KEEP ALPHABETICALLY SORTED
-# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
-# CONFIG_INPUT_MOUSE is not set
-# CONFIG_LEGACY_PTYS is not set
-# CONFIG_NF_CONNTRACK_SIP is not set
-# CONFIG_PM_WAKELOCKS_GC is not set
-# CONFIG_VT is not set
-CONFIG_BACKLIGHT_LCD_SUPPORT=y
-CONFIG_BLK_DEV_DM=y
-CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_RAM=y
-CONFIG_BLK_DEV_RAM_SIZE=8192
-CONFIG_COMPACTION=y
-CONFIG_DEBUG_RODATA=y
-CONFIG_DM_CRYPT=y
-CONFIG_DM_UEVENT=y
-CONFIG_DM_VERITY=y
-CONFIG_DM_VERITY_FEC=y
-CONFIG_DRAGONRISE_FF=y
-CONFIG_ENABLE_DEFAULT_TRACERS=y
-CONFIG_EXT4_FS=y
-CONFIG_EXT4_FS_SECURITY=y
-CONFIG_FUSE_FS=y
-CONFIG_GREENASIA_FF=y
-CONFIG_HIDRAW=y
-CONFIG_HID_A4TECH=y
-CONFIG_HID_ACRUX=y
-CONFIG_HID_ACRUX_FF=y
-CONFIG_HID_APPLE=y
-CONFIG_HID_BELKIN=y
-CONFIG_HID_CHERRY=y
-CONFIG_HID_CHICONY=y
-CONFIG_HID_CYPRESS=y
-CONFIG_HID_DRAGONRISE=y
-CONFIG_HID_ELECOM=y
-CONFIG_HID_EMS_FF=y
-CONFIG_HID_EZKEY=y
-CONFIG_HID_GREENASIA=y
-CONFIG_HID_GYRATION=y
-CONFIG_HID_HOLTEK=y
-CONFIG_HID_KENSINGTON=y
-CONFIG_HID_KEYTOUCH=y
-CONFIG_HID_KYE=y
-CONFIG_HID_LCPOWER=y
-CONFIG_HID_LOGITECH=y
-CONFIG_HID_LOGITECH_DJ=y
-CONFIG_HID_MAGICMOUSE=y
-CONFIG_HID_MICROSOFT=y
-CONFIG_HID_MONTEREY=y
-CONFIG_HID_MULTITOUCH=y
-CONFIG_HID_NTRIG=y
-CONFIG_HID_ORTEK=y
-CONFIG_HID_PANTHERLORD=y
-CONFIG_HID_PETALYNX=y
-CONFIG_HID_PICOLCD=y
-CONFIG_HID_PRIMAX=y
-CONFIG_HID_PRODIKEYS=y
-CONFIG_HID_ROCCAT=y
-CONFIG_HID_SAITEK=y
-CONFIG_HID_SAMSUNG=y
-CONFIG_HID_SMARTJOYPLUS=y
-CONFIG_HID_SONY=y
-CONFIG_HID_SPEEDLINK=y
-CONFIG_HID_SUNPLUS=y
-CONFIG_HID_THRUSTMASTER=y
-CONFIG_HID_TIVO=y
-CONFIG_HID_TOPSEED=y
-CONFIG_HID_TWINHAN=y
-CONFIG_HID_UCLOGIC=y
-CONFIG_HID_WACOM=y
-CONFIG_HID_WALTOP=y
-CONFIG_HID_WIIMOTE=y
-CONFIG_HID_ZEROPLUS=y
-CONFIG_HID_ZYDACRON=y
-CONFIG_INPUT_EVDEV=y
-CONFIG_INPUT_GPIO=y
-CONFIG_INPUT_JOYSTICK=y
-CONFIG_INPUT_MISC=y
-CONFIG_INPUT_TABLET=y
-CONFIG_INPUT_UINPUT=y
-CONFIG_ION=y
-CONFIG_JOYSTICK_XPAD=y
-CONFIG_JOYSTICK_XPAD_FF=y
-CONFIG_JOYSTICK_XPAD_LEDS=y
-CONFIG_KALLSYMS_ALL=y
-CONFIG_KSM=y
-CONFIG_LOGIG940_FF=y
-CONFIG_LOGIRUMBLEPAD2_FF=y
-CONFIG_LOGITECH_FF=y
-CONFIG_MD=y
-CONFIG_MEDIA_SUPPORT=y
-CONFIG_MSDOS_FS=y
-CONFIG_PANIC_TIMEOUT=5
-CONFIG_PANTHERLORD_FF=y
-CONFIG_PERF_EVENTS=y
-CONFIG_PM_DEBUG=y
-CONFIG_PM_RUNTIME=y
-CONFIG_PM_WAKELOCKS_LIMIT=0
-CONFIG_POWER_SUPPLY=y
-CONFIG_PSTORE=y
-CONFIG_PSTORE_CONSOLE=y
-CONFIG_PSTORE_RAM=y
-CONFIG_SCHEDSTATS=y
-CONFIG_SMARTJOYPLUS_FF=y
-CONFIG_SND=y
-CONFIG_SOUND=y
-CONFIG_SUSPEND_TIME=y
-CONFIG_TABLET_USB_ACECAD=y
-CONFIG_TABLET_USB_AIPTEK=y
-CONFIG_TABLET_USB_GTCO=y
-CONFIG_TABLET_USB_HANWANG=y
-CONFIG_TABLET_USB_KBTAB=y
-CONFIG_TASKSTATS=y
-CONFIG_TASK_DELAY_ACCT=y
-CONFIG_TASK_IO_ACCOUNTING=y
-CONFIG_TASK_XACCT=y
-CONFIG_TIMER_STATS=y
-CONFIG_TMPFS=y
-CONFIG_TMPFS_POSIX_ACL=y
-CONFIG_UHID=y
-CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
-CONFIG_USB_EHCI_HCD=y
-CONFIG_USB_HIDDEV=y
-CONFIG_USB_USBNET=y
-CONFIG_VFAT_FS=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b5a0165b7300..67eb51252542 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1262,6 +1262,7 @@ void __weak arch_enable_nonboot_cpus_end(void)
void enable_nonboot_cpus(void)
{
int cpu, error;
+ struct device *cpu_device;
/* Allow everyone to use the CPU hotplug again */
cpu_maps_update_begin();
@@ -1279,6 +1280,12 @@ void enable_nonboot_cpus(void)
trace_suspend_resume(TPS("CPU_ON"), cpu, false);
if (!error) {
pr_info("CPU%d is up\n", cpu);
+ cpu_device = get_cpu_device(cpu);
+ if (!cpu_device)
+ pr_err("%s: failed to get cpu%d device\n",
+ __func__, cpu);
+ else
+ kobject_uevent(&cpu_device->kobj, KOBJ_ONLINE);
continue;
}
pr_warn("Error taking CPU%d up: %d\n", cpu, error);
@@ -2206,3 +2213,23 @@ void __init boot_cpu_hotplug_init(void)
#endif
this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
}
+
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+
+void idle_notifier_register(struct notifier_block *n)
+{
+ atomic_notifier_chain_register(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_register);
+
+void idle_notifier_unregister(struct notifier_block *n)
+{
+ atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_unregister);
+
+void idle_notifier_call_chain(unsigned long val)
+{
+ atomic_notifier_call_chain(&idle_notifier, val, NULL);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_call_chain);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 511b1dd8ff09..194e2f24841b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -100,6 +100,7 @@ struct cpuset {
/* user-configured CPUs and Memory Nodes allow to tasks */
cpumask_var_t cpus_allowed;
+ cpumask_var_t cpus_requested;
nodemask_t mems_allowed;
/* effective CPUs and Memory Nodes allow to tasks */
@@ -399,7 +400,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
- return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
+ return cpumask_subset(p->cpus_requested, q->cpus_requested) &&
nodes_subset(p->mems_allowed, q->mems_allowed) &&
is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
is_mem_exclusive(p) <= is_mem_exclusive(q);
@@ -499,7 +500,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
cpuset_for_each_child(c, css, par) {
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
c != cur &&
- cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
+ cpumask_intersects(trial->cpus_requested, c->cpus_requested))
goto out;
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
c != cur &&
@@ -958,17 +959,18 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (!*buf) {
cpumask_clear(trialcs->cpus_allowed);
} else {
- retval = cpulist_parse(buf, trialcs->cpus_allowed);
+ retval = cpulist_parse(buf, trialcs->cpus_requested);
if (retval < 0)
return retval;
- if (!cpumask_subset(trialcs->cpus_allowed,
- top_cpuset.cpus_allowed))
+ if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask))
return -EINVAL;
+
+ cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, cpu_active_mask);
}
/* Nothing to do if the cpus didn't change */
- if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
+ if (cpumask_equal(cs->cpus_requested, trialcs->cpus_requested))
return 0;
retval = validate_change(cs, trialcs);
@@ -977,6 +979,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+ cpumask_copy(cs->cpus_requested, trialcs->cpus_requested);
spin_unlock_irq(&callback_lock);
/* use trialcs->cpus_allowed as a temp variable */
@@ -1761,7 +1764,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
switch (type) {
case FILE_CPULIST:
- seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_requested));
break;
case FILE_MEMLIST:
seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
@@ -1951,11 +1954,14 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
return ERR_PTR(-ENOMEM);
if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
goto free_cs;
+ if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
+ goto free_allowed;
if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
- goto free_cpus;
+ goto free_requested;
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
+ cpumask_clear(cs->cpus_requested);
nodes_clear(cs->mems_allowed);
cpumask_clear(cs->effective_cpus);
nodes_clear(cs->effective_mems);
@@ -1964,7 +1970,9 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
return &cs->css;
-free_cpus:
+free_requested:
+ free_cpumask_var(cs->cpus_requested);
+free_allowed:
free_cpumask_var(cs->cpus_allowed);
free_cs:
kfree(cs);
@@ -2027,6 +2035,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->mems_allowed = parent->mems_allowed;
cs->effective_mems = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+ cpumask_copy(cs->cpus_requested, parent->cpus_requested);
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
out_unlock:
@@ -2061,6 +2070,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
free_cpumask_var(cs->effective_cpus);
free_cpumask_var(cs->cpus_allowed);
+ free_cpumask_var(cs->cpus_requested);
kfree(cs);
}
@@ -2125,8 +2135,11 @@ int __init cpuset_init(void)
BUG();
if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
BUG();
+ if (!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL))
+ BUG();
cpumask_setall(top_cpuset.cpus_allowed);
+ cpumask_setall(top_cpuset.cpus_requested);
nodes_setall(top_cpuset.mems_allowed);
cpumask_setall(top_cpuset.effective_cpus);
nodes_setall(top_cpuset.effective_mems);
@@ -2260,7 +2273,7 @@ retry:
goto retry;
}
- cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+ cpumask_and(&new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus);
nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index cc892a9e109d..d3c5b15c86c1 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -216,7 +216,7 @@ static char *kdb_read(char *buffer, size_t bufsize)
int i;
int diag, dtab_count;
int key, buf_size, ret;
-
+ static int last_crlf;
diag = kdbgetintenv("DTABCOUNT", &dtab_count);
if (diag)
@@ -237,6 +237,9 @@ poll_again:
return buffer;
if (key != 9)
tab = 0;
+ if (key != 10 && key != 13)
+ last_crlf = 0;
+
switch (key) {
case 8: /* backspace */
if (cp > buffer) {
@@ -254,7 +257,12 @@ poll_again:
*cp = tmp;
}
break;
- case 13: /* enter */
+ case 10: /* new line */
+ case 13: /* carriage return */
+ /* handle \n after \r */
+ if (last_crlf && last_crlf != key)
+ break;
+ last_crlf = key;
*lastchar++ = '\n';
*lastchar++ = '\0';
if (!KDB_STATE(KGDB_TRANS)) {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1af0bbf20984..be42bfeb87ae 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -389,8 +389,13 @@ static struct srcu_struct pmus_srcu;
* 0 - disallow raw tracepoint access for unpriv
* 1 - disallow cpu events for unpriv
* 2 - disallow kernel profiling for unpriv
+ * 3 - disallow all unpriv perf event use
*/
+#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT
+int sysctl_perf_event_paranoid __read_mostly = 3;
+#else
int sysctl_perf_event_paranoid __read_mostly = 2;
+#endif
/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
@@ -9670,6 +9675,9 @@ SYSCALL_DEFINE5(perf_event_open,
if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
+ if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
err = perf_copy_attr(attr_uptr, &attr);
if (err)
return err;
diff --git a/kernel/exit.c b/kernel/exit.c
index 6dd7ff4b337a..61921bc3fd5f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,6 +55,8 @@
#include <linux/shm.h>
#include <linux/kcov.h>
+#include "sched/tune.h"
+
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/pgtable.h>
@@ -775,6 +777,9 @@ void __noreturn do_exit(long code)
}
exit_signals(tsk); /* sets PF_EXITING */
+
+ schedtune_exit_task(tsk);
+
/*
* Ensure that all new tsk->pi_lock acquisitions must observe
* PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
diff --git a/kernel/fork.c b/kernel/fork.c
index e92b06351dec..6c9efef54262 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -77,6 +77,7 @@
#include <linux/compiler.h>
#include <linux/sysctl.h>
#include <linux/kcov.h>
+#include <linux/cpufreq_times.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -342,6 +343,8 @@ void put_task_stack(struct task_struct *tsk)
void free_task(struct task_struct *tsk)
{
+ cpufreq_task_times_exit(tsk);
+
#ifndef CONFIG_THREAD_INFO_IN_TASK
/*
* The task is finally done with both the stack and thread_info,
@@ -1532,6 +1535,8 @@ static __latent_entropy struct task_struct *copy_process(
if (!p)
goto fork_out;
+ cpufreq_task_times_init(p);
+
/*
* This _must_ happen before we call free_task(), i.e. before we jump
* to any of the bad_fork_* labels. This is to avoid freeing
@@ -1984,6 +1989,8 @@ long _do_fork(unsigned long clone_flags,
struct completion vfork;
struct pid *pid;
+ cpufreq_task_times_alloc(p);
+
trace_sched_process_fork(current, p);
pid = get_task_pid(p, PIDTYPE_PID);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index fafd1a3ef0da..a0c3365c884b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -306,6 +306,24 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf);
}
+#ifdef CONFIG_CFI_CLANG
+/*
+ * LLVM appends .cfi to function names when CONFIG_CFI_CLANG is enabled,
+ * which causes confusion and potentially breaks user space tools, so we
+ * will strip the postfix from expanded symbol names.
+ */
+static inline void cleanup_symbol_name(char *s)
+{
+ char *res;
+
+ res = strrchr(s, '.');
+ if (res && !strcmp(res, ".cfi"))
+ *res = '\0';
+}
+#else
+static inline void cleanup_symbol_name(char *s) {}
+#endif
+
/*
* Lookup an address
* - modname is set to NULL if it's in the kernel.
@@ -330,16 +348,23 @@ const char *kallsyms_lookup(unsigned long addr,
namebuf, KSYM_NAME_LEN);
if (modname)
*modname = NULL;
- return namebuf;
+ goto found;
}
/* See if it's in a module. */
- return module_address_lookup(addr, symbolsize, offset, modname,
- namebuf);
+ if (!module_address_lookup(addr, symbolsize, offset, modname,
+ namebuf))
+ return NULL;
+
+found:
+ cleanup_symbol_name(namebuf);
+ return namebuf;
}
int lookup_symbol_name(unsigned long addr, char *symname)
{
+ int res;
+
symname[0] = '\0';
symname[KSYM_NAME_LEN - 1] = '\0';
@@ -350,15 +375,23 @@ int lookup_symbol_name(unsigned long addr, char *symname)
/* Grab name */
kallsyms_expand_symbol(get_symbol_offset(pos),
symname, KSYM_NAME_LEN);
- return 0;
+ goto found;
}
/* See if it's in a module. */
- return lookup_module_symbol_name(addr, symname);
+ res = lookup_module_symbol_name(addr, symname);
+ if (res)
+ return res;
+
+found:
+ cleanup_symbol_name(symname);
+ return 0;
}
int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
unsigned long *offset, char *modname, char *name)
{
+ int res;
+
name[0] = '\0';
name[KSYM_NAME_LEN - 1] = '\0';
@@ -370,10 +403,16 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
kallsyms_expand_symbol(get_symbol_offset(pos),
name, KSYM_NAME_LEN);
modname[0] = '\0';
- return 0;
+ goto found;
}
/* See if it's in a module. */
- return lookup_module_symbol_attrs(addr, size, offset, modname, name);
+ res = lookup_module_symbol_attrs(addr, size, offset, modname, name);
+ if (res)
+ return res;
+
+found:
+ cleanup_symbol_name(name);
+ return 0;
}
/* Look up a kernel symbol and return it in a text buffer. */
diff --git a/kernel/kcov.c b/kernel/kcov.c
index b0ec31493fdc..fad51446590f 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -1,11 +1,16 @@
#define pr_fmt(fmt) "kcov: " fmt
#define DISABLE_BRANCH_PROFILING
+#include <linux/atomic.h>
#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/export.h>
#include <linux/types.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/init.h>
#include <linux/mm.h>
+#include <linux/preempt.h>
#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/slab.h>
@@ -14,6 +19,10 @@
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/kcov.h>
+#include <asm/setup.h>
+
+/* Number of 64-bit words written per one comparison: */
+#define KCOV_WORDS_PER_CMP 4
/*
* kcov descriptor (one per opened debugfs file).
@@ -21,7 +30,12 @@
* - initial state after open()
* - then there must be a single ioctl(KCOV_INIT_TRACE) call
* - then, mmap() call (several calls are allowed but not useful)
- * - then, repeated enable/disable for a task (only one task a time allowed)
+ * - then, ioctl(KCOV_ENABLE, arg), where arg is
+ * KCOV_TRACE_PC - to trace only the PCs
+ * or
+ * KCOV_TRACE_CMP - to trace only the comparison operands
+ * - then, ioctl(KCOV_DISABLE) to disable the task.
+ * Enabling/disabling ioctls can be repeated (only one task a time allowed).
*/
struct kcov {
/*
@@ -41,6 +55,36 @@ struct kcov {
struct task_struct *t;
};
+static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
+{
+ enum kcov_mode mode;
+
+ /*
+ * We are interested in code coverage as a function of a syscall inputs,
+ * so we ignore code executed in interrupts.
+ */
+ if (!in_task())
+ return false;
+ mode = READ_ONCE(t->kcov_mode);
+ /*
+ * There is some code that runs in interrupts but for which
+ * in_interrupt() returns false (e.g. preempt_schedule_irq()).
+ * READ_ONCE()/barrier() effectively provides load-acquire wrt
+ * interrupts, there are paired barrier()/WRITE_ONCE() in
+ * kcov_ioctl_locked().
+ */
+ barrier();
+ return mode == needed_mode;
+}
+
+static unsigned long canonicalize_ip(unsigned long ip)
+{
+#ifdef CONFIG_RANDOMIZE_BASE
+ ip -= kaslr_offset();
+#endif
+ return ip;
+}
+
/*
* Entry point from instrumented code.
* This is called once per basic-block/edge.
@@ -48,46 +92,139 @@ struct kcov {
void notrace __sanitizer_cov_trace_pc(void)
{
struct task_struct *t;
- enum kcov_mode mode;
+ unsigned long *area;
+ unsigned long ip = canonicalize_ip(_RET_IP_);
+ unsigned long pos;
t = current;
- /*
- * We are interested in code coverage as a function of a syscall inputs,
- * so we ignore code executed in interrupts.
- * The checks for whether we are in an interrupt are open-coded, because
- * 1. We can't use in_interrupt() here, since it also returns true
- * when we are inside local_bh_disable() section.
- * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()),
- * since that leads to slower generated code (three separate tests,
- * one for each of the flags).
- */
- if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET
- | NMI_MASK)))
+ if (!check_kcov_mode(KCOV_MODE_TRACE_PC, t))
return;
- mode = READ_ONCE(t->kcov_mode);
- if (mode == KCOV_MODE_TRACE) {
- unsigned long *area;
- unsigned long pos;
- /*
- * There is some code that runs in interrupts but for which
- * in_interrupt() returns false (e.g. preempt_schedule_irq()).
- * READ_ONCE()/barrier() effectively provides load-acquire wrt
- * interrupts, there are paired barrier()/WRITE_ONCE() in
- * kcov_ioctl_locked().
- */
- barrier();
- area = t->kcov_area;
- /* The first word is number of subsequent PCs. */
- pos = READ_ONCE(area[0]) + 1;
- if (likely(pos < t->kcov_size)) {
- area[pos] = _RET_IP_;
- WRITE_ONCE(area[0], pos);
- }
+ area = t->kcov_area;
+ /* The first 64-bit word is the number of subsequent PCs. */
+ pos = READ_ONCE(area[0]) + 1;
+ if (likely(pos < t->kcov_size)) {
+ area[pos] = ip;
+ WRITE_ONCE(area[0], pos);
}
}
EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
+{
+ struct task_struct *t;
+ u64 *area;
+ u64 count, start_index, end_pos, max_pos;
+
+ t = current;
+ if (!check_kcov_mode(KCOV_MODE_TRACE_CMP, t))
+ return;
+
+ ip = canonicalize_ip(ip);
+
+ /*
+ * We write all comparison arguments and types as u64.
+ * The buffer was allocated for t->kcov_size unsigned longs.
+ */
+ area = (u64 *)t->kcov_area;
+ max_pos = t->kcov_size * sizeof(unsigned long);
+
+ count = READ_ONCE(area[0]);
+
+ /* Every record is KCOV_WORDS_PER_CMP 64-bit words. */
+ start_index = 1 + count * KCOV_WORDS_PER_CMP;
+ end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64);
+ if (likely(end_pos <= max_pos)) {
+ area[start_index] = type;
+ area[start_index + 1] = arg1;
+ area[start_index + 2] = arg2;
+ area[start_index + 3] = ip;
+ WRITE_ONCE(area[0], count + 1);
+ }
+}
+
+void notrace __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(0), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp1);
+
+void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(1), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2);
+
+void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4);
+
+void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp8);
+
+void notrace __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(0) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp1);
+
+void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(1) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2);
+
+void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4);
+
+void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8);
+
+void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases)
+{
+ u64 i;
+ u64 count = cases[0];
+ u64 size = cases[1];
+ u64 type = KCOV_CMP_CONST;
+
+ switch (size) {
+ case 8:
+ type |= KCOV_CMP_SIZE(0);
+ break;
+ case 16:
+ type |= KCOV_CMP_SIZE(1);
+ break;
+ case 32:
+ type |= KCOV_CMP_SIZE(2);
+ break;
+ case 64:
+ type |= KCOV_CMP_SIZE(3);
+ break;
+ default:
+ return;
+ }
+ for (i = 0; i < count; i++)
+ write_comp_data(type, cases[i + 2], val, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
+#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */
+
static void kcov_get(struct kcov *kcov)
{
atomic_inc(&kcov->refcount);
@@ -125,6 +262,7 @@ void kcov_task_exit(struct task_struct *t)
/* Just to not leave dangling references behind. */
kcov_task_init(t);
kcov->t = NULL;
+ kcov->mode = KCOV_MODE_INIT;
spin_unlock(&kcov->lock);
kcov_put(kcov);
}
@@ -143,7 +281,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
spin_lock(&kcov->lock);
size = kcov->size * sizeof(unsigned long);
- if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 ||
+ if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
vma->vm_end - vma->vm_start != size) {
res = -EINVAL;
goto exit;
@@ -172,6 +310,7 @@ static int kcov_open(struct inode *inode, struct file *filep)
kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
if (!kcov)
return -ENOMEM;
+ kcov->mode = KCOV_MODE_DISABLED;
atomic_set(&kcov->refcount, 1);
spin_lock_init(&kcov->lock);
filep->private_data = kcov;
@@ -207,7 +346,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
if (size < 2 || size > INT_MAX / sizeof(unsigned long))
return -EINVAL;
kcov->size = size;
- kcov->mode = KCOV_MODE_TRACE;
+ kcov->mode = KCOV_MODE_INIT;
return 0;
case KCOV_ENABLE:
/*
@@ -217,9 +356,19 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
* at task exit or voluntary by KCOV_DISABLE. After that it can
* be enabled for another task.
*/
- unused = arg;
- if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED ||
- kcov->area == NULL)
+ if (kcov->mode != KCOV_MODE_INIT || !kcov->area)
+ return -EINVAL;
+ if (kcov->t != NULL)
+ return -EBUSY;
+ if (arg == KCOV_TRACE_PC)
+ kcov->mode = KCOV_MODE_TRACE_PC;
+ else if (arg == KCOV_TRACE_CMP)
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+ kcov->mode = KCOV_MODE_TRACE_CMP;
+#else
+ return -ENOTSUPP;
+#endif
+ else
return -EINVAL;
t = current;
if (kcov->t != NULL || t->kcov != NULL)
@@ -227,7 +376,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
/* Cache in task struct for performance. */
t->kcov_size = kcov->size;
t->kcov_area = kcov->area;
- /* See comment in __sanitizer_cov_trace_pc(). */
+ /* See comment in check_kcov_mode(). */
barrier();
WRITE_ONCE(t->kcov_mode, kcov->mode);
t->kcov = kcov;
@@ -245,6 +394,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
return -EINVAL;
kcov_task_init(t);
kcov->t = NULL;
+ kcov->mode = KCOV_MODE_INIT;
kcov_put(kcov);
return 0;
default:
@@ -267,6 +417,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
static const struct file_operations kcov_fops = {
.open = kcov_open,
.unlocked_ioctl = kcov_ioctl,
+ .compat_ioctl = kcov_ioctl,
.mmap = kcov_mmap,
.release = kcov_close,
};
diff --git a/kernel/module.c b/kernel/module.c
index 2325c9821f2a..b63d342bd505 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2099,6 +2099,8 @@ void __weak module_arch_freeing_init(struct module *mod)
{
}
+static void cfi_cleanup(struct module *mod);
+
/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
@@ -2140,6 +2142,10 @@ static void free_module(struct module *mod)
/* This may be empty, but that's OK */
disable_ro_nx(&mod->init_layout);
+
+ /* Clean up CFI for the module. */
+ cfi_cleanup(mod);
+
module_arch_freeing_init(mod);
module_memfree(mod->init_layout.base);
kfree(mod->args);
@@ -3321,6 +3327,8 @@ int __weak module_finalize(const Elf_Ehdr *hdr,
return 0;
}
+static void cfi_init(struct module *mod);
+
static int post_relocation(struct module *mod, const struct load_info *info)
{
/* Sort exception table now relocations are done. */
@@ -3333,6 +3341,9 @@ static int post_relocation(struct module *mod, const struct load_info *info)
/* Setup kallsyms-specific fields. */
add_kallsyms(mod, info);
+ /* Setup CFI for the module. */
+ cfi_init(mod);
+
/* Arch-specific module finalizing. */
return module_finalize(info->hdr, info->sechdrs, mod);
}
@@ -4071,6 +4082,22 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
}
#endif /* CONFIG_KALLSYMS */
+static void cfi_init(struct module *mod)
+{
+#ifdef CONFIG_CFI_CLANG
+ mod->cfi_check =
+ (cfi_check_fn)mod_find_symname(mod, CFI_CHECK_FN_NAME);
+ cfi_module_add(mod, module_addr_min, module_addr_max);
+#endif
+}
+
+static void cfi_cleanup(struct module *mod)
+{
+#ifdef CONFIG_CFI_CLANG
+ cfi_module_remove(mod, module_addr_min, module_addr_max);
+#endif
+}
+
static char *module_flags(struct module *mod, char *buf)
{
int bx = 0;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index dd2b5a4d89a5..826b733eaeb9 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,6 +1,7 @@
config SUSPEND
bool "Suspend to RAM and standby"
depends on ARCH_SUSPEND_POSSIBLE
+ select RTC_LIB
default y
---help---
Allow the system to enter sleep states in which main memory is
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index eb4f717705ba..80578f272be4 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -14,3 +14,5 @@ obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
+
+obj-$(CONFIG_SUSPEND) += wakeup_reason.o
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 8ea24ded1dab..9a12c835b6c2 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -18,6 +18,7 @@
#include <linux/workqueue.h>
#include <linux/kmod.h>
#include <trace/events/power.h>
+#include <linux/wakeup_reason.h>
#include <linux/cpuset.h>
/*
@@ -35,6 +36,9 @@ static int try_to_freeze_tasks(bool user_only)
unsigned int elapsed_msecs;
bool wakeup = false;
int sleep_usecs = USEC_PER_MSEC;
+#ifdef CONFIG_PM_SLEEP
+ char suspend_abort[MAX_SUSPEND_ABORT_LEN];
+#endif
start = ktime_get_boottime();
@@ -64,6 +68,11 @@ static int try_to_freeze_tasks(bool user_only)
break;
if (pm_wakeup_pending()) {
+#ifdef CONFIG_PM_SLEEP
+ pm_get_active_wakeup_sources(suspend_abort,
+ MAX_SUSPEND_ABORT_LEN);
+ log_suspend_abort_reason(suspend_abort);
+#endif
wakeup = true;
break;
}
@@ -82,26 +91,27 @@ static int try_to_freeze_tasks(bool user_only)
elapsed = ktime_sub(end, start);
elapsed_msecs = ktime_to_ms(elapsed);
- if (todo) {
+ if (wakeup) {
pr_cont("\n");
- pr_err("Freezing of tasks %s after %d.%03d seconds "
- "(%d tasks refusing to freeze, wq_busy=%d):\n",
- wakeup ? "aborted" : "failed",
+ pr_err("Freezing of tasks aborted after %d.%03d seconds",
+ elapsed_msecs / 1000, elapsed_msecs % 1000);
+ } else if (todo) {
+ pr_cont("\n");
+ pr_err("Freezing of tasks failed after %d.%03d seconds"
+ " (%d tasks refusing to freeze, wq_busy=%d):\n",
elapsed_msecs / 1000, elapsed_msecs % 1000,
todo - wq_busy, wq_busy);
if (wq_busy)
show_workqueue_state();
- if (!wakeup) {
- read_lock(&tasklist_lock);
- for_each_process_thread(g, p) {
- if (p != current && !freezer_should_skip(p)
- && freezing(p) && !frozen(p))
- sched_show_task(p);
- }
- read_unlock(&tasklist_lock);
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p) {
+ if (p != current && !freezer_should_skip(p)
+ && freezing(p) && !frozen(p))
+ sched_show_task(p);
}
+ read_unlock(&tasklist_lock);
} else {
pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
elapsed_msecs % 1000);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6ccb08f57fcb..2d0c99b3f34c 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -26,9 +26,11 @@
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
#include <linux/ftrace.h>
+#include <linux/rtc.h>
#include <trace/events/power.h>
#include <linux/compiler.h>
#include <linux/moduleparam.h>
+#include <linux/wakeup_reason.h>
#include "power.h"
@@ -322,7 +324,8 @@ void __weak arch_suspend_enable_irqs(void)
*/
static int suspend_enter(suspend_state_t state, bool *wakeup)
{
- int error;
+ char suspend_abort[MAX_SUSPEND_ABORT_LEN];
+ int error, last_dev;
error = platform_suspend_prepare(state);
if (error)
@@ -330,7 +333,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = dpm_suspend_late(PMSG_SUSPEND);
if (error) {
+ last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+ last_dev %= REC_FAILED_NUM;
pr_err("PM: late suspend of devices failed\n");
+ log_suspend_abort_reason("%s device failed to power down",
+ suspend_stats.failed_devs[last_dev]);
goto Platform_finish;
}
error = platform_suspend_prepare_late(state);
@@ -339,7 +346,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = dpm_suspend_noirq(PMSG_SUSPEND);
if (error) {
+ last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+ last_dev %= REC_FAILED_NUM;
pr_err("PM: noirq suspend of devices failed\n");
+ log_suspend_abort_reason("noirq suspend of %s device failed",
+ suspend_stats.failed_devs[last_dev]);
goto Platform_early_resume;
}
error = platform_suspend_prepare_noirq(state);
@@ -363,8 +374,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
}
error = disable_nonboot_cpus();
- if (error || suspend_test(TEST_CPUS))
+ if (error || suspend_test(TEST_CPUS)) {
+ log_suspend_abort_reason("Disabling non-boot cpus failed");
goto Enable_cpus;
+ }
arch_suspend_disable_irqs();
BUG_ON(!irqs_disabled());
@@ -380,6 +393,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
state, false);
events_check_enabled = false;
} else if (*wakeup) {
+ pm_get_active_wakeup_sources(suspend_abort,
+ MAX_SUSPEND_ABORT_LEN);
+ log_suspend_abort_reason(suspend_abort);
error = -EBUSY;
}
syscore_resume();
@@ -427,6 +443,7 @@ int suspend_devices_and_enter(suspend_state_t state)
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
pr_err("PM: Some devices failed to suspend, or early wake event detected\n");
+ log_suspend_abort_reason("Some devices failed to suspend, or early wake event detected");
goto Recover_platform;
}
suspend_test_finish("suspend devices");
@@ -527,6 +544,18 @@ static int enter_state(suspend_state_t state)
return error;
}
+static void pm_suspend_marker(char *annotation)
+{
+ struct timespec ts;
+ struct rtc_time tm;
+
+ getnstimeofday(&ts);
+ rtc_time_to_tm(ts.tv_sec, &tm);
+ pr_info("PM: suspend %s %d-%02d-%02d %02d:%02d:%02d.%09lu UTC\n",
+ annotation, tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+ tm.tm_hour, tm.tm_min, tm.tm_sec, ts.tv_nsec);
+}
+
/**
* pm_suspend - Externally visible function for suspending the system.
* @state: System sleep state to enter.
@@ -541,6 +570,7 @@ int pm_suspend(suspend_state_t state)
if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
return -EINVAL;
+ pm_suspend_marker("entry");
error = enter_state(state);
if (error) {
suspend_stats.fail++;
@@ -548,6 +578,7 @@ int pm_suspend(suspend_state_t state)
} else {
suspend_stats.success++;
}
+ pm_suspend_marker("exit");
return error;
}
EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c
new file mode 100644
index 000000000000..252611fad2fe
--- /dev/null
+++ b/kernel/power/wakeup_reason.c
@@ -0,0 +1,225 @@
+/*
+ * kernel/power/wakeup_reason.c
+ *
+ * Logs the reasons which caused the kernel to resume from
+ * the suspend mode.
+ *
+ * Copyright (C) 2014 Google, Inc.
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/wakeup_reason.h>
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+
+
+#define MAX_WAKEUP_REASON_IRQS 32
+static int irq_list[MAX_WAKEUP_REASON_IRQS];
+static int irqcount;
+static bool suspend_abort;
+static char abort_reason[MAX_SUSPEND_ABORT_LEN];
+static struct kobject *wakeup_reason;
+static DEFINE_SPINLOCK(resume_reason_lock);
+
+static ktime_t last_monotime; /* monotonic time before last suspend */
+static ktime_t curr_monotime; /* monotonic time after last suspend */
+static ktime_t last_stime; /* monotonic boottime offset before last suspend */
+static ktime_t curr_stime; /* monotonic boottime offset after last suspend */
+
+static ssize_t last_resume_reason_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ int irq_no, buf_offset = 0;
+ struct irq_desc *desc;
+ spin_lock(&resume_reason_lock);
+ if (suspend_abort) {
+ buf_offset = sprintf(buf, "Abort: %s", abort_reason);
+ } else {
+ for (irq_no = 0; irq_no < irqcount; irq_no++) {
+ desc = irq_to_desc(irq_list[irq_no]);
+ if (desc && desc->action && desc->action->name)
+ buf_offset += sprintf(buf + buf_offset, "%d %s\n",
+ irq_list[irq_no], desc->action->name);
+ else
+ buf_offset += sprintf(buf + buf_offset, "%d\n",
+ irq_list[irq_no]);
+ }
+ }
+ spin_unlock(&resume_reason_lock);
+ return buf_offset;
+}
+
+static ssize_t last_suspend_time_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct timespec sleep_time;
+ struct timespec total_time;
+ struct timespec suspend_resume_time;
+
+ /*
+ * total_time is calculated from monotonic bootoffsets because
+ * unlike CLOCK_MONOTONIC it include the time spent in suspend state.
+ */
+ total_time = ktime_to_timespec(ktime_sub(curr_stime, last_stime));
+
+ /*
+ * suspend_resume_time is calculated as monotonic (CLOCK_MONOTONIC)
+ * time interval before entering suspend and post suspend.
+ */
+ suspend_resume_time = ktime_to_timespec(ktime_sub(curr_monotime, last_monotime));
+
+ /* sleep_time = total_time - suspend_resume_time */
+ sleep_time = timespec_sub(total_time, suspend_resume_time);
+
+ /* Export suspend_resume_time and sleep_time in pair here. */
+ return sprintf(buf, "%lu.%09lu %lu.%09lu\n",
+ suspend_resume_time.tv_sec, suspend_resume_time.tv_nsec,
+ sleep_time.tv_sec, sleep_time.tv_nsec);
+}
+
+static struct kobj_attribute resume_reason = __ATTR_RO(last_resume_reason);
+static struct kobj_attribute suspend_time = __ATTR_RO(last_suspend_time);
+
+static struct attribute *attrs[] = {
+ &resume_reason.attr,
+ &suspend_time.attr,
+ NULL,
+};
+static struct attribute_group attr_group = {
+ .attrs = attrs,
+};
+
+/*
+ * logs all the wake up reasons to the kernel
+ * stores the irqs to expose them to the userspace via sysfs
+ */
+void log_wakeup_reason(int irq)
+{
+ struct irq_desc *desc;
+ desc = irq_to_desc(irq);
+ if (desc && desc->action && desc->action->name)
+ printk(KERN_INFO "Resume caused by IRQ %d, %s\n", irq,
+ desc->action->name);
+ else
+ printk(KERN_INFO "Resume caused by IRQ %d\n", irq);
+
+ spin_lock(&resume_reason_lock);
+ if (irqcount == MAX_WAKEUP_REASON_IRQS) {
+ spin_unlock(&resume_reason_lock);
+ printk(KERN_WARNING "Resume caused by more than %d IRQs\n",
+ MAX_WAKEUP_REASON_IRQS);
+ return;
+ }
+
+ irq_list[irqcount++] = irq;
+ spin_unlock(&resume_reason_lock);
+}
+
+int check_wakeup_reason(int irq)
+{
+ int irq_no;
+ int ret = false;
+
+ spin_lock(&resume_reason_lock);
+ for (irq_no = 0; irq_no < irqcount; irq_no++)
+ if (irq_list[irq_no] == irq) {
+ ret = true;
+ break;
+ }
+ spin_unlock(&resume_reason_lock);
+ return ret;
+}
+
+void log_suspend_abort_reason(const char *fmt, ...)
+{
+ va_list args;
+
+ spin_lock(&resume_reason_lock);
+
+ //Suspend abort reason has already been logged.
+ if (suspend_abort) {
+ spin_unlock(&resume_reason_lock);
+ return;
+ }
+
+ suspend_abort = true;
+ va_start(args, fmt);
+ vsnprintf(abort_reason, MAX_SUSPEND_ABORT_LEN, fmt, args);
+ va_end(args);
+ spin_unlock(&resume_reason_lock);
+}
+
+/* Detects a suspend and clears all the previous wake up reasons*/
+static int wakeup_reason_pm_event(struct notifier_block *notifier,
+ unsigned long pm_event, void *unused)
+{
+ switch (pm_event) {
+ case PM_SUSPEND_PREPARE:
+ spin_lock(&resume_reason_lock);
+ irqcount = 0;
+ suspend_abort = false;
+ spin_unlock(&resume_reason_lock);
+ /* monotonic time since boot */
+ last_monotime = ktime_get();
+ /* monotonic time since boot including the time spent in suspend */
+ last_stime = ktime_get_boottime();
+ break;
+ case PM_POST_SUSPEND:
+ /* monotonic time since boot */
+ curr_monotime = ktime_get();
+ /* monotonic time since boot including the time spent in suspend */
+ curr_stime = ktime_get_boottime();
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block wakeup_reason_pm_notifier_block = {
+ .notifier_call = wakeup_reason_pm_event,
+};
+
+/* Initializes the sysfs parameter
+ * registers the pm_event notifier
+ */
+int __init wakeup_reason_init(void)
+{
+ int retval;
+
+ retval = register_pm_notifier(&wakeup_reason_pm_notifier_block);
+ if (retval)
+ printk(KERN_WARNING "[%s] failed to register PM notifier %d\n",
+ __func__, retval);
+
+ wakeup_reason = kobject_create_and_add("wakeup_reasons", kernel_kobj);
+ if (!wakeup_reason) {
+ printk(KERN_WARNING "[%s] failed to create a sysfs kobject\n",
+ __func__);
+ return 1;
+ }
+ retval = sysfs_create_group(wakeup_reason, &attr_group);
+ if (retval) {
+ kobject_put(wakeup_reason);
+ printk(KERN_WARNING "[%s] failed to create a sysfs group %d\n",
+ __func__, retval);
+ }
+ return 0;
+}
+
+late_initcall(wakeup_reason_init);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 27adaaab96ba..b6e193da6ea4 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -56,6 +56,10 @@
#include "braille.h"
#include "internal.h"
+#ifdef CONFIG_EARLY_PRINTK_DIRECT
+extern void printascii(char *);
+#endif
+
int console_printk[4] = {
CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */
@@ -1875,6 +1879,10 @@ asmlinkage int vprintk_emit(int facility, int level,
}
}
+#ifdef CONFIG_EARLY_PRINTK_DIRECT
+ printascii(text);
+#endif
+
if (level == LOGLEVEL_DEFAULT)
level = default_message_loglevel;
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 5e59b832ae2b..c9dbe935bd24 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -18,10 +18,12 @@ endif
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
obj-y += wait.o swait.o completion.o idle.o
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o
+obj-$(CONFIG_SCHED_WALT) += walt.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
+obj-$(CONFIG_SCHED_TUNE) += tune.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6b3fff6a6437..97d9b5e62b0e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -75,6 +75,7 @@
#include <linux/compiler.h>
#include <linux/frame.h>
#include <linux/prefetch.h>
+#include <linux/cpufreq_times.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -90,6 +91,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
+#include "walt.h"
DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -156,6 +158,18 @@ int sysctl_sched_rt_runtime = 950000;
/* cpus with isolated domains */
cpumask_var_t cpu_isolated_map;
+struct rq *
+lock_rq_of(struct task_struct *p, struct rq_flags *flags)
+{
+ return task_rq_lock(p, flags);
+}
+
+void
+unlock_rq_of(struct rq *rq, struct task_struct *p, struct rq_flags *flags)
+{
+ task_rq_unlock(rq, p, flags);
+}
+
/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
@@ -998,7 +1012,9 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
p->on_rq = TASK_ON_RQ_MIGRATING;
dequeue_task(rq, p, 0);
+ double_lock_balance(rq, cpu_rq(new_cpu));
set_task_cpu(p, new_cpu);
+ double_unlock_balance(rq, cpu_rq(new_cpu));
raw_spin_unlock(&rq->lock);
rq = cpu_rq(new_cpu);
@@ -1254,6 +1270,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->sched_class->migrate_task_rq(p);
p->se.nr_migrations++;
perf_event_task_migrate(p);
+
+ walt_fixup_busy_time(p, new_cpu);
}
__set_task_cpu(p, new_cpu);
@@ -1269,7 +1287,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, cpu);
+ p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(dst_rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(dst_rq, p, 0);
@@ -2010,6 +2030,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
unsigned long flags;
int cpu, success = 0;
+#ifdef CONFIG_SMP
+ struct rq *rq;
+ u64 wallclock;
+#endif
/*
* If we are going to wake up a thread waiting for CONDITION we
@@ -2083,14 +2107,24 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_cond_load_acquire(&p->on_cpu, !VAL);
+ rq = cpu_rq(task_cpu(p));
+
+ raw_spin_lock(&rq->lock);
+ wallclock = walt_ktime_clock();
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+ raw_spin_unlock(&rq->lock);
+
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
+
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
}
+
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
@@ -2140,8 +2174,13 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
trace_sched_waking(p);
- if (!task_on_rq_queued(p))
+ if (!task_on_rq_queued(p)) {
+ u64 wallclock = walt_ktime_clock();
+
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ }
ttwu_do_wakeup(rq, p, 0, cookie);
ttwu_stat(p, smp_processor_id(), 0);
@@ -2206,7 +2245,12 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+#ifdef CONFIG_SCHED_WALT
+ p->last_sleep_ts = 0;
+#endif
+
INIT_LIST_HEAD(&p->se.group_node);
+ walt_init_new_task_load(p);
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
@@ -2566,6 +2610,9 @@ void wake_up_new_task(struct task_struct *p)
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+
+ walt_init_new_task_load(p);
+
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
/*
@@ -2579,9 +2626,12 @@ void wake_up_new_task(struct task_struct *p)
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
+ update_rq_clock(rq);
post_init_entity_util_avg(&p->se);
- activate_task(rq, p, 0);
+ walt_mark_task_starting(p);
+
+ activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
@@ -2966,6 +3016,36 @@ unsigned long nr_iowait_cpu(int cpu)
return atomic_read(&this->nr_iowait);
}
+#ifdef CONFIG_CPU_QUIET
+u64 nr_running_integral(unsigned int cpu)
+{
+ unsigned int seqcnt;
+ u64 integral;
+ struct rq *q;
+
+ if (cpu >= nr_cpu_ids)
+ return 0;
+
+ q = cpu_rq(cpu);
+
+ /*
+ * Update average to avoid reading stalled value if there were
+ * no run-queue changes for a long time. On the other hand if
+ * the changes are happening right now, just read current value
+ * directly.
+ */
+
+ seqcnt = read_seqcount_begin(&q->ave_seqcnt);
+ integral = do_nr_running_integral(q);
+ if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) {
+ read_seqcount_begin(&q->ave_seqcnt);
+ integral = q->nr_running_integral;
+ }
+
+ return integral;
+}
+#endif
+
void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
{
struct rq *rq = this_rq();
@@ -3083,6 +3163,9 @@ void scheduler_tick(void)
sched_clock_tick();
raw_spin_lock(&rq->lock);
+ walt_set_window_start(rq);
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
+ walt_ktime_clock(), 0);
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
cpu_load_update_active(rq);
@@ -3096,6 +3179,9 @@ void scheduler_tick(void)
trigger_load_balance(rq);
#endif
rq_last_tick_reset(rq);
+
+ if (curr->sched_class == &fair_sched_class)
+ check_for_migration(rq, curr);
}
#ifdef CONFIG_NO_HZ_FULL
@@ -3339,6 +3425,7 @@ static void __sched notrace __schedule(bool preempt)
struct pin_cookie cookie;
struct rq *rq;
int cpu;
+ u64 wallclock;
cpu = smp_processor_id();
rq = cpu_rq(cpu);
@@ -3391,11 +3478,18 @@ static void __sched notrace __schedule(bool preempt)
update_rq_clock(rq);
next = pick_next_task(rq, prev, cookie);
+ wallclock = walt_ktime_clock();
+ walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
+ walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->clock_skip_update = 0;
if (likely(prev != next)) {
+#ifdef CONFIG_SCHED_WALT
+ if (!prev->on_rq)
+ prev->last_sleep_ts = wallclock;
+#endif
rq->nr_switches++;
rq->curr = next;
++*switch_count;
@@ -3652,6 +3746,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
BUG_ON(prio > MAX_PRIO);
rq = __task_rq_lock(p, &rf);
+ update_rq_clock(rq);
/*
* Idle task boosting is a nono in general. There is one
@@ -3748,6 +3843,8 @@ void set_user_nice(struct task_struct *p, long nice)
* the task might be in the middle of scheduling on another CPU.
*/
rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
@@ -4181,6 +4278,7 @@ recheck:
* runqueue lock must be held.
*/
rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
/*
* Changing the policy of the stop threads its a very bad idea
@@ -5666,9 +5764,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
if (!(sd->flags & SD_LOAD_BALANCE)) {
printk("does not load-balance\n");
- if (sd->parent)
- printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
- " has parent");
return -1;
}
@@ -5710,7 +5805,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
printk(KERN_CONT " %*pbl",
cpumask_pr_args(sched_group_cpus(group)));
if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
- printk(KERN_CONT " (cpu_capacity = %d)",
+ printk(KERN_CONT " (cpu_capacity = %lu)",
group->sgc->capacity);
}
@@ -5763,8 +5858,12 @@ static inline bool sched_debug(void)
static int sd_degenerate(struct sched_domain *sd)
{
- if (cpumask_weight(sched_domain_span(sd)) == 1)
- return 1;
+ if (cpumask_weight(sched_domain_span(sd)) == 1) {
+ if (sd->groups->sge)
+ sd->flags &= ~SD_LOAD_BALANCE;
+ else
+ return 1;
+ }
/* Following flags need at least 2 groups */
if (sd->flags & (SD_LOAD_BALANCE |
@@ -5774,7 +5873,8 @@ static int sd_degenerate(struct sched_domain *sd)
SD_SHARE_CPUCAPACITY |
SD_ASYM_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
- SD_SHARE_POWERDOMAIN)) {
+ SD_SHARE_POWERDOMAIN |
+ SD_SHARE_CAP_STATES)) {
if (sd->groups != sd->groups->next)
return 0;
}
@@ -5807,7 +5907,12 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_PREFER_SIBLING |
- SD_SHARE_POWERDOMAIN);
+ SD_SHARE_POWERDOMAIN |
+ SD_SHARE_CAP_STATES);
+ if (parent->groups->sge) {
+ parent->flags &= ~SD_LOAD_BALANCE;
+ return 0;
+ }
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
@@ -5905,6 +6010,11 @@ static int init_rootdomain(struct root_domain *rd)
if (cpupri_init(&rd->cpupri) != 0)
goto free_rto_mask;
+
+ init_max_cpu_capacity(&rd->max_cpu_capacity);
+
+ rd->max_cap_orig_cpu = rd->min_cap_orig_cpu = -1;
+
return 0;
free_rto_mask:
@@ -6016,11 +6126,14 @@ DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+DEFINE_PER_CPU(struct sched_domain *, sd_ea);
+DEFINE_PER_CPU(struct sched_domain *, sd_scs);
static void update_top_cache_domain(int cpu)
{
struct sched_domain_shared *sds = NULL;
struct sched_domain *sd;
+ struct sched_domain *ea_sd = NULL;
int id = cpu;
int size = 1;
@@ -6041,6 +6154,17 @@ static void update_top_cache_domain(int cpu)
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
+
+ for_each_domain(cpu, sd) {
+ if (sd->groups->sge)
+ ea_sd = sd;
+ else
+ break;
+ }
+ rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
+
+ sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
+ rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
}
/*
@@ -6222,6 +6346,8 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
* die on a /0 trap.
*/
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+ sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
/*
* Make sure the first group of this domain contains the
@@ -6350,6 +6476,66 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
}
/*
+ * Check that the per-cpu provided sd energy data is consistent for all cpus
+ * within the mask.
+ */
+static inline void check_sched_energy_data(int cpu, sched_domain_energy_f fn,
+ const struct cpumask *cpumask)
+{
+ const struct sched_group_energy * const sge = fn(cpu);
+ struct cpumask mask;
+ int i;
+
+ if (cpumask_weight(cpumask) <= 1)
+ return;
+
+ cpumask_xor(&mask, cpumask, get_cpu_mask(cpu));
+
+ for_each_cpu(i, &mask) {
+ const struct sched_group_energy * const e = fn(i);
+ int y;
+
+ BUG_ON(e->nr_idle_states != sge->nr_idle_states);
+
+ for (y = 0; y < (e->nr_idle_states); y++) {
+ BUG_ON(e->idle_states[y].power !=
+ sge->idle_states[y].power);
+ }
+
+ BUG_ON(e->nr_cap_states != sge->nr_cap_states);
+
+ for (y = 0; y < (e->nr_cap_states); y++) {
+ BUG_ON(e->cap_states[y].cap != sge->cap_states[y].cap);
+ BUG_ON(e->cap_states[y].power !=
+ sge->cap_states[y].power);
+ }
+ }
+}
+
+static void init_sched_energy(int cpu, struct sched_domain *sd,
+ sched_domain_energy_f fn)
+{
+ if (!(fn && fn(cpu)))
+ return;
+
+ if (cpu != group_balance_cpu(sd->groups))
+ return;
+
+ if (sd->child && !sd->child->groups->sge) {
+ pr_err("BUG: EAS setup broken for CPU%d\n", cpu);
+#ifdef CONFIG_SCHED_DEBUG
+ pr_err(" energy data on %s but not on %s domain\n",
+ sd->name, sd->child->name);
+#endif
+ return;
+ }
+
+ check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups));
+
+ sd->groups->sge = fn(cpu);
+}
+
+/*
* Initializers for schedule domains
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
@@ -6465,6 +6651,7 @@ static int sched_domains_curr_level;
* SD_NUMA - describes NUMA topologies
* SD_SHARE_POWERDOMAIN - describes shared power domain
* SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
+ * SD_SHARE_CAP_STATES - describes shared capacity states
*
* Odd one out, which beside describing the topology has a quirk also
* prescribes the desired behaviour that goes along with it:
@@ -6477,7 +6664,8 @@ static int sched_domains_curr_level;
SD_NUMA | \
SD_ASYM_PACKING | \
SD_ASYM_CPUCAPACITY | \
- SD_SHARE_POWERDOMAIN)
+ SD_SHARE_POWERDOMAIN | \
+ SD_SHARE_CAP_STATES)
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
@@ -7035,7 +7223,6 @@ static int build_sched_domains(const struct cpumask *cpu_map,
enum s_alloc alloc_state;
struct sched_domain *sd;
struct s_data d;
- struct rq *rq = NULL;
int i, ret = -ENOMEM;
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -7053,8 +7240,6 @@ static int build_sched_domains(const struct cpumask *cpu_map,
*per_cpu_ptr(d.sd, i) = sd;
if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
sd->flags |= SD_OVERLAP;
- if (cpumask_equal(cpu_map, sched_domain_span(sd)))
- break;
}
}
@@ -7074,10 +7259,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
/* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ struct sched_domain_topology_level *tl = sched_domain_topology;
+
if (!cpumask_test_cpu(i, cpu_map))
continue;
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) {
+ init_sched_energy(i, sd, tl->energy);
claim_allocations(i, sd);
init_sched_groups_capacity(i, sd);
}
@@ -7086,22 +7274,23 @@ static int build_sched_domains(const struct cpumask *cpu_map,
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
- rq = cpu_rq(i);
- sd = *per_cpu_ptr(d.sd, i);
+ int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
+ int min_cpu = READ_ONCE(d.rd->min_cap_orig_cpu);
+
+ if ((max_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig >
+ cpu_rq(max_cpu)->cpu_capacity_orig))
+ WRITE_ONCE(d.rd->max_cap_orig_cpu, i);
- /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
- if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
- WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+ if ((min_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig <
+ cpu_rq(min_cpu)->cpu_capacity_orig))
+ WRITE_ONCE(d.rd->min_cap_orig_cpu, i);
+
+ sd = *per_cpu_ptr(d.sd, i);
cpu_attach_domain(sd, d.rd, i);
}
rcu_read_unlock();
- if (rq && sched_debug_enabled) {
- pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
- cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
- }
-
ret = 0;
error:
__free_domain_allocs(&d, alloc_state, cpu_map);
@@ -7444,6 +7633,9 @@ int sched_cpu_dying(unsigned int cpu)
/* Handle pending wakeups and then migrate everything off */
sched_ttwu_pending();
raw_spin_lock_irqsave(&rq->lock, flags);
+
+ walt_migrate_sync_cpu(cpu);
+
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
@@ -7620,6 +7812,7 @@ void __init sched_init(void)
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
/*
* How much cpu bandwidth does root_task_group get?
*
@@ -7659,11 +7852,17 @@ void __init sched_init(void)
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
+ rq->push_task = NULL;
rq->cpu = i;
rq->online = 0;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+#ifdef CONFIG_SCHED_WALT
+ rq->cur_irqload = 0;
+ rq->avg_irqload = 0;
+ rq->irqload_ts = 0;
+#endif
INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -7721,6 +7920,14 @@ static inline int preempt_count_equals(int preempt_offset)
return (nested == preempt_offset);
}
+static int __might_sleep_init_called;
+int __init __might_sleep_init(void)
+{
+ __might_sleep_init_called = 1;
+ return 0;
+}
+early_initcall(__might_sleep_init);
+
void __might_sleep(const char *file, int line, int preempt_offset)
{
/*
@@ -7746,8 +7953,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
- !is_idle_task(current)) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
+ !is_idle_task(current)) || oops_in_progress)
+ return;
+ if (system_state != SYSTEM_RUNNING &&
+ (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
@@ -8440,6 +8649,7 @@ static void cpu_cgroup_fork(struct task_struct *task)
rq = task_rq_lock(task, &rf);
+ update_rq_clock(rq);
sched_change_group(task, TASK_SET_GROUP);
task_rq_unlock(rq, task, &rf);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index cb771c76682e..0526dc0365db 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -12,14 +12,27 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/cpufreq.h>
+#include <linux/kthread.h>
#include <linux/slab.h>
#include <trace/events/power.h>
#include "sched.h"
+#include "tune.h"
+
+unsigned long boosted_cpu_util(int cpu);
+
+/* Stub out fast switch routines present on mainline to reduce the backport
+ * overhead. */
+#define cpufreq_driver_fast_switch(x, y) 0
+#define cpufreq_enable_fast_switch(x)
+#define cpufreq_disable_fast_switch(x)
+#define LATENCY_MULTIPLIER (1000)
+#define SUGOV_KTHREAD_PRIORITY 50
struct sugov_tunables {
struct gov_attr_set attr_set;
- unsigned int rate_limit_us;
+ unsigned int up_rate_limit_us;
+ unsigned int down_rate_limit_us;
};
struct sugov_policy {
@@ -30,14 +43,18 @@ struct sugov_policy {
raw_spinlock_t update_lock; /* For shared policies */
u64 last_freq_update_time;
- s64 freq_update_delay_ns;
+ s64 min_rate_limit_ns;
+ s64 up_rate_delay_ns;
+ s64 down_rate_delay_ns;
unsigned int next_freq;
unsigned int cached_raw_freq;
/* The next fields are only needed if fast switch cannot be used. */
struct irq_work irq_work;
- struct work_struct work;
+ struct kthread_work work;
struct mutex work_lock;
+ struct kthread_worker worker;
+ struct task_struct *thread;
bool work_in_progress;
bool need_freq_update;
@@ -55,6 +72,11 @@ struct sugov_cpu {
unsigned long util;
unsigned long max;
unsigned int flags;
+
+ /* The field below is for single-CPU policies only. */
+#ifdef CONFIG_NO_HZ_COMMON
+ unsigned long saved_idle_calls;
+#endif
};
static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
@@ -79,7 +101,27 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
}
delta_ns = time - sg_policy->last_freq_update_time;
- return delta_ns >= sg_policy->freq_update_delay_ns;
+
+ /* No need to recalculate next freq for min_rate_limit_us at least */
+ return delta_ns >= sg_policy->min_rate_limit_ns;
+}
+
+static bool sugov_up_down_rate_limit(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
+{
+ s64 delta_ns;
+
+ delta_ns = time - sg_policy->last_freq_update_time;
+
+ if (next_freq > sg_policy->next_freq &&
+ delta_ns < sg_policy->up_rate_delay_ns)
+ return true;
+
+ if (next_freq < sg_policy->next_freq &&
+ delta_ns < sg_policy->down_rate_delay_ns)
+ return true;
+
+ return false;
}
static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
@@ -87,22 +129,26 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
{
struct cpufreq_policy *policy = sg_policy->policy;
+ if (sugov_up_down_rate_limit(sg_policy, time, next_freq)) {
+ /* Reset cached freq as next_freq isn't changed */
+ sg_policy->cached_raw_freq = 0;
+ return;
+ }
+
+ if (sg_policy->next_freq == next_freq)
+ return;
+
+ sg_policy->next_freq = next_freq;
sg_policy->last_freq_update_time = time;
if (policy->fast_switch_enabled) {
- if (sg_policy->next_freq == next_freq) {
- trace_cpu_frequency(policy->cur, smp_processor_id());
- return;
- }
- sg_policy->next_freq = next_freq;
next_freq = cpufreq_driver_fast_switch(policy, next_freq);
if (next_freq == CPUFREQ_ENTRY_INVALID)
return;
policy->cur = next_freq;
trace_cpu_frequency(next_freq, smp_processor_id());
- } else if (sg_policy->next_freq != next_freq) {
- sg_policy->next_freq = next_freq;
+ } else {
sg_policy->work_in_progress = true;
irq_work_queue(&sg_policy->irq_work);
}
@@ -110,7 +156,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
/**
* get_next_freq - Compute a new frequency for a given cpufreq policy.
- * @sg_cpu: schedutil cpu object to compute the new frequency for.
+ * @sg_policy: schedutil policy object to compute the new frequency for.
* @util: Current CPU utilization.
* @max: CPU capacity.
*
@@ -130,10 +176,9 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
* next_freq (as calculated above) is returned, subject to policy min/max and
* cpufreq driver limitations.
*/
-static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
- unsigned long max)
+static unsigned int get_next_freq(struct sugov_policy *sg_policy,
+ unsigned long util, unsigned long max)
{
- struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
unsigned int freq = arch_scale_freq_invariant() ?
policy->cpuinfo.max_freq : policy->cur;
@@ -146,15 +191,36 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
return cpufreq_driver_resolve_freq(policy, freq);
}
-static void sugov_get_util(unsigned long *util, unsigned long *max)
+static inline bool use_pelt(void)
{
- struct rq *rq = this_rq();
- unsigned long cfs_max;
+#ifdef CONFIG_SCHED_WALT
+ return (!sysctl_sched_use_walt_cpu_util || walt_disabled);
+#else
+ return true;
+#endif
+}
+
+static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time)
+{
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long max_cap, rt;
+ s64 delta;
+
+ max_cap = arch_scale_cpu_capacity(NULL, cpu);
- cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
+ sched_avg_update(rq);
+ delta = time - rq->age_stamp;
+ if (unlikely(delta < 0))
+ delta = 0;
+ rt = div64_u64(rq->rt_avg, sched_avg_period() + delta);
+ rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT;
- *util = min(rq->cfs.avg.util_avg, cfs_max);
- *max = cfs_max;
+ *util = boosted_cpu_util(cpu);
+ if (likely(use_pelt()))
+ *util = min((*util + rt), max_cap);
+
+ *max = max_cap;
}
static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
@@ -187,6 +253,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
sg_cpu->iowait_boost >>= 1;
}
+#ifdef CONFIG_NO_HZ_COMMON
+static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
+{
+ unsigned long idle_calls = tick_nohz_get_idle_calls();
+ bool ret = idle_calls == sg_cpu->saved_idle_calls;
+
+ sg_cpu->saved_idle_calls = idle_calls;
+ return ret;
+}
+#else
+static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
+#endif /* CONFIG_NO_HZ_COMMON */
+
static void sugov_update_single(struct update_util_data *hook, u64 time,
unsigned int flags)
{
@@ -195,6 +274,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
struct cpufreq_policy *policy = sg_policy->policy;
unsigned long util, max;
unsigned int next_f;
+ bool busy;
sugov_set_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
@@ -202,40 +282,40 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
if (!sugov_should_update_freq(sg_policy, time))
return;
- if (flags & SCHED_CPUFREQ_RT_DL) {
+ busy = sugov_cpu_is_busy(sg_cpu);
+
+ if (flags & SCHED_CPUFREQ_DL) {
next_f = policy->cpuinfo.max_freq;
} else {
- sugov_get_util(&util, &max);
+ sugov_get_util(&util, &max, time);
sugov_iowait_boost(sg_cpu, &util, &max);
- next_f = get_next_freq(sg_cpu, util, max);
+ next_f = get_next_freq(sg_policy, util, max);
+ /*
+ * Do not reduce the frequency if the CPU has not been idle
+ * recently, as the reduction is likely to be premature then.
+ */
+ if (busy && next_f < sg_policy->next_freq) {
+ next_f = sg_policy->next_freq;
+
+ /* Reset cached freq as next_freq has changed */
+ sg_policy->cached_raw_freq = 0;
+ }
}
sugov_update_commit(sg_policy, time, next_f);
}
-static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
- unsigned long util, unsigned long max,
- unsigned int flags)
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
- unsigned int max_f = policy->cpuinfo.max_freq;
- u64 last_freq_update_time = sg_policy->last_freq_update_time;
+ unsigned long util = 0, max = 1;
unsigned int j;
- if (flags & SCHED_CPUFREQ_RT_DL)
- return max_f;
-
- sugov_iowait_boost(sg_cpu, &util, &max);
-
for_each_cpu(j, policy->cpus) {
- struct sugov_cpu *j_sg_cpu;
+ struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
unsigned long j_util, j_max;
s64 delta_ns;
- if (j == smp_processor_id())
- continue;
-
- j_sg_cpu = &per_cpu(sugov_cpu, j);
/*
* If the CPU utilization was last updated before the previous
* frequency update and the time elapsed between the last update
@@ -243,13 +323,13 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
* enough, don't take the CPU into account as it probably is
* idle now (and clear iowait_boost for it).
*/
- delta_ns = last_freq_update_time - j_sg_cpu->last_update;
+ delta_ns = time - j_sg_cpu->last_update;
if (delta_ns > TICK_NSEC) {
j_sg_cpu->iowait_boost = 0;
continue;
}
- if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
- return max_f;
+ if (j_sg_cpu->flags & SCHED_CPUFREQ_DL)
+ return policy->cpuinfo.max_freq;
j_util = j_sg_cpu->util;
j_max = j_sg_cpu->max;
@@ -261,7 +341,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
sugov_iowait_boost(j_sg_cpu, &util, &max);
}
- return get_next_freq(sg_cpu, util, max);
+ return get_next_freq(sg_policy, util, max);
}
static void sugov_update_shared(struct update_util_data *hook, u64 time,
@@ -272,7 +352,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
unsigned long util, max;
unsigned int next_f;
- sugov_get_util(&util, &max);
+ sugov_get_util(&util, &max, time);
raw_spin_lock(&sg_policy->update_lock);
@@ -284,14 +364,18 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
sg_cpu->last_update = time;
if (sugov_should_update_freq(sg_policy, time)) {
- next_f = sugov_next_freq_shared(sg_cpu, util, max, flags);
+ if (flags & SCHED_CPUFREQ_DL)
+ next_f = sg_policy->policy->cpuinfo.max_freq;
+ else
+ next_f = sugov_next_freq_shared(sg_cpu, time);
+
sugov_update_commit(sg_policy, time, next_f);
}
raw_spin_unlock(&sg_policy->update_lock);
}
-static void sugov_work(struct work_struct *work)
+static void sugov_work(struct kthread_work *work)
{
struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
@@ -308,7 +392,21 @@ static void sugov_irq_work(struct irq_work *irq_work)
struct sugov_policy *sg_policy;
sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
- schedule_work_on(smp_processor_id(), &sg_policy->work);
+
+ /*
+ * For RT and deadline tasks, the schedutil governor shoots the
+ * frequency to maximum. Special care must be taken to ensure that this
+ * kthread doesn't result in the same behavior.
+ *
+ * This is (mostly) guaranteed by the work_in_progress flag. The flag is
+ * updated only at the end of the sugov_work() function and before that
+ * the schedutil governor rejects all other frequency scaling requests.
+ *
+ * There is a very rare case though, where the RT thread yields right
+ * after the work_in_progress flag is cleared. The effects of that are
+ * neglected for now.
+ */
+ kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}
/************************** sysfs interface ************************/
@@ -321,15 +419,32 @@ static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr
return container_of(attr_set, struct sugov_tunables, attr_set);
}
-static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+static DEFINE_MUTEX(min_rate_lock);
+
+static void update_min_rate_limit_us(struct sugov_policy *sg_policy)
+{
+ mutex_lock(&min_rate_lock);
+ sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns,
+ sg_policy->down_rate_delay_ns);
+ mutex_unlock(&min_rate_lock);
+}
+
+static ssize_t up_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+
+ return sprintf(buf, "%u\n", tunables->up_rate_limit_us);
+}
+
+static ssize_t down_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
{
struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
- return sprintf(buf, "%u\n", tunables->rate_limit_us);
+ return sprintf(buf, "%u\n", tunables->down_rate_limit_us);
}
-static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf,
- size_t count)
+static ssize_t up_rate_limit_us_store(struct gov_attr_set *attr_set,
+ const char *buf, size_t count)
{
struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
struct sugov_policy *sg_policy;
@@ -338,18 +453,42 @@ static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *bu
if (kstrtouint(buf, 10, &rate_limit_us))
return -EINVAL;
- tunables->rate_limit_us = rate_limit_us;
+ tunables->up_rate_limit_us = rate_limit_us;
- list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
- sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;
+ list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
+ sg_policy->up_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
+ update_min_rate_limit_us(sg_policy);
+ }
return count;
}
-static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);
+static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set,
+ const char *buf, size_t count)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+ struct sugov_policy *sg_policy;
+ unsigned int rate_limit_us;
+
+ if (kstrtouint(buf, 10, &rate_limit_us))
+ return -EINVAL;
+
+ tunables->down_rate_limit_us = rate_limit_us;
+
+ list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
+ sg_policy->down_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
+ update_min_rate_limit_us(sg_policy);
+ }
+
+ return count;
+}
+
+static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us);
+static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us);
static struct attribute *sugov_attributes[] = {
- &rate_limit_us.attr,
+ &up_rate_limit_us.attr,
+ &down_rate_limit_us.attr,
NULL
};
@@ -371,19 +510,64 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
return NULL;
sg_policy->policy = policy;
- init_irq_work(&sg_policy->irq_work, sugov_irq_work);
- INIT_WORK(&sg_policy->work, sugov_work);
- mutex_init(&sg_policy->work_lock);
raw_spin_lock_init(&sg_policy->update_lock);
return sg_policy;
}
static void sugov_policy_free(struct sugov_policy *sg_policy)
{
- mutex_destroy(&sg_policy->work_lock);
kfree(sg_policy);
}
+static int sugov_kthread_create(struct sugov_policy *sg_policy)
+{
+ struct task_struct *thread;
+ struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+ struct cpufreq_policy *policy = sg_policy->policy;
+ int ret;
+
+ /* kthread only required for slow path */
+ if (policy->fast_switch_enabled)
+ return 0;
+
+ kthread_init_work(&sg_policy->work, sugov_work);
+ kthread_init_worker(&sg_policy->worker);
+ thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
+ "sugov:%d",
+ cpumask_first(policy->related_cpus));
+ if (IS_ERR(thread)) {
+ pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
+ return PTR_ERR(thread);
+ }
+
+ ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
+ if (ret) {
+ kthread_stop(thread);
+ pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+ return ret;
+ }
+
+ sg_policy->thread = thread;
+ kthread_bind_mask(thread, policy->related_cpus);
+ init_irq_work(&sg_policy->irq_work, sugov_irq_work);
+ mutex_init(&sg_policy->work_lock);
+
+ wake_up_process(thread);
+
+ return 0;
+}
+
+static void sugov_kthread_stop(struct sugov_policy *sg_policy)
+{
+ /* kthread only required for slow path */
+ if (sg_policy->policy->fast_switch_enabled)
+ return;
+
+ kthread_flush_worker(&sg_policy->worker);
+ kthread_stop(sg_policy->thread);
+ mutex_destroy(&sg_policy->work_lock);
+}
+
static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
struct sugov_tunables *tunables;
@@ -409,23 +593,30 @@ static int sugov_init(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy;
struct sugov_tunables *tunables;
- unsigned int lat;
int ret = 0;
/* State should be equivalent to EXIT */
if (policy->governor_data)
return -EBUSY;
+ cpufreq_enable_fast_switch(policy);
+
sg_policy = sugov_policy_alloc(policy);
- if (!sg_policy)
- return -ENOMEM;
+ if (!sg_policy) {
+ ret = -ENOMEM;
+ goto disable_fast_switch;
+ }
+
+ ret = sugov_kthread_create(sg_policy);
+ if (ret)
+ goto free_sg_policy;
mutex_lock(&global_tunables_lock);
if (global_tunables) {
if (WARN_ON(have_governor_per_policy())) {
ret = -EINVAL;
- goto free_sg_policy;
+ goto stop_kthread;
}
policy->governor_data = sg_policy;
sg_policy->tunables = global_tunables;
@@ -437,13 +628,23 @@ static int sugov_init(struct cpufreq_policy *policy)
tunables = sugov_tunables_alloc(sg_policy);
if (!tunables) {
ret = -ENOMEM;
- goto free_sg_policy;
+ goto stop_kthread;
}
- tunables->rate_limit_us = LATENCY_MULTIPLIER;
- lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
- if (lat)
- tunables->rate_limit_us *= lat;
+ if (policy->up_transition_delay_us && policy->down_transition_delay_us) {
+ tunables->up_rate_limit_us = policy->up_transition_delay_us;
+ tunables->down_rate_limit_us = policy->down_transition_delay_us;
+ } else {
+ unsigned int lat;
+
+ tunables->up_rate_limit_us = LATENCY_MULTIPLIER;
+ tunables->down_rate_limit_us = LATENCY_MULTIPLIER;
+ lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
+ if (lat) {
+ tunables->up_rate_limit_us *= lat;
+ tunables->down_rate_limit_us *= lat;
+ }
+ }
policy->governor_data = sg_policy;
sg_policy->tunables = tunables;
@@ -454,20 +655,25 @@ static int sugov_init(struct cpufreq_policy *policy)
if (ret)
goto fail;
- out:
+out:
mutex_unlock(&global_tunables_lock);
-
- cpufreq_enable_fast_switch(policy);
return 0;
- fail:
+fail:
policy->governor_data = NULL;
sugov_tunables_free(tunables);
+ stop_kthread:
+ sugov_kthread_stop(sg_policy);
+
free_sg_policy:
mutex_unlock(&global_tunables_lock);
sugov_policy_free(sg_policy);
+
+disable_fast_switch:
+ cpufreq_disable_fast_switch(policy);
+
pr_err("initialization failed (error %d)\n", ret);
return ret;
}
@@ -478,8 +684,6 @@ static void sugov_exit(struct cpufreq_policy *policy)
struct sugov_tunables *tunables = sg_policy->tunables;
unsigned int count;
- cpufreq_disable_fast_switch(policy);
-
mutex_lock(&global_tunables_lock);
count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
@@ -489,7 +693,10 @@ static void sugov_exit(struct cpufreq_policy *policy)
mutex_unlock(&global_tunables_lock);
+ sugov_kthread_stop(sg_policy);
sugov_policy_free(sg_policy);
+
+ cpufreq_disable_fast_switch(policy);
}
static int sugov_start(struct cpufreq_policy *policy)
@@ -497,7 +704,11 @@ static int sugov_start(struct cpufreq_policy *policy)
struct sugov_policy *sg_policy = policy->governor_data;
unsigned int cpu;
- sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
+ sg_policy->up_rate_delay_ns =
+ sg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC;
+ sg_policy->down_rate_delay_ns =
+ sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC;
+ update_min_rate_limit_us(sg_policy);
sg_policy->last_freq_update_time = 0;
sg_policy->next_freq = UINT_MAX;
sg_policy->work_in_progress = false;
@@ -509,7 +720,7 @@ static int sugov_start(struct cpufreq_policy *policy)
memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->sg_policy = sg_policy;
- sg_cpu->flags = SCHED_CPUFREQ_RT;
+ sg_cpu->flags = SCHED_CPUFREQ_DL;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
policy_is_shared(policy) ?
@@ -529,8 +740,10 @@ static void sugov_stop(struct cpufreq_policy *policy)
synchronize_sched();
- irq_work_sync(&sg_policy->irq_work);
- cancel_work_sync(&sg_policy->work);
+ if (!policy->fast_switch_enabled) {
+ irq_work_sync(&sg_policy->irq_work);
+ kthread_cancel_work_sync(&sg_policy->work);
+ }
}
static void sugov_limits(struct cpufreq_policy *policy)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 448d6426fa5f..1a15aac3d48c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -4,11 +4,12 @@
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
#include <linux/context_tracking.h>
+#include <linux/cpufreq_times.h>
#include "sched.h"
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
-
+#include "walt.h"
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -58,11 +59,18 @@ void irqtime_account_irq(struct task_struct *curr)
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
s64 delta;
int cpu;
+#ifdef CONFIG_SCHED_WALT
+ u64 wallclock;
+ bool account = true;
+#endif
if (!sched_clock_irqtime)
return;
cpu = smp_processor_id();
+#ifdef CONFIG_SCHED_WALT
+ wallclock = sched_clock_cpu(cpu);
+#endif
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
@@ -76,6 +84,13 @@ void irqtime_account_irq(struct task_struct *curr)
irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
+#ifdef CONFIG_SCHED_WALT
+ else
+ account = false;
+
+ if (account)
+ walt_account_irqtime(cpu, curr, delta, wallclock);
+#endif
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
@@ -139,6 +154,9 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
/* Account for user time used */
acct_account_cputime(p);
+
+ /* Account power usage for user time */
+ cpufreq_acct_update_power(p, cputime);
}
/*
@@ -189,6 +207,9 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
/* Account for system time used */
acct_account_cputime(p);
+
+ /* Account power usage for system time */
+ cpufreq_acct_update_power(p, cputime);
}
/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3042927c8b8a..d46449014e88 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -18,6 +18,8 @@
#include <linux/slab.h>
+#include "walt.h"
+
struct dl_bandwidth def_dl_bandwidth;
static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
@@ -947,6 +949,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_prio(prio));
dl_rq->dl_nr_running++;
add_nr_running(rq_of_dl_rq(dl_rq), 1);
+ walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
inc_dl_deadline(dl_rq, deadline);
inc_dl_migration(dl_se, dl_rq);
@@ -961,6 +964,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+ walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
dec_dl_deadline(dl_rq, dl_se->deadline);
dec_dl_migration(dl_se, dl_rq);
@@ -1628,7 +1632,9 @@ retry:
}
deactivate_task(rq, next_task, 0);
+ next_task->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(next_task, later_rq->cpu);
+ next_task->on_rq = TASK_ON_RQ_QUEUED;
activate_task(later_rq, next_task, 0);
ret = 1;
@@ -1716,7 +1722,9 @@ static void pull_dl_task(struct rq *this_rq)
resched = true;
deactivate_task(src_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, this_cpu);
+ p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(this_rq, p, 0);
dmin = p->dl.deadline;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index fa178b62ea79..f621b3a59e49 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -261,9 +261,60 @@ set_table_entry(struct ctl_table *entry,
}
static struct ctl_table *
+sd_alloc_ctl_energy_table(struct sched_group_energy *sge)
+{
+ struct ctl_table *table = sd_alloc_ctl_entry(5);
+
+ if (table == NULL)
+ return NULL;
+
+ set_table_entry(&table[0], "nr_idle_states", &sge->nr_idle_states,
+ sizeof(int), 0444, proc_dointvec_minmax, false);
+ set_table_entry(&table[1], "idle_states", &sge->idle_states[0].power,
+ sge->nr_idle_states*sizeof(struct idle_state), 0444,
+ proc_doulongvec_minmax, false);
+ set_table_entry(&table[2], "nr_cap_states", &sge->nr_cap_states,
+ sizeof(int), 0444, proc_dointvec_minmax, false);
+ set_table_entry(&table[3], "cap_states", &sge->cap_states[0].cap,
+ sge->nr_cap_states*sizeof(struct capacity_state), 0444,
+ proc_doulongvec_minmax, false);
+
+ return table;
+}
+
+static struct ctl_table *
+sd_alloc_ctl_group_table(struct sched_group *sg)
+{
+ struct ctl_table *table = sd_alloc_ctl_entry(2);
+
+ if (table == NULL)
+ return NULL;
+
+ table->procname = kstrdup("energy", GFP_KERNEL);
+ table->mode = 0555;
+ table->child = sd_alloc_ctl_energy_table((struct sched_group_energy *)sg->sge);
+
+ return table;
+}
+
+static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
- struct ctl_table *table = sd_alloc_ctl_entry(14);
+ struct ctl_table *table;
+ unsigned int nr_entries = 14;
+
+ int i = 0;
+ struct sched_group *sg = sd->groups;
+
+ if (sg->sge) {
+ int nr_sgs = 0;
+
+ do {} while (nr_sgs++, sg = sg->next, sg != sd->groups);
+
+ nr_entries += nr_sgs;
+ }
+
+ table = sd_alloc_ctl_entry(nr_entries);
if (table == NULL)
return NULL;
@@ -296,7 +347,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[12], "name", sd->name,
CORENAME_MAX_SIZE, 0444, proc_dostring, false);
- /* &table[13] is terminator */
+ sg = sd->groups;
+ if (sg->sge) {
+ char buf[32];
+ struct ctl_table *entry = &table[13];
+
+ do {
+ snprintf(buf, 32, "group%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_group_table(sg);
+ } while (entry++, i++, sg = sg->next, sg != sd->groups);
+ }
+ /* &table[nr_entries-1] is terminator */
return table;
}
@@ -918,7 +981,33 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
-
+ /* eas */
+ /* select_idle_sibling() */
+ P_SCHEDSTAT(se.statistics.nr_wakeups_sis_attempts);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_sis_idle);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_sis_cache_affine);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_sis_suff_cap);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_sis_idle_cpu);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_sis_count);
+ /* select_energy_cpu_brute() */
+ P_SCHEDSTAT(se.statistics.nr_wakeups_secb_attempts);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_secb_sync);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_secb_idle_bt);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_secb_insuff_cap);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_secb_no_nrg_sav);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_secb_nrg_sav);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_secb_count);
+ /* find_best_target() */
+ P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_attempts);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_no_cpu);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_no_sd);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_pref_idle);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_count);
+ /* cas */
+ /* select_task_rq_fair() */
+ P_SCHEDSTAT(se.statistics.nr_wakeups_cas_attempts);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_cas_count);
+
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
avg_atom = div64_ul(avg_atom, nr_switches);
diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c
new file mode 100644
index 000000000000..b0656b7a93e3
--- /dev/null
+++ b/kernel/sched/energy.c
@@ -0,0 +1,124 @@
+/*
+ * Obtain energy cost data from DT and populate relevant scheduler data
+ * structures.
+ *
+ * Copyright (C) 2015 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#define pr_fmt(fmt) "sched-energy: " fmt
+
+#define DEBUG
+
+#include <linux/gfp.h>
+#include <linux/of.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/sched_energy.h>
+#include <linux/stddef.h>
+
+struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
+
+static void free_resources(void)
+{
+ int cpu, sd_level;
+ struct sched_group_energy *sge;
+
+ for_each_possible_cpu(cpu) {
+ for_each_possible_sd_level(sd_level) {
+ sge = sge_array[cpu][sd_level];
+ if (sge) {
+ kfree(sge->cap_states);
+ kfree(sge->idle_states);
+ kfree(sge);
+ }
+ }
+ }
+}
+
+void init_sched_energy_costs(void)
+{
+ struct device_node *cn, *cp;
+ struct capacity_state *cap_states;
+ struct idle_state *idle_states;
+ struct sched_group_energy *sge;
+ const struct property *prop;
+ int sd_level, i, nstates, cpu;
+ const __be32 *val;
+
+ for_each_possible_cpu(cpu) {
+ cn = of_get_cpu_node(cpu, NULL);
+ if (!cn) {
+ pr_warn("CPU device node missing for CPU %d\n", cpu);
+ return;
+ }
+
+ if (!of_find_property(cn, "sched-energy-costs", NULL)) {
+ pr_warn("CPU device node has no sched-energy-costs\n");
+ return;
+ }
+
+ for_each_possible_sd_level(sd_level) {
+ cp = of_parse_phandle(cn, "sched-energy-costs", sd_level);
+ if (!cp)
+ break;
+
+ prop = of_find_property(cp, "busy-cost-data", NULL);
+ if (!prop || !prop->value) {
+ pr_warn("No busy-cost data, skipping sched_energy init\n");
+ goto out;
+ }
+
+ sge = kcalloc(1, sizeof(struct sched_group_energy),
+ GFP_NOWAIT);
+
+ nstates = (prop->length / sizeof(u32)) / 2;
+ cap_states = kcalloc(nstates,
+ sizeof(struct capacity_state),
+ GFP_NOWAIT);
+
+ for (i = 0, val = prop->value; i < nstates; i++) {
+ cap_states[i].cap = be32_to_cpup(val++);
+ cap_states[i].power = be32_to_cpup(val++);
+ }
+
+ sge->nr_cap_states = nstates;
+ sge->cap_states = cap_states;
+
+ prop = of_find_property(cp, "idle-cost-data", NULL);
+ if (!prop || !prop->value) {
+ pr_warn("No idle-cost data, skipping sched_energy init\n");
+ goto out;
+ }
+
+ nstates = (prop->length / sizeof(u32));
+ idle_states = kcalloc(nstates,
+ sizeof(struct idle_state),
+ GFP_NOWAIT);
+
+ for (i = 0, val = prop->value; i < nstates; i++)
+ idle_states[i].power = be32_to_cpup(val++);
+
+ sge->nr_idle_states = nstates;
+ sge->idle_states = idle_states;
+
+ sge_array[cpu][sd_level] = sge;
+ }
+ }
+
+ pr_info("Sched-energy-costs installed from DT\n");
+ return;
+
+out:
+ free_resources();
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0c91d72f3e8f..ca354c305989 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -30,10 +30,13 @@
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
+#include <linux/module.h>
#include <trace/events/sched.h>
#include "sched.h"
+#include "tune.h"
+#include "walt.h"
/*
* Targeted preemption latency for CPU-bound tasks:
@@ -50,6 +53,15 @@
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+unsigned int sysctl_sched_sync_hint_enable = 1;
+unsigned int sysctl_sched_cstate_aware = 1;
+
+#ifdef CONFIG_SCHED_WALT
+unsigned int sysctl_sched_use_walt_cpu_util = 1;
+unsigned int sysctl_sched_use_walt_task_util = 1;
+__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
+ (10 * NSEC_PER_MSEC);
+#endif
/*
* The initial- and re-scaling of tunables is configurable
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@@ -116,7 +128,7 @@ unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
/*
* The margin used when comparing utilization with CPU capacity:
- * util * 1024 < capacity * margin
+ * util * margin < capacity * 1024
*/
unsigned int capacity_margin = 1280; /* ~20% */
@@ -290,19 +302,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
/*
* Ensure we either appear before our parent (if already
* enqueued) or force our parent to appear after us when it is
- * enqueued. The fact that we always enqueue bottom-up
- * reduces this to two cases.
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases and a special case for the root
+ * cfs_rq. Furthermore, it also means that we will always reset
+ * tmp_alone_branch either when the branch is connected
+ * to a tree or when we reach the beg of the tree
*/
if (cfs_rq->tg->parent &&
- cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
- } else {
+ cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
+ /*
+ * If parent is already on the list, we add the child
+ * just before. Thanks to circular linked property of
+ * the list, this means to put the child at the tail
+ * of the list that starts by parent.
+ */
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
+ &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+ /*
+ * The branch is now connected to its tree so we can
+ * reset tmp_alone_branch to the beginning of the
+ * list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ } else if (!cfs_rq->tg->parent) {
+ /*
+ * cfs rq without parent should be put
+ * at the tail of the list.
+ */
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq->leaf_cfs_rq_list);
+ /*
+ * We have reach the beg of a tree so we can reset
+ * tmp_alone_branch to the beginning of the list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ } else {
+ /*
+ * The parent has not already been added so we want to
+ * make sure that it will be put after us.
+ * tmp_alone_branch points to the beg of the branch
+ * where we will add parent.
+ */
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+ rq->tmp_alone_branch);
+ /*
+ * update tmp_alone_branch to points to the new beg
+ * of the branch
+ */
+ rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
}
cfs_rq->on_list = 1;
@@ -699,6 +751,7 @@ void init_entity_runnable_average(struct sched_entity *se)
if (entity_is_task(se))
sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
+
/*
* At this point, util_avg won't be used in select_task_rq_fair anyway
*/
@@ -708,9 +761,7 @@ void init_entity_runnable_average(struct sched_entity *se)
}
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+static void attach_entity_cfs_rq(struct sched_entity *se);
/*
* With new tasks being created, their initial util_avgs are extrapolated
@@ -742,7 +793,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
- u64 now = cfs_rq_clock_task(cfs_rq);
if (cap > 0) {
if (cfs_rq->avg.util_avg != 0) {
@@ -770,14 +820,12 @@ void post_init_entity_util_avg(struct sched_entity *se)
* such that the next switched_to_fair() has the
* expected state.
*/
- se->avg.last_update_time = now;
+ se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
return;
}
}
- update_cfs_rq_load_avg(now, cfs_rq, false);
- attach_entity_load_avg(cfs_rq, se);
- update_tg_load_avg(cfs_rq, false);
+ attach_entity_cfs_rq(se);
}
#else /* !CONFIG_SMP */
@@ -937,6 +985,7 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
trace_sched_stat_blocked(tsk, delta);
+ trace_sched_blocked_reason(tsk);
/*
* Blocking time is in units of nanosecs, so shift by
@@ -2646,16 +2695,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
+static void update_cfs_shares(struct sched_entity *se)
{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
struct task_group *tg;
- struct sched_entity *se;
long shares;
- tg = cfs_rq->tg;
- se = tg->se[cpu_of(rq_of(cfs_rq))];
- if (!se || throttled_hierarchy(cfs_rq))
+ if (!cfs_rq)
return;
+
+ if (throttled_hierarchy(cfs_rq))
+ return;
+
+ tg = cfs_rq->tg;
+
#ifndef CONFIG_SMP
if (likely(se->load.weight == tg->shares))
return;
@@ -2664,8 +2717,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
reweight_entity(cfs_rq_of(se), se, shares);
}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+static inline void update_cfs_shares(struct sched_entity *se)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -2816,6 +2870,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
scale_freq = arch_scale_freq_capacity(NULL, cpu);
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+ trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
/* delta_w is the amount already accumulated against our next period */
delta_w = sa->period_contrib;
@@ -2891,6 +2946,26 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
return decayed;
}
+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(_val) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ \
+ res = var + val; \
+ \
+ if (val < 0 && res > var) \
+ res = 0; \
+ \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/**
* update_tg_load_avg - update the tg's load avg
@@ -2970,8 +3045,168 @@ void set_task_rq_fair(struct sched_entity *se,
se->avg.last_update_time = n_last_update_time;
}
}
+
+/* Take into account change of utilization of a child task group */
+static inline void
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's utilization */
+ se->avg.util_avg = gcfs_rq->avg.util_avg;
+ se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq utilization */
+ add_positive(&cfs_rq->avg.util_avg, delta);
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+}
+
+/* Take into account change of load of a child task group */
+static inline void
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long delta, load = gcfs_rq->avg.load_avg;
+
+ /*
+ * If the load of group cfs_rq is null, the load of the
+ * sched_entity will also be null so we can skip the formula
+ */
+ if (load) {
+ long tg_load;
+
+ /* Get tg's load and ensure tg_load > 0 */
+ tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+
+ /* Ensure tg_load >= load and updated with current load*/
+ tg_load -= gcfs_rq->tg_load_avg_contrib;
+ tg_load += load;
+
+ /*
+ * We need to compute a correction term in the case that the
+ * task group is consuming more CPU than a task of equal
+ * weight. A task with a weight equals to tg->shares will have
+ * a load less or equal to scale_load_down(tg->shares).
+ * Similarly, the sched_entities that represent the task group
+ * at parent level, can't have a load higher than
+ * scale_load_down(tg->shares). And the Sum of sched_entities'
+ * load must be <= scale_load_down(tg->shares).
+ */
+ if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
+ /* scale gcfs_rq's load into tg's shares*/
+ load *= scale_load_down(gcfs_rq->tg->shares);
+ load /= tg_load;
+ }
+ }
+
+ delta = load - se->avg.load_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's load */
+ se->avg.load_avg = load;
+ se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq load */
+ add_positive(&cfs_rq->avg.load_avg, delta);
+ cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+
+ /*
+ * If the sched_entity is already enqueued, we also have to update the
+ * runnable load avg.
+ */
+ if (se->on_rq) {
+ /* Update parent cfs_rq runnable_load_avg */
+ add_positive(&cfs_rq->runnable_load_avg, delta);
+ cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+ }
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->propagate_avg = 1;
+}
+
+static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
+
+ if (!cfs_rq->propagate_avg)
+ return 0;
+
+ cfs_rq->propagate_avg = 0;
+ return 1;
+}
+
+/* Update task and its cfs_rq load average */
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+
+ if (entity_is_task(se))
+ return 0;
+
+ if (!test_and_clear_tg_cfs_propagate(se))
+ return 0;
+
+ cfs_rq = cfs_rq_of(se);
+
+ set_tg_cfs_propagate(cfs_rq);
+
+ update_tg_cfs_util(cfs_rq, se);
+ update_tg_cfs_load(cfs_rq, se);
+
+ return 1;
+}
+
+/*
+ * Check if we need to update the load and the utilization of a blocked
+ * group_entity:
+ */
+static inline bool skip_blocked_update(struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+
+ /*
+ * If sched_entity still have not zero load or utilization, we have to
+ * decay it:
+ */
+ if (se->avg.load_avg || se->avg.util_avg)
+ return false;
+
+ /*
+ * If there is a pending propagation, we have to update the load and
+ * the utilization of the sched_entity:
+ */
+ if (gcfs_rq->propagate_avg)
+ return false;
+
+ /*
+ * Otherwise, the load and the utilization of the sched_entity is
+ * already zero and there is no pending propagation, so it will be a
+ * waste of time to try to decay it:
+ */
+ return true;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
+
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ return 0;
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
@@ -3042,6 +3277,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
sub_positive(&sa->load_avg, r);
sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
removed_load = 1;
+ set_tg_cfs_propagate(cfs_rq);
}
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
@@ -3049,6 +3285,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
removed_util = 1;
+ set_tg_cfs_propagate(cfs_rq);
}
decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -3062,27 +3299,51 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
if (update_freq && (decayed || removed_util))
cfs_rq_util_change(cfs_rq);
+ /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
+ if (cfs_rq == &rq_of(cfs_rq)->cfs)
+ trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
+
return decayed || removed_load;
}
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG 0x1
+#define SKIP_AGE_LOAD 0x2
+
/* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
+static inline void update_load_avg(struct sched_entity *se, int flags)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
struct rq *rq = rq_of(cfs_rq);
int cpu = cpu_of(rq);
+ int decayed;
+ void *ptr = NULL;
/*
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration
*/
- __update_load_avg(now, cpu, &se->avg,
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
+ __update_load_avg(now, cpu, &se->avg,
se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);
+ }
- if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
+ decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
+ decayed |= propagate_entity_load_avg(se);
+
+ if (decayed && (flags & UPDATE_TG))
update_tg_load_avg(cfs_rq, 0);
+
+ if (entity_is_task(se)) {
+#ifdef CONFIG_SCHED_WALT
+ ptr = (void *)&(task_of(se)->ravg);
+#endif
+ trace_sched_load_avg_task(task_of(se), &se->avg, ptr);
+ }
}
/**
@@ -3095,31 +3356,12 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
*/
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (!sched_feat(ATTACH_AGE_LOAD))
- goto skip_aging;
-
- /*
- * If we got migrated (either between CPUs or between cgroups) we'll
- * have aged the average right before clearing @last_update_time.
- *
- * Or we're fresh through post_init_entity_util_avg().
- */
- if (se->avg.last_update_time) {
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, 0, 0, NULL);
-
- /*
- * XXX: we could have just aged the entire load away if we've been
- * absent from the fair class for too long.
- */
- }
-
-skip_aging:
se->avg.last_update_time = cfs_rq->avg.last_update_time;
cfs_rq->avg.load_avg += se->avg.load_avg;
cfs_rq->avg.load_sum += se->avg.load_sum;
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
+ set_tg_cfs_propagate(cfs_rq);
cfs_rq_util_change(cfs_rq);
}
@@ -3134,14 +3376,12 @@ skip_aging:
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ set_tg_cfs_propagate(cfs_rq);
cfs_rq_util_change(cfs_rq);
}
@@ -3151,34 +3391,20 @@ static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct sched_avg *sa = &se->avg;
- u64 now = cfs_rq_clock_task(cfs_rq);
- int migrated, decayed;
-
- migrated = !sa->last_update_time;
- if (!migrated) {
- __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
- }
-
- decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
cfs_rq->runnable_load_avg += sa->load_avg;
cfs_rq->runnable_load_sum += sa->load_sum;
- if (migrated)
+ if (!sa->last_update_time) {
attach_entity_load_avg(cfs_rq, se);
-
- if (decayed || migrated)
update_tg_load_avg(cfs_rq, 0);
+ }
}
/* Remove the runnable load generated by se from cfs_rq's runnable load average */
static inline void
dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- update_load_avg(se, 1);
-
cfs_rq->runnable_load_avg =
max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
cfs_rq->runnable_load_sum =
@@ -3207,13 +3433,25 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
#endif
/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+void sync_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
+
+ last_update_time = cfs_rq_last_update_time(cfs_rq);
+ __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+}
+
+/*
* Task first catches up with cfs_rq, and then subtract
* itself from the cfs_rq (task must be off the queue now).
*/
void remove_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 last_update_time;
/*
* tasks cannot exit without having gone through wake_up_new_task() ->
@@ -3225,9 +3463,7 @@ void remove_entity_load_avg(struct sched_entity *se)
* calls this.
*/
- last_update_time = cfs_rq_last_update_time(cfs_rq);
-
- __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ sync_entity_load_avg(se);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}
@@ -3252,7 +3488,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
return 0;
}
-static inline void update_load_avg(struct sched_entity *se, int not_used)
+#define UPDATE_TG 0x0
+#define SKIP_AGE_LOAD 0x0
+
+static inline void update_load_avg(struct sched_entity *se, int not_used1)
{
cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
}
@@ -3397,9 +3636,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (renorm && !curr)
se->vruntime += cfs_rq->min_vruntime;
+ /*
+ * When enqueuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Add its load to cfs_rq->runnable_avg
+ * - For group_entity, update its weight to reflect the new share of
+ * its group cfs_rq
+ * - Add its new weight to cfs_rq->load.weight
+ */
+ update_load_avg(se, UPDATE_TG);
enqueue_entity_load_avg(cfs_rq, se);
+ update_cfs_shares(se);
account_entity_enqueue(cfs_rq, se);
- update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP)
place_entity(cfs_rq, se, 0);
@@ -3471,6 +3719,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+
+ /*
+ * When dequeuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Substract its load from the cfs_rq->runnable_avg.
+ * - Substract its previous weight from cfs_rq->load.weight.
+ * - For group entity, update its weight to reflect the new share
+ * of its group cfs_rq.
+ */
+ update_load_avg(se, UPDATE_TG);
dequeue_entity_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se, flags);
@@ -3494,7 +3752,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(se);
/*
* Now advance min_vruntime if @se was the entity holding it back,
@@ -3558,7 +3816,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
- update_load_avg(se, 1);
+ update_load_avg(se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, se);
@@ -3676,8 +3934,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
- update_load_avg(curr, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(curr, UPDATE_TG);
+ update_cfs_shares(curr);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -4544,6 +4802,14 @@ static inline void hrtick_update(struct rq *rq)
}
#endif
+#ifdef CONFIG_SMP
+static bool __cpu_overutilized(int cpu, int delta);
+static bool cpu_overutilized(int cpu);
+unsigned long boosted_cpu_util(int cpu);
+#else
+#define boosted_cpu_util(cpu) cpu_util_freq(cpu)
+#endif
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -4554,6 +4820,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+#ifdef CONFIG_SMP
+ int task_new = flags & ENQUEUE_WAKEUP_NEW;
+#endif
/*
* If in_iowait is set, the code below may not trigger any cpufreq
@@ -4578,6 +4847,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
+ walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
flags = ENQUEUE_WAKEUP;
}
@@ -4585,17 +4855,49 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running++;
+ walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
}
if (!se)
add_nr_running(rq, 1);
+#ifdef CONFIG_SMP
+
+ /*
+ * Update SchedTune accounting.
+ *
+ * We do it before updating the CPU capacity to ensure the
+ * boost value of the current task is accounted for in the
+ * selection of the OPP.
+ *
+ * We do it also in the case where we enqueue a throttled task;
+ * we could argue that a throttled task should not boost a CPU,
+ * however:
+ * a) properly implementing CPU boosting considering throttled
+ * tasks will increase a lot the complexity of the solution
+ * b) it's not easy to quantify the benefits introduced by
+ * such a more complex solution.
+ * Thus, for the time being we go for the simple solution and boost
+ * also for throttled RQs.
+ */
+ schedtune_enqueue_task(p, cpu_of(rq));
+
+ if (!se) {
+ walt_inc_cumulative_runnable_avg(rq, p);
+ if (!task_new && !rq->rd->overutilized &&
+ cpu_overutilized(rq->cpu)) {
+ rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
+ }
+
+#endif /* CONFIG_SMP */
hrtick_update(rq);
}
@@ -4625,6 +4927,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running--;
+ walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -4644,17 +4947,33 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;
+ walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
}
if (!se)
sub_nr_running(rq, 1);
+#ifdef CONFIG_SMP
+
+ /*
+ * Update SchedTune accounting
+ *
+ * We do it before updating the CPU capacity to ensure the
+ * boost value of the current task is accounted for in the
+ * selection of the OPP.
+ */
+ schedtune_dequeue_task(p, cpu_of(rq));
+
+ if (!se)
+ walt_dec_cumulative_runnable_avg(rq, p);
+#endif /* CONFIG_SMP */
+
hrtick_update(rq);
}
@@ -4961,15 +5280,6 @@ static unsigned long target_load(int cpu, int type)
return max(rq->cpu_load[type-1], total);
}
-static unsigned long capacity_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity;
-}
-
-static unsigned long capacity_orig_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity_orig;
-}
static unsigned long cpu_avg_load_per_task(int cpu)
{
@@ -5121,6 +5431,532 @@ static void record_wakee(struct task_struct *p)
}
/*
+ * Returns the current capacity of cpu after applying both
+ * cpu and freq scaling.
+ */
+unsigned long capacity_curr_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_orig *
+ arch_scale_freq_capacity(NULL, cpu)
+ >> SCHED_CAPACITY_SHIFT;
+}
+
+/*
+ * Returns the current capacity of cpu after applying both
+ * cpu and min freq scaling.
+ */
+unsigned long capacity_min_of(int cpu)
+{
+ if (!sched_feat(MIN_CAPACITY_CAPPING))
+ return 0;
+ return arch_scale_cpu_capacity(NULL, cpu) *
+ arch_scale_min_freq_capacity(NULL, cpu)
+ >> SCHED_CAPACITY_SHIFT;
+}
+
+
+static inline bool energy_aware(void)
+{
+ return sched_feat(ENERGY_AWARE);
+}
+
+/*
+ * CPU candidates.
+ *
+ * These are labels to reference CPU candidates for an energy_diff.
+ * Currently we support only two possible candidates: the task's previous CPU
+ * and another candiate CPU.
+ * More advanced/aggressive EAS selection policies can consider more
+ * candidates.
+ */
+#define EAS_CPU_PRV 0
+#define EAS_CPU_NXT 1
+#define EAS_CPU_BKP 2
+#define EAS_CPU_CNT 3
+
+/*
+ * energy_diff - supports the computation of the estimated energy impact in
+ * moving a "task"'s "util_delta" between different CPU candidates.
+ */
+struct energy_env {
+ /* Utilization to move */
+ struct task_struct *p;
+ int util_delta;
+
+ /* Mask of CPUs candidates to evaluate */
+ cpumask_t cpus_mask;
+
+ /* CPU candidates to evaluate */
+ struct {
+
+ /* CPU ID, must be in cpus_mask */
+ int cpu_id;
+
+ /*
+ * Index (into sched_group_energy::cap_states) of the OPP the
+ * CPU needs to run at if the task is placed on it.
+ * This includes the both active and blocked load, due to
+ * other tasks on this CPU, as well as the task's own
+ * utilization.
+ */
+ int cap_idx;
+ int cap;
+
+ /* Estimated system energy */
+ unsigned int energy;
+
+ /* Estimated energy variation wrt EAS_CPU_PRV */
+ int nrg_delta;
+
+ } cpu[EAS_CPU_CNT];
+
+ /*
+ * Index (into energy_env::cpu) of the morst energy efficient CPU for
+ * the specified energy_env::task
+ */
+ int next_idx;
+
+ /* Support data */
+ struct sched_group *sg_top;
+ struct sched_group *sg_cap;
+ struct sched_group *sg;
+};
+
+static int cpu_util_wake(int cpu, struct task_struct *p);
+
+/*
+ * __cpu_norm_util() returns the cpu util relative to a specific capacity,
+ * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for
+ * energy calculations.
+ *
+ * Since util is a scale-invariant utilization defined as:
+ *
+ * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
+ *
+ * the normalized util can be found using the specific capacity.
+ *
+ * capacity = capacity_orig * curr_freq/max_freq
+ *
+ * norm_util = running_time/time ~ util/capacity
+ */
+static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
+{
+ if (util >= capacity)
+ return SCHED_CAPACITY_SCALE;
+
+ return (util << SCHED_CAPACITY_SHIFT)/capacity;
+}
+
+static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx)
+{
+ unsigned long max_util = 0;
+ unsigned long util;
+ int cpu;
+
+ for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
+ util = cpu_util_wake(cpu, eenv->p);
+
+ /*
+ * If we are looking at the target CPU specified by the eenv,
+ * then we should add the (estimated) utilization of the task
+ * assuming we will wake it up on that CPU.
+ */
+ if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
+ util += eenv->util_delta;
+
+ max_util = max(max_util, util);
+
+ /*
+ * Take into account any minimum frequency imposed
+ * elsewhere which limits the energy states available
+ * If the MIN_CAPACITY_CAPPING feature is not enabled
+ * capacity_min_of will return 0 (not capped).
+ */
+ max_util = max(max_util, capacity_min_of(cpu));
+
+ }
+
+ return max_util;
+}
+
+/*
+ * group_norm_util() returns the approximated group util relative to it's
+ * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use
+ * in energy calculations.
+ *
+ * Since task executions may or may not overlap in time in the group the true
+ * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i))
+ * when iterating over all CPUs in the group.
+ * The latter estimate is used as it leads to a more pessimistic energy
+ * estimate (more busy).
+ */
+static unsigned
+long group_norm_util(struct energy_env *eenv, int cpu_idx)
+{
+ unsigned long capacity = eenv->cpu[cpu_idx].cap;
+ unsigned long util, util_sum = 0;
+ int cpu;
+
+ for_each_cpu(cpu, sched_group_cpus(eenv->sg)) {
+ util = cpu_util_wake(cpu, eenv->p);
+
+ /*
+ * If we are looking at the target CPU specified by the eenv,
+ * then we should add the (estimated) utilization of the task
+ * assuming we will wake it up on that CPU.
+ */
+ if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
+ util += eenv->util_delta;
+
+ util_sum += __cpu_norm_util(util, capacity);
+ }
+
+ return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
+}
+
+static int find_new_capacity(struct energy_env *eenv, int cpu_idx)
+{
+ const struct sched_group_energy *sge = eenv->sg->sge;
+ int idx, max_idx = sge->nr_cap_states - 1;
+ unsigned long util = group_max_util(eenv, cpu_idx);
+
+ /* default is max_cap if we don't find a match */
+ eenv->cpu[cpu_idx].cap_idx = max_idx;
+ eenv->cpu[cpu_idx].cap = sge->cap_states[max_idx].cap;
+
+ for (idx = 0; idx < sge->nr_cap_states; idx++) {
+ if (sge->cap_states[idx].cap >= util) {
+ /* Keep track of SG's capacity */
+ eenv->cpu[cpu_idx].cap_idx = idx;
+ eenv->cpu[cpu_idx].cap = sge->cap_states[idx].cap;
+ break;
+ }
+ }
+
+ return eenv->cpu[cpu_idx].cap_idx;
+}
+
+static int group_idle_state(struct energy_env *eenv, int cpu_idx)
+{
+ struct sched_group *sg = eenv->sg;
+ int i, state = INT_MAX;
+ int src_in_grp, dst_in_grp;
+ long grp_util = 0;
+
+ /* Find the shallowest idle state in the sched group. */
+ for_each_cpu(i, sched_group_cpus(sg))
+ state = min(state, idle_get_state_idx(cpu_rq(i)));
+
+ /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
+ state++;
+
+ src_in_grp = cpumask_test_cpu(eenv->cpu[EAS_CPU_PRV].cpu_id,
+ sched_group_cpus(sg));
+ dst_in_grp = cpumask_test_cpu(eenv->cpu[cpu_idx].cpu_id,
+ sched_group_cpus(sg));
+ if (src_in_grp == dst_in_grp) {
+ /* both CPUs under consideration are in the same group or not in
+ * either group, migration should leave idle state the same.
+ */
+ goto end;
+ }
+
+ /*
+ * Try to estimate if a deeper idle state is
+ * achievable when we move the task.
+ */
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ grp_util += cpu_util_wake(i, eenv->p);
+ if (unlikely(i == eenv->cpu[cpu_idx].cpu_id))
+ grp_util += eenv->util_delta;
+ }
+
+ if (grp_util <=
+ ((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
+ /* after moving, this group is at most partly
+ * occupied, so it should have some idle time.
+ */
+ int max_idle_state_idx = sg->sge->nr_idle_states - 2;
+ int new_state = grp_util * max_idle_state_idx;
+ if (grp_util <= 0)
+ /* group will have no util, use lowest state */
+ new_state = max_idle_state_idx + 1;
+ else {
+ /* for partially idle, linearly map util to idle
+ * states, excluding the lowest one. This does not
+ * correspond to the state we expect to enter in
+ * reality, but an indication of what might happen.
+ */
+ new_state = min(max_idle_state_idx, (int)
+ (new_state / sg->sgc->max_capacity));
+ new_state = max_idle_state_idx - new_state;
+ }
+ state = new_state;
+ } else {
+ /* After moving, the group will be fully occupied
+ * so assume it will not be idle at all.
+ */
+ state = 0;
+ }
+end:
+ return state;
+}
+
+/*
+ * calc_sg_energy: compute energy for the eenv's SG (i.e. eenv->sg).
+ *
+ * This works in iterations to compute the SG's energy for each CPU
+ * candidate defined by the energy_env's cpu array.
+ *
+ * NOTE: in the following computations for busy_energy and idle_energy we do
+ * not shift by SCHED_CAPACITY_SHIFT in order to reduce rounding errors.
+ * The required scaling will be performed just one time, by the calling
+ * functions, once we accumulated the contributons for all the SGs.
+ */
+static void calc_sg_energy(struct energy_env *eenv)
+{
+ struct sched_group *sg = eenv->sg;
+ int busy_energy, idle_energy;
+ unsigned int busy_power;
+ unsigned int idle_power;
+ unsigned long sg_util;
+ int cap_idx, idle_idx;
+ int total_energy = 0;
+ int cpu_idx;
+
+ for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
+
+
+ if (eenv->cpu[cpu_idx].cpu_id == -1)
+ continue;
+ /* Compute ACTIVE energy */
+ cap_idx = find_new_capacity(eenv, cpu_idx);
+ busy_power = sg->sge->cap_states[cap_idx].power;
+ /*
+ * in order to calculate cpu_norm_util, we need to know which
+ * capacity level the group will be at, so calculate that first
+ */
+ sg_util = group_norm_util(eenv, cpu_idx);
+
+ busy_energy = sg_util * busy_power;
+
+ /* Compute IDLE energy */
+ idle_idx = group_idle_state(eenv, cpu_idx);
+ idle_power = sg->sge->idle_states[idle_idx].power;
+
+ idle_energy = SCHED_CAPACITY_SCALE - sg_util;
+ idle_energy *= idle_power;
+
+ total_energy = busy_energy + idle_energy;
+ eenv->cpu[cpu_idx].energy += total_energy;
+ }
+}
+
+/*
+ * compute_energy() computes the absolute variation in energy consumption by
+ * moving eenv.util_delta from EAS_CPU_PRV to EAS_CPU_NXT.
+ *
+ * NOTE: compute_energy() may fail when racing with sched_domain updates, in
+ * which case we abort by returning -EINVAL.
+ */
+static int compute_energy(struct energy_env *eenv)
+{
+ struct cpumask visit_cpus;
+ int cpu_count;
+
+ WARN_ON(!eenv->sg_top->sge);
+
+ cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
+ /* If a cpu is hotplugged in while we are in this function,
+ * it does not appear in the existing visit_cpus mask
+ * which came from the sched_group pointer of the
+ * sched_domain pointed at by sd_ea for either the prev
+ * or next cpu and was dereferenced in __energy_diff.
+ * Since we will dereference sd_scs later as we iterate
+ * through the CPUs we expect to visit, new CPUs can
+ * be present which are not in the visit_cpus mask.
+ * Guard this with cpu_count.
+ */
+ cpu_count = cpumask_weight(&visit_cpus);
+
+ while (!cpumask_empty(&visit_cpus)) {
+ struct sched_group *sg_shared_cap = NULL;
+ int cpu = cpumask_first(&visit_cpus);
+ struct sched_domain *sd;
+
+ /*
+ * Is the group utilization affected by cpus outside this
+ * sched_group?
+ * This sd may have groups with cpus which were not present
+ * when we took visit_cpus.
+ */
+ sd = rcu_dereference(per_cpu(sd_scs, cpu));
+ if (sd && sd->parent)
+ sg_shared_cap = sd->parent->groups;
+
+ for_each_domain(cpu, sd) {
+ struct sched_group *sg = sd->groups;
+
+ /* Has this sched_domain already been visited? */
+ if (sd->child && group_first_cpu(sg) != cpu)
+ break;
+
+ do {
+ eenv->sg_cap = sg;
+ if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
+ eenv->sg_cap = sg_shared_cap;
+
+ /*
+ * Compute the energy for all the candidate
+ * CPUs in the current visited SG.
+ */
+ eenv->sg = sg;
+ calc_sg_energy(eenv);
+
+ if (!sd->child) {
+ /*
+ * cpu_count here is the number of
+ * cpus we expect to visit in this
+ * calculation. If we race against
+ * hotplug, we can have extra cpus
+ * added to the groups we are
+ * iterating which do not appear in
+ * the visit_cpus mask. In that case
+ * we are not able to calculate energy
+ * without restarting so we will bail
+ * out and use prev_cpu this time.
+ */
+ if (!cpu_count)
+ return -EINVAL;
+ cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
+ cpu_count--;
+ }
+
+ if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
+ goto next_cpu;
+
+ } while (sg = sg->next, sg != sd->groups);
+ }
+
+ /*
+ * If we raced with hotplug and got an sd NULL-pointer;
+ * returning a wrong energy estimation is better than
+ * entering an infinite loop.
+ * Specifically: If a cpu is unplugged after we took
+ * the visit_cpus mask, it no longer has an sd_scs
+ * pointer, so when we dereference it, we get NULL.
+ */
+ if (cpumask_test_cpu(cpu, &visit_cpus))
+ return -EINVAL;
+next_cpu:
+ cpumask_clear_cpu(cpu, &visit_cpus);
+ continue;
+ }
+
+ return 0;
+}
+
+static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
+{
+ return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
+}
+
+/*
+ * select_energy_cpu_idx(): estimate the energy impact of changing the
+ * utilization distribution.
+ *
+ * The eenv parameter specifies the changes: utilisation amount and a pair of
+ * possible CPU candidates (the previous CPU and a different target CPU).
+ *
+ * This function returns the index of a CPU candidate specified by the
+ * energy_env which corresponds to the first CPU saving energy.
+ * Thus, 0 (EAS_CPU_PRV) means that non of the CPU candidate is more energy
+ * efficient than running on prev_cpu. This is also the value returned in case
+ * of abort due to error conditions during the computations.
+ * A value greater than zero means that the first energy-efficient CPU is the
+ * one represented by eenv->cpu[eenv->next_idx].cpu_id.
+ */
+static inline int select_energy_cpu_idx(struct energy_env *eenv)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg;
+ int sd_cpu = -1;
+ int cpu_idx;
+ int margin;
+
+ sd_cpu = eenv->cpu[EAS_CPU_PRV].cpu_id;
+ sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
+ if (!sd)
+ return EAS_CPU_PRV;
+
+ cpumask_clear(&eenv->cpus_mask);
+ for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
+ int cpu = eenv->cpu[cpu_idx].cpu_id;
+
+ if (cpu < 0)
+ continue;
+ cpumask_set_cpu(cpu, &eenv->cpus_mask);
+ }
+
+ sg = sd->groups;
+ do {
+ /* Skip SGs which do not contains a candidate CPU */
+ if (!cpumask_intersects(&eenv->cpus_mask, sched_group_cpus(sg)))
+ continue;
+
+ eenv->sg_top = sg;
+ /* energy is unscaled to reduce rounding errors */
+ if (compute_energy(eenv) == -EINVAL)
+ return EAS_CPU_PRV;
+
+ } while (sg = sg->next, sg != sd->groups);
+
+ /* Scale energy before comparisons */
+ for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx)
+ eenv->cpu[cpu_idx].energy >>= SCHED_CAPACITY_SHIFT;
+
+ /*
+ * Compute the dead-zone margin used to prevent too many task
+ * migrations with negligible energy savings.
+ * An energy saving is considered meaningful if it reduces the energy
+ * consumption of EAS_CPU_PRV CPU candidate by at least ~1.56%
+ */
+ margin = eenv->cpu[EAS_CPU_PRV].energy >> 6;
+
+ /*
+ * By default the EAS_CPU_PRV CPU is considered the most energy
+ * efficient, with a 0 energy variation.
+ */
+ eenv->next_idx = EAS_CPU_PRV;
+
+ /*
+ * Compare the other CPU candidates to find a CPU which can be
+ * more energy efficient then EAS_CPU_PRV
+ */
+ for (cpu_idx = EAS_CPU_NXT; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
+ /* Skip not valid scheduled candidates */
+ if (eenv->cpu[cpu_idx].cpu_id < 0)
+ continue;
+ /* Compute energy delta wrt EAS_CPU_PRV */
+ eenv->cpu[cpu_idx].nrg_delta =
+ eenv->cpu[cpu_idx].energy -
+ eenv->cpu[EAS_CPU_PRV].energy;
+ /* filter energy variations within the dead-zone margin */
+ if (abs(eenv->cpu[cpu_idx].nrg_delta) < margin)
+ eenv->cpu[cpu_idx].nrg_delta = 0;
+ /* update the schedule candidate with min(nrg_delta) */
+ if (eenv->cpu[cpu_idx].nrg_delta <
+ eenv->cpu[eenv->next_idx].nrg_delta) {
+ eenv->next_idx = cpu_idx;
+ if (sched_feat(FBT_STRICT_ORDER))
+ break;
+ }
+ }
+
+ return eenv->next_idx;
+}
+
+/*
* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
*
* A waker of many should wake a different task than the one last awakened
@@ -5216,24 +6052,180 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
return 1;
}
+static inline unsigned long task_util(struct task_struct *p)
+{
+#ifdef CONFIG_SCHED_WALT
+ if (!walt_disabled && sysctl_sched_use_walt_task_util) {
+ unsigned long demand = p->ravg.demand;
+ return (demand << SCHED_CAPACITY_SHIFT) / walt_ravg_window;
+ }
+#endif
+ return p->se.avg.util_avg;
+}
+
+static inline unsigned long boosted_task_util(struct task_struct *p);
+
+static inline bool __task_fits(struct task_struct *p, int cpu, int util)
+{
+ unsigned long capacity = capacity_of(cpu);
+
+ util += boosted_task_util(p);
+
+ return (capacity * 1024) > (util * capacity_margin);
+}
+
+static inline bool task_fits_max(struct task_struct *p, int cpu)
+{
+ unsigned long capacity = capacity_of(cpu);
+ unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
+
+ if (capacity == max_capacity)
+ return true;
+
+ if (capacity * capacity_margin > max_capacity * 1024)
+ return true;
+
+ return __task_fits(p, cpu, 0);
+}
+
+static bool __cpu_overutilized(int cpu, int delta)
+{
+ return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin);
+}
+
+static bool cpu_overutilized(int cpu)
+{
+ return __cpu_overutilized(cpu, 0);
+}
+
+#ifdef CONFIG_SCHED_TUNE
+
+struct reciprocal_value schedtune_spc_rdiv;
+
+static long
+schedtune_margin(unsigned long signal, long boost)
+{
+ long long margin = 0;
+
+ /*
+ * Signal proportional compensation (SPC)
+ *
+ * The Boost (B) value is used to compute a Margin (M) which is
+ * proportional to the complement of the original Signal (S):
+ * M = B * (SCHED_CAPACITY_SCALE - S)
+ * The obtained M could be used by the caller to "boost" S.
+ */
+ if (boost >= 0) {
+ margin = SCHED_CAPACITY_SCALE - signal;
+ margin *= boost;
+ } else {
+ margin = -signal * boost;
+ }
+
+ margin = reciprocal_divide(margin, schedtune_spc_rdiv);
+ if (boost < 0)
+ margin *= -1;
+
+ return margin;
+}
+
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+ int boost = schedtune_cpu_boost(cpu);
+
+ if (boost == 0)
+ return 0;
+
+ return schedtune_margin(util, boost);
+}
+
+static inline long
+schedtune_task_margin(struct task_struct *p)
+{
+ int boost = schedtune_task_boost(p);
+ unsigned long util;
+ long margin;
+
+ if (boost == 0)
+ return 0;
+
+ util = task_util(p);
+ margin = schedtune_margin(util, boost);
+
+ return margin;
+}
+
+#else /* CONFIG_SCHED_TUNE */
+
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+ return 0;
+}
+
+static inline int
+schedtune_task_margin(struct task_struct *p)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SCHED_TUNE */
+
+unsigned long
+boosted_cpu_util(int cpu)
+{
+ unsigned long util = cpu_util_freq(cpu);
+ long margin = schedtune_cpu_margin(util, cpu);
+
+ trace_sched_boost_cpu(cpu, util, margin);
+
+ return util + margin;
+}
+
+static inline unsigned long
+boosted_task_util(struct task_struct *p)
+{
+ unsigned long util = task_util(p);
+ long margin = schedtune_task_margin(p);
+
+ trace_sched_boost_task(p, util, margin);
+
+ return util + margin;
+}
+
+static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+{
+ return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
+}
+
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
- unsigned long min_load = ULONG_MAX, this_load = 0;
+ struct sched_group *most_spare_sg = NULL;
+ unsigned long min_runnable_load = ULONG_MAX;
+ unsigned long this_runnable_load = ULONG_MAX;
+ unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
+ unsigned long most_spare = 0, this_spare = 0;
int load_idx = sd->forkexec_idx;
- int imbalance = 100 + (sd->imbalance_pct-100)/2;
+ int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
+ unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
+ (sd->imbalance_pct-100) / 100;
if (sd_flag & SD_BALANCE_WAKE)
load_idx = sd->wake_idx;
do {
- unsigned long load, avg_load;
+ unsigned long load, avg_load, runnable_load;
+ unsigned long spare_cap, max_spare_cap;
int local_group;
int i;
@@ -5245,8 +6237,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
- /* Tally up the load of all CPUs in the group */
+ /*
+ * Tally up the load of all CPUs in the group and find
+ * the group containing the CPU with most spare capacity.
+ */
avg_load = 0;
+ runnable_load = 0;
+ max_spare_cap = 0;
for_each_cpu(i, sched_group_cpus(group)) {
/* Bias balancing toward cpus of our domain */
@@ -5255,30 +6252,85 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
else
load = target_load(i, load_idx);
- avg_load += load;
+ runnable_load += load;
+
+ avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
+
+ spare_cap = capacity_spare_wake(i, p);
+
+ if (spare_cap > max_spare_cap)
+ max_spare_cap = spare_cap;
}
/* Adjust by relative CPU capacity of the group */
- avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
+ avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
+ group->sgc->capacity;
+ runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
+ group->sgc->capacity;
if (local_group) {
- this_load = avg_load;
- } else if (avg_load < min_load) {
- min_load = avg_load;
- idlest = group;
+ this_runnable_load = runnable_load;
+ this_avg_load = avg_load;
+ this_spare = max_spare_cap;
+ } else {
+ if (min_runnable_load > (runnable_load + imbalance)) {
+ /*
+ * The runnable load is significantly smaller
+ * so we can pick this new cpu
+ */
+ min_runnable_load = runnable_load;
+ min_avg_load = avg_load;
+ idlest = group;
+ } else if ((runnable_load < (min_runnable_load + imbalance)) &&
+ (100*min_avg_load > imbalance_scale*avg_load)) {
+ /*
+ * The runnable loads are close so we take
+ * into account blocked load through avg_load
+ * which is blocked + runnable load
+ */
+ min_avg_load = avg_load;
+ idlest = group;
+ }
+
+ if (most_spare < max_spare_cap) {
+ most_spare = max_spare_cap;
+ most_spare_sg = group;
+ }
}
} while (group = group->next, group != sd->groups);
- if (!idlest || 100*this_load < imbalance*min_load)
+ /*
+ * The cross-over point between using spare capacity or least load
+ * is too conservative for high utilization tasks on partially
+ * utilized systems if we require spare_capacity > task_util(p),
+ * so we allow for some task stuffing by using
+ * spare_capacity > task_util(p)/2.
+ * spare capacity can't be used for fork because the utilization has
+ * not been set yet as it need to get a rq to init the utilization
+ */
+ if (sd_flag & SD_BALANCE_FORK)
+ goto skip_spare;
+
+ if (this_spare > task_util(p) / 2 &&
+ imbalance_scale*this_spare > 100*most_spare)
+ return NULL;
+ else if (most_spare > task_util(p) / 2)
+ return most_spare_sg;
+
+skip_spare:
+ if (!idlest ||
+ (min_runnable_load > (this_runnable_load + imbalance)) ||
+ ((this_runnable_load < (min_runnable_load + imbalance)) &&
+ (100*this_avg_load < imbalance_scale*min_avg_load)))
return NULL;
return idlest;
}
/*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
*/
static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
@@ -5327,6 +6379,68 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
+static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+ int cpu, int prev_cpu, int sd_flag)
+{
+ int wu = sd_flag & SD_BALANCE_WAKE;
+ int cas_cpu = -1;
+ int new_cpu = cpu;
+
+ if (wu) {
+ schedstat_inc(p->se.statistics.nr_wakeups_cas_attempts);
+ schedstat_inc(this_rq()->eas_stats.cas_attempts);
+ }
+
+ if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+ return prev_cpu;
+
+ while (sd) {
+ struct sched_group *group;
+ struct sched_domain *tmp;
+ int weight;
+
+ if (wu)
+ schedstat_inc(sd->eas_stats.cas_attempts);
+
+ if (!(sd->flags & sd_flag)) {
+ sd = sd->child;
+ continue;
+ }
+
+ group = find_idlest_group(sd, p, cpu, sd_flag);
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
+
+ new_cpu = find_idlest_group_cpu(group, p, cpu);
+ if (new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of cpu */
+ sd = sd->child;
+ continue;
+ }
+
+ /* Now try balancing at a lower domain level of new_cpu */
+ cpu = cas_cpu = new_cpu;
+ weight = sd->span_weight;
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+ if (weight <= tmp->span_weight)
+ break;
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
+ }
+
+ if (wu && (cas_cpu >= 0)) {
+ schedstat_inc(p->se.statistics.nr_wakeups_cas_count);
+ schedstat_inc(this_rq()->eas_stats.cas_count);
+ }
+
+ return new_cpu;
+}
+
#ifdef CONFIG_SCHED_SMT
static inline void set_idle_cores(int cpu, int val)
@@ -5494,96 +6608,583 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
- int i;
+ struct sched_group *sg;
+ int i = task_cpu(p);
+ int best_idle_cpu = -1;
+ int best_idle_cstate = INT_MAX;
+ unsigned long best_idle_capacity = ULONG_MAX;
+
+ schedstat_inc(p->se.statistics.nr_wakeups_sis_attempts);
+ schedstat_inc(this_rq()->eas_stats.sis_attempts);
+
+ if (!sysctl_sched_cstate_aware) {
+ if (idle_cpu(target)) {
+ schedstat_inc(p->se.statistics.nr_wakeups_sis_idle);
+ schedstat_inc(this_rq()->eas_stats.sis_idle);
+ return target;
+ }
+
+ /*
+ * If the prevous cpu is cache affine and idle, don't be stupid.
+ */
+ if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) {
+ schedstat_inc(p->se.statistics.nr_wakeups_sis_cache_affine);
+ schedstat_inc(this_rq()->eas_stats.sis_cache_affine);
+ return i;
+ }
+
+ sd = rcu_dereference(per_cpu(sd_llc, target));
+ if (!sd)
+ return target;
+
+ i = select_idle_core(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
- if (idle_cpu(target))
- return target;
+ i = select_idle_cpu(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+
+ i = select_idle_smt(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+ }
/*
- * If the previous cpu is cache affine and idle, don't be stupid.
+ * Otherwise, iterate the domains and find an elegible idle cpu.
*/
- if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
- return prev;
-
sd = rcu_dereference(per_cpu(sd_llc, target));
- if (!sd)
- return target;
+ for_each_lower_domain(sd) {
+ sg = sd->groups;
+ do {
+ if (!cpumask_intersects(sched_group_cpus(sg),
+ tsk_cpus_allowed(p)))
+ goto next;
+
+
+ if (sysctl_sched_cstate_aware) {
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+ int idle_idx = idle_get_state_idx(cpu_rq(i));
+ unsigned long new_usage = boosted_task_util(p);
+ unsigned long capacity_orig = capacity_orig_of(i);
+
+ if (new_usage > capacity_orig || !idle_cpu(i))
+ goto next;
+
+ if (i == target && new_usage <= capacity_curr_of(target)) {
+ schedstat_inc(p->se.statistics.nr_wakeups_sis_suff_cap);
+ schedstat_inc(this_rq()->eas_stats.sis_suff_cap);
+ schedstat_inc(sd->eas_stats.sis_suff_cap);
+ return target;
+ }
+
+ if (idle_idx < best_idle_cstate &&
+ capacity_orig <= best_idle_capacity) {
+ best_idle_cpu = i;
+ best_idle_cstate = idle_idx;
+ best_idle_capacity = capacity_orig;
+ }
+ }
+ } else {
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ if (i == target || !idle_cpu(i))
+ goto next;
+ }
- i = select_idle_core(p, sd, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
+ target = cpumask_first_and(sched_group_cpus(sg),
+ tsk_cpus_allowed(p));
+ schedstat_inc(p->se.statistics.nr_wakeups_sis_idle_cpu);
+ schedstat_inc(this_rq()->eas_stats.sis_idle_cpu);
+ schedstat_inc(sd->eas_stats.sis_idle_cpu);
+ goto done;
+ }
+next:
+ sg = sg->next;
+ } while (sg != sd->groups);
+ }
- i = select_idle_cpu(p, sd, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
+ if (best_idle_cpu >= 0)
+ target = best_idle_cpu;
- i = select_idle_smt(p, sd, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
+done:
+ schedstat_inc(p->se.statistics.nr_wakeups_sis_count);
+ schedstat_inc(this_rq()->eas_stats.sis_count);
return target;
}
-
+
/*
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
- * tasks. The unit of the return value must be the one of capacity so we can
- * compare the utilization with the capacity of the CPU that is available for
- * CFS task (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
+ * cpu_util_wake: Compute cpu utilization with any contributions from
+ * the waking task p removed. check_for_migration() looks for a better CPU of
+ * rq->curr. For that case we should return cpu util with contributions from
+ * currently running task p removed.
*/
-static int cpu_util(int cpu)
+static int cpu_util_wake(int cpu, struct task_struct *p)
{
- unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
- unsigned long capacity = capacity_orig_of(cpu);
+ unsigned long util, capacity;
+
+#ifdef CONFIG_SCHED_WALT
+ /*
+ * WALT does not decay idle tasks in the same manner
+ * as PELT, so it makes little sense to subtract task
+ * utilization from cpu utilization. Instead just use
+ * cpu_util for this case.
+ */
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
+ p->state == TASK_WAKING)
+ return cpu_util(cpu);
+#endif
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+ return cpu_util(cpu);
+
+ capacity = capacity_orig_of(cpu);
+ util = max_t(long, cpu_util(cpu) - task_util(p), 0);
return (util >= capacity) ? capacity : util;
}
-static inline int task_util(struct task_struct *p)
+static int start_cpu(bool boosted)
{
- return p->se.avg.util_avg;
+ struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+
+ return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
+}
+
+static inline int find_best_target(struct task_struct *p, int *backup_cpu,
+ bool boosted, bool prefer_idle)
+{
+ unsigned long best_idle_min_cap_orig = ULONG_MAX;
+ unsigned long min_util = boosted_task_util(p);
+ unsigned long target_capacity = ULONG_MAX;
+ unsigned long min_wake_util = ULONG_MAX;
+ unsigned long target_max_spare_cap = 0;
+ unsigned long target_util = ULONG_MAX;
+ unsigned long best_active_util = ULONG_MAX;
+ unsigned long target_idle_max_spare_cap = 0;
+ int best_idle_cstate = INT_MAX;
+ struct sched_domain *sd;
+ struct sched_group *sg;
+ int best_active_cpu = -1;
+ int best_idle_cpu = -1;
+ int target_cpu = -1;
+ int cpu, i;
+
+ *backup_cpu = -1;
+
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_attempts);
+ schedstat_inc(this_rq()->eas_stats.fbt_attempts);
+
+ /* Find start CPU based on boost value */
+ cpu = start_cpu(boosted);
+ if (cpu < 0) {
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_no_cpu);
+ schedstat_inc(this_rq()->eas_stats.fbt_no_cpu);
+ return -1;
+ }
+
+ /* Find SD for the start CPU */
+ sd = rcu_dereference(per_cpu(sd_ea, cpu));
+ if (!sd) {
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_no_sd);
+ schedstat_inc(this_rq()->eas_stats.fbt_no_sd);
+ return -1;
+ }
+
+ /* Scan CPUs in all SDs */
+ sg = sd->groups;
+ do {
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+ unsigned long capacity_curr = capacity_curr_of(i);
+ unsigned long capacity_orig = capacity_orig_of(i);
+ unsigned long wake_util, new_util, min_capped_util;
+
+ if (!cpu_online(i))
+ continue;
+
+ if (walt_cpu_high_irqload(i))
+ continue;
+
+ /*
+ * p's blocked utilization is still accounted for on prev_cpu
+ * so prev_cpu will receive a negative bias due to the double
+ * accounting. However, the blocked utilization may be zero.
+ */
+ wake_util = cpu_util_wake(i, p);
+ new_util = wake_util + task_util(p);
+
+ /*
+ * Ensure minimum capacity to grant the required boost.
+ * The target CPU can be already at a capacity level higher
+ * than the one required to boost the task.
+ */
+ new_util = max(min_util, new_util);
+
+ /*
+ * Include minimum capacity constraint:
+ * new_util contains the required utilization including
+ * boost. min_capped_util also takes into account a
+ * minimum capacity cap imposed on the CPU by external
+ * actors.
+ */
+ min_capped_util = max(new_util, capacity_min_of(i));
+
+ if (new_util > capacity_orig)
+ continue;
+
+ /*
+ * Case A) Latency sensitive tasks
+ *
+ * Unconditionally favoring tasks that prefer idle CPU to
+ * improve latency.
+ *
+ * Looking for:
+ * - an idle CPU, whatever its idle_state is, since
+ * the first CPUs we explore are more likely to be
+ * reserved for latency sensitive tasks.
+ * - a non idle CPU where the task fits in its current
+ * capacity and has the maximum spare capacity.
+ * - a non idle CPU with lower contention from other
+ * tasks and running at the lowest possible OPP.
+ *
+ * The last two goals tries to favor a non idle CPU
+ * where the task can run as if it is "almost alone".
+ * A maximum spare capacity CPU is favoured since
+ * the task already fits into that CPU's capacity
+ * without waiting for an OPP chance.
+ *
+ * The following code path is the only one in the CPUs
+ * exploration loop which is always used by
+ * prefer_idle tasks. It exits the loop with wither a
+ * best_active_cpu or a target_cpu which should
+ * represent an optimal choice for latency sensitive
+ * tasks.
+ */
+ if (prefer_idle) {
+
+ /*
+ * Case A.1: IDLE CPU
+ * Return the first IDLE CPU we find.
+ */
+ if (idle_cpu(i)) {
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_pref_idle);
+ schedstat_inc(this_rq()->eas_stats.fbt_pref_idle);
+
+ trace_sched_find_best_target(p,
+ prefer_idle, min_util,
+ cpu, best_idle_cpu,
+ best_active_cpu, i);
+
+ return i;
+ }
+
+ /*
+ * Case A.2: Target ACTIVE CPU
+ * Favor CPUs with max spare capacity.
+ */
+ if ((capacity_curr > new_util) &&
+ (capacity_orig - new_util > target_max_spare_cap)) {
+ target_max_spare_cap = capacity_orig - new_util;
+ target_cpu = i;
+ continue;
+ }
+ if (target_cpu != -1)
+ continue;
+
+
+ /*
+ * Case A.3: Backup ACTIVE CPU
+ * Favor CPUs with:
+ * - lower utilization due to other tasks
+ * - lower utilization with the task in
+ */
+ if (wake_util > min_wake_util)
+ continue;
+ if (new_util > best_active_util)
+ continue;
+ min_wake_util = wake_util;
+ best_active_util = new_util;
+ best_active_cpu = i;
+ continue;
+ }
+
+ /*
+ * Enforce EAS mode
+ *
+ * For non latency sensitive tasks, skip CPUs that
+ * will be overutilized by moving the task there.
+ *
+ * The goal here is to remain in EAS mode as long as
+ * possible at least for !prefer_idle tasks.
+ */
+ if ((new_util * capacity_margin) >
+ (capacity_orig * SCHED_CAPACITY_SCALE))
+ continue;
+
+ /*
+ * Case B) Non latency sensitive tasks on IDLE CPUs.
+ *
+ * Find an optimal backup IDLE CPU for non latency
+ * sensitive tasks.
+ *
+ * Looking for:
+ * - minimizing the capacity_orig,
+ * i.e. preferring LITTLE CPUs
+ * - favoring shallowest idle states
+ * i.e. avoid to wakeup deep-idle CPUs
+ *
+ * The following code path is used by non latency
+ * sensitive tasks if IDLE CPUs are available. If at
+ * least one of such CPUs are available it sets the
+ * best_idle_cpu to the most suitable idle CPU to be
+ * selected.
+ *
+ * If idle CPUs are available, favour these CPUs to
+ * improve performances by spreading tasks.
+ * Indeed, the energy_diff() computed by the caller
+ * will take care to ensure the minimization of energy
+ * consumptions without affecting performance.
+ */
+ if (idle_cpu(i)) {
+ int idle_idx = idle_get_state_idx(cpu_rq(i));
+
+ /* Select idle CPU with lower cap_orig */
+ if (capacity_orig > best_idle_min_cap_orig)
+ continue;
+ /* Favor CPUs that won't end up running at a
+ * high OPP.
+ */
+ if ((capacity_orig - min_capped_util) <
+ target_idle_max_spare_cap)
+ continue;
+
+ /*
+ * Skip CPUs in deeper idle state, but only
+ * if they are also less energy efficient.
+ * IOW, prefer a deep IDLE LITTLE CPU vs a
+ * shallow idle big CPU.
+ */
+ if (sysctl_sched_cstate_aware &&
+ best_idle_cstate <= idle_idx)
+ continue;
+
+ /* Keep track of best idle CPU */
+ best_idle_min_cap_orig = capacity_orig;
+ target_idle_max_spare_cap = capacity_orig -
+ min_capped_util;
+ best_idle_cstate = idle_idx;
+ best_idle_cpu = i;
+ continue;
+ }
+
+ /*
+ * Case C) Non latency sensitive tasks on ACTIVE CPUs.
+ *
+ * Pack tasks in the most energy efficient capacities.
+ *
+ * This task packing strategy prefers more energy
+ * efficient CPUs (i.e. pack on smaller maximum
+ * capacity CPUs) while also trying to spread tasks to
+ * run them all at the lower OPP.
+ *
+ * This assumes for example that it's more energy
+ * efficient to run two tasks on two CPUs at a lower
+ * OPP than packing both on a single CPU but running
+ * that CPU at an higher OPP.
+ *
+ * Thus, this case keep track of the CPU with the
+ * smallest maximum capacity and highest spare maximum
+ * capacity.
+ */
+
+ /* Favor CPUs with smaller capacity */
+ if (capacity_orig > target_capacity)
+ continue;
+
+ /* Favor CPUs with maximum spare capacity */
+ if ((capacity_orig - min_capped_util) <
+ target_max_spare_cap)
+ continue;
+
+ target_max_spare_cap = capacity_orig - min_capped_util;
+ target_capacity = capacity_orig;
+ target_util = new_util;
+ target_cpu = i;
+ }
+
+ } while (sg = sg->next, sg != sd->groups);
+
+ /*
+ * For non latency sensitive tasks, cases B and C in the previous loop,
+ * we pick the best IDLE CPU only if we was not able to find a target
+ * ACTIVE CPU.
+ *
+ * Policies priorities:
+ *
+ * - prefer_idle tasks:
+ *
+ * a) IDLE CPU available, we return immediately
+ * b) ACTIVE CPU where task fits and has the bigger maximum spare
+ * capacity (i.e. target_cpu)
+ * c) ACTIVE CPU with less contention due to other tasks
+ * (i.e. best_active_cpu)
+ *
+ * - NON prefer_idle tasks:
+ *
+ * a) ACTIVE CPU: target_cpu
+ * b) IDLE CPU: best_idle_cpu
+ */
+ if (target_cpu == -1)
+ target_cpu = prefer_idle
+ ? best_active_cpu
+ : best_idle_cpu;
+ else
+ *backup_cpu = prefer_idle
+ ? best_active_cpu
+ : best_idle_cpu;
+
+ trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
+ best_idle_cpu, best_active_cpu,
+ target_cpu);
+
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_count);
+ schedstat_inc(this_rq()->eas_stats.fbt_count);
+
+ return target_cpu;
}
/*
* Disable WAKE_AFFINE in the case where task @p doesn't fit in the
* capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
- *
+ *
* In that case WAKE_AFFINE doesn't make sense and we'll let
* BALANCE_WAKE sort things out.
*/
static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
{
long min_cap, max_cap;
-
min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
- max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
-
+ max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
/* Minimum capacity is close to max, no need to abort wake_affine */
if (max_cap - min_cap < max_cap >> 3)
return 0;
+ /* Bring task utilization in sync with prev_cpu */
+ sync_entity_load_avg(&p->se);
+
return min_cap * 1024 < task_util(p) * capacity_margin;
}
+static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
+{
+ bool boosted, prefer_idle;
+ struct sched_domain *sd;
+ int target_cpu;
+ int backup_cpu;
+ int next_cpu;
+
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_attempts);
+ schedstat_inc(this_rq()->eas_stats.secb_attempts);
+
+ if (sysctl_sched_sync_hint_enable && sync) {
+ int cpu = smp_processor_id();
+
+ if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_sync);
+ schedstat_inc(this_rq()->eas_stats.secb_sync);
+ return cpu;
+ }
+ }
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ boosted = schedtune_task_boost(p) > 0;
+ prefer_idle = schedtune_prefer_idle(p) > 0;
+#else
+ boosted = get_sysctl_sched_cfs_boost() > 0;
+ prefer_idle = 0;
+#endif
+
+ rcu_read_lock();
+
+ sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
+ if (!sd) {
+ target_cpu = prev_cpu;
+ goto unlock;
+ }
+
+ sync_entity_load_avg(&p->se);
+
+ /* Find a cpu with sufficient capacity */
+ next_cpu = find_best_target(p, &backup_cpu, boosted, prefer_idle);
+ if (next_cpu == -1) {
+ target_cpu = prev_cpu;
+ goto unlock;
+ }
+
+ /* Unconditionally prefer IDLE CPUs for boosted/prefer_idle tasks */
+ if ((boosted || prefer_idle) && idle_cpu(next_cpu)) {
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_idle_bt);
+ schedstat_inc(this_rq()->eas_stats.secb_idle_bt);
+ target_cpu = next_cpu;
+ goto unlock;
+ }
+
+ target_cpu = prev_cpu;
+ if (next_cpu != prev_cpu) {
+ int delta = 0;
+ struct energy_env eenv = {
+ .p = p,
+ .util_delta = task_util(p),
+ /* Task's previous CPU candidate */
+ .cpu[EAS_CPU_PRV] = {
+ .cpu_id = prev_cpu,
+ },
+ /* Main alternative CPU candidate */
+ .cpu[EAS_CPU_NXT] = {
+ .cpu_id = next_cpu,
+ },
+ /* Backup alternative CPU candidate */
+ .cpu[EAS_CPU_BKP] = {
+ .cpu_id = backup_cpu,
+ },
+ };
+
+
+#ifdef CONFIG_SCHED_WALT
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
+ p->state == TASK_WAKING)
+ delta = task_util(p);
+#endif
+ /* Not enough spare capacity on previous cpu */
+ if (__cpu_overutilized(prev_cpu, delta)) {
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_insuff_cap);
+ schedstat_inc(this_rq()->eas_stats.secb_insuff_cap);
+ target_cpu = next_cpu;
+ goto unlock;
+ }
+
+ /* Check if EAS_CPU_NXT is a more energy efficient CPU */
+ if (select_energy_cpu_idx(&eenv) != EAS_CPU_PRV) {
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_nrg_sav);
+ schedstat_inc(this_rq()->eas_stats.secb_nrg_sav);
+ target_cpu = eenv.cpu[eenv.next_idx].cpu_id;
+ goto unlock;
+ }
+
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_no_nrg_sav);
+ schedstat_inc(this_rq()->eas_stats.secb_no_nrg_sav);
+ target_cpu = prev_cpu;
+ goto unlock;
+ }
+
+ schedstat_inc(p->se.statistics.nr_wakeups_secb_count);
+ schedstat_inc(this_rq()->eas_stats.secb_count);
+
+unlock:
+ rcu_read_unlock();
+ return target_cpu;
+}
+
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -5607,10 +7208,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
- want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
- && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ want_affine = (!wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
+ cpumask_test_cpu(cpu, tsk_cpus_allowed(p)));
}
+ if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
+ return select_energy_cpu_brute(p, prev_cpu, sync);
+
rcu_read_lock();
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
@@ -5638,43 +7242,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
new_cpu = cpu;
}
+ if (sd && !(sd_flag & SD_BALANCE_FORK)) {
+ /*
+ * We're going to need the task's util for capacity_spare_wake
+ * in find_idlest_group. Sync it up to prev_cpu's
+ * last_update_time.
+ */
+ sync_entity_load_avg(&p->se);
+ }
+
if (!sd) {
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
- } else while (sd) {
- struct sched_group *group;
- int weight;
-
- if (!(sd->flags & sd_flag)) {
- sd = sd->child;
- continue;
- }
-
- group = find_idlest_group(sd, p, cpu, sd_flag);
- if (!group) {
- sd = sd->child;
- continue;
- }
-
- new_cpu = find_idlest_cpu(group, p, cpu);
- if (new_cpu == -1 || new_cpu == cpu) {
- /* Now try balancing at a lower domain level of cpu */
- sd = sd->child;
- continue;
- }
-
- /* Now try balancing at a lower domain level of new_cpu */
- cpu = new_cpu;
- weight = sd->span_weight;
- sd = NULL;
- for_each_domain(cpu, tmp) {
- if (weight <= tmp->span_weight)
- break;
- if (tmp->flags & sd_flag)
- sd = tmp;
- }
- /* while loop will break here if sd == NULL */
+ } else {
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
}
rcu_read_unlock();
@@ -5734,6 +7316,8 @@ static void task_dead_fair(struct task_struct *p)
{
remove_entity_load_avg(&p->se);
}
+#else
+#define task_fits_max(p, cpu) true
#endif /* CONFIG_SMP */
static unsigned long
@@ -5980,6 +7564,8 @@ again:
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
+ rq->misfit_task = !task_fits_max(p, rq->cpu);
+
return p;
simple:
cfs_rq = &rq->cfs;
@@ -6001,9 +7587,12 @@ simple:
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
+ rq->misfit_task = !task_fits_max(p, rq->cpu);
+
return p;
idle:
+ rq->misfit_task = 0;
/*
* This is OK, because current is on_cpu, which avoids it being picked
* for load-balance and preemption/IRQs are still disabled avoiding
@@ -6216,6 +7805,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
enum fbq_type { regular, remote, all };
+enum group_type {
+ group_other = 0,
+ group_misfit_task,
+ group_imbalanced,
+ group_overloaded,
+};
+
#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
@@ -6234,6 +7830,7 @@ struct lb_env {
int new_dst_cpu;
enum cpu_idle_type idle;
long imbalance;
+ unsigned int src_grp_nr_running;
/* The set of CPUs under consideration for load-balancing */
struct cpumask *cpus;
@@ -6244,6 +7841,7 @@ struct lb_env {
unsigned int loop_max;
enum fbq_type fbq_type;
+ enum group_type busiest_group_type;
struct list_head tasks;
};
@@ -6425,7 +8023,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(env->src_rq, p, 0);
+ double_lock_balance(env->src_rq, env->dst_rq);
set_task_cpu(p, env->dst_cpu);
+ double_unlock_balance(env->src_rq, env->dst_rq);
}
/*
@@ -6609,12 +8209,19 @@ static void update_blocked_averages(int cpu)
* list_add_leaf_cfs_rq() for details.
*/
for_each_leaf_cfs_rq(rq, cfs_rq) {
+ struct sched_entity *se;
+
/* throttled entities do not contribute to load */
if (throttled_hierarchy(cfs_rq))
continue;
if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
update_tg_load_avg(cfs_rq, 0);
+
+ /* Propagate pending load changes to the parent, if any: */
+ se = cfs_rq->tg->se[cpu];
+ if (se && !skip_blocked_update(se))
+ update_load_avg(se, 0);
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -6686,12 +8293,6 @@ static unsigned long task_h_load(struct task_struct *p)
/********** Helpers for find_busiest_group ************************/
-enum group_type {
- group_other = 0,
- group_imbalanced,
- group_overloaded,
-};
-
/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
@@ -6707,6 +8308,7 @@ struct sg_lb_stats {
unsigned int group_weight;
enum group_type group_type;
int group_no_capacity;
+ int group_misfit_task; /* A cpu has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -6804,13 +8406,46 @@ static unsigned long scale_rt_capacity(int cpu)
return 1;
}
+void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
+{
+ raw_spin_lock_init(&mcc->lock);
+ mcc->val = 0;
+ mcc->cpu = -1;
+}
+
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
struct sched_group *sdg = sd->groups;
+ struct max_cpu_capacity *mcc;
+ unsigned long max_capacity;
+ int max_cap_cpu;
+ unsigned long flags;
cpu_rq(cpu)->cpu_capacity_orig = capacity;
+ capacity *= arch_scale_max_freq_capacity(sd, cpu);
+ capacity >>= SCHED_CAPACITY_SHIFT;
+
+ mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
+
+ raw_spin_lock_irqsave(&mcc->lock, flags);
+ max_capacity = mcc->val;
+ max_cap_cpu = mcc->cpu;
+
+ if ((max_capacity > capacity && max_cap_cpu == cpu) ||
+ (max_capacity < capacity)) {
+ mcc->val = capacity;
+ mcc->cpu = cpu;
+#ifdef CONFIG_SCHED_DEBUG
+ raw_spin_unlock_irqrestore(&mcc->lock, flags);
+ pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity);
+ goto skip_unlock;
+#endif
+ }
+ raw_spin_unlock_irqrestore(&mcc->lock, flags);
+
+skip_unlock: __attribute__ ((unused));
capacity *= scale_rt_capacity(cpu);
capacity >>= SCHED_CAPACITY_SHIFT;
@@ -6819,13 +8454,15 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
+ sdg->sgc->max_capacity = capacity;
+ sdg->sgc->min_capacity = capacity;
}
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity;
+ unsigned long capacity, max_capacity, min_capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -6838,6 +8475,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
capacity = 0;
+ max_capacity = 0;
+ min_capacity = ULONG_MAX;
if (child->flags & SD_OVERLAP) {
/*
@@ -6862,11 +8501,13 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
*/
if (unlikely(!rq->sd)) {
capacity += capacity_of(cpu);
- continue;
+ } else {
+ sgc = rq->sd->groups->sgc;
+ capacity += sgc->capacity;
}
- sgc = rq->sd->groups->sgc;
- capacity += sgc->capacity;
+ max_capacity = max(capacity, max_capacity);
+ min_capacity = min(capacity, min_capacity);
}
} else {
/*
@@ -6876,12 +8517,18 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
group = child->groups;
do {
- capacity += group->sgc->capacity;
+ struct sched_group_capacity *sgc = group->sgc;
+
+ capacity += sgc->capacity;
+ max_capacity = max(sgc->max_capacity, max_capacity);
+ min_capacity = min(sgc->min_capacity, min_capacity);
group = group->next;
} while (group != child->groups);
}
sdg->sgc->capacity = capacity;
+ sdg->sgc->max_capacity = max_capacity;
+ sdg->sgc->min_capacity = min_capacity;
}
/*
@@ -6976,6 +8623,17 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
return false;
}
+/*
+ * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-cpu capacity than sched_group ref.
+ */
+static inline bool
+group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+ return sg->sgc->max_capacity + capacity_margin - SCHED_CAPACITY_SCALE <
+ ref->sgc->max_capacity;
+}
+
static inline enum
group_type group_classify(struct sched_group *group,
struct sg_lb_stats *sgs)
@@ -6986,9 +8644,44 @@ group_type group_classify(struct sched_group *group,
if (sg_imbalanced(group))
return group_imbalanced;
+ if (sgs->group_misfit_task)
+ return group_misfit_task;
+
return group_other;
}
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * idle load balancing data
+ * - used by the nohz balance, but we want it available here
+ * so that we can see which CPUs have no tick.
+ */
+static struct {
+ cpumask_var_t idle_cpus_mask;
+ atomic_t nr_cpus;
+ unsigned long next_balance; /* in jiffy units */
+} nohz ____cacheline_aligned;
+
+static inline void update_cpu_stats_if_tickless(struct rq *rq)
+{
+ /* only called from update_sg_lb_stats when irqs are disabled */
+ if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
+ /* rate limit updates to once-per-jiffie at most */
+ if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
+ return;
+
+ raw_spin_lock(&rq->lock);
+ update_rq_clock(rq);
+ cpu_load_update_idle(rq);
+ update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
+ raw_spin_unlock(&rq->lock);
+ }
+}
+
+#else
+static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
+#endif
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -6997,11 +8690,12 @@ group_type group_classify(struct sched_group *group,
* @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
* @overload: Indicate more than one runnable task for any CPU.
+ * @overutilized: Indicate overutilization for any CPU.
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
int local_group, struct sg_lb_stats *sgs,
- bool *overload)
+ bool *overload, bool *overutilized)
{
unsigned long load;
int i, nr_running;
@@ -7011,6 +8705,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
struct rq *rq = cpu_rq(i);
+ /* if we are entering idle and there are CPUs with
+ * their tick stopped, do an update for them
+ */
+ if (env->idle == CPU_NEWLY_IDLE)
+ update_cpu_stats_if_tickless(rq);
+
/* Bias balancing toward cpus of our domain */
if (local_group)
load = target_load(i, load_idx);
@@ -7035,6 +8735,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
*/
if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
+
+ if (cpu_overutilized(i)) {
+ *overutilized = true;
+ if (!sgs->group_misfit_task && rq->misfit_task)
+ sgs->group_misfit_task = capacity_of(i);
+ }
}
/* Adjust by relative CPU capacity of the group */
@@ -7076,9 +8782,31 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (sgs->group_type < busiest->group_type)
return false;
+ /*
+ * Candidate sg doesn't face any serious load-balance problems
+ * so don't pick it if the local sg is already filled up.
+ */
+ if (sgs->group_type == group_other &&
+ !group_has_capacity(env, &sds->local_stat))
+ return false;
+
if (sgs->avg_load <= busiest->avg_load)
return false;
+ if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
+ goto asym_packing;
+
+ /*
+ * Candidate sg has no more than one task per CPU and
+ * has higher per-CPU capacity. Migrating tasks to less
+ * capable CPUs may harm throughput. Maximize throughput,
+ * power/energy consequences are not considered.
+ */
+ if (sgs->sum_nr_running <= sgs->group_weight &&
+ group_smaller_cpu_capacity(sds->local, sg))
+ return false;
+
+asym_packing:
/* This is the busiest node in its class. */
if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
@@ -7133,6 +8861,9 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
}
#endif /* CONFIG_NUMA_BALANCING */
+#define lb_sd_parent(sd) \
+ (sd->parent && sd->parent->groups != sd->parent->groups->next)
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
@@ -7144,7 +8875,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
- bool overload = false;
+ bool overload = false, overutilized = false;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
@@ -7166,7 +8897,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
}
update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
- &overload);
+ &overload, &overutilized);
if (local_group)
goto next_group;
@@ -7188,6 +8919,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
sgs->group_type = group_classify(sg, sgs);
}
+ /*
+ * Ignore task groups with misfit tasks if local group has no
+ * capacity or if per-cpu capacity isn't higher.
+ */
+ if (sgs->group_type == group_misfit_task &&
+ (!group_has_capacity(env, &sds->local_stat) ||
+ !group_smaller_cpu_capacity(sg, sds->local)))
+ sgs->group_type = group_other;
+
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
sds->busiest_stat = *sgs;
@@ -7204,10 +8944,23 @@ next_group:
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
- if (!env->sd->parent) {
+ env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
+
+ if (!lb_sd_parent(env->sd)) {
/* update overload indicator if we are at root domain */
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
+
+ /* Update over-utilization (tipping point, U >= 0) indicator */
+ if (env->dst_rq->rd->overutilized != overutilized) {
+ env->dst_rq->rd->overutilized = overutilized;
+ trace_sched_overutilized(overutilized);
+ }
+ } else {
+ if (!env->dst_rq->rd->overutilized && overutilized) {
+ env->dst_rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
}
}
@@ -7360,6 +9113,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
if (busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load) {
+ /* Misfitting tasks should be migrated in any case */
+ if (busiest->group_type == group_misfit_task) {
+ env->imbalance = busiest->group_misfit_task;
+ return;
+ }
+
+ /*
+ * Busiest group is overloaded, local is not, use the spare
+ * cycles to maximize throughput
+ */
+ if (busiest->group_type == group_overloaded &&
+ local->group_type <= group_misfit_task) {
+ env->imbalance = busiest->load_per_task;
+ return;
+ }
+
env->imbalance = 0;
return fix_small_imbalance(env, sds);
}
@@ -7393,6 +9162,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
+ /* Boost imbalance to allow misfit task to be balanced. */
+ if (busiest->group_type == group_misfit_task)
+ env->imbalance = max_t(long, env->imbalance,
+ busiest->group_misfit_task);
+
/*
* if *imbalance is less than the average load per runnable task
* there is no guarantee that any tasks will be moved so we'll have
@@ -7428,6 +9202,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* this level.
*/
update_sd_lb_stats(env, &sds);
+
+ if (energy_aware() && !env->dst_rq->rd->overutilized)
+ goto out_balanced;
+
local = &sds.local_stat;
busiest = &sds.busiest_stat;
@@ -7450,11 +9228,19 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (busiest->group_type == group_imbalanced)
goto force_balance;
- /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+ /*
+ * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
+ * capacities from resulting in underutilization due to avg_load.
+ */
+ if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
busiest->group_no_capacity)
goto force_balance;
+ /* Misfitting tasks should be dealt with regardless of the avg load */
+ if (busiest->group_type == group_misfit_task) {
+ goto force_balance;
+ }
+
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
@@ -7478,7 +9264,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* might end up to just move the imbalance on another group
*/
if ((busiest->group_type != group_overloaded) &&
- (local->idle_cpus <= (busiest->idle_cpus + 1)))
+ (local->idle_cpus <= (busiest->idle_cpus + 1)) &&
+ !group_smaller_cpu_capacity(sds.busiest, sds.local))
goto out_balanced;
} else {
/*
@@ -7491,6 +9278,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
}
force_balance:
+ env->busiest_group_type = busiest->group_type;
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(env, &sds);
return sds.busiest;
@@ -7549,7 +9337,8 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/
if (rq->nr_running == 1 && wl > env->imbalance &&
- !check_cpu_capacity(rq, env->sd))
+ !check_cpu_capacity(rq, env->sd) &&
+ env->busiest_group_type != group_misfit_task)
continue;
/*
@@ -7607,6 +9396,14 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
+ if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
+ ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) &&
+ env->src_rq->cfs.h_nr_running == 1 &&
+ cpu_overutilized(env->src_cpu) &&
+ !cpu_overutilized(env->dst_cpu)) {
+ return 1;
+ }
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
@@ -7655,7 +9452,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
int *continue_balancing)
{
int ld_moved, cur_ld_moved, active_balance = 0;
- struct sched_domain *sd_parent = sd->parent;
+ struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
struct sched_group *group;
struct rq *busiest;
unsigned long flags;
@@ -7722,6 +9519,7 @@ redo:
more_balance:
raw_spin_lock_irqsave(&busiest->lock, flags);
+ update_rq_clock(busiest);
/*
* cur_ld_moved - load moved in current iteration
@@ -7819,7 +9617,8 @@ more_balance:
* excessive cache_hot migrations and active balances.
*/
if (idle != CPU_NEWLY_IDLE)
- sd->nr_balance_failed++;
+ if (env.src_grp_nr_running > 1)
+ sd->nr_balance_failed++;
if (need_active_balance(&env)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
@@ -7956,8 +9755,9 @@ static int idle_balance(struct rq *this_rq)
*/
this_rq->idle_stamp = rq_clock(this_rq);
- if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !this_rq->rd->overload) {
+ if (!energy_aware() &&
+ (this_rq->avg_idle < sysctl_sched_migration_cost ||
+ !this_rq->rd->overload)) {
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
@@ -8048,8 +9848,18 @@ static int active_load_balance_cpu_stop(void *data)
int busiest_cpu = cpu_of(busiest_rq);
int target_cpu = busiest_rq->push_cpu;
struct rq *target_rq = cpu_rq(target_cpu);
- struct sched_domain *sd;
+ struct sched_domain *sd = NULL;
struct task_struct *p = NULL;
+ struct task_struct *push_task = NULL;
+ int push_task_detached = 0;
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
+ };
raw_spin_lock_irq(&busiest_rq->lock);
@@ -8069,6 +9879,17 @@ static int active_load_balance_cpu_stop(void *data)
*/
BUG_ON(busiest_rq == target_rq);
+ push_task = busiest_rq->push_task;
+ if (push_task) {
+ if (task_on_rq_queued(push_task) &&
+ task_cpu(push_task) == busiest_cpu &&
+ cpu_online(target_cpu)) {
+ detach_task(push_task, &env);
+ push_task_detached = 1;
+ }
+ goto out_unlock;
+ }
+
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) {
@@ -8078,16 +9899,9 @@ static int active_load_balance_cpu_stop(void *data)
}
if (likely(sd)) {
- struct lb_env env = {
- .sd = sd,
- .dst_cpu = target_cpu,
- .dst_rq = target_rq,
- .src_cpu = busiest_rq->cpu,
- .src_rq = busiest_rq,
- .idle = CPU_IDLE,
- };
-
+ env.sd = sd;
schedstat_inc(sd->alb_count);
+ update_rq_clock(busiest_rq);
p = detach_one_task(&env);
if (p) {
@@ -8101,8 +9915,18 @@ static int active_load_balance_cpu_stop(void *data)
rcu_read_unlock();
out_unlock:
busiest_rq->active_balance = 0;
+
+ if (push_task)
+ busiest_rq->push_task = NULL;
+
raw_spin_unlock(&busiest_rq->lock);
+ if (push_task) {
+ if (push_task_detached)
+ attach_one_task(target_rq, push_task);
+ put_task_struct(push_task);
+ }
+
if (p)
attach_one_task(target_rq, p);
@@ -8123,12 +9947,6 @@ static inline int on_null_domain(struct rq *rq)
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
*/
-static struct {
- cpumask_var_t idle_cpus_mask;
- atomic_t nr_cpus;
- unsigned long next_balance; /* in jiffy units */
-} nohz ____cacheline_aligned;
-
static inline int find_new_ilb(void)
{
int ilb = cpumask_first(nohz.idle_cpus_mask);
@@ -8462,9 +10280,14 @@ static inline bool nohz_kick_needed(struct rq *rq)
if (time_before(now, nohz.next_balance))
return false;
- if (rq->nr_running >= 2)
+ if (rq->nr_running >= 2 &&
+ (!energy_aware() || cpu_overutilized(cpu)))
return true;
+ /* Do idle load balance if there have misfit task */
+ if (energy_aware())
+ return rq->misfit_task;
+
rcu_read_lock();
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
if (sds) {
@@ -8558,6 +10381,47 @@ static void rq_offline_fair(struct rq *rq)
unthrottle_offline_cfs_rqs(rq);
}
+static inline int
+kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
+{
+ int rc = 0;
+
+ /* Invoke active balance to force migrate currently running task */
+ raw_spin_lock(&rq->lock);
+ if (!rq->active_balance) {
+ rq->active_balance = 1;
+ rq->push_cpu = new_cpu;
+ get_task_struct(p);
+ rq->push_task = p;
+ rc = 1;
+ }
+ raw_spin_unlock(&rq->lock);
+
+ return rc;
+}
+
+void check_for_migration(struct rq *rq, struct task_struct *p)
+{
+ int new_cpu;
+ int active_balance;
+ int cpu = task_cpu(p);
+
+ if (energy_aware() && rq->misfit_task) {
+ if (rq->curr->state != TASK_RUNNING ||
+ rq->curr->nr_cpus_allowed == 1)
+ return;
+
+ new_cpu = select_energy_cpu_brute(p, cpu, 0);
+ if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) {
+ active_balance = kick_active_balance(rq, p, new_cpu);
+ if (active_balance)
+ stop_one_cpu_nowait(cpu,
+ active_load_balance_cpu_stop,
+ rq, &rq->active_balance_work);
+ }
+ }
+}
+
#endif /* CONFIG_SMP */
/*
@@ -8575,6 +10439,16 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
+
+#ifdef CONFIG_SMP
+ if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
+ rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
+
+ rq->misfit_task = !task_fits_max(curr, rq->cpu);
+#endif
+
}
/*
@@ -8662,32 +10536,45 @@ static inline bool vruntime_normalized(struct task_struct *p)
return false;
}
-static void detach_task_cfs_rq(struct task_struct *p)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Propagate the changes of the sched_entity across the tg tree to make it
+ * visible to the root
+ */
+static void propagate_entity_cfs_rq(struct sched_entity *se)
{
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 now = cfs_rq_clock_task(cfs_rq);
+ struct cfs_rq *cfs_rq;
- if (!vruntime_normalized(p)) {
- /*
- * Fix up our vruntime so that the current sleep doesn't
- * cause 'unlimited' sleep bonus.
- */
- place_entity(cfs_rq, se, 0);
- se->vruntime -= cfs_rq->min_vruntime;
+ /* Start to propagate at parent */
+ se = se->parent;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+
+ update_load_avg(se, UPDATE_TG);
}
+}
+#else
+static void propagate_entity_cfs_rq(struct sched_entity *se) { }
+#endif
+
+static void detach_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
/* Catch up with the cfs_rq and remove our load when we leave */
- update_cfs_rq_load_avg(now, cfs_rq, false);
+ update_load_avg(se, 0);
detach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
}
-static void attach_task_cfs_rq(struct task_struct *p)
+static void attach_entity_cfs_rq(struct sched_entity *se)
{
- struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 now = cfs_rq_clock_task(cfs_rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
@@ -8697,10 +10584,36 @@ static void attach_task_cfs_rq(struct task_struct *p)
se->depth = se->parent ? se->parent->depth + 1 : 0;
#endif
- /* Synchronize task with its cfs_rq */
- update_cfs_rq_load_avg(now, cfs_rq, false);
+ /* Synchronize entity with its cfs_rq */
+ update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
+}
+
+static void detach_task_cfs_rq(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (!vruntime_normalized(p)) {
+ /*
+ * Fix up our vruntime so that the current sleep doesn't
+ * cause 'unlimited' sleep bonus.
+ */
+ place_entity(cfs_rq, se, 0);
+ se->vruntime -= cfs_rq->min_vruntime;
+ }
+
+ detach_entity_cfs_rq(se);
+}
+
+static void attach_task_cfs_rq(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ attach_entity_cfs_rq(se);
if (!vruntime_normalized(p))
se->vruntime += cfs_rq->min_vruntime;
@@ -8754,6 +10667,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ cfs_rq->propagate_avg = 0;
+#endif
atomic_long_set(&cfs_rq->removed_load_avg, 0);
atomic_long_set(&cfs_rq->removed_util_avg, 0);
#endif
@@ -8862,7 +10778,8 @@ void online_fair_sched_group(struct task_group *tg)
se = tg->se[i];
raw_spin_lock_irq(&rq->lock);
- post_init_entity_util_avg(se);
+ update_rq_clock(rq);
+ attach_entity_cfs_rq(se);
sync_throttle(tg, i);
raw_spin_unlock_irq(&rq->lock);
}
@@ -8954,8 +10871,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
/* Possible calls to update_curr() need rq clock */
update_rq_clock(rq);
- for_each_sched_entity(se)
- update_cfs_shares(group_cfs_rq(se));
+ for_each_sched_entity(se) {
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
+ }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1b3c8189b286..c7f1bdd4dcc8 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -74,3 +74,27 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
+/*
+ * Energy aware scheduling. Use platform energy model to guide scheduling
+ * decisions optimizing for energy efficiency.
+ */
+#ifdef CONFIG_DEFAULT_USE_ENERGY_AWARE
+SCHED_FEAT(ENERGY_AWARE, true)
+#else
+SCHED_FEAT(ENERGY_AWARE, false)
+#endif
+
+/*
+ * Minimum capacity capping. Keep track of minimum capacity factor when
+ * minimum frequency available to a policy is modified.
+ * If enabled, this can be used to inform the scheduler about capacity
+ * restrictions.
+ */
+SCHED_FEAT(MIN_CAPACITY_CAPPING, false)
+
+/*
+ * Enforce the priority of candidates selected by find_best_target()
+ * ON: If the target CPU saves any energy, use that.
+ * OFF: Use whichever of target or backup saves most.
+ */
+SCHED_FEAT(FBT_STRICT_ORDER, true)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1d8718d5300d..cf75f00f7037 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -23,9 +23,10 @@ extern char __cpuidle_text_start[], __cpuidle_text_end[];
* sched_idle_set_state - Record idle state for the current CPU.
* @idle_state: State to record.
*/
-void sched_idle_set_state(struct cpuidle_state *idle_state)
+void sched_idle_set_state(struct cpuidle_state *idle_state, int index)
{
idle_set_state(this_rq(), idle_state);
+ idle_set_state_idx(this_rq(), index);
}
static int __read_mostly cpu_idle_force_poll;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 9ab4d73e9cc9..5b4c9b49551a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -8,6 +8,8 @@
#include <linux/slab.h>
#include <linux/irq_work.h>
+#include "walt.h"
+
int sched_rr_timeslice = RR_TIMESLICE;
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
@@ -888,6 +890,51 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
return rt_task_of(rt_se)->prio;
}
+static void dump_throttled_rt_tasks(struct rt_rq *rt_rq)
+{
+ struct rt_prio_array *array = &rt_rq->active;
+ struct sched_rt_entity *rt_se;
+ char buf[500];
+ char *pos = buf;
+ char *end = buf + sizeof(buf);
+ int idx;
+
+ pos += snprintf(pos, sizeof(buf),
+ "sched: RT throttling activated for rt_rq %p (cpu %d)\n",
+ rt_rq, cpu_of(rq_of_rt_rq(rt_rq)));
+
+ if (bitmap_empty(array->bitmap, MAX_RT_PRIO))
+ goto out;
+
+ pos += snprintf(pos, end - pos, "potential CPU hogs:\n");
+ idx = sched_find_first_bit(array->bitmap);
+ while (idx < MAX_RT_PRIO) {
+ list_for_each_entry(rt_se, array->queue + idx, run_list) {
+ struct task_struct *p;
+
+ if (!rt_entity_is_task(rt_se))
+ continue;
+
+ p = rt_task_of(rt_se);
+ if (pos < end)
+ pos += snprintf(pos, end - pos, "\t%s (%d)\n",
+ p->comm, p->pid);
+ }
+ idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1);
+ }
+out:
+#ifdef CONFIG_PANIC_ON_RT_THROTTLING
+ /*
+ * Use pr_err() in the BUG() case since printk_sched() will
+ * not get flushed and deadlock is not a concern.
+ */
+ pr_err("%s", buf);
+ BUG();
+#else
+ printk_deferred("%s", buf);
+#endif
+}
+
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
u64 runtime = sched_rt_runtime(rt_rq);
@@ -911,8 +958,14 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
* but accrue some time due to boosting.
*/
if (likely(rt_b->rt_runtime)) {
+ static bool once = false;
+
rt_rq->rt_throttled = 1;
- printk_deferred_once("sched: RT throttling activated\n");
+
+ if (!once) {
+ once = true;
+ dump_throttled_rt_tasks(rt_rq);
+ }
} else {
/*
* In case we did anyway, make it go away,
@@ -1313,6 +1366,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
rt_se->timeout = 0;
enqueue_rt_entity(rt_se, flags);
+ walt_inc_cumulative_runnable_avg(rq, p);
if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
enqueue_pushable_task(rq, p);
@@ -1324,6 +1378,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
update_curr_rt(rq);
dequeue_rt_entity(rt_se, flags);
+ walt_dec_cumulative_runnable_avg(rq, p);
dequeue_pushable_task(rq, p);
}
@@ -1833,7 +1888,9 @@ retry:
}
deactivate_task(rq, next_task, 0);
+ next_task->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(next_task, lowest_rq->cpu);
+ next_task->on_rq = TASK_ON_RQ_QUEUED;
activate_task(lowest_rq, next_task, 0);
ret = 1;
@@ -2105,7 +2162,9 @@ static void pull_rt_task(struct rq *this_rq)
resched = true;
deactivate_task(src_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, this_cpu);
+ p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(this_rq, p, 0);
/*
* We continue with the search, just in
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ec6e838e991a..40bc5aaef223 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -40,8 +40,10 @@ extern long calc_load_fold_active(struct rq *this_rq, long adjust);
#ifdef CONFIG_SMP
extern void cpu_load_update_active(struct rq *this_rq);
+extern void check_for_migration(struct rq *rq, struct task_struct *p);
#else
static inline void cpu_load_update_active(struct rq *this_rq) { }
+static inline void check_for_migration(struct rq *rq, struct task_struct *p) { }
#endif
#ifdef CONFIG_SCHED_SMT
@@ -413,6 +415,7 @@ struct cfs_rq {
unsigned long runnable_load_avg;
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
+ unsigned long propagate_avg;
#endif
atomic_long_t removed_load_avg, removed_util_avg;
#ifndef CONFIG_64BIT
@@ -456,6 +459,9 @@ struct cfs_rq {
u64 throttled_clock_task_time;
int throttled, throttle_count;
struct list_head throttled_list;
+#ifdef CONFIG_SCHED_WALT
+ u64 cumulative_runnable_avg;
+#endif /* CONFIG_SCHED_WALT */
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -542,6 +548,12 @@ struct dl_rq {
#ifdef CONFIG_SMP
+struct max_cpu_capacity {
+ raw_spinlock_t lock;
+ unsigned long val;
+ int cpu;
+};
+
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
@@ -560,6 +572,9 @@ struct root_domain {
/* Indicate more than one runnable task for any CPU */
bool overload;
+ /* Indicate one or more cpus over-utilized (tipping point) */
+ bool overutilized;
+
/*
* The bit corresponding to a CPU gets set here if such CPU has more
* than one runnable -deadline task (as it is below for RT tasks).
@@ -589,7 +604,11 @@ struct root_domain {
cpumask_var_t rto_mask;
struct cpupri cpupri;
- unsigned long max_cpu_capacity;
+ /* Maximum cpu capacity in the system. */
+ struct max_cpu_capacity max_cpu_capacity;
+
+ /* First cpu with maximum and minimum original capacity */
+ int max_cap_orig_cpu, min_cap_orig_cpu;
};
extern struct root_domain def_root_domain;
@@ -623,6 +642,7 @@ struct rq {
#endif
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+ unsigned int misfit_task;
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMP
unsigned long last_load_update_tick;
@@ -632,6 +652,14 @@ struct rq {
#ifdef CONFIG_NO_HZ_FULL
unsigned long last_sched_tick;
#endif
+
+#ifdef CONFIG_CPU_QUIET
+ /* time-based average load */
+ u64 nr_last_stamp;
+ u64 nr_running_integral;
+ seqcount_t ave_seqcnt;
+#endif
+
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
unsigned long nr_load_updates;
@@ -644,6 +672,7 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
+ struct list_head *tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
@@ -677,6 +706,7 @@ struct rq {
/* For active balancing */
int active_balance;
int push_cpu;
+ struct task_struct *push_task;
struct cpu_stop_work active_balance_work;
/* cpu of this runqueue: */
int cpu;
@@ -693,6 +723,20 @@ struct rq {
u64 max_idle_balance_cost;
#endif
+#ifdef CONFIG_SCHED_WALT
+ u64 cumulative_runnable_avg;
+ u64 window_start;
+ u64 curr_runnable_sum;
+ u64 prev_runnable_sum;
+ u64 nt_curr_runnable_sum;
+ u64 nt_prev_runnable_sum;
+ u64 cur_irqload;
+ u64 avg_irqload;
+ u64 irqload_ts;
+ u64 cum_window_demand;
+#endif /* CONFIG_SCHED_WALT */
+
+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
@@ -731,6 +775,9 @@ struct rq {
/* try_to_wake_up() stats */
unsigned int ttwu_count;
unsigned int ttwu_local;
+#ifdef CONFIG_SMP
+ struct eas_stats eas_stats;
+#endif
#endif
#ifdef CONFIG_SMP
@@ -740,6 +787,7 @@ struct rq {
#ifdef CONFIG_CPU_IDLE
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
+ int idle_state_idx;
#endif
};
@@ -889,6 +937,8 @@ DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+DECLARE_PER_CPU(struct sched_domain *, sd_ea);
+DECLARE_PER_CPU(struct sched_domain *, sd_scs);
struct sched_group_capacity {
atomic_t ref;
@@ -896,7 +946,9 @@ struct sched_group_capacity {
* CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
* for a single CPU.
*/
- unsigned int capacity;
+ unsigned long capacity;
+ unsigned long max_capacity; /* Max per-cpu capacity in group */
+ unsigned long min_capacity; /* Min per-CPU capacity in group */
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
@@ -909,6 +961,7 @@ struct sched_group {
unsigned int group_weight;
struct sched_group_capacity *sgc;
+ const struct sched_group_energy *sge;
/*
* The CPUs this group covers.
@@ -1217,6 +1270,7 @@ extern const u32 sched_prio_to_wmult[40];
#else
#define ENQUEUE_MIGRATED 0x00
#endif
+#define ENQUEUE_WAKEUP_NEW 0x40
#define RETRY_TASK ((void *)-1UL)
@@ -1307,6 +1361,7 @@ extern const struct sched_class idle_sched_class;
#ifdef CONFIG_SMP
+extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc);
extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
@@ -1327,6 +1382,17 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
SCHED_WARN_ON(!rcu_read_lock_held());
return rq->idle_state;
}
+
+static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx)
+{
+ rq->idle_state_idx = idle_state_idx;
+}
+
+static inline int idle_get_state_idx(struct rq *rq)
+{
+ WARN_ON(!rcu_read_lock_held());
+ return rq->idle_state_idx;
+}
#else
static inline void idle_set_state(struct rq *rq,
struct cpuidle_state *idle_state)
@@ -1337,6 +1403,15 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
return NULL;
}
+
+static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx)
+{
+}
+
+static inline int idle_get_state_idx(struct rq *rq)
+{
+ return -1;
+}
#endif
extern void sysrq_sched_debug_show(void);
@@ -1391,7 +1466,7 @@ static inline void sched_update_tick_dependency(struct rq *rq)
static inline void sched_update_tick_dependency(struct rq *rq) { }
#endif
-static inline void add_nr_running(struct rq *rq, unsigned count)
+static inline void __add_nr_running(struct rq *rq, unsigned count)
{
unsigned prev_nr = rq->nr_running;
@@ -1407,13 +1482,50 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
sched_update_tick_dependency(rq);
}
-static inline void sub_nr_running(struct rq *rq, unsigned count)
+static inline void __sub_nr_running(struct rq *rq, unsigned count)
{
rq->nr_running -= count;
/* Check if we still need preemption */
sched_update_tick_dependency(rq);
}
+#ifdef CONFIG_CPU_QUIET
+#define NR_AVE_SCALE(x) ((x) << FSHIFT)
+static inline u64 do_nr_running_integral(struct rq *rq)
+{
+ s64 nr, deltax;
+ u64 nr_running_integral = rq->nr_running_integral;
+
+ deltax = rq->clock_task - rq->nr_last_stamp;
+ nr = NR_AVE_SCALE(rq->nr_running);
+
+ nr_running_integral += nr * deltax;
+
+ return nr_running_integral;
+}
+
+static inline void add_nr_running(struct rq *rq, unsigned count)
+{
+ write_seqcount_begin(&rq->ave_seqcnt);
+ rq->nr_running_integral = do_nr_running_integral(rq);
+ rq->nr_last_stamp = rq->clock_task;
+ __add_nr_running(rq, count);
+ write_seqcount_end(&rq->ave_seqcnt);
+}
+
+static inline void sub_nr_running(struct rq *rq, unsigned count)
+{
+ write_seqcount_begin(&rq->ave_seqcnt);
+ rq->nr_running_integral = do_nr_running_integral(rq);
+ rq->nr_last_stamp = rq->clock_task;
+ __sub_nr_running(rq, count);
+ write_seqcount_end(&rq->ave_seqcnt);
+}
+#else
+#define add_nr_running __add_nr_running
+#define sub_nr_running __sub_nr_running
+#endif
+
static inline void rq_last_tick_reset(struct rq *rq)
{
#ifdef CONFIG_NO_HZ_FULL
@@ -1475,6 +1587,26 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
}
#endif
+#ifndef arch_scale_max_freq_capacity
+static __always_inline
+unsigned long arch_scale_max_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
+
+#ifndef arch_scale_min_freq_capacity
+static __always_inline
+unsigned long arch_scale_min_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ /*
+ * Multiplied with any capacity value, this scale factor will return
+ * 0, which represents an un-capped state
+ */
+ return 0;
+}
+#endif
+
#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
@@ -1486,10 +1618,87 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
}
#endif
+#ifdef CONFIG_SMP
+static inline unsigned long capacity_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity;
+}
+
+static inline unsigned long capacity_orig_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_orig;
+}
+
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int walt_ravg_window;
+extern bool walt_disabled;
+
+/*
+ * cpu_util returns the amount of capacity of a CPU that is used by CFS
+ * tasks. The unit of the return value must be the one of capacity so we can
+ * compare the utilization with the capacity of the CPU that is available for
+ * CFS task (ie cpu_capacity).
+ *
+ * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on a CPU. It represents
+ * the amount of utilization of a CPU in the range [0..capacity_orig] where
+ * capacity_orig is the cpu_capacity available at the highest frequency
+ * (arch_scale_freq_capacity()).
+ * The utilization of a CPU converges towards a sum equal to or less than the
+ * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
+ * the running time on this CPU scaled by capacity_curr.
+ *
+ * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * higher than capacity_orig because of unfortunate rounding in
+ * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * the average stabilizes with the new running time. We need to check that the
+ * utilization stays within the range of [0..capacity_orig] and cap it if
+ * necessary. Without utilization capping, a group could be seen as overloaded
+ * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
+ * available capacity. We allow utilization to overshoot capacity_curr (but not
+ * capacity_orig) as it useful for predicting the capacity required after task
+ * migrations (scheduler-driven DVFS).
+ */
+static inline unsigned long __cpu_util(int cpu, int delta)
+{
+ unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
+ unsigned long capacity = capacity_orig_of(cpu);
+
+#ifdef CONFIG_SCHED_WALT
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
+ util = div64_u64(cpu_rq(cpu)->cumulative_runnable_avg,
+ walt_ravg_window >> SCHED_CAPACITY_SHIFT);
+#endif
+ delta += util;
+ if (delta < 0)
+ return 0;
+
+ return (delta >= capacity) ? capacity : delta;
+}
+
+static inline unsigned long cpu_util(int cpu)
+{
+ return __cpu_util(cpu, 0);
+}
+
+static inline unsigned long cpu_util_freq(int cpu)
+{
+ unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
+ unsigned long capacity = capacity_orig_of(cpu);
+
+#ifdef CONFIG_SCHED_WALT
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
+ util = div64_u64(cpu_rq(cpu)->prev_runnable_sum,
+ walt_ravg_window >> SCHED_CAPACITY_SHIFT);
+#endif
+ return (util >= capacity) ? capacity : util;
+}
+
+#endif
+
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
- sched_avg_update(rq);
}
#else
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
@@ -1524,6 +1733,9 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
+extern struct rq *lock_rq_of(struct task_struct *p, struct rq_flags *flags);
+extern void unlock_rq_of(struct rq *rq, struct task_struct *p, struct rq_flags *flags);
+
#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT
@@ -1596,7 +1808,8 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
- raw_spin_unlock(&busiest->lock);
+ if (this_rq != busiest)
+ raw_spin_unlock(&busiest->lock);
lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
@@ -1817,6 +2030,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
+#ifdef CONFIG_SCHED_WALT
+
+static inline bool
+walt_task_in_cum_window_demand(struct rq *rq, struct task_struct *p)
+{
+ return cpu_of(rq) == task_cpu(p) &&
+ (p->on_rq || p->last_sleep_ts >= rq->window_start);
+}
+
+#endif /* CONFIG_SCHED_WALT */
+
#ifdef arch_scale_freq_capacity
#ifndef arch_scale_freq_invariant
#define arch_scale_freq_invariant() (true)
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 87e2c9f0c33e..6d74a7c77c8c 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -12,6 +12,28 @@
*/
#define SCHEDSTAT_VERSION 15
+#ifdef CONFIG_SMP
+static inline void show_easstat(struct seq_file *seq, struct eas_stats *stats)
+{
+ /* eas-specific runqueue stats */
+ seq_printf(seq, "eas %llu %llu %llu %llu %llu %llu ",
+ stats->sis_attempts, stats->sis_idle, stats->sis_cache_affine,
+ stats->sis_suff_cap, stats->sis_idle_cpu, stats->sis_count);
+
+ seq_printf(seq, "%llu %llu %llu %llu %llu %llu %llu ",
+ stats->secb_attempts, stats->secb_sync, stats->secb_idle_bt,
+ stats->secb_insuff_cap, stats->secb_no_nrg_sav,
+ stats->secb_nrg_sav, stats->secb_count);
+
+ seq_printf(seq, "%llu %llu %llu %llu %llu ",
+ stats->fbt_attempts, stats->fbt_no_cpu, stats->fbt_no_sd,
+ stats->fbt_pref_idle, stats->fbt_count);
+
+ seq_printf(seq, "%llu %llu\n",
+ stats->cas_attempts, stats->cas_count);
+}
+#endif
+
static int show_schedstat(struct seq_file *seq, void *v)
{
int cpu;
@@ -40,6 +62,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
#ifdef CONFIG_SMP
+ show_easstat(seq, &rq->eas_stats);
+
/* domain-specific stats */
rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -66,6 +90,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
sd->ttwu_wake_remote, sd->ttwu_move_affine,
sd->ttwu_move_balance);
+
+ show_easstat(seq, &sd->eas_stats);
}
rcu_read_unlock();
#endif
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 604297a08b3a..836a3894cf57 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -1,4 +1,5 @@
#include "sched.h"
+#include "walt.h"
/*
* stop-task scheduling class.
@@ -42,12 +43,14 @@ static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
add_nr_running(rq, 1);
+ walt_inc_cumulative_runnable_avg(rq, p);
}
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
sub_nr_running(rq, 1);
+ walt_dec_cumulative_runnable_avg(rq, p);
}
static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
new file mode 100644
index 000000000000..78b99bb84010
--- /dev/null
+++ b/kernel/sched/tune.c
@@ -0,0 +1,1027 @@
+#include <linux/cgroup.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+
+#include <trace/events/sched.h>
+
+#include "sched.h"
+#include "tune.h"
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+bool schedtune_initialized = false;
+#endif
+
+unsigned int sysctl_sched_cfs_boost __read_mostly;
+
+extern struct reciprocal_value schedtune_spc_rdiv;
+struct target_nrg schedtune_target_nrg;
+
+/* Performance Boost region (B) threshold params */
+static int perf_boost_idx;
+
+/* Performance Constraint region (C) threshold params */
+static int perf_constrain_idx;
+
+/**
+ * Performance-Energy (P-E) Space thresholds constants
+ */
+struct threshold_params {
+ int nrg_gain;
+ int cap_gain;
+};
+
+/*
+ * System specific P-E space thresholds constants
+ */
+static struct threshold_params
+threshold_gains[] = {
+ { 0, 5 }, /* < 10% */
+ { 1, 5 }, /* < 20% */
+ { 2, 5 }, /* < 30% */
+ { 3, 5 }, /* < 40% */
+ { 4, 5 }, /* < 50% */
+ { 5, 4 }, /* < 60% */
+ { 5, 3 }, /* < 70% */
+ { 5, 2 }, /* < 80% */
+ { 5, 1 }, /* < 90% */
+ { 5, 0 } /* <= 100% */
+};
+
+static int
+__schedtune_accept_deltas(int nrg_delta, int cap_delta,
+ int perf_boost_idx, int perf_constrain_idx)
+{
+ int payoff = -INT_MAX;
+ int gain_idx = -1;
+
+ /* Performance Boost (B) region */
+ if (nrg_delta >= 0 && cap_delta > 0)
+ gain_idx = perf_boost_idx;
+ /* Performance Constraint (C) region */
+ else if (nrg_delta < 0 && cap_delta <= 0)
+ gain_idx = perf_constrain_idx;
+
+ /* Default: reject schedule candidate */
+ if (gain_idx == -1)
+ return payoff;
+
+ /*
+ * Evaluate "Performance Boost" vs "Energy Increase"
+ *
+ * - Performance Boost (B) region
+ *
+ * Condition: nrg_delta > 0 && cap_delta > 0
+ * Payoff criteria:
+ * cap_gain / nrg_gain < cap_delta / nrg_delta =
+ * cap_gain * nrg_delta < cap_delta * nrg_gain
+ * Note that since both nrg_gain and nrg_delta are positive, the
+ * inequality does not change. Thus:
+ *
+ * payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
+ *
+ * - Performance Constraint (C) region
+ *
+ * Condition: nrg_delta < 0 && cap_delta < 0
+ * payoff criteria:
+ * cap_gain / nrg_gain > cap_delta / nrg_delta =
+ * cap_gain * nrg_delta < cap_delta * nrg_gain
+ * Note that since nrg_gain > 0 while nrg_delta < 0, the
+ * inequality change. Thus:
+ *
+ * payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
+ *
+ * This means that, in case of same positive defined {cap,nrg}_gain
+ * for both the B and C regions, we can use the same payoff formula
+ * where a positive value represents the accept condition.
+ */
+ payoff = cap_delta * threshold_gains[gain_idx].nrg_gain;
+ payoff -= nrg_delta * threshold_gains[gain_idx].cap_gain;
+
+ return payoff;
+}
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+
+/*
+ * EAS scheduler tunables for task groups.
+ *
+ * When CGroup support is enabled, we have to synchronize two different
+ * paths:
+ * - slow path: where CGroups are created/updated/removed
+ * - fast path: where tasks in a CGroups are accounted
+ *
+ * The slow path tracks (a limited number of) CGroups and maps each on a
+ * "boost_group" index. The fastpath accounts tasks currently RUNNABLE on each
+ * "boost_group".
+ *
+ * Once a new CGroup is created, a boost group idx is assigned and the
+ * corresponding "boost_group" marked as valid on each CPU.
+ * Once a CGroup is release, the corresponding "boost_group" is marked as
+ * invalid on each CPU. The CPU boost value (boost_max) is aggregated by
+ * considering only valid boost_groups with a non null tasks counter.
+ *
+ * .:: Locking strategy
+ *
+ * The fast path uses a spin lock for each CPU boost_group which protects the
+ * tasks counter.
+ *
+ * The "valid" and "boost" values of each CPU boost_group is instead
+ * protected by the RCU lock provided by the CGroups callbacks. Thus, only the
+ * slow path can access and modify the boost_group attribtues of each CPU.
+ * The fast path will catch up the most updated values at the next scheduling
+ * event (i.e. enqueue/dequeue).
+ *
+ * |
+ * SLOW PATH | FAST PATH
+ * CGroup add/update/remove | Scheduler enqueue/dequeue events
+ * |
+ * |
+ * | DEFINE_PER_CPU(struct boost_groups)
+ * | +--------------+----+---+----+----+
+ * | | idle | | | | |
+ * | | boost_max | | | | |
+ * | +---->lock | | | | |
+ * struct schedtune allocated_groups | | | group[ ] | | | | |
+ * +------------------------------+ +-------+ | | +--+---------+-+----+---+----+----+
+ * | idx | | | | | | valid |
+ * | boots / prefer_idle | | | | | | boost |
+ * | perf_{boost/constraints}_idx | <---------+(*) | | | | tasks | <------------+
+ * | css | +-------+ | | +---------+ |
+ * +-+----------------------------+ | | | | | | |
+ * ^ | | | | | | |
+ * | +-------+ | | +---------+ |
+ * | | | | | | | |
+ * | | | | | | | |
+ * | +-------+ | | +---------+ |
+ * | zmalloc | | | | | | |
+ * | | | | | | | |
+ * | +-------+ | | +---------+ |
+ * + BOOSTGROUPS_COUNT | | BOOSTGROUPS_COUNT |
+ * schedtune_boostgroup_init() | + |
+ * | schedtune_{en,de}queue_task() |
+ * | +
+ * | schedtune_tasks_update()
+ * |
+ */
+
+/* SchdTune tunables for a group of tasks */
+struct schedtune {
+ /* SchedTune CGroup subsystem */
+ struct cgroup_subsys_state css;
+
+ /* Boost group allocated ID */
+ int idx;
+
+ /* Boost value for tasks on that SchedTune CGroup */
+ int boost;
+
+ /* Performance Boost (B) region threshold params */
+ int perf_boost_idx;
+
+ /* Performance Constraint (C) region threshold params */
+ int perf_constrain_idx;
+
+ /* Hint to bias scheduling of tasks on that SchedTune CGroup
+ * towards idle CPUs */
+ int prefer_idle;
+};
+
+static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct schedtune, css) : NULL;
+}
+
+static inline struct schedtune *task_schedtune(struct task_struct *tsk)
+{
+ return css_st(task_css(tsk, schedtune_cgrp_id));
+}
+
+static inline struct schedtune *parent_st(struct schedtune *st)
+{
+ return css_st(st->css.parent);
+}
+
+/*
+ * SchedTune root control group
+ * The root control group is used to defined a system-wide boosting tuning,
+ * which is applied to all tasks in the system.
+ * Task specific boost tuning could be specified by creating and
+ * configuring a child control group under the root one.
+ * By default, system-wide boosting is disabled, i.e. no boosting is applied
+ * to tasks which are not into a child control group.
+ */
+static struct schedtune
+root_schedtune = {
+ .boost = 0,
+ .perf_boost_idx = 0,
+ .perf_constrain_idx = 0,
+ .prefer_idle = 0,
+};
+
+int
+schedtune_accept_deltas(int nrg_delta, int cap_delta,
+ struct task_struct *task)
+{
+ struct schedtune *ct;
+ int perf_boost_idx;
+ int perf_constrain_idx;
+
+ /* Optimal (O) region */
+ if (nrg_delta < 0 && cap_delta > 0) {
+ trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
+ return INT_MAX;
+ }
+
+ /* Suboptimal (S) region */
+ if (nrg_delta > 0 && cap_delta < 0) {
+ trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
+ return -INT_MAX;
+ }
+
+ /* Get task specific perf Boost/Constraints indexes */
+ rcu_read_lock();
+ ct = task_schedtune(task);
+ perf_boost_idx = ct->perf_boost_idx;
+ perf_constrain_idx = ct->perf_constrain_idx;
+ rcu_read_unlock();
+
+ return __schedtune_accept_deltas(nrg_delta, cap_delta,
+ perf_boost_idx, perf_constrain_idx);
+}
+
+/*
+ * Maximum number of boost groups to support
+ * When per-task boosting is used we still allow only limited number of
+ * boost groups for two main reasons:
+ * 1. on a real system we usually have only few classes of workloads which
+ * make sense to boost with different values (e.g. background vs foreground
+ * tasks, interactive vs low-priority tasks)
+ * 2. a limited number allows for a simpler and more memory/time efficient
+ * implementation especially for the computation of the per-CPU boost
+ * value
+ */
+#define BOOSTGROUPS_COUNT 5
+
+/* Array of configured boostgroups */
+static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
+ &root_schedtune,
+ NULL,
+};
+
+/* SchedTune boost groups
+ * Keep track of all the boost groups which impact on CPU, for example when a
+ * CPU has two RUNNABLE tasks belonging to two different boost groups and thus
+ * likely with different boost values.
+ * Since on each system we expect only a limited number of boost groups, here
+ * we use a simple array to keep track of the metrics required to compute the
+ * maximum per-CPU boosting value.
+ */
+struct boost_groups {
+ /* Maximum boost value for all RUNNABLE tasks on a CPU */
+ int boost_max;
+ struct {
+ /* True when this boost group maps an actual cgroup */
+ bool valid;
+ /* The boost for tasks on that boost group */
+ int boost;
+ /* Count of RUNNABLE tasks on that boost group */
+ unsigned tasks;
+ } group[BOOSTGROUPS_COUNT];
+ /* CPU's boost group locking */
+ raw_spinlock_t lock;
+};
+
+/* Boost groups affecting each CPU in the system */
+DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
+
+static void
+schedtune_cpu_update(int cpu)
+{
+ struct boost_groups *bg;
+ int boost_max;
+ int idx;
+
+ bg = &per_cpu(cpu_boost_groups, cpu);
+
+ /* The root boost group is always active */
+ boost_max = bg->group[0].boost;
+ for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
+
+ /* Ignore non boostgroups not mapping a cgroup */
+ if (!bg->group[idx].valid)
+ continue;
+
+ /*
+ * A boost group affects a CPU only if it has
+ * RUNNABLE tasks on that CPU
+ */
+ if (bg->group[idx].tasks == 0)
+ continue;
+
+ boost_max = max(boost_max, bg->group[idx].boost);
+ }
+
+ /* Ensures boost_max is non-negative when all cgroup boost values
+ * are neagtive. Avoids under-accounting of cpu capacity which may cause
+ * task stacking and frequency spikes.*/
+ boost_max = max(boost_max, 0);
+ bg->boost_max = boost_max;
+}
+
+static int
+schedtune_boostgroup_update(int idx, int boost)
+{
+ struct boost_groups *bg;
+ int cur_boost_max;
+ int old_boost;
+ int cpu;
+
+ /* Update per CPU boost groups */
+ for_each_possible_cpu(cpu) {
+ bg = &per_cpu(cpu_boost_groups, cpu);
+
+ /* CGroups are never associated to non active cgroups */
+ BUG_ON(!bg->group[idx].valid);
+
+ /*
+ * Keep track of current boost values to compute the per CPU
+ * maximum only when it has been affected by the new value of
+ * the updated boost group
+ */
+ cur_boost_max = bg->boost_max;
+ old_boost = bg->group[idx].boost;
+
+ /* Update the boost value of this boost group */
+ bg->group[idx].boost = boost;
+
+ /* Check if this update increase current max */
+ if (boost > cur_boost_max && bg->group[idx].tasks) {
+ bg->boost_max = boost;
+ trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
+ continue;
+ }
+
+ /* Check if this update has decreased current max */
+ if (cur_boost_max == old_boost && old_boost > boost) {
+ schedtune_cpu_update(cpu);
+ trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
+ continue;
+ }
+
+ trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max);
+ }
+
+ return 0;
+}
+
+#define ENQUEUE_TASK 1
+#define DEQUEUE_TASK -1
+
+static inline void
+schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
+{
+ struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+ int tasks = bg->group[idx].tasks + task_count;
+
+ /* Update boosted tasks count while avoiding to make it negative */
+ bg->group[idx].tasks = max(0, tasks);
+
+ trace_sched_tune_tasks_update(p, cpu, tasks, idx,
+ bg->group[idx].boost, bg->boost_max);
+
+ /* Boost group activation or deactivation on that RQ */
+ if (tasks == 1 || tasks == 0)
+ schedtune_cpu_update(cpu);
+}
+
+/*
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_enqueue_task(struct task_struct *p, int cpu)
+{
+ struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+ unsigned long irq_flags;
+ struct schedtune *st;
+ int idx;
+
+ if (!unlikely(schedtune_initialized))
+ return;
+
+ /*
+ * When a task is marked PF_EXITING by do_exit() it's going to be
+ * dequeued and enqueued multiple times in the exit path.
+ * Thus we avoid any further update, since we do not want to change
+ * CPU boosting while the task is exiting.
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ /*
+ * Boost group accouting is protected by a per-cpu lock and requires
+ * interrupt to be disabled to avoid race conditions for example on
+ * do_exit()::cgroup_exit() and task migration.
+ */
+ raw_spin_lock_irqsave(&bg->lock, irq_flags);
+ rcu_read_lock();
+
+ st = task_schedtune(p);
+ idx = st->idx;
+
+ schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);
+
+ rcu_read_unlock();
+ raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+int schedtune_can_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
+ struct boost_groups *bg;
+ struct rq_flags irq_flags;
+ unsigned int cpu;
+ struct rq *rq;
+ int src_bg; /* Source boost group index */
+ int dst_bg; /* Destination boost group index */
+ int tasks;
+
+ if (!unlikely(schedtune_initialized))
+ return 0;
+
+
+ cgroup_taskset_for_each(task, css, tset) {
+
+ /*
+ * Lock the CPU's RQ the task is enqueued to avoid race
+ * conditions with migration code while the task is being
+ * accounted
+ */
+ rq = lock_rq_of(task, &irq_flags);
+
+ if (!task->on_rq) {
+ unlock_rq_of(rq, task, &irq_flags);
+ continue;
+ }
+
+ /*
+ * Boost group accouting is protected by a per-cpu lock and requires
+ * interrupt to be disabled to avoid race conditions on...
+ */
+ cpu = cpu_of(rq);
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ raw_spin_lock(&bg->lock);
+
+ dst_bg = css_st(css)->idx;
+ src_bg = task_schedtune(task)->idx;
+
+ /*
+ * Current task is not changing boostgroup, which can
+ * happen when the new hierarchy is in use.
+ */
+ if (unlikely(dst_bg == src_bg)) {
+ raw_spin_unlock(&bg->lock);
+ unlock_rq_of(rq, task, &irq_flags);
+ continue;
+ }
+
+ /*
+ * This is the case of a RUNNABLE task which is switching its
+ * current boost group.
+ */
+
+ /* Move task from src to dst boost group */
+ tasks = bg->group[src_bg].tasks - 1;
+ bg->group[src_bg].tasks = max(0, tasks);
+ bg->group[dst_bg].tasks += 1;
+
+ raw_spin_unlock(&bg->lock);
+ unlock_rq_of(rq, task, &irq_flags);
+
+ /* Update CPU boost group */
+ if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
+ schedtune_cpu_update(task_cpu(task));
+
+ }
+
+ return 0;
+}
+
+void schedtune_cancel_attach(struct cgroup_taskset *tset)
+{
+ /* This can happen only if SchedTune controller is mounted with
+ * other hierarchies ane one of them fails. Since usually SchedTune is
+ * mouted on its own hierarcy, for the time being we do not implement
+ * a proper rollback mechanism */
+ WARN(1, "SchedTune cancel attach not implemented");
+}
+
+/*
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_dequeue_task(struct task_struct *p, int cpu)
+{
+ struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+ unsigned long irq_flags;
+ struct schedtune *st;
+ int idx;
+
+ if (!unlikely(schedtune_initialized))
+ return;
+
+ /*
+ * When a task is marked PF_EXITING by do_exit() it's going to be
+ * dequeued and enqueued multiple times in the exit path.
+ * Thus we avoid any further update, since we do not want to change
+ * CPU boosting while the task is exiting.
+ * The last dequeue is already enforce by the do_exit() code path
+ * via schedtune_exit_task().
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ /*
+ * Boost group accouting is protected by a per-cpu lock and requires
+ * interrupt to be disabled to avoid race conditions on...
+ */
+ raw_spin_lock_irqsave(&bg->lock, irq_flags);
+ rcu_read_lock();
+
+ st = task_schedtune(p);
+ idx = st->idx;
+
+ schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);
+
+ rcu_read_unlock();
+ raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+void schedtune_exit_task(struct task_struct *tsk)
+{
+ struct schedtune *st;
+ struct rq_flags irq_flags;
+ unsigned int cpu;
+ struct rq *rq;
+ int idx;
+
+ if (!unlikely(schedtune_initialized))
+ return;
+
+ rq = lock_rq_of(tsk, &irq_flags);
+ rcu_read_lock();
+
+ cpu = cpu_of(rq);
+ st = task_schedtune(tsk);
+ idx = st->idx;
+ schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK);
+
+ rcu_read_unlock();
+ unlock_rq_of(rq, tsk, &irq_flags);
+}
+
+int schedtune_cpu_boost(int cpu)
+{
+ struct boost_groups *bg;
+
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ return bg->boost_max;
+}
+
+int schedtune_task_boost(struct task_struct *p)
+{
+ struct schedtune *st;
+ int task_boost;
+
+ if (!unlikely(schedtune_initialized))
+ return 0;
+
+ /* Get task boost value */
+ rcu_read_lock();
+ st = task_schedtune(p);
+ task_boost = st->boost;
+ rcu_read_unlock();
+
+ return task_boost;
+}
+
+int schedtune_prefer_idle(struct task_struct *p)
+{
+ struct schedtune *st;
+ int prefer_idle;
+
+ if (!unlikely(schedtune_initialized))
+ return 0;
+
+ /* Get prefer_idle value */
+ rcu_read_lock();
+ st = task_schedtune(p);
+ prefer_idle = st->prefer_idle;
+ rcu_read_unlock();
+
+ return prefer_idle;
+}
+
+static u64
+prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct schedtune *st = css_st(css);
+
+ return st->prefer_idle;
+}
+
+static int
+prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 prefer_idle)
+{
+ struct schedtune *st = css_st(css);
+ st->prefer_idle = !!prefer_idle;
+
+ return 0;
+}
+
+static s64
+boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct schedtune *st = css_st(css);
+
+ return st->boost;
+}
+
+static int
+boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
+ s64 boost)
+{
+ struct schedtune *st = css_st(css);
+ unsigned threshold_idx;
+ int boost_pct;
+
+ if (boost < -100 || boost > 100)
+ return -EINVAL;
+ boost_pct = boost;
+
+ /*
+ * Update threshold params for Performance Boost (B)
+ * and Performance Constraint (C) regions.
+ * The current implementatio uses the same cuts for both
+ * B and C regions.
+ */
+ threshold_idx = clamp(boost_pct, 0, 99) / 10;
+ st->perf_boost_idx = threshold_idx;
+ st->perf_constrain_idx = threshold_idx;
+
+ st->boost = boost;
+ if (css == &root_schedtune.css) {
+ sysctl_sched_cfs_boost = boost;
+ perf_boost_idx = threshold_idx;
+ perf_constrain_idx = threshold_idx;
+ }
+
+ /* Update CPU boost */
+ schedtune_boostgroup_update(st->idx, st->boost);
+
+ trace_sched_tune_config(st->boost);
+
+ return 0;
+}
+
+static struct cftype files[] = {
+ {
+ .name = "boost",
+ .read_s64 = boost_read,
+ .write_s64 = boost_write,
+ },
+ {
+ .name = "prefer_idle",
+ .read_u64 = prefer_idle_read,
+ .write_u64 = prefer_idle_write,
+ },
+ { } /* terminate */
+};
+
+static void
+schedtune_boostgroup_init(struct schedtune *st, int idx)
+{
+ struct boost_groups *bg;
+ int cpu;
+
+ /* Initialize per CPUs boost group support */
+ for_each_possible_cpu(cpu) {
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ bg->group[idx].boost = 0;
+ bg->group[idx].valid = true;
+ }
+
+ /* Keep track of allocated boost groups */
+ allocated_group[idx] = st;
+ st->idx = idx;
+}
+
+static struct cgroup_subsys_state *
+schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct schedtune *st;
+ int idx;
+
+ if (!parent_css)
+ return &root_schedtune.css;
+
+ /* Allow only single level hierachies */
+ if (parent_css != &root_schedtune.css) {
+ pr_err("Nested SchedTune boosting groups not allowed\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* Allow only a limited number of boosting groups */
+ for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
+ if (!allocated_group[idx])
+ break;
+ if (idx == BOOSTGROUPS_COUNT) {
+ pr_err("Trying to create more than %d SchedTune boosting groups\n",
+ BOOSTGROUPS_COUNT);
+ return ERR_PTR(-ENOSPC);
+ }
+
+ st = kzalloc(sizeof(*st), GFP_KERNEL);
+ if (!st)
+ goto out;
+
+ /* Initialize per CPUs boost group support */
+ schedtune_boostgroup_init(st, idx);
+
+ return &st->css;
+
+out:
+ return ERR_PTR(-ENOMEM);
+}
+
+static void
+schedtune_boostgroup_release(struct schedtune *st)
+{
+ struct boost_groups *bg;
+ int cpu;
+
+ /* Reset per CPUs boost group support */
+ for_each_possible_cpu(cpu) {
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ bg->group[st->idx].valid = false;
+ bg->group[st->idx].boost = 0;
+ }
+
+ /* Keep track of allocated boost groups */
+ allocated_group[st->idx] = NULL;
+}
+
+static void
+schedtune_css_free(struct cgroup_subsys_state *css)
+{
+ struct schedtune *st = css_st(css);
+
+ /* Release per CPUs boost group support */
+ schedtune_boostgroup_release(st);
+ kfree(st);
+}
+
+struct cgroup_subsys schedtune_cgrp_subsys = {
+ .css_alloc = schedtune_css_alloc,
+ .css_free = schedtune_css_free,
+ .can_attach = schedtune_can_attach,
+ .cancel_attach = schedtune_cancel_attach,
+ .legacy_cftypes = files,
+ .early_init = 1,
+};
+
+static inline void
+schedtune_init_cgroups(void)
+{
+ struct boost_groups *bg;
+ int cpu;
+
+ /* Initialize the per CPU boost groups */
+ for_each_possible_cpu(cpu) {
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ memset(bg, 0, sizeof(struct boost_groups));
+ bg->group[0].valid = true;
+ raw_spin_lock_init(&bg->lock);
+ }
+
+ pr_info("schedtune: configured to support %d boost groups\n",
+ BOOSTGROUPS_COUNT);
+
+ schedtune_initialized = true;
+}
+
+#else /* CONFIG_CGROUP_SCHEDTUNE */
+
+int
+schedtune_accept_deltas(int nrg_delta, int cap_delta,
+ struct task_struct *task)
+{
+ /* Optimal (O) region */
+ if (nrg_delta < 0 && cap_delta > 0) {
+ trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
+ return INT_MAX;
+ }
+
+ /* Suboptimal (S) region */
+ if (nrg_delta > 0 && cap_delta < 0) {
+ trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
+ return -INT_MAX;
+ }
+
+ return __schedtune_accept_deltas(nrg_delta, cap_delta,
+ perf_boost_idx, perf_constrain_idx);
+}
+
+#endif /* CONFIG_CGROUP_SCHEDTUNE */
+
+int
+sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ unsigned threshold_idx;
+ int boost_pct;
+
+ if (ret || !write)
+ return ret;
+
+ if (sysctl_sched_cfs_boost < -100 || sysctl_sched_cfs_boost > 100)
+ return -EINVAL;
+ boost_pct = sysctl_sched_cfs_boost;
+
+ /*
+ * Update threshold params for Performance Boost (B)
+ * and Performance Constraint (C) regions.
+ * The current implementatio uses the same cuts for both
+ * B and C regions.
+ */
+ threshold_idx = clamp(boost_pct, 0, 99) / 10;
+ perf_boost_idx = threshold_idx;
+ perf_constrain_idx = threshold_idx;
+
+ return 0;
+}
+
+#ifdef CONFIG_SCHED_DEBUG
+static void
+schedtune_test_nrg(unsigned long delta_pwr)
+{
+ unsigned long test_delta_pwr;
+ unsigned long test_norm_pwr;
+ int idx;
+
+ /*
+ * Check normalization constants using some constant system
+ * energy values
+ */
+ pr_info("schedtune: verify normalization constants...\n");
+ for (idx = 0; idx < 6; ++idx) {
+ test_delta_pwr = delta_pwr >> idx;
+
+ /* Normalize on max energy for target platform */
+ test_norm_pwr = reciprocal_divide(
+ test_delta_pwr << SCHED_CAPACITY_SHIFT,
+ schedtune_target_nrg.rdiv);
+
+ pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n",
+ idx, test_delta_pwr, test_norm_pwr);
+ }
+}
+#else
+#define schedtune_test_nrg(delta_pwr)
+#endif
+
+/*
+ * Compute the min/max power consumption of a cluster and all its CPUs
+ */
+static void
+schedtune_add_cluster_nrg(
+ struct sched_domain *sd,
+ struct sched_group *sg,
+ struct target_nrg *ste)
+{
+ struct sched_domain *sd2;
+ struct sched_group *sg2;
+
+ struct cpumask *cluster_cpus;
+ char str[32];
+
+ unsigned long min_pwr;
+ unsigned long max_pwr;
+ int cpu;
+
+ /* Get Cluster energy using EM data for the first CPU */
+ cluster_cpus = sched_group_cpus(sg);
+ snprintf(str, 32, "CLUSTER[%*pbl]",
+ cpumask_pr_args(cluster_cpus));
+
+ min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power;
+ max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power;
+ pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
+ str, min_pwr, max_pwr);
+
+ /*
+ * Keep track of this cluster's energy in the computation of the
+ * overall system energy
+ */
+ ste->min_power += min_pwr;
+ ste->max_power += max_pwr;
+
+ /* Get CPU energy using EM data for each CPU in the group */
+ for_each_cpu(cpu, cluster_cpus) {
+ /* Get a SD view for the specific CPU */
+ for_each_domain(cpu, sd2) {
+ /* Get the CPU group */
+ sg2 = sd2->groups;
+ min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power;
+ max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power;
+
+ ste->min_power += min_pwr;
+ ste->max_power += max_pwr;
+
+ snprintf(str, 32, "CPU[%d]", cpu);
+ pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
+ str, min_pwr, max_pwr);
+
+ /*
+ * Assume we have EM data only at the CPU and
+ * the upper CLUSTER level
+ */
+ BUG_ON(!cpumask_equal(
+ sched_group_cpus(sg),
+ sched_group_cpus(sd2->parent->groups)
+ ));
+ break;
+ }
+ }
+}
+
+/*
+ * Initialize the constants required to compute normalized energy.
+ * The values of these constants depends on the EM data for the specific
+ * target system and topology.
+ * Thus, this function is expected to be called by the code
+ * that bind the EM to the topology information.
+ */
+static int
+schedtune_init(void)
+{
+ struct target_nrg *ste = &schedtune_target_nrg;
+ unsigned long delta_pwr = 0;
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ pr_info("schedtune: init normalization constants...\n");
+ ste->max_power = 0;
+ ste->min_power = 0;
+
+ rcu_read_lock();
+
+ /*
+ * When EAS is in use, we always have a pointer to the highest SD
+ * which provides EM data.
+ */
+ sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask)));
+ if (!sd) {
+ pr_info("schedtune: no energy model data\n");
+ goto nodata;
+ }
+
+ sg = sd->groups;
+ do {
+ schedtune_add_cluster_nrg(sd, sg, ste);
+ } while (sg = sg->next, sg != sd->groups);
+
+ rcu_read_unlock();
+
+ pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
+ "SYSTEM", ste->min_power, ste->max_power);
+
+ /* Compute normalization constants */
+ delta_pwr = ste->max_power - ste->min_power;
+ ste->rdiv = reciprocal_value(delta_pwr);
+ pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n",
+ ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2);
+
+ schedtune_test_nrg(delta_pwr);
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ schedtune_init_cgroups();
+#else
+ pr_info("schedtune: configured to support global boosting only\n");
+#endif
+
+ schedtune_spc_rdiv = reciprocal_value(100);
+
+ return 0;
+
+nodata:
+ pr_warning("schedtune: disabled!\n");
+ rcu_read_unlock();
+ return -EINVAL;
+}
+postcore_initcall(schedtune_init);
diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h
new file mode 100644
index 000000000000..4f6441771e4c
--- /dev/null
+++ b/kernel/sched/tune.h
@@ -0,0 +1,55 @@
+
+#ifdef CONFIG_SCHED_TUNE
+
+#include <linux/reciprocal_div.h>
+
+/*
+ * System energy normalization constants
+ */
+struct target_nrg {
+ unsigned long min_power;
+ unsigned long max_power;
+ struct reciprocal_value rdiv;
+};
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+
+int schedtune_cpu_boost(int cpu);
+int schedtune_task_boost(struct task_struct *tsk);
+
+int schedtune_prefer_idle(struct task_struct *tsk);
+
+void schedtune_exit_task(struct task_struct *tsk);
+
+void schedtune_enqueue_task(struct task_struct *p, int cpu);
+void schedtune_dequeue_task(struct task_struct *p, int cpu);
+
+#else /* CONFIG_CGROUP_SCHEDTUNE */
+
+#define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost()
+#define schedtune_task_boost(tsk) get_sysctl_sched_cfs_boost()
+
+#define schedtune_exit_task(task) do { } while (0)
+
+#define schedtune_enqueue_task(task, cpu) do { } while (0)
+#define schedtune_dequeue_task(task, cpu) do { } while (0)
+
+#endif /* CONFIG_CGROUP_SCHEDTUNE */
+
+int schedtune_normalize_energy(int energy);
+int schedtune_accept_deltas(int nrg_delta, int cap_delta,
+ struct task_struct *task);
+
+#else /* CONFIG_SCHED_TUNE */
+
+#define schedtune_cpu_boost(cpu) 0
+#define schedtune_task_boost(tsk) 0
+
+#define schedtune_exit_task(task) do { } while (0)
+
+#define schedtune_enqueue_task(task, cpu) do { } while (0)
+#define schedtune_dequeue_task(task, cpu) do { } while (0)
+
+#define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta
+
+#endif /* CONFIG_SCHED_TUNE */
diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
new file mode 100644
index 000000000000..0162ddedcd68
--- /dev/null
+++ b/kernel/sched/walt.c
@@ -0,0 +1,903 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ *
+ * Window Assisted Load Tracking (WALT) implementation credits:
+ * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park,
+ * Pavan Kumar Kondeti, Olav Haugan
+ *
+ * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla
+ * and Todd Kjos
+ */
+
+#include <linux/acpi.h>
+#include <linux/syscore_ops.h>
+#include <trace/events/sched.h>
+#include "sched.h"
+#include "walt.h"
+
+#define WINDOW_STATS_RECENT 0
+#define WINDOW_STATS_MAX 1
+#define WINDOW_STATS_MAX_RECENT_AVG 2
+#define WINDOW_STATS_AVG 3
+#define WINDOW_STATS_INVALID_POLICY 4
+
+#define EXITING_TASK_MARKER 0xdeaddead
+
+static __read_mostly unsigned int walt_ravg_hist_size = 5;
+static __read_mostly unsigned int walt_window_stats_policy =
+ WINDOW_STATS_MAX_RECENT_AVG;
+static __read_mostly unsigned int walt_account_wait_time = 1;
+static __read_mostly unsigned int walt_freq_account_wait_time = 0;
+static __read_mostly unsigned int walt_io_is_busy = 0;
+
+unsigned int sysctl_sched_walt_init_task_load_pct = 15;
+
+/* true -> use PELT based load stats, false -> use window-based load stats */
+bool __read_mostly walt_disabled = false;
+
+/*
+ * Window size (in ns). Adjust for the tick size so that the window
+ * rollover occurs just before the tick boundary.
+ */
+__read_mostly unsigned int walt_ravg_window =
+ (20000000 / TICK_NSEC) * TICK_NSEC;
+#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC)
+#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC)
+
+static unsigned int sync_cpu;
+static ktime_t ktime_last;
+static __read_mostly bool walt_ktime_suspended;
+
+static unsigned int task_load(struct task_struct *p)
+{
+ return p->ravg.demand;
+}
+
+static inline void fixup_cum_window_demand(struct rq *rq, s64 delta)
+{
+ rq->cum_window_demand += delta;
+ if (unlikely((s64)rq->cum_window_demand < 0))
+ rq->cum_window_demand = 0;
+}
+
+void
+walt_inc_cumulative_runnable_avg(struct rq *rq,
+ struct task_struct *p)
+{
+ rq->cumulative_runnable_avg += p->ravg.demand;
+
+ /*
+ * Add a task's contribution to the cumulative window demand when
+ *
+ * (1) task is enqueued with on_rq = 1 i.e migration,
+ * prio/cgroup/class change.
+ * (2) task is waking for the first time in this window.
+ */
+ if (p->on_rq || (p->last_sleep_ts < rq->window_start))
+ fixup_cum_window_demand(rq, p->ravg.demand);
+}
+
+void
+walt_dec_cumulative_runnable_avg(struct rq *rq,
+ struct task_struct *p)
+{
+ rq->cumulative_runnable_avg -= p->ravg.demand;
+ BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+
+ /*
+ * on_rq will be 1 for sleeping tasks. So check if the task
+ * is migrating or dequeuing in RUNNING state to change the
+ * prio/cgroup/class.
+ */
+ if (task_on_rq_migrating(p) || p->state == TASK_RUNNING)
+ fixup_cum_window_demand(rq, -(s64)p->ravg.demand);
+}
+
+static void
+fixup_cumulative_runnable_avg(struct rq *rq,
+ struct task_struct *p, u64 new_task_load)
+{
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+
+ rq->cumulative_runnable_avg += task_load_delta;
+ if ((s64)rq->cumulative_runnable_avg < 0)
+ panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
+ task_load_delta, task_load(p));
+
+ fixup_cum_window_demand(rq, task_load_delta);
+}
+
+u64 walt_ktime_clock(void)
+{
+ if (unlikely(walt_ktime_suspended))
+ return ktime_to_ns(ktime_last);
+ return ktime_get_ns();
+}
+
+static void walt_resume(void)
+{
+ walt_ktime_suspended = false;
+}
+
+static int walt_suspend(void)
+{
+ ktime_last = ktime_get();
+ walt_ktime_suspended = true;
+ return 0;
+}
+
+static struct syscore_ops walt_syscore_ops = {
+ .resume = walt_resume,
+ .suspend = walt_suspend
+};
+
+static int __init walt_init_ops(void)
+{
+ register_syscore_ops(&walt_syscore_ops);
+ return 0;
+}
+late_initcall(walt_init_ops);
+
+#ifdef CONFIG_CFS_BANDWIDTH
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
+{
+ cfs_rq->cumulative_runnable_avg += p->ravg.demand;
+}
+
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
+{
+ cfs_rq->cumulative_runnable_avg -= p->ravg.demand;
+}
+#endif
+
+static int exiting_task(struct task_struct *p)
+{
+ if (p->flags & PF_EXITING) {
+ if (p->ravg.sum_history[0] != EXITING_TASK_MARKER) {
+ p->ravg.sum_history[0] = EXITING_TASK_MARKER;
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static int __init set_walt_ravg_window(char *str)
+{
+ unsigned int adj_window;
+ bool no_walt = walt_disabled;
+
+ get_option(&str, &walt_ravg_window);
+
+ /* Adjust for CONFIG_HZ */
+ adj_window = (walt_ravg_window / TICK_NSEC) * TICK_NSEC;
+
+ /* Warn if we're a bit too far away from the expected window size */
+ WARN(adj_window < walt_ravg_window - NSEC_PER_MSEC,
+ "tick-adjusted window size %u, original was %u\n", adj_window,
+ walt_ravg_window);
+
+ walt_ravg_window = adj_window;
+
+ walt_disabled = walt_disabled ||
+ (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
+ walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
+
+ WARN(!no_walt && walt_disabled,
+ "invalid window size, disabling WALT\n");
+
+ return 0;
+}
+
+early_param("walt_ravg_window", set_walt_ravg_window);
+
+static void
+update_window_start(struct rq *rq, u64 wallclock)
+{
+ s64 delta;
+ int nr_windows;
+
+ delta = wallclock - rq->window_start;
+ /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */
+ if (delta < 0) {
+ delta = 0;
+ WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n");
+ }
+
+ if (delta < walt_ravg_window)
+ return;
+
+ nr_windows = div64_u64(delta, walt_ravg_window);
+ rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
+
+ rq->cum_window_demand = rq->cumulative_runnable_avg;
+}
+
+/*
+ * Translate absolute delta time accounted on a CPU
+ * to a scale where 1024 is the capacity of the most
+ * capable CPU running at FMAX
+ */
+static u64 scale_exec_time(u64 delta, struct rq *rq)
+{
+ unsigned long capcurr = capacity_curr_of(cpu_of(rq));
+
+ return (delta * capcurr) >> SCHED_CAPACITY_SHIFT;
+}
+
+static int cpu_is_waiting_on_io(struct rq *rq)
+{
+ if (!walt_io_is_busy)
+ return 0;
+
+ return atomic_read(&rq->nr_iowait);
+}
+
+void walt_account_irqtime(int cpu, struct task_struct *curr,
+ u64 delta, u64 wallclock)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags, nr_windows;
+ u64 cur_jiffies_ts;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ /*
+ * cputime (wallclock) uses sched_clock so use the same here for
+ * consistency.
+ */
+ delta += sched_clock() - wallclock;
+ cur_jiffies_ts = get_jiffies_64();
+
+ if (is_idle_task(curr))
+ walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(),
+ delta);
+
+ nr_windows = cur_jiffies_ts - rq->irqload_ts;
+
+ if (nr_windows) {
+ if (nr_windows < 10) {
+ /* Decay CPU's irqload by 3/4 for each window. */
+ rq->avg_irqload *= (3 * nr_windows);
+ rq->avg_irqload = div64_u64(rq->avg_irqload,
+ 4 * nr_windows);
+ } else {
+ rq->avg_irqload = 0;
+ }
+ rq->avg_irqload += rq->cur_irqload;
+ rq->cur_irqload = 0;
+ }
+
+ rq->cur_irqload += delta;
+ rq->irqload_ts = cur_jiffies_ts;
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+
+#define WALT_HIGH_IRQ_TIMEOUT 3
+
+u64 walt_irqload(int cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ s64 delta;
+ delta = get_jiffies_64() - rq->irqload_ts;
+
+ /*
+ * Current context can be preempted by irq and rq->irqload_ts can be
+ * updated by irq context so that delta can be negative.
+ * But this is okay and we can safely return as this means there
+ * was recent irq occurrence.
+ */
+
+ if (delta < WALT_HIGH_IRQ_TIMEOUT)
+ return rq->avg_irqload;
+ else
+ return 0;
+}
+
+int walt_cpu_high_irqload(int cpu) {
+ return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload;
+}
+
+static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
+ u64 irqtime, int event)
+{
+ if (is_idle_task(p)) {
+ /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
+ if (event == PICK_NEXT_TASK)
+ return 0;
+
+ /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
+ return irqtime || cpu_is_waiting_on_io(rq);
+ }
+
+ if (event == TASK_WAKE)
+ return 0;
+
+ if (event == PUT_PREV_TASK || event == IRQ_UPDATE ||
+ event == TASK_UPDATE)
+ return 1;
+
+ /* Only TASK_MIGRATE && PICK_NEXT_TASK left */
+ return walt_freq_account_wait_time;
+}
+
+/*
+ * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
+ */
+static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime)
+{
+ int new_window, nr_full_windows = 0;
+ int p_is_curr_task = (p == rq->curr);
+ u64 mark_start = p->ravg.mark_start;
+ u64 window_start = rq->window_start;
+ u32 window_size = walt_ravg_window;
+ u64 delta;
+
+ new_window = mark_start < window_start;
+ if (new_window) {
+ nr_full_windows = div64_u64((window_start - mark_start),
+ window_size);
+ if (p->ravg.active_windows < USHRT_MAX)
+ p->ravg.active_windows++;
+ }
+
+ /* Handle per-task window rollover. We don't care about the idle
+ * task or exiting tasks. */
+ if (new_window && !is_idle_task(p) && !exiting_task(p)) {
+ u32 curr_window = 0;
+
+ if (!nr_full_windows)
+ curr_window = p->ravg.curr_window;
+
+ p->ravg.prev_window = curr_window;
+ p->ravg.curr_window = 0;
+ }
+
+ if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
+ /* account_busy_for_cpu_time() = 0, so no update to the
+ * task's current window needs to be made. This could be
+ * for example
+ *
+ * - a wakeup event on a task within the current
+ * window (!new_window below, no action required),
+ * - switching to a new task from idle (PICK_NEXT_TASK)
+ * in a new window where irqtime is 0 and we aren't
+ * waiting on IO */
+
+ if (!new_window)
+ return;
+
+ /* A new window has started. The RQ demand must be rolled
+ * over if p is the current task. */
+ if (p_is_curr_task) {
+ u64 prev_sum = 0;
+
+ /* p is either idle task or an exiting task */
+ if (!nr_full_windows) {
+ prev_sum = rq->curr_runnable_sum;
+ }
+
+ rq->prev_runnable_sum = prev_sum;
+ rq->curr_runnable_sum = 0;
+ }
+
+ return;
+ }
+
+ if (!new_window) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. No rollover
+ * since we didn't start a new window. An example of this is
+ * when a task starts execution and then sleeps within the
+ * same window. */
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
+ delta = wallclock - mark_start;
+ else
+ delta = irqtime;
+ delta = scale_exec_time(delta, rq);
+ rq->curr_runnable_sum += delta;
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.curr_window += delta;
+
+ return;
+ }
+
+ if (!p_is_curr_task) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has also started, but p is not the current task, so the
+ * window is not rolled over - just split up and account
+ * as necessary into curr and prev. The window is only
+ * rolled over when a new window is processed for the current
+ * task.
+ *
+ * Irqtime can't be accounted by a task that isn't the
+ * currently running task. */
+
+ if (!nr_full_windows) {
+ /* A full window hasn't elapsed, account partial
+ * contribution to previous completed window. */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!exiting_task(p))
+ p->ravg.prev_window += delta;
+ } else {
+ /* Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size). */
+ delta = scale_exec_time(window_size, rq);
+ if (!exiting_task(p))
+ p->ravg.prev_window = delta;
+ }
+ rq->prev_runnable_sum += delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ rq->curr_runnable_sum += delta;
+ if (!exiting_task(p))
+ p->ravg.curr_window = delta;
+
+ return;
+ }
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. If any of these three above conditions are true
+ * then this busy time can't be accounted as irqtime.
+ *
+ * Busy time for the idle task or exiting tasks need not
+ * be accounted.
+ *
+ * An example of this would be a task that starts execution
+ * and then sleeps once a new window has begun. */
+
+ if (!nr_full_windows) {
+ /* A full window hasn't elapsed, account partial
+ * contribution to previous completed window. */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.prev_window += delta;
+
+ delta += rq->curr_runnable_sum;
+ } else {
+ /* Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size). */
+ delta = scale_exec_time(window_size, rq);
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.prev_window = delta;
+
+ }
+ /*
+ * Rollover for normal runnable sum is done here by overwriting
+ * the values in prev_runnable_sum and curr_runnable_sum.
+ * Rollover for new task runnable sum has completed by previous
+ * if-else statement.
+ */
+ rq->prev_runnable_sum = delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ rq->curr_runnable_sum = delta;
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.curr_window = delta;
+
+ return;
+ }
+
+ if (irqtime) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. The current task must be the idle task because
+ * irqtime is not accounted for any other task.
+ *
+ * Irqtime will be accounted each time we process IRQ activity
+ * after a period of idleness, so we know the IRQ busy time
+ * started at wallclock - irqtime. */
+
+ BUG_ON(!is_idle_task(p));
+ mark_start = wallclock - irqtime;
+
+ /* Roll window over. If IRQ busy time was just in the current
+ * window then that is all that need be accounted. */
+ rq->prev_runnable_sum = rq->curr_runnable_sum;
+ if (mark_start > window_start) {
+ rq->curr_runnable_sum = scale_exec_time(irqtime, rq);
+ return;
+ }
+
+ /* The IRQ busy time spanned multiple windows. Process the
+ * busy time preceding the current window start first. */
+ delta = window_start - mark_start;
+ if (delta > window_size)
+ delta = window_size;
+ delta = scale_exec_time(delta, rq);
+ rq->prev_runnable_sum += delta;
+
+ /* Process the remaining IRQ busy time in the current window. */
+ delta = wallclock - window_start;
+ rq->curr_runnable_sum = scale_exec_time(delta, rq);
+
+ return;
+ }
+
+ BUG();
+}
+
+static int account_busy_for_task_demand(struct task_struct *p, int event)
+{
+ /* No need to bother updating task demand for exiting tasks
+ * or the idle task. */
+ if (exiting_task(p) || is_idle_task(p))
+ return 0;
+
+ /* When a task is waking up it is completing a segment of non-busy
+ * time. Likewise, if wait time is not treated as busy time, then
+ * when a task begins to run or is migrated, it is not running and
+ * is completing a segment of non-busy time. */
+ if (event == TASK_WAKE || (!walt_account_wait_time &&
+ (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Called when new window is starting for a task, to record cpu usage over
+ * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
+ * when, say, a real-time task runs without preemption for several windows at a
+ * stretch.
+ */
+static void update_history(struct rq *rq, struct task_struct *p,
+ u32 runtime, int samples, int event)
+{
+ u32 *hist = &p->ravg.sum_history[0];
+ int ridx, widx;
+ u32 max = 0, avg, demand;
+ u64 sum = 0;
+
+ /* Ignore windows where task had no activity */
+ if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
+ goto done;
+
+ /* Push new 'runtime' value onto stack */
+ widx = walt_ravg_hist_size - 1;
+ ridx = widx - samples;
+ for (; ridx >= 0; --widx, --ridx) {
+ hist[widx] = hist[ridx];
+ sum += hist[widx];
+ if (hist[widx] > max)
+ max = hist[widx];
+ }
+
+ for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) {
+ hist[widx] = runtime;
+ sum += hist[widx];
+ if (hist[widx] > max)
+ max = hist[widx];
+ }
+
+ p->ravg.sum = 0;
+
+ if (walt_window_stats_policy == WINDOW_STATS_RECENT) {
+ demand = runtime;
+ } else if (walt_window_stats_policy == WINDOW_STATS_MAX) {
+ demand = max;
+ } else {
+ avg = div64_u64(sum, walt_ravg_hist_size);
+ if (walt_window_stats_policy == WINDOW_STATS_AVG)
+ demand = avg;
+ else
+ demand = max(avg, runtime);
+ }
+
+ /*
+ * A throttled deadline sched class task gets dequeued without
+ * changing p->on_rq. Since the dequeue decrements hmp stats
+ * avoid decrementing it here again.
+ *
+ * When window is rolled over, the cumulative window demand
+ * is reset to the cumulative runnable average (contribution from
+ * the tasks on the runqueue). If the current task is dequeued
+ * already, it's demand is not included in the cumulative runnable
+ * average. So add the task demand separately to cumulative window
+ * demand.
+ */
+ if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {
+ if (task_on_rq_queued(p))
+ fixup_cumulative_runnable_avg(rq, p, demand);
+ else if (rq->curr == p)
+ fixup_cum_window_demand(rq, demand);
+ }
+
+ p->ravg.demand = demand;
+
+done:
+ trace_walt_update_history(rq, p, runtime, samples, event);
+ return;
+}
+
+static void add_to_task_demand(struct rq *rq, struct task_struct *p,
+ u64 delta)
+{
+ delta = scale_exec_time(delta, rq);
+ p->ravg.sum += delta;
+ if (unlikely(p->ravg.sum > walt_ravg_window))
+ p->ravg.sum = walt_ravg_window;
+}
+
+/*
+ * Account cpu demand of task and/or update task's cpu demand history
+ *
+ * ms = p->ravg.mark_start;
+ * wc = wallclock
+ * ws = rq->window_start
+ *
+ * Three possibilities:
+ *
+ * a) Task event is contained within one window.
+ * window_start < mark_start < wallclock
+ *
+ * ws ms wc
+ * | | |
+ * V V V
+ * |---------------|
+ *
+ * In this case, p->ravg.sum is updated *iff* event is appropriate
+ * (ex: event == PUT_PREV_TASK)
+ *
+ * b) Task event spans two windows.
+ * mark_start < window_start < wallclock
+ *
+ * ms ws wc
+ * | | |
+ * V V V
+ * -----|-------------------
+ *
+ * In this case, p->ravg.sum is updated with (ws - ms) *iff* event
+ * is appropriate, then a new window sample is recorded followed
+ * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
+ *
+ * c) Task event spans more than two windows.
+ *
+ * ms ws_tmp ws wc
+ * | | | |
+ * V V V V
+ * ---|-------|-------|-------|-------|------
+ * | |
+ * |<------ nr_full_windows ------>|
+ *
+ * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
+ * event is appropriate, window sample of p->ravg.sum is recorded,
+ * 'nr_full_window' samples of window_size is also recorded *iff*
+ * event is appropriate and finally p->ravg.sum is set to (wc - ws)
+ * *iff* event is appropriate.
+ *
+ * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
+ * depends on it!
+ */
+static void update_task_demand(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock)
+{
+ u64 mark_start = p->ravg.mark_start;
+ u64 delta, window_start = rq->window_start;
+ int new_window, nr_full_windows;
+ u32 window_size = walt_ravg_window;
+
+ new_window = mark_start < window_start;
+ if (!account_busy_for_task_demand(p, event)) {
+ if (new_window)
+ /* If the time accounted isn't being accounted as
+ * busy time, and a new window started, only the
+ * previous window need be closed out with the
+ * pre-existing demand. Multiple windows may have
+ * elapsed, but since empty windows are dropped,
+ * it is not necessary to account those. */
+ update_history(rq, p, p->ravg.sum, 1, event);
+ return;
+ }
+
+ if (!new_window) {
+ /* The simple case - busy time contained within the existing
+ * window. */
+ add_to_task_demand(rq, p, wallclock - mark_start);
+ return;
+ }
+
+ /* Busy time spans at least two windows. Temporarily rewind
+ * window_start to first window boundary after mark_start. */
+ delta = window_start - mark_start;
+ nr_full_windows = div64_u64(delta, window_size);
+ window_start -= (u64)nr_full_windows * (u64)window_size;
+
+ /* Process (window_start - mark_start) first */
+ add_to_task_demand(rq, p, window_start - mark_start);
+
+ /* Push new sample(s) into task's demand history */
+ update_history(rq, p, p->ravg.sum, 1, event);
+ if (nr_full_windows)
+ update_history(rq, p, scale_exec_time(window_size, rq),
+ nr_full_windows, event);
+
+ /* Roll window_start back to current to process any remainder
+ * in current window. */
+ window_start += (u64)nr_full_windows * (u64)window_size;
+
+ /* Process (wallclock - window_start) next */
+ mark_start = window_start;
+ add_to_task_demand(rq, p, wallclock - mark_start);
+}
+
+/* Reflect task activity on its demand and cpu's busy time statistics */
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime)
+{
+ if (walt_disabled || !rq->window_start)
+ return;
+
+ lockdep_assert_held(&rq->lock);
+
+ update_window_start(rq, wallclock);
+
+ if (!p->ravg.mark_start)
+ goto done;
+
+ update_task_demand(p, rq, event, wallclock);
+ update_cpu_busy_time(p, rq, event, wallclock, irqtime);
+
+done:
+ trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime);
+
+ p->ravg.mark_start = wallclock;
+}
+
+static void reset_task_stats(struct task_struct *p)
+{
+ u32 sum = 0;
+
+ if (exiting_task(p))
+ sum = EXITING_TASK_MARKER;
+
+ memset(&p->ravg, 0, sizeof(struct ravg));
+ /* Retain EXITING_TASK marker */
+ p->ravg.sum_history[0] = sum;
+}
+
+void walt_mark_task_starting(struct task_struct *p)
+{
+ u64 wallclock;
+ struct rq *rq = task_rq(p);
+
+ if (!rq->window_start) {
+ reset_task_stats(p);
+ return;
+ }
+
+ wallclock = walt_ktime_clock();
+ p->ravg.mark_start = wallclock;
+}
+
+void walt_set_window_start(struct rq *rq)
+{
+ int cpu = cpu_of(rq);
+ struct rq *sync_rq = cpu_rq(sync_cpu);
+
+ if (likely(rq->window_start))
+ return;
+
+ if (cpu == sync_cpu) {
+ rq->window_start = 1;
+ } else {
+ raw_spin_unlock(&rq->lock);
+ double_rq_lock(rq, sync_rq);
+ rq->window_start = cpu_rq(sync_cpu)->window_start;
+ rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+ raw_spin_unlock(&sync_rq->lock);
+ }
+
+ rq->curr->ravg.mark_start = rq->window_start;
+}
+
+void walt_migrate_sync_cpu(int cpu)
+{
+ if (cpu == sync_cpu)
+ sync_cpu = smp_processor_id();
+}
+
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu)
+{
+ struct rq *src_rq = task_rq(p);
+ struct rq *dest_rq = cpu_rq(new_cpu);
+ u64 wallclock;
+
+ if (!p->on_rq && p->state != TASK_WAKING)
+ return;
+
+ if (exiting_task(p)) {
+ return;
+ }
+
+ if (p->state == TASK_WAKING)
+ double_rq_lock(src_rq, dest_rq);
+
+ wallclock = walt_ktime_clock();
+
+ walt_update_task_ravg(task_rq(p)->curr, task_rq(p),
+ TASK_UPDATE, wallclock, 0);
+ walt_update_task_ravg(dest_rq->curr, dest_rq,
+ TASK_UPDATE, wallclock, 0);
+
+ walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);
+
+ /*
+ * When a task is migrating during the wakeup, adjust
+ * the task's contribution towards cumulative window
+ * demand.
+ */
+ if (p->state == TASK_WAKING &&
+ p->last_sleep_ts >= src_rq->window_start) {
+ fixup_cum_window_demand(src_rq, -(s64)p->ravg.demand);
+ fixup_cum_window_demand(dest_rq, p->ravg.demand);
+ }
+
+ if (p->ravg.curr_window) {
+ src_rq->curr_runnable_sum -= p->ravg.curr_window;
+ dest_rq->curr_runnable_sum += p->ravg.curr_window;
+ }
+
+ if (p->ravg.prev_window) {
+ src_rq->prev_runnable_sum -= p->ravg.prev_window;
+ dest_rq->prev_runnable_sum += p->ravg.prev_window;
+ }
+
+ if ((s64)src_rq->prev_runnable_sum < 0) {
+ src_rq->prev_runnable_sum = 0;
+ WARN_ON(1);
+ }
+ if ((s64)src_rq->curr_runnable_sum < 0) {
+ src_rq->curr_runnable_sum = 0;
+ WARN_ON(1);
+ }
+
+ trace_walt_migration_update_sum(src_rq, p);
+ trace_walt_migration_update_sum(dest_rq, p);
+
+ if (p->state == TASK_WAKING)
+ double_rq_unlock(src_rq, dest_rq);
+}
+
+void walt_init_new_task_load(struct task_struct *p)
+{
+ int i;
+ u32 init_load_windows =
+ div64_u64((u64)sysctl_sched_walt_init_task_load_pct *
+ (u64)walt_ravg_window, 100);
+ u32 init_load_pct = current->init_load_pct;
+
+ p->init_load_pct = 0;
+ memset(&p->ravg, 0, sizeof(struct ravg));
+
+ if (init_load_pct) {
+ init_load_windows = div64_u64((u64)init_load_pct *
+ (u64)walt_ravg_window, 100);
+ }
+
+ p->ravg.demand = init_load_windows;
+ for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
+ p->ravg.sum_history[i] = init_load_windows;
+}
diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h
new file mode 100644
index 000000000000..bd20ed87f5e0
--- /dev/null
+++ b/kernel/sched/walt.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __WALT_H
+#define __WALT_H
+
+#ifdef CONFIG_SCHED_WALT
+
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+ u64 wallclock, u64 irqtime);
+void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu);
+void walt_init_new_task_load(struct task_struct *p);
+void walt_mark_task_starting(struct task_struct *p);
+void walt_set_window_start(struct rq *rq);
+void walt_migrate_sync_cpu(int cpu);
+void walt_init_cpu_efficiency(void);
+u64 walt_ktime_clock(void);
+void walt_account_irqtime(int cpu, struct task_struct *curr, u64 delta,
+ u64 wallclock);
+
+u64 walt_irqload(int cpu);
+int walt_cpu_high_irqload(int cpu);
+
+#else /* CONFIG_SCHED_WALT */
+
+static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime) { }
+static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_fixup_busy_time(struct task_struct *p, int new_cpu) { }
+static inline void walt_init_new_task_load(struct task_struct *p) { }
+static inline void walt_mark_task_starting(struct task_struct *p) { }
+static inline void walt_set_window_start(struct rq *rq) { }
+static inline void walt_migrate_sync_cpu(int cpu) { }
+static inline void walt_init_cpu_efficiency(void) { }
+static inline u64 walt_ktime_clock(void) { return 0; }
+
+#define walt_cpu_high_irqload(cpu) false
+
+#endif /* CONFIG_SCHED_WALT */
+
+#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SCHED_WALT)
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p);
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p);
+#else
+static inline void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p) { }
+static inline void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p) { }
+#endif
+
+extern bool walt_disabled;
+
+#endif
diff --git a/kernel/sys.c b/kernel/sys.c
index 6c4e9b533258..23559571068f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -41,6 +41,8 @@
#include <linux/syscore_ops.h>
#include <linux/version.h>
#include <linux/ctype.h>
+#include <linux/mm.h>
+#include <linux/mempolicy.h>
#include <linux/compat.h>
#include <linux/syscalls.h>
@@ -2070,6 +2072,153 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
}
#endif
+#ifdef CONFIG_MMU
+static int prctl_update_vma_anon_name(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end,
+ const char __user *name_addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int error = 0;
+ pgoff_t pgoff;
+
+ if (name_addr == vma_get_anon_name(vma)) {
+ *prev = vma;
+ goto out;
+ }
+
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ *prev = vma_merge(mm, *prev, start, end, vma->vm_flags, vma->anon_vma,
+ vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx, name_addr);
+ if (*prev) {
+ vma = *prev;
+ goto success;
+ }
+
+ *prev = vma;
+
+ if (start != vma->vm_start) {
+ error = split_vma(mm, vma, start, 1);
+ if (error)
+ goto out;
+ }
+
+ if (end != vma->vm_end) {
+ error = split_vma(mm, vma, end, 0);
+ if (error)
+ goto out;
+ }
+
+success:
+ if (!vma->vm_file)
+ vma->anon_name = name_addr;
+
+out:
+ if (error == -ENOMEM)
+ error = -EAGAIN;
+ return error;
+}
+
+static int prctl_set_vma_anon_name(unsigned long start, unsigned long end,
+ unsigned long arg)
+{
+ unsigned long tmp;
+ struct vm_area_struct *vma, *prev;
+ int unmapped_error = 0;
+ int error = -EINVAL;
+
+ /*
+ * If the interval [start,end) covers some unmapped address
+ * ranges, just ignore them, but return -ENOMEM at the end.
+ * - this matches the handling in madvise.
+ */
+ vma = find_vma_prev(current->mm, start, &prev);
+ if (vma && start > vma->vm_start)
+ prev = vma;
+
+ for (;;) {
+ /* Still start < end. */
+ error = -ENOMEM;
+ if (!vma)
+ return error;
+
+ /* Here start < (end|vma->vm_end). */
+ if (start < vma->vm_start) {
+ unmapped_error = -ENOMEM;
+ start = vma->vm_start;
+ if (start >= end)
+ return error;
+ }
+
+ /* Here vma->vm_start <= start < (end|vma->vm_end) */
+ tmp = vma->vm_end;
+ if (end < tmp)
+ tmp = end;
+
+ /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+ error = prctl_update_vma_anon_name(vma, &prev, start, tmp,
+ (const char __user *)arg);
+ if (error)
+ return error;
+ start = tmp;
+ if (prev && start < prev->vm_end)
+ start = prev->vm_end;
+ error = unmapped_error;
+ if (start >= end)
+ return error;
+ if (prev)
+ vma = prev->vm_next;
+ else /* madvise_remove dropped mmap_sem */
+ vma = find_vma(current->mm, start);
+ }
+}
+
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+ unsigned long len_in, unsigned long arg)
+{
+ struct mm_struct *mm = current->mm;
+ int error;
+ unsigned long len;
+ unsigned long end;
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+ /* Check to see whether len was rounded up from small -ve to zero */
+ if (len_in && !len)
+ return -EINVAL;
+
+ end = start + len;
+ if (end < start)
+ return -EINVAL;
+
+ if (end == start)
+ return 0;
+
+ down_write(&mm->mmap_sem);
+
+ switch (opt) {
+ case PR_SET_VMA_ANON_NAME:
+ error = prctl_set_vma_anon_name(start, end, arg);
+ break;
+ default:
+ error = -EINVAL;
+ }
+
+ up_write(&mm->mmap_sem);
+
+ return error;
+}
+#else /* CONFIG_MMU */
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+ unsigned long len_in, unsigned long arg)
+{
+ return -EINVAL;
+}
+#endif
+
int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which)
{
return -EINVAL;
@@ -2289,6 +2438,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
break;
+ case PR_SET_VMA:
+ error = prctl_set_vma(arg2, arg3, arg4, arg5);
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 23f658d311c0..a57c0a9c36f1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -105,6 +105,7 @@ extern char core_pattern[];
extern unsigned int core_pipe_limit;
#endif
extern int pid_max;
+extern int extra_free_kbytes;
extern int pid_max_min, pid_max_max;
extern int percpu_pagelist_fraction;
extern int latencytop_enabled;
@@ -308,6 +309,50 @@ static struct ctl_table kern_table[] = {
.extra2 = &max_sched_granularity_ns,
},
{
+ .procname = "sched_sync_hint_enable",
+ .data = &sysctl_sched_sync_hint_enable,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_SCHED_WALT
+ {
+ .procname = "sched_use_walt_cpu_util",
+ .data = &sysctl_sched_use_walt_cpu_util,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_use_walt_task_util",
+ .data = &sysctl_sched_use_walt_task_util,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_walt_init_task_load_pct",
+ .data = &sysctl_sched_walt_init_task_load_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_walt_cpu_high_irqload",
+ .data = &sysctl_sched_walt_cpu_high_irqload,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ {
+ .procname = "sched_cstate_aware",
+ .data = &sysctl_sched_cstate_aware,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "sched_wakeup_granularity_ns",
.data = &sysctl_sched_wakeup_granularity,
.maxlen = sizeof(unsigned int),
@@ -450,6 +495,21 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
},
#endif
+#ifdef CONFIG_SCHED_TUNE
+ {
+ .procname = "sched_cfs_boost",
+ .data = &sysctl_sched_cfs_boost,
+ .maxlen = sizeof(sysctl_sched_cfs_boost),
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ .mode = 0444,
+#else
+ .mode = 0644,
+#endif
+ .proc_handler = &sysctl_sched_cfs_boost_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
@@ -1447,6 +1507,14 @@ static struct ctl_table vm_table[] = {
.extra2 = &one_thousand,
},
{
+ .procname = "extra_free_kbytes",
+ .data = &extra_free_kbytes,
+ .maxlen = sizeof(extra_free_kbytes),
+ .mode = 0644,
+ .proc_handler = min_free_kbytes_sysctl_handler,
+ .extra1 = &zero,
+ },
+ {
.procname = "percpu_pagelist_fraction",
.data = &percpu_pagelist_fraction,
.maxlen = sizeof(percpu_pagelist_fraction),
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b6bebe28a3e0..65aafadad8e2 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1009,6 +1009,18 @@ ktime_t tick_nohz_get_sleep_length(void)
return ts->sleep_length;
}
+/**
+ * tick_nohz_get_idle_calls - return the current idle calls counter value
+ *
+ * Called from the schedutil frequency scaling governor in scheduler context.
+ */
+unsigned long tick_nohz_get_idle_calls(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ return ts->idle_calls;
+}
+
static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
{
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d831827d7ab0..427e33dbb98d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -70,6 +70,10 @@ static inline void tk_normalize_xtime(struct timekeeper *tk)
tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
tk->xtime_sec++;
}
+ while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) {
+ tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
+ tk->raw_sec++;
+ }
}
static inline struct timespec64 tk_xtime(struct timekeeper *tk)
@@ -283,12 +287,14 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
/* if changing clocks, convert xtime_nsec shift units */
if (old_clock) {
int shift_change = clock->shift - old_clock->shift;
- if (shift_change < 0)
+ if (shift_change < 0) {
tk->tkr_mono.xtime_nsec >>= -shift_change;
- else
+ tk->tkr_raw.xtime_nsec >>= -shift_change;
+ } else {
tk->tkr_mono.xtime_nsec <<= shift_change;
+ tk->tkr_raw.xtime_nsec <<= shift_change;
+ }
}
- tk->tkr_raw.xtime_nsec = 0;
tk->tkr_mono.shift = clock->shift;
tk->tkr_raw.shift = clock->shift;
@@ -442,6 +448,35 @@ u64 ktime_get_raw_fast_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
+/**
+ * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
+ *
+ * To keep it NMI safe since we're accessing from tracing, we're not using a
+ * separate timekeeper with updates to monotonic clock and boot offset
+ * protected with seqlocks. This has the following minor side effects:
+ *
+ * (1) Its possible that a timestamp be taken after the boot offset is updated
+ * but before the timekeeper is updated. If this happens, the new boot offset
+ * is added to the old timekeeping making the clock appear to update slightly
+ * earlier:
+ * CPU 0 CPU 1
+ * timekeeping_inject_sleeptime64()
+ * __timekeeping_inject_sleeptime(tk, delta);
+ * timestamp();
+ * timekeeping_update(tk, TK_CLEAR_NTP...);
+ *
+ * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
+ * partially updated. Since the tk->offs_boot update is a rare event, this
+ * should be a rare occurrence which postprocessing should be able to handle.
+ */
+u64 notrace ktime_get_boot_fast_ns(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+
+ return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot));
+}
+EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
+
/* Suspend-time cycles value for halted fast timekeeper. */
static cycle_t cycles_at_suspend;
@@ -590,9 +625,6 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
nsec = (u32) tk->wall_to_monotonic.tv_nsec;
tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
- /* Update the monotonic raw base */
- tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time);
-
/*
* The sum of the nanoseconds portions of xtime and
* wall_to_monotonic can be greater/equal one second. Take
@@ -602,6 +634,9 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
if (nsec >= NSEC_PER_SEC)
seconds++;
tk->ktime_sec = seconds;
+
+ /* Update the monotonic raw base */
+ tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}
/* must hold timekeeper_lock */
@@ -643,7 +678,6 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
static void timekeeping_forward_now(struct timekeeper *tk)
{
cycle_t cycle_now, delta;
- s64 nsec;
cycle_now = tk_clock_read(&tk->tkr_mono);
delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
@@ -655,10 +689,13 @@ static void timekeeping_forward_now(struct timekeeper *tk)
/* If arch requires, add in get_arch_timeoffset() */
tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift;
- tk_normalize_xtime(tk);
- nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift);
- timespec64_add_ns(&tk->raw_time, nsec);
+ tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult;
+
+ /* If arch requires, add in get_arch_timeoffset() */
+ tk->tkr_raw.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_raw.shift;
+
+ tk_normalize_xtime(tk);
}
/**
@@ -1382,19 +1419,18 @@ int timekeeping_notify(struct clocksource *clock)
void getrawmonotonic64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
- struct timespec64 ts64;
unsigned long seq;
s64 nsecs;
do {
seq = read_seqcount_begin(&tk_core.seq);
+ ts->tv_sec = tk->raw_sec;
nsecs = timekeeping_get_ns(&tk->tkr_raw);
- ts64 = tk->raw_time;
} while (read_seqcount_retry(&tk_core.seq, seq));
- timespec64_add_ns(&ts64, nsecs);
- *ts = ts64;
+ ts->tv_nsec = 0;
+ timespec64_add_ns(ts, nsecs);
}
EXPORT_SYMBOL(getrawmonotonic64);
@@ -1518,8 +1554,7 @@ void __init timekeeping_init(void)
tk_setup_internals(tk, clock);
tk_set_xtime(tk, &now);
- tk->raw_time.tv_sec = 0;
- tk->raw_time.tv_nsec = 0;
+ tk->raw_sec = 0;
if (boot.tv_sec == 0 && boot.tv_nsec == 0)
boot = tk_xtime(tk);
@@ -2037,15 +2072,12 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
*clock_set |= accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
- tk->tkr_raw.xtime_nsec += (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift;
tk->tkr_raw.xtime_nsec += tk->raw_interval << shift;
snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) {
tk->tkr_raw.xtime_nsec -= snsec_per_sec;
- tk->raw_time.tv_sec++;
+ tk->raw_sec++;
}
- tk->raw_time.tv_nsec = tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift;
- tk->tkr_raw.xtime_nsec -= (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift;
/* Accumulate error between NTP and clock interval */
tk->ntp_error += tk->ntp_tick << shift;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 2a96b063d659..5771ce7f08b3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -72,6 +72,9 @@ config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
bool
+config GPU_TRACEPOINTS
+ bool
+
config CONTEXT_SWITCH_TRACER
bool
@@ -157,6 +160,17 @@ config FUNCTION_GRAPH_TRACER
address on the current task structure into a stack of calls.
+config PREEMPTIRQ_EVENTS
+ bool "Enable trace events for preempt and irq disable/enable"
+ select TRACE_IRQFLAGS
+ depends on DEBUG_PREEMPT || !PROVE_LOCKING
+ default n
+ help
+ Enable tracing of disable and enable events for preemption and irqs.
+ For tracing preempt disable/enable events, DEBUG_PREEMPT must be
+ enabled. For tracing irq disable/enable events, PROVE_LOCKING must
+ be disabled.
+
config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index e57980845549..907c14446dbd 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_TRACING_MAP) += tracing_map.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
+obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
@@ -67,6 +68,7 @@ obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
endif
obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
+obj-$(CONFIG_GPU_TRACEPOINTS) += gpu-traces.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8f4227d4cd39..ae6bdf3b0646 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -120,8 +120,9 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs);
#else
/* See comment below, where ftrace_ops_list_func is defined */
-static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
-#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
+static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs);
+#define ftrace_ops_list_func ftrace_ops_no_ops
#endif
/*
@@ -4875,9 +4876,9 @@ static int ftrace_cmp_ips(const void *a, const void *b)
return 0;
}
-static int ftrace_process_locs(struct module *mod,
- unsigned long *start,
- unsigned long *end)
+static int __norecordmcount ftrace_process_locs(struct module *mod,
+ unsigned long *start,
+ unsigned long *end)
{
struct ftrace_page *start_pg;
struct ftrace_page *pg;
@@ -5310,7 +5311,8 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
__ftrace_ops_list_func(ip, parent_ip, NULL, regs);
}
#else
-static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
+static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs)
{
__ftrace_ops_list_func(ip, parent_ip, NULL, NULL);
}
@@ -5736,14 +5738,17 @@ void ftrace_graph_graph_time_control(bool enable)
fgraph_graph_time = enable;
}
+void ftrace_graph_return_stub(struct ftrace_graph_ret *trace)
+{
+}
+
int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
{
return 0;
}
/* The callbacks that hook a function */
-trace_func_graph_ret_t ftrace_graph_return =
- (trace_func_graph_ret_t)ftrace_stub;
+trace_func_graph_ret_t ftrace_graph_return = ftrace_graph_return_stub;
trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
@@ -5971,7 +5976,7 @@ void unregister_ftrace_graph(void)
goto out;
ftrace_graph_active--;
- ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
+ ftrace_graph_return = ftrace_graph_return_stub;
ftrace_graph_entry = ftrace_graph_entry_stub;
__ftrace_graph_entry = ftrace_graph_entry_stub;
ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
diff --git a/kernel/trace/gpu-traces.c b/kernel/trace/gpu-traces.c
new file mode 100644
index 000000000000..a4b3f00faee3
--- /dev/null
+++ b/kernel/trace/gpu-traces.c
@@ -0,0 +1,23 @@
+/*
+ * GPU tracepoints
+ *
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/gpu.h>
+
+EXPORT_TRACEPOINT_SYMBOL(gpu_sched_switch);
+EXPORT_TRACEPOINT_SYMBOL(gpu_job_enqueue);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a47339b156ce..6d1532d6cde6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1125,6 +1125,7 @@ static struct {
{ trace_clock, "perf", 1 },
{ ktime_get_mono_fast_ns, "mono", 1 },
{ ktime_get_raw_fast_ns, "mono_raw", 1 },
+ { ktime_get_boot_fast_ns, "boot", 1 },
ARCH_TRACE_CLOCKS
};
@@ -1607,6 +1608,7 @@ static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
struct saved_cmdlines_buffer {
unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
unsigned *map_cmdline_to_pid;
+ unsigned *map_cmdline_to_tgid;
unsigned cmdline_num;
int cmdline_idx;
char *saved_cmdlines;
@@ -1640,12 +1642,23 @@ static int allocate_cmdlines_buffer(unsigned int val,
return -ENOMEM;
}
+ s->map_cmdline_to_tgid = kmalloc_array(val,
+ sizeof(*s->map_cmdline_to_tgid),
+ GFP_KERNEL);
+ if (!s->map_cmdline_to_tgid) {
+ kfree(s->map_cmdline_to_pid);
+ kfree(s->saved_cmdlines);
+ return -ENOMEM;
+ }
+
s->cmdline_idx = 0;
s->cmdline_num = val;
memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
sizeof(s->map_pid_to_cmdline));
memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
val * sizeof(*s->map_cmdline_to_pid));
+ memset(s->map_cmdline_to_tgid, NO_CMDLINE_MAP,
+ val * sizeof(*s->map_cmdline_to_tgid));
return 0;
}
@@ -1811,14 +1824,17 @@ static int trace_save_cmdline(struct task_struct *tsk)
if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
return 0;
+ preempt_disable();
/*
* It's not the end of the world if we don't get
* the lock, but we also don't want to spin
* nor do we want to disable interrupts,
* so if we miss here, then better luck next time.
*/
- if (!arch_spin_trylock(&trace_cmdline_lock))
+ if (!arch_spin_trylock(&trace_cmdline_lock)) {
+ preempt_enable();
return 0;
+ }
idx = savedcmd->map_pid_to_cmdline[tsk->pid];
if (idx == NO_CMDLINE_MAP) {
@@ -1841,8 +1857,9 @@ static int trace_save_cmdline(struct task_struct *tsk)
}
set_cmdline(idx, tsk->comm);
-
+ savedcmd->map_cmdline_to_tgid[idx] = tsk->tgid;
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
return 1;
}
@@ -1884,6 +1901,35 @@ void trace_find_cmdline(int pid, char comm[])
preempt_enable();
}
+static int __find_tgid_locked(int pid)
+{
+ unsigned map;
+ int tgid;
+
+ map = savedcmd->map_pid_to_cmdline[pid];
+ if (map != NO_CMDLINE_MAP)
+ tgid = savedcmd->map_cmdline_to_tgid[map];
+ else
+ tgid = -1;
+
+ return tgid;
+}
+
+int trace_find_tgid(int pid)
+{
+ int tgid;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ tgid = __find_tgid_locked(pid);
+
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+
+ return tgid;
+}
+
void tracing_record_cmdline(struct task_struct *tsk)
{
if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
@@ -2937,6 +2983,13 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
"# | | | | |\n");
}
+static void print_func_help_header_tgid(struct trace_buffer *buf, struct seq_file *m)
+{
+ print_event_info(buf, m);
+ seq_puts(m, "# TASK-PID TGID CPU# TIMESTAMP FUNCTION\n");
+ seq_puts(m, "# | | | | | |\n");
+}
+
static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
{
print_event_info(buf, m);
@@ -2949,6 +3002,18 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
"# | | | |||| | |\n");
}
+static void print_func_help_header_irq_tgid(struct trace_buffer *buf, struct seq_file *m)
+{
+ print_event_info(buf, m);
+ seq_puts(m, "# _-----=> irqs-off\n");
+ seq_puts(m, "# / _----=> need-resched\n");
+ seq_puts(m, "# | / _---=> hardirq/softirq\n");
+ seq_puts(m, "# || / _--=> preempt-depth\n");
+ seq_puts(m, "# ||| / delay\n");
+ seq_puts(m, "# TASK-PID TGID CPU# |||| TIMESTAMP FUNCTION\n");
+ seq_puts(m, "# | | | | |||| | |\n");
+}
+
void
print_trace_header(struct seq_file *m, struct trace_iterator *iter)
{
@@ -3022,13 +3087,14 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
return;
- if (iter->started && cpumask_test_cpu(iter->cpu, iter->started))
+ if (cpumask_available(iter->started) &&
+ cpumask_test_cpu(iter->cpu, iter->started))
return;
if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)
return;
- if (iter->started)
+ if (cpumask_available(iter->started))
cpumask_set_cpu(iter->cpu, iter->started);
/* Don't print started cpu buffer for the first entry of the trace */
@@ -3261,9 +3327,15 @@ void trace_default_header(struct seq_file *m)
} else {
if (!(trace_flags & TRACE_ITER_VERBOSE)) {
if (trace_flags & TRACE_ITER_IRQ_INFO)
- print_func_help_header_irq(iter->trace_buffer, m);
+ if (trace_flags & TRACE_ITER_TGID)
+ print_func_help_header_irq_tgid(iter->trace_buffer, m);
+ else
+ print_func_help_header_irq(iter->trace_buffer, m);
else
- print_func_help_header(iter->trace_buffer, m);
+ if (trace_flags & TRACE_ITER_TGID)
+ print_func_help_header_tgid(iter->trace_buffer, m);
+ else
+ print_func_help_header(iter->trace_buffer, m);
}
}
}
@@ -4350,10 +4422,15 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
{
char buf[64];
int r;
+ unsigned int n;
+ preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
- r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
+ n = savedcmd->cmdline_num;
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+
+ r = scnprintf(buf, sizeof(buf), "%u\n", n);
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
@@ -4362,6 +4439,7 @@ static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
{
kfree(s->saved_cmdlines);
kfree(s->map_cmdline_to_pid);
+ kfree(s->map_cmdline_to_tgid);
kfree(s);
}
@@ -4378,10 +4456,12 @@ static int tracing_resize_saved_cmdlines(unsigned int val)
return -ENOMEM;
}
+ preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
savedcmd_temp = savedcmd;
savedcmd = s;
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
free_saved_cmdlines_buffer(savedcmd_temp);
return 0;
@@ -4594,6 +4674,78 @@ static void trace_insert_enum_map(struct module *mod,
}
static ssize_t
+tracing_saved_tgids_read(struct file *file, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char *file_buf;
+ char *buf;
+ int len = 0;
+ int i;
+ int *pids;
+ int n = 0;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ pids = kmalloc_array(savedcmd->cmdline_num, 2*sizeof(int), GFP_KERNEL);
+ if (!pids) {
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < savedcmd->cmdline_num; i++) {
+ int pid;
+
+ pid = savedcmd->map_cmdline_to_pid[i];
+ if (pid == -1 || pid == NO_CMDLINE_MAP)
+ continue;
+
+ pids[n] = pid;
+ pids[n+1] = __find_tgid_locked(pid);
+ n += 2;
+ }
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+
+ if (n == 0) {
+ kfree(pids);
+ return 0;
+ }
+
+ /* enough to hold max pair of pids + space, lr and nul */
+ len = n * 12;
+ file_buf = kmalloc(len, GFP_KERNEL);
+ if (!file_buf) {
+ kfree(pids);
+ return -ENOMEM;
+ }
+
+ buf = file_buf;
+ for (i = 0; i < n && len > 0; i += 2) {
+ int r;
+
+ r = snprintf(buf, len, "%d %d\n", pids[i], pids[i+1]);
+ buf += r;
+ len -= r;
+ }
+
+ len = simple_read_from_buffer(ubuf, cnt, ppos,
+ file_buf, buf - file_buf);
+
+ kfree(file_buf);
+ kfree(pids);
+
+ return len;
+}
+
+static const struct file_operations tracing_saved_tgids_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_saved_tgids_read,
+ .llseek = generic_file_llseek,
+};
+
+static ssize_t
tracing_set_trace_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -7227,6 +7379,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("trace_marker", 0220, d_tracer,
tr, &tracing_mark_fops);
+ trace_create_file("saved_tgids", 0444, d_tracer,
+ tr, &tracing_saved_tgids_fops);
+
trace_create_file("trace_clock", 0644, d_tracer, tr,
&trace_clock_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b0d8576c27ae..d4c66331d5f5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -690,6 +690,7 @@ extern cycle_t ftrace_now(int cpu);
extern void trace_find_cmdline(int pid, char comm[]);
extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
+extern int trace_find_tgid(int pid);
#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
@@ -1009,7 +1010,8 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
FUNCTION_FLAGS \
FGRAPH_FLAGS \
STACK_FLAGS \
- BRANCH_FLAGS
+ BRANCH_FLAGS \
+ C(TGID, "print-tgid"),
/*
* By defining C, we can make TRACE_FLAGS a list of bit names
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 01e71812e174..7461d51342d7 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -65,6 +65,9 @@ struct fgraph_data {
#define TRACE_GRAPH_INDENT 2
+/* Flag options */
+#define TRACE_GRAPH_PRINT_FLAT 0x80
+
static unsigned int max_depth;
static struct tracer_opt trace_opts[] = {
@@ -88,6 +91,8 @@ static struct tracer_opt trace_opts[] = {
{ TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) },
/* Include time within nested functions */
{ TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) },
+ /* Use standard trace formatting rather than hierarchical */
+ { TRACER_OPT(funcgraph-flat, TRACE_GRAPH_PRINT_FLAT) },
{ } /* Empty entry */
};
@@ -1246,6 +1251,9 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
int cpu = iter->cpu;
int ret;
+ if (flags & TRACE_GRAPH_PRINT_FLAT)
+ return TRACE_TYPE_UNHANDLED;
+
if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
return TRACE_TYPE_HANDLED;
@@ -1303,13 +1311,6 @@ print_graph_function(struct trace_iterator *iter)
return print_graph_function_flags(iter, tracer_flags.val);
}
-static enum print_line_t
-print_graph_function_event(struct trace_iterator *iter, int flags,
- struct trace_event *event)
-{
- return print_graph_function(iter);
-}
-
static void print_lat_header(struct seq_file *s, u32 flags)
{
static const char spaces[] = " " /* 16 spaces */
@@ -1378,6 +1379,11 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags)
struct trace_iterator *iter = s->private;
struct trace_array *tr = iter->tr;
+ if (flags & TRACE_GRAPH_PRINT_FLAT) {
+ trace_default_header(s);
+ return;
+ }
+
if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO))
return;
@@ -1459,19 +1465,6 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
return 0;
}
-static struct trace_event_functions graph_functions = {
- .trace = print_graph_function_event,
-};
-
-static struct trace_event graph_trace_entry_event = {
- .type = TRACE_GRAPH_ENT,
- .funcs = &graph_functions,
-};
-
-static struct trace_event graph_trace_ret_event = {
- .type = TRACE_GRAPH_RET,
- .funcs = &graph_functions
-};
static struct tracer graph_trace __tracer_data = {
.name = "function_graph",
@@ -1548,16 +1541,6 @@ static __init int init_graph_trace(void)
{
max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
- if (!register_trace_event(&graph_trace_entry_event)) {
- pr_warn("Warning: could not register graph trace events\n");
- return 1;
- }
-
- if (!register_trace_event(&graph_trace_ret_event)) {
- pr_warn("Warning: could not register graph trace events\n");
- return 1;
- }
-
return register_tracer(&graph_trace);
}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 03cdff84d026..c180fe513eb2 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -16,6 +16,10 @@
#include "trace.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/preemptirq.h>
+
+#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER)
static struct trace_array *irqsoff_trace __read_mostly;
static int tracer_enabled __read_mostly;
@@ -451,63 +455,43 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
#else /* !CONFIG_PROVE_LOCKING */
/*
- * Stubs:
- */
-
-void trace_softirqs_on(unsigned long ip)
-{
-}
-
-void trace_softirqs_off(unsigned long ip)
-{
-}
-
-inline void print_irqtrace_events(struct task_struct *curr)
-{
-}
-
-/*
* We are only interested in hardirq on/off events:
*/
-void trace_hardirqs_on(void)
+static inline void tracer_hardirqs_on(void)
{
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
-EXPORT_SYMBOL(trace_hardirqs_on);
-void trace_hardirqs_off(void)
+static inline void tracer_hardirqs_off(void)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
-EXPORT_SYMBOL(trace_hardirqs_off);
-__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr)
{
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, caller_addr);
}
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
-__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, caller_addr);
}
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
#endif /* CONFIG_PROVE_LOCKING */
#endif /* CONFIG_IRQSOFF_TRACER */
#ifdef CONFIG_PREEMPT_TRACER
-void trace_preempt_on(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1)
{
if (preempt_trace() && !irq_trace())
stop_critical_timing(a0, a1);
}
-void trace_preempt_off(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1)
{
if (preempt_trace() && !irq_trace())
start_critical_timing(a0, a1);
@@ -769,3 +753,100 @@ __init static int init_irqsoff_tracer(void)
return 0;
}
core_initcall(init_irqsoff_tracer);
+#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */
+
+#ifndef CONFIG_IRQSOFF_TRACER
+static inline void tracer_hardirqs_on(void) { }
+static inline void tracer_hardirqs_off(void) { }
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { }
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { }
+#endif
+
+#ifndef CONFIG_PREEMPT_TRACER
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
+#endif
+
+/* Per-cpu variable to prevent redundant calls when IRQs already off */
+static DEFINE_PER_CPU(int, tracing_irq_cpu);
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
+void trace_hardirqs_on(void)
+{
+ if (!this_cpu_read(tracing_irq_cpu))
+ return;
+
+ trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ tracer_hardirqs_on();
+
+ this_cpu_write(tracing_irq_cpu, 0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+ if (this_cpu_read(tracing_irq_cpu))
+ return;
+
+ this_cpu_write(tracing_irq_cpu, 1);
+
+ trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ tracer_hardirqs_off();
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+ if (!this_cpu_read(tracing_irq_cpu))
+ return;
+
+ trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
+ tracer_hardirqs_on_caller(caller_addr);
+
+ this_cpu_write(tracing_irq_cpu, 0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+ if (this_cpu_read(tracing_irq_cpu))
+ return;
+
+ this_cpu_write(tracing_irq_cpu, 1);
+
+ trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
+ tracer_hardirqs_off_caller(caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+/*
+ * Stubs:
+ */
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+#endif
+
+#if defined(CONFIG_PREEMPT_TRACER) || \
+ (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+ trace_preempt_enable_rcuidle(a0, a1);
+ tracer_preempt_on(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+ trace_preempt_disable_rcuidle(a0, a1);
+ tracer_preempt_off(a0, a1);
+}
+#endif
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 3fc20422c166..034675950649 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -530,11 +530,21 @@ int trace_print_context(struct trace_iterator *iter)
unsigned long long t;
unsigned long secs, usec_rem;
char comm[TASK_COMM_LEN];
+ int tgid;
trace_find_cmdline(entry->pid, comm);
- trace_seq_printf(s, "%16s-%-5d [%03d] ",
- comm, entry->pid, iter->cpu);
+ trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
+
+ if (tr->trace_flags & TRACE_ITER_TGID) {
+ tgid = trace_find_tgid(entry->pid);
+ if (tgid < 0)
+ trace_seq_puts(s, "(-----) ");
+ else
+ trace_seq_printf(s, "(%5d) ", tgid);
+ }
+
+ trace_seq_printf(s, "[%03d] ", iter->cpu);
if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
trace_print_lat_fmt(s, entry);
@@ -849,6 +859,174 @@ static struct trace_event trace_fn_event = {
.funcs = &trace_fn_funcs,
};
+/* TRACE_GRAPH_ENT */
+static enum print_line_t trace_graph_ent_trace(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_seq *s = &iter->seq;
+ struct ftrace_graph_ent_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_puts(s, "graph_ent: func=");
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ if (!seq_print_ip_sym(s, field->graph_ent.func, flags))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ trace_seq_puts(s, "\n");
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ent_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ent_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(&iter->seq, "%lx %d\n",
+ field->graph_ent.func,
+ field->graph_ent.depth);
+ if (trace_seq_has_overflowed(&iter->seq))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ent_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ent_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_HEX_FIELD(s, field->graph_ent.func);
+ SEQ_PUT_HEX_FIELD(s, field->graph_ent.depth);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ent_bin(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ent_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_FIELD(s, field->graph_ent.func);
+ SEQ_PUT_FIELD(s, field->graph_ent.depth);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static struct trace_event_functions trace_graph_ent_funcs = {
+ .trace = trace_graph_ent_trace,
+ .raw = trace_graph_ent_raw,
+ .hex = trace_graph_ent_hex,
+ .binary = trace_graph_ent_bin,
+};
+
+static struct trace_event trace_graph_ent_event = {
+ .type = TRACE_GRAPH_ENT,
+ .funcs = &trace_graph_ent_funcs,
+};
+
+/* TRACE_GRAPH_RET */
+static enum print_line_t trace_graph_ret_trace(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry = iter->ent;
+ struct ftrace_graph_ret_entry *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_puts(s, "graph_ret: func=");
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ if (!seq_print_ip_sym(s, field->ret.func, flags))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ trace_seq_puts(s, "\n");
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ret_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ret_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(&iter->seq, "%lx %lld %lld %ld %d\n",
+ field->ret.func,
+ field->ret.calltime,
+ field->ret.rettime,
+ field->ret.overrun,
+ field->ret.depth);
+ if (trace_seq_has_overflowed(&iter->seq))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ret_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ret_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_HEX_FIELD(s, field->ret.func);
+ SEQ_PUT_HEX_FIELD(s, field->ret.calltime);
+ SEQ_PUT_HEX_FIELD(s, field->ret.rettime);
+ SEQ_PUT_HEX_FIELD(s, field->ret.overrun);
+ SEQ_PUT_HEX_FIELD(s, field->ret.depth);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ret_bin(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ret_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_FIELD(s, field->ret.func);
+ SEQ_PUT_FIELD(s, field->ret.calltime);
+ SEQ_PUT_FIELD(s, field->ret.rettime);
+ SEQ_PUT_FIELD(s, field->ret.overrun);
+ SEQ_PUT_FIELD(s, field->ret.depth);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static struct trace_event_functions trace_graph_ret_funcs = {
+ .trace = trace_graph_ret_trace,
+ .raw = trace_graph_ret_raw,
+ .hex = trace_graph_ret_hex,
+ .binary = trace_graph_ret_bin,
+};
+
+static struct trace_event trace_graph_ret_event = {
+ .type = TRACE_GRAPH_RET,
+ .funcs = &trace_graph_ret_funcs,
+};
+
/* TRACE_CTX an TRACE_WAKE */
static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
char *delim)
@@ -1291,6 +1469,8 @@ static struct trace_event trace_print_event = {
static struct trace_event *events[] __initdata = {
&trace_fn_event,
+ &trace_graph_ent_event,
+ &trace_graph_ret_event,
&trace_ctx_event,
&trace_wake_event,
&trace_stack_event,
diff --git a/kernel/user.c b/kernel/user.c
index b069ccbfb0b0..41e94e453836 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,6 +16,7 @@
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
+#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
/*
@@ -201,6 +202,7 @@ struct user_struct *alloc_uid(kuid_t uid)
}
spin_unlock_irq(&uidhash_lock);
}
+ proc_register_uid(uid);
return up;
@@ -222,6 +224,7 @@ static int __init uid_cache_init(void)
spin_lock_irq(&uidhash_lock);
uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
spin_unlock_irq(&uidhash_lock);
+ proc_register_uid(GLOBAL_ROOT_UID);
return 0;
}
diff --git a/lib/Kconfig b/lib/Kconfig
index 260a80e313b9..8deab4f7e81a 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -185,6 +185,9 @@ config CRC8
when they need to do cyclic redundancy check according CRC8
algorithm. Module will be called crc8.
+config XXHASH
+ tristate
+
config AUDIT_GENERIC
bool
depends on AUDIT && !AUDIT_ARCH
@@ -239,6 +242,14 @@ config LZ4HC_COMPRESS
config LZ4_DECOMPRESS
tristate
+config ZSTD_COMPRESS
+ select XXHASH
+ tristate
+
+config ZSTD_DECOMPRESS
+ select XXHASH
+ tristate
+
source "lib/xz/Kconfig"
#
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 58a22ca10f33..930d74de4bce 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -922,6 +922,15 @@ config SCHED_INFO
bool
default n
+config PANIC_ON_RT_THROTTLING
+ bool "Panic on RT throttling"
+ help
+ Say Y here to enable the kernel to panic when a realtime
+ runqueue is throttled. This may be useful for detecting
+ and debugging RT throttling issues.
+
+ Say N if unsure.
+
config SCHEDSTATS
bool "Collect scheduler statistics"
depends on DEBUG_KERNEL && PROC_FS
@@ -1963,6 +1972,16 @@ config TEST_STATIC_KEYS
If unsure, say N.
+config BUG_ON_DATA_CORRUPTION
+ bool "Trigger a BUG when data corruption is detected"
+ select CONFIG_DEBUG_LIST
+ help
+ Select this option if the kernel should BUG when it encounters
+ data corruption in kernel memory structures when they get checked
+ for validity.
+
+ If unsure, say N.
+
source "samples/Kconfig"
source "lib/Kconfig.kgdb"
diff --git a/lib/Makefile b/lib/Makefile
index 50144a3aeebd..c14af95f06a1 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -19,7 +19,7 @@ KCOV_INSTRUMENT_dynamic_debug.o := n
lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o dump_stack.o timerqueue.o\
idr.o int_sqrt.o extable.o \
- sha1.o chacha20.o md5.o irq_regs.o argv_split.o \
+ sha1.o chacha.o md5.o irq_regs.o argv_split.o \
flex_proportions.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o win_minmax.o
@@ -93,6 +93,7 @@ obj-$(CONFIG_CRC32) += crc32.o
obj-$(CONFIG_CRC7) += crc7.o
obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
obj-$(CONFIG_CRC8) += crc8.o
+obj-$(CONFIG_XXHASH) += xxhash.o
obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
obj-$(CONFIG_842_COMPRESS) += 842/
@@ -106,6 +107,8 @@ obj-$(CONFIG_LZO_DECOMPRESS) += lzo/
obj-$(CONFIG_LZ4_COMPRESS) += lz4/
obj-$(CONFIG_LZ4HC_COMPRESS) += lz4/
obj-$(CONFIG_LZ4_DECOMPRESS) += lz4/
+obj-$(CONFIG_ZSTD_COMPRESS) += zstd/
+obj-$(CONFIG_ZSTD_DECOMPRESS) += zstd/
obj-$(CONFIG_XZ_DEC) += xz/
obj-$(CONFIG_RAID6_PQ) += raid6/
diff --git a/lib/chacha.c b/lib/chacha.c
new file mode 100644
index 000000000000..a46d2832dbab
--- /dev/null
+++ b/lib/chacha.c
@@ -0,0 +1,117 @@
+/*
+ * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/bitops.h>
+#include <linux/cryptohash.h>
+#include <asm/unaligned.h>
+#include <crypto/chacha.h>
+
+static void chacha_permute(u32 *x, int nrounds)
+{
+ int i;
+
+ /* whitelist the allowed round counts */
+ WARN_ON_ONCE(nrounds != 20 && nrounds != 12);
+
+ for (i = 0; i < nrounds; i += 2) {
+ x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16);
+ x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16);
+ x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16);
+ x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16);
+
+ x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12);
+ x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12);
+ x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12);
+ x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12);
+
+ x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8);
+ x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8);
+ x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8);
+ x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8);
+
+ x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7);
+ x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7);
+ x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7);
+ x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7);
+
+ x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16);
+ x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16);
+ x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16);
+ x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16);
+
+ x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12);
+ x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12);
+ x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12);
+ x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12);
+
+ x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8);
+ x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8);
+ x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8);
+ x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8);
+
+ x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7);
+ x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7);
+ x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7);
+ x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7);
+ }
+}
+
+/**
+ * chacha_block - generate one keystream block and increment block counter
+ * @state: input state matrix (16 32-bit words)
+ * @stream: output keystream block (64 bytes)
+ * @nrounds: number of rounds (20 or 12; 20 is recommended)
+ *
+ * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
+ * The caller has already converted the endianness of the input. This function
+ * also handles incrementing the block counter in the input matrix.
+ */
+void chacha_block(u32 *state, u8 *stream, int nrounds)
+{
+ u32 x[16];
+ int i;
+
+ memcpy(x, state, 64);
+
+ chacha_permute(x, nrounds);
+
+ for (i = 0; i < ARRAY_SIZE(x); i++)
+ put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]);
+
+ state[12]++;
+}
+EXPORT_SYMBOL(chacha_block);
+
+/**
+ * hchacha_block - abbreviated ChaCha core, for XChaCha
+ * @in: input state matrix (16 32-bit words)
+ * @out: output (8 32-bit words)
+ * @nrounds: number of rounds (20 or 12; 20 is recommended)
+ *
+ * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
+ * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha
+ * skips the final addition of the initial state, and outputs only certain words
+ * of the state. It should not be used for streaming directly.
+ */
+void hchacha_block(const u32 *in, u32 *out, int nrounds)
+{
+ u32 x[16];
+
+ memcpy(x, in, 64);
+
+ chacha_permute(x, nrounds);
+
+ memcpy(&out[0], &x[0], 16);
+ memcpy(&out[4], &x[12], 16);
+}
+EXPORT_SYMBOL(hchacha_block);
diff --git a/lib/chacha20.c b/lib/chacha20.c
deleted file mode 100644
index 250ceed9ec9a..000000000000
--- a/lib/chacha20.c
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/bitops.h>
-#include <linux/cryptohash.h>
-#include <asm/unaligned.h>
-#include <crypto/chacha20.h>
-
-static inline u32 rotl32(u32 v, u8 n)
-{
- return (v << n) | (v >> (sizeof(v) * 8 - n));
-}
-
-extern void chacha20_block(u32 *state, void *stream)
-{
- u32 x[16], *out = stream;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(x); i++)
- x[i] = state[i];
-
- for (i = 0; i < 20; i += 2) {
- x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 16);
- x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 16);
- x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 16);
- x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 16);
-
- x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 12);
- x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 12);
- x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 12);
- x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 12);
-
- x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 8);
- x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 8);
- x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 8);
- x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 8);
-
- x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 7);
- x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 7);
- x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 7);
- x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 7);
-
- x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 16);
- x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 16);
- x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 16);
- x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 16);
-
- x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 12);
- x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 12);
- x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 12);
- x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 12);
-
- x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 8);
- x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 8);
- x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 8);
- x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 8);
-
- x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 7);
- x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 7);
- x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 7);
- x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 7);
- }
-
- for (i = 0; i < ARRAY_SIZE(x); i++)
- out[i] = cpu_to_le32(x[i] + state[i]);
-
- state[12]++;
-}
-EXPORT_SYMBOL(chacha20_block);
diff --git a/lib/digsig.c b/lib/digsig.c
index a876156503f0..6ba6fcd92dd1 100644
--- a/lib/digsig.c
+++ b/lib/digsig.c
@@ -85,7 +85,7 @@ static int digsig_verify_rsa(struct key *key,
struct pubkey_hdr *pkh;
down_read(&key->sem);
- ukp = user_key_payload(key);
+ ukp = user_key_payload_locked(key);
if (!ukp) {
/* key was revoked before we acquired its semaphore */
diff --git a/lib/list_debug.c b/lib/list_debug.c
index 3859bf63561c..a34db8d27667 100644
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@@ -2,8 +2,7 @@
* Copyright 2006, Red Hat, Inc., Dave Jones
* Released under the General Public License (GPL).
*
- * This file contains the linked list implementations for
- * DEBUG_LIST.
+ * This file contains the linked list validation for DEBUG_LIST.
*/
#include <linux/export.h>
@@ -13,88 +12,51 @@
#include <linux/rculist.h>
/*
- * Insert a new entry between two known consecutive entries.
- *
- * This is only for internal list manipulation where we know
- * the prev/next entries already!
+ * Check that the data structures for the list manipulations are reasonably
+ * valid. Failures here indicate memory corruption (and possibly an exploit
+ * attempt).
*/
-void __list_add(struct list_head *new,
- struct list_head *prev,
- struct list_head *next)
+bool __list_add_valid(struct list_head *new, struct list_head *prev,
+ struct list_head *next)
{
- WARN(next->prev != prev,
- "list_add corruption. next->prev should be "
- "prev (%p), but was %p. (next=%p).\n",
- prev, next->prev, next);
- WARN(prev->next != next,
- "list_add corruption. prev->next should be "
- "next (%p), but was %p. (prev=%p).\n",
- next, prev->next, prev);
- WARN(new == prev || new == next,
- "list_add double add: new=%p, prev=%p, next=%p.\n",
- new, prev, next);
- next->prev = new;
- new->next = next;
- new->prev = prev;
- WRITE_ONCE(prev->next, new);
+ if (CHECK_DATA_CORRUPTION(next->prev != prev,
+ "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n",
+ prev, next->prev, next) ||
+ CHECK_DATA_CORRUPTION(prev->next != next,
+ "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n",
+ next, prev->next, prev) ||
+ CHECK_DATA_CORRUPTION(new == prev || new == next,
+ "list_add double add: new=%p, prev=%p, next=%p.\n",
+ new, prev, next))
+ return false;
+
+ return true;
}
-EXPORT_SYMBOL(__list_add);
+EXPORT_SYMBOL(__list_add_valid);
-void __list_del_entry(struct list_head *entry)
+bool __list_del_entry_valid(struct list_head *entry)
{
struct list_head *prev, *next;
prev = entry->prev;
next = entry->next;
- if (WARN(next == LIST_POISON1,
- "list_del corruption, %p->next is LIST_POISON1 (%p)\n",
- entry, LIST_POISON1) ||
- WARN(prev == LIST_POISON2,
- "list_del corruption, %p->prev is LIST_POISON2 (%p)\n",
- entry, LIST_POISON2) ||
- WARN(prev->next != entry,
- "list_del corruption. prev->next should be %p, "
- "but was %p\n", entry, prev->next) ||
- WARN(next->prev != entry,
- "list_del corruption. next->prev should be %p, "
- "but was %p\n", entry, next->prev))
- return;
-
- __list_del(prev, next);
-}
-EXPORT_SYMBOL(__list_del_entry);
-
-/**
- * list_del - deletes entry from list.
- * @entry: the element to delete from the list.
- * Note: list_empty on entry does not return true after this, the entry is
- * in an undefined state.
- */
-void list_del(struct list_head *entry)
-{
- __list_del_entry(entry);
- entry->next = LIST_POISON1;
- entry->prev = LIST_POISON2;
-}
-EXPORT_SYMBOL(list_del);
+ if (CHECK_DATA_CORRUPTION(next == LIST_POISON1,
+ "list_del corruption, %p->next is LIST_POISON1 (%p)\n",
+ entry, LIST_POISON1) ||
+ CHECK_DATA_CORRUPTION(prev == LIST_POISON2,
+ "list_del corruption, %p->prev is LIST_POISON2 (%p)\n",
+ entry, LIST_POISON2) ||
+ CHECK_DATA_CORRUPTION(prev->next != entry,
+ "list_del corruption. prev->next should be %p, but was %p\n",
+ entry, prev->next) ||
+ CHECK_DATA_CORRUPTION(next->prev != entry,
+ "list_del corruption. next->prev should be %p, but was %p\n",
+ entry, next->prev))
+ return false;
+
+ return true;
-/*
- * RCU variants.
- */
-void __list_add_rcu(struct list_head *new,
- struct list_head *prev, struct list_head *next)
-{
- WARN(next->prev != prev,
- "list_add_rcu corruption. next->prev should be prev (%p), but was %p. (next=%p).\n",
- prev, next->prev, next);
- WARN(prev->next != next,
- "list_add_rcu corruption. prev->next should be next (%p), but was %p. (prev=%p).\n",
- next, prev->next, prev);
- new->next = next;
- new->prev = prev;
- rcu_assign_pointer(list_next_rcu(prev), new);
- next->prev = new;
}
-EXPORT_SYMBOL(__list_add_rcu);
+EXPORT_SYMBOL(__list_del_entry_valid);
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index fbdf87920093..8fcad6191ade 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -19,6 +19,7 @@
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/module.h>
+#include <linux/kasan.h>
/*
* Note: test functions are marked noinline so that their names appear in
@@ -439,8 +440,34 @@ static noinline void __init use_after_scope_test(void)
p[1023] = 1;
}
+static noinline void __init kasan_alloca_oob_left(void)
+{
+ volatile int i = 10;
+ char alloca_array[i];
+ char *p = alloca_array - 1;
+
+ pr_info("out-of-bounds to left on alloca\n");
+ *(volatile char *)p;
+}
+
+static noinline void __init kasan_alloca_oob_right(void)
+{
+ volatile int i = 10;
+ char alloca_array[i];
+ char *p = alloca_array + i;
+
+ pr_info("out-of-bounds to right on alloca\n");
+ *(volatile char *)p;
+}
+
static int __init kmalloc_tests_init(void)
{
+ /*
+ * Temporarily enable multi-shot mode. Otherwise, we'd only get a
+ * report for the first case.
+ */
+ bool multishot = kasan_save_enable_multi_shot();
+
kmalloc_oob_right();
kmalloc_oob_left();
kmalloc_node_oob_right();
@@ -462,9 +489,14 @@ static int __init kmalloc_tests_init(void)
kmem_cache_oob();
kasan_stack_oob();
kasan_global_oob();
+ kasan_alloca_oob_left();
+ kasan_alloca_oob_right();
ksize_unpoisons_memory();
copy_user_test();
use_after_scope_test();
+
+ kasan_restore_multi_shot(multishot);
+
return -EAGAIN;
}
diff --git a/lib/xxhash.c b/lib/xxhash.c
new file mode 100644
index 000000000000..aa61e2a3802f
--- /dev/null
+++ b/lib/xxhash.c
@@ -0,0 +1,500 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Copyright (C) 2012-2016, Yann Collet.
+ *
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ *
+ * You can contact the author at:
+ * - xxHash homepage: http://cyan4973.github.io/xxHash/
+ * - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+#include <asm/unaligned.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/xxhash.h>
+
+/*-*************************************
+ * Macros
+ **************************************/
+#define xxh_rotl32(x, r) ((x << r) | (x >> (32 - r)))
+#define xxh_rotl64(x, r) ((x << r) | (x >> (64 - r)))
+
+#ifdef __LITTLE_ENDIAN
+# define XXH_CPU_LITTLE_ENDIAN 1
+#else
+# define XXH_CPU_LITTLE_ENDIAN 0
+#endif
+
+/*-*************************************
+ * Constants
+ **************************************/
+static const uint32_t PRIME32_1 = 2654435761U;
+static const uint32_t PRIME32_2 = 2246822519U;
+static const uint32_t PRIME32_3 = 3266489917U;
+static const uint32_t PRIME32_4 = 668265263U;
+static const uint32_t PRIME32_5 = 374761393U;
+
+static const uint64_t PRIME64_1 = 11400714785074694791ULL;
+static const uint64_t PRIME64_2 = 14029467366897019727ULL;
+static const uint64_t PRIME64_3 = 1609587929392839161ULL;
+static const uint64_t PRIME64_4 = 9650029242287828579ULL;
+static const uint64_t PRIME64_5 = 2870177450012600261ULL;
+
+/*-**************************
+ * Utils
+ ***************************/
+void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+}
+EXPORT_SYMBOL(xxh32_copy_state);
+
+void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+}
+EXPORT_SYMBOL(xxh64_copy_state);
+
+/*-***************************
+ * Simple Hash Functions
+ ****************************/
+static uint32_t xxh32_round(uint32_t seed, const uint32_t input)
+{
+ seed += input * PRIME32_2;
+ seed = xxh_rotl32(seed, 13);
+ seed *= PRIME32_1;
+ return seed;
+}
+
+uint32_t xxh32(const void *input, const size_t len, const uint32_t seed)
+{
+ const uint8_t *p = (const uint8_t *)input;
+ const uint8_t *b_end = p + len;
+ uint32_t h32;
+
+ if (len >= 16) {
+ const uint8_t *const limit = b_end - 16;
+ uint32_t v1 = seed + PRIME32_1 + PRIME32_2;
+ uint32_t v2 = seed + PRIME32_2;
+ uint32_t v3 = seed + 0;
+ uint32_t v4 = seed - PRIME32_1;
+
+ do {
+ v1 = xxh32_round(v1, get_unaligned_le32(p));
+ p += 4;
+ v2 = xxh32_round(v2, get_unaligned_le32(p));
+ p += 4;
+ v3 = xxh32_round(v3, get_unaligned_le32(p));
+ p += 4;
+ v4 = xxh32_round(v4, get_unaligned_le32(p));
+ p += 4;
+ } while (p <= limit);
+
+ h32 = xxh_rotl32(v1, 1) + xxh_rotl32(v2, 7) +
+ xxh_rotl32(v3, 12) + xxh_rotl32(v4, 18);
+ } else {
+ h32 = seed + PRIME32_5;
+ }
+
+ h32 += (uint32_t)len;
+
+ while (p + 4 <= b_end) {
+ h32 += get_unaligned_le32(p) * PRIME32_3;
+ h32 = xxh_rotl32(h32, 17) * PRIME32_4;
+ p += 4;
+ }
+
+ while (p < b_end) {
+ h32 += (*p) * PRIME32_5;
+ h32 = xxh_rotl32(h32, 11) * PRIME32_1;
+ p++;
+ }
+
+ h32 ^= h32 >> 15;
+ h32 *= PRIME32_2;
+ h32 ^= h32 >> 13;
+ h32 *= PRIME32_3;
+ h32 ^= h32 >> 16;
+
+ return h32;
+}
+EXPORT_SYMBOL(xxh32);
+
+static uint64_t xxh64_round(uint64_t acc, const uint64_t input)
+{
+ acc += input * PRIME64_2;
+ acc = xxh_rotl64(acc, 31);
+ acc *= PRIME64_1;
+ return acc;
+}
+
+static uint64_t xxh64_merge_round(uint64_t acc, uint64_t val)
+{
+ val = xxh64_round(0, val);
+ acc ^= val;
+ acc = acc * PRIME64_1 + PRIME64_4;
+ return acc;
+}
+
+uint64_t xxh64(const void *input, const size_t len, const uint64_t seed)
+{
+ const uint8_t *p = (const uint8_t *)input;
+ const uint8_t *const b_end = p + len;
+ uint64_t h64;
+
+ if (len >= 32) {
+ const uint8_t *const limit = b_end - 32;
+ uint64_t v1 = seed + PRIME64_1 + PRIME64_2;
+ uint64_t v2 = seed + PRIME64_2;
+ uint64_t v3 = seed + 0;
+ uint64_t v4 = seed - PRIME64_1;
+
+ do {
+ v1 = xxh64_round(v1, get_unaligned_le64(p));
+ p += 8;
+ v2 = xxh64_round(v2, get_unaligned_le64(p));
+ p += 8;
+ v3 = xxh64_round(v3, get_unaligned_le64(p));
+ p += 8;
+ v4 = xxh64_round(v4, get_unaligned_le64(p));
+ p += 8;
+ } while (p <= limit);
+
+ h64 = xxh_rotl64(v1, 1) + xxh_rotl64(v2, 7) +
+ xxh_rotl64(v3, 12) + xxh_rotl64(v4, 18);
+ h64 = xxh64_merge_round(h64, v1);
+ h64 = xxh64_merge_round(h64, v2);
+ h64 = xxh64_merge_round(h64, v3);
+ h64 = xxh64_merge_round(h64, v4);
+
+ } else {
+ h64 = seed + PRIME64_5;
+ }
+
+ h64 += (uint64_t)len;
+
+ while (p + 8 <= b_end) {
+ const uint64_t k1 = xxh64_round(0, get_unaligned_le64(p));
+
+ h64 ^= k1;
+ h64 = xxh_rotl64(h64, 27) * PRIME64_1 + PRIME64_4;
+ p += 8;
+ }
+
+ if (p + 4 <= b_end) {
+ h64 ^= (uint64_t)(get_unaligned_le32(p)) * PRIME64_1;
+ h64 = xxh_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+ p += 4;
+ }
+
+ while (p < b_end) {
+ h64 ^= (*p) * PRIME64_5;
+ h64 = xxh_rotl64(h64, 11) * PRIME64_1;
+ p++;
+ }
+
+ h64 ^= h64 >> 33;
+ h64 *= PRIME64_2;
+ h64 ^= h64 >> 29;
+ h64 *= PRIME64_3;
+ h64 ^= h64 >> 32;
+
+ return h64;
+}
+EXPORT_SYMBOL(xxh64);
+
+/*-**************************************************
+ * Advanced Hash Functions
+ ***************************************************/
+void xxh32_reset(struct xxh32_state *statePtr, const uint32_t seed)
+{
+ /* use a local state for memcpy() to avoid strict-aliasing warnings */
+ struct xxh32_state state;
+
+ memset(&state, 0, sizeof(state));
+ state.v1 = seed + PRIME32_1 + PRIME32_2;
+ state.v2 = seed + PRIME32_2;
+ state.v3 = seed + 0;
+ state.v4 = seed - PRIME32_1;
+ memcpy(statePtr, &state, sizeof(state));
+}
+EXPORT_SYMBOL(xxh32_reset);
+
+void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed)
+{
+ /* use a local state for memcpy() to avoid strict-aliasing warnings */
+ struct xxh64_state state;
+
+ memset(&state, 0, sizeof(state));
+ state.v1 = seed + PRIME64_1 + PRIME64_2;
+ state.v2 = seed + PRIME64_2;
+ state.v3 = seed + 0;
+ state.v4 = seed - PRIME64_1;
+ memcpy(statePtr, &state, sizeof(state));
+}
+EXPORT_SYMBOL(xxh64_reset);
+
+int xxh32_update(struct xxh32_state *state, const void *input, const size_t len)
+{
+ const uint8_t *p = (const uint8_t *)input;
+ const uint8_t *const b_end = p + len;
+
+ if (input == NULL)
+ return -EINVAL;
+
+ state->total_len_32 += (uint32_t)len;
+ state->large_len |= (len >= 16) | (state->total_len_32 >= 16);
+
+ if (state->memsize + len < 16) { /* fill in tmp buffer */
+ memcpy((uint8_t *)(state->mem32) + state->memsize, input, len);
+ state->memsize += (uint32_t)len;
+ return 0;
+ }
+
+ if (state->memsize) { /* some data left from previous update */
+ const uint32_t *p32 = state->mem32;
+
+ memcpy((uint8_t *)(state->mem32) + state->memsize, input,
+ 16 - state->memsize);
+
+ state->v1 = xxh32_round(state->v1, get_unaligned_le32(p32));
+ p32++;
+ state->v2 = xxh32_round(state->v2, get_unaligned_le32(p32));
+ p32++;
+ state->v3 = xxh32_round(state->v3, get_unaligned_le32(p32));
+ p32++;
+ state->v4 = xxh32_round(state->v4, get_unaligned_le32(p32));
+ p32++;
+
+ p += 16-state->memsize;
+ state->memsize = 0;
+ }
+
+ if (p <= b_end - 16) {
+ const uint8_t *const limit = b_end - 16;
+ uint32_t v1 = state->v1;
+ uint32_t v2 = state->v2;
+ uint32_t v3 = state->v3;
+ uint32_t v4 = state->v4;
+
+ do {
+ v1 = xxh32_round(v1, get_unaligned_le32(p));
+ p += 4;
+ v2 = xxh32_round(v2, get_unaligned_le32(p));
+ p += 4;
+ v3 = xxh32_round(v3, get_unaligned_le32(p));
+ p += 4;
+ v4 = xxh32_round(v4, get_unaligned_le32(p));
+ p += 4;
+ } while (p <= limit);
+
+ state->v1 = v1;
+ state->v2 = v2;
+ state->v3 = v3;
+ state->v4 = v4;
+ }
+
+ if (p < b_end) {
+ memcpy(state->mem32, p, (size_t)(b_end-p));
+ state->memsize = (uint32_t)(b_end-p);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(xxh32_update);
+
+uint32_t xxh32_digest(const struct xxh32_state *state)
+{
+ const uint8_t *p = (const uint8_t *)state->mem32;
+ const uint8_t *const b_end = (const uint8_t *)(state->mem32) +
+ state->memsize;
+ uint32_t h32;
+
+ if (state->large_len) {
+ h32 = xxh_rotl32(state->v1, 1) + xxh_rotl32(state->v2, 7) +
+ xxh_rotl32(state->v3, 12) + xxh_rotl32(state->v4, 18);
+ } else {
+ h32 = state->v3 /* == seed */ + PRIME32_5;
+ }
+
+ h32 += state->total_len_32;
+
+ while (p + 4 <= b_end) {
+ h32 += get_unaligned_le32(p) * PRIME32_3;
+ h32 = xxh_rotl32(h32, 17) * PRIME32_4;
+ p += 4;
+ }
+
+ while (p < b_end) {
+ h32 += (*p) * PRIME32_5;
+ h32 = xxh_rotl32(h32, 11) * PRIME32_1;
+ p++;
+ }
+
+ h32 ^= h32 >> 15;
+ h32 *= PRIME32_2;
+ h32 ^= h32 >> 13;
+ h32 *= PRIME32_3;
+ h32 ^= h32 >> 16;
+
+ return h32;
+}
+EXPORT_SYMBOL(xxh32_digest);
+
+int xxh64_update(struct xxh64_state *state, const void *input, const size_t len)
+{
+ const uint8_t *p = (const uint8_t *)input;
+ const uint8_t *const b_end = p + len;
+
+ if (input == NULL)
+ return -EINVAL;
+
+ state->total_len += len;
+
+ if (state->memsize + len < 32) { /* fill in tmp buffer */
+ memcpy(((uint8_t *)state->mem64) + state->memsize, input, len);
+ state->memsize += (uint32_t)len;
+ return 0;
+ }
+
+ if (state->memsize) { /* tmp buffer is full */
+ uint64_t *p64 = state->mem64;
+
+ memcpy(((uint8_t *)p64) + state->memsize, input,
+ 32 - state->memsize);
+
+ state->v1 = xxh64_round(state->v1, get_unaligned_le64(p64));
+ p64++;
+ state->v2 = xxh64_round(state->v2, get_unaligned_le64(p64));
+ p64++;
+ state->v3 = xxh64_round(state->v3, get_unaligned_le64(p64));
+ p64++;
+ state->v4 = xxh64_round(state->v4, get_unaligned_le64(p64));
+
+ p += 32 - state->memsize;
+ state->memsize = 0;
+ }
+
+ if (p + 32 <= b_end) {
+ const uint8_t *const limit = b_end - 32;
+ uint64_t v1 = state->v1;
+ uint64_t v2 = state->v2;
+ uint64_t v3 = state->v3;
+ uint64_t v4 = state->v4;
+
+ do {
+ v1 = xxh64_round(v1, get_unaligned_le64(p));
+ p += 8;
+ v2 = xxh64_round(v2, get_unaligned_le64(p));
+ p += 8;
+ v3 = xxh64_round(v3, get_unaligned_le64(p));
+ p += 8;
+ v4 = xxh64_round(v4, get_unaligned_le64(p));
+ p += 8;
+ } while (p <= limit);
+
+ state->v1 = v1;
+ state->v2 = v2;
+ state->v3 = v3;
+ state->v4 = v4;
+ }
+
+ if (p < b_end) {
+ memcpy(state->mem64, p, (size_t)(b_end-p));
+ state->memsize = (uint32_t)(b_end - p);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(xxh64_update);
+
+uint64_t xxh64_digest(const struct xxh64_state *state)
+{
+ const uint8_t *p = (const uint8_t *)state->mem64;
+ const uint8_t *const b_end = (const uint8_t *)state->mem64 +
+ state->memsize;
+ uint64_t h64;
+
+ if (state->total_len >= 32) {
+ const uint64_t v1 = state->v1;
+ const uint64_t v2 = state->v2;
+ const uint64_t v3 = state->v3;
+ const uint64_t v4 = state->v4;
+
+ h64 = xxh_rotl64(v1, 1) + xxh_rotl64(v2, 7) +
+ xxh_rotl64(v3, 12) + xxh_rotl64(v4, 18);
+ h64 = xxh64_merge_round(h64, v1);
+ h64 = xxh64_merge_round(h64, v2);
+ h64 = xxh64_merge_round(h64, v3);
+ h64 = xxh64_merge_round(h64, v4);
+ } else {
+ h64 = state->v3 + PRIME64_5;
+ }
+
+ h64 += (uint64_t)state->total_len;
+
+ while (p + 8 <= b_end) {
+ const uint64_t k1 = xxh64_round(0, get_unaligned_le64(p));
+
+ h64 ^= k1;
+ h64 = xxh_rotl64(h64, 27) * PRIME64_1 + PRIME64_4;
+ p += 8;
+ }
+
+ if (p + 4 <= b_end) {
+ h64 ^= (uint64_t)(get_unaligned_le32(p)) * PRIME64_1;
+ h64 = xxh_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+ p += 4;
+ }
+
+ while (p < b_end) {
+ h64 ^= (*p) * PRIME64_5;
+ h64 = xxh_rotl64(h64, 11) * PRIME64_1;
+ p++;
+ }
+
+ h64 ^= h64 >> 33;
+ h64 *= PRIME64_2;
+ h64 ^= h64 >> 29;
+ h64 *= PRIME64_3;
+ h64 ^= h64 >> 32;
+
+ return h64;
+}
+EXPORT_SYMBOL(xxh64_digest);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("xxHash");
diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile
new file mode 100644
index 000000000000..dd0a359c135b
--- /dev/null
+++ b/lib/zstd/Makefile
@@ -0,0 +1,18 @@
+obj-$(CONFIG_ZSTD_COMPRESS) += zstd_compress.o
+obj-$(CONFIG_ZSTD_DECOMPRESS) += zstd_decompress.o
+
+ccflags-y += -O3
+
+# Object files unique to zstd_compress and zstd_decompress
+zstd_compress-y := fse_compress.o huf_compress.o compress.o
+zstd_decompress-y := huf_decompress.o decompress.o
+
+# These object files are shared between the modules.
+# Always add them to zstd_compress.
+# Unless both zstd_compress and zstd_decompress are built in
+# then also add them to zstd_decompress.
+zstd_compress-y += entropy_common.o fse_decompress.o zstd_common.o
+
+ifneq ($(CONFIG_ZSTD_COMPRESS)$(CONFIG_ZSTD_DECOMPRESS),yy)
+ zstd_decompress-y += entropy_common.o fse_decompress.o zstd_common.o
+endif
diff --git a/lib/zstd/bitstream.h b/lib/zstd/bitstream.h
new file mode 100644
index 000000000000..a826b99e1d63
--- /dev/null
+++ b/lib/zstd/bitstream.h
@@ -0,0 +1,374 @@
+/*
+ * bitstream
+ * Part of FSE library
+ * header file (to include)
+ * Copyright (C) 2013-2016, Yann Collet.
+ *
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+/*
+* This API consists of small unitary functions, which must be inlined for best performance.
+* Since link-time-optimization is not available for all compilers,
+* these functions are defined into a .h to be included.
+*/
+
+/*-****************************************
+* Dependencies
+******************************************/
+#include "error_private.h" /* error codes and messages */
+#include "mem.h" /* unaligned access routines */
+
+/*=========================================
+* Target specific
+=========================================*/
+#define STREAM_ACCUMULATOR_MIN_32 25
+#define STREAM_ACCUMULATOR_MIN_64 57
+#define STREAM_ACCUMULATOR_MIN ((U32)(ZSTD_32bits() ? STREAM_ACCUMULATOR_MIN_32 : STREAM_ACCUMULATOR_MIN_64))
+
+/*-******************************************
+* bitStream encoding API (write forward)
+********************************************/
+/* bitStream can mix input from multiple sources.
+* A critical property of these streams is that they encode and decode in **reverse** direction.
+* So the first bit sequence you add will be the last to be read, like a LIFO stack.
+*/
+typedef struct {
+ size_t bitContainer;
+ int bitPos;
+ char *startPtr;
+ char *ptr;
+ char *endPtr;
+} BIT_CStream_t;
+
+ZSTD_STATIC size_t BIT_initCStream(BIT_CStream_t *bitC, void *dstBuffer, size_t dstCapacity);
+ZSTD_STATIC void BIT_addBits(BIT_CStream_t *bitC, size_t value, unsigned nbBits);
+ZSTD_STATIC void BIT_flushBits(BIT_CStream_t *bitC);
+ZSTD_STATIC size_t BIT_closeCStream(BIT_CStream_t *bitC);
+
+/* Start with initCStream, providing the size of buffer to write into.
+* bitStream will never write outside of this buffer.
+* `dstCapacity` must be >= sizeof(bitD->bitContainer), otherwise @return will be an error code.
+*
+* bits are first added to a local register.
+* Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems.
+* Writing data into memory is an explicit operation, performed by the flushBits function.
+* Hence keep track how many bits are potentially stored into local register to avoid register overflow.
+* After a flushBits, a maximum of 7 bits might still be stored into local register.
+*
+* Avoid storing elements of more than 24 bits if you want compatibility with 32-bits bitstream readers.
+*
+* Last operation is to close the bitStream.
+* The function returns the final size of CStream in bytes.
+* If data couldn't fit into `dstBuffer`, it will return a 0 ( == not storable)
+*/
+
+/*-********************************************
+* bitStream decoding API (read backward)
+**********************************************/
+typedef struct {
+ size_t bitContainer;
+ unsigned bitsConsumed;
+ const char *ptr;
+ const char *start;
+} BIT_DStream_t;
+
+typedef enum {
+ BIT_DStream_unfinished = 0,
+ BIT_DStream_endOfBuffer = 1,
+ BIT_DStream_completed = 2,
+ BIT_DStream_overflow = 3
+} BIT_DStream_status; /* result of BIT_reloadDStream() */
+/* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+ZSTD_STATIC size_t BIT_initDStream(BIT_DStream_t *bitD, const void *srcBuffer, size_t srcSize);
+ZSTD_STATIC size_t BIT_readBits(BIT_DStream_t *bitD, unsigned nbBits);
+ZSTD_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t *bitD);
+ZSTD_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t *bitD);
+
+/* Start by invoking BIT_initDStream().
+* A chunk of the bitStream is then stored into a local register.
+* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
+* You can then retrieve bitFields stored into the local register, **in reverse order**.
+* Local register is explicitly reloaded from memory by the BIT_reloadDStream() method.
+* A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished.
+* Otherwise, it can be less than that, so proceed accordingly.
+* Checking if DStream has reached its end can be performed with BIT_endOfDStream().
+*/
+
+/*-****************************************
+* unsafe API
+******************************************/
+ZSTD_STATIC void BIT_addBitsFast(BIT_CStream_t *bitC, size_t value, unsigned nbBits);
+/* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */
+
+ZSTD_STATIC void BIT_flushBitsFast(BIT_CStream_t *bitC);
+/* unsafe version; does not check buffer overflow */
+
+ZSTD_STATIC size_t BIT_readBitsFast(BIT_DStream_t *bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+/*-**************************************************************
+* Internal functions
+****************************************************************/
+ZSTD_STATIC unsigned BIT_highbit32(register U32 val) { return 31 - __builtin_clz(val); }
+
+/*===== Local Constants =====*/
+static const unsigned BIT_mask[] = {0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF,
+ 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF,
+ 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF, 0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF}; /* up to 26 bits */
+
+/*-**************************************************************
+* bitStream encoding
+****************************************************************/
+/*! BIT_initCStream() :
+ * `dstCapacity` must be > sizeof(void*)
+ * @return : 0 if success,
+ otherwise an error code (can be tested using ERR_isError() ) */
+ZSTD_STATIC size_t BIT_initCStream(BIT_CStream_t *bitC, void *startPtr, size_t dstCapacity)
+{
+ bitC->bitContainer = 0;
+ bitC->bitPos = 0;
+ bitC->startPtr = (char *)startPtr;
+ bitC->ptr = bitC->startPtr;
+ bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->ptr);
+ if (dstCapacity <= sizeof(bitC->ptr))
+ return ERROR(dstSize_tooSmall);
+ return 0;
+}
+
+/*! BIT_addBits() :
+ can add up to 26 bits into `bitC`.
+ Does not check for register overflow ! */
+ZSTD_STATIC void BIT_addBits(BIT_CStream_t *bitC, size_t value, unsigned nbBits)
+{
+ bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos;
+ bitC->bitPos += nbBits;
+}
+
+/*! BIT_addBitsFast() :
+ * works only if `value` is _clean_, meaning all high bits above nbBits are 0 */
+ZSTD_STATIC void BIT_addBitsFast(BIT_CStream_t *bitC, size_t value, unsigned nbBits)
+{
+ bitC->bitContainer |= value << bitC->bitPos;
+ bitC->bitPos += nbBits;
+}
+
+/*! BIT_flushBitsFast() :
+ * unsafe version; does not check buffer overflow */
+ZSTD_STATIC void BIT_flushBitsFast(BIT_CStream_t *bitC)
+{
+ size_t const nbBytes = bitC->bitPos >> 3;
+ ZSTD_writeLEST(bitC->ptr, bitC->bitContainer);
+ bitC->ptr += nbBytes;
+ bitC->bitPos &= 7;
+ bitC->bitContainer >>= nbBytes * 8; /* if bitPos >= sizeof(bitContainer)*8 --> undefined behavior */
+}
+
+/*! BIT_flushBits() :
+ * safe version; check for buffer overflow, and prevents it.
+ * note : does not signal buffer overflow. This will be revealed later on using BIT_closeCStream() */
+ZSTD_STATIC void BIT_flushBits(BIT_CStream_t *bitC)
+{
+ size_t const nbBytes = bitC->bitPos >> 3;
+ ZSTD_writeLEST(bitC->ptr, bitC->bitContainer);
+ bitC->ptr += nbBytes;
+ if (bitC->ptr > bitC->endPtr)
+ bitC->ptr = bitC->endPtr;
+ bitC->bitPos &= 7;
+ bitC->bitContainer >>= nbBytes * 8; /* if bitPos >= sizeof(bitContainer)*8 --> undefined behavior */
+}
+
+/*! BIT_closeCStream() :
+ * @return : size of CStream, in bytes,
+ or 0 if it could not fit into dstBuffer */
+ZSTD_STATIC size_t BIT_closeCStream(BIT_CStream_t *bitC)
+{
+ BIT_addBitsFast(bitC, 1, 1); /* endMark */
+ BIT_flushBits(bitC);
+
+ if (bitC->ptr >= bitC->endPtr)
+ return 0; /* doesn't fit within authorized budget : cancel */
+
+ return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0);
+}
+
+/*-********************************************************
+* bitStream decoding
+**********************************************************/
+/*! BIT_initDStream() :
+* Initialize a BIT_DStream_t.
+* `bitD` : a pointer to an already allocated BIT_DStream_t structure.
+* `srcSize` must be the *exact* size of the bitStream, in bytes.
+* @return : size of stream (== srcSize) or an errorCode if a problem is detected
+*/
+ZSTD_STATIC size_t BIT_initDStream(BIT_DStream_t *bitD, const void *srcBuffer, size_t srcSize)
+{
+ if (srcSize < 1) {
+ memset(bitD, 0, sizeof(*bitD));
+ return ERROR(srcSize_wrong);
+ }
+
+ if (srcSize >= sizeof(bitD->bitContainer)) { /* normal case */
+ bitD->start = (const char *)srcBuffer;
+ bitD->ptr = (const char *)srcBuffer + srcSize - sizeof(bitD->bitContainer);
+ bitD->bitContainer = ZSTD_readLEST(bitD->ptr);
+ {
+ BYTE const lastByte = ((const BYTE *)srcBuffer)[srcSize - 1];
+ bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */
+ if (lastByte == 0)
+ return ERROR(GENERIC); /* endMark not present */
+ }
+ } else {
+ bitD->start = (const char *)srcBuffer;
+ bitD->ptr = bitD->start;
+ bitD->bitContainer = *(const BYTE *)(bitD->start);
+ switch (srcSize) {
+ case 7: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[6]) << (sizeof(bitD->bitContainer) * 8 - 16);
+ case 6: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[5]) << (sizeof(bitD->bitContainer) * 8 - 24);
+ case 5: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[4]) << (sizeof(bitD->bitContainer) * 8 - 32);
+ case 4: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[3]) << 24;
+ case 3: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[2]) << 16;
+ case 2: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[1]) << 8;
+ default:;
+ }
+ {
+ BYTE const lastByte = ((const BYTE *)srcBuffer)[srcSize - 1];
+ bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
+ if (lastByte == 0)
+ return ERROR(GENERIC); /* endMark not present */
+ }
+ bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize) * 8;
+ }
+
+ return srcSize;
+}
+
+ZSTD_STATIC size_t BIT_getUpperBits(size_t bitContainer, U32 const start) { return bitContainer >> start; }
+
+ZSTD_STATIC size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) { return (bitContainer >> start) & BIT_mask[nbBits]; }
+
+ZSTD_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) { return bitContainer & BIT_mask[nbBits]; }
+
+/*! BIT_lookBits() :
+ * Provides next n bits from local register.
+ * local register is not modified.
+ * On 32-bits, maxNbBits==24.
+ * On 64-bits, maxNbBits==56.
+ * @return : value extracted
+ */
+ZSTD_STATIC size_t BIT_lookBits(const BIT_DStream_t *bitD, U32 nbBits)
+{
+ U32 const bitMask = sizeof(bitD->bitContainer) * 8 - 1;
+ return ((bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> 1) >> ((bitMask - nbBits) & bitMask);
+}
+
+/*! BIT_lookBitsFast() :
+* unsafe version; only works only if nbBits >= 1 */
+ZSTD_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t *bitD, U32 nbBits)
+{
+ U32 const bitMask = sizeof(bitD->bitContainer) * 8 - 1;
+ return (bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> (((bitMask + 1) - nbBits) & bitMask);
+}
+
+ZSTD_STATIC void BIT_skipBits(BIT_DStream_t *bitD, U32 nbBits) { bitD->bitsConsumed += nbBits; }
+
+/*! BIT_readBits() :
+ * Read (consume) next n bits from local register and update.
+ * Pay attention to not read more than nbBits contained into local register.
+ * @return : extracted value.
+ */
+ZSTD_STATIC size_t BIT_readBits(BIT_DStream_t *bitD, U32 nbBits)
+{
+ size_t const value = BIT_lookBits(bitD, nbBits);
+ BIT_skipBits(bitD, nbBits);
+ return value;
+}
+
+/*! BIT_readBitsFast() :
+* unsafe version; only works only if nbBits >= 1 */
+ZSTD_STATIC size_t BIT_readBitsFast(BIT_DStream_t *bitD, U32 nbBits)
+{
+ size_t const value = BIT_lookBitsFast(bitD, nbBits);
+ BIT_skipBits(bitD, nbBits);
+ return value;
+}
+
+/*! BIT_reloadDStream() :
+* Refill `bitD` from buffer previously set in BIT_initDStream() .
+* This function is safe, it guarantees it will not read beyond src buffer.
+* @return : status of `BIT_DStream_t` internal register.
+ if status == BIT_DStream_unfinished, internal register is filled with >= (sizeof(bitD->bitContainer)*8 - 7) bits */
+ZSTD_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t *bitD)
+{
+ if (bitD->bitsConsumed > (sizeof(bitD->bitContainer) * 8)) /* should not happen => corruption detected */
+ return BIT_DStream_overflow;
+
+ if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) {
+ bitD->ptr -= bitD->bitsConsumed >> 3;
+ bitD->bitsConsumed &= 7;
+ bitD->bitContainer = ZSTD_readLEST(bitD->ptr);
+ return BIT_DStream_unfinished;
+ }
+ if (bitD->ptr == bitD->start) {
+ if (bitD->bitsConsumed < sizeof(bitD->bitContainer) * 8)
+ return BIT_DStream_endOfBuffer;
+ return BIT_DStream_completed;
+ }
+ {
+ U32 nbBytes = bitD->bitsConsumed >> 3;
+ BIT_DStream_status result = BIT_DStream_unfinished;
+ if (bitD->ptr - nbBytes < bitD->start) {
+ nbBytes = (U32)(bitD->ptr - bitD->start); /* ptr > start */
+ result = BIT_DStream_endOfBuffer;
+ }
+ bitD->ptr -= nbBytes;
+ bitD->bitsConsumed -= nbBytes * 8;
+ bitD->bitContainer = ZSTD_readLEST(bitD->ptr); /* reminder : srcSize > sizeof(bitD) */
+ return result;
+ }
+}
+
+/*! BIT_endOfDStream() :
+* @return Tells if DStream has exactly reached its end (all bits consumed).
+*/
+ZSTD_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t *DStream)
+{
+ return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer) * 8));
+}
+
+#endif /* BITSTREAM_H_MODULE */
diff --git a/lib/zstd/compress.c b/lib/zstd/compress.c
new file mode 100644
index 000000000000..f9166cf4f7a9
--- /dev/null
+++ b/lib/zstd/compress.c
@@ -0,0 +1,3484 @@
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of https://github.com/facebook/zstd.
+ * An additional grant of patent rights can be found in the PATENTS file in the
+ * same directory.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ */
+
+/*-*************************************
+* Dependencies
+***************************************/
+#include "fse.h"
+#include "huf.h"
+#include "mem.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h> /* memset */
+
+/*-*************************************
+* Constants
+***************************************/
+static const U32 g_searchStrength = 8; /* control skip over incompressible data */
+#define HASH_READ_SIZE 8
+typedef enum { ZSTDcs_created = 0, ZSTDcs_init, ZSTDcs_ongoing, ZSTDcs_ending } ZSTD_compressionStage_e;
+
+/*-*************************************
+* Helper functions
+***************************************/
+size_t ZSTD_compressBound(size_t srcSize) { return FSE_compressBound(srcSize) + 12; }
+
+/*-*************************************
+* Sequence storage
+***************************************/
+static void ZSTD_resetSeqStore(seqStore_t *ssPtr)
+{
+ ssPtr->lit = ssPtr->litStart;
+ ssPtr->sequences = ssPtr->sequencesStart;
+ ssPtr->longLengthID = 0;
+}
+
+/*-*************************************
+* Context memory management
+***************************************/
+struct ZSTD_CCtx_s {
+ const BYTE *nextSrc; /* next block here to continue on curr prefix */
+ const BYTE *base; /* All regular indexes relative to this position */
+ const BYTE *dictBase; /* extDict indexes relative to this position */
+ U32 dictLimit; /* below that point, need extDict */
+ U32 lowLimit; /* below that point, no more data */
+ U32 nextToUpdate; /* index from which to continue dictionary update */
+ U32 nextToUpdate3; /* index from which to continue dictionary update */
+ U32 hashLog3; /* dispatch table : larger == faster, more memory */
+ U32 loadedDictEnd; /* index of end of dictionary */
+ U32 forceWindow; /* force back-references to respect limit of 1<<wLog, even for dictionary */
+ U32 forceRawDict; /* Force loading dictionary in "content-only" mode (no header analysis) */
+ ZSTD_compressionStage_e stage;
+ U32 rep[ZSTD_REP_NUM];
+ U32 repToConfirm[ZSTD_REP_NUM];
+ U32 dictID;
+ ZSTD_parameters params;
+ void *workSpace;
+ size_t workSpaceSize;
+ size_t blockSize;
+ U64 frameContentSize;
+ struct xxh64_state xxhState;
+ ZSTD_customMem customMem;
+
+ seqStore_t seqStore; /* sequences storage ptrs */
+ U32 *hashTable;
+ U32 *hashTable3;
+ U32 *chainTable;
+ HUF_CElt *hufTable;
+ U32 flagStaticTables;
+ HUF_repeat flagStaticHufTable;
+ FSE_CTable offcodeCTable[FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
+ FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
+ FSE_CTable litlengthCTable[FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
+ unsigned tmpCounters[HUF_COMPRESS_WORKSPACE_SIZE_U32];
+};
+
+size_t ZSTD_CCtxWorkspaceBound(ZSTD_compressionParameters cParams)
+{
+ size_t const blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, (size_t)1 << cParams.windowLog);
+ U32 const divider = (cParams.searchLength == 3) ? 3 : 4;
+ size_t const maxNbSeq = blockSize / divider;
+ size_t const tokenSpace = blockSize + 11 * maxNbSeq;
+ size_t const chainSize = (cParams.strategy == ZSTD_fast) ? 0 : (1 << cParams.chainLog);
+ size_t const hSize = ((size_t)1) << cParams.hashLog;
+ U32 const hashLog3 = (cParams.searchLength > 3) ? 0 : MIN(ZSTD_HASHLOG3_MAX, cParams.windowLog);
+ size_t const h3Size = ((size_t)1) << hashLog3;
+ size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32);
+ size_t const optSpace =
+ ((MaxML + 1) + (MaxLL + 1) + (MaxOff + 1) + (1 << Litbits)) * sizeof(U32) + (ZSTD_OPT_NUM + 1) * (sizeof(ZSTD_match_t) + sizeof(ZSTD_optimal_t));
+ size_t const workspaceSize = tableSpace + (256 * sizeof(U32)) /* huffTable */ + tokenSpace +
+ (((cParams.strategy == ZSTD_btopt) || (cParams.strategy == ZSTD_btopt2)) ? optSpace : 0);
+
+ return ZSTD_ALIGN(sizeof(ZSTD_stack)) + ZSTD_ALIGN(sizeof(ZSTD_CCtx)) + ZSTD_ALIGN(workspaceSize);
+}
+
+static ZSTD_CCtx *ZSTD_createCCtx_advanced(ZSTD_customMem customMem)
+{
+ ZSTD_CCtx *cctx;
+ if (!customMem.customAlloc || !customMem.customFree)
+ return NULL;
+ cctx = (ZSTD_CCtx *)ZSTD_malloc(sizeof(ZSTD_CCtx), customMem);
+ if (!cctx)
+ return NULL;
+ memset(cctx, 0, sizeof(ZSTD_CCtx));
+ cctx->customMem = customMem;
+ return cctx;
+}
+
+ZSTD_CCtx *ZSTD_initCCtx(void *workspace, size_t workspaceSize)
+{
+ ZSTD_customMem const stackMem = ZSTD_initStack(workspace, workspaceSize);
+ ZSTD_CCtx *cctx = ZSTD_createCCtx_advanced(stackMem);
+ if (cctx) {
+ cctx->workSpace = ZSTD_stackAllocAll(cctx->customMem.opaque, &cctx->workSpaceSize);
+ }
+ return cctx;
+}
+
+size_t ZSTD_freeCCtx(ZSTD_CCtx *cctx)
+{
+ if (cctx == NULL)
+ return 0; /* support free on NULL */
+ ZSTD_free(cctx->workSpace, cctx->customMem);
+ ZSTD_free(cctx, cctx->customMem);
+ return 0; /* reserved as a potential error code in the future */
+}
+
+const seqStore_t *ZSTD_getSeqStore(const ZSTD_CCtx *ctx) /* hidden interface */ { return &(ctx->seqStore); }
+
+static ZSTD_parameters ZSTD_getParamsFromCCtx(const ZSTD_CCtx *cctx) { return cctx->params; }
+
+/** ZSTD_checkParams() :
+ ensure param values remain within authorized range.
+ @return : 0, or an error code if one value is beyond authorized range */
+size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
+{
+#define CLAMPCHECK(val, min, max) \
+ { \
+ if ((val < min) | (val > max)) \
+ return ERROR(compressionParameter_unsupported); \
+ }
+ CLAMPCHECK(cParams.windowLog, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MAX);
+ CLAMPCHECK(cParams.chainLog, ZSTD_CHAINLOG_MIN, ZSTD_CHAINLOG_MAX);
+ CLAMPCHECK(cParams.hashLog, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX);
+ CLAMPCHECK(cParams.searchLog, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLOG_MAX);
+ CLAMPCHECK(cParams.searchLength, ZSTD_SEARCHLENGTH_MIN, ZSTD_SEARCHLENGTH_MAX);
+ CLAMPCHECK(cParams.targetLength, ZSTD_TARGETLENGTH_MIN, ZSTD_TARGETLENGTH_MAX);
+ if ((U32)(cParams.strategy) > (U32)ZSTD_btopt2)
+ return ERROR(compressionParameter_unsupported);
+ return 0;
+}
+
+/** ZSTD_cycleLog() :
+ * condition for correct operation : hashLog > 1 */
+static U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat)
+{
+ U32 const btScale = ((U32)strat >= (U32)ZSTD_btlazy2);
+ return hashLog - btScale;
+}
+
+/** ZSTD_adjustCParams() :
+ optimize `cPar` for a given input (`srcSize` and `dictSize`).
+ mostly downsizing to reduce memory consumption and initialization.
+ Both `srcSize` and `dictSize` are optional (use 0 if unknown),
+ but if both are 0, no optimization can be done.
+ Note : cPar is considered validated at this stage. Use ZSTD_checkParams() to ensure that. */
+ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize)
+{
+ if (srcSize + dictSize == 0)
+ return cPar; /* no size information available : no adjustment */
+
+ /* resize params, to use less memory when necessary */
+ {
+ U32 const minSrcSize = (srcSize == 0) ? 500 : 0;
+ U64 const rSize = srcSize + dictSize + minSrcSize;
+ if (rSize < ((U64)1 << ZSTD_WINDOWLOG_MAX)) {
+ U32 const srcLog = MAX(ZSTD_HASHLOG_MIN, ZSTD_highbit32((U32)(rSize)-1) + 1);
+ if (cPar.windowLog > srcLog)
+ cPar.windowLog = srcLog;
+ }
+ }
+ if (cPar.hashLog > cPar.windowLog)
+ cPar.hashLog = cPar.windowLog;
+ {
+ U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy);
+ if (cycleLog > cPar.windowLog)
+ cPar.chainLog -= (cycleLog - cPar.windowLog);
+ }
+
+ if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN)
+ cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* required for frame header */
+
+ return cPar;
+}
+
+static U32 ZSTD_equivalentParams(ZSTD_parameters param1, ZSTD_parameters param2)
+{
+ return (param1.cParams.hashLog == param2.cParams.hashLog) & (param1.cParams.chainLog == param2.cParams.chainLog) &
+ (param1.cParams.strategy == param2.cParams.strategy) & ((param1.cParams.searchLength == 3) == (param2.cParams.searchLength == 3));
+}
+
+/*! ZSTD_continueCCtx() :
+ reuse CCtx without reset (note : requires no dictionary) */
+static size_t ZSTD_continueCCtx(ZSTD_CCtx *cctx, ZSTD_parameters params, U64 frameContentSize)
+{
+ U32 const end = (U32)(cctx->nextSrc - cctx->base);
+ cctx->params = params;
+ cctx->frameContentSize = frameContentSize;
+ cctx->lowLimit = end;
+ cctx->dictLimit = end;
+ cctx->nextToUpdate = end + 1;
+ cctx->stage = ZSTDcs_init;
+ cctx->dictID = 0;
+ cctx->loadedDictEnd = 0;
+ {
+ int i;
+ for (i = 0; i < ZSTD_REP_NUM; i++)
+ cctx->rep[i] = repStartValue[i];
+ }
+ cctx->seqStore.litLengthSum = 0; /* force reset of btopt stats */
+ xxh64_reset(&cctx->xxhState, 0);
+ return 0;
+}
+
+typedef enum { ZSTDcrp_continue, ZSTDcrp_noMemset, ZSTDcrp_fullReset } ZSTD_compResetPolicy_e;
+
+/*! ZSTD_resetCCtx_advanced() :
+ note : `params` must be validated */
+static size_t ZSTD_resetCCtx_advanced(ZSTD_CCtx *zc, ZSTD_parameters params, U64 frameContentSize, ZSTD_compResetPolicy_e const crp)
+{
+ if (crp == ZSTDcrp_continue)
+ if (ZSTD_equivalentParams(params, zc->params)) {
+ zc->flagStaticTables = 0;
+ zc->flagStaticHufTable = HUF_repeat_none;
+ return ZSTD_continueCCtx(zc, params, frameContentSize);
+ }
+
+ {
+ size_t const blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, (size_t)1 << params.cParams.windowLog);
+ U32 const divider = (params.cParams.searchLength == 3) ? 3 : 4;
+ size_t const maxNbSeq = blockSize / divider;
+ size_t const tokenSpace = blockSize + 11 * maxNbSeq;
+ size_t const chainSize = (params.cParams.strategy == ZSTD_fast) ? 0 : (1 << params.cParams.chainLog);
+ size_t const hSize = ((size_t)1) << params.cParams.hashLog;
+ U32 const hashLog3 = (params.cParams.searchLength > 3) ? 0 : MIN(ZSTD_HASHLOG3_MAX, params.cParams.windowLog);
+ size_t const h3Size = ((size_t)1) << hashLog3;
+ size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32);
+ void *ptr;
+
+ /* Check if workSpace is large enough, alloc a new one if needed */
+ {
+ size_t const optSpace = ((MaxML + 1) + (MaxLL + 1) + (MaxOff + 1) + (1 << Litbits)) * sizeof(U32) +
+ (ZSTD_OPT_NUM + 1) * (sizeof(ZSTD_match_t) + sizeof(ZSTD_optimal_t));
+ size_t const neededSpace = tableSpace + (256 * sizeof(U32)) /* huffTable */ + tokenSpace +
+ (((params.cParams.strategy == ZSTD_btopt) || (params.cParams.strategy == ZSTD_btopt2)) ? optSpace : 0);
+ if (zc->workSpaceSize < neededSpace) {
+ ZSTD_free(zc->workSpace, zc->customMem);
+ zc->workSpace = ZSTD_malloc(neededSpace, zc->customMem);
+ if (zc->workSpace == NULL)
+ return ERROR(memory_allocation);
+ zc->workSpaceSize = neededSpace;
+ }
+ }
+
+ if (crp != ZSTDcrp_noMemset)
+ memset(zc->workSpace, 0, tableSpace); /* reset tables only */
+ xxh64_reset(&zc->xxhState, 0);
+ zc->hashLog3 = hashLog3;
+ zc->hashTable = (U32 *)(zc->workSpace);
+ zc->chainTable = zc->hashTable + hSize;
+ zc->hashTable3 = zc->chainTable + chainSize;
+ ptr = zc->hashTable3 + h3Size;
+ zc->hufTable = (HUF_CElt *)ptr;
+ zc->flagStaticTables = 0;
+ zc->flagStaticHufTable = HUF_repeat_none;
+ ptr = ((U32 *)ptr) + 256; /* note : HUF_CElt* is incomplete type, size is simulated using U32 */
+
+ zc->nextToUpdate = 1;
+ zc->nextSrc = NULL;
+ zc->base = NULL;
+ zc->dictBase = NULL;
+ zc->dictLimit = 0;
+ zc->lowLimit = 0;
+ zc->params = params;
+ zc->blockSize = blockSize;
+ zc->frameContentSize = frameContentSize;
+ {
+ int i;
+ for (i = 0; i < ZSTD_REP_NUM; i++)
+ zc->rep[i] = repStartValue[i];
+ }
+
+ if ((params.cParams.strategy == ZSTD_btopt) || (params.cParams.strategy == ZSTD_btopt2)) {
+ zc->seqStore.litFreq = (U32 *)ptr;
+ zc->seqStore.litLengthFreq = zc->seqStore.litFreq + (1 << Litbits);
+ zc->seqStore.matchLengthFreq = zc->seqStore.litLengthFreq + (MaxLL + 1);
+ zc->seqStore.offCodeFreq = zc->seqStore.matchLengthFreq + (MaxML + 1);
+ ptr = zc->seqStore.offCodeFreq + (MaxOff + 1);
+ zc->seqStore.matchTable = (ZSTD_match_t *)ptr;
+ ptr = zc->seqStore.matchTable + ZSTD_OPT_NUM + 1;
+ zc->seqStore.priceTable = (ZSTD_optimal_t *)ptr;
+ ptr = zc->seqStore.priceTable + ZSTD_OPT_NUM + 1;
+ zc->seqStore.litLengthSum = 0;
+ }
+ zc->seqStore.sequencesStart = (seqDef *)ptr;
+ ptr = zc->seqStore.sequencesStart + maxNbSeq;
+ zc->seqStore.llCode = (BYTE *)ptr;
+ zc->seqStore.mlCode = zc->seqStore.llCode + maxNbSeq;
+ zc->seqStore.ofCode = zc->seqStore.mlCode + maxNbSeq;
+ zc->seqStore.litStart = zc->seqStore.ofCode + maxNbSeq;
+
+ zc->stage = ZSTDcs_init;
+ zc->dictID = 0;
+ zc->loadedDictEnd = 0;
+
+ return 0;
+ }
+}
+
+/* ZSTD_invalidateRepCodes() :
+ * ensures next compression will not use repcodes from previous block.
+ * Note : only works with regular variant;
+ * do not use with extDict variant ! */
+void ZSTD_invalidateRepCodes(ZSTD_CCtx *cctx)
+{
+ int i;
+ for (i = 0; i < ZSTD_REP_NUM; i++)
+ cctx->rep[i] = 0;
+}
+
+/*! ZSTD_copyCCtx() :
+* Duplicate an existing context `srcCCtx` into another one `dstCCtx`.
+* Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()).
+* @return : 0, or an error code */
+size_t ZSTD_copyCCtx(ZSTD_CCtx *dstCCtx, const ZSTD_CCtx *srcCCtx, unsigned long long pledgedSrcSize)
+{
+ if (srcCCtx->stage != ZSTDcs_init)
+ return ERROR(stage_wrong);
+
+ memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem));
+ {
+ ZSTD_parameters params = srcCCtx->params;
+ params.fParams.contentSizeFlag = (pledgedSrcSize > 0);
+ ZSTD_resetCCtx_advanced(dstCCtx, params, pledgedSrcSize, ZSTDcrp_noMemset);
+ }
+
+ /* copy tables */
+ {
+ size_t const chainSize = (srcCCtx->params.cParams.strategy == ZSTD_fast) ? 0 : (1 << srcCCtx->params.cParams.chainLog);
+ size_t const hSize = ((size_t)1) << srcCCtx->params.cParams.hashLog;
+ size_t const h3Size = (size_t)1 << srcCCtx->hashLog3;
+ size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32);
+ memcpy(dstCCtx->workSpace, srcCCtx->workSpace, tableSpace);
+ }
+
+ /* copy dictionary offsets */
+ dstCCtx->nextToUpdate = srcCCtx->nextToUpdate;
+ dstCCtx->nextToUpdate3 = srcCCtx->nextToUpdate3;
+ dstCCtx->nextSrc = srcCCtx->nextSrc;
+ dstCCtx->base = srcCCtx->base;
+ dstCCtx->dictBase = srcCCtx->dictBase;
+ dstCCtx->dictLimit = srcCCtx->dictLimit;
+ dstCCtx->lowLimit = srcCCtx->lowLimit;
+ dstCCtx->loadedDictEnd = srcCCtx->loadedDictEnd;
+ dstCCtx->dictID = srcCCtx->dictID;
+
+ /* copy entropy tables */
+ dstCCtx->flagStaticTables = srcCCtx->flagStaticTables;
+ dstCCtx->flagStaticHufTable = srcCCtx->flagStaticHufTable;
+ if (srcCCtx->flagStaticTables) {
+ memcpy(dstCCtx->litlengthCTable, srcCCtx->litlengthCTable, sizeof(dstCCtx->litlengthCTable));
+ memcpy(dstCCtx->matchlengthCTable, srcCCtx->matchlengthCTable, sizeof(dstCCtx->matchlengthCTable));
+ memcpy(dstCCtx->offcodeCTable, srcCCtx->offcodeCTable, sizeof(dstCCtx->offcodeCTable));
+ }
+ if (srcCCtx->flagStaticHufTable) {
+ memcpy(dstCCtx->hufTable, srcCCtx->hufTable, 256 * 4);
+ }
+
+ return 0;
+}
+
+/*! ZSTD_reduceTable() :
+* reduce table indexes by `reducerValue` */
+static void ZSTD_reduceTable(U32 *const table, U32 const size, U32 const reducerValue)
+{
+ U32 u;
+ for (u = 0; u < size; u++) {
+ if (table[u] < reducerValue)
+ table[u] = 0;
+ else
+ table[u] -= reducerValue;
+ }
+}
+
+/*! ZSTD_reduceIndex() :
+* rescale all indexes to avoid future overflow (indexes are U32) */
+static void ZSTD_reduceIndex(ZSTD_CCtx *zc, const U32 reducerValue)
+{
+ {
+ U32 const hSize = 1 << zc->params.cParams.hashLog;
+ ZSTD_reduceTable(zc->hashTable, hSize, reducerValue);
+ }
+
+ {
+ U32 const chainSize = (zc->params.cParams.strategy == ZSTD_fast) ? 0 : (1 << zc->params.cParams.chainLog);
+ ZSTD_reduceTable(zc->chainTable, chainSize, reducerValue);
+ }
+
+ {
+ U32 const h3Size = (zc->hashLog3) ? 1 << zc->hashLog3 : 0;
+ ZSTD_reduceTable(zc->hashTable3, h3Size, reducerValue);
+ }
+}
+
+/*-*******************************************************
+* Block entropic compression
+*********************************************************/
+
+/* See doc/zstd_compression_format.md for detailed format description */
+
+size_t ZSTD_noCompressBlock(void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{
+ if (srcSize + ZSTD_blockHeaderSize > dstCapacity)
+ return ERROR(dstSize_tooSmall);
+ memcpy((BYTE *)dst + ZSTD_blockHeaderSize, src, srcSize);
+ ZSTD_writeLE24(dst, (U32)(srcSize << 2) + (U32)bt_raw);
+ return ZSTD_blockHeaderSize + srcSize;
+}
+
+static size_t ZSTD_noCompressLiterals(void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{
+ BYTE *const ostart = (BYTE * const)dst;
+ U32 const flSize = 1 + (srcSize > 31) + (srcSize > 4095);
+
+ if (srcSize + flSize > dstCapacity)
+ return ERROR(dstSize_tooSmall);
+
+ switch (flSize) {
+ case 1: /* 2 - 1 - 5 */ ostart[0] = (BYTE)((U32)set_basic + (srcSize << 3)); break;
+ case 2: /* 2 - 2 - 12 */ ZSTD_writeLE16(ostart, (U16)((U32)set_basic + (1 << 2) + (srcSize << 4))); break;
+ default: /*note : should not be necessary : flSize is within {1,2,3} */
+ case 3: /* 2 - 2 - 20 */ ZSTD_writeLE32(ostart, (U32)((U32)set_basic + (3 << 2) + (srcSize << 4))); break;
+ }
+
+ memcpy(ostart + flSize, src, srcSize);
+ return srcSize + flSize;
+}
+
+static size_t ZSTD_compressRleLiteralsBlock(void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{
+ BYTE *const ostart = (BYTE * const)dst;
+ U32 const flSize = 1 + (srcSize > 31) + (srcSize > 4095);
+
+ (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */
+
+ switch (flSize) {
+ case 1: /* 2 - 1 - 5 */ ostart[0] = (BYTE)((U32)set_rle + (srcSize << 3)); break;
+ case 2: /* 2 - 2 - 12 */ ZSTD_writeLE16(ostart, (U16)((U32)set_rle + (1 << 2) + (srcSize << 4))); break;
+ default: /*note : should not be necessary : flSize is necessarily within {1,2,3} */
+ case 3: /* 2 - 2 - 20 */ ZSTD_writeLE32(ostart, (U32)((U32)set_rle + (3 << 2) + (srcSize << 4))); break;
+ }
+
+ ostart[flSize] = *(const BYTE *)src;
+ return flSize + 1;
+}
+
+static size_t ZSTD_minGain(size_t srcSize) { return (srcSize >> 6) + 2; }
+
+static size_t ZSTD_compressLiterals(ZSTD_CCtx *zc, void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{
+ size_t const minGain = ZSTD_minGain(srcSize);
+ size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
+ BYTE *const ostart = (BYTE *)dst;
+ U32 singleStream = srcSize < 256;
+ symbolEncodingType_e hType = set_compressed;
+ size_t cLitSize;
+
+/* small ? don't even attempt compression (speed opt) */
+#define LITERAL_NOENTROPY 63
+ {
+ size_t const minLitSize = zc->flagStaticHufTable == HUF_repeat_valid ? 6 : LITERAL_NOENTROPY;
+ if (srcSize <= minLitSize)
+ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+ }
+
+ if (dstCapacity < lhSize + 1)
+ return ERROR(dstSize_tooSmall); /* not enough space for compression */
+ {
+ HUF_repeat repeat = zc->flagStaticHufTable;
+ int const preferRepeat = zc->params.cParams.strategy < ZSTD_lazy ? srcSize <= 1024 : 0;
+ if (repeat == HUF_repeat_valid && lhSize == 3)
+ singleStream = 1;
+ cLitSize = singleStream ? HUF_compress1X_repeat(ostart + lhSize, dstCapacity - lhSize, src, srcSize, 255, 11, zc->tmpCounters,
+ sizeof(zc->tmpCounters), zc->hufTable, &repeat, preferRepeat)
+ : HUF_compress4X_repeat(ostart + lhSize, dstCapacity - lhSize, src, srcSize, 255, 11, zc->tmpCounters,
+ sizeof(zc->tmpCounters), zc->hufTable, &repeat, preferRepeat);
+ if (repeat != HUF_repeat_none) {
+ hType = set_repeat;
+ } /* reused the existing table */
+ else {
+ zc->flagStaticHufTable = HUF_repeat_check;
+ } /* now have a table to reuse */
+ }
+
+ if ((cLitSize == 0) | (cLitSize >= srcSize - minGain)) {
+ zc->flagStaticHufTable = HUF_repeat_none;
+ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+ }
+ if (cLitSize == 1) {
+ zc->flagStaticHufTable = HUF_repeat_none;
+ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
+ }
+
+ /* Build header */
+ switch (lhSize) {
+ case 3: /* 2 - 2 - 10 - 10 */
+ {
+ U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize << 4) + ((U32)cLitSize << 14);
+ ZSTD_writeLE24(ostart, lhc);
+ break;
+ }
+ case 4: /* 2 - 2 - 14 - 14 */
+ {
+ U32 const lhc = hType + (2 << 2) + ((U32)srcSize << 4) + ((U32)cLitSize << 18);
+ ZSTD_writeLE32(ostart, lhc);
+ break;
+ }
+ default: /* should not be necessary, lhSize is only {3,4,5} */
+ case 5: /* 2 - 2 - 18 - 18 */
+ {
+ U32 const lhc = hType + (3 << 2) + ((U32)srcSize << 4) + ((U32)cLitSize << 22);
+ ZSTD_writeLE32(ostart, lhc);
+ ostart[4] = (BYTE)(cLitSize >> 10);
+ break;
+ }
+ }
+ return lhSize + cLitSize;
+}
+
+static const BYTE LL_Code[64] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 17, 17, 18, 18,
+ 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23,
+ 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
+
+static const BYTE ML_Code[128] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38,
+ 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+ 40, 40, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42};
+
+void ZSTD_seqToCodes(const seqStore_t *seqStorePtr)
+{
+ BYTE const LL_deltaCode = 19;
+ BYTE const ML_deltaCode = 36;
+ const seqDef *const sequences = seqStorePtr->sequencesStart;
+ BYTE *const llCodeTable = seqStorePtr->llCode;
+ BYTE *const ofCodeTable = seqStorePtr->ofCode;
+ BYTE *const mlCodeTable = seqStorePtr->mlCode;
+ U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+ U32 u;
+ for (u = 0; u < nbSeq; u++) {
+ U32 const llv = sequences[u].litLength;
+ U32 const mlv = sequences[u].matchLength;
+ llCodeTable[u] = (llv > 63) ? (BYTE)ZSTD_highbit32(llv) + LL_deltaCode : LL_Code[llv];
+ ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offset);
+ mlCodeTable[u] = (mlv > 127) ? (BYTE)ZSTD_highbit32(mlv) + ML_deltaCode : ML_Code[mlv];
+ }
+ if (seqStorePtr->longLengthID == 1)
+ llCodeTable[seqStorePtr->longLengthPos] = MaxLL;
+ if (seqStorePtr->longLengthID == 2)
+ mlCodeTable[seqStorePtr->longLengthPos] = MaxML;
+}
+
+ZSTD_STATIC size_t ZSTD_compressSequences_internal(ZSTD_CCtx *zc, void *dst, size_t dstCapacity)
+{
+ const int longOffsets = zc->params.cParams.windowLog > STREAM_ACCUMULATOR_MIN;
+ const seqStore_t *seqStorePtr = &(zc->seqStore);
+ FSE_CTable *CTable_LitLength = zc->litlengthCTable;
+ FSE_CTable *CTable_OffsetBits = zc->offcodeCTable;
+ FSE_CTable *CTable_MatchLength = zc->matchlengthCTable;
+ U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */
+ const seqDef *const sequences = seqStorePtr->sequencesStart;
+ const BYTE *const ofCodeTable = seqStorePtr->ofCode;
+ const BYTE *const llCodeTable = seqStorePtr->llCode;
+ const BYTE *const mlCodeTable = seqStorePtr->mlCode;
+ BYTE *const ostart = (BYTE *)dst;
+ BYTE *const oend = ostart + dstCapacity;
+ BYTE *op = ostart;
+ size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+ BYTE *seqHead;
+
+ U32 *count;
+ S16 *norm;
+ U32 *workspace;
+ size_t workspaceSize = sizeof(zc->tmpCounters);
+ {
+ size_t spaceUsed32 = 0;
+ count = (U32 *)zc->tmpCounters + spaceUsed32;
+ spaceUsed32 += MaxSeq + 1;
+ norm = (S16 *)((U32 *)zc->tmpCounters + spaceUsed32);
+ spaceUsed32 += ALIGN(sizeof(S16) * (MaxSeq + 1), sizeof(U32)) >> 2;
+
+ workspace = (U32 *)zc->tmpCounters + spaceUsed32;
+ workspaceSize -= (spaceUsed32 << 2);
+ }
+
+ /* Compress literals */
+ {
+ const BYTE *const literals = seqStorePtr->litStart;
+ size_t const litSize = seqStorePtr->lit - literals;
+ size_t const cSize = ZSTD_compressLiterals(zc, op, dstCapacity, literals, litSize);
+ if (ZSTD_isError(cSize))
+ return cSize;
+ op += cSize;
+ }
+
+ /* Sequences Header */
+ if ((oend - op) < 3 /*max nbSeq Size*/ + 1 /*seqHead */)
+ return ERROR(dstSize_tooSmall);
+ if (nbSeq < 0x7F)
+ *op++ = (BYTE)nbSeq;
+ else if (nbSeq < LONGNBSEQ)
+ op[0] = (BYTE)((nbSeq >> 8) + 0x80), op[1] = (BYTE)nbSeq, op += 2;
+ else
+ op[0] = 0xFF, ZSTD_writeLE16(op + 1, (U16)(nbSeq - LONGNBSEQ)), op += 3;
+ if (nbSeq == 0)
+ return op - ostart;
+
+ /* seqHead : flags for FSE encoding type */
+ seqHead = op++;
+
+#define MIN_SEQ_FOR_DYNAMIC_FSE 64
+#define MAX_SEQ_FOR_STATIC_FSE 1000
+
+ /* convert length/distances into codes */
+ ZSTD_seqToCodes(seqStorePtr);
+
+ /* CTable for Literal Lengths */
+ {
+ U32 max = MaxLL;
+ size_t const mostFrequent = FSE_countFast_wksp(count, &max, llCodeTable, nbSeq, workspace);
+ if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
+ *op++ = llCodeTable[0];
+ FSE_buildCTable_rle(CTable_LitLength, (BYTE)max);
+ LLtype = set_rle;
+ } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
+ LLtype = set_repeat;
+ } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (LL_defaultNormLog - 1)))) {
+ FSE_buildCTable_wksp(CTable_LitLength, LL_defaultNorm, MaxLL, LL_defaultNormLog, workspace, workspaceSize);
+ LLtype = set_basic;
+ } else {
+ size_t nbSeq_1 = nbSeq;
+ const U32 tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max);
+ if (count[llCodeTable[nbSeq - 1]] > 1) {
+ count[llCodeTable[nbSeq - 1]]--;
+ nbSeq_1--;
+ }
+ FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max);
+ {
+ size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog); /* overflow protected */
+ if (FSE_isError(NCountSize))
+ return NCountSize;
+ op += NCountSize;
+ }
+ FSE_buildCTable_wksp(CTable_LitLength, norm, max, tableLog, workspace, workspaceSize);
+ LLtype = set_compressed;
+ }
+ }
+
+ /* CTable for Offsets */
+ {
+ U32 max = MaxOff;
+ size_t const mostFrequent = FSE_countFast_wksp(count, &max, ofCodeTable, nbSeq, workspace);
+ if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
+ *op++ = ofCodeTable[0];
+ FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max);
+ Offtype = set_rle;
+ } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
+ Offtype = set_repeat;
+ } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (OF_defaultNormLog - 1)))) {
+ FSE_buildCTable_wksp(CTable_OffsetBits, OF_defaultNorm, MaxOff, OF_defaultNormLog, workspace, workspaceSize);
+ Offtype = set_basic;
+ } else {
+ size_t nbSeq_1 = nbSeq;
+ const U32 tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max);
+ if (count[ofCodeTable[nbSeq - 1]] > 1) {
+ count[ofCodeTable[nbSeq - 1]]--;
+ nbSeq_1--;
+ }
+ FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max);
+ {
+ size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog); /* overflow protected */
+ if (FSE_isError(NCountSize))
+ return NCountSize;
+ op += NCountSize;
+ }
+ FSE_buildCTable_wksp(CTable_OffsetBits, norm, max, tableLog, workspace, workspaceSize);
+ Offtype = set_compressed;
+ }
+ }
+
+ /* CTable for MatchLengths */
+ {
+ U32 max = MaxML;
+ size_t const mostFrequent = FSE_countFast_wksp(count, &max, mlCodeTable, nbSeq, workspace);
+ if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
+ *op++ = *mlCodeTable;
+ FSE_buildCTable_rle(CTable_MatchLength, (BYTE)max);
+ MLtype = set_rle;
+ } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
+ MLtype = set_repeat;
+ } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (ML_defaultNormLog - 1)))) {
+ FSE_buildCTable_wksp(CTable_MatchLength, ML_defaultNorm, MaxML, ML_defaultNormLog, workspace, workspaceSize);
+ MLtype = set_basic;
+ } else {
+ size_t nbSeq_1 = nbSeq;
+ const U32 tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max);
+ if (count[mlCodeTable[nbSeq - 1]] > 1) {
+ count[mlCodeTable[nbSeq - 1]]--;
+ nbSeq_1--;
+ }
+ FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max);
+ {
+ size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog); /* overflow protected */
+ if (FSE_isError(NCountSize))
+ return NCountSize;
+ op += NCountSize;
+ }
+ FSE_buildCTable_wksp(CTable_MatchLength, norm, max, tableLog, workspace, workspaceSize);
+ MLtype = set_compressed;
+ }
+ }
+
+ *seqHead = (BYTE)((LLtype << 6) + (Offtype << 4) + (MLtype << 2));
+ zc->flagStaticTables = 0;
+
+ /* Encoding Sequences */
+ {
+ BIT_CStream_t blockStream;
+ FSE_CState_t stateMatchLength;
+ FSE_CState_t stateOffsetBits;
+ FSE_CState_t stateLitLength;
+
+ CHECK_E(BIT_initCStream(&blockStream, op, oend - op), dstSize_tooSmall); /* not enough space remaining */
+
+ /* first symbols */
+ FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq - 1]);
+ FSE_initCState2(&stateOffsetBits, CTable_OffsetBits, ofCodeTable[nbSeq - 1]);
+ FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq - 1]);
+ BIT_addBits(&blockStream, sequences[nbSeq - 1].litLength, LL_bits[llCodeTable[nbSeq - 1]]);
+ if (ZSTD_32bits())
+ BIT_flushBits(&blockStream);
+ BIT_addBits(&blockStream, sequences[nbSeq - 1].matchLength, ML_bits[mlCodeTable[nbSeq - 1]]);
+ if (ZSTD_32bits())
+ BIT_flushBits(&blockStream);
+ if (longOffsets) {
+ U32 const ofBits = ofCodeTable[nbSeq - 1];
+ int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN - 1);
+ if (extraBits) {
+ BIT_addBits(&blockStream, sequences[nbSeq - 1].offset, extraBits);
+ BIT_flushBits(&blockStream);
+ }
+ BIT_addBits(&blockStream, sequences[nbSeq - 1].offset >> extraBits, ofBits - extraBits);
+ } else {
+ BIT_addBits(&blockStream, sequences[nbSeq - 1].offset, ofCodeTable[nbSeq - 1]);
+ }
+ BIT_flushBits(&blockStream);
+
+ {
+ size_t n;
+ for (n = nbSeq - 2; n < nbSeq; n--) { /* intentional underflow */
+ BYTE const llCode = llCodeTable[n];
+ BYTE const ofCode = ofCodeTable[n];
+ BYTE const mlCode = mlCodeTable[n];
+ U32 const llBits = LL_bits[llCode];
+ U32 const ofBits = ofCode; /* 32b*/ /* 64b*/
+ U32 const mlBits = ML_bits[mlCode];
+ /* (7)*/ /* (7)*/
+ FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode); /* 15 */ /* 15 */
+ FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode); /* 24 */ /* 24 */
+ if (ZSTD_32bits())
+ BIT_flushBits(&blockStream); /* (7)*/
+ FSE_encodeSymbol(&blockStream, &stateLitLength, llCode); /* 16 */ /* 33 */
+ if (ZSTD_32bits() || (ofBits + mlBits + llBits >= 64 - 7 - (LLFSELog + MLFSELog + OffFSELog)))
+ BIT_flushBits(&blockStream); /* (7)*/
+ BIT_addBits(&blockStream, sequences[n].litLength, llBits);
+ if (ZSTD_32bits() && ((llBits + mlBits) > 24))
+ BIT_flushBits(&blockStream);
+ BIT_addBits(&blockStream, sequences[n].matchLength, mlBits);
+ if (ZSTD_32bits())
+ BIT_flushBits(&blockStream); /* (7)*/
+ if (longOffsets) {
+ int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN - 1);
+ if (extraBits) {
+ BIT_addBits(&blockStream, sequences[n].offset, extraBits);
+ BIT_flushBits(&blockStream); /* (7)*/
+ }
+ BIT_addBits(&blockStream, sequences[n].offset >> extraBits, ofBits - extraBits); /* 31 */
+ } else {
+ BIT_addBits(&blockStream, sequences[n].offset, ofBits); /* 31 */
+ }
+ BIT_flushBits(&blockStream); /* (7)*/
+ }
+ }
+
+ FSE_flushCState(&blockStream, &stateMatchLength);
+ FSE_flushCState(&blockStream, &stateOffsetBits);
+ FSE_flushCState(&blockStream, &stateLitLength);
+
+ {
+ size_t const streamSize = BIT_closeCStream(&blockStream);
+ if (streamSize == 0)
+ return ERROR(dstSize_tooSmall); /* not enough space */
+ op += streamSize;
+ }
+ }
+ return op - ostart;
+}
+
+ZSTD_STATIC size_t ZSTD_compressSequences(ZSTD_CCtx *zc, void *dst, size_t dstCapacity, size_t srcSize)
+{
+ size_t const cSize = ZSTD_compressSequences_internal(zc, dst, dstCapacity);
+ size_t const minGain = ZSTD_minGain(srcSize);
+ size_t const maxCSize = srcSize - minGain;
+ /* If the srcSize <= dstCapacity, then there is enough space to write a
+ * raw uncompressed block. Since we ran out of space, the block must not
+ * be compressible, so fall back to a raw uncompressed block.
+ */
+ int const uncompressibleError = cSize == ERROR(dstSize_tooSmall) && srcSize <= dstCapacity;
+ int i;
+
+ if (ZSTD_isError(cSize) && !uncompressibleError)
+ return cSize;
+ if (cSize >= maxCSize || uncompressibleError) {
+ zc->flagStaticHufTable = HUF_repeat_none;
+ return 0;
+ }
+ /* confirm repcodes */
+ for (i = 0; i < ZSTD_REP_NUM; i++)
+ zc->rep[i] = zc->repToConfirm[i];
+ return cSize;
+}
+
+/*! ZSTD_storeSeq() :
+ Store a sequence (literal length, literals, offset code and match length code) into seqStore_t.
+ `offsetCode` : distance to match, or 0 == repCode.
+ `matchCode` : matchLength - MINMATCH
+*/
+ZSTD_STATIC void ZSTD_storeSeq(seqStore_t *seqStorePtr, size_t litLength, const void *literals, U32 offsetCode, size_t matchCode)
+{
+ /* copy Literals */
+ ZSTD_wildcopy(seqStorePtr->lit, literals, litLength);
+ seqStorePtr->lit += litLength;
+
+ /* literal Length */
+ if (litLength > 0xFFFF) {
+ seqStorePtr->longLengthID = 1;
+ seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+ }
+ seqStorePtr->sequences[0].litLength = (U16)litLength;
+
+ /* match offset */
+ seqStorePtr->sequences[0].offset = offsetCode + 1;
+
+ /* match Length */
+ if (matchCode > 0xFFFF) {
+ seqStorePtr->longLengthID = 2;
+ seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+ }
+ seqStorePtr->sequences[0].matchLength = (U16)matchCode;
+
+ seqStorePtr->sequences++;
+}
+
+/*-*************************************
+* Match length counter
+***************************************/
+static unsigned ZSTD_NbCommonBytes(register size_t val)
+{
+ if (ZSTD_isLittleEndian()) {
+ if (ZSTD_64bits()) {
+ return (__builtin_ctzll((U64)val) >> 3);
+ } else { /* 32 bits */
+ return (__builtin_ctz((U32)val) >> 3);
+ }
+ } else { /* Big Endian CPU */
+ if (ZSTD_64bits()) {
+ return (__builtin_clzll(val) >> 3);
+ } else { /* 32 bits */
+ return (__builtin_clz((U32)val) >> 3);
+ }
+ }
+}
+
+static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, const BYTE *const pInLimit)
+{
+ const BYTE *const pStart = pIn;
+ const BYTE *const pInLoopLimit = pInLimit - (sizeof(size_t) - 1);
+
+ while (pIn < pInLoopLimit) {
+ size_t const diff = ZSTD_readST(pMatch) ^ ZSTD_readST(pIn);
+ if (!diff) {
+ pIn += sizeof(size_t);
+ pMatch += sizeof(size_t);
+ continue;
+ }
+ pIn += ZSTD_NbCommonBytes(diff);
+ return (size_t)(pIn - pStart);
+ }
+ if (ZSTD_64bits())
+ if ((pIn < (pInLimit - 3)) && (ZSTD_read32(pMatch) == ZSTD_read32(pIn))) {
+ pIn += 4;
+ pMatch += 4;
+ }
+ if ((pIn < (pInLimit - 1)) && (ZSTD_read16(pMatch) == ZSTD_read16(pIn))) {
+ pIn += 2;
+ pMatch += 2;
+ }
+ if ((pIn < pInLimit) && (*pMatch == *pIn))
+ pIn++;
+ return (size_t)(pIn - pStart);
+}
+
+/** ZSTD_count_2segments() :
+* can count match length with `ip` & `match` in 2 different segments.
+* convention : on reaching mEnd, match count continue starting from iStart
+*/
+static size_t ZSTD_count_2segments(const BYTE *ip, const BYTE *match, const BYTE *iEnd, const BYTE *mEnd, const BYTE *iStart)
+{
+ const BYTE *const vEnd = MIN(ip + (mEnd - match), iEnd);
+ size_t const matchLength = ZSTD_count(ip, match, vEnd);
+ if (match + matchLength != mEnd)
+ return matchLength;
+ return matchLength + ZSTD_count(ip + matchLength, iStart, iEnd);
+}
+
+/*-*************************************
+* Hashes
+***************************************/
+static const U32 prime3bytes = 506832829U;
+static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32 - 24)) * prime3bytes) >> (32 - h); }
+ZSTD_STATIC size_t ZSTD_hash3Ptr(const void *ptr, U32 h) { return ZSTD_hash3(ZSTD_readLE32(ptr), h); } /* only in zstd_opt.h */
+
+static const U32 prime4bytes = 2654435761U;
+static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32 - h); }
+static size_t ZSTD_hash4Ptr(const void *ptr, U32 h) { return ZSTD_hash4(ZSTD_read32(ptr), h); }
+
+static const U64 prime5bytes = 889523592379ULL;
+static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64 - 40)) * prime5bytes) >> (64 - h)); }
+static size_t ZSTD_hash5Ptr(const void *p, U32 h) { return ZSTD_hash5(ZSTD_readLE64(p), h); }
+
+static const U64 prime6bytes = 227718039650203ULL;
+static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64 - 48)) * prime6bytes) >> (64 - h)); }
+static size_t ZSTD_hash6Ptr(const void *p, U32 h) { return ZSTD_hash6(ZSTD_readLE64(p), h); }
+
+static const U64 prime7bytes = 58295818150454627ULL;
+static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64 - 56)) * prime7bytes) >> (64 - h)); }
+static size_t ZSTD_hash7Ptr(const void *p, U32 h) { return ZSTD_hash7(ZSTD_readLE64(p), h); }
+
+static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
+static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u)*prime8bytes) >> (64 - h)); }
+static size_t ZSTD_hash8Ptr(const void *p, U32 h) { return ZSTD_hash8(ZSTD_readLE64(p), h); }
+
+static size_t ZSTD_hashPtr(const void *p, U32 hBits, U32 mls)
+{
+ switch (mls) {
+ // case 3: return ZSTD_hash3Ptr(p, hBits);
+ default:
+ case 4: return ZSTD_hash4Ptr(p, hBits);
+ case 5: return ZSTD_hash5Ptr(p, hBits);
+ case 6: return ZSTD_hash6Ptr(p, hBits);
+ case 7: return ZSTD_hash7Ptr(p, hBits);
+ case 8: return ZSTD_hash8Ptr(p, hBits);
+ }
+}
+
+/*-*************************************
+* Fast Scan
+***************************************/
+static void ZSTD_fillHashTable(ZSTD_CCtx *zc, const void *end, const U32 mls)
+{
+ U32 *const hashTable = zc->hashTable;
+ U32 const hBits = zc->params.cParams.hashLog;
+ const BYTE *const base = zc->base;
+ const BYTE *ip = base + zc->nextToUpdate;
+ const BYTE *const iend = ((const BYTE *)end) - HASH_READ_SIZE;
+ const size_t fastHashFillStep = 3;
+
+ while (ip <= iend) {
+ hashTable[ZSTD_hashPtr(ip, hBits, mls)] = (U32)(ip - base);
+ ip += fastHashFillStep;
+ }
+}
+
+FORCE_INLINE
+void ZSTD_compressBlock_fast_generic(ZSTD_CCtx *cctx, const void *src, size_t srcSize, const U32 mls)
+{
+ U32 *const hashTable = cctx->hashTable;
+ U32 const hBits = cctx->params.cParams.hashLog;
+ seqStore_t *seqStorePtr = &(cctx->seqStore);
+ const BYTE *const base = cctx->base;
+ const BYTE *const istart = (const BYTE *)src;
+ const BYTE *ip = istart;
+ const BYTE *anchor = istart;
+ const U32 lowestIndex = cctx->dictLimit;
+ const BYTE *const lowest = base + lowestIndex;
+ const BYTE *const iend = istart + srcSize;
+ const BYTE *const ilimit = iend - HASH_READ_SIZE;
+ U32 offset_1 = cctx->rep[0], offset_2 = cctx->rep[1];
+ U32 offsetSaved = 0;
+
+ /* init */
+ ip += (ip == lowest);
+ {
+ U32 const maxRep = (U32)(ip - lowest);
+ if (offset_2 > maxRep)
+ offsetSaved = offset_2, offset_2 = 0;
+ if (offset_1 > maxRep)
+ offsetSaved = offset_1, offset_1 = 0;
+ }
+
+ /* Main Search Loop */
+ while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */
+ size_t mLength;
+ size_t const h = ZSTD_hashPtr(ip, hBits, mls);
+ U32 const curr = (U32)(ip - base);
+ U32 const matchIndex = hashTable[h];
+ const BYTE *match = base + matchIndex;
+ hashTable[h] = curr; /* update hash table */
+
+ if ((offset_1 > 0) & (ZSTD_read32(ip + 1 - offset_1) == ZSTD_read32(ip + 1))) {
+ mLength = ZSTD_count(ip + 1 + 4, ip + 1 + 4 - offset_1, iend) + 4;
+ ip++;
+ ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, 0, mLength - MINMATCH);
+ } else {
+ U32 offset;
+ if ((matchIndex <= lowestIndex) || (ZSTD_read32(match) != ZSTD_read32(ip))) {
+ ip += ((ip - anchor) >> g_searchStrength) + 1;
+ continue;
+ }
+ mLength = ZSTD_count(ip + 4, match + 4, iend) + 4;
+ offset = (U32)(ip - match);
+ while (((ip > anchor) & (match > lowest)) && (ip[-1] == match[-1])) {
+ ip--;
+ match--;
+ mLength++;
+ } /* catch up */
+ offset_2 = offset_1;
+ offset_1 = offset;
+
+ ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH);
+ }
+
+ /* match found */
+ ip += mLength;
+ anchor = ip;
+
+ if (ip <= ilimit) {
+ /* Fill Table */
+ hashTable[ZSTD_hashPtr(base + curr + 2, hBits, mls)] = curr + 2; /* here because curr+2 could be > iend-8 */
+ hashTable[ZSTD_hashPtr(ip - 2, hBits, mls)] = (U32)(ip - 2 - base);
+ /* check immediate repcode */
+ while ((ip <= ilimit) && ((offset_2 > 0) & (ZSTD_read32(ip) == ZSTD_read32(ip - offset_2)))) {
+ /* store sequence */
+ size_t const rLength = ZSTD_count(ip + 4, ip + 4 - offset_2, iend) + 4;
+ {
+ U32 const tmpOff = offset_2;
+ offset_2 = offset_1;
+ offset_1 = tmpOff;
+ } /* swap offset_2 <=> offset_1 */
+ hashTable[ZSTD_hashPtr(ip, hBits, mls)] = (U32)(ip - base);
+ ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, rLength - MINMATCH);
+ ip += rLength;
+ anchor = ip;
+ continue; /* faster when present ... (?) */
+ }
+ }
+ }
+
+ /* save reps for next block */
+ cctx->repToConfirm[0] = offset_1 ? offset_1 : offsetSaved;
+ cctx->repToConfirm[1] = offset_2 ? offset_2 : offsetSaved;
+
+ /* Last Literals */
+ {
+ size_t const lastLLSize = iend - anchor;
+ memcpy(seqStorePtr->lit, anchor, lastLLSize);
+ seqStorePtr->lit += lastLLSize;
+ }
+}
+
+static void ZSTD_compressBlock_fast(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+ const U32 mls = ctx->params.cParams.searchLength;
+ switch (mls) {
+ default: /* includes case 3 */
+ case 4: ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 4); return;
+ case 5: ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 5); return;
+ case 6: ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 6); return;
+ case 7: ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 7); return;
+ }
+}
+
+static void ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx *ctx, const void *src, size_t srcSize, const U32 mls)
+{
+ U32 *hashTable = ctx->hashTable;
+ const U32 hBits = ctx->params.cParams.hashLog;
+ seqStore_t *seqStorePtr = &(ctx->seqStore);
+ const BYTE *const base = ctx->base;
+ const BYTE *const dictBase = ctx->dictBase;
+ const BYTE *const istart = (const BYTE *)src;
+ const BYTE *ip = istart;
+ const BYTE *anchor = istart;
+ const U32 lowestIndex = ctx->lowLimit;
+ const BYTE *const dictStart = dictBase + lowestIndex;
+ const U32 dictLimit = ctx->dictLimit;
+ const BYTE *const lowPrefixPtr = base + dictLimit;
+ const BYTE *const dictEnd = dictBase + dictLimit;
+ const BYTE *const iend = istart + srcSize;
+ const BYTE *const ilimit = iend - 8;
+ U32 offset_1 = ctx->rep[0], offset_2 = ctx->rep[1];
+
+ /* Search Loop */
+ while (ip < ilimit) { /* < instead of <=, because (ip+1) */
+ const size_t h = ZSTD_hashPtr(ip, hBits, mls);
+ const U32 matchIndex = hashTable[h];
+ const BYTE *matchBase = matchIndex < dictLimit ? dictBase : base;
+ const BYTE *match = matchBase + matchIndex;
+ const U32 curr = (U32)(ip - base);
+ const U32 repIndex = curr + 1 - offset_1; /* offset_1 expected <= curr +1 */
+ const BYTE *repBase = repIndex < dictLimit ? dictBase : base;
+ const BYTE *repMatch = repBase + repIndex;
+ size_t mLength;
+ hashTable[h] = curr; /* update hash table */
+
+ if ((((U32)((dictLimit - 1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > lowestIndex)) &&
+ (ZSTD_read32(repMatch) == ZSTD_read32(ip + 1))) {
+ const BYTE *repMatchEnd = repIndex < dictLimit ? dictEnd : iend;
+ mLength = ZSTD_count_2segments(ip + 1 + EQUAL_READ32, repMatch + EQUAL_READ32, iend, repMatchEnd, lowPrefixPtr) + EQUAL_READ32;
+ ip++;
+ ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, 0, mLength - MINMATCH);
+ } else {
+ if ((matchIndex < lowestIndex) || (ZSTD_read32(match) != ZSTD_read32(ip))) {
+ ip += ((ip - anchor) >> g_searchStrength) + 1;
+ continue;
+ }
+ {
+ const BYTE *matchEnd = matchIndex < dictLimit ? dictEnd : iend;
+ const BYTE *lowMatchPtr = matchIndex < dictLimit ? dictStart : lowPrefixPtr;
+ U32 offset;
+ mLength = ZSTD_count_2segments(ip + EQUAL_READ32, match + EQUAL_READ32, iend, matchEnd, lowPrefixPtr) + EQUAL_READ32;
+ while (((ip > anchor) & (match > lowMatchPtr)) && (ip[-1] == match[-1])) {
+ ip--;
+ match--;
+ mLength++;
+ } /* catch up */
+ offset = curr - matchIndex;
+ offset_2 = offset_1;
+ offset_1 = offset;
+ ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH);
+ }
+ }
+
+ /* found a match : store it */
+ ip += mLength;
+ anchor = ip;
+
+ if (ip <= ilimit) {
+ /* Fill Table */
+ hashTable[ZSTD_hashPtr(base + curr + 2, hBits, mls)] = curr + 2;
+ hashTable[ZSTD_hashPtr(ip - 2, hBits, mls)] = (U32)(ip - 2 - base);
+ /* check immediate repcode */
+ while (ip <= ilimit) {
+ U32 const curr2 = (U32)(ip - base);
+ U32 const repIndex2 = curr2 - offset_2;
+ const BYTE *repMatch2 = repIndex2 < dictLimit ? dictBase + repIndex2 : base + repIndex2;
+ if ((((U32)((dictLimit - 1) - repIndex2) >= 3) & (repIndex2 > lowestIndex)) /* intentional overflow */
+ && (ZSTD_read32(repMatch2) == ZSTD_read32(ip))) {
+ const BYTE *const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend;
+ size_t repLength2 =
+ ZSTD_count_2segments(ip + EQUAL_READ32, repMatch2 + EQUAL_READ32, iend, repEnd2, lowPrefixPtr) + EQUAL_READ32;
+ U32 tmpOffset = offset_2;
+ offset_2 = offset_1;
+ offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
+ ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, repLength2 - MINMATCH);
+ hashTable[ZSTD_hashPtr(ip, hBits, mls)] = curr2;
+ ip += repLength2;
+ anchor = ip;
+ continue;
+ }
+ break;
+ }
+ }
+ }
+
+ /* save reps for next block */
+ ctx->repToConfirm[0] = offset_1;
+ ctx->repToConfirm[1] = offset_2;
+
+ /* Last Literals */
+ {
+ size_t const lastLLSize = iend - anchor;
+ memcpy(seqStorePtr->lit, anchor, lastLLSize);
+ seqStorePtr->lit += lastLLSize;
+ }
+}
+
+static void ZSTD_compressBlock_fast_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+ U32 const mls = ctx->params.cParams.searchLength;
+ switch (mls) {
+ default: /* includes case 3 */
+ case 4: ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 4); return;
+ case 5: ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 5); return;
+ case 6: ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 6); return;
+ case 7: ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 7); return;
+ }
+}
+
+/*-*************************************
+* Double Fast
+***************************************/
+static void ZSTD_fillDoubleHashTable(ZSTD_CCtx *cctx, const void *end, const U32 mls)
+{
+ U32 *const hashLarge = cctx->hashTable;
+ U32 const hBitsL = cctx->params.cParams.hashLog;
+ U32 *const hashSmall = cctx->chainTable;
+ U32 const hBitsS = cctx->params.cParams.chainLog;
+ const BYTE *const base = cctx->base;
+ const BYTE *ip = base + cctx->nextToUpdate;
+ const BYTE *const iend = ((const BYTE *)end) - HASH_READ_SIZE;
+ const size_t fastHashFillStep = 3;
+
+ while (ip <= iend) {
+ hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip - base);
+ hashLarge[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip - base);
+ ip += fastHashFillStep;
+ }
+}
+
+FORCE_INLINE
+void ZSTD_compressBlock_doubleFast_generic(ZSTD_CCtx *cctx, const void *src, size_t srcSize, const U32 mls)
+{
+ U32 *const hashLong = cctx->hashTable;
+ const U32 hBitsL = cctx->params.cParams.hashLog;
+ U32 *const hashSmall = cctx->chainTable;
+ const U32 hBitsS = cctx->params.cParams.chainLog;
+ seqStore_t *seqStorePtr = &(cctx->seqStore);
+ const BYTE *const base = cctx->base;
+ const BYTE *const istart = (const BYTE *)src;
+ const BYTE *ip = istart;
+ const BYTE *anchor = istart;
+ const U32 lowestIndex = cctx->dictLimit;
+ const BYTE *const lowest = base + lowestIndex;
+ const BYTE *const iend = istart + srcSize;
+ const BYTE *const ilimit = iend - HASH_READ_SIZE;
+ U32 offset_1 = cctx->rep[0], offset_2 = cctx->rep[1];
+ U32 offsetSaved = 0;
+
+ /* init */
+ ip += (ip == lowest);
+ {
+ U32 const maxRep = (U32)(ip - lowest);
+ if (offset_2 > maxRep)
+ offsetSaved = offset_2, offset_2 = 0;
+ if (offset_1 > maxRep)
+ offsetSaved = offset_1, offset_1 = 0;
+ }
+
+ /* Main Search Loop */
+ while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */
+ size_t mLength;
+ size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8);
+ size_t const h = ZSTD_hashPtr(ip, hBitsS, mls);
+ U32 const curr = (U32)(ip - base);
+ U32 const matchIndexL = hashLong[h2];
+ U32 const matchIndexS = hashSmall[h];
+ const BYTE *matchLong = base + matchIndexL;
+ const BYTE *match = base + matchIndexS;
+ hashLong[h2] = hashSmall[h] = curr; /* update hash tables */
+
+ if ((offset_1 > 0) & (ZSTD_read32(ip + 1 - offset_1) == ZSTD_read32(ip + 1))) { /* note : by construction, offset_1 <= curr */
+ mLength = ZSTD_count(ip + 1 + 4, ip + 1 + 4 - offset_1, iend) + 4;
+ ip++;
+ ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, 0, mLength - MINMATCH);
+ } else {
+ U32 offset;
+ if ((matchIndexL > lowestIndex) && (ZSTD_read64(matchLong) == ZSTD_read64(ip))) {
+ mLength = ZSTD_count(ip + 8, matchLong + 8, iend) + 8;
+ offset = (U32)(ip - matchLong);
+ while (((ip > anchor) & (matchLong > lowest)) && (ip[-1] == matchLong[-1])) {
+ ip--;
+ matchLong--;
+ mLength++;
+ } /* catch up */
+ } else if ((matchIndexS > lowestIndex) && (ZSTD_read32(match) == ZSTD_read32(ip))) {
+ size_t const h3 = ZSTD_hashPtr(ip + 1, hBitsL, 8);
+ U32 const matchIndex3 = hashLong[h3];
+ const BYTE *match3 = base + matchIndex3;
+ hashLong[h3] = curr + 1;
+ if ((matchIndex3 > lowestIndex) && (ZSTD_read64(match3) == ZSTD_read64(ip + 1))) {
+ mLength = ZSTD_count(ip + 9, match3 + 8, iend) + 8;
+ ip++;
+ offset = (U32)(ip - match3);
+ while (((ip > anchor) & (match3 > lowest)) && (ip[-1] == match3[-1])) {
+ ip--;
+ match3--;
+ mLength++;
+ } /* catch up */
+ } else {
+ mLength = ZSTD_count(ip + 4, match + 4, iend) + 4;
+ offset = (U32)(ip - match);
+ while (((ip > anchor) & (match > lowest)) && (ip[-1] == match[-1])) {
+ ip--;
+ match--;
+ mLength++;
+ } /* catch up */
+ }
+ } else {
+ ip += ((ip - anchor) >> g_searchStrength) + 1;
+ continue;
+ }
+
+ offset_2 = offset_1;
+ offset_1 = offset;
+
+ ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH);
+ }
+
+ /* match found */
+ ip += mLength;
+ anchor = ip;
+
+ if (ip <= ilimit) {
+ /* Fill Table */
+ hashLong[ZSTD_hashPtr(base + curr + 2, hBitsL, 8)] = hashSmall[ZSTD_hashPtr(base + curr + 2, hBitsS, mls)] =
+ curr + 2; /* here because curr+2 could be > iend-8 */
+ hashLong[ZSTD_hashPtr(ip - 2, hBitsL, 8)] = hashSmall[ZSTD_hashPtr(ip - 2, hBitsS, mls)] = (U32)(ip - 2 - base);
+
+ /* check immediate repcode */
+ while ((ip <= ilimit) && ((offset_2 > 0) & (ZSTD_read32(ip) == ZSTD_read32(ip - offset_2)))) {
+ /* store sequence */
+ size_t const rLength = ZSTD_count(ip + 4, ip + 4 - offset_2, iend) + 4;
+ {
+ U32 const tmpOff = offset_2;
+ offset_2 = offset_1;
+ offset_1 = tmpOff;
+ } /* swap offset_2 <=> offset_1 */
+ hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip - base);
+ hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip - base);
+ ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, rLength - MINMATCH);
+ ip += rLength;
+ anchor = ip;
+ continue; /* faster when present ... (?) */
+ }
+ }
+ }
+
+ /* save reps for next block */
+ cctx->repToConfirm[0] = offset_1 ? offset_1 : offsetSaved;
+ cctx->repToConfirm[1] = offset_2 ? offset_2 : offsetSaved;
+
+ /* Last Literals */
+ {
+ size_t const lastLLSize = iend - anchor;
+ memcpy(seqStorePtr->lit, anchor, lastLLSize);
+ seqStorePtr->lit += lastLLSize;
+ }
+}
+
+static void ZSTD_compressBlock_doubleFast(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+ const U32 mls = ctx->params.cParams.searchLength;
+ switch (mls) {
+ default: /* includes case 3 */
+ case 4: ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 4); return;
+ case 5: ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 5); return;
+ case 6: ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 6); return;
+ case 7: ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 7); return;
+ }
+}
+
+static void ZSTD_compressBlock_doubleFast_extDict_generic(ZSTD_CCtx *ctx, const void *src, size_t srcSize, const U32 mls)
+{
+ U32 *const hashLong = ctx->hashTable;
+ U32 const hBitsL = ctx->params.cParams.hashLog;
+ U32 *const hashSmall = ctx->chainTable;
+ U32 const hBitsS = ctx->params.cParams.chainLog;
+ seqStore_t *seqStorePtr = &(ctx->seqStore);
+ const BYTE *const base = ctx->base;
+ const BYTE *const dictBase = ctx->dictBase;
+ const BYTE *const istart = (const BYTE *)src;
+ const BYTE *ip = istart;
+ const BYTE *anchor = istart;
+ const U32 lowestIndex = ctx->lowLimit;
+ const BYTE *const dictStart = dictBase + lowestIndex;
+ const U32 dictLimit = ctx->dictLimit;
+ const BYTE *const lowPrefixPtr = base + dictLimit;
+ const BYTE *const dictEnd = dictBase + dictLimit;
+ const BYTE *const iend = istart + srcSize;
+ const BYTE *const ilimit = iend - 8;
+ U32 offset_1 = ctx->rep[0], offset_2 = ctx->rep[1];
+
+ /* Search Loop */
+ while (ip < ilimit) { /* < instead of <=, because (ip+1) */
+ const size_t hSmall = ZSTD_hashPtr(ip, hBitsS, mls);
+ const U32 matchIndex = hashSmall[hSmall];
+ const BYTE *matchBase = matchIndex < dictLimit ? dictBase : base;
+ const BYTE *match = matchBase + matchIndex;
+
+ const size_t hLong = ZSTD_hashPtr(ip, hBitsL, 8);
+ const U32 matchLongIndex = hashLong[hLong];
+ const BYTE *matchLongBase = matchLongIndex < dictLimit ? dictBase : base;
+ const BYTE *matchLong = matchLongBase + matchLongIndex;
+
+ const U32 curr = (U32)(ip - base);
+ const U32 repIndex = curr + 1 - offset_1; /* offset_1 expected <= curr +1 */
+ const BYTE *repBase = repIndex < dictLimit ? dictBase : base;
+ const BYTE *repMatch = repBase + repIndex;
+ size_t mLength;
+ hashSmall[hSmall] = hashLong[hLong] = curr; /* update hash table */
+
+ if ((((U32)((dictLimit - 1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > lowestIndex)) &&
+ (ZSTD_read32(repMatch) == ZSTD_read32(ip + 1))) {
+ const BYTE *repMatchEnd = repIndex < dictLimit ? dictEnd : iend;
+ mLength = ZSTD_count_2segments(ip + 1 + 4, repMatch + 4, iend, repMatchEnd, lowPrefixPtr) + 4;
+ ip++;
+ ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, 0, mLength - MINMATCH);
+ } else {
+ if ((matchLongIndex > lowestIndex) && (ZSTD_read64(matchLong) == ZSTD_read64(ip))) {
+ const BYTE *matchEnd = matchLongIndex < dictLimit ? dictEnd : iend;
+ const BYTE *lowMatchPtr = matchLongIndex < dictLimit ? dictStart : lowPrefixPtr;
+ U32 offset;
+ mLength = ZSTD_count_2segments(ip + 8, matchLong + 8, iend, matchEnd, lowPrefixPtr) + 8;
+ offset = curr - matchLongIndex;
+ while (((ip > anchor) & (matchLong > lowMatchPtr)) && (ip[-1] == matchLong[-1])) {
+ ip--;
+ matchLong--;
+ mLength++;
+ } /* catch up */
+ offset_2 = offset_1;
+ offset_1 = offset;
+ ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH);
+
+ } else if ((matchIndex > lowestIndex) && (ZSTD_read32(match) == ZSTD_read32(ip))) {
+ size_t const h3 = ZSTD_hashPtr(ip + 1, hBitsL, 8);
+ U32 const matchIndex3 = hashLong[h3];
+ const BYTE *const match3Base = matchIndex3 < dictLimit ? dictBase : base;
+ const BYTE *match3 = match3Base + matchIndex3;
+ U32 offset;
+ hashLong[h3] = curr + 1;
+ if ((matchIndex3 > lowestIndex) && (ZSTD_read64(match3) == ZSTD_read64(ip + 1))) {
+ const BYTE *matchEnd = matchIndex3 < dictLimit ? dictEnd : iend;
+ const BYTE *lowMatchPtr = matchIndex3 < dictLimit ? dictStart : lowPrefixPtr;
+ mLength = ZSTD_count_2segments(ip + 9, match3 + 8, iend, matchEnd, lowPrefixPtr) + 8;
+ ip++;
+ offset = curr + 1 - matchIndex3;
+ while (((ip > anchor) & (match3 > lowMatchPtr)) && (ip[-1] == match3[-1])) {
+ ip--;
+ match3--;
+ mLength++;
+ } /* catch up */
+ } else {
+ const BYTE *matchEnd = matchIndex < dictLimit ? dictEnd : iend;
+ const BYTE *lowMatchPtr = matchIndex < dictLimit ? dictStart : lowPrefixPtr;
+ mLength = ZSTD_count_2segments(ip + 4, match + 4, iend, matchEnd, lowPrefixPtr) + 4;
+ offset = curr - matchIndex;
+ while (((ip > anchor) & (match > lowMatchPtr)) && (ip[-1] == match[-1])) {
+ ip--;
+ match--;
+ mLength++;
+ } /* catch up */
+ }
+ offset_2 = offset_1;
+ offset_1 = offset;
+ ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH);
+
+ } else {
+ ip += ((ip - anchor) >> g_searchStrength) + 1;
+ continue;
+ }
+ }
+
+ /* found a match : store it */
+ ip += mLength;
+ anchor = ip;
+
+ if (ip <= ilimit) {
+ /* Fill Table */
+ hashSmall[ZSTD_hashPtr(base + curr + 2, hBitsS, mls)] = curr + 2;
+ hashLong[ZSTD_hashPtr(base + curr + 2, hBitsL, 8)] = curr + 2;
+ hashSmall[ZSTD_hashPtr(ip - 2, hBitsS, mls)] = (U32)(ip - 2 - base);
+ hashLong[ZSTD_hashPtr(ip - 2, hBitsL, 8)] = (U32)(ip - 2 - base);
+ /* check immediate repcode */
+ while (ip <= ilimit) {
+ U32 const curr2 = (U32)(ip - base);
+ U32 const repIndex2 = curr2 - offset_2;
+ const BYTE *repMatch2 = repIndex2 < dictLimit ? dictBase + repIndex2 : base + repIndex2;
+ if ((((U32)((dictLimit - 1) - repIndex2) >= 3) & (repIndex2 > lowestIndex)) /* intentional overflow */
+ && (ZSTD_read32(repMatch2) == ZSTD_read32(ip))) {
+ const BYTE *const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend;
+ size_t const repLength2 =
+ ZSTD_count_2segments(ip + EQUAL_READ32, repMatch2 + EQUAL_READ32, iend, repEnd2, lowPrefixPtr) + EQUAL_READ32;
+ U32 tmpOffset = offset_2;
+ offset_2 = offset_1;
+ offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
+ ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, repLength2 - MINMATCH);
+ hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = curr2;
+ hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = curr2;
+ ip += repLength2;
+ anchor = ip;
+ continue;
+ }
+ break;
+ }
+ }
+ }
+
+ /* save reps for next block */
+ ctx->repToConfirm[0] = offset_1;
+ ctx->repToConfirm[1] = offset_2;
+
+ /* Last Literals */
+ {
+ size_t const lastLLSize = iend - anchor;
+ memcpy(seqStorePtr->lit, anchor, lastLLSize);
+ seqStorePtr->lit += lastLLSize;
+ }
+}
+
+static void ZSTD_compressBlock_doubleFast_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+ U32 const mls = ctx->params.cParams.searchLength;
+ switch (mls) {
+ default: /* includes case 3 */
+ case 4: ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 4); return;
+ case 5: ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 5); return;
+ case 6: ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 6); return;
+ case 7: ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 7); return;
+ }
+}
+
+/*-*************************************
+* Binary Tree search
+***************************************/
+/** ZSTD_insertBt1() : add one or multiple positions to tree.
+* ip : assumed <= iend-8 .
+* @return : nb of positions added */
+static U32 ZSTD_insertBt1(ZSTD_CCtx *zc, const BYTE *const ip, const U32 mls, const BYTE *const iend, U32 nbCompares, U32 extDict)
+{
+ U32 *const hashTable = zc->hashTable;
+ U32 const hashLog = zc->params.cParams.hashLog;
+ size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
+ U32 *const bt = zc->chainTable;
+ U32 const btLog = zc->params.cParams.chainLog - 1;
+ U32 const btMask = (1 << btLog) - 1;
+ U32 matchIndex = hashTable[h];
+ size_t commonLengthSmaller = 0, commonLengthLarger = 0;
+ const BYTE *const base = zc->base;
+ const BYTE *const dictBase = zc->dictBase;
+ const U32 dictLimit = zc->dictLimit;
+ const BYTE *const dictEnd = dictBase + dictLimit;
+ const BYTE *const prefixStart = base + dictLimit;
+ const BYTE *match;
+ const U32 curr = (U32)(ip - base);
+ const U32 btLow = btMask >= curr ? 0 : curr - btMask;
+ U32 *smallerPtr = bt + 2 * (curr & btMask);
+ U32 *largerPtr = smallerPtr + 1;
+ U32 dummy32; /* to be nullified at the end */
+ U32 const windowLow = zc->lowLimit;
+ U32 matchEndIdx = curr + 8;
+ size_t bestLength = 8;
+
+ hashTable[h] = curr; /* Update Hash Table */
+
+ while (nbCompares-- && (matchIndex > windowLow)) {
+ U32 *const nextPtr = bt + 2 * (matchIndex & btMask);
+ size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
+
+ if ((!extDict) || (matchIndex + matchLength >= dictLimit)) {
+ match = base + matchIndex;
+ if (match[matchLength] == ip[matchLength])
+ matchLength += ZSTD_count(ip + matchLength + 1, match + matchLength + 1, iend) + 1;
+ } else {
+ match = dictBase + matchIndex;
+ matchLength += ZSTD_count_2segments(ip + matchLength, match + matchLength, iend, dictEnd, prefixStart);
+ if (matchIndex + matchLength >= dictLimit)
+ match = base + matchIndex; /* to prepare for next usage of match[matchLength] */
+ }
+
+ if (matchLength > bestLength) {
+ bestLength = matchLength;
+ if (matchLength > matchEndIdx - matchIndex)
+ matchEndIdx = matchIndex + (U32)matchLength;
+ }
+
+ if (ip + matchLength == iend) /* equal : no way to know if inf or sup */
+ break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt the tree */
+
+ if (match[matchLength] < ip[matchLength]) { /* necessarily within correct buffer */
+ /* match is smaller than curr */
+ *smallerPtr = matchIndex; /* update smaller idx */
+ commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
+ if (matchIndex <= btLow) {
+ smallerPtr = &dummy32;
+ break;
+ } /* beyond tree size, stop the search */
+ smallerPtr = nextPtr + 1; /* new "smaller" => larger of match */
+ matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to curr) */
+ } else {
+ /* match is larger than curr */
+ *largerPtr = matchIndex;
+ commonLengthLarger = matchLength;
+ if (matchIndex <= btLow) {
+ largerPtr = &dummy32;
+ break;
+ } /* beyond tree size, stop the search */
+ largerPtr = nextPtr;
+ matchIndex = nextPtr[0];
+ }
+ }
+
+ *smallerPtr = *largerPtr = 0;
+ if (bestLength > 384)
+ return MIN(192, (U32)(bestLength - 384)); /* speed optimization */
+ if (matchEndIdx > curr + 8)
+ return matchEndIdx - curr - 8;
+ return 1;
+}
+
+static size_t ZSTD_insertBtAndFindBestMatch(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iend, size_t *offsetPtr, U32 nbCompares, const U32 mls,
+ U32 extDict)
+{
+ U32 *const hashTable = zc->hashTable;
+ U32 const hashLog = zc->params.cParams.hashLog;
+ size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
+ U32 *const bt = zc->chainTable;
+ U32 const btLog = zc->params.cParams.chainLog - 1;
+ U32 const btMask = (1 << btLog) - 1;
+ U32 matchIndex = hashTable[h];
+ size_t commonLengthSmaller = 0, commonLengthLarger = 0;
+ const BYTE *const base = zc->base;
+ const BYTE *const dictBase = zc->dictBase;
+ const U32 dictLimit = zc->dictLimit;
+ const BYTE *const dictEnd = dictBase + dictLimit;
+ const BYTE *const prefixStart = base + dictLimit;
+ const U32 curr = (U32)(ip - base);
+ const U32 btLow = btMask >= curr ? 0 : curr - btMask;
+ const U32 windowLow = zc->lowLimit;
+ U32 *smallerPtr = bt + 2 * (curr & btMask);
+ U32 *largerPtr = bt + 2 * (curr & btMask) + 1;
+ U32 matchEndIdx = curr + 8;
+ U32 dummy32; /* to be nullified at the end */
+ size_t bestLength = 0;
+
+ hashTable[h] = curr; /* Update Hash Table */
+
+ while (nbCompares-- && (matchIndex > windowLow)) {
+ U32 *const nextPtr = bt + 2 * (matchIndex & btMask);
+ size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
+ const BYTE *match;
+
+ if ((!extDict) || (matchIndex + matchLength >= dictLimit)) {
+ match = base + matchIndex;
+ if (match[matchLength] == ip[matchLength])
+ matchLength += ZSTD_count(ip + matchLength + 1, match + matchLength + 1, iend) + 1;
+ } else {
+ match = dictBase + matchIndex;
+ matchLength += ZSTD_count_2segments(ip + matchLength, match + matchLength, iend, dictEnd, prefixStart);
+ if (matchIndex + matchLength >= dictLimit)
+ match = base + matchIndex; /* to prepare for next usage of match[matchLength] */
+ }
+
+ if (matchLength > bestLength) {
+ if (matchLength > matchEndIdx - matchIndex)
+ matchEndIdx = matchIndex + (U32)matchLength;
+ if ((4 * (int)(matchLength - bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)offsetPtr[0] + 1)))
+ bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
+ if (ip + matchLength == iend) /* equal : no way to know if inf or sup */
+ break; /* drop, to guarantee consistency (miss a little bit of compression) */
+ }
+
+ if (match[matchLength] < ip[matchLength]) {
+ /* match is smaller than curr */
+ *smallerPtr = matchIndex; /* update smaller idx */
+ commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
+ if (matchIndex <= btLow) {
+ smallerPtr = &dummy32;
+ break;
+ } /* beyond tree size, stop the search */
+ smallerPtr = nextPtr + 1; /* new "smaller" => larger of match */
+ matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to curr) */
+ } else {
+ /* match is larger than curr */
+ *largerPtr = matchIndex;
+ commonLengthLarger = matchLength;
+ if (matchIndex <= btLow) {
+ largerPtr = &dummy32;
+ break;
+ } /* beyond tree size, stop the search */
+ largerPtr = nextPtr;
+ matchIndex = nextPtr[0];
+ }
+ }
+
+ *smallerPtr = *largerPtr = 0;
+
+ zc->nextToUpdate = (matchEndIdx > curr + 8) ? matchEndIdx - 8 : curr + 1;
+ return bestLength;
+}
+
+static void ZSTD_updateTree(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iend, const U32 nbCompares, const U32 mls)
+{
+ const BYTE *const base = zc->base;
+ const U32 target = (U32)(ip - base);
+ U32 idx = zc->nextToUpdate;
+
+ while (idx < target)
+ idx += ZSTD_insertBt1(zc, base + idx, mls, iend, nbCompares, 0);
+}
+
+/** ZSTD_BtFindBestMatch() : Tree updater, providing best match */
+static size_t ZSTD_BtFindBestMatch(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts, const U32 mls)
+{
+ if (ip < zc->base + zc->nextToUpdate)
+ return 0; /* skipped area */
+ ZSTD_updateTree(zc, ip, iLimit, maxNbAttempts, mls);
+ return ZSTD_insertBtAndFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, mls, 0);
+}
+
+static size_t ZSTD_BtFindBestMatch_selectMLS(ZSTD_CCtx *zc, /* Index table will be updated */
+ const BYTE *ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts, const U32 matchLengthSearch)
+{
+ switch (matchLengthSearch) {
+ default: /* includes case 3 */
+ case 4: return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4);
+ case 5: return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5);
+ case 7:
+ case 6: return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6);
+ }
+}
+
+static void ZSTD_updateTree_extDict(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iend, const U32 nbCompares, const U32 mls)
+{
+ const BYTE *const base = zc->base;
+ const U32 target = (U32)(ip - base);
+ U32 idx = zc->nextToUpdate;
+
+ while (idx < target)
+ idx += ZSTD_insertBt1(zc, base + idx, mls, iend, nbCompares, 1);
+}
+
+/** Tree updater, providing best match */
+static size_t ZSTD_BtFindBestMatch_extDict(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts,
+ const U32 mls)
+{
+ if (ip < zc->base + zc->nextToUpdate)
+ return 0; /* skipped area */
+ ZSTD_updateTree_extDict(zc, ip, iLimit, maxNbAttempts, mls);
+ return ZSTD_insertBtAndFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, mls, 1);
+}
+
+static size_t ZSTD_BtFindBestMatch_selectMLS_extDict(ZSTD_CCtx *zc, /* Index table will be updated */
+ const BYTE *ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts,
+ const U32 matchLengthSearch)
+{
+ switch (matchLengthSearch) {
+ default: /* includes case 3 */
+ case 4: return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4);
+ case 5: return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5);
+ case 7:
+ case 6: return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6);
+ }
+}
+
+/* *********************************
+* Hash Chain
+***********************************/
+#define NEXT_IN_CHAIN(d, mask) chainTable[(d)&mask]
+
+/* Update chains up to ip (excluded)
+ Assumption : always within prefix (i.e. not within extDict) */
+FORCE_INLINE
+U32 ZSTD_insertAndFindFirstIndex(ZSTD_CCtx *zc, const BYTE *ip, U32 mls)
+{
+ U32 *const hashTable = zc->hashTable;
+ const U32 hashLog = zc->params.cParams.hashLog;
+ U32 *const chainTable = zc->chainTable;
+ const U32 chainMask = (1 << zc->params.cParams.chainLog) - 1;
+ const BYTE *const base = zc->base;
+ const U32 target = (U32)(ip - base);
+ U32 idx = zc->nextToUpdate;
+
+ while (idx < target) { /* catch up */
+ size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls);
+ NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
+ hashTable[h] = idx;
+ idx++;
+ }
+
+ zc->nextToUpdate = target;
+ return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
+}
+
+/* inlining is important to hardwire a hot branch (template emulation) */
+FORCE_INLINE
+size_t ZSTD_HcFindBestMatch_generic(ZSTD_CCtx *zc, /* Index table will be updated */
+ const BYTE *const ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts, const U32 mls,
+ const U32 extDict)
+{
+ U32 *const chainTable = zc->chainTable;
+ const U32 chainSize = (1 << zc->params.cParams.chainLog);
+ const U32 chainMask = chainSize - 1;
+ const BYTE *const base = zc->base;
+ const BYTE *const dictBase = zc->dictBase;
+ const U32 dictLimit = zc->dictLimit;
+ const BYTE *const prefixStart = base + dictLimit;
+ const BYTE *const dictEnd = dictBase + dictLimit;
+ const U32 lowLimit = zc->lowLimit;
+ const U32 curr = (U32)(ip - base);
+ const U32 minChain = curr > chainSize ? curr - chainSize : 0;
+ int nbAttempts = maxNbAttempts;
+ size_t ml = EQUAL_READ32 - 1;
+
+ /* HC4 match finder */
+ U32 matchIndex = ZSTD_insertAndFindFirstIndex(zc, ip, mls);
+
+ for (; (matchIndex > lowLimit) & (nbAttempts > 0); nbAttempts--) {
+ const BYTE *match;
+ size_t currMl = 0;
+ if ((!extDict) || matchIndex >= dictLimit) {
+ match = base + matchIndex;
+ if (match[ml] == ip[ml]) /* potentially better */
+ currMl = ZSTD_count(ip, match, iLimit);
+ } else {
+ match = dictBase + matchIndex;
+ if (ZSTD_read32(match) == ZSTD_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+ currMl = ZSTD_count_2segments(ip + EQUAL_READ32, match + EQUAL_READ32, iLimit, dictEnd, prefixStart) + EQUAL_READ32;
+ }
+
+ /* save best solution */
+ if (currMl > ml) {
+ ml = currMl;
+ *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
+ if (ip + currMl == iLimit)
+ break; /* best possible, and avoid read overflow*/
+ }
+
+ if (matchIndex <= minChain)
+ break;
+ matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
+ }
+
+ return ml;
+}
+
+FORCE_INLINE size_t ZSTD_HcFindBestMatch_selectMLS(ZSTD_CCtx *zc, const BYTE *ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts,
+ const U32 matchLengthSearch)
+{
+ switch (matchLengthSearch) {
+ default: /* includes case 3 */
+ case 4: return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4, 0);
+ case 5: return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5, 0);
+ case 7:
+ case 6: return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6, 0);
+ }
+}
+
+FORCE_INLINE size_t ZSTD_HcFindBestMatch_extDict_selectMLS(ZSTD_CCtx *zc, const BYTE *ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts,
+ const U32 matchLengthSearch)
+{
+ switch (matchLengthSearch) {
+ default: /* includes case 3 */
+ case 4: return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4, 1);
+ case 5: return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5, 1);
+ case 7:
+ case 6: return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6, 1);
+ }
+}
+
+/* *******************************
+* Common parser - lazy strategy
+*********************************/
+FORCE_INLINE
+void ZSTD_compressBlock_lazy_generic(ZSTD_CCtx *ctx, const void *src, size_t srcSize, const U32 searchMethod, const U32 depth)
+{
+ seqStore_t *seqStorePtr = &(ctx->seqStore);
+ const BYTE *const istart = (const BYTE *)src;
+ const BYTE *ip = istart;
+ const BYTE *anchor = istart;
+ const BYTE *const iend = istart + srcSize;
+ const BYTE *const ilimit = iend - 8;
+ const BYTE *const base = ctx->base + ctx->dictLimit;
+
+ U32 const maxSearches = 1 << ctx->params.cParams.searchLog;
+ U32 const mls = ctx->params.cParams.searchLength;
+
+ typedef size_t (*searchMax_f)(ZSTD_CCtx * zc, const BYTE *ip, const BYTE *iLimit, size_t *offsetPtr, U32 maxNbAttempts, U32 matchLengthSearch);
+ searchMax_f const searchMax = searchMethod ? ZSTD_BtFindBestMatch_selectMLS : ZSTD_HcFindBestMatch_selectMLS;
+ U32 offset_1 = ctx->rep[0], offset_2 = ctx->rep[1], savedOffset = 0;
+
+ /* init */
+ ip += (ip == base);
+ ctx->nextToUpdate3 = ctx->nextToUpdate;
+ {
+ U32 const maxRep = (U32)(ip - base);
+ if (offset_2 > maxRep)
+ savedOffset = offset_2, offset_2 = 0;
+ if (offset_1 > maxRep)
+ savedOffset = offset_1, offset_1 = 0;
+ }
+
+ /* Match Loop */
+ while (ip < ilimit) {
+ size_t matchLength = 0;
+ size_t offset = 0;
+ const BYTE *start = ip + 1;
+
+ /* check repCode */
+ if ((offset_1 > 0) & (ZSTD_read32(ip + 1) == ZSTD_read32(ip + 1 - offset_1))) {
+ /* repcode : we take it */
+ matchLength = ZSTD_count(ip + 1 + EQUAL_READ32, ip + 1 + EQUAL_READ32 - offset_1, iend) + EQUAL_READ32;
+ if (depth == 0)
+ goto _storeSequence;
+ }
+
+ /* first search (depth 0) */
+ {
+ size_t offsetFound = 99999999;
+ size_t const ml2 = searchMax(ctx, ip, iend, &offsetFound, maxSearches, mls);
+ if (ml2 > matchLength)
+ matchLength = ml2, start = ip, offset = offsetFound;
+ }
+
+ if (matchLength < EQUAL_READ32) {
+ ip += ((ip - anchor) >> g_searchStrength) + 1; /* jump faster over incompressible sections */
+ continue;
+ }
+
+ /* let's try to find a better solution */
+ if (depth >= 1)
+ while (ip < ilimit) {
+ ip++;
+ if ((offset) && ((offset_1 > 0) & (ZSTD_read32(ip) == ZSTD_read32(ip - offset_1)))) {
+ size_t const mlRep = ZSTD_count(ip + EQUAL_READ32, ip + EQUAL_READ32 - offset_1, iend) + EQUAL_READ32;
+ int const gain2 = (int)(mlRep * 3);
+ int const gain1 = (int)(matchLength * 3 - ZSTD_highbit32((U32)offset + 1) + 1);
+ if ((mlRep >= EQUAL_READ32) && (gain2 > gain1))
+ matchLength = mlRep, offset = 0, start = ip;
+ }
+ {
+ size_t offset2 = 99999999;
+ size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
+ int const gain2 = (int)(ml2 * 4 - ZSTD_highbit32((U32)offset2 + 1)); /* raw approx */
+ int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offset + 1) + 4);
+ if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) {
+ matchLength = ml2, offset = offset2, start = ip;
+ continue; /* search a better one */
+ }
+ }
+
+ /* let's find an even better one */
+ if ((depth == 2) && (ip < ilimit)) {
+ ip++;
+ if ((offset) && ((offset_1 > 0) & (ZSTD_read32(ip) == ZSTD_read32(ip - offset_1)))) {
+ size_t const ml2 = ZSTD_count(ip + EQUAL_READ32, ip + EQUAL_READ32 - offset_1, iend) + EQUAL_READ32;
+ int const gain2 = (int)(ml2 * 4);
+ int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offset + 1) + 1);
+ if ((ml2 >= EQUAL_READ32) && (gain2 > gain1))
+ matchLength = ml2, offset = 0, start = ip;
+ }
+ {
+ size_t offset2 = 99999999;
+ size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
+ int const gain2 = (int)(ml2 * 4 - ZSTD_highbit32((U32)offset2 + 1)); /* raw approx */
+ int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offset + 1) + 7);
+ if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) {
+ matchLength = ml2, offset = offset2, start = ip;
+ continue;
+ }
+ }
+ }
+ break; /* nothing found : store previous solution */
+ }
+
+ /* NOTE:
+ * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior.
+ * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, which
+ * overflows the pointer, which is undefined behavior.
+ */
+ /* catch up */
+ if (offset) {
+ while ((start > anchor) && (start > base + offset - ZSTD_REP_MOVE) &&
+ (start[-1] == (start-offset+ZSTD_REP_MOVE)[-1])) /* only search for offset within prefix */
+ {
+ start--;
+ matchLength++;
+ }
+ offset_2 = offset_1;
+ offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+ }
+
+ /* store sequence */
+_storeSequence:
+ {
+ size_t const litLength = start - anchor;
+ ZSTD_storeSeq(seqStorePtr, litLength, anchor, (U32)offset, matchLength - MINMATCH);
+ anchor = ip = start + matchLength;
+ }
+
+ /* check immediate repcode */
+ while ((ip <= ilimit) && ((offset_2 > 0) & (ZSTD_read32(ip) == ZSTD_read32(ip - offset_2)))) {
+ /* store sequence */
+ matchLength = ZSTD_count(ip + EQUAL_READ32, ip + EQUAL_READ32 - offset_2, iend) + EQUAL_READ32;
+ offset = offset_2;
+ offset_2 = offset_1;
+ offset_1 = (U32)offset; /* swap repcodes */
+ ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, matchLength - MINMATCH);
+ ip += matchLength;
+ anchor = ip;
+ continue; /* faster when present ... (?) */
+ }
+ }
+
+ /* Save reps for next block */
+ ctx->repToConfirm[0] = offset_1 ? offset_1 : savedOffset;
+ ctx->repToConfirm[1] = offset_2 ? offset_2 : savedOffset;
+
+ /* Last Literals */
+ {
+ size_t const lastLLSize = iend - anchor;
+ memcpy(seqStorePtr->lit, anchor, lastLLSize);
+ seqStorePtr->lit += lastLLSize;
+ }
+}
+
+static void ZSTD_compressBlock_btlazy2(ZSTD_CCtx *ctx, const void *src, size_t srcSize) { ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 1, 2); }
+
+static void ZSTD_compressBlock_lazy2(ZSTD_CCtx *ctx, const void *src, size_t srcSize) { ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 2); }
+
+static void ZSTD_compressBlock_lazy(ZSTD_CCtx *ctx, const void *src, size_t srcSize) { ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 1); }
+
+static void ZSTD_compressBlock_greedy(ZSTD_CCtx *ctx, const void *src, size_t srcSize) { ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 0); }
+
+FORCE_INLINE
+void ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx *ctx, const void *src, size_t srcSize, const U32 searchMethod, const U32 depth)
+{
+ seqStore_t *seqStorePtr = &(ctx->seqStore);
+ const BYTE *const istart = (const BYTE *)src;
+ const BYTE *ip = istart;
+ const BYTE *anchor = istart;
+ const BYTE *const iend = istart + srcSize;
+ const BYTE *const ilimit = iend - 8;
+ const BYTE *const base = ctx->base;
+ const U32 dictLimit = ctx->dictLimit;
+ const U32 lowestIndex = ctx->lowLimit;
+ const BYTE *const prefixStart = base + dictLimit;
+ const BYTE *const dictBase = ctx->dictBase;
+ const BYTE *const dictEnd = dictBase + dictLimit;
+ const BYTE *const dictStart = dictBase + ctx->lowLimit;
+
+ const U32 maxSearches = 1 << ctx->params.cParams.searchLog;
+ const U32 mls = ctx->params.cParams.searchLength;
+
+ typedef size_t (*searchMax_f)(ZSTD_CCtx * zc, const BYTE *ip, const BYTE *iLimit, size_t *offsetPtr, U32 maxNbAttempts, U32 matchLengthSearch);
+ searchMax_f searchMax = searchMethod ? ZSTD_BtFindBestMatch_selectMLS_extDict : ZSTD_HcFindBestMatch_extDict_selectMLS;
+
+ U32 offset_1 = ctx->rep[0], offset_2 = ctx->rep[1];
+
+ /* init */
+ ctx->nextToUpdate3 = ctx->nextToUpdate;
+ ip += (ip == prefixStart);
+
+ /* Match Loop */
+ while (ip < ilimit) {
+ size_t matchLength = 0;
+ size_t offset = 0;
+ const BYTE *start = ip + 1;
+ U32 curr = (U32)(ip - base);
+
+ /* check repCode */
+ {
+ const U32 repIndex = (U32)(curr + 1 - offset_1);
+ const BYTE *const repBase = repIndex < dictLimit ? dictBase : base;
+ const BYTE *const repMatch = repBase + repIndex;
+ if (((U32)((dictLimit - 1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
+ if (ZSTD_read32(ip + 1) == ZSTD_read32(repMatch)) {
+ /* repcode detected we should take it */
+ const BYTE *const repEnd = repIndex < dictLimit ? dictEnd : iend;
+ matchLength =
+ ZSTD_count_2segments(ip + 1 + EQUAL_READ32, repMatch + EQUAL_READ32, iend, repEnd, prefixStart) + EQUAL_READ32;
+ if (depth == 0)
+ goto _storeSequence;
+ }
+ }
+
+ /* first search (depth 0) */
+ {
+ size_t offsetFound = 99999999;
+ size_t const ml2 = searchMax(ctx, ip, iend, &offsetFound, maxSearches, mls);
+ if (ml2 > matchLength)
+ matchLength = ml2, start = ip, offset = offsetFound;
+ }
+
+ if (matchLength < EQUAL_READ32) {
+ ip += ((ip - anchor) >> g_searchStrength) + 1; /* jump faster over incompressible sections */
+ continue;
+ }
+
+ /* let's try to find a better solution */
+ if (depth >= 1)
+ while (ip < ilimit) {
+ ip++;
+ curr++;
+ /* check repCode */
+ if (offset) {
+ const U32 repIndex = (U32)(curr - offset_1);
+ const BYTE *const repBase = repIndex < dictLimit ? dictBase : base;
+ const BYTE *const repMatch = repBase + repIndex;
+ if (((U32)((dictLimit - 1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
+ if (ZSTD_read32(ip) == ZSTD_read32(repMatch)) {
+ /* repcode detected */
+ const BYTE *const repEnd = repIndex < dictLimit ? dictEnd : iend;
+ size_t const repLength =
+ ZSTD_count_2segments(ip + EQUAL_READ32, repMatch + EQUAL_READ32, iend, repEnd, prefixStart) +
+ EQUAL_READ32;
+ int const gain2 = (int)(repLength * 3);
+ int const gain1 = (int)(matchLength * 3 - ZSTD_highbit32((U32)offset + 1) + 1);
+ if ((repLength >= EQUAL_READ32) && (gain2 > gain1))
+ matchLength = repLength, offset = 0, start = ip;
+ }
+ }
+
+ /* search match, depth 1 */
+ {
+ size_t offset2 = 99999999;
+ size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
+ int const gain2 = (int)(ml2 * 4 - ZSTD_highbit32((U32)offset2 + 1)); /* raw approx */
+ int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offset + 1) + 4);
+ if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) {
+ matchLength = ml2, offset = offset2, start = ip;
+ continue; /* search a better one */
+ }
+ }
+
+ /* let's find an even better one */
+ if ((depth == 2) && (ip < ilimit)) {
+ ip++;
+ curr++;
+ /* check repCode */
+ if (offset) {
+ const U32 repIndex = (U32)(curr - offset_1);
+ const BYTE *const repBase = repIndex < dictLimit ? dictBase : base;
+ const BYTE *const repMatch = repBase + repIndex;
+ if (((U32)((dictLimit - 1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
+ if (ZSTD_read32(ip) == ZSTD_read32(repMatch)) {
+ /* repcode detected */
+ const BYTE *const repEnd = repIndex < dictLimit ? dictEnd : iend;
+ size_t repLength = ZSTD_count_2segments(ip + EQUAL_READ32, repMatch + EQUAL_READ32, iend,
+ repEnd, prefixStart) +
+ EQUAL_READ32;
+ int gain2 = (int)(repLength * 4);
+ int gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offset + 1) + 1);
+ if ((repLength >= EQUAL_READ32) && (gain2 > gain1))
+ matchLength = repLength, offset = 0, start = ip;
+ }
+ }
+
+ /* search match, depth 2 */
+ {
+ size_t offset2 = 99999999;
+ size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
+ int const gain2 = (int)(ml2 * 4 - ZSTD_highbit32((U32)offset2 + 1)); /* raw approx */
+ int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offset + 1) + 7);
+ if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) {
+ matchLength = ml2, offset = offset2, start = ip;
+ continue;
+ }
+ }
+ }
+ break; /* nothing found : store previous solution */
+ }
+
+ /* catch up */
+ if (offset) {
+ U32 const matchIndex = (U32)((start - base) - (offset - ZSTD_REP_MOVE));
+ const BYTE *match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
+ const BYTE *const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
+ while ((start > anchor) && (match > mStart) && (start[-1] == match[-1])) {
+ start--;
+ match--;
+ matchLength++;
+ } /* catch up */
+ offset_2 = offset_1;
+ offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+ }
+
+ /* store sequence */
+ _storeSequence : {
+ size_t const litLength = start - anchor;
+ ZSTD_storeSeq(seqStorePtr, litLength, anchor, (U32)offset, matchLength - MINMATCH);
+ anchor = ip = start + matchLength;
+ }
+
+ /* check immediate repcode */
+ while (ip <= ilimit) {
+ const U32 repIndex = (U32)((ip - base) - offset_2);
+ const BYTE *const repBase = repIndex < dictLimit ? dictBase : base;
+ const BYTE *const repMatch = repBase + repIndex;
+ if (((U32)((dictLimit - 1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
+ if (ZSTD_read32(ip) == ZSTD_read32(repMatch)) {
+ /* repcode detected we should take it */
+ const BYTE *const repEnd = repIndex < dictLimit ? dictEnd : iend;
+ matchLength =
+ ZSTD_count_2segments(ip + EQUAL_READ32, repMatch + EQUAL_READ32, iend, repEnd, prefixStart) + EQUAL_READ32;
+ offset = offset_2;
+ offset_2 = offset_1;
+ offset_1 = (U32)offset; /* swap offset history */
+ ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, matchLength - MINMATCH);
+ ip += matchLength;
+ anchor = ip;
+ continue; /* faster when present ... (?) */
+ }
+ break;
+ }
+ }
+
+ /* Save reps for next block */
+ ctx->repToConfirm[0] = offset_1;
+ ctx->repToConfirm[1] = offset_2;
+
+ /* Last Literals */
+ {
+ size_t const lastLLSize = iend - anchor;
+ memcpy(seqStorePtr->lit, anchor, lastLLSize);
+ seqStorePtr->lit += lastLLSize;
+ }
+}
+
+void ZSTD_compressBlock_greedy_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize) { ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 0); }
+
+static void ZSTD_compressBlock_lazy_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+ ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 1);
+}
+
+static void ZSTD_compressBlock_lazy2_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+ ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 2);
+}
+
+static void ZSTD_compressBlock_btlazy2_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+ ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 1, 2);
+}
+
+/* The optimal parser */
+#include "zstd_opt.h"
+
+static void ZSTD_compressBlock_btopt(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+#ifdef ZSTD_OPT_H_91842398743
+ ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 0);
+#else
+ (void)ctx;
+ (void)src;
+ (void)srcSize;
+ return;
+#endif
+}
+
+static void ZSTD_compressBlock_btopt2(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+#ifdef ZSTD_OPT_H_91842398743
+ ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 1);
+#else
+ (void)ctx;
+ (void)src;
+ (void)srcSize;
+ return;
+#endif
+}
+
+static void ZSTD_compressBlock_btopt_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+#ifdef ZSTD_OPT_H_91842398743
+ ZSTD_compressBlock_opt_extDict_generic(ctx, src, srcSize, 0);
+#else
+ (void)ctx;
+ (void)src;
+ (void)srcSize;
+ return;
+#endif
+}
+
+static void ZSTD_compressBlock_btopt2_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+#ifdef ZSTD_OPT_H_91842398743
+ ZSTD_compressBlock_opt_extDict_generic(ctx, src, srcSize, 1);
+#else
+ (void)ctx;
+ (void)src;
+ (void)srcSize;
+ return;
+#endif
+}
+
+typedef void (*ZSTD_blockCompressor)(ZSTD_CCtx *ctx, const void *src, size_t srcSize);
+
+static ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, int extDict)
+{
+ static const ZSTD_blockCompressor blockCompressor[2][8] = {
+ {ZSTD_compressBlock_fast, ZSTD_compressBlock_doubleFast, ZSTD_compressBlock_greedy, ZSTD_compressBlock_lazy, ZSTD_compressBlock_lazy2,
+ ZSTD_compressBlock_btlazy2, ZSTD_compressBlock_btopt, ZSTD_compressBlock_btopt2},
+ {ZSTD_compressBlock_fast_extDict, ZSTD_compressBlock_doubleFast_extDict, ZSTD_compressBlock_greedy_extDict, ZSTD_compressBlock_lazy_extDict,
+ ZSTD_compressBlock_lazy2_extDict, ZSTD_compressBlock_btlazy2_extDict, ZSTD_compressBlock_btopt_extDict, ZSTD_compressBlock_btopt2_extDict}};
+
+ return blockCompressor[extDict][(U32)strat];
+}
+
+static size_t ZSTD_compressBlock_internal(ZSTD_CCtx *zc, void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{
+ ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->params.cParams.strategy, zc->lowLimit < zc->dictLimit);
+ const BYTE *const base = zc->base;
+ const BYTE *const istart = (const BYTE *)src;
+ const U32 curr = (U32)(istart - base);
+ if (srcSize < MIN_CBLOCK_SIZE + ZSTD_blockHeaderSize + 1)
+ return 0; /* don't even attempt compression below a certain srcSize */
+ ZSTD_resetSeqStore(&(zc->seqStore));
+ if (curr > zc->nextToUpdate + 384)
+ zc->nextToUpdate = curr - MIN(192, (U32)(curr - zc->nextToUpdate - 384)); /* update tree not updated after finding very long rep matches */
+ blockCompressor(zc, src, srcSize);
+ return ZSTD_compressSequences(zc, dst, dstCapacity, srcSize);
+}
+
+/*! ZSTD_compress_generic() :
+* Compress a chunk of data into one or multiple blocks.
+* All blocks will be terminated, all input will be consumed.
+* Function will issue an error if there is not enough `dstCapacity` to hold the compressed content.
+* Frame is supposed already started (header already produced)
+* @return : compressed size, or an error code
+*/
+static size_t ZSTD_compress_generic(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, U32 lastFrameChunk)
+{
+ size_t blockSize = cctx->blockSize;
+ size_t remaining = srcSize;
+ const BYTE *ip = (const BYTE *)src;
+ BYTE *const ostart = (BYTE *)dst;
+ BYTE *op = ostart;
+ U32 const maxDist = 1 << cctx->params.cParams.windowLog;
+
+ if (cctx->params.fParams.checksumFlag && srcSize)
+ xxh64_update(&cctx->xxhState, src, srcSize);
+
+ while (remaining) {
+ U32 const lastBlock = lastFrameChunk & (blockSize >= remaining);
+ size_t cSize;
+
+ if (dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE)
+ return ERROR(dstSize_tooSmall); /* not enough space to store compressed block */
+ if (remaining < blockSize)
+ blockSize = remaining;
+
+ /* preemptive overflow correction */
+ if (cctx->lowLimit > (3U << 29)) {
+ U32 const cycleMask = (1 << ZSTD_cycleLog(cctx->params.cParams.hashLog, cctx->params.cParams.strategy)) - 1;
+ U32 const curr = (U32)(ip - cctx->base);
+ U32 const newCurr = (curr & cycleMask) + (1 << cctx->params.cParams.windowLog);
+ U32 const correction = curr - newCurr;
+ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_64 <= 30);
+ ZSTD_reduceIndex(cctx, correction);
+ cctx->base += correction;
+ cctx->dictBase += correction;
+ cctx->lowLimit -= correction;
+ cctx->dictLimit -= correction;
+ if (cctx->nextToUpdate < correction)
+ cctx->nextToUpdate = 0;
+ else
+ cctx->nextToUpdate -= correction;
+ }
+
+ if ((U32)(ip + blockSize - cctx->base) > cctx->loadedDictEnd + maxDist) {
+ /* enforce maxDist */
+ U32 const newLowLimit = (U32)(ip + blockSize - cctx->base) - maxDist;
+ if (cctx->lowLimit < newLowLimit)
+ cctx->lowLimit = newLowLimit;
+ if (cctx->dictLimit < cctx->lowLimit)
+ cctx->dictLimit = cctx->lowLimit;
+ }
+
+ cSize = ZSTD_compressBlock_internal(cctx, op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize, ip, blockSize);
+ if (ZSTD_isError(cSize))
+ return cSize;
+
+ if (cSize == 0) { /* block is not compressible */
+ U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw) << 1) + (U32)(blockSize << 3);
+ if (blockSize + ZSTD_blockHeaderSize > dstCapacity)
+ return ERROR(dstSize_tooSmall);
+ ZSTD_writeLE32(op, cBlockHeader24); /* no pb, 4th byte will be overwritten */
+ memcpy(op + ZSTD_blockHeaderSize, ip, blockSize);
+ cSize = ZSTD_blockHeaderSize + blockSize;
+ } else {
+ U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed) << 1) + (U32)(cSize << 3);
+ ZSTD_writeLE24(op, cBlockHeader24);
+ cSize += ZSTD_blockHeaderSize;
+ }
+
+ remaining -= blockSize;
+ dstCapacity -= cSize;
+ ip += blockSize;
+ op += cSize;
+ }
+
+ if (lastFrameChunk && (op > ostart))
+ cctx->stage = ZSTDcs_ending;
+ return op - ostart;
+}
+
+static size_t ZSTD_writeFrameHeader(void *dst, size_t dstCapacity, ZSTD_parameters params, U64 pledgedSrcSize, U32 dictID)
+{
+ BYTE *const op = (BYTE *)dst;
+ U32 const dictIDSizeCode = (dictID > 0) + (dictID >= 256) + (dictID >= 65536); /* 0-3 */
+ U32 const checksumFlag = params.fParams.checksumFlag > 0;
+ U32 const windowSize = 1U << params.cParams.windowLog;
+ U32 const singleSegment = params.fParams.contentSizeFlag && (windowSize >= pledgedSrcSize);
+ BYTE const windowLogByte = (BYTE)((params.cParams.windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN) << 3);
+ U32 const fcsCode =
+ params.fParams.contentSizeFlag ? (pledgedSrcSize >= 256) + (pledgedSrcSize >= 65536 + 256) + (pledgedSrcSize >= 0xFFFFFFFFU) : 0; /* 0-3 */
+ BYTE const frameHeaderDecriptionByte = (BYTE)(dictIDSizeCode + (checksumFlag << 2) + (singleSegment << 5) + (fcsCode << 6));
+ size_t pos;
+
+ if (dstCapacity < ZSTD_frameHeaderSize_max)
+ return ERROR(dstSize_tooSmall);
+
+ ZSTD_writeLE32(dst, ZSTD_MAGICNUMBER);
+ op[4] = frameHeaderDecriptionByte;
+ pos = 5;
+ if (!singleSegment)
+ op[pos++] = windowLogByte;
+ switch (dictIDSizeCode) {
+ default: /* impossible */
+ case 0: break;
+ case 1:
+ op[pos] = (BYTE)(dictID);
+ pos++;
+ break;
+ case 2:
+ ZSTD_writeLE16(op + pos, (U16)dictID);
+ pos += 2;
+ break;
+ case 3:
+ ZSTD_writeLE32(op + pos, dictID);
+ pos += 4;
+ break;
+ }
+ switch (fcsCode) {
+ default: /* impossible */
+ case 0:
+ if (singleSegment)
+ op[pos++] = (BYTE)(pledgedSrcSize);
+ break;
+ case 1:
+ ZSTD_writeLE16(op + pos, (U16)(pledgedSrcSize - 256));
+ pos += 2;
+ break;
+ case 2:
+ ZSTD_writeLE32(op + pos, (U32)(pledgedSrcSize));
+ pos += 4;
+ break;
+ case 3:
+ ZSTD_writeLE64(op + pos, (U64)(pledgedSrcSize));
+ pos += 8;
+ break;
+ }
+ return pos;
+}
+
+static size_t ZSTD_compressContinue_internal(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, U32 frame, U32 lastFrameChunk)
+{
+ const BYTE *const ip = (const BYTE *)src;
+ size_t fhSize = 0;
+
+ if (cctx->stage == ZSTDcs_created)
+ return ERROR(stage_wrong); /* missing init (ZSTD_compressBegin) */
+
+ if (frame && (cctx->stage == ZSTDcs_init)) {
+ fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, cctx->params, cctx->frameContentSize, cctx->dictID);
+ if (ZSTD_isError(fhSize))
+ return fhSize;
+ dstCapacity -= fhSize;
+ dst = (char *)dst + fhSize;
+ cctx->stage = ZSTDcs_ongoing;
+ }
+
+ /* Check if blocks follow each other */
+ if (src != cctx->nextSrc) {
+ /* not contiguous */
+ ptrdiff_t const delta = cctx->nextSrc - ip;
+ cctx->lowLimit = cctx->dictLimit;
+ cctx->dictLimit = (U32)(cctx->nextSrc - cctx->base);
+ cctx->dictBase = cctx->base;
+ cctx->base -= delta;
+ cctx->nextToUpdate = cctx->dictLimit;
+ if (cctx->dictLimit - cctx->lowLimit < HASH_READ_SIZE)
+ cctx->lowLimit = cctx->dictLimit; /* too small extDict */
+ }
+
+ /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */
+ if ((ip + srcSize > cctx->dictBase + cctx->lowLimit) & (ip < cctx->dictBase + cctx->dictLimit)) {
+ ptrdiff_t const highInputIdx = (ip + srcSize) - cctx->dictBase;
+ U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)cctx->dictLimit) ? cctx->dictLimit : (U32)highInputIdx;
+ cctx->lowLimit = lowLimitMax;
+ }
+
+ cctx->nextSrc = ip + srcSize;
+
+ if (srcSize) {
+ size_t const cSize = frame ? ZSTD_compress_generic(cctx, dst, dstCapacity, src, srcSize, lastFrameChunk)
+ : ZSTD_compressBlock_internal(cctx, dst, dstCapacity, src, srcSize);
+ if (ZSTD_isError(cSize))
+ return cSize;
+ return cSize + fhSize;
+ } else
+ return fhSize;
+}
+
+size_t ZSTD_compressContinue(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{
+ return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1, 0);
+}
+
+size_t ZSTD_getBlockSizeMax(ZSTD_CCtx *cctx) { return MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, 1 << cctx->params.cParams.windowLog); }
+
+size_t ZSTD_compressBlock(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{
+ size_t const blockSizeMax = ZSTD_getBlockSizeMax(cctx);
+ if (srcSize > blockSizeMax)
+ return ERROR(srcSize_wrong);
+ return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0, 0);
+}
+
+/*! ZSTD_loadDictionaryContent() :
+ * @return : 0, or an error code
+ */
+static size_t ZSTD_loadDictionaryContent(ZSTD_CCtx *zc, const void *src, size_t srcSize)
+{
+ const BYTE *const ip = (const BYTE *)src;
+ const BYTE *const iend = ip + srcSize;
+
+ /* input becomes curr prefix */
+ zc->lowLimit = zc->dictLimit;
+ zc->dictLimit = (U32)(zc->nextSrc - zc->base);
+ zc->dictBase = zc->base;
+ zc->base += ip - zc->nextSrc;
+ zc->nextToUpdate = zc->dictLimit;
+ zc->loadedDictEnd = zc->forceWindow ? 0 : (U32)(iend - zc->base);
+
+ zc->nextSrc = iend;
+ if (srcSize <= HASH_READ_SIZE)
+ return 0;
+
+ switch (zc->params.cParams.strategy) {
+ case ZSTD_fast: ZSTD_fillHashTable(zc, iend, zc->params.cParams.searchLength); break;
+
+ case ZSTD_dfast: ZSTD_fillDoubleHashTable(zc, iend, zc->params.cParams.searchLength); break;
+
+ case ZSTD_greedy:
+ case ZSTD_lazy:
+ case ZSTD_lazy2:
+ if (srcSize >= HASH_READ_SIZE)
+ ZSTD_insertAndFindFirstIndex(zc, iend - HASH_READ_SIZE, zc->params.cParams.searchLength);
+ break;
+
+ case ZSTD_btlazy2:
+ case ZSTD_btopt:
+ case ZSTD_btopt2:
+ if (srcSize >= HASH_READ_SIZE)
+ ZSTD_updateTree(zc, iend - HASH_READ_SIZE, iend, 1 << zc->params.cParams.searchLog, zc->params.cParams.searchLength);
+ break;
+
+ default:
+ return ERROR(GENERIC); /* strategy doesn't exist; impossible */
+ }
+
+ zc->nextToUpdate = (U32)(iend - zc->base);
+ return 0;
+}
+
+/* Dictionaries that assign zero probability to symbols that show up causes problems
+ when FSE encoding. Refuse dictionaries that assign zero probability to symbols
+ that we may encounter during compression.
+ NOTE: This behavior is not standard and could be improved in the future. */
+static size_t ZSTD_checkDictNCount(short *normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue)
+{
+ U32 s;
+ if (dictMaxSymbolValue < maxSymbolValue)
+ return ERROR(dictionary_corrupted);
+ for (s = 0; s <= maxSymbolValue; ++s) {
+ if (normalizedCounter[s] == 0)
+ return ERROR(dictionary_corrupted);
+ }
+ return 0;
+}
+
+/* Dictionary format :
+ * See :
+ * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
+ */
+/*! ZSTD_loadZstdDictionary() :
+ * @return : 0, or an error code
+ * assumptions : magic number supposed already checked
+ * dictSize supposed > 8
+ */
+static size_t ZSTD_loadZstdDictionary(ZSTD_CCtx *cctx, const void *dict, size_t dictSize)
+{
+ const BYTE *dictPtr = (const BYTE *)dict;
+ const BYTE *const dictEnd = dictPtr + dictSize;
+ short offcodeNCount[MaxOff + 1];
+ unsigned offcodeMaxValue = MaxOff;
+
+ dictPtr += 4; /* skip magic number */
+ cctx->dictID = cctx->params.fParams.noDictIDFlag ? 0 : ZSTD_readLE32(dictPtr);
+ dictPtr += 4;
+
+ {
+ size_t const hufHeaderSize = HUF_readCTable_wksp(cctx->hufTable, 255, dictPtr, dictEnd - dictPtr, cctx->tmpCounters, sizeof(cctx->tmpCounters));
+ if (HUF_isError(hufHeaderSize))
+ return ERROR(dictionary_corrupted);
+ dictPtr += hufHeaderSize;
+ }
+
+ {
+ unsigned offcodeLog;
+ size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd - dictPtr);
+ if (FSE_isError(offcodeHeaderSize))
+ return ERROR(dictionary_corrupted);
+ if (offcodeLog > OffFSELog)
+ return ERROR(dictionary_corrupted);
+ /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
+ CHECK_E(FSE_buildCTable_wksp(cctx->offcodeCTable, offcodeNCount, offcodeMaxValue, offcodeLog, cctx->tmpCounters, sizeof(cctx->tmpCounters)),
+ dictionary_corrupted);
+ dictPtr += offcodeHeaderSize;
+ }
+
+ {
+ short matchlengthNCount[MaxML + 1];
+ unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+ size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd - dictPtr);
+ if (FSE_isError(matchlengthHeaderSize))
+ return ERROR(dictionary_corrupted);
+ if (matchlengthLog > MLFSELog)
+ return ERROR(dictionary_corrupted);
+ /* Every match length code must have non-zero probability */
+ CHECK_F(ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML));
+ CHECK_E(
+ FSE_buildCTable_wksp(cctx->matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, cctx->tmpCounters, sizeof(cctx->tmpCounters)),
+ dictionary_corrupted);
+ dictPtr += matchlengthHeaderSize;
+ }
+
+ {
+ short litlengthNCount[MaxLL + 1];
+ unsigned litlengthMaxValue = MaxLL, litlengthLog;
+ size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd - dictPtr);
+ if (FSE_isError(litlengthHeaderSize))
+ return ERROR(dictionary_corrupted);
+ if (litlengthLog > LLFSELog)
+ return ERROR(dictionary_corrupted);
+ /* Every literal length code must have non-zero probability */
+ CHECK_F(ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL));
+ CHECK_E(FSE_buildCTable_wksp(cctx->litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, cctx->tmpCounters, sizeof(cctx->tmpCounters)),
+ dictionary_corrupted);
+ dictPtr += litlengthHeaderSize;
+ }
+
+ if (dictPtr + 12 > dictEnd)
+ return ERROR(dictionary_corrupted);
+ cctx->rep[0] = ZSTD_readLE32(dictPtr + 0);
+ cctx->rep[1] = ZSTD_readLE32(dictPtr + 4);
+ cctx->rep[2] = ZSTD_readLE32(dictPtr + 8);
+ dictPtr += 12;
+
+ {
+ size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
+ U32 offcodeMax = MaxOff;
+ if (dictContentSize <= ((U32)-1) - 128 KB) {
+ U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */
+ offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */
+ }
+ /* All offset values <= dictContentSize + 128 KB must be representable */
+ CHECK_F(ZSTD_checkDictNCount(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff)));
+ /* All repCodes must be <= dictContentSize and != 0*/
+ {
+ U32 u;
+ for (u = 0; u < 3; u++) {
+ if (cctx->rep[u] == 0)
+ return ERROR(dictionary_corrupted);
+ if (cctx->rep[u] > dictContentSize)
+ return ERROR(dictionary_corrupted);
+ }
+ }
+
+ cctx->flagStaticTables = 1;
+ cctx->flagStaticHufTable = HUF_repeat_valid;
+ return ZSTD_loadDictionaryContent(cctx, dictPtr, dictContentSize);
+ }
+}
+
+/** ZSTD_compress_insertDictionary() :
+* @return : 0, or an error code */
+static size_t ZSTD_compress_insertDictionary(ZSTD_CCtx *cctx, const void *dict, size_t dictSize)
+{
+ if ((dict == NULL) || (dictSize <= 8))
+ return 0;
+
+ /* dict as pure content */
+ if ((ZSTD_readLE32(dict) != ZSTD_DICT_MAGIC) || (cctx->forceRawDict))
+ return ZSTD_loadDictionaryContent(cctx, dict, dictSize);
+
+ /* dict as zstd dictionary */
+ return ZSTD_loadZstdDictionary(cctx, dict, dictSize);
+}
+
+/*! ZSTD_compressBegin_internal() :
+* @return : 0, or an error code */
+static size_t ZSTD_compressBegin_internal(ZSTD_CCtx *cctx, const void *dict, size_t dictSize, ZSTD_parameters params, U64 pledgedSrcSize)
+{
+ ZSTD_compResetPolicy_e const crp = dictSize ? ZSTDcrp_fullReset : ZSTDcrp_continue;
+ CHECK_F(ZSTD_resetCCtx_advanced(cctx, params, pledgedSrcSize, crp));
+ return ZSTD_compress_insertDictionary(cctx, dict, dictSize);
+}
+
+/*! ZSTD_compressBegin_advanced() :
+* @return : 0, or an error code */
+size_t ZSTD_compressBegin_advanced(ZSTD_CCtx *cctx, const void *dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize)
+{
+ /* compression parameters verification and optimization */
+ CHECK_F(ZSTD_checkCParams(params.cParams));
+ return ZSTD_compressBegin_internal(cctx, dict, dictSize, params, pledgedSrcSize);
+}
+
+size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx *cctx, const void *dict, size_t dictSize, int compressionLevel)
+{
+ ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, dictSize);
+ return ZSTD_compressBegin_internal(cctx, dict, dictSize, params, 0);
+}
+
+size_t ZSTD_compressBegin(ZSTD_CCtx *cctx, int compressionLevel) { return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); }
+
+/*! ZSTD_writeEpilogue() :
+* Ends a frame.
+* @return : nb of bytes written into dst (or an error code) */
+static size_t ZSTD_writeEpilogue(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity)
+{
+ BYTE *const ostart = (BYTE *)dst;
+ BYTE *op = ostart;
+ size_t fhSize = 0;
+
+ if (cctx->stage == ZSTDcs_created)
+ return ERROR(stage_wrong); /* init missing */
+
+ /* special case : empty frame */
+ if (cctx->stage == ZSTDcs_init) {
+ fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, cctx->params, 0, 0);
+ if (ZSTD_isError(fhSize))
+ return fhSize;
+ dstCapacity -= fhSize;
+ op += fhSize;
+ cctx->stage = ZSTDcs_ongoing;
+ }
+
+ if (cctx->stage != ZSTDcs_ending) {
+ /* write one last empty block, make it the "last" block */
+ U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw) << 1) + 0;
+ if (dstCapacity < 4)
+ return ERROR(dstSize_tooSmall);
+ ZSTD_writeLE32(op, cBlockHeader24);
+ op += ZSTD_blockHeaderSize;
+ dstCapacity -= ZSTD_blockHeaderSize;
+ }
+
+ if (cctx->params.fParams.checksumFlag) {
+ U32 const checksum = (U32)xxh64_digest(&cctx->xxhState);
+ if (dstCapacity < 4)
+ return ERROR(dstSize_tooSmall);
+ ZSTD_writeLE32(op, checksum);
+ op += 4;
+ }
+
+ cctx->stage = ZSTDcs_created; /* return to "created but no init" status */
+ return op - ostart;
+}
+
+size_t ZSTD_compressEnd(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{
+ size_t endResult;
+ size_t const cSize = ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1, 1);
+ if (ZSTD_isError(cSize))
+ return cSize;
+ endResult = ZSTD_writeEpilogue(cctx, (char *)dst + cSize, dstCapacity - cSize);
+ if (ZSTD_isError(endResult))
+ return endResult;
+ return cSize + endResult;
+}
+
+static size_t ZSTD_compress_internal(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, const void *dict, size_t dictSize,
+ ZSTD_parameters params)
+{
+ CHECK_F(ZSTD_compressBegin_internal(cctx, dict, dictSize, params, srcSize));
+ return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+}
+
+size_t ZSTD_compress_usingDict(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, const void *dict, size_t dictSize,
+ ZSTD_parameters params)
+{
+ return ZSTD_compress_internal(ctx, dst, dstCapacity, src, srcSize, dict, dictSize, params);
+}
+
+size_t ZSTD_compressCCtx(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, ZSTD_parameters params)
+{
+ return ZSTD_compress_internal(ctx, dst, dstCapacity, src, srcSize, NULL, 0, params);
+}
+
+/* ===== Dictionary API ===== */
+
+struct ZSTD_CDict_s {
+ void *dictBuffer;
+ const void *dictContent;
+ size_t dictContentSize;
+ ZSTD_CCtx *refContext;
+}; /* typedef'd tp ZSTD_CDict within "zstd.h" */
+
+size_t ZSTD_CDictWorkspaceBound(ZSTD_compressionParameters cParams) { return ZSTD_CCtxWorkspaceBound(cParams) + ZSTD_ALIGN(sizeof(ZSTD_CDict)); }
+
+static ZSTD_CDict *ZSTD_createCDict_advanced(const void *dictBuffer, size_t dictSize, unsigned byReference, ZSTD_parameters params, ZSTD_customMem customMem)
+{
+ if (!customMem.customAlloc || !customMem.customFree)
+ return NULL;
+
+ {
+ ZSTD_CDict *const cdict = (ZSTD_CDict *)ZSTD_malloc(sizeof(ZSTD_CDict), customMem);
+ ZSTD_CCtx *const cctx = ZSTD_createCCtx_advanced(customMem);
+
+ if (!cdict || !cctx) {
+ ZSTD_free(cdict, customMem);
+ ZSTD_freeCCtx(cctx);
+ return NULL;
+ }
+
+ if ((byReference) || (!dictBuffer) || (!dictSize)) {
+ cdict->dictBuffer = NULL;
+ cdict->dictContent = dictBuffer;
+ } else {
+ void *const internalBuffer = ZSTD_malloc(dictSize, customMem);
+ if (!internalBuffer) {
+ ZSTD_free(cctx, customMem);
+ ZSTD_free(cdict, customMem);
+ return NULL;
+ }
+ memcpy(internalBuffer, dictBuffer, dictSize);
+ cdict->dictBuffer = internalBuffer;
+ cdict->dictContent = internalBuffer;
+ }
+
+ {
+ size_t const errorCode = ZSTD_compressBegin_advanced(cctx, cdict->dictContent, dictSize, params, 0);
+ if (ZSTD_isError(errorCode)) {
+ ZSTD_free(cdict->dictBuffer, customMem);
+ ZSTD_free(cdict, customMem);
+ ZSTD_freeCCtx(cctx);
+ return NULL;
+ }
+ }
+
+ cdict->refContext = cctx;
+ cdict->dictContentSize = dictSize;
+ return cdict;
+ }
+}
+
+ZSTD_CDict *ZSTD_initCDict(const void *dict, size_t dictSize, ZSTD_parameters params, void *workspace, size_t workspaceSize)
+{
+ ZSTD_customMem const stackMem = ZSTD_initStack(workspace, workspaceSize);
+ return ZSTD_createCDict_advanced(dict, dictSize, 1, params, stackMem);
+}
+
+size_t ZSTD_freeCDict(ZSTD_CDict *cdict)
+{
+ if (cdict == NULL)
+ return 0; /* support free on NULL */
+ {
+ ZSTD_customMem const cMem = cdict->refContext->customMem;
+ ZSTD_freeCCtx(cdict->refContext);
+ ZSTD_free(cdict->dictBuffer, cMem);
+ ZSTD_free(cdict, cMem);
+ return 0;
+ }
+}
+
+static ZSTD_parameters ZSTD_getParamsFromCDict(const ZSTD_CDict *cdict) { return ZSTD_getParamsFromCCtx(cdict->refContext); }
+
+size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx *cctx, const ZSTD_CDict *cdict, unsigned long long pledgedSrcSize)
+{
+ if (cdict->dictContentSize)
+ CHECK_F(ZSTD_copyCCtx(cctx, cdict->refContext, pledgedSrcSize))
+ else {
+ ZSTD_parameters params = cdict->refContext->params;
+ params.fParams.contentSizeFlag = (pledgedSrcSize > 0);
+ CHECK_F(ZSTD_compressBegin_advanced(cctx, NULL, 0, params, pledgedSrcSize));
+ }
+ return 0;
+}
+
+/*! ZSTD_compress_usingCDict() :
+* Compression using a digested Dictionary.
+* Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
+* Note that compression level is decided during dictionary creation */
+size_t ZSTD_compress_usingCDict(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, const ZSTD_CDict *cdict)
+{
+ CHECK_F(ZSTD_compressBegin_usingCDict(cctx, cdict, srcSize));
+
+ if (cdict->refContext->params.fParams.contentSizeFlag == 1) {
+ cctx->params.fParams.contentSizeFlag = 1;
+ cctx->frameContentSize = srcSize;
+ } else {
+ cctx->params.fParams.contentSizeFlag = 0;
+ }
+
+ return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+}
+
+/* ******************************************************************
+* Streaming
+********************************************************************/
+
+typedef enum { zcss_init, zcss_load, zcss_flush, zcss_final } ZSTD_cStreamStage;
+
+struct ZSTD_CStream_s {
+ ZSTD_CCtx *cctx;
+ ZSTD_CDict *cdictLocal;
+ const ZSTD_CDict *cdict;
+ char *inBuff;
+ size_t inBuffSize;
+ size_t inToCompress;
+ size_t inBuffPos;
+ size_t inBuffTarget;
+ size_t blockSize;
+ char *outBuff;
+ size_t outBuffSize;
+ size_t outBuffContentSize;
+ size_t outBuffFlushedSize;
+ ZSTD_cStreamStage stage;
+ U32 checksum;
+ U32 frameEnded;
+ U64 pledgedSrcSize;
+ U64 inputProcessed;
+ ZSTD_parameters params;
+ ZSTD_customMem customMem;
+}; /* typedef'd to ZSTD_CStream within "zstd.h" */
+
+size_t ZSTD_CStreamWorkspaceBound(ZSTD_compressionParameters cParams)
+{
+ size_t const inBuffSize = (size_t)1 << cParams.windowLog;
+ size_t const blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, inBuffSize);
+ size_t const outBuffSize = ZSTD_compressBound(blockSize) + 1;
+
+ return ZSTD_CCtxWorkspaceBound(cParams) + ZSTD_ALIGN(sizeof(ZSTD_CStream)) + ZSTD_ALIGN(inBuffSize) + ZSTD_ALIGN(outBuffSize);
+}
+
+ZSTD_CStream *ZSTD_createCStream_advanced(ZSTD_customMem customMem)
+{
+ ZSTD_CStream *zcs;
+
+ if (!customMem.customAlloc || !customMem.customFree)
+ return NULL;
+
+ zcs = (ZSTD_CStream *)ZSTD_malloc(sizeof(ZSTD_CStream), customMem);
+ if (zcs == NULL)
+ return NULL;
+ memset(zcs, 0, sizeof(ZSTD_CStream));
+ memcpy(&zcs->customMem, &customMem, sizeof(ZSTD_customMem));
+ zcs->cctx = ZSTD_createCCtx_advanced(customMem);
+ if (zcs->cctx == NULL) {
+ ZSTD_freeCStream(zcs);
+ return NULL;
+ }
+ return zcs;
+}
+
+size_t ZSTD_freeCStream(ZSTD_CStream *zcs)
+{
+ if (zcs == NULL)
+ return 0; /* support free on NULL */
+ {
+ ZSTD_customMem const cMem = zcs->customMem;
+ ZSTD_freeCCtx(zcs->cctx);
+ zcs->cctx = NULL;
+ ZSTD_freeCDict(zcs->cdictLocal);
+ zcs->cdictLocal = NULL;
+ ZSTD_free(zcs->inBuff, cMem);
+ zcs->inBuff = NULL;
+ ZSTD_free(zcs->outBuff, cMem);
+ zcs->outBuff = NULL;
+ ZSTD_free(zcs, cMem);
+ return 0;
+ }
+}
+
+/*====== Initialization ======*/
+
+size_t ZSTD_CStreamInSize(void) { return ZSTD_BLOCKSIZE_ABSOLUTEMAX; }
+size_t ZSTD_CStreamOutSize(void) { return ZSTD_compressBound(ZSTD_BLOCKSIZE_ABSOLUTEMAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */; }
+
+static size_t ZSTD_resetCStream_internal(ZSTD_CStream *zcs, unsigned long long pledgedSrcSize)
+{
+ if (zcs->inBuffSize == 0)
+ return ERROR(stage_wrong); /* zcs has not been init at least once => can't reset */
+
+ if (zcs->cdict)
+ CHECK_F(ZSTD_compressBegin_usingCDict(zcs->cctx, zcs->cdict, pledgedSrcSize))
+ else
+ CHECK_F(ZSTD_compressBegin_advanced(zcs->cctx, NULL, 0, zcs->params, pledgedSrcSize));
+
+ zcs->inToCompress = 0;
+ zcs->inBuffPos = 0;
+ zcs->inBuffTarget = zcs->blockSize;
+ zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0;
+ zcs->stage = zcss_load;
+ zcs->frameEnded = 0;
+ zcs->pledgedSrcSize = pledgedSrcSize;
+ zcs->inputProcessed = 0;
+ return 0; /* ready to go */
+}
+
+size_t ZSTD_resetCStream(ZSTD_CStream *zcs, unsigned long long pledgedSrcSize)
+{
+
+ zcs->params.fParams.contentSizeFlag = (pledgedSrcSize > 0);
+
+ return ZSTD_resetCStream_internal(zcs, pledgedSrcSize);
+}
+
+static size_t ZSTD_initCStream_advanced(ZSTD_CStream *zcs, const void *dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize)
+{
+ /* allocate buffers */
+ {
+ size_t const neededInBuffSize = (size_t)1 << params.cParams.windowLog;
+ if (zcs->inBuffSize < neededInBuffSize) {
+ zcs->inBuffSize = neededInBuffSize;
+ ZSTD_free(zcs->inBuff, zcs->customMem);
+ zcs->inBuff = (char *)ZSTD_malloc(neededInBuffSize, zcs->customMem);
+ if (zcs->inBuff == NULL)
+ return ERROR(memory_allocation);
+ }
+ zcs->blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, neededInBuffSize);
+ }
+ if (zcs->outBuffSize < ZSTD_compressBound(zcs->blockSize) + 1) {
+ zcs->outBuffSize = ZSTD_compressBound(zcs->blockSize) + 1;
+ ZSTD_free(zcs->outBuff, zcs->customMem);
+ zcs->outBuff = (char *)ZSTD_malloc(zcs->outBuffSize, zcs->customMem);
+ if (zcs->outBuff == NULL)
+ return ERROR(memory_allocation);
+ }
+
+ if (dict && dictSize >= 8) {
+ ZSTD_freeCDict(zcs->cdictLocal);
+ zcs->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize, 0, params, zcs->customMem);
+ if (zcs->cdictLocal == NULL)
+ return ERROR(memory_allocation);
+ zcs->cdict = zcs->cdictLocal;
+ } else
+ zcs->cdict = NULL;
+
+ zcs->checksum = params.fParams.checksumFlag > 0;
+ zcs->params = params;
+
+ return ZSTD_resetCStream_internal(zcs, pledgedSrcSize);
+}
+
+ZSTD_CStream *ZSTD_initCStream(ZSTD_parameters params, unsigned long long pledgedSrcSize, void *workspace, size_t workspaceSize)
+{
+ ZSTD_customMem const stackMem = ZSTD_initStack(workspace, workspaceSize);
+ ZSTD_CStream *const zcs = ZSTD_createCStream_advanced(stackMem);
+ if (zcs) {
+ size_t const code = ZSTD_initCStream_advanced(zcs, NULL, 0, params, pledgedSrcSize);
+ if (ZSTD_isError(code)) {
+ return NULL;
+ }
+ }
+ return zcs;
+}
+
+ZSTD_CStream *ZSTD_initCStream_usingCDict(const ZSTD_CDict *cdict, unsigned long long pledgedSrcSize, void *workspace, size_t workspaceSize)
+{
+ ZSTD_parameters const params = ZSTD_getParamsFromCDict(cdict);
+ ZSTD_CStream *const zcs = ZSTD_initCStream(params, pledgedSrcSize, workspace, workspaceSize);
+ if (zcs) {
+ zcs->cdict = cdict;
+ if (ZSTD_isError(ZSTD_resetCStream_internal(zcs, pledgedSrcSize))) {
+ return NULL;
+ }
+ }
+ return zcs;
+}
+
+/*====== Compression ======*/
+
+typedef enum { zsf_gather, zsf_flush, zsf_end } ZSTD_flush_e;
+
+ZSTD_STATIC size_t ZSTD_limitCopy(void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{
+ size_t const length = MIN(dstCapacity, srcSize);
+ memcpy(dst, src, length);
+ return length;
+}
+
+static size_t ZSTD_compressStream_generic(ZSTD_CStream *zcs, void *dst, size_t *dstCapacityPtr, const void *src, size_t *srcSizePtr, ZSTD_flush_e const flush)
+{
+ U32 someMoreWork = 1;
+ const char *const istart = (const char *)src;
+ const char *const iend = istart + *srcSizePtr;
+ const char *ip = istart;
+ char *const ostart = (char *)dst;
+ char *const oend = ostart + *dstCapacityPtr;
+ char *op = ostart;
+
+ while (someMoreWork) {
+ switch (zcs->stage) {
+ case zcss_init:
+ return ERROR(init_missing); /* call ZBUFF_compressInit() first ! */
+
+ case zcss_load:
+ /* complete inBuffer */
+ {
+ size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos;
+ size_t const loaded = ZSTD_limitCopy(zcs->inBuff + zcs->inBuffPos, toLoad, ip, iend - ip);
+ zcs->inBuffPos += loaded;
+ ip += loaded;
+ if ((zcs->inBuffPos == zcs->inToCompress) || (!flush && (toLoad != loaded))) {
+ someMoreWork = 0;
+ break; /* not enough input to get a full block : stop there, wait for more */
+ }
+ }
+ /* compress curr block (note : this stage cannot be stopped in the middle) */
+ {
+ void *cDst;
+ size_t cSize;
+ size_t const iSize = zcs->inBuffPos - zcs->inToCompress;
+ size_t oSize = oend - op;
+ if (oSize >= ZSTD_compressBound(iSize))
+ cDst = op; /* compress directly into output buffer (avoid flush stage) */
+ else
+ cDst = zcs->outBuff, oSize = zcs->outBuffSize;
+ cSize = (flush == zsf_end) ? ZSTD_compressEnd(zcs->cctx, cDst, oSize, zcs->inBuff + zcs->inToCompress, iSize)
+ : ZSTD_compressContinue(zcs->cctx, cDst, oSize, zcs->inBuff + zcs->inToCompress, iSize);
+ if (ZSTD_isError(cSize))
+ return cSize;
+ if (flush == zsf_end)
+ zcs->frameEnded = 1;
+ /* prepare next block */
+ zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize;
+ if (zcs->inBuffTarget > zcs->inBuffSize)
+ zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize; /* note : inBuffSize >= blockSize */
+ zcs->inToCompress = zcs->inBuffPos;
+ if (cDst == op) {
+ op += cSize;
+ break;
+ } /* no need to flush */
+ zcs->outBuffContentSize = cSize;
+ zcs->outBuffFlushedSize = 0;
+ zcs->stage = zcss_flush; /* pass-through to flush stage */
+ }
+
+ case zcss_flush: {
+ size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize;
+ size_t const flushed = ZSTD_limitCopy(op, oend - op, zcs->outBuff + zcs->outBuffFlushedSize, toFlush);
+ op += flushed;
+ zcs->outBuffFlushedSize += flushed;
+ if (toFlush != flushed) {
+ someMoreWork = 0;
+ break;
+ } /* dst too small to store flushed data : stop there */
+ zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0;
+ zcs->stage = zcss_load;
+ break;
+ }
+
+ case zcss_final:
+ someMoreWork = 0; /* do nothing */
+ break;
+
+ default:
+ return ERROR(GENERIC); /* impossible */
+ }
+ }
+
+ *srcSizePtr = ip - istart;
+ *dstCapacityPtr = op - ostart;
+ zcs->inputProcessed += *srcSizePtr;
+ if (zcs->frameEnded)
+ return 0;
+ {
+ size_t hintInSize = zcs->inBuffTarget - zcs->inBuffPos;
+ if (hintInSize == 0)
+ hintInSize = zcs->blockSize;
+ return hintInSize;
+ }
+}
+
+size_t ZSTD_compressStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output, ZSTD_inBuffer *input)
+{
+ size_t sizeRead = input->size - input->pos;
+ size_t sizeWritten = output->size - output->pos;
+ size_t const result =
+ ZSTD_compressStream_generic(zcs, (char *)(output->dst) + output->pos, &sizeWritten, (const char *)(input->src) + input->pos, &sizeRead, zsf_gather);
+ input->pos += sizeRead;
+ output->pos += sizeWritten;
+ return result;
+}
+
+/*====== Finalize ======*/
+
+/*! ZSTD_flushStream() :
+* @return : amount of data remaining to flush */
+size_t ZSTD_flushStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output)
+{
+ size_t srcSize = 0;
+ size_t sizeWritten = output->size - output->pos;
+ size_t const result = ZSTD_compressStream_generic(zcs, (char *)(output->dst) + output->pos, &sizeWritten, &srcSize,
+ &srcSize, /* use a valid src address instead of NULL */
+ zsf_flush);
+ output->pos += sizeWritten;
+ if (ZSTD_isError(result))
+ return result;
+ return zcs->outBuffContentSize - zcs->outBuffFlushedSize; /* remaining to flush */
+}
+
+size_t ZSTD_endStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output)
+{
+ BYTE *const ostart = (BYTE *)(output->dst) + output->pos;
+ BYTE *const oend = (BYTE *)(output->dst) + output->size;
+ BYTE *op = ostart;
+
+ if ((zcs->pledgedSrcSize) && (zcs->inputProcessed != zcs->pledgedSrcSize))
+ return ERROR(srcSize_wrong); /* pledgedSrcSize not respected */
+
+ if (zcs->stage != zcss_final) {
+ /* flush whatever remains */
+ size_t srcSize = 0;
+ size_t sizeWritten = output->size - output->pos;
+ size_t const notEnded =
+ ZSTD_compressStream_generic(zcs, ostart, &sizeWritten, &srcSize, &srcSize, zsf_end); /* use a valid src address instead of NULL */
+ size_t const remainingToFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize;
+ op += sizeWritten;
+ if (remainingToFlush) {
+ output->pos += sizeWritten;
+ return remainingToFlush + ZSTD_BLOCKHEADERSIZE /* final empty block */ + (zcs->checksum * 4);
+ }
+ /* create epilogue */
+ zcs->stage = zcss_final;
+ zcs->outBuffContentSize = !notEnded ? 0 : ZSTD_compressEnd(zcs->cctx, zcs->outBuff, zcs->outBuffSize, NULL,
+ 0); /* write epilogue, including final empty block, into outBuff */
+ }
+
+ /* flush epilogue */
+ {
+ size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize;
+ size_t const flushed = ZSTD_limitCopy(op, oend - op, zcs->outBuff + zcs->outBuffFlushedSize, toFlush);
+ op += flushed;
+ zcs->outBuffFlushedSize += flushed;
+ output->pos += op - ostart;
+ if (toFlush == flushed)
+ zcs->stage = zcss_init; /* end reached */
+ return toFlush - flushed;
+ }
+}
+
+/*-===== Pre-defined compression levels =====-*/
+
+#define ZSTD_DEFAULT_CLEVEL 1
+#define ZSTD_MAX_CLEVEL 22
+int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; }
+
+static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL + 1] = {
+ {
+ /* "default" */
+ /* W, C, H, S, L, TL, strat */
+ {18, 12, 12, 1, 7, 16, ZSTD_fast}, /* level 0 - never used */
+ {19, 13, 14, 1, 7, 16, ZSTD_fast}, /* level 1 */
+ {19, 15, 16, 1, 6, 16, ZSTD_fast}, /* level 2 */
+ {20, 16, 17, 1, 5, 16, ZSTD_dfast}, /* level 3.*/
+ {20, 18, 18, 1, 5, 16, ZSTD_dfast}, /* level 4.*/
+ {20, 15, 18, 3, 5, 16, ZSTD_greedy}, /* level 5 */
+ {21, 16, 19, 2, 5, 16, ZSTD_lazy}, /* level 6 */
+ {21, 17, 20, 3, 5, 16, ZSTD_lazy}, /* level 7 */
+ {21, 18, 20, 3, 5, 16, ZSTD_lazy2}, /* level 8 */
+ {21, 20, 20, 3, 5, 16, ZSTD_lazy2}, /* level 9 */
+ {21, 19, 21, 4, 5, 16, ZSTD_lazy2}, /* level 10 */
+ {22, 20, 22, 4, 5, 16, ZSTD_lazy2}, /* level 11 */
+ {22, 20, 22, 5, 5, 16, ZSTD_lazy2}, /* level 12 */
+ {22, 21, 22, 5, 5, 16, ZSTD_lazy2}, /* level 13 */
+ {22, 21, 22, 6, 5, 16, ZSTD_lazy2}, /* level 14 */
+ {22, 21, 21, 5, 5, 16, ZSTD_btlazy2}, /* level 15 */
+ {23, 22, 22, 5, 5, 16, ZSTD_btlazy2}, /* level 16 */
+ {23, 21, 22, 4, 5, 24, ZSTD_btopt}, /* level 17 */
+ {23, 23, 22, 6, 5, 32, ZSTD_btopt}, /* level 18 */
+ {23, 23, 22, 6, 3, 48, ZSTD_btopt}, /* level 19 */
+ {25, 25, 23, 7, 3, 64, ZSTD_btopt2}, /* level 20 */
+ {26, 26, 23, 7, 3, 256, ZSTD_btopt2}, /* level 21 */
+ {27, 27, 25, 9, 3, 512, ZSTD_btopt2}, /* level 22 */
+ },
+ {
+ /* for srcSize <= 256 KB */
+ /* W, C, H, S, L, T, strat */
+ {0, 0, 0, 0, 0, 0, ZSTD_fast}, /* level 0 - not used */
+ {18, 13, 14, 1, 6, 8, ZSTD_fast}, /* level 1 */
+ {18, 14, 13, 1, 5, 8, ZSTD_dfast}, /* level 2 */
+ {18, 16, 15, 1, 5, 8, ZSTD_dfast}, /* level 3 */
+ {18, 15, 17, 1, 5, 8, ZSTD_greedy}, /* level 4.*/
+ {18, 16, 17, 4, 5, 8, ZSTD_greedy}, /* level 5.*/
+ {18, 16, 17, 3, 5, 8, ZSTD_lazy}, /* level 6.*/
+ {18, 17, 17, 4, 4, 8, ZSTD_lazy}, /* level 7 */
+ {18, 17, 17, 4, 4, 8, ZSTD_lazy2}, /* level 8 */
+ {18, 17, 17, 5, 4, 8, ZSTD_lazy2}, /* level 9 */
+ {18, 17, 17, 6, 4, 8, ZSTD_lazy2}, /* level 10 */
+ {18, 18, 17, 6, 4, 8, ZSTD_lazy2}, /* level 11.*/
+ {18, 18, 17, 7, 4, 8, ZSTD_lazy2}, /* level 12.*/
+ {18, 19, 17, 6, 4, 8, ZSTD_btlazy2}, /* level 13 */
+ {18, 18, 18, 4, 4, 16, ZSTD_btopt}, /* level 14.*/
+ {18, 18, 18, 4, 3, 16, ZSTD_btopt}, /* level 15.*/
+ {18, 19, 18, 6, 3, 32, ZSTD_btopt}, /* level 16.*/
+ {18, 19, 18, 8, 3, 64, ZSTD_btopt}, /* level 17.*/
+ {18, 19, 18, 9, 3, 128, ZSTD_btopt}, /* level 18.*/
+ {18, 19, 18, 10, 3, 256, ZSTD_btopt}, /* level 19.*/
+ {18, 19, 18, 11, 3, 512, ZSTD_btopt2}, /* level 20.*/
+ {18, 19, 18, 12, 3, 512, ZSTD_btopt2}, /* level 21.*/
+ {18, 19, 18, 13, 3, 512, ZSTD_btopt2}, /* level 22.*/
+ },
+ {
+ /* for srcSize <= 128 KB */
+ /* W, C, H, S, L, T, strat */
+ {17, 12, 12, 1, 7, 8, ZSTD_fast}, /* level 0 - not used */
+ {17, 12, 13, 1, 6, 8, ZSTD_fast}, /* level 1 */
+ {17, 13, 16, 1, 5, 8, ZSTD_fast}, /* level 2 */
+ {17, 16, 16, 2, 5, 8, ZSTD_dfast}, /* level 3 */
+ {17, 13, 15, 3, 4, 8, ZSTD_greedy}, /* level 4 */
+ {17, 15, 17, 4, 4, 8, ZSTD_greedy}, /* level 5 */
+ {17, 16, 17, 3, 4, 8, ZSTD_lazy}, /* level 6 */
+ {17, 15, 17, 4, 4, 8, ZSTD_lazy2}, /* level 7 */
+ {17, 17, 17, 4, 4, 8, ZSTD_lazy2}, /* level 8 */
+ {17, 17, 17, 5, 4, 8, ZSTD_lazy2}, /* level 9 */
+ {17, 17, 17, 6, 4, 8, ZSTD_lazy2}, /* level 10 */
+ {17, 17, 17, 7, 4, 8, ZSTD_lazy2}, /* level 11 */
+ {17, 17, 17, 8, 4, 8, ZSTD_lazy2}, /* level 12 */
+ {17, 18, 17, 6, 4, 8, ZSTD_btlazy2}, /* level 13.*/
+ {17, 17, 17, 7, 3, 8, ZSTD_btopt}, /* level 14.*/
+ {17, 17, 17, 7, 3, 16, ZSTD_btopt}, /* level 15.*/
+ {17, 18, 17, 7, 3, 32, ZSTD_btopt}, /* level 16.*/
+ {17, 18, 17, 7, 3, 64, ZSTD_btopt}, /* level 17.*/
+ {17, 18, 17, 7, 3, 256, ZSTD_btopt}, /* level 18.*/
+ {17, 18, 17, 8, 3, 256, ZSTD_btopt}, /* level 19.*/
+ {17, 18, 17, 9, 3, 256, ZSTD_btopt2}, /* level 20.*/
+ {17, 18, 17, 10, 3, 256, ZSTD_btopt2}, /* level 21.*/
+ {17, 18, 17, 11, 3, 512, ZSTD_btopt2}, /* level 22.*/
+ },
+ {
+ /* for srcSize <= 16 KB */
+ /* W, C, H, S, L, T, strat */
+ {14, 12, 12, 1, 7, 6, ZSTD_fast}, /* level 0 - not used */
+ {14, 14, 14, 1, 6, 6, ZSTD_fast}, /* level 1 */
+ {14, 14, 14, 1, 4, 6, ZSTD_fast}, /* level 2 */
+ {14, 14, 14, 1, 4, 6, ZSTD_dfast}, /* level 3.*/
+ {14, 14, 14, 4, 4, 6, ZSTD_greedy}, /* level 4.*/
+ {14, 14, 14, 3, 4, 6, ZSTD_lazy}, /* level 5.*/
+ {14, 14, 14, 4, 4, 6, ZSTD_lazy2}, /* level 6 */
+ {14, 14, 14, 5, 4, 6, ZSTD_lazy2}, /* level 7 */
+ {14, 14, 14, 6, 4, 6, ZSTD_lazy2}, /* level 8.*/
+ {14, 15, 14, 6, 4, 6, ZSTD_btlazy2}, /* level 9.*/
+ {14, 15, 14, 3, 3, 6, ZSTD_btopt}, /* level 10.*/
+ {14, 15, 14, 6, 3, 8, ZSTD_btopt}, /* level 11.*/
+ {14, 15, 14, 6, 3, 16, ZSTD_btopt}, /* level 12.*/
+ {14, 15, 14, 6, 3, 24, ZSTD_btopt}, /* level 13.*/
+ {14, 15, 15, 6, 3, 48, ZSTD_btopt}, /* level 14.*/
+ {14, 15, 15, 6, 3, 64, ZSTD_btopt}, /* level 15.*/
+ {14, 15, 15, 6, 3, 96, ZSTD_btopt}, /* level 16.*/
+ {14, 15, 15, 6, 3, 128, ZSTD_btopt}, /* level 17.*/
+ {14, 15, 15, 6, 3, 256, ZSTD_btopt}, /* level 18.*/
+ {14, 15, 15, 7, 3, 256, ZSTD_btopt}, /* level 19.*/
+ {14, 15, 15, 8, 3, 256, ZSTD_btopt2}, /* level 20.*/
+ {14, 15, 15, 9, 3, 256, ZSTD_btopt2}, /* level 21.*/
+ {14, 15, 15, 10, 3, 256, ZSTD_btopt2}, /* level 22.*/
+ },
+};
+
+/*! ZSTD_getCParams() :
+* @return ZSTD_compressionParameters structure for a selected compression level, `srcSize` and `dictSize`.
+* Size values are optional, provide 0 if not known or unused */
+ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSize, size_t dictSize)
+{
+ ZSTD_compressionParameters cp;
+ size_t const addedSize = srcSize ? 0 : 500;
+ U64 const rSize = srcSize + dictSize ? srcSize + dictSize + addedSize : (U64)-1;
+ U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB); /* intentional underflow for srcSizeHint == 0 */
+ if (compressionLevel <= 0)
+ compressionLevel = ZSTD_DEFAULT_CLEVEL; /* 0 == default; no negative compressionLevel yet */
+ if (compressionLevel > ZSTD_MAX_CLEVEL)
+ compressionLevel = ZSTD_MAX_CLEVEL;
+ cp = ZSTD_defaultCParameters[tableID][compressionLevel];
+ if (ZSTD_32bits()) { /* auto-correction, for 32-bits mode */
+ if (cp.windowLog > ZSTD_WINDOWLOG_MAX)
+ cp.windowLog = ZSTD_WINDOWLOG_MAX;
+ if (cp.chainLog > ZSTD_CHAINLOG_MAX)
+ cp.chainLog = ZSTD_CHAINLOG_MAX;
+ if (cp.hashLog > ZSTD_HASHLOG_MAX)
+ cp.hashLog = ZSTD_HASHLOG_MAX;
+ }
+ cp = ZSTD_adjustCParams(cp, srcSize, dictSize);
+ return cp;
+}
+
+/*! ZSTD_getParams() :
+* same as ZSTD_getCParams(), but @return a `ZSTD_parameters` object (instead of `ZSTD_compressionParameters`).
+* All fields of `ZSTD_frameParameters` are set to default (0) */
+ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSize, size_t dictSize)
+{
+ ZSTD_parameters params;
+ ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, srcSize, dictSize);
+ memset(&params, 0, sizeof(params));
+ params.cParams = cParams;
+ return params;
+}
+
+EXPORT_SYMBOL(ZSTD_maxCLevel);
+EXPORT_SYMBOL(ZSTD_compressBound);
+
+EXPORT_SYMBOL(ZSTD_CCtxWorkspaceBound);
+EXPORT_SYMBOL(ZSTD_initCCtx);
+EXPORT_SYMBOL(ZSTD_compressCCtx);
+EXPORT_SYMBOL(ZSTD_compress_usingDict);
+
+EXPORT_SYMBOL(ZSTD_CDictWorkspaceBound);
+EXPORT_SYMBOL(ZSTD_initCDict);
+EXPORT_SYMBOL(ZSTD_compress_usingCDict);
+
+EXPORT_SYMBOL(ZSTD_CStreamWorkspaceBound);
+EXPORT_SYMBOL(ZSTD_initCStream);
+EXPORT_SYMBOL(ZSTD_initCStream_usingCDict);
+EXPORT_SYMBOL(ZSTD_resetCStream);
+EXPORT_SYMBOL(ZSTD_compressStream);
+EXPORT_SYMBOL(ZSTD_flushStream);
+EXPORT_SYMBOL(ZSTD_endStream);
+EXPORT_SYMBOL(ZSTD_CStreamInSize);
+EXPORT_SYMBOL(ZSTD_CStreamOutSize);
+
+EXPORT_SYMBOL(ZSTD_getCParams);
+EXPORT_SYMBOL(ZSTD_getParams);
+EXPORT_SYMBOL(ZSTD_checkCParams);
+EXPORT_SYMBOL(ZSTD_adjustCParams);
+
+EXPORT_SYMBOL(ZSTD_compressBegin);
+EXPORT_SYMBOL(ZSTD_compressBegin_usingDict);
+EXPORT_SYMBOL(ZSTD_compressBegin_advanced);
+EXPORT_SYMBOL(ZSTD_copyCCtx);
+EXPORT_SYMBOL(ZSTD_compressBegin_usingCDict);
+EXPORT_SYMBOL(ZSTD_compressContinue);
+EXPORT_SYMBOL(ZSTD_compressEnd);
+
+EXPORT_SYMBOL(ZSTD_getBlockSizeMax);
+EXPORT_SYMBOL(ZSTD_compressBlock);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Zstd Compressor");
diff --git a/lib/zstd/decompress.c b/lib/zstd/decompress.c
new file mode 100644
index 000000000000..b17846725ca0
--- /dev/null
+++ b/lib/zstd/decompress.c
@@ -0,0 +1,2528 @@
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of https://github.com/facebook/zstd.
+ * An additional grant of patent rights can be found in the PATENTS file in the
+ * same directory.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ */
+
+/* ***************************************************************
+* Tuning parameters
+*****************************************************************/
+/*!
+* MAXWINDOWSIZE_DEFAULT :
+* maximum window size accepted by DStream, by default.
+* Frames requiring more memory will be rejected.
+*/
+#ifndef ZSTD_MAXWINDOWSIZE_DEFAULT
+#define ZSTD_MAXWINDOWSIZE_DEFAULT ((1 << ZSTD_WINDOWLOG_MAX) + 1) /* defined within zstd.h */
+#endif
+
+/*-*******************************************************
+* Dependencies
+*********************************************************/
+#include "fse.h"
+#include "huf.h"
+#include "mem.h" /* low level memory routines */
+#include "zstd_internal.h"
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h> /* memcpy, memmove, memset */
+
+#define ZSTD_PREFETCH(ptr) __builtin_prefetch(ptr, 0, 0)
+
+/*-*************************************
+* Macros
+***************************************/
+#define ZSTD_isError ERR_isError /* for inlining */
+#define FSE_isError ERR_isError
+#define HUF_isError ERR_isError
+
+/*_*******************************************************
+* Memory operations
+**********************************************************/
+static void ZSTD_copy4(void *dst, const void *src) { memcpy(dst, src, 4); }
+
+/*-*************************************************************
+* Context management
+***************************************************************/
+typedef enum {
+ ZSTDds_getFrameHeaderSize,
+ ZSTDds_decodeFrameHeader,
+ ZSTDds_decodeBlockHeader,
+ ZSTDds_decompressBlock,
+ ZSTDds_decompressLastBlock,
+ ZSTDds_checkChecksum,
+ ZSTDds_decodeSkippableHeader,
+ ZSTDds_skipFrame
+} ZSTD_dStage;
+
+typedef struct {
+ FSE_DTable LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)];
+ FSE_DTable OFTable[FSE_DTABLE_SIZE_U32(OffFSELog)];
+ FSE_DTable MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)];
+ HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */
+ U64 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32 / 2];
+ U32 rep[ZSTD_REP_NUM];
+} ZSTD_entropyTables_t;
+
+struct ZSTD_DCtx_s {
+ const FSE_DTable *LLTptr;
+ const FSE_DTable *MLTptr;
+ const FSE_DTable *OFTptr;
+ const HUF_DTable *HUFptr;
+ ZSTD_entropyTables_t entropy;
+ const void *previousDstEnd; /* detect continuity */
+ const void *base; /* start of curr segment */
+ const void *vBase; /* virtual start of previous segment if it was just before curr one */
+ const void *dictEnd; /* end of previous segment */
+ size_t expected;
+ ZSTD_frameParams fParams;
+ blockType_e bType; /* used in ZSTD_decompressContinue(), to transfer blockType between header decoding and block decoding stages */
+ ZSTD_dStage stage;
+ U32 litEntropy;
+ U32 fseEntropy;
+ struct xxh64_state xxhState;
+ size_t headerSize;
+ U32 dictID;
+ const BYTE *litPtr;
+ ZSTD_customMem customMem;
+ size_t litSize;
+ size_t rleSize;
+ BYTE litBuffer[ZSTD_BLOCKSIZE_ABSOLUTEMAX + WILDCOPY_OVERLENGTH];
+ BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];
+}; /* typedef'd to ZSTD_DCtx within "zstd.h" */
+
+size_t ZSTD_DCtxWorkspaceBound(void) { return ZSTD_ALIGN(sizeof(ZSTD_stack)) + ZSTD_ALIGN(sizeof(ZSTD_DCtx)); }
+
+size_t ZSTD_decompressBegin(ZSTD_DCtx *dctx)
+{
+ dctx->expected = ZSTD_frameHeaderSize_prefix;
+ dctx->stage = ZSTDds_getFrameHeaderSize;
+ dctx->previousDstEnd = NULL;
+ dctx->base = NULL;
+ dctx->vBase = NULL;
+ dctx->dictEnd = NULL;
+ dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */
+ dctx->litEntropy = dctx->fseEntropy = 0;
+ dctx->dictID = 0;
+ ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue));
+ memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */
+ dctx->LLTptr = dctx->entropy.LLTable;
+ dctx->MLTptr = dctx->entropy.MLTable;
+ dctx->OFTptr = dctx->entropy.OFTable;
+ dctx->HUFptr = dctx->entropy.hufTable;
+ return 0;
+}
+
+ZSTD_DCtx *ZSTD_createDCtx_advanced(ZSTD_customMem customMem)
+{
+ ZSTD_DCtx *dctx;
+
+ if (!customMem.customAlloc || !customMem.customFree)
+ return NULL;
+
+ dctx = (ZSTD_DCtx *)ZSTD_malloc(sizeof(ZSTD_DCtx), customMem);
+ if (!dctx)
+ return NULL;
+ memcpy(&dctx->customMem, &customMem, sizeof(customMem));
+ ZSTD_decompressBegin(dctx);
+ return dctx;
+}
+
+ZSTD_DCtx *ZSTD_initDCtx(void *workspace, size_t workspaceSize)
+{
+ ZSTD_customMem const stackMem = ZSTD_initStack(workspace, workspaceSize);
+ return ZSTD_createDCtx_advanced(stackMem);
+}
+
+size_t ZSTD_freeDCtx(ZSTD_DCtx *dctx)
+{
+ if (dctx == NULL)
+ return 0; /* support free on NULL */
+ ZSTD_free(dctx, dctx->customMem);
+ return 0; /* reserved as a potential error code in the future */
+}
+
+void ZSTD_copyDCtx(ZSTD_DCtx *dstDCtx, const ZSTD_DCtx *srcDCtx)
+{
+ size_t const workSpaceSize = (ZSTD_BLOCKSIZE_ABSOLUTEMAX + WILDCOPY_OVERLENGTH) + ZSTD_frameHeaderSize_max;
+ memcpy(dstDCtx, srcDCtx, sizeof(ZSTD_DCtx) - workSpaceSize); /* no need to copy workspace */
+}
+
+static void ZSTD_refDDict(ZSTD_DCtx *dstDCtx, const ZSTD_DDict *ddict);
+
+/*-*************************************************************
+* Decompression section
+***************************************************************/
+
+/*! ZSTD_isFrame() :
+ * Tells if the content of `buffer` starts with a valid Frame Identifier.
+ * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
+ * Note 3 : Skippable Frame Identifiers are considered valid. */
+unsigned ZSTD_isFrame(const void *buffer, size_t size)
+{
+ if (size < 4)
+ return 0;
+ {
+ U32 const magic = ZSTD_readLE32(buffer);
+ if (magic == ZSTD_MAGICNUMBER)
+ return 1;
+ if ((magic & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START)
+ return 1;
+ }
+ return 0;
+}
+
+/** ZSTD_frameHeaderSize() :
+* srcSize must be >= ZSTD_frameHeaderSize_prefix.
+* @return : size of the Frame Header */
+static size_t ZSTD_frameHeaderSize(const void *src, size_t srcSize)
+{
+ if (srcSize < ZSTD_frameHeaderSize_prefix)
+ return ERROR(srcSize_wrong);
+ {
+ BYTE const fhd = ((const BYTE *)src)[4];
+ U32 const dictID = fhd & 3;
+ U32 const singleSegment = (fhd >> 5) & 1;
+ U32 const fcsId = fhd >> 6;
+ return ZSTD_frameHeaderSize_prefix + !singleSegment + ZSTD_did_fieldSize[dictID] + ZSTD_fcs_fieldSize[fcsId] + (singleSegment && !fcsId);
+ }
+}
+
+/** ZSTD_getFrameParams() :
+* decode Frame Header, or require larger `srcSize`.
+* @return : 0, `fparamsPtr` is correctly filled,
+* >0, `srcSize` is too small, result is expected `srcSize`,
+* or an error code, which can be tested using ZSTD_isError() */
+size_t ZSTD_getFrameParams(ZSTD_frameParams *fparamsPtr, const void *src, size_t srcSize)
+{
+ const BYTE *ip = (const BYTE *)src;
+
+ if (srcSize < ZSTD_frameHeaderSize_prefix)
+ return ZSTD_frameHeaderSize_prefix;
+ if (ZSTD_readLE32(src) != ZSTD_MAGICNUMBER) {
+ if ((ZSTD_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {
+ if (srcSize < ZSTD_skippableHeaderSize)
+ return ZSTD_skippableHeaderSize; /* magic number + skippable frame length */
+ memset(fparamsPtr, 0, sizeof(*fparamsPtr));
+ fparamsPtr->frameContentSize = ZSTD_readLE32((const char *)src + 4);
+ fparamsPtr->windowSize = 0; /* windowSize==0 means a frame is skippable */
+ return 0;
+ }
+ return ERROR(prefix_unknown);
+ }
+
+ /* ensure there is enough `srcSize` to fully read/decode frame header */
+ {
+ size_t const fhsize = ZSTD_frameHeaderSize(src, srcSize);
+ if (srcSize < fhsize)
+ return fhsize;
+ }
+
+ {
+ BYTE const fhdByte = ip[4];
+ size_t pos = 5;
+ U32 const dictIDSizeCode = fhdByte & 3;
+ U32 const checksumFlag = (fhdByte >> 2) & 1;
+ U32 const singleSegment = (fhdByte >> 5) & 1;
+ U32 const fcsID = fhdByte >> 6;
+ U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX;
+ U32 windowSize = 0;
+ U32 dictID = 0;
+ U64 frameContentSize = 0;
+ if ((fhdByte & 0x08) != 0)
+ return ERROR(frameParameter_unsupported); /* reserved bits, which must be zero */
+ if (!singleSegment) {
+ BYTE const wlByte = ip[pos++];
+ U32 const windowLog = (wlByte >> 3) + ZSTD_WINDOWLOG_ABSOLUTEMIN;
+ if (windowLog > ZSTD_WINDOWLOG_MAX)
+ return ERROR(frameParameter_windowTooLarge); /* avoids issue with 1 << windowLog */
+ windowSize = (1U << windowLog);
+ windowSize += (windowSize >> 3) * (wlByte & 7);
+ }
+
+ switch (dictIDSizeCode) {
+ default: /* impossible */
+ case 0: break;
+ case 1:
+ dictID = ip[pos];
+ pos++;
+ break;
+ case 2:
+ dictID = ZSTD_readLE16(ip + pos);
+ pos += 2;
+ break;
+ case 3:
+ dictID = ZSTD_readLE32(ip + pos);
+ pos += 4;
+ break;
+ }
+ switch (fcsID) {
+ default: /* impossible */
+ case 0:
+ if (singleSegment)
+ frameContentSize = ip[pos];
+ break;
+ case 1: frameContentSize = ZSTD_readLE16(ip + pos) + 256; break;
+ case 2: frameContentSize = ZSTD_readLE32(ip + pos); break;
+ case 3: frameContentSize = ZSTD_readLE64(ip + pos); break;
+ }
+ if (!windowSize)
+ windowSize = (U32)frameContentSize;
+ if (windowSize > windowSizeMax)
+ return ERROR(frameParameter_windowTooLarge);
+ fparamsPtr->frameContentSize = frameContentSize;
+ fparamsPtr->windowSize = windowSize;
+ fparamsPtr->dictID = dictID;
+ fparamsPtr->checksumFlag = checksumFlag;
+ }
+ return 0;
+}
+
+/** ZSTD_getFrameContentSize() :
+* compatible with legacy mode
+* @return : decompressed size of the single frame pointed to be `src` if known, otherwise
+* - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
+* - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) */
+unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize)
+{
+ {
+ ZSTD_frameParams fParams;
+ if (ZSTD_getFrameParams(&fParams, src, srcSize) != 0)
+ return ZSTD_CONTENTSIZE_ERROR;
+ if (fParams.windowSize == 0) {
+ /* Either skippable or empty frame, size == 0 either way */
+ return 0;
+ } else if (fParams.frameContentSize != 0) {
+ return fParams.frameContentSize;
+ } else {
+ return ZSTD_CONTENTSIZE_UNKNOWN;
+ }
+ }
+}
+
+/** ZSTD_findDecompressedSize() :
+ * compatible with legacy mode
+ * `srcSize` must be the exact length of some number of ZSTD compressed and/or
+ * skippable frames
+ * @return : decompressed size of the frames contained */
+unsigned long long ZSTD_findDecompressedSize(const void *src, size_t srcSize)
+{
+ {
+ unsigned long long totalDstSize = 0;
+ while (srcSize >= ZSTD_frameHeaderSize_prefix) {
+ const U32 magicNumber = ZSTD_readLE32(src);
+
+ if ((magicNumber & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {
+ size_t skippableSize;
+ if (srcSize < ZSTD_skippableHeaderSize)
+ return ERROR(srcSize_wrong);
+ skippableSize = ZSTD_readLE32((const BYTE *)src + 4) + ZSTD_skippableHeaderSize;
+ if (srcSize < skippableSize) {
+ return ZSTD_CONTENTSIZE_ERROR;
+ }
+
+ src = (const BYTE *)src + skippableSize;
+ srcSize -= skippableSize;
+ continue;
+ }
+
+ {
+ unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize);
+ if (ret >= ZSTD_CONTENTSIZE_ERROR)
+ return ret;
+
+ /* check for overflow */
+ if (totalDstSize + ret < totalDstSize)
+ return ZSTD_CONTENTSIZE_ERROR;
+ totalDstSize += ret;
+ }
+ {
+ size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize);
+ if (ZSTD_isError(frameSrcSize)) {
+ return ZSTD_CONTENTSIZE_ERROR;
+ }
+
+ src = (const BYTE *)src + frameSrcSize;
+ srcSize -= frameSrcSize;
+ }
+ }
+
+ if (srcSize) {
+ return ZSTD_CONTENTSIZE_ERROR;
+ }
+
+ return totalDstSize;
+ }
+}
+
+/** ZSTD_decodeFrameHeader() :
+* `headerSize` must be the size provided by ZSTD_frameHeaderSize().
+* @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */
+static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx *dctx, const void *src, size_t headerSize)
+{
+ size_t const result = ZSTD_getFrameParams(&(dctx->fParams), src, headerSize);
+ if (ZSTD_isError(result))
+ return result; /* invalid header */
+ if (result > 0)
+ return ERROR(srcSize_wrong); /* headerSize too small */
+ if (dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID))
+ return ERROR(dictionary_wrong);
+ if (dctx->fParams.checksumFlag)
+ xxh64_reset(&dctx->xxhState, 0);
+ return 0;
+}
+
+typedef struct {
+ blockType_e blockType;
+ U32 lastBlock;
+ U32 origSize;
+} blockProperties_t;
+
+/*! ZSTD_getcBlockSize() :
+* Provides the size of compressed block from block header `src` */
+size_t ZSTD_getcBlockSize(const void *src, size_t srcSize, blockProperties_t *bpPtr)
+{
+ if (srcSize < ZSTD_blockHeaderSize)
+ return ERROR(srcSize_wrong);
+ {
+ U32 const cBlockHeader = ZSTD_readLE24(src);
+ U32 const cSize = cBlockHeader >> 3;
+ bpPtr->lastBlock = cBlockHeader & 1;
+ bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);
+ bpPtr->origSize = cSize; /* only useful for RLE */
+ if (bpPtr->blockType == bt_rle)
+ return 1;
+ if (bpPtr->blockType == bt_reserved)
+ return ERROR(corruption_detected);
+ return cSize;
+ }
+}
+
+static size_t ZSTD_copyRawBlock(void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{
+ if (srcSize > dstCapacity)
+ return ERROR(dstSize_tooSmall);
+ memcpy(dst, src, srcSize);
+ return srcSize;
+}
+
+static size_t ZSTD_setRleBlock(void *dst, size_t dstCapacity, const void *src, size_t srcSize, size_t regenSize)
+{
+ if (srcSize != 1)
+ return ERROR(srcSize_wrong);
+ if (regenSize > dstCapacity)
+ return ERROR(dstSize_tooSmall);
+ memset(dst, *(const BYTE *)src, regenSize);
+ return regenSize;
+}
+
+/*! ZSTD_decodeLiteralsBlock() :
+ @return : nb of bytes read from src (< srcSize ) */
+size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx *dctx, const void *src, size_t srcSize) /* note : srcSize < BLOCKSIZE */
+{
+ if (srcSize < MIN_CBLOCK_SIZE)
+ return ERROR(corruption_detected);
+
+ {
+ const BYTE *const istart = (const BYTE *)src;
+ symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);
+
+ switch (litEncType) {
+ case set_repeat:
+ if (dctx->litEntropy == 0)
+ return ERROR(dictionary_corrupted);
+ /* fall-through */
+ case set_compressed:
+ if (srcSize < 5)
+ return ERROR(corruption_detected); /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3 */
+ {
+ size_t lhSize, litSize, litCSize;
+ U32 singleStream = 0;
+ U32 const lhlCode = (istart[0] >> 2) & 3;
+ U32 const lhc = ZSTD_readLE32(istart);
+ switch (lhlCode) {
+ case 0:
+ case 1:
+ default: /* note : default is impossible, since lhlCode into [0..3] */
+ /* 2 - 2 - 10 - 10 */
+ singleStream = !lhlCode;
+ lhSize = 3;
+ litSize = (lhc >> 4) & 0x3FF;
+ litCSize = (lhc >> 14) & 0x3FF;
+ break;
+ case 2:
+ /* 2 - 2 - 14 - 14 */
+ lhSize = 4;
+ litSize = (lhc >> 4) & 0x3FFF;
+ litCSize = lhc >> 18;
+ break;
+ case 3:
+ /* 2 - 2 - 18 - 18 */
+ lhSize = 5;
+ litSize = (lhc >> 4) & 0x3FFFF;
+ litCSize = (lhc >> 22) + (istart[4] << 10);
+ break;
+ }
+ if (litSize > ZSTD_BLOCKSIZE_ABSOLUTEMAX)
+ return ERROR(corruption_detected);
+ if (litCSize + lhSize > srcSize)
+ return ERROR(corruption_detected);
+
+ if (HUF_isError(
+ (litEncType == set_repeat)
+ ? (singleStream ? HUF_decompress1X_usingDTable(dctx->litBuffer, litSize, istart + lhSize, litCSize, dctx->HUFptr)
+ : HUF_decompress4X_usingDTable(dctx->litBuffer, litSize, istart + lhSize, litCSize, dctx->HUFptr))
+ : (singleStream
+ ? HUF_decompress1X2_DCtx_wksp(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart + lhSize, litCSize,
+ dctx->entropy.workspace, sizeof(dctx->entropy.workspace))
+ : HUF_decompress4X_hufOnly_wksp(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart + lhSize, litCSize,
+ dctx->entropy.workspace, sizeof(dctx->entropy.workspace)))))
+ return ERROR(corruption_detected);
+
+ dctx->litPtr = dctx->litBuffer;
+ dctx->litSize = litSize;
+ dctx->litEntropy = 1;
+ if (litEncType == set_compressed)
+ dctx->HUFptr = dctx->entropy.hufTable;
+ memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+ return litCSize + lhSize;
+ }
+
+ case set_basic: {
+ size_t litSize, lhSize;
+ U32 const lhlCode = ((istart[0]) >> 2) & 3;
+ switch (lhlCode) {
+ case 0:
+ case 2:
+ default: /* note : default is impossible, since lhlCode into [0..3] */
+ lhSize = 1;
+ litSize = istart[0] >> 3;
+ break;
+ case 1:
+ lhSize = 2;
+ litSize = ZSTD_readLE16(istart) >> 4;
+ break;
+ case 3:
+ lhSize = 3;
+ litSize = ZSTD_readLE24(istart) >> 4;
+ break;
+ }
+
+ if (lhSize + litSize + WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */
+ if (litSize + lhSize > srcSize)
+ return ERROR(corruption_detected);
+ memcpy(dctx->litBuffer, istart + lhSize, litSize);
+ dctx->litPtr = dctx->litBuffer;
+ dctx->litSize = litSize;
+ memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+ return lhSize + litSize;
+ }
+ /* direct reference into compressed stream */
+ dctx->litPtr = istart + lhSize;
+ dctx->litSize = litSize;
+ return lhSize + litSize;
+ }
+
+ case set_rle: {
+ U32 const lhlCode = ((istart[0]) >> 2) & 3;
+ size_t litSize, lhSize;
+ switch (lhlCode) {
+ case 0:
+ case 2:
+ default: /* note : default is impossible, since lhlCode into [0..3] */
+ lhSize = 1;
+ litSize = istart[0] >> 3;
+ break;
+ case 1:
+ lhSize = 2;
+ litSize = ZSTD_readLE16(istart) >> 4;
+ break;
+ case 3:
+ lhSize = 3;
+ litSize = ZSTD_readLE24(istart) >> 4;
+ if (srcSize < 4)
+ return ERROR(corruption_detected); /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4 */
+ break;
+ }
+ if (litSize > ZSTD_BLOCKSIZE_ABSOLUTEMAX)
+ return ERROR(corruption_detected);
+ memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
+ dctx->litPtr = dctx->litBuffer;
+ dctx->litSize = litSize;
+ return lhSize + 1;
+ }
+ default:
+ return ERROR(corruption_detected); /* impossible */
+ }
+ }
+}
+
+typedef union {
+ FSE_decode_t realData;
+ U32 alignedBy4;
+} FSE_decode_t4;
+
+static const FSE_decode_t4 LL_defaultDTable[(1 << LL_DEFAULTNORMLOG) + 1] = {
+ {{LL_DEFAULTNORMLOG, 1, 1}}, /* header : tableLog, fastMode, fastMode */
+ {{0, 0, 4}}, /* 0 : base, symbol, bits */
+ {{16, 0, 4}},
+ {{32, 1, 5}},
+ {{0, 3, 5}},
+ {{0, 4, 5}},
+ {{0, 6, 5}},
+ {{0, 7, 5}},
+ {{0, 9, 5}},
+ {{0, 10, 5}},
+ {{0, 12, 5}},
+ {{0, 14, 6}},
+ {{0, 16, 5}},
+ {{0, 18, 5}},
+ {{0, 19, 5}},
+ {{0, 21, 5}},
+ {{0, 22, 5}},
+ {{0, 24, 5}},
+ {{32, 25, 5}},
+ {{0, 26, 5}},
+ {{0, 27, 6}},
+ {{0, 29, 6}},
+ {{0, 31, 6}},
+ {{32, 0, 4}},
+ {{0, 1, 4}},
+ {{0, 2, 5}},
+ {{32, 4, 5}},
+ {{0, 5, 5}},
+ {{32, 7, 5}},
+ {{0, 8, 5}},
+ {{32, 10, 5}},
+ {{0, 11, 5}},
+ {{0, 13, 6}},
+ {{32, 16, 5}},
+ {{0, 17, 5}},
+ {{32, 19, 5}},
+ {{0, 20, 5}},
+ {{32, 22, 5}},
+ {{0, 23, 5}},
+ {{0, 25, 4}},
+ {{16, 25, 4}},
+ {{32, 26, 5}},
+ {{0, 28, 6}},
+ {{0, 30, 6}},
+ {{48, 0, 4}},
+ {{16, 1, 4}},
+ {{32, 2, 5}},
+ {{32, 3, 5}},
+ {{32, 5, 5}},
+ {{32, 6, 5}},
+ {{32, 8, 5}},
+ {{32, 9, 5}},
+ {{32, 11, 5}},
+ {{32, 12, 5}},
+ {{0, 15, 6}},
+ {{32, 17, 5}},
+ {{32, 18, 5}},
+ {{32, 20, 5}},
+ {{32, 21, 5}},
+ {{32, 23, 5}},
+ {{32, 24, 5}},
+ {{0, 35, 6}},
+ {{0, 34, 6}},
+ {{0, 33, 6}},
+ {{0, 32, 6}},
+}; /* LL_defaultDTable */
+
+static const FSE_decode_t4 ML_defaultDTable[(1 << ML_DEFAULTNORMLOG) + 1] = {
+ {{ML_DEFAULTNORMLOG, 1, 1}}, /* header : tableLog, fastMode, fastMode */
+ {{0, 0, 6}}, /* 0 : base, symbol, bits */
+ {{0, 1, 4}},
+ {{32, 2, 5}},
+ {{0, 3, 5}},
+ {{0, 5, 5}},
+ {{0, 6, 5}},
+ {{0, 8, 5}},
+ {{0, 10, 6}},
+ {{0, 13, 6}},
+ {{0, 16, 6}},
+ {{0, 19, 6}},
+ {{0, 22, 6}},
+ {{0, 25, 6}},
+ {{0, 28, 6}},
+ {{0, 31, 6}},
+ {{0, 33, 6}},
+ {{0, 35, 6}},
+ {{0, 37, 6}},
+ {{0, 39, 6}},
+ {{0, 41, 6}},
+ {{0, 43, 6}},
+ {{0, 45, 6}},
+ {{16, 1, 4}},
+ {{0, 2, 4}},
+ {{32, 3, 5}},
+ {{0, 4, 5}},
+ {{32, 6, 5}},
+ {{0, 7, 5}},
+ {{0, 9, 6}},
+ {{0, 12, 6}},
+ {{0, 15, 6}},
+ {{0, 18, 6}},
+ {{0, 21, 6}},
+ {{0, 24, 6}},
+ {{0, 27, 6}},
+ {{0, 30, 6}},
+ {{0, 32, 6}},
+ {{0, 34, 6}},
+ {{0, 36, 6}},
+ {{0, 38, 6}},
+ {{0, 40, 6}},
+ {{0, 42, 6}},
+ {{0, 44, 6}},
+ {{32, 1, 4}},
+ {{48, 1, 4}},
+ {{16, 2, 4}},
+ {{32, 4, 5}},
+ {{32, 5, 5}},
+ {{32, 7, 5}},
+ {{32, 8, 5}},
+ {{0, 11, 6}},
+ {{0, 14, 6}},
+ {{0, 17, 6}},
+ {{0, 20, 6}},
+ {{0, 23, 6}},
+ {{0, 26, 6}},
+ {{0, 29, 6}},
+ {{0, 52, 6}},
+ {{0, 51, 6}},
+ {{0, 50, 6}},
+ {{0, 49, 6}},
+ {{0, 48, 6}},
+ {{0, 47, 6}},
+ {{0, 46, 6}},
+}; /* ML_defaultDTable */
+
+static const FSE_decode_t4 OF_defaultDTable[(1 << OF_DEFAULTNORMLOG) + 1] = {
+ {{OF_DEFAULTNORMLOG, 1, 1}}, /* header : tableLog, fastMode, fastMode */
+ {{0, 0, 5}}, /* 0 : base, symbol, bits */
+ {{0, 6, 4}},
+ {{0, 9, 5}},
+ {{0, 15, 5}},
+ {{0, 21, 5}},
+ {{0, 3, 5}},
+ {{0, 7, 4}},
+ {{0, 12, 5}},
+ {{0, 18, 5}},
+ {{0, 23, 5}},
+ {{0, 5, 5}},
+ {{0, 8, 4}},
+ {{0, 14, 5}},
+ {{0, 20, 5}},
+ {{0, 2, 5}},
+ {{16, 7, 4}},
+ {{0, 11, 5}},
+ {{0, 17, 5}},
+ {{0, 22, 5}},
+ {{0, 4, 5}},
+ {{16, 8, 4}},
+ {{0, 13, 5}},
+ {{0, 19, 5}},
+ {{0, 1, 5}},
+ {{16, 6, 4}},
+ {{0, 10, 5}},
+ {{0, 16, 5}},
+ {{0, 28, 5}},
+ {{0, 27, 5}},
+ {{0, 26, 5}},
+ {{0, 25, 5}},
+ {{0, 24, 5}},
+}; /* OF_defaultDTable */
+
+/*! ZSTD_buildSeqTable() :
+ @return : nb bytes read from src,
+ or an error code if it fails, testable with ZSTD_isError()
+*/
+static size_t ZSTD_buildSeqTable(FSE_DTable *DTableSpace, const FSE_DTable **DTablePtr, symbolEncodingType_e type, U32 max, U32 maxLog, const void *src,
+ size_t srcSize, const FSE_decode_t4 *defaultTable, U32 flagRepeatTable, void *workspace, size_t workspaceSize)
+{
+ const void *const tmpPtr = defaultTable; /* bypass strict aliasing */
+ switch (type) {
+ case set_rle:
+ if (!srcSize)
+ return ERROR(srcSize_wrong);
+ if ((*(const BYTE *)src) > max)
+ return ERROR(corruption_detected);
+ FSE_buildDTable_rle(DTableSpace, *(const BYTE *)src);
+ *DTablePtr = DTableSpace;
+ return 1;
+ case set_basic: *DTablePtr = (const FSE_DTable *)tmpPtr; return 0;
+ case set_repeat:
+ if (!flagRepeatTable)
+ return ERROR(corruption_detected);
+ return 0;
+ default: /* impossible */
+ case set_compressed: {
+ U32 tableLog;
+ S16 *norm = (S16 *)workspace;
+ size_t const spaceUsed32 = ALIGN(sizeof(S16) * (MaxSeq + 1), sizeof(U32)) >> 2;
+
+ if ((spaceUsed32 << 2) > workspaceSize)
+ return ERROR(GENERIC);
+ workspace = (U32 *)workspace + spaceUsed32;
+ workspaceSize -= (spaceUsed32 << 2);
+ {
+ size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
+ if (FSE_isError(headerSize))
+ return ERROR(corruption_detected);
+ if (tableLog > maxLog)
+ return ERROR(corruption_detected);
+ FSE_buildDTable_wksp(DTableSpace, norm, max, tableLog, workspace, workspaceSize);
+ *DTablePtr = DTableSpace;
+ return headerSize;
+ }
+ }
+ }
+}
+
+size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx *dctx, int *nbSeqPtr, const void *src, size_t srcSize)
+{
+ const BYTE *const istart = (const BYTE *const)src;
+ const BYTE *const iend = istart + srcSize;
+ const BYTE *ip = istart;
+
+ /* check */
+ if (srcSize < MIN_SEQUENCES_SIZE)
+ return ERROR(srcSize_wrong);
+
+ /* SeqHead */
+ {
+ int nbSeq = *ip++;
+ if (!nbSeq) {
+ *nbSeqPtr = 0;
+ return 1;
+ }
+ if (nbSeq > 0x7F) {
+ if (nbSeq == 0xFF) {
+ if (ip + 2 > iend)
+ return ERROR(srcSize_wrong);
+ nbSeq = ZSTD_readLE16(ip) + LONGNBSEQ, ip += 2;
+ } else {
+ if (ip >= iend)
+ return ERROR(srcSize_wrong);
+ nbSeq = ((nbSeq - 0x80) << 8) + *ip++;
+ }
+ }
+ *nbSeqPtr = nbSeq;
+ }
+
+ /* FSE table descriptors */
+ if (ip + 4 > iend)
+ return ERROR(srcSize_wrong); /* minimum possible size */
+ {
+ symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
+ symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
+ symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
+ ip++;
+
+ /* Build DTables */
+ {
+ size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr, LLtype, MaxLL, LLFSELog, ip, iend - ip,
+ LL_defaultDTable, dctx->fseEntropy, dctx->entropy.workspace, sizeof(dctx->entropy.workspace));
+ if (ZSTD_isError(llhSize))
+ return ERROR(corruption_detected);
+ ip += llhSize;
+ }
+ {
+ size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr, OFtype, MaxOff, OffFSELog, ip, iend - ip,
+ OF_defaultDTable, dctx->fseEntropy, dctx->entropy.workspace, sizeof(dctx->entropy.workspace));
+ if (ZSTD_isError(ofhSize))
+ return ERROR(corruption_detected);
+ ip += ofhSize;
+ }
+ {
+ size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr, MLtype, MaxML, MLFSELog, ip, iend - ip,
+ ML_defaultDTable, dctx->fseEntropy, dctx->entropy.workspace, sizeof(dctx->entropy.workspace));
+ if (ZSTD_isError(mlhSize))
+ return ERROR(corruption_detected);
+ ip += mlhSize;
+ }
+ }
+
+ return ip - istart;
+}
+
+typedef struct {
+ size_t litLength;
+ size_t matchLength;
+ size_t offset;
+ const BYTE *match;
+} seq_t;
+
+typedef struct {
+ BIT_DStream_t DStream;
+ FSE_DState_t stateLL;
+ FSE_DState_t stateOffb;
+ FSE_DState_t stateML;
+ size_t prevOffset[ZSTD_REP_NUM];
+ const BYTE *base;
+ size_t pos;
+ uPtrDiff gotoDict;
+} seqState_t;
+
+FORCE_NOINLINE
+size_t ZSTD_execSequenceLast7(BYTE *op, BYTE *const oend, seq_t sequence, const BYTE **litPtr, const BYTE *const litLimit, const BYTE *const base,
+ const BYTE *const vBase, const BYTE *const dictEnd)
+{
+ BYTE *const oLitEnd = op + sequence.litLength;
+ size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+ BYTE *const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
+ BYTE *const oend_w = oend - WILDCOPY_OVERLENGTH;
+ const BYTE *const iLitEnd = *litPtr + sequence.litLength;
+ const BYTE *match = oLitEnd - sequence.offset;
+
+ /* check */
+ if (oMatchEnd > oend)
+ return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
+ if (iLitEnd > litLimit)
+ return ERROR(corruption_detected); /* over-read beyond lit buffer */
+ if (oLitEnd <= oend_w)
+ return ERROR(GENERIC); /* Precondition */
+
+ /* copy literals */
+ if (op < oend_w) {
+ ZSTD_wildcopy(op, *litPtr, oend_w - op);
+ *litPtr += oend_w - op;
+ op = oend_w;
+ }
+ while (op < oLitEnd)
+ *op++ = *(*litPtr)++;
+
+ /* copy Match */
+ if (sequence.offset > (size_t)(oLitEnd - base)) {
+ /* offset beyond prefix */
+ if (sequence.offset > (size_t)(oLitEnd - vBase))
+ return ERROR(corruption_detected);
+ match = dictEnd - (base - match);
+ if (match + sequence.matchLength <= dictEnd) {
+ memmove(oLitEnd, match, sequence.matchLength);
+ return sequenceLength;
+ }
+ /* span extDict & currPrefixSegment */
+ {
+ size_t const length1 = dictEnd - match;
+ memmove(oLitEnd, match, length1);
+ op = oLitEnd + length1;
+ sequence.matchLength -= length1;
+ match = base;
+ }
+ }
+ while (op < oMatchEnd)
+ *op++ = *match++;
+ return sequenceLength;
+}
+
+static seq_t ZSTD_decodeSequence(seqState_t *seqState)
+{
+ seq_t seq;
+
+ U32 const llCode = FSE_peekSymbol(&seqState->stateLL);
+ U32 const mlCode = FSE_peekSymbol(&seqState->stateML);
+ U32 const ofCode = FSE_peekSymbol(&seqState->stateOffb); /* <= maxOff, by table construction */
+
+ U32 const llBits = LL_bits[llCode];
+ U32 const mlBits = ML_bits[mlCode];
+ U32 const ofBits = ofCode;
+ U32 const totalBits = llBits + mlBits + ofBits;
+
+ static const U32 LL_base[MaxLL + 1] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18,
+ 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000, 0x10000};
+
+ static const U32 ML_base[MaxML + 1] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+ 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 39, 41,
+ 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803, 0x1003, 0x2003, 0x4003, 0x8003, 0x10003};
+
+ static const U32 OF_base[MaxOff + 1] = {0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D, 0xFD, 0x1FD,
+ 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD, 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD,
+ 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD};
+
+ /* sequence */
+ {
+ size_t offset;
+ if (!ofCode)
+ offset = 0;
+ else {
+ offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
+ if (ZSTD_32bits())
+ BIT_reloadDStream(&seqState->DStream);
+ }
+
+ if (ofCode <= 1) {
+ offset += (llCode == 0);
+ if (offset) {
+ size_t temp = (offset == 3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
+ temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
+ if (offset != 1)
+ seqState->prevOffset[2] = seqState->prevOffset[1];
+ seqState->prevOffset[1] = seqState->prevOffset[0];
+ seqState->prevOffset[0] = offset = temp;
+ } else {
+ offset = seqState->prevOffset[0];
+ }
+ } else {
+ seqState->prevOffset[2] = seqState->prevOffset[1];
+ seqState->prevOffset[1] = seqState->prevOffset[0];
+ seqState->prevOffset[0] = offset;
+ }
+ seq.offset = offset;
+ }
+
+ seq.matchLength = ML_base[mlCode] + ((mlCode > 31) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */
+ if (ZSTD_32bits() && (mlBits + llBits > 24))
+ BIT_reloadDStream(&seqState->DStream);
+
+ seq.litLength = LL_base[llCode] + ((llCode > 15) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */
+ if (ZSTD_32bits() || (totalBits > 64 - 7 - (LLFSELog + MLFSELog + OffFSELog)))
+ BIT_reloadDStream(&seqState->DStream);
+
+ /* ANS state update */
+ FSE_updateState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
+ FSE_updateState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
+ if (ZSTD_32bits())
+ BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
+ FSE_updateState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
+
+ seq.match = NULL;
+
+ return seq;
+}
+
+FORCE_INLINE
+size_t ZSTD_execSequence(BYTE *op, BYTE *const oend, seq_t sequence, const BYTE **litPtr, const BYTE *const litLimit, const BYTE *const base,
+ const BYTE *const vBase, const BYTE *const dictEnd)
+{
+ BYTE *const oLitEnd = op + sequence.litLength;
+ size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+ BYTE *const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
+ BYTE *const oend_w = oend - WILDCOPY_OVERLENGTH;
+ const BYTE *const iLitEnd = *litPtr + sequence.litLength;
+ const BYTE *match = oLitEnd - sequence.offset;
+
+ /* check */
+ if (oMatchEnd > oend)
+ return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
+ if (iLitEnd > litLimit)
+ return ERROR(corruption_detected); /* over-read beyond lit buffer */
+ if (oLitEnd > oend_w)
+ return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, base, vBase, dictEnd);
+
+ /* copy Literals */
+ ZSTD_copy8(op, *litPtr);
+ if (sequence.litLength > 8)
+ ZSTD_wildcopy(op + 8, (*litPtr) + 8,
+ sequence.litLength - 8); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
+ op = oLitEnd;
+ *litPtr = iLitEnd; /* update for next sequence */
+
+ /* copy Match */
+ if (sequence.offset > (size_t)(oLitEnd - base)) {
+ /* offset beyond prefix */
+ if (sequence.offset > (size_t)(oLitEnd - vBase))
+ return ERROR(corruption_detected);
+ match = dictEnd + (match - base);
+ if (match + sequence.matchLength <= dictEnd) {
+ memmove(oLitEnd, match, sequence.matchLength);
+ return sequenceLength;
+ }
+ /* span extDict & currPrefixSegment */
+ {
+ size_t const length1 = dictEnd - match;
+ memmove(oLitEnd, match, length1);
+ op = oLitEnd + length1;
+ sequence.matchLength -= length1;
+ match = base;
+ if (op > oend_w || sequence.matchLength < MINMATCH) {
+ U32 i;
+ for (i = 0; i < sequence.matchLength; ++i)
+ op[i] = match[i];
+ return sequenceLength;
+ }
+ }
+ }
+ /* Requirement: op <= oend_w && sequence.matchLength >= MINMATCH */
+
+ /* match within prefix */
+ if (sequence.offset < 8) {
+ /* close range match, overlap */
+ static const U32 dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4}; /* added */
+ static const int dec64table[] = {8, 8, 8, 7, 8, 9, 10, 11}; /* subtracted */
+ int const sub2 = dec64table[sequence.offset];
+ op[0] = match[0];
+ op[1] = match[1];
+ op[2] = match[2];
+ op[3] = match[3];
+ match += dec32table[sequence.offset];
+ ZSTD_copy4(op + 4, match);
+ match -= sub2;
+ } else {
+ ZSTD_copy8(op, match);
+ }
+ op += 8;
+ match += 8;
+
+ if (oMatchEnd > oend - (16 - MINMATCH)) {
+ if (op < oend_w) {
+ ZSTD_wildcopy(op, match, oend_w - op);
+ match += oend_w - op;
+ op = oend_w;
+ }
+ while (op < oMatchEnd)
+ *op++ = *match++;
+ } else {
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8); /* works even if matchLength < 8 */
+ }
+ return sequenceLength;
+}
+
+static size_t ZSTD_decompressSequences(ZSTD_DCtx *dctx, void *dst, size_t maxDstSize, const void *seqStart, size_t seqSize)
+{
+ const BYTE *ip = (const BYTE *)seqStart;
+ const BYTE *const iend = ip + seqSize;
+ BYTE *const ostart = (BYTE * const)dst;
+ BYTE *const oend = ostart + maxDstSize;
+ BYTE *op = ostart;
+ const BYTE *litPtr = dctx->litPtr;
+ const BYTE *const litEnd = litPtr + dctx->litSize;
+ const BYTE *const base = (const BYTE *)(dctx->base);
+ const BYTE *const vBase = (const BYTE *)(dctx->vBase);
+ const BYTE *const dictEnd = (const BYTE *)(dctx->dictEnd);
+ int nbSeq;
+
+ /* Build Decoding Tables */
+ {
+ size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, seqSize);
+ if (ZSTD_isError(seqHSize))
+ return seqHSize;
+ ip += seqHSize;
+ }
+
+ /* Regen sequences */
+ if (nbSeq) {
+ seqState_t seqState;
+ dctx->fseEntropy = 1;
+ {
+ U32 i;
+ for (i = 0; i < ZSTD_REP_NUM; i++)
+ seqState.prevOffset[i] = dctx->entropy.rep[i];
+ }
+ CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend - ip), corruption_detected);
+ FSE_initDState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+ FSE_initDState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+ FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+
+ for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq;) {
+ nbSeq--;
+ {
+ seq_t const sequence = ZSTD_decodeSequence(&seqState);
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd);
+ if (ZSTD_isError(oneSeqSize))
+ return oneSeqSize;
+ op += oneSeqSize;
+ }
+ }
+
+ /* check if reached exact end */
+ if (nbSeq)
+ return ERROR(corruption_detected);
+ /* save reps for next block */
+ {
+ U32 i;
+ for (i = 0; i < ZSTD_REP_NUM; i++)
+ dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]);
+ }
+ }
+
+ /* last literal segment */
+ {
+ size_t const lastLLSize = litEnd - litPtr;
+ if (lastLLSize > (size_t)(oend - op))
+ return ERROR(dstSize_tooSmall);
+ memcpy(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
+
+ return op - ostart;
+}
+
+FORCE_INLINE seq_t ZSTD_decodeSequenceLong_generic(seqState_t *seqState, int const longOffsets)
+{
+ seq_t seq;
+
+ U32 const llCode = FSE_peekSymbol(&seqState->stateLL);
+ U32 const mlCode = FSE_peekSymbol(&seqState->stateML);
+ U32 const ofCode = FSE_peekSymbol(&seqState->stateOffb); /* <= maxOff, by table construction */
+
+ U32 const llBits = LL_bits[llCode];
+ U32 const mlBits = ML_bits[mlCode];
+ U32 const ofBits = ofCode;
+ U32 const totalBits = llBits + mlBits + ofBits;
+
+ static const U32 LL_base[MaxLL + 1] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18,
+ 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000, 0x10000};
+
+ static const U32 ML_base[MaxML + 1] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+ 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 39, 41,
+ 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803, 0x1003, 0x2003, 0x4003, 0x8003, 0x10003};
+
+ static const U32 OF_base[MaxOff + 1] = {0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D, 0xFD, 0x1FD,
+ 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD, 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD,
+ 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD};
+
+ /* sequence */
+ {
+ size_t offset;
+ if (!ofCode)
+ offset = 0;
+ else {
+ if (longOffsets) {
+ int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN);
+ offset = OF_base[ofCode] + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
+ if (ZSTD_32bits() || extraBits)
+ BIT_reloadDStream(&seqState->DStream);
+ if (extraBits)
+ offset += BIT_readBitsFast(&seqState->DStream, extraBits);
+ } else {
+ offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
+ if (ZSTD_32bits())
+ BIT_reloadDStream(&seqState->DStream);
+ }
+ }
+
+ if (ofCode <= 1) {
+ offset += (llCode == 0);
+ if (offset) {
+ size_t temp = (offset == 3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
+ temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
+ if (offset != 1)
+ seqState->prevOffset[2] = seqState->prevOffset[1];
+ seqState->prevOffset[1] = seqState->prevOffset[0];
+ seqState->prevOffset[0] = offset = temp;
+ } else {
+ offset = seqState->prevOffset[0];
+ }
+ } else {
+ seqState->prevOffset[2] = seqState->prevOffset[1];
+ seqState->prevOffset[1] = seqState->prevOffset[0];
+ seqState->prevOffset[0] = offset;
+ }
+ seq.offset = offset;
+ }
+
+ seq.matchLength = ML_base[mlCode] + ((mlCode > 31) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */
+ if (ZSTD_32bits() && (mlBits + llBits > 24))
+ BIT_reloadDStream(&seqState->DStream);
+
+ seq.litLength = LL_base[llCode] + ((llCode > 15) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */
+ if (ZSTD_32bits() || (totalBits > 64 - 7 - (LLFSELog + MLFSELog + OffFSELog)))
+ BIT_reloadDStream(&seqState->DStream);
+
+ {
+ size_t const pos = seqState->pos + seq.litLength;
+ seq.match = seqState->base + pos - seq.offset; /* single memory segment */
+ if (seq.offset > pos)
+ seq.match += seqState->gotoDict; /* separate memory segment */
+ seqState->pos = pos + seq.matchLength;
+ }
+
+ /* ANS state update */
+ FSE_updateState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
+ FSE_updateState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
+ if (ZSTD_32bits())
+ BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
+ FSE_updateState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
+
+ return seq;
+}
+
+static seq_t ZSTD_decodeSequenceLong(seqState_t *seqState, unsigned const windowSize)
+{
+ if (ZSTD_highbit32(windowSize) > STREAM_ACCUMULATOR_MIN) {
+ return ZSTD_decodeSequenceLong_generic(seqState, 1);
+ } else {
+ return ZSTD_decodeSequenceLong_generic(seqState, 0);
+ }
+}
+
+FORCE_INLINE
+size_t ZSTD_execSequenceLong(BYTE *op, BYTE *const oend, seq_t sequence, const BYTE **litPtr, const BYTE *const litLimit, const BYTE *const base,
+ const BYTE *const vBase, const BYTE *const dictEnd)
+{
+ BYTE *const oLitEnd = op + sequence.litLength;
+ size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+ BYTE *const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
+ BYTE *const oend_w = oend - WILDCOPY_OVERLENGTH;
+ const BYTE *const iLitEnd = *litPtr + sequence.litLength;
+ const BYTE *match = sequence.match;
+
+ /* check */
+ if (oMatchEnd > oend)
+ return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
+ if (iLitEnd > litLimit)
+ return ERROR(corruption_detected); /* over-read beyond lit buffer */
+ if (oLitEnd > oend_w)
+ return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, base, vBase, dictEnd);
+
+ /* copy Literals */
+ ZSTD_copy8(op, *litPtr);
+ if (sequence.litLength > 8)
+ ZSTD_wildcopy(op + 8, (*litPtr) + 8,
+ sequence.litLength - 8); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
+ op = oLitEnd;
+ *litPtr = iLitEnd; /* update for next sequence */
+
+ /* copy Match */
+ if (sequence.offset > (size_t)(oLitEnd - base)) {
+ /* offset beyond prefix */
+ if (sequence.offset > (size_t)(oLitEnd - vBase))
+ return ERROR(corruption_detected);
+ if (match + sequence.matchLength <= dictEnd) {
+ memmove(oLitEnd, match, sequence.matchLength);
+ return sequenceLength;
+ }
+ /* span extDict & currPrefixSegment */
+ {
+ size_t const length1 = dictEnd - match;
+ memmove(oLitEnd, match, length1);
+ op = oLitEnd + length1;
+ sequence.matchLength -= length1;
+ match = base;
+ if (op > oend_w || sequence.matchLength < MINMATCH) {
+ U32 i;
+ for (i = 0; i < sequence.matchLength; ++i)
+ op[i] = match[i];
+ return sequenceLength;
+ }
+ }
+ }
+ /* Requirement: op <= oend_w && sequence.matchLength >= MINMATCH */
+
+ /* match within prefix */
+ if (sequence.offset < 8) {
+ /* close range match, overlap */
+ static const U32 dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4}; /* added */
+ static const int dec64table[] = {8, 8, 8, 7, 8, 9, 10, 11}; /* subtracted */
+ int const sub2 = dec64table[sequence.offset];
+ op[0] = match[0];
+ op[1] = match[1];
+ op[2] = match[2];
+ op[3] = match[3];
+ match += dec32table[sequence.offset];
+ ZSTD_copy4(op + 4, match);
+ match -= sub2;
+ } else {
+ ZSTD_copy8(op, match);
+ }
+ op += 8;
+ match += 8;
+
+ if (oMatchEnd > oend - (16 - MINMATCH)) {
+ if (op < oend_w) {
+ ZSTD_wildcopy(op, match, oend_w - op);
+ match += oend_w - op;
+ op = oend_w;
+ }
+ while (op < oMatchEnd)
+ *op++ = *match++;
+ } else {
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8); /* works even if matchLength < 8 */
+ }
+ return sequenceLength;
+}
+
+static size_t ZSTD_decompressSequencesLong(ZSTD_DCtx *dctx, void *dst, size_t maxDstSize, const void *seqStart, size_t seqSize)
+{
+ const BYTE *ip = (const BYTE *)seqStart;
+ const BYTE *const iend = ip + seqSize;
+ BYTE *const ostart = (BYTE * const)dst;
+ BYTE *const oend = ostart + maxDstSize;
+ BYTE *op = ostart;
+ const BYTE *litPtr = dctx->litPtr;
+ const BYTE *const litEnd = litPtr + dctx->litSize;
+ const BYTE *const base = (const BYTE *)(dctx->base);
+ const BYTE *const vBase = (const BYTE *)(dctx->vBase);
+ const BYTE *const dictEnd = (const BYTE *)(dctx->dictEnd);
+ unsigned const windowSize = dctx->fParams.windowSize;
+ int nbSeq;
+
+ /* Build Decoding Tables */
+ {
+ size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, seqSize);
+ if (ZSTD_isError(seqHSize))
+ return seqHSize;
+ ip += seqHSize;
+ }
+
+ /* Regen sequences */
+ if (nbSeq) {
+#define STORED_SEQS 4
+#define STOSEQ_MASK (STORED_SEQS - 1)
+#define ADVANCED_SEQS 4
+ seq_t *sequences = (seq_t *)dctx->entropy.workspace;
+ int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
+ seqState_t seqState;
+ int seqNb;
+ ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.workspace) >= sizeof(seq_t) * STORED_SEQS);
+ dctx->fseEntropy = 1;
+ {
+ U32 i;
+ for (i = 0; i < ZSTD_REP_NUM; i++)
+ seqState.prevOffset[i] = dctx->entropy.rep[i];
+ }
+ seqState.base = base;
+ seqState.pos = (size_t)(op - base);
+ seqState.gotoDict = (uPtrDiff)dictEnd - (uPtrDiff)base; /* cast to avoid undefined behaviour */
+ CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend - ip), corruption_detected);
+ FSE_initDState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+ FSE_initDState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+ FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+
+ /* prepare in advance */
+ for (seqNb = 0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && seqNb < seqAdvance; seqNb++) {
+ sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, windowSize);
+ }
+ if (seqNb < seqAdvance)
+ return ERROR(corruption_detected);
+
+ /* decode and decompress */
+ for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && seqNb < nbSeq; seqNb++) {
+ seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, windowSize);
+ size_t const oneSeqSize =
+ ZSTD_execSequenceLong(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STOSEQ_MASK], &litPtr, litEnd, base, vBase, dictEnd);
+ if (ZSTD_isError(oneSeqSize))
+ return oneSeqSize;
+ ZSTD_PREFETCH(sequence.match);
+ sequences[seqNb & STOSEQ_MASK] = sequence;
+ op += oneSeqSize;
+ }
+ if (seqNb < nbSeq)
+ return ERROR(corruption_detected);
+
+ /* finish queue */
+ seqNb -= seqAdvance;
+ for (; seqNb < nbSeq; seqNb++) {
+ size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[seqNb & STOSEQ_MASK], &litPtr, litEnd, base, vBase, dictEnd);
+ if (ZSTD_isError(oneSeqSize))
+ return oneSeqSize;
+ op += oneSeqSize;
+ }
+
+ /* save reps for next block */
+ {
+ U32 i;
+ for (i = 0; i < ZSTD_REP_NUM; i++)
+ dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]);
+ }
+ }
+
+ /* last literal segment */
+ {
+ size_t const lastLLSize = litEnd - litPtr;
+ if (lastLLSize > (size_t)(oend - op))
+ return ERROR(dstSize_tooSmall);
+ memcpy(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
+
+ return op - ostart;
+}
+
+static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{ /* blockType == blockCompressed */
+ const BYTE *ip = (const BYTE *)src;
+
+ if (srcSize >= ZSTD_BLOCKSIZE_ABSOLUTEMAX)
+ return ERROR(srcSize_wrong);
+
+ /* Decode literals section */
+ {
+ size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
+ if (ZSTD_isError(litCSize))
+ return litCSize;
+ ip += litCSize;
+ srcSize -= litCSize;
+ }
+ if (sizeof(size_t) > 4) /* do not enable prefetching on 32-bits x86, as it's performance detrimental */
+ /* likely because of register pressure */
+ /* if that's the correct cause, then 32-bits ARM should be affected differently */
+ /* it would be good to test this on ARM real hardware, to see if prefetch version improves speed */
+ if (dctx->fParams.windowSize > (1 << 23))
+ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize);
+ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize);
+}
+
+static void ZSTD_checkContinuity(ZSTD_DCtx *dctx, const void *dst)
+{
+ if (dst != dctx->previousDstEnd) { /* not contiguous */
+ dctx->dictEnd = dctx->previousDstEnd;
+ dctx->vBase = (const char *)dst - ((const char *)(dctx->previousDstEnd) - (const char *)(dctx->base));
+ dctx->base = dst;
+ dctx->previousDstEnd = dst;
+ }
+}
+
+size_t ZSTD_decompressBlock(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{
+ size_t dSize;
+ ZSTD_checkContinuity(dctx, dst);
+ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize);
+ dctx->previousDstEnd = (char *)dst + dSize;
+ return dSize;
+}
+
+/** ZSTD_insertBlock() :
+ insert `src` block into `dctx` history. Useful to track uncompressed blocks. */
+size_t ZSTD_insertBlock(ZSTD_DCtx *dctx, const void *blockStart, size_t blockSize)
+{
+ ZSTD_checkContinuity(dctx, blockStart);
+ dctx->previousDstEnd = (const char *)blockStart + blockSize;
+ return blockSize;
+}
+
+size_t ZSTD_generateNxBytes(void *dst, size_t dstCapacity, BYTE byte, size_t length)
+{
+ if (length > dstCapacity)
+ return ERROR(dstSize_tooSmall);
+ memset(dst, byte, length);
+ return length;
+}
+
+/** ZSTD_findFrameCompressedSize() :
+ * compatible with legacy mode
+ * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame
+ * `srcSize` must be at least as large as the frame contained
+ * @return : the compressed size of the frame starting at `src` */
+size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
+{
+ if (srcSize >= ZSTD_skippableHeaderSize && (ZSTD_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {
+ return ZSTD_skippableHeaderSize + ZSTD_readLE32((const BYTE *)src + 4);
+ } else {
+ const BYTE *ip = (const BYTE *)src;
+ const BYTE *const ipstart = ip;
+ size_t remainingSize = srcSize;
+ ZSTD_frameParams fParams;
+
+ size_t const headerSize = ZSTD_frameHeaderSize(ip, remainingSize);
+ if (ZSTD_isError(headerSize))
+ return headerSize;
+
+ /* Frame Header */
+ {
+ size_t const ret = ZSTD_getFrameParams(&fParams, ip, remainingSize);
+ if (ZSTD_isError(ret))
+ return ret;
+ if (ret > 0)
+ return ERROR(srcSize_wrong);
+ }
+
+ ip += headerSize;
+ remainingSize -= headerSize;
+
+ /* Loop on each block */
+ while (1) {
+ blockProperties_t blockProperties;
+ size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties);
+ if (ZSTD_isError(cBlockSize))
+ return cBlockSize;
+
+ if (ZSTD_blockHeaderSize + cBlockSize > remainingSize)
+ return ERROR(srcSize_wrong);
+
+ ip += ZSTD_blockHeaderSize + cBlockSize;
+ remainingSize -= ZSTD_blockHeaderSize + cBlockSize;
+
+ if (blockProperties.lastBlock)
+ break;
+ }
+
+ if (fParams.checksumFlag) { /* Frame content checksum */
+ if (remainingSize < 4)
+ return ERROR(srcSize_wrong);
+ ip += 4;
+ remainingSize -= 4;
+ }
+
+ return ip - ipstart;
+ }
+}
+
+/*! ZSTD_decompressFrame() :
+* @dctx must be properly initialized */
+static size_t ZSTD_decompressFrame(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void **srcPtr, size_t *srcSizePtr)
+{
+ const BYTE *ip = (const BYTE *)(*srcPtr);
+ BYTE *const ostart = (BYTE * const)dst;
+ BYTE *const oend = ostart + dstCapacity;
+ BYTE *op = ostart;
+ size_t remainingSize = *srcSizePtr;
+
+ /* check */
+ if (remainingSize < ZSTD_frameHeaderSize_min + ZSTD_blockHeaderSize)
+ return ERROR(srcSize_wrong);
+
+ /* Frame Header */
+ {
+ size_t const frameHeaderSize = ZSTD_frameHeaderSize(ip, ZSTD_frameHeaderSize_prefix);
+ if (ZSTD_isError(frameHeaderSize))
+ return frameHeaderSize;
+ if (remainingSize < frameHeaderSize + ZSTD_blockHeaderSize)
+ return ERROR(srcSize_wrong);
+ CHECK_F(ZSTD_decodeFrameHeader(dctx, ip, frameHeaderSize));
+ ip += frameHeaderSize;
+ remainingSize -= frameHeaderSize;
+ }
+
+ /* Loop on each block */
+ while (1) {
+ size_t decodedSize;
+ blockProperties_t blockProperties;
+ size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties);
+ if (ZSTD_isError(cBlockSize))
+ return cBlockSize;
+
+ ip += ZSTD_blockHeaderSize;
+ remainingSize -= ZSTD_blockHeaderSize;
+ if (cBlockSize > remainingSize)
+ return ERROR(srcSize_wrong);
+
+ switch (blockProperties.blockType) {
+ case bt_compressed: decodedSize = ZSTD_decompressBlock_internal(dctx, op, oend - op, ip, cBlockSize); break;
+ case bt_raw: decodedSize = ZSTD_copyRawBlock(op, oend - op, ip, cBlockSize); break;
+ case bt_rle: decodedSize = ZSTD_generateNxBytes(op, oend - op, *ip, blockProperties.origSize); break;
+ case bt_reserved:
+ default: return ERROR(corruption_detected);
+ }
+
+ if (ZSTD_isError(decodedSize))
+ return decodedSize;
+ if (dctx->fParams.checksumFlag)
+ xxh64_update(&dctx->xxhState, op, decodedSize);
+ op += decodedSize;
+ ip += cBlockSize;
+ remainingSize -= cBlockSize;
+ if (blockProperties.lastBlock)
+ break;
+ }
+
+ if (dctx->fParams.checksumFlag) { /* Frame content checksum verification */
+ U32 const checkCalc = (U32)xxh64_digest(&dctx->xxhState);
+ U32 checkRead;
+ if (remainingSize < 4)
+ return ERROR(checksum_wrong);
+ checkRead = ZSTD_readLE32(ip);
+ if (checkRead != checkCalc)
+ return ERROR(checksum_wrong);
+ ip += 4;
+ remainingSize -= 4;
+ }
+
+ /* Allow caller to get size read */
+ *srcPtr = ip;
+ *srcSizePtr = remainingSize;
+ return op - ostart;
+}
+
+static const void *ZSTD_DDictDictContent(const ZSTD_DDict *ddict);
+static size_t ZSTD_DDictDictSize(const ZSTD_DDict *ddict);
+
+static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, const void *dict, size_t dictSize,
+ const ZSTD_DDict *ddict)
+{
+ void *const dststart = dst;
+
+ if (ddict) {
+ if (dict) {
+ /* programmer error, these two cases should be mutually exclusive */
+ return ERROR(GENERIC);
+ }
+
+ dict = ZSTD_DDictDictContent(ddict);
+ dictSize = ZSTD_DDictDictSize(ddict);
+ }
+
+ while (srcSize >= ZSTD_frameHeaderSize_prefix) {
+ U32 magicNumber;
+
+ magicNumber = ZSTD_readLE32(src);
+ if (magicNumber != ZSTD_MAGICNUMBER) {
+ if ((magicNumber & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {
+ size_t skippableSize;
+ if (srcSize < ZSTD_skippableHeaderSize)
+ return ERROR(srcSize_wrong);
+ skippableSize = ZSTD_readLE32((const BYTE *)src + 4) + ZSTD_skippableHeaderSize;
+ if (srcSize < skippableSize) {
+ return ERROR(srcSize_wrong);
+ }
+
+ src = (const BYTE *)src + skippableSize;
+ srcSize -= skippableSize;
+ continue;
+ } else {
+ return ERROR(prefix_unknown);
+ }
+ }
+
+ if (ddict) {
+ /* we were called from ZSTD_decompress_usingDDict */
+ ZSTD_refDDict(dctx, ddict);
+ } else {
+ /* this will initialize correctly with no dict if dict == NULL, so
+ * use this in all cases but ddict */
+ CHECK_F(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize));
+ }
+ ZSTD_checkContinuity(dctx, dst);
+
+ {
+ const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity, &src, &srcSize);
+ if (ZSTD_isError(res))
+ return res;
+ /* don't need to bounds check this, ZSTD_decompressFrame will have
+ * already */
+ dst = (BYTE *)dst + res;
+ dstCapacity -= res;
+ }
+ }
+
+ if (srcSize)
+ return ERROR(srcSize_wrong); /* input not entirely consumed */
+
+ return (BYTE *)dst - (BYTE *)dststart;
+}
+
+size_t ZSTD_decompress_usingDict(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, const void *dict, size_t dictSize)
+{
+ return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, dict, dictSize, NULL);
+}
+
+size_t ZSTD_decompressDCtx(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{
+ return ZSTD_decompress_usingDict(dctx, dst, dstCapacity, src, srcSize, NULL, 0);
+}
+
+/*-**************************************
+* Advanced Streaming Decompression API
+* Bufferless and synchronous
+****************************************/
+size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx *dctx) { return dctx->expected; }
+
+ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx *dctx)
+{
+ switch (dctx->stage) {
+ default: /* should not happen */
+ case ZSTDds_getFrameHeaderSize:
+ case ZSTDds_decodeFrameHeader: return ZSTDnit_frameHeader;
+ case ZSTDds_decodeBlockHeader: return ZSTDnit_blockHeader;
+ case ZSTDds_decompressBlock: return ZSTDnit_block;
+ case ZSTDds_decompressLastBlock: return ZSTDnit_lastBlock;
+ case ZSTDds_checkChecksum: return ZSTDnit_checksum;
+ case ZSTDds_decodeSkippableHeader:
+ case ZSTDds_skipFrame: return ZSTDnit_skippableFrame;
+ }
+}
+
+int ZSTD_isSkipFrame(ZSTD_DCtx *dctx) { return dctx->stage == ZSTDds_skipFrame; } /* for zbuff */
+
+/** ZSTD_decompressContinue() :
+* @return : nb of bytes generated into `dst` (necessarily <= `dstCapacity)
+* or an error code, which can be tested using ZSTD_isError() */
+size_t ZSTD_decompressContinue(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{
+ /* Sanity check */
+ if (srcSize != dctx->expected)
+ return ERROR(srcSize_wrong);
+ if (dstCapacity)
+ ZSTD_checkContinuity(dctx, dst);
+
+ switch (dctx->stage) {
+ case ZSTDds_getFrameHeaderSize:
+ if (srcSize != ZSTD_frameHeaderSize_prefix)
+ return ERROR(srcSize_wrong); /* impossible */
+ if ((ZSTD_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */
+ memcpy(dctx->headerBuffer, src, ZSTD_frameHeaderSize_prefix);
+ dctx->expected = ZSTD_skippableHeaderSize - ZSTD_frameHeaderSize_prefix; /* magic number + skippable frame length */
+ dctx->stage = ZSTDds_decodeSkippableHeader;
+ return 0;
+ }
+ dctx->headerSize = ZSTD_frameHeaderSize(src, ZSTD_frameHeaderSize_prefix);
+ if (ZSTD_isError(dctx->headerSize))
+ return dctx->headerSize;
+ memcpy(dctx->headerBuffer, src, ZSTD_frameHeaderSize_prefix);
+ if (dctx->headerSize > ZSTD_frameHeaderSize_prefix) {
+ dctx->expected = dctx->headerSize - ZSTD_frameHeaderSize_prefix;
+ dctx->stage = ZSTDds_decodeFrameHeader;
+ return 0;
+ }
+ dctx->expected = 0; /* not necessary to copy more */
+
+ case ZSTDds_decodeFrameHeader:
+ memcpy(dctx->headerBuffer + ZSTD_frameHeaderSize_prefix, src, dctx->expected);
+ CHECK_F(ZSTD_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize));
+ dctx->expected = ZSTD_blockHeaderSize;
+ dctx->stage = ZSTDds_decodeBlockHeader;
+ return 0;
+
+ case ZSTDds_decodeBlockHeader: {
+ blockProperties_t bp;
+ size_t const cBlockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
+ if (ZSTD_isError(cBlockSize))
+ return cBlockSize;
+ dctx->expected = cBlockSize;
+ dctx->bType = bp.blockType;
+ dctx->rleSize = bp.origSize;
+ if (cBlockSize) {
+ dctx->stage = bp.lastBlock ? ZSTDds_decompressLastBlock : ZSTDds_decompressBlock;
+ return 0;
+ }
+ /* empty block */
+ if (bp.lastBlock) {
+ if (dctx->fParams.checksumFlag) {
+ dctx->expected = 4;
+ dctx->stage = ZSTDds_checkChecksum;
+ } else {
+ dctx->expected = 0; /* end of frame */
+ dctx->stage = ZSTDds_getFrameHeaderSize;
+ }
+ } else {
+ dctx->expected = 3; /* go directly to next header */
+ dctx->stage = ZSTDds_decodeBlockHeader;
+ }
+ return 0;
+ }
+ case ZSTDds_decompressLastBlock:
+ case ZSTDds_decompressBlock: {
+ size_t rSize;
+ switch (dctx->bType) {
+ case bt_compressed: rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize); break;
+ case bt_raw: rSize = ZSTD_copyRawBlock(dst, dstCapacity, src, srcSize); break;
+ case bt_rle: rSize = ZSTD_setRleBlock(dst, dstCapacity, src, srcSize, dctx->rleSize); break;
+ case bt_reserved: /* should never happen */
+ default: return ERROR(corruption_detected);
+ }
+ if (ZSTD_isError(rSize))
+ return rSize;
+ if (dctx->fParams.checksumFlag)
+ xxh64_update(&dctx->xxhState, dst, rSize);
+
+ if (dctx->stage == ZSTDds_decompressLastBlock) { /* end of frame */
+ if (dctx->fParams.checksumFlag) { /* another round for frame checksum */
+ dctx->expected = 4;
+ dctx->stage = ZSTDds_checkChecksum;
+ } else {
+ dctx->expected = 0; /* ends here */
+ dctx->stage = ZSTDds_getFrameHeaderSize;
+ }
+ } else {
+ dctx->stage = ZSTDds_decodeBlockHeader;
+ dctx->expected = ZSTD_blockHeaderSize;
+ dctx->previousDstEnd = (char *)dst + rSize;
+ }
+ return rSize;
+ }
+ case ZSTDds_checkChecksum: {
+ U32 const h32 = (U32)xxh64_digest(&dctx->xxhState);
+ U32 const check32 = ZSTD_readLE32(src); /* srcSize == 4, guaranteed by dctx->expected */
+ if (check32 != h32)
+ return ERROR(checksum_wrong);
+ dctx->expected = 0;
+ dctx->stage = ZSTDds_getFrameHeaderSize;
+ return 0;
+ }
+ case ZSTDds_decodeSkippableHeader: {
+ memcpy(dctx->headerBuffer + ZSTD_frameHeaderSize_prefix, src, dctx->expected);
+ dctx->expected = ZSTD_readLE32(dctx->headerBuffer + 4);
+ dctx->stage = ZSTDds_skipFrame;
+ return 0;
+ }
+ case ZSTDds_skipFrame: {
+ dctx->expected = 0;
+ dctx->stage = ZSTDds_getFrameHeaderSize;
+ return 0;
+ }
+ default:
+ return ERROR(GENERIC); /* impossible */
+ }
+}
+
+static size_t ZSTD_refDictContent(ZSTD_DCtx *dctx, const void *dict, size_t dictSize)
+{
+ dctx->dictEnd = dctx->previousDstEnd;
+ dctx->vBase = (const char *)dict - ((const char *)(dctx->previousDstEnd) - (const char *)(dctx->base));
+ dctx->base = dict;
+ dctx->previousDstEnd = (const char *)dict + dictSize;
+ return 0;
+}
+
+/* ZSTD_loadEntropy() :
+ * dict : must point at beginning of a valid zstd dictionary
+ * @return : size of entropy tables read */
+static size_t ZSTD_loadEntropy(ZSTD_entropyTables_t *entropy, const void *const dict, size_t const dictSize)
+{
+ const BYTE *dictPtr = (const BYTE *)dict;
+ const BYTE *const dictEnd = dictPtr + dictSize;
+
+ if (dictSize <= 8)
+ return ERROR(dictionary_corrupted);
+ dictPtr += 8; /* skip header = magic + dictID */
+
+ {
+ size_t const hSize = HUF_readDTableX4_wksp(entropy->hufTable, dictPtr, dictEnd - dictPtr, entropy->workspace, sizeof(entropy->workspace));
+ if (HUF_isError(hSize))
+ return ERROR(dictionary_corrupted);
+ dictPtr += hSize;
+ }
+
+ {
+ short offcodeNCount[MaxOff + 1];
+ U32 offcodeMaxValue = MaxOff, offcodeLog;
+ size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd - dictPtr);
+ if (FSE_isError(offcodeHeaderSize))
+ return ERROR(dictionary_corrupted);
+ if (offcodeLog > OffFSELog)
+ return ERROR(dictionary_corrupted);
+ CHECK_E(FSE_buildDTable_wksp(entropy->OFTable, offcodeNCount, offcodeMaxValue, offcodeLog, entropy->workspace, sizeof(entropy->workspace)), dictionary_corrupted);
+ dictPtr += offcodeHeaderSize;
+ }
+
+ {
+ short matchlengthNCount[MaxML + 1];
+ unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+ size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd - dictPtr);
+ if (FSE_isError(matchlengthHeaderSize))
+ return ERROR(dictionary_corrupted);
+ if (matchlengthLog > MLFSELog)
+ return ERROR(dictionary_corrupted);
+ CHECK_E(FSE_buildDTable_wksp(entropy->MLTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, entropy->workspace, sizeof(entropy->workspace)), dictionary_corrupted);
+ dictPtr += matchlengthHeaderSize;
+ }
+
+ {
+ short litlengthNCount[MaxLL + 1];
+ unsigned litlengthMaxValue = MaxLL, litlengthLog;
+ size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd - dictPtr);
+ if (FSE_isError(litlengthHeaderSize))
+ return ERROR(dictionary_corrupted);
+ if (litlengthLog > LLFSELog)
+ return ERROR(dictionary_corrupted);
+ CHECK_E(FSE_buildDTable_wksp(entropy->LLTable, litlengthNCount, litlengthMaxValue, litlengthLog, entropy->workspace, sizeof(entropy->workspace)), dictionary_corrupted);
+ dictPtr += litlengthHeaderSize;
+ }
+
+ if (dictPtr + 12 > dictEnd)
+ return ERROR(dictionary_corrupted);
+ {
+ int i;
+ size_t const dictContentSize = (size_t)(dictEnd - (dictPtr + 12));
+ for (i = 0; i < 3; i++) {
+ U32 const rep = ZSTD_readLE32(dictPtr);
+ dictPtr += 4;
+ if (rep == 0 || rep >= dictContentSize)
+ return ERROR(dictionary_corrupted);
+ entropy->rep[i] = rep;
+ }
+ }
+
+ return dictPtr - (const BYTE *)dict;
+}
+
+static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx *dctx, const void *dict, size_t dictSize)
+{
+ if (dictSize < 8)
+ return ZSTD_refDictContent(dctx, dict, dictSize);
+ {
+ U32 const magic = ZSTD_readLE32(dict);
+ if (magic != ZSTD_DICT_MAGIC) {
+ return ZSTD_refDictContent(dctx, dict, dictSize); /* pure content mode */
+ }
+ }
+ dctx->dictID = ZSTD_readLE32((const char *)dict + 4);
+
+ /* load entropy tables */
+ {
+ size_t const eSize = ZSTD_loadEntropy(&dctx->entropy, dict, dictSize);
+ if (ZSTD_isError(eSize))
+ return ERROR(dictionary_corrupted);
+ dict = (const char *)dict + eSize;
+ dictSize -= eSize;
+ }
+ dctx->litEntropy = dctx->fseEntropy = 1;
+
+ /* reference dictionary content */
+ return ZSTD_refDictContent(dctx, dict, dictSize);
+}
+
+size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx *dctx, const void *dict, size_t dictSize)
+{
+ CHECK_F(ZSTD_decompressBegin(dctx));
+ if (dict && dictSize)
+ CHECK_E(ZSTD_decompress_insertDictionary(dctx, dict, dictSize), dictionary_corrupted);
+ return 0;
+}
+
+/* ====== ZSTD_DDict ====== */
+
+struct ZSTD_DDict_s {
+ void *dictBuffer;
+ const void *dictContent;
+ size_t dictSize;
+ ZSTD_entropyTables_t entropy;
+ U32 dictID;
+ U32 entropyPresent;
+ ZSTD_customMem cMem;
+}; /* typedef'd to ZSTD_DDict within "zstd.h" */
+
+size_t ZSTD_DDictWorkspaceBound(void) { return ZSTD_ALIGN(sizeof(ZSTD_stack)) + ZSTD_ALIGN(sizeof(ZSTD_DDict)); }
+
+static const void *ZSTD_DDictDictContent(const ZSTD_DDict *ddict) { return ddict->dictContent; }
+
+static size_t ZSTD_DDictDictSize(const ZSTD_DDict *ddict) { return ddict->dictSize; }
+
+static void ZSTD_refDDict(ZSTD_DCtx *dstDCtx, const ZSTD_DDict *ddict)
+{
+ ZSTD_decompressBegin(dstDCtx); /* init */
+ if (ddict) { /* support refDDict on NULL */
+ dstDCtx->dictID = ddict->dictID;
+ dstDCtx->base = ddict->dictContent;
+ dstDCtx->vBase = ddict->dictContent;
+ dstDCtx->dictEnd = (const BYTE *)ddict->dictContent + ddict->dictSize;
+ dstDCtx->previousDstEnd = dstDCtx->dictEnd;
+ if (ddict->entropyPresent) {
+ dstDCtx->litEntropy = 1;
+ dstDCtx->fseEntropy = 1;
+ dstDCtx->LLTptr = ddict->entropy.LLTable;
+ dstDCtx->MLTptr = ddict->entropy.MLTable;
+ dstDCtx->OFTptr = ddict->entropy.OFTable;
+ dstDCtx->HUFptr = ddict->entropy.hufTable;
+ dstDCtx->entropy.rep[0] = ddict->entropy.rep[0];
+ dstDCtx->entropy.rep[1] = ddict->entropy.rep[1];
+ dstDCtx->entropy.rep[2] = ddict->entropy.rep[2];
+ } else {
+ dstDCtx->litEntropy = 0;
+ dstDCtx->fseEntropy = 0;
+ }
+ }
+}
+
+static size_t ZSTD_loadEntropy_inDDict(ZSTD_DDict *ddict)
+{
+ ddict->dictID = 0;
+ ddict->entropyPresent = 0;
+ if (ddict->dictSize < 8)
+ return 0;
+ {
+ U32 const magic = ZSTD_readLE32(ddict->dictContent);
+ if (magic != ZSTD_DICT_MAGIC)
+ return 0; /* pure content mode */
+ }
+ ddict->dictID = ZSTD_readLE32((const char *)ddict->dictContent + 4);
+
+ /* load entropy tables */
+ CHECK_E(ZSTD_loadEntropy(&ddict->entropy, ddict->dictContent, ddict->dictSize), dictionary_corrupted);
+ ddict->entropyPresent = 1;
+ return 0;
+}
+
+static ZSTD_DDict *ZSTD_createDDict_advanced(const void *dict, size_t dictSize, unsigned byReference, ZSTD_customMem customMem)
+{
+ if (!customMem.customAlloc || !customMem.customFree)
+ return NULL;
+
+ {
+ ZSTD_DDict *const ddict = (ZSTD_DDict *)ZSTD_malloc(sizeof(ZSTD_DDict), customMem);
+ if (!ddict)
+ return NULL;
+ ddict->cMem = customMem;
+
+ if ((byReference) || (!dict) || (!dictSize)) {
+ ddict->dictBuffer = NULL;
+ ddict->dictContent = dict;
+ } else {
+ void *const internalBuffer = ZSTD_malloc(dictSize, customMem);
+ if (!internalBuffer) {
+ ZSTD_freeDDict(ddict);
+ return NULL;
+ }
+ memcpy(internalBuffer, dict, dictSize);
+ ddict->dictBuffer = internalBuffer;
+ ddict->dictContent = internalBuffer;
+ }
+ ddict->dictSize = dictSize;
+ ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */
+ /* parse dictionary content */
+ {
+ size_t const errorCode = ZSTD_loadEntropy_inDDict(ddict);
+ if (ZSTD_isError(errorCode)) {
+ ZSTD_freeDDict(ddict);
+ return NULL;
+ }
+ }
+
+ return ddict;
+ }
+}
+
+/*! ZSTD_initDDict() :
+* Create a digested dictionary, to start decompression without startup delay.
+* `dict` content is copied inside DDict.
+* Consequently, `dict` can be released after `ZSTD_DDict` creation */
+ZSTD_DDict *ZSTD_initDDict(const void *dict, size_t dictSize, void *workspace, size_t workspaceSize)
+{
+ ZSTD_customMem const stackMem = ZSTD_initStack(workspace, workspaceSize);
+ return ZSTD_createDDict_advanced(dict, dictSize, 1, stackMem);
+}
+
+size_t ZSTD_freeDDict(ZSTD_DDict *ddict)
+{
+ if (ddict == NULL)
+ return 0; /* support free on NULL */
+ {
+ ZSTD_customMem const cMem = ddict->cMem;
+ ZSTD_free(ddict->dictBuffer, cMem);
+ ZSTD_free(ddict, cMem);
+ return 0;
+ }
+}
+
+/*! ZSTD_getDictID_fromDict() :
+ * Provides the dictID stored within dictionary.
+ * if @return == 0, the dictionary is not conformant with Zstandard specification.
+ * It can still be loaded, but as a content-only dictionary. */
+unsigned ZSTD_getDictID_fromDict(const void *dict, size_t dictSize)
+{
+ if (dictSize < 8)
+ return 0;
+ if (ZSTD_readLE32(dict) != ZSTD_DICT_MAGIC)
+ return 0;
+ return ZSTD_readLE32((const char *)dict + 4);
+}
+
+/*! ZSTD_getDictID_fromDDict() :
+ * Provides the dictID of the dictionary loaded into `ddict`.
+ * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict *ddict)
+{
+ if (ddict == NULL)
+ return 0;
+ return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize);
+}
+
+/*! ZSTD_getDictID_fromFrame() :
+ * Provides the dictID required to decompressed the frame stored within `src`.
+ * If @return == 0, the dictID could not be decoded.
+ * This could for one of the following reasons :
+ * - The frame does not require a dictionary to be decoded (most common case).
+ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information.
+ * Note : this use case also happens when using a non-conformant dictionary.
+ * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
+ * - This is not a Zstandard frame.
+ * When identifying the exact failure cause, it's possible to used ZSTD_getFrameParams(), which will provide a more precise error code. */
+unsigned ZSTD_getDictID_fromFrame(const void *src, size_t srcSize)
+{
+ ZSTD_frameParams zfp = {0, 0, 0, 0};
+ size_t const hError = ZSTD_getFrameParams(&zfp, src, srcSize);
+ if (ZSTD_isError(hError))
+ return 0;
+ return zfp.dictID;
+}
+
+/*! ZSTD_decompress_usingDDict() :
+* Decompression using a pre-digested Dictionary
+* Use dictionary without significant overhead. */
+size_t ZSTD_decompress_usingDDict(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, const ZSTD_DDict *ddict)
+{
+ /* pass content and size in case legacy frames are encountered */
+ return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, NULL, 0, ddict);
+}
+
+/*=====================================
+* Streaming decompression
+*====================================*/
+
+typedef enum { zdss_init, zdss_loadHeader, zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage;
+
+/* *** Resource management *** */
+struct ZSTD_DStream_s {
+ ZSTD_DCtx *dctx;
+ ZSTD_DDict *ddictLocal;
+ const ZSTD_DDict *ddict;
+ ZSTD_frameParams fParams;
+ ZSTD_dStreamStage stage;
+ char *inBuff;
+ size_t inBuffSize;
+ size_t inPos;
+ size_t maxWindowSize;
+ char *outBuff;
+ size_t outBuffSize;
+ size_t outStart;
+ size_t outEnd;
+ size_t blockSize;
+ BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX]; /* tmp buffer to store frame header */
+ size_t lhSize;
+ ZSTD_customMem customMem;
+ void *legacyContext;
+ U32 previousLegacyVersion;
+ U32 legacyVersion;
+ U32 hostageByte;
+}; /* typedef'd to ZSTD_DStream within "zstd.h" */
+
+size_t ZSTD_DStreamWorkspaceBound(size_t maxWindowSize)
+{
+ size_t const blockSize = MIN(maxWindowSize, ZSTD_BLOCKSIZE_ABSOLUTEMAX);
+ size_t const inBuffSize = blockSize;
+ size_t const outBuffSize = maxWindowSize + blockSize + WILDCOPY_OVERLENGTH * 2;
+ return ZSTD_DCtxWorkspaceBound() + ZSTD_ALIGN(sizeof(ZSTD_DStream)) + ZSTD_ALIGN(inBuffSize) + ZSTD_ALIGN(outBuffSize);
+}
+
+static ZSTD_DStream *ZSTD_createDStream_advanced(ZSTD_customMem customMem)
+{
+ ZSTD_DStream *zds;
+
+ if (!customMem.customAlloc || !customMem.customFree)
+ return NULL;
+
+ zds = (ZSTD_DStream *)ZSTD_malloc(sizeof(ZSTD_DStream), customMem);
+ if (zds == NULL)
+ return NULL;
+ memset(zds, 0, sizeof(ZSTD_DStream));
+ memcpy(&zds->customMem, &customMem, sizeof(ZSTD_customMem));
+ zds->dctx = ZSTD_createDCtx_advanced(customMem);
+ if (zds->dctx == NULL) {
+ ZSTD_freeDStream(zds);
+ return NULL;
+ }
+ zds->stage = zdss_init;
+ zds->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT;
+ return zds;
+}
+
+ZSTD_DStream *ZSTD_initDStream(size_t maxWindowSize, void *workspace, size_t workspaceSize)
+{
+ ZSTD_customMem const stackMem = ZSTD_initStack(workspace, workspaceSize);
+ ZSTD_DStream *zds = ZSTD_createDStream_advanced(stackMem);
+ if (!zds) {
+ return NULL;
+ }
+
+ zds->maxWindowSize = maxWindowSize;
+ zds->stage = zdss_loadHeader;
+ zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0;
+ ZSTD_freeDDict(zds->ddictLocal);
+ zds->ddictLocal = NULL;
+ zds->ddict = zds->ddictLocal;
+ zds->legacyVersion = 0;
+ zds->hostageByte = 0;
+
+ {
+ size_t const blockSize = MIN(zds->maxWindowSize, ZSTD_BLOCKSIZE_ABSOLUTEMAX);
+ size_t const neededOutSize = zds->maxWindowSize + blockSize + WILDCOPY_OVERLENGTH * 2;
+
+ zds->inBuff = (char *)ZSTD_malloc(blockSize, zds->customMem);
+ zds->inBuffSize = blockSize;
+ zds->outBuff = (char *)ZSTD_malloc(neededOutSize, zds->customMem);
+ zds->outBuffSize = neededOutSize;
+ if (zds->inBuff == NULL || zds->outBuff == NULL) {
+ ZSTD_freeDStream(zds);
+ return NULL;
+ }
+ }
+ return zds;
+}
+
+ZSTD_DStream *ZSTD_initDStream_usingDDict(size_t maxWindowSize, const ZSTD_DDict *ddict, void *workspace, size_t workspaceSize)
+{
+ ZSTD_DStream *zds = ZSTD_initDStream(maxWindowSize, workspace, workspaceSize);
+ if (zds) {
+ zds->ddict = ddict;
+ }
+ return zds;
+}
+
+size_t ZSTD_freeDStream(ZSTD_DStream *zds)
+{
+ if (zds == NULL)
+ return 0; /* support free on null */
+ {
+ ZSTD_customMem const cMem = zds->customMem;
+ ZSTD_freeDCtx(zds->dctx);
+ zds->dctx = NULL;
+ ZSTD_freeDDict(zds->ddictLocal);
+ zds->ddictLocal = NULL;
+ ZSTD_free(zds->inBuff, cMem);
+ zds->inBuff = NULL;
+ ZSTD_free(zds->outBuff, cMem);
+ zds->outBuff = NULL;
+ ZSTD_free(zds, cMem);
+ return 0;
+ }
+}
+
+/* *** Initialization *** */
+
+size_t ZSTD_DStreamInSize(void) { return ZSTD_BLOCKSIZE_ABSOLUTEMAX + ZSTD_blockHeaderSize; }
+size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_ABSOLUTEMAX; }
+
+size_t ZSTD_resetDStream(ZSTD_DStream *zds)
+{
+ zds->stage = zdss_loadHeader;
+ zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0;
+ zds->legacyVersion = 0;
+ zds->hostageByte = 0;
+ return ZSTD_frameHeaderSize_prefix;
+}
+
+/* ***** Decompression ***** */
+
+ZSTD_STATIC size_t ZSTD_limitCopy(void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{
+ size_t const length = MIN(dstCapacity, srcSize);
+ memcpy(dst, src, length);
+ return length;
+}
+
+size_t ZSTD_decompressStream(ZSTD_DStream *zds, ZSTD_outBuffer *output, ZSTD_inBuffer *input)
+{
+ const char *const istart = (const char *)(input->src) + input->pos;
+ const char *const iend = (const char *)(input->src) + input->size;
+ const char *ip = istart;
+ char *const ostart = (char *)(output->dst) + output->pos;
+ char *const oend = (char *)(output->dst) + output->size;
+ char *op = ostart;
+ U32 someMoreWork = 1;
+
+ while (someMoreWork) {
+ switch (zds->stage) {
+ case zdss_init:
+ ZSTD_resetDStream(zds); /* transparent reset on starting decoding a new frame */
+ /* fall-through */
+
+ case zdss_loadHeader: {
+ size_t const hSize = ZSTD_getFrameParams(&zds->fParams, zds->headerBuffer, zds->lhSize);
+ if (ZSTD_isError(hSize))
+ return hSize;
+ if (hSize != 0) { /* need more input */
+ size_t const toLoad = hSize - zds->lhSize; /* if hSize!=0, hSize > zds->lhSize */
+ if (toLoad > (size_t)(iend - ip)) { /* not enough input to load full header */
+ memcpy(zds->headerBuffer + zds->lhSize, ip, iend - ip);
+ zds->lhSize += iend - ip;
+ input->pos = input->size;
+ return (MAX(ZSTD_frameHeaderSize_min, hSize) - zds->lhSize) +
+ ZSTD_blockHeaderSize; /* remaining header bytes + next block header */
+ }
+ memcpy(zds->headerBuffer + zds->lhSize, ip, toLoad);
+ zds->lhSize = hSize;
+ ip += toLoad;
+ break;
+ }
+
+ /* check for single-pass mode opportunity */
+ if (zds->fParams.frameContentSize && zds->fParams.windowSize /* skippable frame if == 0 */
+ && (U64)(size_t)(oend - op) >= zds->fParams.frameContentSize) {
+ size_t const cSize = ZSTD_findFrameCompressedSize(istart, iend - istart);
+ if (cSize <= (size_t)(iend - istart)) {
+ size_t const decompressedSize = ZSTD_decompress_usingDDict(zds->dctx, op, oend - op, istart, cSize, zds->ddict);
+ if (ZSTD_isError(decompressedSize))
+ return decompressedSize;
+ ip = istart + cSize;
+ op += decompressedSize;
+ zds->dctx->expected = 0;
+ zds->stage = zdss_init;
+ someMoreWork = 0;
+ break;
+ }
+ }
+
+ /* Consume header */
+ ZSTD_refDDict(zds->dctx, zds->ddict);
+ {
+ size_t const h1Size = ZSTD_nextSrcSizeToDecompress(zds->dctx); /* == ZSTD_frameHeaderSize_prefix */
+ CHECK_F(ZSTD_decompressContinue(zds->dctx, NULL, 0, zds->headerBuffer, h1Size));
+ {
+ size_t const h2Size = ZSTD_nextSrcSizeToDecompress(zds->dctx);
+ CHECK_F(ZSTD_decompressContinue(zds->dctx, NULL, 0, zds->headerBuffer + h1Size, h2Size));
+ }
+ }
+
+ zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN);
+ if (zds->fParams.windowSize > zds->maxWindowSize)
+ return ERROR(frameParameter_windowTooLarge);
+
+ /* Buffers are preallocated, but double check */
+ {
+ size_t const blockSize = MIN(zds->maxWindowSize, ZSTD_BLOCKSIZE_ABSOLUTEMAX);
+ size_t const neededOutSize = zds->maxWindowSize + blockSize + WILDCOPY_OVERLENGTH * 2;
+ if (zds->inBuffSize < blockSize) {
+ return ERROR(GENERIC);
+ }
+ if (zds->outBuffSize < neededOutSize) {
+ return ERROR(GENERIC);
+ }
+ zds->blockSize = blockSize;
+ }
+ zds->stage = zdss_read;
+ }
+ /* pass-through */
+
+ case zdss_read: {
+ size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds->dctx);
+ if (neededInSize == 0) { /* end of frame */
+ zds->stage = zdss_init;
+ someMoreWork = 0;
+ break;
+ }
+ if ((size_t)(iend - ip) >= neededInSize) { /* decode directly from src */
+ const int isSkipFrame = ZSTD_isSkipFrame(zds->dctx);
+ size_t const decodedSize = ZSTD_decompressContinue(zds->dctx, zds->outBuff + zds->outStart,
+ (isSkipFrame ? 0 : zds->outBuffSize - zds->outStart), ip, neededInSize);
+ if (ZSTD_isError(decodedSize))
+ return decodedSize;
+ ip += neededInSize;
+ if (!decodedSize && !isSkipFrame)
+ break; /* this was just a header */
+ zds->outEnd = zds->outStart + decodedSize;
+ zds->stage = zdss_flush;
+ break;
+ }
+ if (ip == iend) {
+ someMoreWork = 0;
+ break;
+ } /* no more input */
+ zds->stage = zdss_load;
+ /* pass-through */
+ }
+
+ case zdss_load: {
+ size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds->dctx);
+ size_t const toLoad = neededInSize - zds->inPos; /* should always be <= remaining space within inBuff */
+ size_t loadedSize;
+ if (toLoad > zds->inBuffSize - zds->inPos)
+ return ERROR(corruption_detected); /* should never happen */
+ loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, iend - ip);
+ ip += loadedSize;
+ zds->inPos += loadedSize;
+ if (loadedSize < toLoad) {
+ someMoreWork = 0;
+ break;
+ } /* not enough input, wait for more */
+
+ /* decode loaded input */
+ {
+ const int isSkipFrame = ZSTD_isSkipFrame(zds->dctx);
+ size_t const decodedSize = ZSTD_decompressContinue(zds->dctx, zds->outBuff + zds->outStart, zds->outBuffSize - zds->outStart,
+ zds->inBuff, neededInSize);
+ if (ZSTD_isError(decodedSize))
+ return decodedSize;
+ zds->inPos = 0; /* input is consumed */
+ if (!decodedSize && !isSkipFrame) {
+ zds->stage = zdss_read;
+ break;
+ } /* this was just a header */
+ zds->outEnd = zds->outStart + decodedSize;
+ zds->stage = zdss_flush;
+ /* pass-through */
+ }
+ }
+
+ case zdss_flush: {
+ size_t const toFlushSize = zds->outEnd - zds->outStart;
+ size_t const flushedSize = ZSTD_limitCopy(op, oend - op, zds->outBuff + zds->outStart, toFlushSize);
+ op += flushedSize;
+ zds->outStart += flushedSize;
+ if (flushedSize == toFlushSize) { /* flush completed */
+ zds->stage = zdss_read;
+ if (zds->outStart + zds->blockSize > zds->outBuffSize)
+ zds->outStart = zds->outEnd = 0;
+ break;
+ }
+ /* cannot complete flush */
+ someMoreWork = 0;
+ break;
+ }
+ default:
+ return ERROR(GENERIC); /* impossible */
+ }
+ }
+
+ /* result */
+ input->pos += (size_t)(ip - istart);
+ output->pos += (size_t)(op - ostart);
+ {
+ size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zds->dctx);
+ if (!nextSrcSizeHint) { /* frame fully decoded */
+ if (zds->outEnd == zds->outStart) { /* output fully flushed */
+ if (zds->hostageByte) {
+ if (input->pos >= input->size) {
+ zds->stage = zdss_read;
+ return 1;
+ } /* can't release hostage (not present) */
+ input->pos++; /* release hostage */
+ }
+ return 0;
+ }
+ if (!zds->hostageByte) { /* output not fully flushed; keep last byte as hostage; will be released when all output is flushed */
+ input->pos--; /* note : pos > 0, otherwise, impossible to finish reading last block */
+ zds->hostageByte = 1;
+ }
+ return 1;
+ }
+ nextSrcSizeHint += ZSTD_blockHeaderSize * (ZSTD_nextInputType(zds->dctx) == ZSTDnit_block); /* preload header of next block */
+ if (zds->inPos > nextSrcSizeHint)
+ return ERROR(GENERIC); /* should never happen */
+ nextSrcSizeHint -= zds->inPos; /* already loaded*/
+ return nextSrcSizeHint;
+ }
+}
+
+EXPORT_SYMBOL(ZSTD_DCtxWorkspaceBound);
+EXPORT_SYMBOL(ZSTD_initDCtx);
+EXPORT_SYMBOL(ZSTD_decompressDCtx);
+EXPORT_SYMBOL(ZSTD_decompress_usingDict);
+
+EXPORT_SYMBOL(ZSTD_DDictWorkspaceBound);
+EXPORT_SYMBOL(ZSTD_initDDict);
+EXPORT_SYMBOL(ZSTD_decompress_usingDDict);
+
+EXPORT_SYMBOL(ZSTD_DStreamWorkspaceBound);
+EXPORT_SYMBOL(ZSTD_initDStream);
+EXPORT_SYMBOL(ZSTD_initDStream_usingDDict);
+EXPORT_SYMBOL(ZSTD_resetDStream);
+EXPORT_SYMBOL(ZSTD_decompressStream);
+EXPORT_SYMBOL(ZSTD_DStreamInSize);
+EXPORT_SYMBOL(ZSTD_DStreamOutSize);
+
+EXPORT_SYMBOL(ZSTD_findFrameCompressedSize);
+EXPORT_SYMBOL(ZSTD_getFrameContentSize);
+EXPORT_SYMBOL(ZSTD_findDecompressedSize);
+
+EXPORT_SYMBOL(ZSTD_isFrame);
+EXPORT_SYMBOL(ZSTD_getDictID_fromDict);
+EXPORT_SYMBOL(ZSTD_getDictID_fromDDict);
+EXPORT_SYMBOL(ZSTD_getDictID_fromFrame);
+
+EXPORT_SYMBOL(ZSTD_getFrameParams);
+EXPORT_SYMBOL(ZSTD_decompressBegin);
+EXPORT_SYMBOL(ZSTD_decompressBegin_usingDict);
+EXPORT_SYMBOL(ZSTD_copyDCtx);
+EXPORT_SYMBOL(ZSTD_nextSrcSizeToDecompress);
+EXPORT_SYMBOL(ZSTD_decompressContinue);
+EXPORT_SYMBOL(ZSTD_nextInputType);
+
+EXPORT_SYMBOL(ZSTD_decompressBlock);
+EXPORT_SYMBOL(ZSTD_insertBlock);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Zstd Decompressor");
diff --git a/lib/zstd/entropy_common.c b/lib/zstd/entropy_common.c
new file mode 100644
index 000000000000..2b0a643c32c4
--- /dev/null
+++ b/lib/zstd/entropy_common.c
@@ -0,0 +1,243 @@
+/*
+ * Common functions of New Generation Entropy library
+ * Copyright (C) 2016, Yann Collet.
+ *
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+
+/* *************************************
+* Dependencies
+***************************************/
+#include "error_private.h" /* ERR_*, ERROR */
+#include "fse.h"
+#include "huf.h"
+#include "mem.h"
+
+/*=== Version ===*/
+unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; }
+
+/*=== Error Management ===*/
+unsigned FSE_isError(size_t code) { return ERR_isError(code); }
+
+unsigned HUF_isError(size_t code) { return ERR_isError(code); }
+
+/*-**************************************************************
+* FSE NCount encoding-decoding
+****************************************************************/
+size_t FSE_readNCount(short *normalizedCounter, unsigned *maxSVPtr, unsigned *tableLogPtr, const void *headerBuffer, size_t hbSize)
+{
+ const BYTE *const istart = (const BYTE *)headerBuffer;
+ const BYTE *const iend = istart + hbSize;
+ const BYTE *ip = istart;
+ int nbBits;
+ int remaining;
+ int threshold;
+ U32 bitStream;
+ int bitCount;
+ unsigned charnum = 0;
+ int previous0 = 0;
+
+ if (hbSize < 4)
+ return ERROR(srcSize_wrong);
+ bitStream = ZSTD_readLE32(ip);
+ nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG; /* extract tableLog */
+ if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX)
+ return ERROR(tableLog_tooLarge);
+ bitStream >>= 4;
+ bitCount = 4;
+ *tableLogPtr = nbBits;
+ remaining = (1 << nbBits) + 1;
+ threshold = 1 << nbBits;
+ nbBits++;
+
+ while ((remaining > 1) & (charnum <= *maxSVPtr)) {
+ if (previous0) {
+ unsigned n0 = charnum;
+ while ((bitStream & 0xFFFF) == 0xFFFF) {
+ n0 += 24;
+ if (ip < iend - 5) {
+ ip += 2;
+ bitStream = ZSTD_readLE32(ip) >> bitCount;
+ } else {
+ bitStream >>= 16;
+ bitCount += 16;
+ }
+ }
+ while ((bitStream & 3) == 3) {
+ n0 += 3;
+ bitStream >>= 2;
+ bitCount += 2;
+ }
+ n0 += bitStream & 3;
+ bitCount += 2;
+ if (n0 > *maxSVPtr)
+ return ERROR(maxSymbolValue_tooSmall);
+ while (charnum < n0)
+ normalizedCounter[charnum++] = 0;
+ if ((ip <= iend - 7) || (ip + (bitCount >> 3) <= iend - 4)) {
+ ip += bitCount >> 3;
+ bitCount &= 7;
+ bitStream = ZSTD_readLE32(ip) >> bitCount;
+ } else {
+ bitStream >>= 2;
+ }
+ }
+ {
+ int const max = (2 * threshold - 1) - remaining;
+ int count;
+
+ if ((bitStream & (threshold - 1)) < (U32)max) {
+ count = bitStream & (threshold - 1);
+ bitCount += nbBits - 1;
+ } else {
+ count = bitStream & (2 * threshold - 1);
+ if (count >= threshold)
+ count -= max;
+ bitCount += nbBits;
+ }
+
+ count--; /* extra accuracy */
+ remaining -= count < 0 ? -count : count; /* -1 means +1 */
+ normalizedCounter[charnum++] = (short)count;
+ previous0 = !count;
+ while (remaining < threshold) {
+ nbBits--;
+ threshold >>= 1;
+ }
+
+ if ((ip <= iend - 7) || (ip + (bitCount >> 3) <= iend - 4)) {
+ ip += bitCount >> 3;
+ bitCount &= 7;
+ } else {
+ bitCount -= (int)(8 * (iend - 4 - ip));
+ ip = iend - 4;
+ }
+ bitStream = ZSTD_readLE32(ip) >> (bitCount & 31);
+ }
+ } /* while ((remaining>1) & (charnum<=*maxSVPtr)) */
+ if (remaining != 1)
+ return ERROR(corruption_detected);
+ if (bitCount > 32)
+ return ERROR(corruption_detected);
+ *maxSVPtr = charnum - 1;
+
+ ip += (bitCount + 7) >> 3;
+ return ip - istart;
+}
+
+/*! HUF_readStats() :
+ Read compact Huffman tree, saved by HUF_writeCTable().
+ `huffWeight` is destination buffer.
+ `rankStats` is assumed to be a table of at least HUF_TABLELOG_MAX U32.
+ @return : size read from `src` , or an error Code .
+ Note : Needed by HUF_readCTable() and HUF_readDTableX?() .
+*/
+size_t HUF_readStats_wksp(BYTE *huffWeight, size_t hwSize, U32 *rankStats, U32 *nbSymbolsPtr, U32 *tableLogPtr, const void *src, size_t srcSize, void *workspace, size_t workspaceSize)
+{
+ U32 weightTotal;
+ const BYTE *ip = (const BYTE *)src;
+ size_t iSize;
+ size_t oSize;
+
+ if (!srcSize)
+ return ERROR(srcSize_wrong);
+ iSize = ip[0];
+ /* memset(huffWeight, 0, hwSize); */ /* is not necessary, even though some analyzer complain ... */
+
+ if (iSize >= 128) { /* special header */
+ oSize = iSize - 127;
+ iSize = ((oSize + 1) / 2);
+ if (iSize + 1 > srcSize)
+ return ERROR(srcSize_wrong);
+ if (oSize >= hwSize)
+ return ERROR(corruption_detected);
+ ip += 1;
+ {
+ U32 n;
+ for (n = 0; n < oSize; n += 2) {
+ huffWeight[n] = ip[n / 2] >> 4;
+ huffWeight[n + 1] = ip[n / 2] & 15;
+ }
+ }
+ } else { /* header compressed with FSE (normal case) */
+ if (iSize + 1 > srcSize)
+ return ERROR(srcSize_wrong);
+ oSize = FSE_decompress_wksp(huffWeight, hwSize - 1, ip + 1, iSize, 6, workspace, workspaceSize); /* max (hwSize-1) values decoded, as last one is implied */
+ if (FSE_isError(oSize))
+ return oSize;
+ }
+
+ /* collect weight stats */
+ memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32));
+ weightTotal = 0;
+ {
+ U32 n;
+ for (n = 0; n < oSize; n++) {
+ if (huffWeight[n] >= HUF_TABLELOG_MAX)
+ return ERROR(corruption_detected);
+ rankStats[huffWeight[n]]++;
+ weightTotal += (1 << huffWeight[n]) >> 1;
+ }
+ }
+ if (weightTotal == 0)
+ return ERROR(corruption_detected);
+
+ /* get last non-null symbol weight (implied, total must be 2^n) */
+ {
+ U32 const tableLog = BIT_highbit32(weightTotal) + 1;
+ if (tableLog > HUF_TABLELOG_MAX)
+ return ERROR(corruption_detected);
+ *tableLogPtr = tableLog;
+ /* determine last weight */
+ {
+ U32 const total = 1 << tableLog;
+ U32 const rest = total - weightTotal;
+ U32 const verif = 1 << BIT_highbit32(rest);
+ U32 const lastWeight = BIT_highbit32(rest) + 1;
+ if (verif != rest)
+ return ERROR(corruption_detected); /* last value must be a clean power of 2 */
+ huffWeight[oSize] = (BYTE)lastWeight;
+ rankStats[lastWeight]++;
+ }
+ }
+
+ /* check tree construction validity */
+ if ((rankStats[1] < 2) || (rankStats[1] & 1))
+ return ERROR(corruption_detected); /* by construction : at least 2 elts of rank 1, must be even */
+
+ /* results */
+ *nbSymbolsPtr = (U32)(oSize + 1);
+ return iSize + 1;
+}
diff --git a/lib/zstd/error_private.h b/lib/zstd/error_private.h
new file mode 100644
index 000000000000..1a60b31f706c
--- /dev/null
+++ b/lib/zstd/error_private.h
@@ -0,0 +1,53 @@
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of https://github.com/facebook/zstd.
+ * An additional grant of patent rights can be found in the PATENTS file in the
+ * same directory.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ */
+
+/* Note : this module is expected to remain private, do not expose it */
+
+#ifndef ERROR_H_MODULE
+#define ERROR_H_MODULE
+
+/* ****************************************
+* Dependencies
+******************************************/
+#include <linux/types.h> /* size_t */
+#include <linux/zstd.h> /* enum list */
+
+/* ****************************************
+* Compiler-specific
+******************************************/
+#define ERR_STATIC static __attribute__((unused))
+
+/*-****************************************
+* Customization (error_public.h)
+******************************************/
+typedef ZSTD_ErrorCode ERR_enum;
+#define PREFIX(name) ZSTD_error_##name
+
+/*-****************************************
+* Error codes handling
+******************************************/
+#define ERROR(name) ((size_t)-PREFIX(name))
+
+ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
+
+ERR_STATIC ERR_enum ERR_getErrorCode(size_t code)
+{
+ if (!ERR_isError(code))
+ return (ERR_enum)0;
+ return (ERR_enum)(0 - code);
+}
+
+#endif /* ERROR_H_MODULE */
diff --git a/lib/zstd/fse.h b/lib/zstd/fse.h
new file mode 100644
index 000000000000..7460ab04b191
--- /dev/null
+++ b/lib/zstd/fse.h
@@ -0,0 +1,575 @@
+/*
+ * FSE : Finite State Entropy codec
+ * Public Prototypes declaration
+ * Copyright (C) 2013-2016, Yann Collet.
+ *
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+#ifndef FSE_H
+#define FSE_H
+
+/*-*****************************************
+* Dependencies
+******************************************/
+#include <linux/types.h> /* size_t, ptrdiff_t */
+
+/*-*****************************************
+* FSE_PUBLIC_API : control library symbols visibility
+******************************************/
+#define FSE_PUBLIC_API
+
+/*------ Version ------*/
+#define FSE_VERSION_MAJOR 0
+#define FSE_VERSION_MINOR 9
+#define FSE_VERSION_RELEASE 0
+
+#define FSE_LIB_VERSION FSE_VERSION_MAJOR.FSE_VERSION_MINOR.FSE_VERSION_RELEASE
+#define FSE_QUOTE(str) #str
+#define FSE_EXPAND_AND_QUOTE(str) FSE_QUOTE(str)
+#define FSE_VERSION_STRING FSE_EXPAND_AND_QUOTE(FSE_LIB_VERSION)
+
+#define FSE_VERSION_NUMBER (FSE_VERSION_MAJOR * 100 * 100 + FSE_VERSION_MINOR * 100 + FSE_VERSION_RELEASE)
+FSE_PUBLIC_API unsigned FSE_versionNumber(void); /**< library version number; to be used when checking dll version */
+
+/*-*****************************************
+* Tool functions
+******************************************/
+FSE_PUBLIC_API size_t FSE_compressBound(size_t size); /* maximum compressed size */
+
+/* Error Management */
+FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return value is an error code */
+
+/*-*****************************************
+* FSE detailed API
+******************************************/
+/*!
+FSE_compress() does the following:
+1. count symbol occurrence from source[] into table count[]
+2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
+3. save normalized counters to memory buffer using writeNCount()
+4. build encoding table 'CTable' from normalized counters
+5. encode the data stream using encoding table 'CTable'
+
+FSE_decompress() does the following:
+1. read normalized counters with readNCount()
+2. build decoding table 'DTable' from normalized counters
+3. decode the data stream using decoding table 'DTable'
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and provide normalized distribution using external method.
+*/
+
+/* *** COMPRESSION *** */
+/*! FSE_optimalTableLog():
+ dynamically downsize 'tableLog' when conditions are met.
+ It saves CPU time, by using smaller tables, while preserving or even improving compression ratio.
+ @return : recommended tableLog (necessarily <= 'maxTableLog') */
+FSE_PUBLIC_API unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+
+/*! FSE_normalizeCount():
+ normalize counts so that sum(count[]) == Power_of_2 (2^tableLog)
+ 'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1).
+ @return : tableLog,
+ or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_normalizeCount(short *normalizedCounter, unsigned tableLog, const unsigned *count, size_t srcSize, unsigned maxSymbolValue);
+
+/*! FSE_NCountWriteBound():
+ Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'.
+ Typically useful for allocation purpose. */
+FSE_PUBLIC_API size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_writeNCount():
+ Compactly save 'normalizedCounter' into 'buffer'.
+ @return : size of the compressed table,
+ or an errorCode, which can be tested using FSE_isError(). */
+FSE_PUBLIC_API size_t FSE_writeNCount(void *buffer, size_t bufferSize, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*! Constructor and Destructor of FSE_CTable.
+ Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */
+typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */
+
+/*! FSE_compress_usingCTable():
+ Compress `src` using `ct` into `dst` which must be already allocated.
+ @return : size of compressed data (<= `dstCapacity`),
+ or 0 if compressed data could not fit into `dst`,
+ or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_compress_usingCTable(void *dst, size_t dstCapacity, const void *src, size_t srcSize, const FSE_CTable *ct);
+
+/*!
+Tutorial :
+----------
+The first step is to count all symbols. FSE_count() does this job very fast.
+Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have 'maxSymbolValuePtr[0]+1' cells.
+'src' is a table of bytes of size 'srcSize'. All values within 'src' MUST be <= maxSymbolValuePtr[0]
+maxSymbolValuePtr[0] will be updated, with its real value (necessarily <= original value)
+FSE_count() will return the number of occurrence of the most frequent symbol.
+This can be used to know if there is a single symbol within 'src', and to quickly evaluate its compressibility.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
+
+The next step is to normalize the frequencies.
+FSE_normalizeCount() will ensure that sum of frequencies is == 2 ^'tableLog'.
+It also guarantees a minimum of 1 to any Symbol with frequency >= 1.
+You can use 'tableLog'==0 to mean "use default tableLog value".
+If you are unsure of which tableLog value to use, you can ask FSE_optimalTableLog(),
+which will provide the optimal valid tableLog given sourceSize, maxSymbolValue, and a user-defined maximum (0 means "default").
+
+The result of FSE_normalizeCount() will be saved into a table,
+called 'normalizedCounter', which is a table of signed short.
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValue+1' cells.
+The return value is tableLog if everything proceeded as expected.
+It is 0 if there is a single symbol within distribution.
+If there is an error (ex: invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()).
+
+'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeNCount().
+'buffer' must be already allocated.
+For guaranteed success, buffer size must be at least FSE_headerBound().
+The result of the function is the number of bytes written into 'buffer'.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError(); ex : buffer size too small).
+
+'normalizedCounter' can then be used to create the compression table 'CTable'.
+The space required by 'CTable' must be already allocated, using FSE_createCTable().
+You can then use FSE_buildCTable() to fill 'CTable'.
+If there is an error, both functions will return an ErrorCode (which can be tested using FSE_isError()).
+
+'CTable' can then be used to compress 'src', with FSE_compress_usingCTable().
+Similar to FSE_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize'
+The function returns the size of compressed data (without header), necessarily <= `dstCapacity`.
+If it returns '0', compressed data could not fit into 'dst'.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
+*/
+
+/* *** DECOMPRESSION *** */
+
+/*! FSE_readNCount():
+ Read compactly saved 'normalizedCounter' from 'rBuffer'.
+ @return : size read from 'rBuffer',
+ or an errorCode, which can be tested using FSE_isError().
+ maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
+FSE_PUBLIC_API size_t FSE_readNCount(short *normalizedCounter, unsigned *maxSymbolValuePtr, unsigned *tableLogPtr, const void *rBuffer, size_t rBuffSize);
+
+/*! Constructor and Destructor of FSE_DTable.
+ Note that its size depends on 'tableLog' */
+typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */
+
+/*! FSE_buildDTable():
+ Builds 'dt', which must be already allocated, using FSE_createDTable().
+ return : 0, or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable *dt, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workspace, size_t workspaceSize);
+
+/*! FSE_decompress_usingDTable():
+ Decompress compressed source `cSrc` of size `cSrcSize` using `dt`
+ into `dst` which must be already allocated.
+ @return : size of regenerated data (necessarily <= `dstCapacity`),
+ or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void *dst, size_t dstCapacity, const void *cSrc, size_t cSrcSize, const FSE_DTable *dt);
+
+/*!
+Tutorial :
+----------
+(Note : these functions only decompress FSE-compressed blocks.
+ If block is uncompressed, use memcpy() instead
+ If block is a single repeated byte, use memset() instead )
+
+The first step is to obtain the normalized frequencies of symbols.
+This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount().
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short.
+In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
+or size the table to handle worst case situations (typically 256).
+FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'.
+The result of FSE_readNCount() is the number of bytes read from 'rBuffer'.
+Note that 'rBufferSize' must be at least 4 bytes, even if useful information is less than that.
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'.
+This is performed by the function FSE_buildDTable().
+The space required by 'FSE_DTable' must be already allocated using FSE_createDTable().
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+`FSE_DTable` can then be used to decompress `cSrc`, with FSE_decompress_usingDTable().
+`cSrcSize` must be strictly correct, otherwise decompression will fail.
+FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=`dstCapacity`).
+If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small)
+*/
+
+/* *** Dependency *** */
+#include "bitstream.h"
+
+/* *****************************************
+* Static allocation
+*******************************************/
+/* FSE buffer bounds */
+#define FSE_NCOUNTBOUND 512
+#define FSE_BLOCKBOUND(size) (size + (size >> 7))
+#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size)) /* Macro version, useful for static allocation */
+
+/* It is possible to statically allocate FSE CTable/DTable as a table of FSE_CTable/FSE_DTable using below macros */
+#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) (1 + (1 << (maxTableLog - 1)) + ((maxSymbolValue + 1) * 2))
+#define FSE_DTABLE_SIZE_U32(maxTableLog) (1 + (1 << maxTableLog))
+
+/* *****************************************
+* FSE advanced API
+*******************************************/
+/* FSE_count_wksp() :
+ * Same as FSE_count(), but using an externally provided scratch buffer.
+ * `workSpace` size must be table of >= `1024` unsigned
+ */
+size_t FSE_count_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *source, size_t sourceSize, unsigned *workSpace);
+
+/* FSE_countFast_wksp() :
+ * Same as FSE_countFast(), but using an externally provided scratch buffer.
+ * `workSpace` must be a table of minimum `1024` unsigned
+ */
+size_t FSE_countFast_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *src, size_t srcSize, unsigned *workSpace);
+
+/*! FSE_count_simple
+ * Same as FSE_countFast(), but does not use any additional memory (not even on stack).
+ * This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr` (presuming it's also the size of `count`).
+*/
+size_t FSE_count_simple(unsigned *count, unsigned *maxSymbolValuePtr, const void *src, size_t srcSize);
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
+/**< same as FSE_optimalTableLog(), which used `minus==2` */
+
+size_t FSE_buildCTable_raw(FSE_CTable *ct, unsigned nbBits);
+/**< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */
+
+size_t FSE_buildCTable_rle(FSE_CTable *ct, unsigned char symbolValue);
+/**< build a fake FSE_CTable, designed to compress always the same symbolValue */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * `wkspSize` must be >= `(1<<tableLog)`.
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable *ct, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workSpace, size_t wkspSize);
+
+size_t FSE_buildDTable_raw(FSE_DTable *dt, unsigned nbBits);
+/**< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */
+
+size_t FSE_buildDTable_rle(FSE_DTable *dt, unsigned char symbolValue);
+/**< build a fake FSE_DTable, designed to always generate the same symbolValue */
+
+size_t FSE_decompress_wksp(void *dst, size_t dstCapacity, const void *cSrc, size_t cSrcSize, unsigned maxLog, void *workspace, size_t workspaceSize);
+/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DTABLE_SIZE_U32(maxLog)` */
+
+/* *****************************************
+* FSE symbol compression API
+*******************************************/
+/*!
+ This API consists of small unitary functions, which highly benefit from being inlined.
+ Hence their body are included in next section.
+*/
+typedef struct {
+ ptrdiff_t value;
+ const void *stateTable;
+ const void *symbolTT;
+ unsigned stateLog;
+} FSE_CState_t;
+
+static void FSE_initCState(FSE_CState_t *CStatePtr, const FSE_CTable *ct);
+
+static void FSE_encodeSymbol(BIT_CStream_t *bitC, FSE_CState_t *CStatePtr, unsigned symbol);
+
+static void FSE_flushCState(BIT_CStream_t *bitC, const FSE_CState_t *CStatePtr);
+
+/**<
+These functions are inner components of FSE_compress_usingCTable().
+They allow the creation of custom streams, mixing multiple tables and bit sources.
+
+A key property to keep in mind is that encoding and decoding are done **in reverse direction**.
+So the first symbol you will encode is the last you will decode, like a LIFO stack.
+
+You will need a few variables to track your CStream. They are :
+
+FSE_CTable ct; // Provided by FSE_buildCTable()
+BIT_CStream_t bitStream; // bitStream tracking structure
+FSE_CState_t state; // State tracking structure (can have several)
+
+
+The first thing to do is to init bitStream and state.
+ size_t errorCode = BIT_initCStream(&bitStream, dstBuffer, maxDstSize);
+ FSE_initCState(&state, ct);
+
+Note that BIT_initCStream() can produce an error code, so its result should be tested, using FSE_isError();
+You can then encode your input data, byte after byte.
+FSE_encodeSymbol() outputs a maximum of 'tableLog' bits at a time.
+Remember decoding will be done in reverse direction.
+ FSE_encodeByte(&bitStream, &state, symbol);
+
+At any time, you can also add any bit sequence.
+Note : maximum allowed nbBits is 25, for compatibility with 32-bits decoders
+ BIT_addBits(&bitStream, bitField, nbBits);
+
+The above methods don't commit data to memory, they just store it into local register, for speed.
+Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
+Writing data to memory is a manual operation, performed by the flushBits function.
+ BIT_flushBits(&bitStream);
+
+Your last FSE encoding operation shall be to flush your last state value(s).
+ FSE_flushState(&bitStream, &state);
+
+Finally, you must close the bitStream.
+The function returns the size of CStream in bytes.
+If data couldn't fit into dstBuffer, it will return a 0 ( == not compressible)
+If there is an error, it returns an errorCode (which can be tested using FSE_isError()).
+ size_t size = BIT_closeCStream(&bitStream);
+*/
+
+/* *****************************************
+* FSE symbol decompression API
+*******************************************/
+typedef struct {
+ size_t state;
+ const void *table; /* precise table may vary, depending on U16 */
+} FSE_DState_t;
+
+static void FSE_initDState(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD, const FSE_DTable *dt);
+
+static unsigned char FSE_decodeSymbol(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD);
+
+static unsigned FSE_endOfDState(const FSE_DState_t *DStatePtr);
+
+/**<
+Let's now decompose FSE_decompress_usingDTable() into its unitary components.
+You will decode FSE-encoded symbols from the bitStream,
+and also any other bitFields you put in, **in reverse order**.
+
+You will need a few variables to track your bitStream. They are :
+
+BIT_DStream_t DStream; // Stream context
+FSE_DState_t DState; // State context. Multiple ones are possible
+FSE_DTable* DTablePtr; // Decoding table, provided by FSE_buildDTable()
+
+The first thing to do is to init the bitStream.
+ errorCode = BIT_initDStream(&DStream, srcBuffer, srcSize);
+
+You should then retrieve your initial state(s)
+(in reverse flushing order if you have several ones) :
+ errorCode = FSE_initDState(&DState, &DStream, DTablePtr);
+
+You can then decode your data, symbol after symbol.
+For information the maximum number of bits read by FSE_decodeSymbol() is 'tableLog'.
+Keep in mind that symbols are decoded in reverse order, like a LIFO stack (last in, first out).
+ unsigned char symbol = FSE_decodeSymbol(&DState, &DStream);
+
+You can retrieve any bitfield you eventually stored into the bitStream (in reverse order)
+Note : maximum allowed nbBits is 25, for 32-bits compatibility
+ size_t bitField = BIT_readBits(&DStream, nbBits);
+
+All above operations only read from local register (which size depends on size_t).
+Refueling the register from memory is manually performed by the reload method.
+ endSignal = FSE_reloadDStream(&DStream);
+
+BIT_reloadDStream() result tells if there is still some more data to read from DStream.
+BIT_DStream_unfinished : there is still some data left into the DStream.
+BIT_DStream_endOfBuffer : Dstream reached end of buffer. Its container may no longer be completely filled.
+BIT_DStream_completed : Dstream reached its exact end, corresponding in general to decompression completed.
+BIT_DStream_tooFar : Dstream went too far. Decompression result is corrupted.
+
+When reaching end of buffer (BIT_DStream_endOfBuffer), progress slowly, notably if you decode multiple symbols per loop,
+to properly detect the exact end of stream.
+After each decoded symbol, check if DStream is fully consumed using this simple test :
+ BIT_reloadDStream(&DStream) >= BIT_DStream_completed
+
+When it's done, verify decompression is fully completed, by checking both DStream and the relevant states.
+Checking if DStream has reached its end is performed by :
+ BIT_endOfDStream(&DStream);
+Check also the states. There might be some symbols left there, if some high probability ones (>50%) are possible.
+ FSE_endOfDState(&DState);
+*/
+
+/* *****************************************
+* FSE unsafe API
+*******************************************/
+static unsigned char FSE_decodeSymbolFast(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+/* *****************************************
+* Implementation of inlined functions
+*******************************************/
+typedef struct {
+ int deltaFindState;
+ U32 deltaNbBits;
+} FSE_symbolCompressionTransform; /* total 8 bytes */
+
+ZSTD_STATIC void FSE_initCState(FSE_CState_t *statePtr, const FSE_CTable *ct)
+{
+ const void *ptr = ct;
+ const U16 *u16ptr = (const U16 *)ptr;
+ const U32 tableLog = ZSTD_read16(ptr);
+ statePtr->value = (ptrdiff_t)1 << tableLog;
+ statePtr->stateTable = u16ptr + 2;
+ statePtr->symbolTT = ((const U32 *)ct + 1 + (tableLog ? (1 << (tableLog - 1)) : 1));
+ statePtr->stateLog = tableLog;
+}
+
+/*! FSE_initCState2() :
+* Same as FSE_initCState(), but the first symbol to include (which will be the last to be read)
+* uses the smallest state value possible, saving the cost of this symbol */
+ZSTD_STATIC void FSE_initCState2(FSE_CState_t *statePtr, const FSE_CTable *ct, U32 symbol)
+{
+ FSE_initCState(statePtr, ct);
+ {
+ const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform *)(statePtr->symbolTT))[symbol];
+ const U16 *stateTable = (const U16 *)(statePtr->stateTable);
+ U32 nbBitsOut = (U32)((symbolTT.deltaNbBits + (1 << 15)) >> 16);
+ statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits;
+ statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+ }
+}
+
+ZSTD_STATIC void FSE_encodeSymbol(BIT_CStream_t *bitC, FSE_CState_t *statePtr, U32 symbol)
+{
+ const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform *)(statePtr->symbolTT))[symbol];
+ const U16 *const stateTable = (const U16 *)(statePtr->stateTable);
+ U32 nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
+ BIT_addBits(bitC, statePtr->value, nbBitsOut);
+ statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+}
+
+ZSTD_STATIC void FSE_flushCState(BIT_CStream_t *bitC, const FSE_CState_t *statePtr)
+{
+ BIT_addBits(bitC, statePtr->value, statePtr->stateLog);
+ BIT_flushBits(bitC);
+}
+
+/* ====== Decompression ====== */
+
+typedef struct {
+ U16 tableLog;
+ U16 fastMode;
+} FSE_DTableHeader; /* sizeof U32 */
+
+typedef struct {
+ unsigned short newState;
+ unsigned char symbol;
+ unsigned char nbBits;
+} FSE_decode_t; /* size == U32 */
+
+ZSTD_STATIC void FSE_initDState(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD, const FSE_DTable *dt)
+{
+ const void *ptr = dt;
+ const FSE_DTableHeader *const DTableH = (const FSE_DTableHeader *)ptr;
+ DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);
+ BIT_reloadDStream(bitD);
+ DStatePtr->table = dt + 1;
+}
+
+ZSTD_STATIC BYTE FSE_peekSymbol(const FSE_DState_t *DStatePtr)
+{
+ FSE_decode_t const DInfo = ((const FSE_decode_t *)(DStatePtr->table))[DStatePtr->state];
+ return DInfo.symbol;
+}
+
+ZSTD_STATIC void FSE_updateState(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD)
+{
+ FSE_decode_t const DInfo = ((const FSE_decode_t *)(DStatePtr->table))[DStatePtr->state];
+ U32 const nbBits = DInfo.nbBits;
+ size_t const lowBits = BIT_readBits(bitD, nbBits);
+ DStatePtr->state = DInfo.newState + lowBits;
+}
+
+ZSTD_STATIC BYTE FSE_decodeSymbol(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD)
+{
+ FSE_decode_t const DInfo = ((const FSE_decode_t *)(DStatePtr->table))[DStatePtr->state];
+ U32 const nbBits = DInfo.nbBits;
+ BYTE const symbol = DInfo.symbol;
+ size_t const lowBits = BIT_readBits(bitD, nbBits);
+
+ DStatePtr->state = DInfo.newState + lowBits;
+ return symbol;
+}
+
+/*! FSE_decodeSymbolFast() :
+ unsafe, only works if no symbol has a probability > 50% */
+ZSTD_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD)
+{
+ FSE_decode_t const DInfo = ((const FSE_decode_t *)(DStatePtr->table))[DStatePtr->state];
+ U32 const nbBits = DInfo.nbBits;
+ BYTE const symbol = DInfo.symbol;
+ size_t const lowBits = BIT_readBitsFast(bitD, nbBits);
+
+ DStatePtr->state = DInfo.newState + lowBits;
+ return symbol;
+}
+
+ZSTD_STATIC unsigned FSE_endOfDState(const FSE_DState_t *DStatePtr) { return DStatePtr->state == 0; }
+
+/* **************************************************************
+* Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+* Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+* Increasing memory usage improves compression ratio
+* Reduced memory usage can improve speed, due to cache effect
+* Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#ifndef FSE_MAX_MEMORY_USAGE
+#define FSE_MAX_MEMORY_USAGE 14
+#endif
+#ifndef FSE_DEFAULT_MEMORY_USAGE
+#define FSE_DEFAULT_MEMORY_USAGE 13
+#endif
+
+/*!FSE_MAX_SYMBOL_VALUE :
+* Maximum symbol value authorized.
+* Required for proper stack allocation */
+#ifndef FSE_MAX_SYMBOL_VALUE
+#define FSE_MAX_SYMBOL_VALUE 255
+#endif
+
+/* **************************************************************
+* template functions type & suffix
+****************************************************************/
+#define FSE_FUNCTION_TYPE BYTE
+#define FSE_FUNCTION_EXTENSION
+#define FSE_DECODE_TYPE FSE_decode_t
+
+/* ***************************************************************
+* Constants
+*****************************************************************/
+#define FSE_MAX_TABLELOG (FSE_MAX_MEMORY_USAGE - 2)
+#define FSE_MAX_TABLESIZE (1U << FSE_MAX_TABLELOG)
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE - 1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE - 2)
+#define FSE_MIN_TABLELOG 5
+
+#define FSE_TABLELOG_ABSOLUTE_MAX 15
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+#error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+#define FSE_TABLESTEP(tableSize) ((tableSize >> 1) + (tableSize >> 3) + 3)
+
+#endif /* FSE_H */
diff --git a/lib/zstd/fse_compress.c b/lib/zstd/fse_compress.c
new file mode 100644
index 000000000000..ef3d1741d532
--- /dev/null
+++ b/lib/zstd/fse_compress.c
@@ -0,0 +1,795 @@
+/*
+ * FSE : Finite State Entropy encoder
+ * Copyright (C) 2013-2015, Yann Collet.
+ *
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+
+/* **************************************************************
+* Compiler specifics
+****************************************************************/
+#define FORCE_INLINE static __always_inline
+
+/* **************************************************************
+* Includes
+****************************************************************/
+#include "bitstream.h"
+#include "fse.h"
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/math64.h>
+#include <linux/string.h> /* memcpy, memset */
+
+/* **************************************************************
+* Error Management
+****************************************************************/
+#define FSE_STATIC_ASSERT(c) \
+ { \
+ enum { FSE_static_assert = 1 / (int)(!!(c)) }; \
+ } /* use only *after* variable declarations */
+
+/* **************************************************************
+* Templates
+****************************************************************/
+/*
+ designed to be included
+ for type-specific functions (template emulation in C)
+ Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X, Y) X##Y
+#define FSE_FUNCTION_NAME(X, Y) FSE_CAT(X, Y)
+#define FSE_TYPE_NAME(X, Y) FSE_CAT(X, Y)
+
+/* Function templates */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * wkspSize should be sized to handle worst case situation, which is `1<<max_tableLog * sizeof(FSE_FUNCTION_TYPE)`
+ * workSpace must also be properly aligned with FSE_FUNCTION_TYPE requirements
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable *ct, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workspace, size_t workspaceSize)
+{
+ U32 const tableSize = 1 << tableLog;
+ U32 const tableMask = tableSize - 1;
+ void *const ptr = ct;
+ U16 *const tableU16 = ((U16 *)ptr) + 2;
+ void *const FSCT = ((U32 *)ptr) + 1 /* header */ + (tableLog ? tableSize >> 1 : 1);
+ FSE_symbolCompressionTransform *const symbolTT = (FSE_symbolCompressionTransform *)(FSCT);
+ U32 const step = FSE_TABLESTEP(tableSize);
+ U32 highThreshold = tableSize - 1;
+
+ U32 *cumul;
+ FSE_FUNCTION_TYPE *tableSymbol;
+ size_t spaceUsed32 = 0;
+
+ cumul = (U32 *)workspace + spaceUsed32;
+ spaceUsed32 += FSE_MAX_SYMBOL_VALUE + 2;
+ tableSymbol = (FSE_FUNCTION_TYPE *)((U32 *)workspace + spaceUsed32);
+ spaceUsed32 += ALIGN(sizeof(FSE_FUNCTION_TYPE) * ((size_t)1 << tableLog), sizeof(U32)) >> 2;
+
+ if ((spaceUsed32 << 2) > workspaceSize)
+ return ERROR(tableLog_tooLarge);
+ workspace = (U32 *)workspace + spaceUsed32;
+ workspaceSize -= (spaceUsed32 << 2);
+
+ /* CTable header */
+ tableU16[-2] = (U16)tableLog;
+ tableU16[-1] = (U16)maxSymbolValue;
+
+ /* For explanations on how to distribute symbol values over the table :
+ * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+
+ /* symbol start positions */
+ {
+ U32 u;
+ cumul[0] = 0;
+ for (u = 1; u <= maxSymbolValue + 1; u++) {
+ if (normalizedCounter[u - 1] == -1) { /* Low proba symbol */
+ cumul[u] = cumul[u - 1] + 1;
+ tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u - 1);
+ } else {
+ cumul[u] = cumul[u - 1] + normalizedCounter[u - 1];
+ }
+ }
+ cumul[maxSymbolValue + 1] = tableSize + 1;
+ }
+
+ /* Spread symbols */
+ {
+ U32 position = 0;
+ U32 symbol;
+ for (symbol = 0; symbol <= maxSymbolValue; symbol++) {
+ int nbOccurences;
+ for (nbOccurences = 0; nbOccurences < normalizedCounter[symbol]; nbOccurences++) {
+ tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
+ position = (position + step) & tableMask;
+ while (position > highThreshold)
+ position = (position + step) & tableMask; /* Low proba area */
+ }
+ }
+
+ if (position != 0)
+ return ERROR(GENERIC); /* Must have gone through all positions */
+ }
+
+ /* Build table */
+ {
+ U32 u;
+ for (u = 0; u < tableSize; u++) {
+ FSE_FUNCTION_TYPE s = tableSymbol[u]; /* note : static analyzer may not understand tableSymbol is properly initialized */
+ tableU16[cumul[s]++] = (U16)(tableSize + u); /* TableU16 : sorted by symbol order; gives next state value */
+ }
+ }
+
+ /* Build Symbol Transformation Table */
+ {
+ unsigned total = 0;
+ unsigned s;
+ for (s = 0; s <= maxSymbolValue; s++) {
+ switch (normalizedCounter[s]) {
+ case 0: break;
+
+ case -1:
+ case 1:
+ symbolTT[s].deltaNbBits = (tableLog << 16) - (1 << tableLog);
+ symbolTT[s].deltaFindState = total - 1;
+ total++;
+ break;
+ default: {
+ U32 const maxBitsOut = tableLog - BIT_highbit32(normalizedCounter[s] - 1);
+ U32 const minStatePlus = normalizedCounter[s] << maxBitsOut;
+ symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
+ symbolTT[s].deltaFindState = total - normalizedCounter[s];
+ total += normalizedCounter[s];
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*-**************************************************************
+* FSE NCount encoding-decoding
+****************************************************************/
+size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
+{
+ size_t const maxHeaderSize = (((maxSymbolValue + 1) * tableLog) >> 3) + 3;
+ return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */
+}
+
+static size_t FSE_writeNCount_generic(void *header, size_t headerBufferSize, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+ unsigned writeIsSafe)
+{
+ BYTE *const ostart = (BYTE *)header;
+ BYTE *out = ostart;
+ BYTE *const oend = ostart + headerBufferSize;
+ int nbBits;
+ const int tableSize = 1 << tableLog;
+ int remaining;
+ int threshold;
+ U32 bitStream;
+ int bitCount;
+ unsigned charnum = 0;
+ int previous0 = 0;
+
+ bitStream = 0;
+ bitCount = 0;
+ /* Table Size */
+ bitStream += (tableLog - FSE_MIN_TABLELOG) << bitCount;
+ bitCount += 4;
+
+ /* Init */
+ remaining = tableSize + 1; /* +1 for extra accuracy */
+ threshold = tableSize;
+ nbBits = tableLog + 1;
+
+ while (remaining > 1) { /* stops at 1 */
+ if (previous0) {
+ unsigned start = charnum;
+ while (!normalizedCounter[charnum])
+ charnum++;
+ while (charnum >= start + 24) {
+ start += 24;
+ bitStream += 0xFFFFU << bitCount;
+ if ((!writeIsSafe) && (out > oend - 2))
+ return ERROR(dstSize_tooSmall); /* Buffer overflow */
+ out[0] = (BYTE)bitStream;
+ out[1] = (BYTE)(bitStream >> 8);
+ out += 2;
+ bitStream >>= 16;
+ }
+ while (charnum >= start + 3) {
+ start += 3;
+ bitStream += 3 << bitCount;
+ bitCount += 2;
+ }
+ bitStream += (charnum - start) << bitCount;
+ bitCount += 2;
+ if (bitCount > 16) {
+ if ((!writeIsSafe) && (out > oend - 2))
+ return ERROR(dstSize_tooSmall); /* Buffer overflow */
+ out[0] = (BYTE)bitStream;
+ out[1] = (BYTE)(bitStream >> 8);
+ out += 2;
+ bitStream >>= 16;
+ bitCount -= 16;
+ }
+ }
+ {
+ int count = normalizedCounter[charnum++];
+ int const max = (2 * threshold - 1) - remaining;
+ remaining -= count < 0 ? -count : count;
+ count++; /* +1 for extra accuracy */
+ if (count >= threshold)
+ count += max; /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
+ bitStream += count << bitCount;
+ bitCount += nbBits;
+ bitCount -= (count < max);
+ previous0 = (count == 1);
+ if (remaining < 1)
+ return ERROR(GENERIC);
+ while (remaining < threshold)
+ nbBits--, threshold >>= 1;
+ }
+ if (bitCount > 16) {
+ if ((!writeIsSafe) && (out > oend - 2))
+ return ERROR(dstSize_tooSmall); /* Buffer overflow */
+ out[0] = (BYTE)bitStream;
+ out[1] = (BYTE)(bitStream >> 8);
+ out += 2;
+ bitStream >>= 16;
+ bitCount -= 16;
+ }
+ }
+
+ /* flush remaining bitStream */
+ if ((!writeIsSafe) && (out > oend - 2))
+ return ERROR(dstSize_tooSmall); /* Buffer overflow */
+ out[0] = (BYTE)bitStream;
+ out[1] = (BYTE)(bitStream >> 8);
+ out += (bitCount + 7) / 8;
+
+ if (charnum > maxSymbolValue + 1)
+ return ERROR(GENERIC);
+
+ return (out - ostart);
+}
+
+size_t FSE_writeNCount(void *buffer, size_t bufferSize, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+ if (tableLog > FSE_MAX_TABLELOG)
+ return ERROR(tableLog_tooLarge); /* Unsupported */
+ if (tableLog < FSE_MIN_TABLELOG)
+ return ERROR(GENERIC); /* Unsupported */
+
+ if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog))
+ return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
+
+ return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1);
+}
+
+/*-**************************************************************
+* Counting histogram
+****************************************************************/
+/*! FSE_count_simple
+ This function counts byte values within `src`, and store the histogram into table `count`.
+ It doesn't use any additional memory.
+ But this function is unsafe : it doesn't check that all values within `src` can fit into `count`.
+ For this reason, prefer using a table `count` with 256 elements.
+ @return : count of most numerous element
+*/
+size_t FSE_count_simple(unsigned *count, unsigned *maxSymbolValuePtr, const void *src, size_t srcSize)
+{
+ const BYTE *ip = (const BYTE *)src;
+ const BYTE *const end = ip + srcSize;
+ unsigned maxSymbolValue = *maxSymbolValuePtr;
+ unsigned max = 0;
+
+ memset(count, 0, (maxSymbolValue + 1) * sizeof(*count));
+ if (srcSize == 0) {
+ *maxSymbolValuePtr = 0;
+ return 0;
+ }
+
+ while (ip < end)
+ count[*ip++]++;
+
+ while (!count[maxSymbolValue])
+ maxSymbolValue--;
+ *maxSymbolValuePtr = maxSymbolValue;
+
+ {
+ U32 s;
+ for (s = 0; s <= maxSymbolValue; s++)
+ if (count[s] > max)
+ max = count[s];
+ }
+
+ return (size_t)max;
+}
+
+/* FSE_count_parallel_wksp() :
+ * Same as FSE_count_parallel(), but using an externally provided scratch buffer.
+ * `workSpace` size must be a minimum of `1024 * sizeof(unsigned)`` */
+static size_t FSE_count_parallel_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *source, size_t sourceSize, unsigned checkMax,
+ unsigned *const workSpace)
+{
+ const BYTE *ip = (const BYTE *)source;
+ const BYTE *const iend = ip + sourceSize;
+ unsigned maxSymbolValue = *maxSymbolValuePtr;
+ unsigned max = 0;
+ U32 *const Counting1 = workSpace;
+ U32 *const Counting2 = Counting1 + 256;
+ U32 *const Counting3 = Counting2 + 256;
+ U32 *const Counting4 = Counting3 + 256;
+
+ memset(Counting1, 0, 4 * 256 * sizeof(unsigned));
+
+ /* safety checks */
+ if (!sourceSize) {
+ memset(count, 0, maxSymbolValue + 1);
+ *maxSymbolValuePtr = 0;
+ return 0;
+ }
+ if (!maxSymbolValue)
+ maxSymbolValue = 255; /* 0 == default */
+
+ /* by stripes of 16 bytes */
+ {
+ U32 cached = ZSTD_read32(ip);
+ ip += 4;
+ while (ip < iend - 15) {
+ U32 c = cached;
+ cached = ZSTD_read32(ip);
+ ip += 4;
+ Counting1[(BYTE)c]++;
+ Counting2[(BYTE)(c >> 8)]++;
+ Counting3[(BYTE)(c >> 16)]++;
+ Counting4[c >> 24]++;
+ c = cached;
+ cached = ZSTD_read32(ip);
+ ip += 4;
+ Counting1[(BYTE)c]++;
+ Counting2[(BYTE)(c >> 8)]++;
+ Counting3[(BYTE)(c >> 16)]++;
+ Counting4[c >> 24]++;
+ c = cached;
+ cached = ZSTD_read32(ip);
+ ip += 4;
+ Counting1[(BYTE)c]++;
+ Counting2[(BYTE)(c >> 8)]++;
+ Counting3[(BYTE)(c >> 16)]++;
+ Counting4[c >> 24]++;
+ c = cached;
+ cached = ZSTD_read32(ip);
+ ip += 4;
+ Counting1[(BYTE)c]++;
+ Counting2[(BYTE)(c >> 8)]++;
+ Counting3[(BYTE)(c >> 16)]++;
+ Counting4[c >> 24]++;
+ }
+ ip -= 4;
+ }
+
+ /* finish last symbols */
+ while (ip < iend)
+ Counting1[*ip++]++;
+
+ if (checkMax) { /* verify stats will fit into destination table */
+ U32 s;
+ for (s = 255; s > maxSymbolValue; s--) {
+ Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s];
+ if (Counting1[s])
+ return ERROR(maxSymbolValue_tooSmall);
+ }
+ }
+
+ {
+ U32 s;
+ for (s = 0; s <= maxSymbolValue; s++) {
+ count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s];
+ if (count[s] > max)
+ max = count[s];
+ }
+ }
+
+ while (!count[maxSymbolValue])
+ maxSymbolValue--;
+ *maxSymbolValuePtr = maxSymbolValue;
+ return (size_t)max;
+}
+
+/* FSE_countFast_wksp() :
+ * Same as FSE_countFast(), but using an externally provided scratch buffer.
+ * `workSpace` size must be table of >= `1024` unsigned */
+size_t FSE_countFast_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *source, size_t sourceSize, unsigned *workSpace)
+{
+ if (sourceSize < 1500)
+ return FSE_count_simple(count, maxSymbolValuePtr, source, sourceSize);
+ return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 0, workSpace);
+}
+
+/* FSE_count_wksp() :
+ * Same as FSE_count(), but using an externally provided scratch buffer.
+ * `workSpace` size must be table of >= `1024` unsigned */
+size_t FSE_count_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *source, size_t sourceSize, unsigned *workSpace)
+{
+ if (*maxSymbolValuePtr < 255)
+ return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 1, workSpace);
+ *maxSymbolValuePtr = 255;
+ return FSE_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace);
+}
+
+/*-**************************************************************
+* FSE Compression Code
+****************************************************************/
+/*! FSE_sizeof_CTable() :
+ FSE_CTable is a variable size structure which contains :
+ `U16 tableLog;`
+ `U16 maxSymbolValue;`
+ `U16 nextStateNumber[1 << tableLog];` // This size is variable
+ `FSE_symbolCompressionTransform symbolTT[maxSymbolValue+1];` // This size is variable
+Allocation is manual (C standard does not support variable-size structures).
+*/
+size_t FSE_sizeof_CTable(unsigned maxSymbolValue, unsigned tableLog)
+{
+ if (tableLog > FSE_MAX_TABLELOG)
+ return ERROR(tableLog_tooLarge);
+ return FSE_CTABLE_SIZE_U32(tableLog, maxSymbolValue) * sizeof(U32);
+}
+
+/* provides the minimum logSize to safely represent a distribution */
+static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
+{
+ U32 minBitsSrc = BIT_highbit32((U32)(srcSize - 1)) + 1;
+ U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2;
+ U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
+ return minBits;
+}
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
+{
+ U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus;
+ U32 tableLog = maxTableLog;
+ U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
+ if (tableLog == 0)
+ tableLog = FSE_DEFAULT_TABLELOG;
+ if (maxBitsSrc < tableLog)
+ tableLog = maxBitsSrc; /* Accuracy can be reduced */
+ if (minBits > tableLog)
+ tableLog = minBits; /* Need a minimum to safely represent all symbol values */
+ if (tableLog < FSE_MIN_TABLELOG)
+ tableLog = FSE_MIN_TABLELOG;
+ if (tableLog > FSE_MAX_TABLELOG)
+ tableLog = FSE_MAX_TABLELOG;
+ return tableLog;
+}
+
+unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 2);
+}
+
+/* Secondary normalization method.
+ To be used when primary method fails. */
+
+static size_t FSE_normalizeM2(short *norm, U32 tableLog, const unsigned *count, size_t total, U32 maxSymbolValue)
+{
+ short const NOT_YET_ASSIGNED = -2;
+ U32 s;
+ U32 distributed = 0;
+ U32 ToDistribute;
+
+ /* Init */
+ U32 const lowThreshold = (U32)(total >> tableLog);
+ U32 lowOne = (U32)((total * 3) >> (tableLog + 1));
+
+ for (s = 0; s <= maxSymbolValue; s++) {
+ if (count[s] == 0) {
+ norm[s] = 0;
+ continue;
+ }
+ if (count[s] <= lowThreshold) {
+ norm[s] = -1;
+ distributed++;
+ total -= count[s];
+ continue;
+ }
+ if (count[s] <= lowOne) {
+ norm[s] = 1;
+ distributed++;
+ total -= count[s];
+ continue;
+ }
+
+ norm[s] = NOT_YET_ASSIGNED;
+ }
+ ToDistribute = (1 << tableLog) - distributed;
+
+ if ((total / ToDistribute) > lowOne) {
+ /* risk of rounding to zero */
+ lowOne = (U32)((total * 3) / (ToDistribute * 2));
+ for (s = 0; s <= maxSymbolValue; s++) {
+ if ((norm[s] == NOT_YET_ASSIGNED) && (count[s] <= lowOne)) {
+ norm[s] = 1;
+ distributed++;
+ total -= count[s];
+ continue;
+ }
+ }
+ ToDistribute = (1 << tableLog) - distributed;
+ }
+
+ if (distributed == maxSymbolValue + 1) {
+ /* all values are pretty poor;
+ probably incompressible data (should have already been detected);
+ find max, then give all remaining points to max */
+ U32 maxV = 0, maxC = 0;
+ for (s = 0; s <= maxSymbolValue; s++)
+ if (count[s] > maxC)
+ maxV = s, maxC = count[s];
+ norm[maxV] += (short)ToDistribute;
+ return 0;
+ }
+
+ if (total == 0) {
+ /* all of the symbols were low enough for the lowOne or lowThreshold */
+ for (s = 0; ToDistribute > 0; s = (s + 1) % (maxSymbolValue + 1))
+ if (norm[s] > 0)
+ ToDistribute--, norm[s]++;
+ return 0;
+ }
+
+ {
+ U64 const vStepLog = 62 - tableLog;
+ U64 const mid = (1ULL << (vStepLog - 1)) - 1;
+ U64 const rStep = div_u64((((U64)1 << vStepLog) * ToDistribute) + mid, (U32)total); /* scale on remaining */
+ U64 tmpTotal = mid;
+ for (s = 0; s <= maxSymbolValue; s++) {
+ if (norm[s] == NOT_YET_ASSIGNED) {
+ U64 const end = tmpTotal + (count[s] * rStep);
+ U32 const sStart = (U32)(tmpTotal >> vStepLog);
+ U32 const sEnd = (U32)(end >> vStepLog);
+ U32 const weight = sEnd - sStart;
+ if (weight < 1)
+ return ERROR(GENERIC);
+ norm[s] = (short)weight;
+ tmpTotal = end;
+ }
+ }
+ }
+
+ return 0;
+}
+
+size_t FSE_normalizeCount(short *normalizedCounter, unsigned tableLog, const unsigned *count, size_t total, unsigned maxSymbolValue)
+{
+ /* Sanity checks */
+ if (tableLog == 0)
+ tableLog = FSE_DEFAULT_TABLELOG;
+ if (tableLog < FSE_MIN_TABLELOG)
+ return ERROR(GENERIC); /* Unsupported size */
+ if (tableLog > FSE_MAX_TABLELOG)
+ return ERROR(tableLog_tooLarge); /* Unsupported size */
+ if (tableLog < FSE_minTableLog(total, maxSymbolValue))
+ return ERROR(GENERIC); /* Too small tableLog, compression potentially impossible */
+
+ {
+ U32 const rtbTable[] = {0, 473195, 504333, 520860, 550000, 700000, 750000, 830000};
+ U64 const scale = 62 - tableLog;
+ U64 const step = div_u64((U64)1 << 62, (U32)total); /* <== here, one division ! */
+ U64 const vStep = 1ULL << (scale - 20);
+ int stillToDistribute = 1 << tableLog;
+ unsigned s;
+ unsigned largest = 0;
+ short largestP = 0;
+ U32 lowThreshold = (U32)(total >> tableLog);
+
+ for (s = 0; s <= maxSymbolValue; s++) {
+ if (count[s] == total)
+ return 0; /* rle special case */
+ if (count[s] == 0) {
+ normalizedCounter[s] = 0;
+ continue;
+ }
+ if (count[s] <= lowThreshold) {
+ normalizedCounter[s] = -1;
+ stillToDistribute--;
+ } else {
+ short proba = (short)((count[s] * step) >> scale);
+ if (proba < 8) {
+ U64 restToBeat = vStep * rtbTable[proba];
+ proba += (count[s] * step) - ((U64)proba << scale) > restToBeat;
+ }
+ if (proba > largestP)
+ largestP = proba, largest = s;
+ normalizedCounter[s] = proba;
+ stillToDistribute -= proba;
+ }
+ }
+ if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) {
+ /* corner case, need another normalization method */
+ size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue);
+ if (FSE_isError(errorCode))
+ return errorCode;
+ } else
+ normalizedCounter[largest] += (short)stillToDistribute;
+ }
+
+ return tableLog;
+}
+
+/* fake FSE_CTable, for raw (uncompressed) input */
+size_t FSE_buildCTable_raw(FSE_CTable *ct, unsigned nbBits)
+{
+ const unsigned tableSize = 1 << nbBits;
+ const unsigned tableMask = tableSize - 1;
+ const unsigned maxSymbolValue = tableMask;
+ void *const ptr = ct;
+ U16 *const tableU16 = ((U16 *)ptr) + 2;
+ void *const FSCT = ((U32 *)ptr) + 1 /* header */ + (tableSize >> 1); /* assumption : tableLog >= 1 */
+ FSE_symbolCompressionTransform *const symbolTT = (FSE_symbolCompressionTransform *)(FSCT);
+ unsigned s;
+
+ /* Sanity checks */
+ if (nbBits < 1)
+ return ERROR(GENERIC); /* min size */
+
+ /* header */
+ tableU16[-2] = (U16)nbBits;
+ tableU16[-1] = (U16)maxSymbolValue;
+
+ /* Build table */
+ for (s = 0; s < tableSize; s++)
+ tableU16[s] = (U16)(tableSize + s);
+
+ /* Build Symbol Transformation Table */
+ {
+ const U32 deltaNbBits = (nbBits << 16) - (1 << nbBits);
+ for (s = 0; s <= maxSymbolValue; s++) {
+ symbolTT[s].deltaNbBits = deltaNbBits;
+ symbolTT[s].deltaFindState = s - 1;
+ }
+ }
+
+ return 0;
+}
+
+/* fake FSE_CTable, for rle input (always same symbol) */
+size_t FSE_buildCTable_rle(FSE_CTable *ct, BYTE symbolValue)
+{
+ void *ptr = ct;
+ U16 *tableU16 = ((U16 *)ptr) + 2;
+ void *FSCTptr = (U32 *)ptr + 2;
+ FSE_symbolCompressionTransform *symbolTT = (FSE_symbolCompressionTransform *)FSCTptr;
+
+ /* header */
+ tableU16[-2] = (U16)0;
+ tableU16[-1] = (U16)symbolValue;
+
+ /* Build table */
+ tableU16[0] = 0;
+ tableU16[1] = 0; /* just in case */
+
+ /* Build Symbol Transformation Table */
+ symbolTT[symbolValue].deltaNbBits = 0;
+ symbolTT[symbolValue].deltaFindState = 0;
+
+ return 0;
+}
+
+static size_t FSE_compress_usingCTable_generic(void *dst, size_t dstSize, const void *src, size_t srcSize, const FSE_CTable *ct, const unsigned fast)
+{
+ const BYTE *const istart = (const BYTE *)src;
+ const BYTE *const iend = istart + srcSize;
+ const BYTE *ip = iend;
+
+ BIT_CStream_t bitC;
+ FSE_CState_t CState1, CState2;
+
+ /* init */
+ if (srcSize <= 2)
+ return 0;
+ {
+ size_t const initError = BIT_initCStream(&bitC, dst, dstSize);
+ if (FSE_isError(initError))
+ return 0; /* not enough space available to write a bitstream */
+ }
+
+#define FSE_FLUSHBITS(s) (fast ? BIT_flushBitsFast(s) : BIT_flushBits(s))
+
+ if (srcSize & 1) {
+ FSE_initCState2(&CState1, ct, *--ip);
+ FSE_initCState2(&CState2, ct, *--ip);
+ FSE_encodeSymbol(&bitC, &CState1, *--ip);
+ FSE_FLUSHBITS(&bitC);
+ } else {
+ FSE_initCState2(&CState2, ct, *--ip);
+ FSE_initCState2(&CState1, ct, *--ip);
+ }
+
+ /* join to mod 4 */
+ srcSize -= 2;
+ if ((sizeof(bitC.bitContainer) * 8 > FSE_MAX_TABLELOG * 4 + 7) && (srcSize & 2)) { /* test bit 2 */
+ FSE_encodeSymbol(&bitC, &CState2, *--ip);
+ FSE_encodeSymbol(&bitC, &CState1, *--ip);
+ FSE_FLUSHBITS(&bitC);
+ }
+
+ /* 2 or 4 encoding per loop */
+ while (ip > istart) {
+
+ FSE_encodeSymbol(&bitC, &CState2, *--ip);
+
+ if (sizeof(bitC.bitContainer) * 8 < FSE_MAX_TABLELOG * 2 + 7) /* this test must be static */
+ FSE_FLUSHBITS(&bitC);
+
+ FSE_encodeSymbol(&bitC, &CState1, *--ip);
+
+ if (sizeof(bitC.bitContainer) * 8 > FSE_MAX_TABLELOG * 4 + 7) { /* this test must be static */
+ FSE_encodeSymbol(&bitC, &CState2, *--ip);
+ FSE_encodeSymbol(&bitC, &CState1, *--ip);
+ }
+
+ FSE_FLUSHBITS(&bitC);
+ }
+
+ FSE_flushCState(&bitC, &CState2);
+ FSE_flushCState(&bitC, &CState1);
+ return BIT_closeCStream(&bitC);
+}
+
+size_t FSE_compress_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const FSE_CTable *ct)
+{
+ unsigned const fast = (dstSize >= FSE_BLOCKBOUND(srcSize));
+
+ if (fast)
+ return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 1);
+ else
+ return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 0);
+}
+
+size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
diff --git a/lib/zstd/fse_decompress.c b/lib/zstd/fse_decompress.c
new file mode 100644
index 000000000000..a84300e5a013
--- /dev/null
+++ b/lib/zstd/fse_decompress.c
@@ -0,0 +1,332 @@
+/*
+ * FSE : Finite State Entropy decoder
+ * Copyright (C) 2013-2015, Yann Collet.
+ *
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+
+/* **************************************************************
+* Compiler specifics
+****************************************************************/
+#define FORCE_INLINE static __always_inline
+
+/* **************************************************************
+* Includes
+****************************************************************/
+#include "bitstream.h"
+#include "fse.h"
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/string.h> /* memcpy, memset */
+
+/* **************************************************************
+* Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+#define FSE_STATIC_ASSERT(c) \
+ { \
+ enum { FSE_static_assert = 1 / (int)(!!(c)) }; \
+ } /* use only *after* variable declarations */
+
+/* check and forward error code */
+#define CHECK_F(f) \
+ { \
+ size_t const e = f; \
+ if (FSE_isError(e)) \
+ return e; \
+ }
+
+/* **************************************************************
+* Templates
+****************************************************************/
+/*
+ designed to be included
+ for type-specific functions (template emulation in C)
+ Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X, Y) X##Y
+#define FSE_FUNCTION_NAME(X, Y) FSE_CAT(X, Y)
+#define FSE_TYPE_NAME(X, Y) FSE_CAT(X, Y)
+
+/* Function templates */
+
+size_t FSE_buildDTable_wksp(FSE_DTable *dt, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workspace, size_t workspaceSize)
+{
+ void *const tdPtr = dt + 1; /* because *dt is unsigned, 32-bits aligned on 32-bits */
+ FSE_DECODE_TYPE *const tableDecode = (FSE_DECODE_TYPE *)(tdPtr);
+ U16 *symbolNext = (U16 *)workspace;
+
+ U32 const maxSV1 = maxSymbolValue + 1;
+ U32 const tableSize = 1 << tableLog;
+ U32 highThreshold = tableSize - 1;
+
+ /* Sanity Checks */
+ if (workspaceSize < sizeof(U16) * (FSE_MAX_SYMBOL_VALUE + 1))
+ return ERROR(tableLog_tooLarge);
+ if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE)
+ return ERROR(maxSymbolValue_tooLarge);
+ if (tableLog > FSE_MAX_TABLELOG)
+ return ERROR(tableLog_tooLarge);
+
+ /* Init, lay down lowprob symbols */
+ {
+ FSE_DTableHeader DTableH;
+ DTableH.tableLog = (U16)tableLog;
+ DTableH.fastMode = 1;
+ {
+ S16 const largeLimit = (S16)(1 << (tableLog - 1));
+ U32 s;
+ for (s = 0; s < maxSV1; s++) {
+ if (normalizedCounter[s] == -1) {
+ tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
+ symbolNext[s] = 1;
+ } else {
+ if (normalizedCounter[s] >= largeLimit)
+ DTableH.fastMode = 0;
+ symbolNext[s] = normalizedCounter[s];
+ }
+ }
+ }
+ memcpy(dt, &DTableH, sizeof(DTableH));
+ }
+
+ /* Spread symbols */
+ {
+ U32 const tableMask = tableSize - 1;
+ U32 const step = FSE_TABLESTEP(tableSize);
+ U32 s, position = 0;
+ for (s = 0; s < maxSV1; s++) {
+ int i;
+ for (i = 0; i < normalizedCounter[s]; i++) {
+ tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+ position = (position + step) & tableMask;
+ while (position > highThreshold)
+ position = (position + step) & tableMask; /* lowprob area */
+ }
+ }
+ if (position != 0)
+ return ERROR(GENERIC); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+ }
+
+ /* Build Decoding table */
+ {
+ U32 u;
+ for (u = 0; u < tableSize; u++) {
+ FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
+ U16 nextState = symbolNext[symbol]++;
+ tableDecode[u].nbBits = (BYTE)(tableLog - BIT_highbit32((U32)nextState));
+ tableDecode[u].newState = (U16)((nextState << tableDecode[u].nbBits) - tableSize);
+ }
+ }
+
+ return 0;
+}
+
+/*-*******************************************************
+* Decompression (Byte symbols)
+*********************************************************/
+size_t FSE_buildDTable_rle(FSE_DTable *dt, BYTE symbolValue)
+{
+ void *ptr = dt;
+ FSE_DTableHeader *const DTableH = (FSE_DTableHeader *)ptr;
+ void *dPtr = dt + 1;
+ FSE_decode_t *const cell = (FSE_decode_t *)dPtr;
+
+ DTableH->tableLog = 0;
+ DTableH->fastMode = 0;
+
+ cell->newState = 0;
+ cell->symbol = symbolValue;
+ cell->nbBits = 0;
+
+ return 0;
+}
+
+size_t FSE_buildDTable_raw(FSE_DTable *dt, unsigned nbBits)
+{
+ void *ptr = dt;
+ FSE_DTableHeader *const DTableH = (FSE_DTableHeader *)ptr;
+ void *dPtr = dt + 1;
+ FSE_decode_t *const dinfo = (FSE_decode_t *)dPtr;
+ const unsigned tableSize = 1 << nbBits;
+ const unsigned tableMask = tableSize - 1;
+ const unsigned maxSV1 = tableMask + 1;
+ unsigned s;
+
+ /* Sanity checks */
+ if (nbBits < 1)
+ return ERROR(GENERIC); /* min size */
+
+ /* Build Decoding Table */
+ DTableH->tableLog = (U16)nbBits;
+ DTableH->fastMode = 1;
+ for (s = 0; s < maxSV1; s++) {
+ dinfo[s].newState = 0;
+ dinfo[s].symbol = (BYTE)s;
+ dinfo[s].nbBits = (BYTE)nbBits;
+ }
+
+ return 0;
+}
+
+FORCE_INLINE size_t FSE_decompress_usingDTable_generic(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const FSE_DTable *dt,
+ const unsigned fast)
+{
+ BYTE *const ostart = (BYTE *)dst;
+ BYTE *op = ostart;
+ BYTE *const omax = op + maxDstSize;
+ BYTE *const olimit = omax - 3;
+
+ BIT_DStream_t bitD;
+ FSE_DState_t state1;
+ FSE_DState_t state2;
+
+ /* Init */
+ CHECK_F(BIT_initDStream(&bitD, cSrc, cSrcSize));
+
+ FSE_initDState(&state1, &bitD, dt);
+ FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+ /* 4 symbols per loop */
+ for (; (BIT_reloadDStream(&bitD) == BIT_DStream_unfinished) & (op < olimit); op += 4) {
+ op[0] = FSE_GETSYMBOL(&state1);
+
+ if (FSE_MAX_TABLELOG * 2 + 7 > sizeof(bitD.bitContainer) * 8) /* This test must be static */
+ BIT_reloadDStream(&bitD);
+
+ op[1] = FSE_GETSYMBOL(&state2);
+
+ if (FSE_MAX_TABLELOG * 4 + 7 > sizeof(bitD.bitContainer) * 8) /* This test must be static */
+ {
+ if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) {
+ op += 2;
+ break;
+ }
+ }
+
+ op[2] = FSE_GETSYMBOL(&state1);
+
+ if (FSE_MAX_TABLELOG * 2 + 7 > sizeof(bitD.bitContainer) * 8) /* This test must be static */
+ BIT_reloadDStream(&bitD);
+
+ op[3] = FSE_GETSYMBOL(&state2);
+ }
+
+ /* tail */
+ /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
+ while (1) {
+ if (op > (omax - 2))
+ return ERROR(dstSize_tooSmall);
+ *op++ = FSE_GETSYMBOL(&state1);
+ if (BIT_reloadDStream(&bitD) == BIT_DStream_overflow) {
+ *op++ = FSE_GETSYMBOL(&state2);
+ break;
+ }
+
+ if (op > (omax - 2))
+ return ERROR(dstSize_tooSmall);
+ *op++ = FSE_GETSYMBOL(&state2);
+ if (BIT_reloadDStream(&bitD) == BIT_DStream_overflow) {
+ *op++ = FSE_GETSYMBOL(&state1);
+ break;
+ }
+ }
+
+ return op - ostart;
+}
+
+size_t FSE_decompress_usingDTable(void *dst, size_t originalSize, const void *cSrc, size_t cSrcSize, const FSE_DTable *dt)
+{
+ const void *ptr = dt;
+ const FSE_DTableHeader *DTableH = (const FSE_DTableHeader *)ptr;
+ const U32 fastMode = DTableH->fastMode;
+
+ /* select fast mode (static) */
+ if (fastMode)
+ return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+ return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+size_t FSE_decompress_wksp(void *dst, size_t dstCapacity, const void *cSrc, size_t cSrcSize, unsigned maxLog, void *workspace, size_t workspaceSize)
+{
+ const BYTE *const istart = (const BYTE *)cSrc;
+ const BYTE *ip = istart;
+ unsigned tableLog;
+ unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+ size_t NCountLength;
+
+ FSE_DTable *dt;
+ short *counting;
+ size_t spaceUsed32 = 0;
+
+ FSE_STATIC_ASSERT(sizeof(FSE_DTable) == sizeof(U32));
+
+ dt = (FSE_DTable *)((U32 *)workspace + spaceUsed32);
+ spaceUsed32 += FSE_DTABLE_SIZE_U32(maxLog);
+ counting = (short *)((U32 *)workspace + spaceUsed32);
+ spaceUsed32 += ALIGN(sizeof(short) * (FSE_MAX_SYMBOL_VALUE + 1), sizeof(U32)) >> 2;
+
+ if ((spaceUsed32 << 2) > workspaceSize)
+ return ERROR(tableLog_tooLarge);
+ workspace = (U32 *)workspace + spaceUsed32;
+ workspaceSize -= (spaceUsed32 << 2);
+
+ /* normal FSE decoding mode */
+ NCountLength = FSE_readNCount(counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+ if (FSE_isError(NCountLength))
+ return NCountLength;
+ // if (NCountLength >= cSrcSize) return ERROR(srcSize_wrong); /* too small input size; supposed to be already checked in NCountLength, only remaining
+ // case : NCountLength==cSrcSize */
+ if (tableLog > maxLog)
+ return ERROR(tableLog_tooLarge);
+ ip += NCountLength;
+ cSrcSize -= NCountLength;
+
+ CHECK_F(FSE_buildDTable_wksp(dt, counting, maxSymbolValue, tableLog, workspace, workspaceSize));
+
+ return FSE_decompress_usingDTable(dst, dstCapacity, ip, cSrcSize, dt); /* always return, even if it is an error code */
+}
diff --git a/lib/zstd/huf.h b/lib/zstd/huf.h
new file mode 100644
index 000000000000..2143da28d952
--- /dev/null
+++ b/lib/zstd/huf.h
@@ -0,0 +1,212 @@
+/*
+ * Huffman coder, part of New Generation Entropy library
+ * header file
+ * Copyright (C) 2013-2016, Yann Collet.
+ *
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+#ifndef HUF_H_298734234
+#define HUF_H_298734234
+
+/* *** Dependencies *** */
+#include <linux/types.h> /* size_t */
+
+/* *** Tool functions *** */
+#define HUF_BLOCKSIZE_MAX (128 * 1024) /**< maximum input size for a single block compressed with HUF_compress */
+size_t HUF_compressBound(size_t size); /**< maximum compressed size (worst case) */
+
+/* Error Management */
+unsigned HUF_isError(size_t code); /**< tells if a return value is an error code */
+
+/* *** Advanced function *** */
+
+/** HUF_compress4X_wksp() :
+* Same as HUF_compress2(), but uses externally allocated `workSpace`, which must be a table of >= 1024 unsigned */
+size_t HUF_compress4X_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
+ size_t wkspSize); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
+
+/* *** Dependencies *** */
+#include "mem.h" /* U32 */
+
+/* *** Constants *** */
+#define HUF_TABLELOG_MAX 12 /* max configured tableLog (for static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */
+#define HUF_TABLELOG_DEFAULT 11 /* tableLog by default, when not specified */
+#define HUF_SYMBOLVALUE_MAX 255
+
+#define HUF_TABLELOG_ABSOLUTEMAX 15 /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
+#if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX)
+#error "HUF_TABLELOG_MAX is too large !"
+#endif
+
+/* ****************************************
+* Static allocation
+******************************************/
+/* HUF buffer bounds */
+#define HUF_CTABLEBOUND 129
+#define HUF_BLOCKBOUND(size) (size + (size >> 8) + 8) /* only true if incompressible pre-filtered with fast heuristic */
+#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size)) /* Macro version, useful for static allocation */
+
+/* static allocation of HUF's Compression Table */
+#define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \
+ U32 name##hb[maxSymbolValue + 1]; \
+ void *name##hv = &(name##hb); \
+ HUF_CElt *name = (HUF_CElt *)(name##hv) /* no final ; */
+
+/* static allocation of HUF's DTable */
+typedef U32 HUF_DTable;
+#define HUF_DTABLE_SIZE(maxTableLog) (1 + (1 << (maxTableLog)))
+#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) HUF_DTable DTable[HUF_DTABLE_SIZE((maxTableLog)-1)] = {((U32)((maxTableLog)-1) * 0x01000001)}
+#define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = {((U32)(maxTableLog)*0x01000001)}
+
+/* The workspace must have alignment at least 4 and be at least this large */
+#define HUF_COMPRESS_WORKSPACE_SIZE (6 << 10)
+#define HUF_COMPRESS_WORKSPACE_SIZE_U32 (HUF_COMPRESS_WORKSPACE_SIZE / sizeof(U32))
+
+/* The workspace must have alignment at least 4 and be at least this large */
+#define HUF_DECOMPRESS_WORKSPACE_SIZE (3 << 10)
+#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
+
+/* ****************************************
+* Advanced decompression functions
+******************************************/
+size_t HUF_decompress4X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize); /**< decodes RLE and uncompressed */
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+ size_t workspaceSize); /**< considers RLE and uncompressed as errors */
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+ size_t workspaceSize); /**< single-symbol decoder */
+size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+ size_t workspaceSize); /**< double-symbols decoder */
+
+/* ****************************************
+* HUF detailed API
+******************************************/
+/*!
+HUF_compress() does the following:
+1. count symbol occurrence from source[] into table count[] using FSE_count()
+2. (optional) refine tableLog using HUF_optimalTableLog()
+3. build Huffman table from count using HUF_buildCTable()
+4. save Huffman table to memory buffer using HUF_writeCTable_wksp()
+5. encode the data stream using HUF_compress4X_usingCTable()
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and regenerate 'CTable' using external methods.
+*/
+/* FSE_count() : find it within "fse.h" */
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+typedef struct HUF_CElt_s HUF_CElt; /* incomplete type */
+size_t HUF_writeCTable_wksp(void *dst, size_t maxDstSize, const HUF_CElt *CTable, unsigned maxSymbolValue, unsigned huffLog, void *workspace, size_t workspaceSize);
+size_t HUF_compress4X_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const HUF_CElt *CTable);
+
+typedef enum {
+ HUF_repeat_none, /**< Cannot use the previous table */
+ HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1,
+ 4}X_repeat */
+ HUF_repeat_valid /**< Can use the previous table and it is asumed to be valid */
+} HUF_repeat;
+/** HUF_compress4X_repeat() :
+* Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+* If it uses hufTable it does not modify hufTable or repeat.
+* If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+* If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress4X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
+ size_t wkspSize, HUF_CElt *hufTable, HUF_repeat *repeat,
+ int preferRepeat); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
+
+/** HUF_buildCTable_wksp() :
+ * Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as a table of 1024 unsigned.
+ */
+size_t HUF_buildCTable_wksp(HUF_CElt *tree, const U32 *count, U32 maxSymbolValue, U32 maxNbBits, void *workSpace, size_t wkspSize);
+
+/*! HUF_readStats() :
+ Read compact Huffman tree, saved by HUF_writeCTable().
+ `huffWeight` is destination buffer.
+ @return : size read from `src` , or an error Code .
+ Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */
+size_t HUF_readStats_wksp(BYTE *huffWeight, size_t hwSize, U32 *rankStats, U32 *nbSymbolsPtr, U32 *tableLogPtr, const void *src, size_t srcSize,
+ void *workspace, size_t workspaceSize);
+
+/** HUF_readCTable() :
+* Loading a CTable saved with HUF_writeCTable() */
+size_t HUF_readCTable_wksp(HUF_CElt *CTable, unsigned maxSymbolValue, const void *src, size_t srcSize, void *workspace, size_t workspaceSize);
+
+/*
+HUF_decompress() does the following:
+1. select the decompression algorithm (X2, X4) based on pre-computed heuristics
+2. build Huffman table from save, using HUF_readDTableXn()
+3. decode 1 or 4 segments in parallel using HUF_decompressSXn_usingDTable
+*/
+
+/** HUF_selectDecoder() :
+* Tells which decoder is likely to decode faster,
+* based on a set of pre-determined metrics.
+* @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
+* Assumption : 0 < cSrcSize < dstSize <= 128 KB */
+U32 HUF_selectDecoder(size_t dstSize, size_t cSrcSize);
+
+size_t HUF_readDTableX2_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize);
+size_t HUF_readDTableX4_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize);
+
+size_t HUF_decompress4X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+size_t HUF_decompress4X2_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+size_t HUF_decompress4X4_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+
+/* single stream variants */
+
+size_t HUF_compress1X_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
+ size_t wkspSize); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
+size_t HUF_compress1X_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const HUF_CElt *CTable);
+/** HUF_compress1X_repeat() :
+* Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+* If it uses hufTable it does not modify hufTable or repeat.
+* If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+* If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress1X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
+ size_t wkspSize, HUF_CElt *hufTable, HUF_repeat *repeat,
+ int preferRepeat); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
+
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize);
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+ size_t workspaceSize); /**< single-symbol decoder */
+size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+ size_t workspaceSize); /**< double-symbols decoder */
+
+size_t HUF_decompress1X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize,
+ const HUF_DTable *DTable); /**< automatic selection of sing or double symbol decoder, based on DTable */
+size_t HUF_decompress1X2_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+size_t HUF_decompress1X4_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+
+#endif /* HUF_H_298734234 */
diff --git a/lib/zstd/huf_compress.c b/lib/zstd/huf_compress.c
new file mode 100644
index 000000000000..40055a7016e6
--- /dev/null
+++ b/lib/zstd/huf_compress.c
@@ -0,0 +1,770 @@
+/*
+ * Huffman encoder, part of New Generation Entropy library
+ * Copyright (C) 2013-2016, Yann Collet.
+ *
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+
+/* **************************************************************
+* Includes
+****************************************************************/
+#include "bitstream.h"
+#include "fse.h" /* header compression */
+#include "huf.h"
+#include <linux/kernel.h>
+#include <linux/string.h> /* memcpy, memset */
+
+/* **************************************************************
+* Error Management
+****************************************************************/
+#define HUF_STATIC_ASSERT(c) \
+ { \
+ enum { HUF_static_assert = 1 / (int)(!!(c)) }; \
+ } /* use only *after* variable declarations */
+#define CHECK_V_F(e, f) \
+ size_t const e = f; \
+ if (ERR_isError(e)) \
+ return f
+#define CHECK_F(f) \
+ { \
+ CHECK_V_F(_var_err__, f); \
+ }
+
+/* **************************************************************
+* Utils
+****************************************************************/
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
+}
+
+/* *******************************************************
+* HUF : Huffman block compression
+*********************************************************/
+/* HUF_compressWeights() :
+ * Same as FSE_compress(), but dedicated to huff0's weights compression.
+ * The use case needs much less stack memory.
+ * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX.
+ */
+#define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6
+size_t HUF_compressWeights_wksp(void *dst, size_t dstSize, const void *weightTable, size_t wtSize, void *workspace, size_t workspaceSize)
+{
+ BYTE *const ostart = (BYTE *)dst;
+ BYTE *op = ostart;
+ BYTE *const oend = ostart + dstSize;
+
+ U32 maxSymbolValue = HUF_TABLELOG_MAX;
+ U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER;
+
+ FSE_CTable *CTable;
+ U32 *count;
+ S16 *norm;
+ size_t spaceUsed32 = 0;
+
+ HUF_STATIC_ASSERT(sizeof(FSE_CTable) == sizeof(U32));
+
+ CTable = (FSE_CTable *)((U32 *)workspace + spaceUsed32);
+ spaceUsed32 += FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX);
+ count = (U32 *)workspace + spaceUsed32;
+ spaceUsed32 += HUF_TABLELOG_MAX + 1;
+ norm = (S16 *)((U32 *)workspace + spaceUsed32);
+ spaceUsed32 += ALIGN(sizeof(S16) * (HUF_TABLELOG_MAX + 1), sizeof(U32)) >> 2;
+
+ if ((spaceUsed32 << 2) > workspaceSize)
+ return ERROR(tableLog_tooLarge);
+ workspace = (U32 *)workspace + spaceUsed32;
+ workspaceSize -= (spaceUsed32 << 2);
+
+ /* init conditions */
+ if (wtSize <= 1)
+ return 0; /* Not compressible */
+
+ /* Scan input and build symbol stats */
+ {
+ CHECK_V_F(maxCount, FSE_count_simple(count, &maxSymbolValue, weightTable, wtSize));
+ if (maxCount == wtSize)
+ return 1; /* only a single symbol in src : rle */
+ if (maxCount == 1)
+ return 0; /* each symbol present maximum once => not compressible */
+ }
+
+ tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue);
+ CHECK_F(FSE_normalizeCount(norm, tableLog, count, wtSize, maxSymbolValue));
+
+ /* Write table description header */
+ {
+ CHECK_V_F(hSize, FSE_writeNCount(op, oend - op, norm, maxSymbolValue, tableLog));
+ op += hSize;
+ }
+
+ /* Compress */
+ CHECK_F(FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, workspace, workspaceSize));
+ {
+ CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, weightTable, wtSize, CTable));
+ if (cSize == 0)
+ return 0; /* not enough space for compressed data */
+ op += cSize;
+ }
+
+ return op - ostart;
+}
+
+struct HUF_CElt_s {
+ U16 val;
+ BYTE nbBits;
+}; /* typedef'd to HUF_CElt within "huf.h" */
+
+/*! HUF_writeCTable_wksp() :
+ `CTable` : Huffman tree to save, using huf representation.
+ @return : size of saved CTable */
+size_t HUF_writeCTable_wksp(void *dst, size_t maxDstSize, const HUF_CElt *CTable, U32 maxSymbolValue, U32 huffLog, void *workspace, size_t workspaceSize)
+{
+ BYTE *op = (BYTE *)dst;
+ U32 n;
+
+ BYTE *bitsToWeight;
+ BYTE *huffWeight;
+ size_t spaceUsed32 = 0;
+
+ bitsToWeight = (BYTE *)((U32 *)workspace + spaceUsed32);
+ spaceUsed32 += ALIGN(HUF_TABLELOG_MAX + 1, sizeof(U32)) >> 2;
+ huffWeight = (BYTE *)((U32 *)workspace + spaceUsed32);
+ spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX, sizeof(U32)) >> 2;
+
+ if ((spaceUsed32 << 2) > workspaceSize)
+ return ERROR(tableLog_tooLarge);
+ workspace = (U32 *)workspace + spaceUsed32;
+ workspaceSize -= (spaceUsed32 << 2);
+
+ /* check conditions */
+ if (maxSymbolValue > HUF_SYMBOLVALUE_MAX)
+ return ERROR(maxSymbolValue_tooLarge);
+
+ /* convert to weight */
+ bitsToWeight[0] = 0;
+ for (n = 1; n < huffLog + 1; n++)
+ bitsToWeight[n] = (BYTE)(huffLog + 1 - n);
+ for (n = 0; n < maxSymbolValue; n++)
+ huffWeight[n] = bitsToWeight[CTable[n].nbBits];
+
+ /* attempt weights compression by FSE */
+ {
+ CHECK_V_F(hSize, HUF_compressWeights_wksp(op + 1, maxDstSize - 1, huffWeight, maxSymbolValue, workspace, workspaceSize));
+ if ((hSize > 1) & (hSize < maxSymbolValue / 2)) { /* FSE compressed */
+ op[0] = (BYTE)hSize;
+ return hSize + 1;
+ }
+ }
+
+ /* write raw values as 4-bits (max : 15) */
+ if (maxSymbolValue > (256 - 128))
+ return ERROR(GENERIC); /* should not happen : likely means source cannot be compressed */
+ if (((maxSymbolValue + 1) / 2) + 1 > maxDstSize)
+ return ERROR(dstSize_tooSmall); /* not enough space within dst buffer */
+ op[0] = (BYTE)(128 /*special case*/ + (maxSymbolValue - 1));
+ huffWeight[maxSymbolValue] = 0; /* to be sure it doesn't cause msan issue in final combination */
+ for (n = 0; n < maxSymbolValue; n += 2)
+ op[(n / 2) + 1] = (BYTE)((huffWeight[n] << 4) + huffWeight[n + 1]);
+ return ((maxSymbolValue + 1) / 2) + 1;
+}
+
+size_t HUF_readCTable_wksp(HUF_CElt *CTable, U32 maxSymbolValue, const void *src, size_t srcSize, void *workspace, size_t workspaceSize)
+{
+ U32 *rankVal;
+ BYTE *huffWeight;
+ U32 tableLog = 0;
+ U32 nbSymbols = 0;
+ size_t readSize;
+ size_t spaceUsed32 = 0;
+
+ rankVal = (U32 *)workspace + spaceUsed32;
+ spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1;
+ huffWeight = (BYTE *)((U32 *)workspace + spaceUsed32);
+ spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+ if ((spaceUsed32 << 2) > workspaceSize)
+ return ERROR(tableLog_tooLarge);
+ workspace = (U32 *)workspace + spaceUsed32;
+ workspaceSize -= (spaceUsed32 << 2);
+
+ /* get symbol weights */
+ readSize = HUF_readStats_wksp(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize, workspace, workspaceSize);
+ if (ERR_isError(readSize))
+ return readSize;
+
+ /* check result */
+ if (tableLog > HUF_TABLELOG_MAX)
+ return ERROR(tableLog_tooLarge);
+ if (nbSymbols > maxSymbolValue + 1)
+ return ERROR(maxSymbolValue_tooSmall);
+
+ /* Prepare base value per rank */
+ {
+ U32 n, nextRankStart = 0;
+ for (n = 1; n <= tableLog; n++) {
+ U32 curr = nextRankStart;
+ nextRankStart += (rankVal[n] << (n - 1));
+ rankVal[n] = curr;
+ }
+ }
+
+ /* fill nbBits */
+ {
+ U32 n;
+ for (n = 0; n < nbSymbols; n++) {
+ const U32 w = huffWeight[n];
+ CTable[n].nbBits = (BYTE)(tableLog + 1 - w);
+ }
+ }
+
+ /* fill val */
+ {
+ U16 nbPerRank[HUF_TABLELOG_MAX + 2] = {0}; /* support w=0=>n=tableLog+1 */
+ U16 valPerRank[HUF_TABLELOG_MAX + 2] = {0};
+ {
+ U32 n;
+ for (n = 0; n < nbSymbols; n++)
+ nbPerRank[CTable[n].nbBits]++;
+ }
+ /* determine stating value per rank */
+ valPerRank[tableLog + 1] = 0; /* for w==0 */
+ {
+ U16 min = 0;
+ U32 n;
+ for (n = tableLog; n > 0; n--) { /* start at n=tablelog <-> w=1 */
+ valPerRank[n] = min; /* get starting value within each rank */
+ min += nbPerRank[n];
+ min >>= 1;
+ }
+ }
+ /* assign value within rank, symbol order */
+ {
+ U32 n;
+ for (n = 0; n <= maxSymbolValue; n++)
+ CTable[n].val = valPerRank[CTable[n].nbBits]++;
+ }
+ }
+
+ return readSize;
+}
+
+typedef struct nodeElt_s {
+ U32 count;
+ U16 parent;
+ BYTE byte;
+ BYTE nbBits;
+} nodeElt;
+
+static U32 HUF_setMaxHeight(nodeElt *huffNode, U32 lastNonNull, U32 maxNbBits)
+{
+ const U32 largestBits = huffNode[lastNonNull].nbBits;
+ if (largestBits <= maxNbBits)
+ return largestBits; /* early exit : no elt > maxNbBits */
+
+ /* there are several too large elements (at least >= 2) */
+ {
+ int totalCost = 0;
+ const U32 baseCost = 1 << (largestBits - maxNbBits);
+ U32 n = lastNonNull;
+
+ while (huffNode[n].nbBits > maxNbBits) {
+ totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits));
+ huffNode[n].nbBits = (BYTE)maxNbBits;
+ n--;
+ } /* n stops at huffNode[n].nbBits <= maxNbBits */
+ while (huffNode[n].nbBits == maxNbBits)
+ n--; /* n end at index of smallest symbol using < maxNbBits */
+
+ /* renorm totalCost */
+ totalCost >>= (largestBits - maxNbBits); /* note : totalCost is necessarily a multiple of baseCost */
+
+ /* repay normalized cost */
+ {
+ U32 const noSymbol = 0xF0F0F0F0;
+ U32 rankLast[HUF_TABLELOG_MAX + 2];
+ int pos;
+
+ /* Get pos of last (smallest) symbol per rank */
+ memset(rankLast, 0xF0, sizeof(rankLast));
+ {
+ U32 currNbBits = maxNbBits;
+ for (pos = n; pos >= 0; pos--) {
+ if (huffNode[pos].nbBits >= currNbBits)
+ continue;
+ currNbBits = huffNode[pos].nbBits; /* < maxNbBits */
+ rankLast[maxNbBits - currNbBits] = pos;
+ }
+ }
+
+ while (totalCost > 0) {
+ U32 nBitsToDecrease = BIT_highbit32(totalCost) + 1;
+ for (; nBitsToDecrease > 1; nBitsToDecrease--) {
+ U32 highPos = rankLast[nBitsToDecrease];
+ U32 lowPos = rankLast[nBitsToDecrease - 1];
+ if (highPos == noSymbol)
+ continue;
+ if (lowPos == noSymbol)
+ break;
+ {
+ U32 const highTotal = huffNode[highPos].count;
+ U32 const lowTotal = 2 * huffNode[lowPos].count;
+ if (highTotal <= lowTotal)
+ break;
+ }
+ }
+ /* only triggered when no more rank 1 symbol left => find closest one (note : there is necessarily at least one !) */
+ /* HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary */
+ while ((nBitsToDecrease <= HUF_TABLELOG_MAX) && (rankLast[nBitsToDecrease] == noSymbol))
+ nBitsToDecrease++;
+ totalCost -= 1 << (nBitsToDecrease - 1);
+ if (rankLast[nBitsToDecrease - 1] == noSymbol)
+ rankLast[nBitsToDecrease - 1] = rankLast[nBitsToDecrease]; /* this rank is no longer empty */
+ huffNode[rankLast[nBitsToDecrease]].nbBits++;
+ if (rankLast[nBitsToDecrease] == 0) /* special case, reached largest symbol */
+ rankLast[nBitsToDecrease] = noSymbol;
+ else {
+ rankLast[nBitsToDecrease]--;
+ if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits - nBitsToDecrease)
+ rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */
+ }
+ } /* while (totalCost > 0) */
+
+ while (totalCost < 0) { /* Sometimes, cost correction overshoot */
+ if (rankLast[1] == noSymbol) { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0
+ (using maxNbBits) */
+ while (huffNode[n].nbBits == maxNbBits)
+ n--;
+ huffNode[n + 1].nbBits--;
+ rankLast[1] = n + 1;
+ totalCost++;
+ continue;
+ }
+ huffNode[rankLast[1] + 1].nbBits--;
+ rankLast[1]++;
+ totalCost++;
+ }
+ }
+ } /* there are several too large elements (at least >= 2) */
+
+ return maxNbBits;
+}
+
+typedef struct {
+ U32 base;
+ U32 curr;
+} rankPos;
+
+static void HUF_sort(nodeElt *huffNode, const U32 *count, U32 maxSymbolValue)
+{
+ rankPos rank[32];
+ U32 n;
+
+ memset(rank, 0, sizeof(rank));
+ for (n = 0; n <= maxSymbolValue; n++) {
+ U32 r = BIT_highbit32(count[n] + 1);
+ rank[r].base++;
+ }
+ for (n = 30; n > 0; n--)
+ rank[n - 1].base += rank[n].base;
+ for (n = 0; n < 32; n++)
+ rank[n].curr = rank[n].base;
+ for (n = 0; n <= maxSymbolValue; n++) {
+ U32 const c = count[n];
+ U32 const r = BIT_highbit32(c + 1) + 1;
+ U32 pos = rank[r].curr++;
+ while ((pos > rank[r].base) && (c > huffNode[pos - 1].count))
+ huffNode[pos] = huffNode[pos - 1], pos--;
+ huffNode[pos].count = c;
+ huffNode[pos].byte = (BYTE)n;
+ }
+}
+
+/** HUF_buildCTable_wksp() :
+ * Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as a table of 1024 unsigned.
+ */
+#define STARTNODE (HUF_SYMBOLVALUE_MAX + 1)
+typedef nodeElt huffNodeTable[2 * HUF_SYMBOLVALUE_MAX + 1 + 1];
+size_t HUF_buildCTable_wksp(HUF_CElt *tree, const U32 *count, U32 maxSymbolValue, U32 maxNbBits, void *workSpace, size_t wkspSize)
+{
+ nodeElt *const huffNode0 = (nodeElt *)workSpace;
+ nodeElt *const huffNode = huffNode0 + 1;
+ U32 n, nonNullRank;
+ int lowS, lowN;
+ U16 nodeNb = STARTNODE;
+ U32 nodeRoot;
+
+ /* safety checks */
+ if (wkspSize < sizeof(huffNodeTable))
+ return ERROR(GENERIC); /* workSpace is not large enough */
+ if (maxNbBits == 0)
+ maxNbBits = HUF_TABLELOG_DEFAULT;
+ if (maxSymbolValue > HUF_SYMBOLVALUE_MAX)
+ return ERROR(GENERIC);
+ memset(huffNode0, 0, sizeof(huffNodeTable));
+
+ /* sort, decreasing order */
+ HUF_sort(huffNode, count, maxSymbolValue);
+
+ /* init for parents */
+ nonNullRank = maxSymbolValue;
+ while (huffNode[nonNullRank].count == 0)
+ nonNullRank--;
+ lowS = nonNullRank;
+ nodeRoot = nodeNb + lowS - 1;
+ lowN = nodeNb;
+ huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS - 1].count;
+ huffNode[lowS].parent = huffNode[lowS - 1].parent = nodeNb;
+ nodeNb++;
+ lowS -= 2;
+ for (n = nodeNb; n <= nodeRoot; n++)
+ huffNode[n].count = (U32)(1U << 30);
+ huffNode0[0].count = (U32)(1U << 31); /* fake entry, strong barrier */
+
+ /* create parents */
+ while (nodeNb <= nodeRoot) {
+ U32 n1 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
+ U32 n2 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
+ huffNode[nodeNb].count = huffNode[n1].count + huffNode[n2].count;
+ huffNode[n1].parent = huffNode[n2].parent = nodeNb;
+ nodeNb++;
+ }
+
+ /* distribute weights (unlimited tree height) */
+ huffNode[nodeRoot].nbBits = 0;
+ for (n = nodeRoot - 1; n >= STARTNODE; n--)
+ huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1;
+ for (n = 0; n <= nonNullRank; n++)
+ huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1;
+
+ /* enforce maxTableLog */
+ maxNbBits = HUF_setMaxHeight(huffNode, nonNullRank, maxNbBits);
+
+ /* fill result into tree (val, nbBits) */
+ {
+ U16 nbPerRank[HUF_TABLELOG_MAX + 1] = {0};
+ U16 valPerRank[HUF_TABLELOG_MAX + 1] = {0};
+ if (maxNbBits > HUF_TABLELOG_MAX)
+ return ERROR(GENERIC); /* check fit into table */
+ for (n = 0; n <= nonNullRank; n++)
+ nbPerRank[huffNode[n].nbBits]++;
+ /* determine stating value per rank */
+ {
+ U16 min = 0;
+ for (n = maxNbBits; n > 0; n--) {
+ valPerRank[n] = min; /* get starting value within each rank */
+ min += nbPerRank[n];
+ min >>= 1;
+ }
+ }
+ for (n = 0; n <= maxSymbolValue; n++)
+ tree[huffNode[n].byte].nbBits = huffNode[n].nbBits; /* push nbBits per symbol, symbol order */
+ for (n = 0; n <= maxSymbolValue; n++)
+ tree[n].val = valPerRank[tree[n].nbBits]++; /* assign value within rank, symbol order */
+ }
+
+ return maxNbBits;
+}
+
+static size_t HUF_estimateCompressedSize(HUF_CElt *CTable, const unsigned *count, unsigned maxSymbolValue)
+{
+ size_t nbBits = 0;
+ int s;
+ for (s = 0; s <= (int)maxSymbolValue; ++s) {
+ nbBits += CTable[s].nbBits * count[s];
+ }
+ return nbBits >> 3;
+}
+
+static int HUF_validateCTable(const HUF_CElt *CTable, const unsigned *count, unsigned maxSymbolValue)
+{
+ int bad = 0;
+ int s;
+ for (s = 0; s <= (int)maxSymbolValue; ++s) {
+ bad |= (count[s] != 0) & (CTable[s].nbBits == 0);
+ }
+ return !bad;
+}
+
+static void HUF_encodeSymbol(BIT_CStream_t *bitCPtr, U32 symbol, const HUF_CElt *CTable)
+{
+ BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits);
+}
+
+size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }
+
+#define HUF_FLUSHBITS(s) BIT_flushBits(s)
+
+#define HUF_FLUSHBITS_1(stream) \
+ if (sizeof((stream)->bitContainer) * 8 < HUF_TABLELOG_MAX * 2 + 7) \
+ HUF_FLUSHBITS(stream)
+
+#define HUF_FLUSHBITS_2(stream) \
+ if (sizeof((stream)->bitContainer) * 8 < HUF_TABLELOG_MAX * 4 + 7) \
+ HUF_FLUSHBITS(stream)
+
+size_t HUF_compress1X_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const HUF_CElt *CTable)
+{
+ const BYTE *ip = (const BYTE *)src;
+ BYTE *const ostart = (BYTE *)dst;
+ BYTE *const oend = ostart + dstSize;
+ BYTE *op = ostart;
+ size_t n;
+ BIT_CStream_t bitC;
+
+ /* init */
+ if (dstSize < 8)
+ return 0; /* not enough space to compress */
+ {
+ size_t const initErr = BIT_initCStream(&bitC, op, oend - op);
+ if (HUF_isError(initErr))
+ return 0;
+ }
+
+ n = srcSize & ~3; /* join to mod 4 */
+ switch (srcSize & 3) {
+ case 3: HUF_encodeSymbol(&bitC, ip[n + 2], CTable); HUF_FLUSHBITS_2(&bitC);
+ case 2: HUF_encodeSymbol(&bitC, ip[n + 1], CTable); HUF_FLUSHBITS_1(&bitC);
+ case 1: HUF_encodeSymbol(&bitC, ip[n + 0], CTable); HUF_FLUSHBITS(&bitC);
+ case 0:
+ default:;
+ }
+
+ for (; n > 0; n -= 4) { /* note : n&3==0 at this stage */
+ HUF_encodeSymbol(&bitC, ip[n - 1], CTable);
+ HUF_FLUSHBITS_1(&bitC);
+ HUF_encodeSymbol(&bitC, ip[n - 2], CTable);
+ HUF_FLUSHBITS_2(&bitC);
+ HUF_encodeSymbol(&bitC, ip[n - 3], CTable);
+ HUF_FLUSHBITS_1(&bitC);
+ HUF_encodeSymbol(&bitC, ip[n - 4], CTable);
+ HUF_FLUSHBITS(&bitC);
+ }
+
+ return BIT_closeCStream(&bitC);
+}
+
+size_t HUF_compress4X_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const HUF_CElt *CTable)
+{
+ size_t const segmentSize = (srcSize + 3) / 4; /* first 3 segments */
+ const BYTE *ip = (const BYTE *)src;
+ const BYTE *const iend = ip + srcSize;
+ BYTE *const ostart = (BYTE *)dst;
+ BYTE *const oend = ostart + dstSize;
+ BYTE *op = ostart;
+
+ if (dstSize < 6 + 1 + 1 + 1 + 8)
+ return 0; /* minimum space to compress successfully */
+ if (srcSize < 12)
+ return 0; /* no saving possible : too small input */
+ op += 6; /* jumpTable */
+
+ {
+ CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend - op, ip, segmentSize, CTable));
+ if (cSize == 0)
+ return 0;
+ ZSTD_writeLE16(ostart, (U16)cSize);
+ op += cSize;
+ }
+
+ ip += segmentSize;
+ {
+ CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend - op, ip, segmentSize, CTable));
+ if (cSize == 0)
+ return 0;
+ ZSTD_writeLE16(ostart + 2, (U16)cSize);
+ op += cSize;
+ }
+
+ ip += segmentSize;
+ {
+ CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend - op, ip, segmentSize, CTable));
+ if (cSize == 0)
+ return 0;
+ ZSTD_writeLE16(ostart + 4, (U16)cSize);
+ op += cSize;
+ }
+
+ ip += segmentSize;
+ {
+ CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend - op, ip, iend - ip, CTable));
+ if (cSize == 0)
+ return 0;
+ op += cSize;
+ }
+
+ return op - ostart;
+}
+
+static size_t HUF_compressCTable_internal(BYTE *const ostart, BYTE *op, BYTE *const oend, const void *src, size_t srcSize, unsigned singleStream,
+ const HUF_CElt *CTable)
+{
+ size_t const cSize =
+ singleStream ? HUF_compress1X_usingCTable(op, oend - op, src, srcSize, CTable) : HUF_compress4X_usingCTable(op, oend - op, src, srcSize, CTable);
+ if (HUF_isError(cSize)) {
+ return cSize;
+ }
+ if (cSize == 0) {
+ return 0;
+ } /* uncompressible */
+ op += cSize;
+ /* check compressibility */
+ if ((size_t)(op - ostart) >= srcSize - 1) {
+ return 0;
+ }
+ return op - ostart;
+}
+
+/* `workSpace` must a table of at least 1024 unsigned */
+static size_t HUF_compress_internal(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog,
+ unsigned singleStream, void *workSpace, size_t wkspSize, HUF_CElt *oldHufTable, HUF_repeat *repeat, int preferRepeat)
+{
+ BYTE *const ostart = (BYTE *)dst;
+ BYTE *const oend = ostart + dstSize;
+ BYTE *op = ostart;
+
+ U32 *count;
+ size_t const countSize = sizeof(U32) * (HUF_SYMBOLVALUE_MAX + 1);
+ HUF_CElt *CTable;
+ size_t const CTableSize = sizeof(HUF_CElt) * (HUF_SYMBOLVALUE_MAX + 1);
+
+ /* checks & inits */
+ if (wkspSize < sizeof(huffNodeTable) + countSize + CTableSize)
+ return ERROR(GENERIC);
+ if (!srcSize)
+ return 0; /* Uncompressed (note : 1 means rle, so first byte must be correct) */
+ if (!dstSize)
+ return 0; /* cannot fit within dst budget */
+ if (srcSize > HUF_BLOCKSIZE_MAX)
+ return ERROR(srcSize_wrong); /* curr block size limit */
+ if (huffLog > HUF_TABLELOG_MAX)
+ return ERROR(tableLog_tooLarge);
+ if (!maxSymbolValue)
+ maxSymbolValue = HUF_SYMBOLVALUE_MAX;
+ if (!huffLog)
+ huffLog = HUF_TABLELOG_DEFAULT;
+
+ count = (U32 *)workSpace;
+ workSpace = (BYTE *)workSpace + countSize;
+ wkspSize -= countSize;
+ CTable = (HUF_CElt *)workSpace;
+ workSpace = (BYTE *)workSpace + CTableSize;
+ wkspSize -= CTableSize;
+
+ /* Heuristic : If we don't need to check the validity of the old table use the old table for small inputs */
+ if (preferRepeat && repeat && *repeat == HUF_repeat_valid) {
+ return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable);
+ }
+
+ /* Scan input and build symbol stats */
+ {
+ CHECK_V_F(largest, FSE_count_wksp(count, &maxSymbolValue, (const BYTE *)src, srcSize, (U32 *)workSpace));
+ if (largest == srcSize) {
+ *ostart = ((const BYTE *)src)[0];
+ return 1;
+ } /* single symbol, rle */
+ if (largest <= (srcSize >> 7) + 1)
+ return 0; /* Fast heuristic : not compressible enough */
+ }
+
+ /* Check validity of previous table */
+ if (repeat && *repeat == HUF_repeat_check && !HUF_validateCTable(oldHufTable, count, maxSymbolValue)) {
+ *repeat = HUF_repeat_none;
+ }
+ /* Heuristic : use existing table for small inputs */
+ if (preferRepeat && repeat && *repeat != HUF_repeat_none) {
+ return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable);
+ }
+
+ /* Build Huffman Tree */
+ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
+ {
+ CHECK_V_F(maxBits, HUF_buildCTable_wksp(CTable, count, maxSymbolValue, huffLog, workSpace, wkspSize));
+ huffLog = (U32)maxBits;
+ /* Zero the unused symbols so we can check it for validity */
+ memset(CTable + maxSymbolValue + 1, 0, CTableSize - (maxSymbolValue + 1) * sizeof(HUF_CElt));
+ }
+
+ /* Write table description header */
+ {
+ CHECK_V_F(hSize, HUF_writeCTable_wksp(op, dstSize, CTable, maxSymbolValue, huffLog, workSpace, wkspSize));
+ /* Check if using the previous table will be beneficial */
+ if (repeat && *repeat != HUF_repeat_none) {
+ size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, count, maxSymbolValue);
+ size_t const newSize = HUF_estimateCompressedSize(CTable, count, maxSymbolValue);
+ if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) {
+ return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable);
+ }
+ }
+ /* Use the new table */
+ if (hSize + 12ul >= srcSize) {
+ return 0;
+ }
+ op += hSize;
+ if (repeat) {
+ *repeat = HUF_repeat_none;
+ }
+ if (oldHufTable) {
+ memcpy(oldHufTable, CTable, CTableSize);
+ } /* Save the new table */
+ }
+ return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, CTable);
+}
+
+size_t HUF_compress1X_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void *workSpace,
+ size_t wkspSize)
+{
+ return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 1 /* single stream */, workSpace, wkspSize, NULL, NULL, 0);
+}
+
+size_t HUF_compress1X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void *workSpace,
+ size_t wkspSize, HUF_CElt *hufTable, HUF_repeat *repeat, int preferRepeat)
+{
+ return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 1 /* single stream */, workSpace, wkspSize, hufTable, repeat,
+ preferRepeat);
+}
+
+size_t HUF_compress4X_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void *workSpace,
+ size_t wkspSize)
+{
+ return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 0 /* 4 streams */, workSpace, wkspSize, NULL, NULL, 0);
+}
+
+size_t HUF_compress4X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void *workSpace,
+ size_t wkspSize, HUF_CElt *hufTable, HUF_repeat *repeat, int preferRepeat)
+{
+ return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 0 /* 4 streams */, workSpace, wkspSize, hufTable, repeat,
+ preferRepeat);
+}
diff --git a/lib/zstd/huf_decompress.c b/lib/zstd/huf_decompress.c
new file mode 100644
index 000000000000..6526482047dc
--- /dev/null
+++ b/lib/zstd/huf_decompress.c
@@ -0,0 +1,960 @@
+/*
+ * Huffman decoder, part of New Generation Entropy library
+ * Copyright (C) 2013-2016, Yann Collet.
+ *
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+
+/* **************************************************************
+* Compiler specifics
+****************************************************************/
+#define FORCE_INLINE static __always_inline
+
+/* **************************************************************
+* Dependencies
+****************************************************************/
+#include "bitstream.h" /* BIT_* */
+#include "fse.h" /* header compression */
+#include "huf.h"
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/string.h> /* memcpy, memset */
+
+/* **************************************************************
+* Error Management
+****************************************************************/
+#define HUF_STATIC_ASSERT(c) \
+ { \
+ enum { HUF_static_assert = 1 / (int)(!!(c)) }; \
+ } /* use only *after* variable declarations */
+
+/*-***************************/
+/* generic DTableDesc */
+/*-***************************/
+
+typedef struct {
+ BYTE maxTableLog;
+ BYTE tableType;
+ BYTE tableLog;
+ BYTE reserved;
+} DTableDesc;
+
+static DTableDesc HUF_getDTableDesc(const HUF_DTable *table)
+{
+ DTableDesc dtd;
+ memcpy(&dtd, table, sizeof(dtd));
+ return dtd;
+}
+
+/*-***************************/
+/* single-symbol decoding */
+/*-***************************/
+
+typedef struct {
+ BYTE byte;
+ BYTE nbBits;
+} HUF_DEltX2; /* single-symbol decoding */
+
+size_t HUF_readDTableX2_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize)
+{
+ U32 tableLog = 0;
+ U32 nbSymbols = 0;
+ size_t iSize;
+ void *const dtPtr = DTable + 1;
+ HUF_DEltX2 *const dt = (HUF_DEltX2 *)dtPtr;
+
+ U32 *rankVal;
+ BYTE *huffWeight;
+ size_t spaceUsed32 = 0;
+
+ rankVal = (U32 *)workspace + spaceUsed32;
+ spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1;
+ huffWeight = (BYTE *)((U32 *)workspace + spaceUsed32);
+ spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+ if ((spaceUsed32 << 2) > workspaceSize)
+ return ERROR(tableLog_tooLarge);
+ workspace = (U32 *)workspace + spaceUsed32;
+ workspaceSize -= (spaceUsed32 << 2);
+
+ HUF_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
+ /* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
+
+ iSize = HUF_readStats_wksp(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize, workspace, workspaceSize);
+ if (HUF_isError(iSize))
+ return iSize;
+
+ /* Table header */
+ {
+ DTableDesc dtd = HUF_getDTableDesc(DTable);
+ if (tableLog > (U32)(dtd.maxTableLog + 1))
+ return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */
+ dtd.tableType = 0;
+ dtd.tableLog = (BYTE)tableLog;
+ memcpy(DTable, &dtd, sizeof(dtd));
+ }
+
+ /* Calculate starting value for each rank */
+ {
+ U32 n, nextRankStart = 0;
+ for (n = 1; n < tableLog + 1; n++) {
+ U32 const curr = nextRankStart;
+ nextRankStart += (rankVal[n] << (n - 1));
+ rankVal[n] = curr;
+ }
+ }
+
+ /* fill DTable */
+ {
+ U32 n;
+ for (n = 0; n < nbSymbols; n++) {
+ U32 const w = huffWeight[n];
+ U32 const length = (1 << w) >> 1;
+ U32 u;
+ HUF_DEltX2 D;
+ D.byte = (BYTE)n;
+ D.nbBits = (BYTE)(tableLog + 1 - w);
+ for (u = rankVal[w]; u < rankVal[w] + length; u++)
+ dt[u] = D;
+ rankVal[w] += length;
+ }
+ }
+
+ return iSize;
+}
+
+static BYTE HUF_decodeSymbolX2(BIT_DStream_t *Dstream, const HUF_DEltX2 *dt, const U32 dtLog)
+{
+ size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+ BYTE const c = dt[val].byte;
+ BIT_skipBits(Dstream, dt[val].nbBits);
+ return c;
+}
+
+#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+ if (ZSTD_64bits() || (HUF_TABLELOG_MAX <= 12)) \
+ HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+ if (ZSTD_64bits()) \
+ HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+FORCE_INLINE size_t HUF_decodeStreamX2(BYTE *p, BIT_DStream_t *const bitDPtr, BYTE *const pEnd, const HUF_DEltX2 *const dt, const U32 dtLog)
+{
+ BYTE *const pStart = p;
+
+ /* up to 4 symbols at a time */
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd - 4)) {
+ HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+ HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
+ HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+ }
+
+ /* closer to the end */
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd))
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+ /* no more data to retrieve from bitstream, hence no need to reload */
+ while (p < pEnd)
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+ return pEnd - pStart;
+}
+
+static size_t HUF_decompress1X2_usingDTable_internal(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+ BYTE *op = (BYTE *)dst;
+ BYTE *const oend = op + dstSize;
+ const void *dtPtr = DTable + 1;
+ const HUF_DEltX2 *const dt = (const HUF_DEltX2 *)dtPtr;
+ BIT_DStream_t bitD;
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+ U32 const dtLog = dtd.tableLog;
+
+ {
+ size_t const errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize);
+ if (HUF_isError(errorCode))
+ return errorCode;
+ }
+
+ HUF_decodeStreamX2(op, &bitD, oend, dt, dtLog);
+
+ /* check */
+ if (!BIT_endOfDStream(&bitD))
+ return ERROR(corruption_detected);
+
+ return dstSize;
+}
+
+size_t HUF_decompress1X2_usingDTable(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+ DTableDesc dtd = HUF_getDTableDesc(DTable);
+ if (dtd.tableType != 0)
+ return ERROR(GENERIC);
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable *DCtx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
+{
+ const BYTE *ip = (const BYTE *)cSrc;
+
+ size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workspace, workspaceSize);
+ if (HUF_isError(hSize))
+ return hSize;
+ if (hSize >= cSrcSize)
+ return ERROR(srcSize_wrong);
+ ip += hSize;
+ cSrcSize -= hSize;
+
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx);
+}
+
+static size_t HUF_decompress4X2_usingDTable_internal(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+ /* Check */
+ if (cSrcSize < 10)
+ return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
+
+ {
+ const BYTE *const istart = (const BYTE *)cSrc;
+ BYTE *const ostart = (BYTE *)dst;
+ BYTE *const oend = ostart + dstSize;
+ const void *const dtPtr = DTable + 1;
+ const HUF_DEltX2 *const dt = (const HUF_DEltX2 *)dtPtr;
+
+ /* Init */
+ BIT_DStream_t bitD1;
+ BIT_DStream_t bitD2;
+ BIT_DStream_t bitD3;
+ BIT_DStream_t bitD4;
+ size_t const length1 = ZSTD_readLE16(istart);
+ size_t const length2 = ZSTD_readLE16(istart + 2);
+ size_t const length3 = ZSTD_readLE16(istart + 4);
+ size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
+ const BYTE *const istart1 = istart + 6; /* jumpTable */
+ const BYTE *const istart2 = istart1 + length1;
+ const BYTE *const istart3 = istart2 + length2;
+ const BYTE *const istart4 = istart3 + length3;
+ const size_t segmentSize = (dstSize + 3) / 4;
+ BYTE *const opStart2 = ostart + segmentSize;
+ BYTE *const opStart3 = opStart2 + segmentSize;
+ BYTE *const opStart4 = opStart3 + segmentSize;
+ BYTE *op1 = ostart;
+ BYTE *op2 = opStart2;
+ BYTE *op3 = opStart3;
+ BYTE *op4 = opStart4;
+ U32 endSignal;
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+ U32 const dtLog = dtd.tableLog;
+
+ if (length4 > cSrcSize)
+ return ERROR(corruption_detected); /* overflow */
+ {
+ size_t const errorCode = BIT_initDStream(&bitD1, istart1, length1);
+ if (HUF_isError(errorCode))
+ return errorCode;
+ }
+ {
+ size_t const errorCode = BIT_initDStream(&bitD2, istart2, length2);
+ if (HUF_isError(errorCode))
+ return errorCode;
+ }
+ {
+ size_t const errorCode = BIT_initDStream(&bitD3, istart3, length3);
+ if (HUF_isError(errorCode))
+ return errorCode;
+ }
+ {
+ size_t const errorCode = BIT_initDStream(&bitD4, istart4, length4);
+ if (HUF_isError(errorCode))
+ return errorCode;
+ }
+
+ /* 16-32 symbols per loop (4-8 symbols per stream) */
+ endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+ for (; (endSignal == BIT_DStream_unfinished) && (op4 < (oend - 7));) {
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+ HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+ HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+ HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+ HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+ endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+ }
+
+ /* check corruption */
+ if (op1 > opStart2)
+ return ERROR(corruption_detected);
+ if (op2 > opStart3)
+ return ERROR(corruption_detected);
+ if (op3 > opStart4)
+ return ERROR(corruption_detected);
+ /* note : op4 supposed already verified within main loop */
+
+ /* finish bitStreams one by one */
+ HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+ HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+ HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+ HUF_decodeStreamX2(op4, &bitD4, oend, dt, dtLog);
+
+ /* check */
+ endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+ if (!endSignal)
+ return ERROR(corruption_detected);
+
+ /* decoded size */
+ return dstSize;
+ }
+}
+
+size_t HUF_decompress4X2_usingDTable(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+ DTableDesc dtd = HUF_getDTableDesc(DTable);
+ if (dtd.tableType != 0)
+ return ERROR(GENERIC);
+ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
+{
+ const BYTE *ip = (const BYTE *)cSrc;
+
+ size_t const hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, workspace, workspaceSize);
+ if (HUF_isError(hSize))
+ return hSize;
+ if (hSize >= cSrcSize)
+ return ERROR(srcSize_wrong);
+ ip += hSize;
+ cSrcSize -= hSize;
+
+ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx);
+}
+
+/* *************************/
+/* double-symbols decoding */
+/* *************************/
+typedef struct {
+ U16 sequence;
+ BYTE nbBits;
+ BYTE length;
+} HUF_DEltX4; /* double-symbols decoding */
+
+typedef struct {
+ BYTE symbol;
+ BYTE weight;
+} sortedSymbol_t;
+
+/* HUF_fillDTableX4Level2() :
+ * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
+static void HUF_fillDTableX4Level2(HUF_DEltX4 *DTable, U32 sizeLog, const U32 consumed, const U32 *rankValOrigin, const int minWeight,
+ const sortedSymbol_t *sortedSymbols, const U32 sortedListSize, U32 nbBitsBaseline, U16 baseSeq)
+{
+ HUF_DEltX4 DElt;
+ U32 rankVal[HUF_TABLELOG_MAX + 1];
+
+ /* get pre-calculated rankVal */
+ memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+ /* fill skipped values */
+ if (minWeight > 1) {
+ U32 i, skipSize = rankVal[minWeight];
+ ZSTD_writeLE16(&(DElt.sequence), baseSeq);
+ DElt.nbBits = (BYTE)(consumed);
+ DElt.length = 1;
+ for (i = 0; i < skipSize; i++)
+ DTable[i] = DElt;
+ }
+
+ /* fill DTable */
+ {
+ U32 s;
+ for (s = 0; s < sortedListSize; s++) { /* note : sortedSymbols already skipped */
+ const U32 symbol = sortedSymbols[s].symbol;
+ const U32 weight = sortedSymbols[s].weight;
+ const U32 nbBits = nbBitsBaseline - weight;
+ const U32 length = 1 << (sizeLog - nbBits);
+ const U32 start = rankVal[weight];
+ U32 i = start;
+ const U32 end = start + length;
+
+ ZSTD_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
+ DElt.nbBits = (BYTE)(nbBits + consumed);
+ DElt.length = 2;
+ do {
+ DTable[i++] = DElt;
+ } while (i < end); /* since length >= 1 */
+
+ rankVal[weight] += length;
+ }
+ }
+}
+
+typedef U32 rankVal_t[HUF_TABLELOG_MAX][HUF_TABLELOG_MAX + 1];
+typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
+
+static void HUF_fillDTableX4(HUF_DEltX4 *DTable, const U32 targetLog, const sortedSymbol_t *sortedList, const U32 sortedListSize, const U32 *rankStart,
+ rankVal_t rankValOrigin, const U32 maxWeight, const U32 nbBitsBaseline)
+{
+ U32 rankVal[HUF_TABLELOG_MAX + 1];
+ const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+ const U32 minBits = nbBitsBaseline - maxWeight;
+ U32 s;
+
+ memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+ /* fill DTable */
+ for (s = 0; s < sortedListSize; s++) {
+ const U16 symbol = sortedList[s].symbol;
+ const U32 weight = sortedList[s].weight;
+ const U32 nbBits = nbBitsBaseline - weight;
+ const U32 start = rankVal[weight];
+ const U32 length = 1 << (targetLog - nbBits);
+
+ if (targetLog - nbBits >= minBits) { /* enough room for a second symbol */
+ U32 sortedRank;
+ int minWeight = nbBits + scaleLog;
+ if (minWeight < 1)
+ minWeight = 1;
+ sortedRank = rankStart[minWeight];
+ HUF_fillDTableX4Level2(DTable + start, targetLog - nbBits, nbBits, rankValOrigin[nbBits], minWeight, sortedList + sortedRank,
+ sortedListSize - sortedRank, nbBitsBaseline, symbol);
+ } else {
+ HUF_DEltX4 DElt;
+ ZSTD_writeLE16(&(DElt.sequence), symbol);
+ DElt.nbBits = (BYTE)(nbBits);
+ DElt.length = 1;
+ {
+ U32 const end = start + length;
+ U32 u;
+ for (u = start; u < end; u++)
+ DTable[u] = DElt;
+ }
+ }
+ rankVal[weight] += length;
+ }
+}
+
+size_t HUF_readDTableX4_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize)
+{
+ U32 tableLog, maxW, sizeOfSort, nbSymbols;
+ DTableDesc dtd = HUF_getDTableDesc(DTable);
+ U32 const maxTableLog = dtd.maxTableLog;
+ size_t iSize;
+ void *dtPtr = DTable + 1; /* force compiler to avoid strict-aliasing */
+ HUF_DEltX4 *const dt = (HUF_DEltX4 *)dtPtr;
+ U32 *rankStart;
+
+ rankValCol_t *rankVal;
+ U32 *rankStats;
+ U32 *rankStart0;
+ sortedSymbol_t *sortedSymbol;
+ BYTE *weightList;
+ size_t spaceUsed32 = 0;
+
+ HUF_STATIC_ASSERT((sizeof(rankValCol_t) & 3) == 0);
+
+ rankVal = (rankValCol_t *)((U32 *)workspace + spaceUsed32);
+ spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2;
+ rankStats = (U32 *)workspace + spaceUsed32;
+ spaceUsed32 += HUF_TABLELOG_MAX + 1;
+ rankStart0 = (U32 *)workspace + spaceUsed32;
+ spaceUsed32 += HUF_TABLELOG_MAX + 2;
+ sortedSymbol = (sortedSymbol_t *)((U32 *)workspace + spaceUsed32);
+ spaceUsed32 += ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2;
+ weightList = (BYTE *)((U32 *)workspace + spaceUsed32);
+ spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+ if ((spaceUsed32 << 2) > workspaceSize)
+ return ERROR(tableLog_tooLarge);
+ workspace = (U32 *)workspace + spaceUsed32;
+ workspaceSize -= (spaceUsed32 << 2);
+
+ rankStart = rankStart0 + 1;
+ memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1));
+
+ HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */
+ if (maxTableLog > HUF_TABLELOG_MAX)
+ return ERROR(tableLog_tooLarge);
+ /* memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */
+
+ iSize = HUF_readStats_wksp(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize, workspace, workspaceSize);
+ if (HUF_isError(iSize))
+ return iSize;
+
+ /* check result */
+ if (tableLog > maxTableLog)
+ return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */
+
+ /* find maxWeight */
+ for (maxW = tableLog; rankStats[maxW] == 0; maxW--) {
+ } /* necessarily finds a solution before 0 */
+
+ /* Get start index of each weight */
+ {
+ U32 w, nextRankStart = 0;
+ for (w = 1; w < maxW + 1; w++) {
+ U32 curr = nextRankStart;
+ nextRankStart += rankStats[w];
+ rankStart[w] = curr;
+ }
+ rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/
+ sizeOfSort = nextRankStart;
+ }
+
+ /* sort symbols by weight */
+ {
+ U32 s;
+ for (s = 0; s < nbSymbols; s++) {
+ U32 const w = weightList[s];
+ U32 const r = rankStart[w]++;
+ sortedSymbol[r].symbol = (BYTE)s;
+ sortedSymbol[r].weight = (BYTE)w;
+ }
+ rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */
+ }
+
+ /* Build rankVal */
+ {
+ U32 *const rankVal0 = rankVal[0];
+ {
+ int const rescale = (maxTableLog - tableLog) - 1; /* tableLog <= maxTableLog */
+ U32 nextRankVal = 0;
+ U32 w;
+ for (w = 1; w < maxW + 1; w++) {
+ U32 curr = nextRankVal;
+ nextRankVal += rankStats[w] << (w + rescale);
+ rankVal0[w] = curr;
+ }
+ }
+ {
+ U32 const minBits = tableLog + 1 - maxW;
+ U32 consumed;
+ for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) {
+ U32 *const rankValPtr = rankVal[consumed];
+ U32 w;
+ for (w = 1; w < maxW + 1; w++) {
+ rankValPtr[w] = rankVal0[w] >> consumed;
+ }
+ }
+ }
+ }
+
+ HUF_fillDTableX4(dt, maxTableLog, sortedSymbol, sizeOfSort, rankStart0, rankVal, maxW, tableLog + 1);
+
+ dtd.tableLog = (BYTE)maxTableLog;
+ dtd.tableType = 1;
+ memcpy(DTable, &dtd, sizeof(dtd));
+ return iSize;
+}
+
+static U32 HUF_decodeSymbolX4(void *op, BIT_DStream_t *DStream, const HUF_DEltX4 *dt, const U32 dtLog)
+{
+ size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
+ memcpy(op, dt + val, 2);
+ BIT_skipBits(DStream, dt[val].nbBits);
+ return dt[val].length;
+}
+
+static U32 HUF_decodeLastSymbolX4(void *op, BIT_DStream_t *DStream, const HUF_DEltX4 *dt, const U32 dtLog)
+{
+ size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
+ memcpy(op, dt + val, 1);
+ if (dt[val].length == 1)
+ BIT_skipBits(DStream, dt[val].nbBits);
+ else {
+ if (DStream->bitsConsumed < (sizeof(DStream->bitContainer) * 8)) {
+ BIT_skipBits(DStream, dt[val].nbBits);
+ if (DStream->bitsConsumed > (sizeof(DStream->bitContainer) * 8))
+ /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+ DStream->bitsConsumed = (sizeof(DStream->bitContainer) * 8);
+ }
+ }
+ return 1;
+}
+
+#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
+ if (ZSTD_64bits() || (HUF_TABLELOG_MAX <= 12)) \
+ ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
+ if (ZSTD_64bits()) \
+ ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+FORCE_INLINE size_t HUF_decodeStreamX4(BYTE *p, BIT_DStream_t *bitDPtr, BYTE *const pEnd, const HUF_DEltX4 *const dt, const U32 dtLog)
+{
+ BYTE *const pStart = p;
+
+ /* up to 8 symbols at a time */
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd - (sizeof(bitDPtr->bitContainer) - 1))) {
+ HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+ HUF_DECODE_SYMBOLX4_1(p, bitDPtr);
+ HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+ HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+ }
+
+ /* closer to end : up to 2 symbols at a time */
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd - 2))
+ HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+
+ while (p <= pEnd - 2)
+ HUF_DECODE_SYMBOLX4_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
+
+ if (p < pEnd)
+ p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
+
+ return p - pStart;
+}
+
+static size_t HUF_decompress1X4_usingDTable_internal(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+ BIT_DStream_t bitD;
+
+ /* Init */
+ {
+ size_t const errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize);
+ if (HUF_isError(errorCode))
+ return errorCode;
+ }
+
+ /* decode */
+ {
+ BYTE *const ostart = (BYTE *)dst;
+ BYTE *const oend = ostart + dstSize;
+ const void *const dtPtr = DTable + 1; /* force compiler to not use strict-aliasing */
+ const HUF_DEltX4 *const dt = (const HUF_DEltX4 *)dtPtr;
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+ HUF_decodeStreamX4(ostart, &bitD, oend, dt, dtd.tableLog);
+ }
+
+ /* check */
+ if (!BIT_endOfDStream(&bitD))
+ return ERROR(corruption_detected);
+
+ /* decoded size */
+ return dstSize;
+}
+
+size_t HUF_decompress1X4_usingDTable(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+ DTableDesc dtd = HUF_getDTableDesc(DTable);
+ if (dtd.tableType != 1)
+ return ERROR(GENERIC);
+ return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable *DCtx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
+{
+ const BYTE *ip = (const BYTE *)cSrc;
+
+ size_t const hSize = HUF_readDTableX4_wksp(DCtx, cSrc, cSrcSize, workspace, workspaceSize);
+ if (HUF_isError(hSize))
+ return hSize;
+ if (hSize >= cSrcSize)
+ return ERROR(srcSize_wrong);
+ ip += hSize;
+ cSrcSize -= hSize;
+
+ return HUF_decompress1X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx);
+}
+
+static size_t HUF_decompress4X4_usingDTable_internal(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+ if (cSrcSize < 10)
+ return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
+
+ {
+ const BYTE *const istart = (const BYTE *)cSrc;
+ BYTE *const ostart = (BYTE *)dst;
+ BYTE *const oend = ostart + dstSize;
+ const void *const dtPtr = DTable + 1;
+ const HUF_DEltX4 *const dt = (const HUF_DEltX4 *)dtPtr;
+
+ /* Init */
+ BIT_DStream_t bitD1;
+ BIT_DStream_t bitD2;
+ BIT_DStream_t bitD3;
+ BIT_DStream_t bitD4;
+ size_t const length1 = ZSTD_readLE16(istart);
+ size_t const length2 = ZSTD_readLE16(istart + 2);
+ size_t const length3 = ZSTD_readLE16(istart + 4);
+ size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
+ const BYTE *const istart1 = istart + 6; /* jumpTable */
+ const BYTE *const istart2 = istart1 + length1;
+ const BYTE *const istart3 = istart2 + length2;
+ const BYTE *const istart4 = istart3 + length3;
+ size_t const segmentSize = (dstSize + 3) / 4;
+ BYTE *const opStart2 = ostart + segmentSize;
+ BYTE *const opStart3 = opStart2 + segmentSize;
+ BYTE *const opStart4 = opStart3 + segmentSize;
+ BYTE *op1 = ostart;
+ BYTE *op2 = opStart2;
+ BYTE *op3 = opStart3;
+ BYTE *op4 = opStart4;
+ U32 endSignal;
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+ U32 const dtLog = dtd.tableLog;
+
+ if (length4 > cSrcSize)
+ return ERROR(corruption_detected); /* overflow */
+ {
+ size_t const errorCode = BIT_initDStream(&bitD1, istart1, length1);
+ if (HUF_isError(errorCode))
+ return errorCode;
+ }
+ {
+ size_t const errorCode = BIT_initDStream(&bitD2, istart2, length2);
+ if (HUF_isError(errorCode))
+ return errorCode;
+ }
+ {
+ size_t const errorCode = BIT_initDStream(&bitD3, istart3, length3);
+ if (HUF_isError(errorCode))
+ return errorCode;
+ }
+ {
+ size_t const errorCode = BIT_initDStream(&bitD4, istart4, length4);
+ if (HUF_isError(errorCode))
+ return errorCode;
+ }
+
+ /* 16-32 symbols per loop (4-8 symbols per stream) */
+ endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+ for (; (endSignal == BIT_DStream_unfinished) & (op4 < (oend - (sizeof(bitD4.bitContainer) - 1)));) {
+ HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+ HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+ HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+ HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+ HUF_DECODE_SYMBOLX4_1(op1, &bitD1);
+ HUF_DECODE_SYMBOLX4_1(op2, &bitD2);
+ HUF_DECODE_SYMBOLX4_1(op3, &bitD3);
+ HUF_DECODE_SYMBOLX4_1(op4, &bitD4);
+ HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+ HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+ HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+ HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+ HUF_DECODE_SYMBOLX4_0(op1, &bitD1);
+ HUF_DECODE_SYMBOLX4_0(op2, &bitD2);
+ HUF_DECODE_SYMBOLX4_0(op3, &bitD3);
+ HUF_DECODE_SYMBOLX4_0(op4, &bitD4);
+
+ endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+ }
+
+ /* check corruption */
+ if (op1 > opStart2)
+ return ERROR(corruption_detected);
+ if (op2 > opStart3)
+ return ERROR(corruption_detected);
+ if (op3 > opStart4)
+ return ERROR(corruption_detected);
+ /* note : op4 already verified within main loop */
+
+ /* finish bitStreams one by one */
+ HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
+ HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
+ HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
+ HUF_decodeStreamX4(op4, &bitD4, oend, dt, dtLog);
+
+ /* check */
+ {
+ U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+ if (!endCheck)
+ return ERROR(corruption_detected);
+ }
+
+ /* decoded size */
+ return dstSize;
+ }
+}
+
+size_t HUF_decompress4X4_usingDTable(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+ DTableDesc dtd = HUF_getDTableDesc(DTable);
+ if (dtd.tableType != 1)
+ return ERROR(GENERIC);
+ return HUF_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
+{
+ const BYTE *ip = (const BYTE *)cSrc;
+
+ size_t hSize = HUF_readDTableX4_wksp(dctx, cSrc, cSrcSize, workspace, workspaceSize);
+ if (HUF_isError(hSize))
+ return hSize;
+ if (hSize >= cSrcSize)
+ return ERROR(srcSize_wrong);
+ ip += hSize;
+ cSrcSize -= hSize;
+
+ return HUF_decompress4X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx);
+}
+
+/* ********************************/
+/* Generic decompression selector */
+/* ********************************/
+
+size_t HUF_decompress1X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+ return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable)
+ : HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable);
+}
+
+size_t HUF_decompress4X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+ return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable)
+ : HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable);
+}
+
+typedef struct {
+ U32 tableTime;
+ U32 decode256Time;
+} algo_time_t;
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] = {
+ /* single, double, quad */
+ {{0, 0}, {1, 1}, {2, 2}}, /* Q==0 : impossible */
+ {{0, 0}, {1, 1}, {2, 2}}, /* Q==1 : impossible */
+ {{38, 130}, {1313, 74}, {2151, 38}}, /* Q == 2 : 12-18% */
+ {{448, 128}, {1353, 74}, {2238, 41}}, /* Q == 3 : 18-25% */
+ {{556, 128}, {1353, 74}, {2238, 47}}, /* Q == 4 : 25-32% */
+ {{714, 128}, {1418, 74}, {2436, 53}}, /* Q == 5 : 32-38% */
+ {{883, 128}, {1437, 74}, {2464, 61}}, /* Q == 6 : 38-44% */
+ {{897, 128}, {1515, 75}, {2622, 68}}, /* Q == 7 : 44-50% */
+ {{926, 128}, {1613, 75}, {2730, 75}}, /* Q == 8 : 50-56% */
+ {{947, 128}, {1729, 77}, {3359, 77}}, /* Q == 9 : 56-62% */
+ {{1107, 128}, {2083, 81}, {4006, 84}}, /* Q ==10 : 62-69% */
+ {{1177, 128}, {2379, 87}, {4785, 88}}, /* Q ==11 : 69-75% */
+ {{1242, 128}, {2415, 93}, {5155, 84}}, /* Q ==12 : 75-81% */
+ {{1349, 128}, {2644, 106}, {5260, 106}}, /* Q ==13 : 81-87% */
+ {{1455, 128}, {2422, 124}, {4174, 124}}, /* Q ==14 : 87-93% */
+ {{722, 128}, {1891, 145}, {1936, 146}}, /* Q ==15 : 93-99% */
+};
+
+/** HUF_selectDecoder() :
+* Tells which decoder is likely to decode faster,
+* based on a set of pre-determined metrics.
+* @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
+* Assumption : 0 < cSrcSize < dstSize <= 128 KB */
+U32 HUF_selectDecoder(size_t dstSize, size_t cSrcSize)
+{
+ /* decoder timing evaluation */
+ U32 const Q = (U32)(cSrcSize * 16 / dstSize); /* Q < 16 since dstSize > cSrcSize */
+ U32 const D256 = (U32)(dstSize >> 8);
+ U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
+ U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
+ DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, for cache eviction */
+
+ return DTime1 < DTime0;
+}
+
+typedef size_t (*decompressionAlgo)(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize);
+
+size_t HUF_decompress4X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
+{
+ /* validation checks */
+ if (dstSize == 0)
+ return ERROR(dstSize_tooSmall);
+ if (cSrcSize > dstSize)
+ return ERROR(corruption_detected); /* invalid */
+ if (cSrcSize == dstSize) {
+ memcpy(dst, cSrc, dstSize);
+ return dstSize;
+ } /* not compressed */
+ if (cSrcSize == 1) {
+ memset(dst, *(const BYTE *)cSrc, dstSize);
+ return dstSize;
+ } /* RLE */
+
+ {
+ U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+ return algoNb ? HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize)
+ : HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize);
+ }
+}
+
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
+{
+ /* validation checks */
+ if (dstSize == 0)
+ return ERROR(dstSize_tooSmall);
+ if ((cSrcSize >= dstSize) || (cSrcSize <= 1))
+ return ERROR(corruption_detected); /* invalid */
+
+ {
+ U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+ return algoNb ? HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize)
+ : HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize);
+ }
+}
+
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
+{
+ /* validation checks */
+ if (dstSize == 0)
+ return ERROR(dstSize_tooSmall);
+ if (cSrcSize > dstSize)
+ return ERROR(corruption_detected); /* invalid */
+ if (cSrcSize == dstSize) {
+ memcpy(dst, cSrc, dstSize);
+ return dstSize;
+ } /* not compressed */
+ if (cSrcSize == 1) {
+ memset(dst, *(const BYTE *)cSrc, dstSize);
+ return dstSize;
+ } /* RLE */
+
+ {
+ U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+ return algoNb ? HUF_decompress1X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize)
+ : HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize);
+ }
+}
diff --git a/lib/zstd/mem.h b/lib/zstd/mem.h
new file mode 100644
index 000000000000..3a0f34c8706c
--- /dev/null
+++ b/lib/zstd/mem.h
@@ -0,0 +1,151 @@
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of https://github.com/facebook/zstd.
+ * An additional grant of patent rights can be found in the PATENTS file in the
+ * same directory.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ */
+
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+/*-****************************************
+* Dependencies
+******************************************/
+#include <asm/unaligned.h>
+#include <linux/string.h> /* memcpy */
+#include <linux/types.h> /* size_t, ptrdiff_t */
+
+/*-****************************************
+* Compiler specifics
+******************************************/
+#define ZSTD_STATIC static __inline __attribute__((unused))
+
+/*-**************************************************************
+* Basic Types
+*****************************************************************/
+typedef uint8_t BYTE;
+typedef uint16_t U16;
+typedef int16_t S16;
+typedef uint32_t U32;
+typedef int32_t S32;
+typedef uint64_t U64;
+typedef int64_t S64;
+typedef ptrdiff_t iPtrDiff;
+typedef uintptr_t uPtrDiff;
+
+/*-**************************************************************
+* Memory I/O
+*****************************************************************/
+ZSTD_STATIC unsigned ZSTD_32bits(void) { return sizeof(size_t) == 4; }
+ZSTD_STATIC unsigned ZSTD_64bits(void) { return sizeof(size_t) == 8; }
+
+#if defined(__LITTLE_ENDIAN)
+#define ZSTD_LITTLE_ENDIAN 1
+#else
+#define ZSTD_LITTLE_ENDIAN 0
+#endif
+
+ZSTD_STATIC unsigned ZSTD_isLittleEndian(void) { return ZSTD_LITTLE_ENDIAN; }
+
+ZSTD_STATIC U16 ZSTD_read16(const void *memPtr) { return get_unaligned((const U16 *)memPtr); }
+
+ZSTD_STATIC U32 ZSTD_read32(const void *memPtr) { return get_unaligned((const U32 *)memPtr); }
+
+ZSTD_STATIC U64 ZSTD_read64(const void *memPtr) { return get_unaligned((const U64 *)memPtr); }
+
+ZSTD_STATIC size_t ZSTD_readST(const void *memPtr) { return get_unaligned((const size_t *)memPtr); }
+
+ZSTD_STATIC void ZSTD_write16(void *memPtr, U16 value) { put_unaligned(value, (U16 *)memPtr); }
+
+ZSTD_STATIC void ZSTD_write32(void *memPtr, U32 value) { put_unaligned(value, (U32 *)memPtr); }
+
+ZSTD_STATIC void ZSTD_write64(void *memPtr, U64 value) { put_unaligned(value, (U64 *)memPtr); }
+
+/*=== Little endian r/w ===*/
+
+ZSTD_STATIC U16 ZSTD_readLE16(const void *memPtr) { return get_unaligned_le16(memPtr); }
+
+ZSTD_STATIC void ZSTD_writeLE16(void *memPtr, U16 val) { put_unaligned_le16(val, memPtr); }
+
+ZSTD_STATIC U32 ZSTD_readLE24(const void *memPtr) { return ZSTD_readLE16(memPtr) + (((const BYTE *)memPtr)[2] << 16); }
+
+ZSTD_STATIC void ZSTD_writeLE24(void *memPtr, U32 val)
+{
+ ZSTD_writeLE16(memPtr, (U16)val);
+ ((BYTE *)memPtr)[2] = (BYTE)(val >> 16);
+}
+
+ZSTD_STATIC U32 ZSTD_readLE32(const void *memPtr) { return get_unaligned_le32(memPtr); }
+
+ZSTD_STATIC void ZSTD_writeLE32(void *memPtr, U32 val32) { put_unaligned_le32(val32, memPtr); }
+
+ZSTD_STATIC U64 ZSTD_readLE64(const void *memPtr) { return get_unaligned_le64(memPtr); }
+
+ZSTD_STATIC void ZSTD_writeLE64(void *memPtr, U64 val64) { put_unaligned_le64(val64, memPtr); }
+
+ZSTD_STATIC size_t ZSTD_readLEST(const void *memPtr)
+{
+ if (ZSTD_32bits())
+ return (size_t)ZSTD_readLE32(memPtr);
+ else
+ return (size_t)ZSTD_readLE64(memPtr);
+}
+
+ZSTD_STATIC void ZSTD_writeLEST(void *memPtr, size_t val)
+{
+ if (ZSTD_32bits())
+ ZSTD_writeLE32(memPtr, (U32)val);
+ else
+ ZSTD_writeLE64(memPtr, (U64)val);
+}
+
+/*=== Big endian r/w ===*/
+
+ZSTD_STATIC U32 ZSTD_readBE32(const void *memPtr) { return get_unaligned_be32(memPtr); }
+
+ZSTD_STATIC void ZSTD_writeBE32(void *memPtr, U32 val32) { put_unaligned_be32(val32, memPtr); }
+
+ZSTD_STATIC U64 ZSTD_readBE64(const void *memPtr) { return get_unaligned_be64(memPtr); }
+
+ZSTD_STATIC void ZSTD_writeBE64(void *memPtr, U64 val64) { put_unaligned_be64(val64, memPtr); }
+
+ZSTD_STATIC size_t ZSTD_readBEST(const void *memPtr)
+{
+ if (ZSTD_32bits())
+ return (size_t)ZSTD_readBE32(memPtr);
+ else
+ return (size_t)ZSTD_readBE64(memPtr);
+}
+
+ZSTD_STATIC void ZSTD_writeBEST(void *memPtr, size_t val)
+{
+ if (ZSTD_32bits())
+ ZSTD_writeBE32(memPtr, (U32)val);
+ else
+ ZSTD_writeBE64(memPtr, (U64)val);
+}
+
+/* function safe only for comparisons */
+ZSTD_STATIC U32 ZSTD_readMINMATCH(const void *memPtr, U32 length)
+{
+ switch (length) {
+ default:
+ case 4: return ZSTD_read32(memPtr);
+ case 3:
+ if (ZSTD_isLittleEndian())
+ return ZSTD_read32(memPtr) << 8;
+ else
+ return ZSTD_read32(memPtr) >> 8;
+ }
+}
+
+#endif /* MEM_H_MODULE */
diff --git a/lib/zstd/zstd_common.c b/lib/zstd/zstd_common.c
new file mode 100644
index 000000000000..a282624ee155
--- /dev/null
+++ b/lib/zstd/zstd_common.c
@@ -0,0 +1,75 @@
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of https://github.com/facebook/zstd.
+ * An additional grant of patent rights can be found in the PATENTS file in the
+ * same directory.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ */
+
+/*-*************************************
+* Dependencies
+***************************************/
+#include "error_private.h"
+#include "zstd_internal.h" /* declaration of ZSTD_isError, ZSTD_getErrorName, ZSTD_getErrorCode, ZSTD_getErrorString, ZSTD_versionNumber */
+#include <linux/kernel.h>
+
+/*=**************************************************************
+* Custom allocator
+****************************************************************/
+
+#define stack_push(stack, size) \
+ ({ \
+ void *const ptr = ZSTD_PTR_ALIGN((stack)->ptr); \
+ (stack)->ptr = (char *)ptr + (size); \
+ (stack)->ptr <= (stack)->end ? ptr : NULL; \
+ })
+
+ZSTD_customMem ZSTD_initStack(void *workspace, size_t workspaceSize)
+{
+ ZSTD_customMem stackMem = {ZSTD_stackAlloc, ZSTD_stackFree, workspace};
+ ZSTD_stack *stack = (ZSTD_stack *)workspace;
+ /* Verify preconditions */
+ if (!workspace || workspaceSize < sizeof(ZSTD_stack) || workspace != ZSTD_PTR_ALIGN(workspace)) {
+ ZSTD_customMem error = {NULL, NULL, NULL};
+ return error;
+ }
+ /* Initialize the stack */
+ stack->ptr = workspace;
+ stack->end = (char *)workspace + workspaceSize;
+ stack_push(stack, sizeof(ZSTD_stack));
+ return stackMem;
+}
+
+void *ZSTD_stackAllocAll(void *opaque, size_t *size)
+{
+ ZSTD_stack *stack = (ZSTD_stack *)opaque;
+ *size = (BYTE const *)stack->end - (BYTE *)ZSTD_PTR_ALIGN(stack->ptr);
+ return stack_push(stack, *size);
+}
+
+void *ZSTD_stackAlloc(void *opaque, size_t size)
+{
+ ZSTD_stack *stack = (ZSTD_stack *)opaque;
+ return stack_push(stack, size);
+}
+void ZSTD_stackFree(void *opaque, void *address)
+{
+ (void)opaque;
+ (void)address;
+}
+
+void *ZSTD_malloc(size_t size, ZSTD_customMem customMem) { return customMem.customAlloc(customMem.opaque, size); }
+
+void ZSTD_free(void *ptr, ZSTD_customMem customMem)
+{
+ if (ptr != NULL)
+ customMem.customFree(customMem.opaque, ptr);
+}
diff --git a/lib/zstd/zstd_internal.h b/lib/zstd/zstd_internal.h
new file mode 100644
index 000000000000..1a79fab9e13a
--- /dev/null
+++ b/lib/zstd/zstd_internal.h
@@ -0,0 +1,263 @@
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of https://github.com/facebook/zstd.
+ * An additional grant of patent rights can be found in the PATENTS file in the
+ * same directory.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ */
+
+#ifndef ZSTD_CCOMMON_H_MODULE
+#define ZSTD_CCOMMON_H_MODULE
+
+/*-*******************************************************
+* Compiler specifics
+*********************************************************/
+#define FORCE_INLINE static __always_inline
+#define FORCE_NOINLINE static noinline
+
+/*-*************************************
+* Dependencies
+***************************************/
+#include "error_private.h"
+#include "mem.h"
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/xxhash.h>
+#include <linux/zstd.h>
+
+/*-*************************************
+* shared macros
+***************************************/
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define CHECK_F(f) \
+ { \
+ size_t const errcod = f; \
+ if (ERR_isError(errcod)) \
+ return errcod; \
+ } /* check and Forward error code */
+#define CHECK_E(f, e) \
+ { \
+ size_t const errcod = f; \
+ if (ERR_isError(errcod)) \
+ return ERROR(e); \
+ } /* check and send Error code */
+#define ZSTD_STATIC_ASSERT(c) \
+ { \
+ enum { ZSTD_static_assert = 1 / (int)(!!(c)) }; \
+ }
+
+/*-*************************************
+* Common constants
+***************************************/
+#define ZSTD_OPT_NUM (1 << 12)
+#define ZSTD_DICT_MAGIC 0xEC30A437 /* v0.7+ */
+
+#define ZSTD_REP_NUM 3 /* number of repcodes */
+#define ZSTD_REP_CHECK (ZSTD_REP_NUM) /* number of repcodes to check by the optimal parser */
+#define ZSTD_REP_MOVE (ZSTD_REP_NUM - 1)
+#define ZSTD_REP_MOVE_OPT (ZSTD_REP_NUM)
+static const U32 repStartValue[ZSTD_REP_NUM] = {1, 4, 8};
+
+#define KB *(1 << 10)
+#define MB *(1 << 20)
+#define GB *(1U << 30)
+
+#define BIT7 128
+#define BIT6 64
+#define BIT5 32
+#define BIT4 16
+#define BIT1 2
+#define BIT0 1
+
+#define ZSTD_WINDOWLOG_ABSOLUTEMIN 10
+static const size_t ZSTD_fcs_fieldSize[4] = {0, 2, 4, 8};
+static const size_t ZSTD_did_fieldSize[4] = {0, 1, 2, 4};
+
+#define ZSTD_BLOCKHEADERSIZE 3 /* C standard doesn't allow `static const` variable to be init using another `static const` variable */
+static const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE;
+typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
+
+#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */
+
+#define HufLog 12
+typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;
+
+#define LONGNBSEQ 0x7F00
+
+#define MINMATCH 3
+#define EQUAL_READ32 4
+
+#define Litbits 8
+#define MaxLit ((1 << Litbits) - 1)
+#define MaxML 52
+#define MaxLL 35
+#define MaxOff 28
+#define MaxSeq MAX(MaxLL, MaxML) /* Assumption : MaxOff < MaxLL,MaxML */
+#define MLFSELog 9
+#define LLFSELog 9
+#define OffFSELog 8
+
+static const U32 LL_bits[MaxLL + 1] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+static const S16 LL_defaultNorm[MaxLL + 1] = {4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1, -1, -1, -1, -1};
+#define LL_DEFAULTNORMLOG 6 /* for static allocation */
+static const U32 LL_defaultNormLog = LL_DEFAULTNORMLOG;
+
+static const U32 ML_bits[MaxML + 1] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+static const S16 ML_defaultNorm[MaxML + 1] = {1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1};
+#define ML_DEFAULTNORMLOG 6 /* for static allocation */
+static const U32 ML_defaultNormLog = ML_DEFAULTNORMLOG;
+
+static const S16 OF_defaultNorm[MaxOff + 1] = {1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1};
+#define OF_DEFAULTNORMLOG 5 /* for static allocation */
+static const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;
+
+/*-*******************************************
+* Shared functions to include for inlining
+*********************************************/
+ZSTD_STATIC void ZSTD_copy8(void *dst, const void *src) {
+ memcpy(dst, src, 8);
+}
+/*! ZSTD_wildcopy() :
+* custom version of memcpy(), can copy up to 7 bytes too many (8 bytes if length==0) */
+#define WILDCOPY_OVERLENGTH 8
+ZSTD_STATIC void ZSTD_wildcopy(void *dst, const void *src, ptrdiff_t length)
+{
+ const BYTE* ip = (const BYTE*)src;
+ BYTE* op = (BYTE*)dst;
+ BYTE* const oend = op + length;
+ /* Work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81388.
+ * Avoid the bad case where the loop only runs once by handling the
+ * special case separately. This doesn't trigger the bug because it
+ * doesn't involve pointer/integer overflow.
+ */
+ if (length <= 8)
+ return ZSTD_copy8(dst, src);
+ do {
+ ZSTD_copy8(op, ip);
+ op += 8;
+ ip += 8;
+ } while (op < oend);
+}
+
+/*-*******************************************
+* Private interfaces
+*********************************************/
+typedef struct ZSTD_stats_s ZSTD_stats_t;
+
+typedef struct {
+ U32 off;
+ U32 len;
+} ZSTD_match_t;
+
+typedef struct {
+ U32 price;
+ U32 off;
+ U32 mlen;
+ U32 litlen;
+ U32 rep[ZSTD_REP_NUM];
+} ZSTD_optimal_t;
+
+typedef struct seqDef_s {
+ U32 offset;
+ U16 litLength;
+ U16 matchLength;
+} seqDef;
+
+typedef struct {
+ seqDef *sequencesStart;
+ seqDef *sequences;
+ BYTE *litStart;
+ BYTE *lit;
+ BYTE *llCode;
+ BYTE *mlCode;
+ BYTE *ofCode;
+ U32 longLengthID; /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */
+ U32 longLengthPos;
+ /* opt */
+ ZSTD_optimal_t *priceTable;
+ ZSTD_match_t *matchTable;
+ U32 *matchLengthFreq;
+ U32 *litLengthFreq;
+ U32 *litFreq;
+ U32 *offCodeFreq;
+ U32 matchLengthSum;
+ U32 matchSum;
+ U32 litLengthSum;
+ U32 litSum;
+ U32 offCodeSum;
+ U32 log2matchLengthSum;
+ U32 log2matchSum;
+ U32 log2litLengthSum;
+ U32 log2litSum;
+ U32 log2offCodeSum;
+ U32 factor;
+ U32 staticPrices;
+ U32 cachedPrice;
+ U32 cachedLitLength;
+ const BYTE *cachedLiterals;
+} seqStore_t;
+
+const seqStore_t *ZSTD_getSeqStore(const ZSTD_CCtx *ctx);
+void ZSTD_seqToCodes(const seqStore_t *seqStorePtr);
+int ZSTD_isSkipFrame(ZSTD_DCtx *dctx);
+
+/*= Custom memory allocation functions */
+typedef void *(*ZSTD_allocFunction)(void *opaque, size_t size);
+typedef void (*ZSTD_freeFunction)(void *opaque, void *address);
+typedef struct {
+ ZSTD_allocFunction customAlloc;
+ ZSTD_freeFunction customFree;
+ void *opaque;
+} ZSTD_customMem;
+
+void *ZSTD_malloc(size_t size, ZSTD_customMem customMem);
+void ZSTD_free(void *ptr, ZSTD_customMem customMem);
+
+/*====== stack allocation ======*/
+
+typedef struct {
+ void *ptr;
+ const void *end;
+} ZSTD_stack;
+
+#define ZSTD_ALIGN(x) ALIGN(x, sizeof(size_t))
+#define ZSTD_PTR_ALIGN(p) PTR_ALIGN(p, sizeof(size_t))
+
+ZSTD_customMem ZSTD_initStack(void *workspace, size_t workspaceSize);
+
+void *ZSTD_stackAllocAll(void *opaque, size_t *size);
+void *ZSTD_stackAlloc(void *opaque, size_t size);
+void ZSTD_stackFree(void *opaque, void *address);
+
+/*====== common function ======*/
+
+ZSTD_STATIC U32 ZSTD_highbit32(U32 val) { return 31 - __builtin_clz(val); }
+
+/* hidden functions */
+
+/* ZSTD_invalidateRepCodes() :
+ * ensures next compression will not use repcodes from previous block.
+ * Note : only works with regular variant;
+ * do not use with extDict variant ! */
+void ZSTD_invalidateRepCodes(ZSTD_CCtx *cctx);
+
+size_t ZSTD_freeCCtx(ZSTD_CCtx *cctx);
+size_t ZSTD_freeDCtx(ZSTD_DCtx *dctx);
+size_t ZSTD_freeCDict(ZSTD_CDict *cdict);
+size_t ZSTD_freeDDict(ZSTD_DDict *cdict);
+size_t ZSTD_freeCStream(ZSTD_CStream *zcs);
+size_t ZSTD_freeDStream(ZSTD_DStream *zds);
+
+#endif /* ZSTD_CCOMMON_H_MODULE */
diff --git a/lib/zstd/zstd_opt.h b/lib/zstd/zstd_opt.h
new file mode 100644
index 000000000000..55e1b4cba808
--- /dev/null
+++ b/lib/zstd/zstd_opt.h
@@ -0,0 +1,1014 @@
+/**
+ * Copyright (c) 2016-present, Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of https://github.com/facebook/zstd.
+ * An additional grant of patent rights can be found in the PATENTS file in the
+ * same directory.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ */
+
+/* Note : this file is intended to be included within zstd_compress.c */
+
+#ifndef ZSTD_OPT_H_91842398743
+#define ZSTD_OPT_H_91842398743
+
+#define ZSTD_LITFREQ_ADD 2
+#define ZSTD_FREQ_DIV 4
+#define ZSTD_MAX_PRICE (1 << 30)
+
+/*-*************************************
+* Price functions for optimal parser
+***************************************/
+FORCE_INLINE void ZSTD_setLog2Prices(seqStore_t *ssPtr)
+{
+ ssPtr->log2matchLengthSum = ZSTD_highbit32(ssPtr->matchLengthSum + 1);
+ ssPtr->log2litLengthSum = ZSTD_highbit32(ssPtr->litLengthSum + 1);
+ ssPtr->log2litSum = ZSTD_highbit32(ssPtr->litSum + 1);
+ ssPtr->log2offCodeSum = ZSTD_highbit32(ssPtr->offCodeSum + 1);
+ ssPtr->factor = 1 + ((ssPtr->litSum >> 5) / ssPtr->litLengthSum) + ((ssPtr->litSum << 1) / (ssPtr->litSum + ssPtr->matchSum));
+}
+
+ZSTD_STATIC void ZSTD_rescaleFreqs(seqStore_t *ssPtr, const BYTE *src, size_t srcSize)
+{
+ unsigned u;
+
+ ssPtr->cachedLiterals = NULL;
+ ssPtr->cachedPrice = ssPtr->cachedLitLength = 0;
+ ssPtr->staticPrices = 0;
+
+ if (ssPtr->litLengthSum == 0) {
+ if (srcSize <= 1024)
+ ssPtr->staticPrices = 1;
+
+ for (u = 0; u <= MaxLit; u++)
+ ssPtr->litFreq[u] = 0;
+ for (u = 0; u < srcSize; u++)
+ ssPtr->litFreq[src[u]]++;
+
+ ssPtr->litSum = 0;
+ ssPtr->litLengthSum = MaxLL + 1;
+ ssPtr->matchLengthSum = MaxML + 1;
+ ssPtr->offCodeSum = (MaxOff + 1);
+ ssPtr->matchSum = (ZSTD_LITFREQ_ADD << Litbits);
+
+ for (u = 0; u <= MaxLit; u++) {
+ ssPtr->litFreq[u] = 1 + (ssPtr->litFreq[u] >> ZSTD_FREQ_DIV);
+ ssPtr->litSum += ssPtr->litFreq[u];
+ }
+ for (u = 0; u <= MaxLL; u++)
+ ssPtr->litLengthFreq[u] = 1;
+ for (u = 0; u <= MaxML; u++)
+ ssPtr->matchLengthFreq[u] = 1;
+ for (u = 0; u <= MaxOff; u++)
+ ssPtr->offCodeFreq[u] = 1;
+ } else {
+ ssPtr->matchLengthSum = 0;
+ ssPtr->litLengthSum = 0;
+ ssPtr->offCodeSum = 0;
+ ssPtr->matchSum = 0;
+ ssPtr->litSum = 0;
+
+ for (u = 0; u <= MaxLit; u++) {
+ ssPtr->litFreq[u] = 1 + (ssPtr->litFreq[u] >> (ZSTD_FREQ_DIV + 1));
+ ssPtr->litSum += ssPtr->litFreq[u];
+ }
+ for (u = 0; u <= MaxLL; u++) {
+ ssPtr->litLengthFreq[u] = 1 + (ssPtr->litLengthFreq[u] >> (ZSTD_FREQ_DIV + 1));
+ ssPtr->litLengthSum += ssPtr->litLengthFreq[u];
+ }
+ for (u = 0; u <= MaxML; u++) {
+ ssPtr->matchLengthFreq[u] = 1 + (ssPtr->matchLengthFreq[u] >> ZSTD_FREQ_DIV);
+ ssPtr->matchLengthSum += ssPtr->matchLengthFreq[u];
+ ssPtr->matchSum += ssPtr->matchLengthFreq[u] * (u + 3);
+ }
+ ssPtr->matchSum *= ZSTD_LITFREQ_ADD;
+ for (u = 0; u <= MaxOff; u++) {
+ ssPtr->offCodeFreq[u] = 1 + (ssPtr->offCodeFreq[u] >> ZSTD_FREQ_DIV);
+ ssPtr->offCodeSum += ssPtr->offCodeFreq[u];
+ }
+ }
+
+ ZSTD_setLog2Prices(ssPtr);
+}
+
+FORCE_INLINE U32 ZSTD_getLiteralPrice(seqStore_t *ssPtr, U32 litLength, const BYTE *literals)
+{
+ U32 price, u;
+
+ if (ssPtr->staticPrices)
+ return ZSTD_highbit32((U32)litLength + 1) + (litLength * 6);
+
+ if (litLength == 0)
+ return ssPtr->log2litLengthSum - ZSTD_highbit32(ssPtr->litLengthFreq[0] + 1);
+
+ /* literals */
+ if (ssPtr->cachedLiterals == literals) {
+ U32 const additional = litLength - ssPtr->cachedLitLength;
+ const BYTE *literals2 = ssPtr->cachedLiterals + ssPtr->cachedLitLength;
+ price = ssPtr->cachedPrice + additional * ssPtr->log2litSum;
+ for (u = 0; u < additional; u++)
+ price -= ZSTD_highbit32(ssPtr->litFreq[literals2[u]] + 1);
+ ssPtr->cachedPrice = price;
+ ssPtr->cachedLitLength = litLength;
+ } else {
+ price = litLength * ssPtr->log2litSum;
+ for (u = 0; u < litLength; u++)
+ price -= ZSTD_highbit32(ssPtr->litFreq[literals[u]] + 1);
+
+ if (litLength >= 12) {
+ ssPtr->cachedLiterals = literals;
+ ssPtr->cachedPrice = price;
+ ssPtr->cachedLitLength = litLength;
+ }
+ }
+
+ /* literal Length */
+ {
+ const BYTE LL_deltaCode = 19;
+ const BYTE llCode = (litLength > 63) ? (BYTE)ZSTD_highbit32(litLength) + LL_deltaCode : LL_Code[litLength];
+ price += LL_bits[llCode] + ssPtr->log2litLengthSum - ZSTD_highbit32(ssPtr->litLengthFreq[llCode] + 1);
+ }
+
+ return price;
+}
+
+FORCE_INLINE U32 ZSTD_getPrice(seqStore_t *seqStorePtr, U32 litLength, const BYTE *literals, U32 offset, U32 matchLength, const int ultra)
+{
+ /* offset */
+ U32 price;
+ BYTE const offCode = (BYTE)ZSTD_highbit32(offset + 1);
+
+ if (seqStorePtr->staticPrices)
+ return ZSTD_getLiteralPrice(seqStorePtr, litLength, literals) + ZSTD_highbit32((U32)matchLength + 1) + 16 + offCode;
+
+ price = offCode + seqStorePtr->log2offCodeSum - ZSTD_highbit32(seqStorePtr->offCodeFreq[offCode] + 1);
+ if (!ultra && offCode >= 20)
+ price += (offCode - 19) * 2;
+
+ /* match Length */
+ {
+ const BYTE ML_deltaCode = 36;
+ const BYTE mlCode = (matchLength > 127) ? (BYTE)ZSTD_highbit32(matchLength) + ML_deltaCode : ML_Code[matchLength];
+ price += ML_bits[mlCode] + seqStorePtr->log2matchLengthSum - ZSTD_highbit32(seqStorePtr->matchLengthFreq[mlCode] + 1);
+ }
+
+ return price + ZSTD_getLiteralPrice(seqStorePtr, litLength, literals) + seqStorePtr->factor;
+}
+
+ZSTD_STATIC void ZSTD_updatePrice(seqStore_t *seqStorePtr, U32 litLength, const BYTE *literals, U32 offset, U32 matchLength)
+{
+ U32 u;
+
+ /* literals */
+ seqStorePtr->litSum += litLength * ZSTD_LITFREQ_ADD;
+ for (u = 0; u < litLength; u++)
+ seqStorePtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD;
+
+ /* literal Length */
+ {
+ const BYTE LL_deltaCode = 19;
+ const BYTE llCode = (litLength > 63) ? (BYTE)ZSTD_highbit32(litLength) + LL_deltaCode : LL_Code[litLength];
+ seqStorePtr->litLengthFreq[llCode]++;
+ seqStorePtr->litLengthSum++;
+ }
+
+ /* match offset */
+ {
+ BYTE const offCode = (BYTE)ZSTD_highbit32(offset + 1);
+ seqStorePtr->offCodeSum++;
+ seqStorePtr->offCodeFreq[offCode]++;
+ }
+
+ /* match Length */
+ {
+ const BYTE ML_deltaCode = 36;
+ const BYTE mlCode = (matchLength > 127) ? (BYTE)ZSTD_highbit32(matchLength) + ML_deltaCode : ML_Code[matchLength];
+ seqStorePtr->matchLengthFreq[mlCode]++;
+ seqStorePtr->matchLengthSum++;
+ }
+
+ ZSTD_setLog2Prices(seqStorePtr);
+}
+
+#define SET_PRICE(pos, mlen_, offset_, litlen_, price_) \
+ { \
+ while (last_pos < pos) { \
+ opt[last_pos + 1].price = ZSTD_MAX_PRICE; \
+ last_pos++; \
+ } \
+ opt[pos].mlen = mlen_; \
+ opt[pos].off = offset_; \
+ opt[pos].litlen = litlen_; \
+ opt[pos].price = price_; \
+ }
+
+/* Update hashTable3 up to ip (excluded)
+ Assumption : always within prefix (i.e. not within extDict) */
+FORCE_INLINE
+U32 ZSTD_insertAndFindFirstIndexHash3(ZSTD_CCtx *zc, const BYTE *ip)
+{
+ U32 *const hashTable3 = zc->hashTable3;
+ U32 const hashLog3 = zc->hashLog3;
+ const BYTE *const base = zc->base;
+ U32 idx = zc->nextToUpdate3;
+ const U32 target = zc->nextToUpdate3 = (U32)(ip - base);
+ const size_t hash3 = ZSTD_hash3Ptr(ip, hashLog3);
+
+ while (idx < target) {
+ hashTable3[ZSTD_hash3Ptr(base + idx, hashLog3)] = idx;
+ idx++;
+ }
+
+ return hashTable3[hash3];
+}
+
+/*-*************************************
+* Binary Tree search
+***************************************/
+static U32 ZSTD_insertBtAndGetAllMatches(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iLimit, U32 nbCompares, const U32 mls, U32 extDict,
+ ZSTD_match_t *matches, const U32 minMatchLen)
+{
+ const BYTE *const base = zc->base;
+ const U32 curr = (U32)(ip - base);
+ const U32 hashLog = zc->params.cParams.hashLog;
+ const size_t h = ZSTD_hashPtr(ip, hashLog, mls);
+ U32 *const hashTable = zc->hashTable;
+ U32 matchIndex = hashTable[h];
+ U32 *const bt = zc->chainTable;
+ const U32 btLog = zc->params.cParams.chainLog - 1;
+ const U32 btMask = (1U << btLog) - 1;
+ size_t commonLengthSmaller = 0, commonLengthLarger = 0;
+ const BYTE *const dictBase = zc->dictBase;
+ const U32 dictLimit = zc->dictLimit;
+ const BYTE *const dictEnd = dictBase + dictLimit;
+ const BYTE *const prefixStart = base + dictLimit;
+ const U32 btLow = btMask >= curr ? 0 : curr - btMask;
+ const U32 windowLow = zc->lowLimit;
+ U32 *smallerPtr = bt + 2 * (curr & btMask);
+ U32 *largerPtr = bt + 2 * (curr & btMask) + 1;
+ U32 matchEndIdx = curr + 8;
+ U32 dummy32; /* to be nullified at the end */
+ U32 mnum = 0;
+
+ const U32 minMatch = (mls == 3) ? 3 : 4;
+ size_t bestLength = minMatchLen - 1;
+
+ if (minMatch == 3) { /* HC3 match finder */
+ U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3(zc, ip);
+ if (matchIndex3 > windowLow && (curr - matchIndex3 < (1 << 18))) {
+ const BYTE *match;
+ size_t currMl = 0;
+ if ((!extDict) || matchIndex3 >= dictLimit) {
+ match = base + matchIndex3;
+ if (match[bestLength] == ip[bestLength])
+ currMl = ZSTD_count(ip, match, iLimit);
+ } else {
+ match = dictBase + matchIndex3;
+ if (ZSTD_readMINMATCH(match, MINMATCH) ==
+ ZSTD_readMINMATCH(ip, MINMATCH)) /* assumption : matchIndex3 <= dictLimit-4 (by table construction) */
+ currMl = ZSTD_count_2segments(ip + MINMATCH, match + MINMATCH, iLimit, dictEnd, prefixStart) + MINMATCH;
+ }
+
+ /* save best solution */
+ if (currMl > bestLength) {
+ bestLength = currMl;
+ matches[mnum].off = ZSTD_REP_MOVE_OPT + curr - matchIndex3;
+ matches[mnum].len = (U32)currMl;
+ mnum++;
+ if (currMl > ZSTD_OPT_NUM)
+ goto update;
+ if (ip + currMl == iLimit)
+ goto update; /* best possible, and avoid read overflow*/
+ }
+ }
+ }
+
+ hashTable[h] = curr; /* Update Hash Table */
+
+ while (nbCompares-- && (matchIndex > windowLow)) {
+ U32 *nextPtr = bt + 2 * (matchIndex & btMask);
+ size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
+ const BYTE *match;
+
+ if ((!extDict) || (matchIndex + matchLength >= dictLimit)) {
+ match = base + matchIndex;
+ if (match[matchLength] == ip[matchLength]) {
+ matchLength += ZSTD_count(ip + matchLength + 1, match + matchLength + 1, iLimit) + 1;
+ }
+ } else {
+ match = dictBase + matchIndex;
+ matchLength += ZSTD_count_2segments(ip + matchLength, match + matchLength, iLimit, dictEnd, prefixStart);
+ if (matchIndex + matchLength >= dictLimit)
+ match = base + matchIndex; /* to prepare for next usage of match[matchLength] */
+ }
+
+ if (matchLength > bestLength) {
+ if (matchLength > matchEndIdx - matchIndex)
+ matchEndIdx = matchIndex + (U32)matchLength;
+ bestLength = matchLength;
+ matches[mnum].off = ZSTD_REP_MOVE_OPT + curr - matchIndex;
+ matches[mnum].len = (U32)matchLength;
+ mnum++;
+ if (matchLength > ZSTD_OPT_NUM)
+ break;
+ if (ip + matchLength == iLimit) /* equal : no way to know if inf or sup */
+ break; /* drop, to guarantee consistency (miss a little bit of compression) */
+ }
+
+ if (match[matchLength] < ip[matchLength]) {
+ /* match is smaller than curr */
+ *smallerPtr = matchIndex; /* update smaller idx */
+ commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
+ if (matchIndex <= btLow) {
+ smallerPtr = &dummy32;
+ break;
+ } /* beyond tree size, stop the search */
+ smallerPtr = nextPtr + 1; /* new "smaller" => larger of match */
+ matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to curr) */
+ } else {
+ /* match is larger than curr */
+ *largerPtr = matchIndex;
+ commonLengthLarger = matchLength;
+ if (matchIndex <= btLow) {
+ largerPtr = &dummy32;
+ break;
+ } /* beyond tree size, stop the search */
+ largerPtr = nextPtr;
+ matchIndex = nextPtr[0];
+ }
+ }
+
+ *smallerPtr = *largerPtr = 0;
+
+update:
+ zc->nextToUpdate = (matchEndIdx > curr + 8) ? matchEndIdx - 8 : curr + 1;
+ return mnum;
+}
+
+/** Tree updater, providing best match */
+static U32 ZSTD_BtGetAllMatches(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iLimit, const U32 maxNbAttempts, const U32 mls, ZSTD_match_t *matches,
+ const U32 minMatchLen)
+{
+ if (ip < zc->base + zc->nextToUpdate)
+ return 0; /* skipped area */
+ ZSTD_updateTree(zc, ip, iLimit, maxNbAttempts, mls);
+ return ZSTD_insertBtAndGetAllMatches(zc, ip, iLimit, maxNbAttempts, mls, 0, matches, minMatchLen);
+}
+
+static U32 ZSTD_BtGetAllMatches_selectMLS(ZSTD_CCtx *zc, /* Index table will be updated */
+ const BYTE *ip, const BYTE *const iHighLimit, const U32 maxNbAttempts, const U32 matchLengthSearch,
+ ZSTD_match_t *matches, const U32 minMatchLen)
+{
+ switch (matchLengthSearch) {
+ case 3: return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 3, matches, minMatchLen);
+ default:
+ case 4: return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 4, matches, minMatchLen);
+ case 5: return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 5, matches, minMatchLen);
+ case 7:
+ case 6: return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 6, matches, minMatchLen);
+ }
+}
+
+/** Tree updater, providing best match */
+static U32 ZSTD_BtGetAllMatches_extDict(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iLimit, const U32 maxNbAttempts, const U32 mls,
+ ZSTD_match_t *matches, const U32 minMatchLen)
+{
+ if (ip < zc->base + zc->nextToUpdate)
+ return 0; /* skipped area */
+ ZSTD_updateTree_extDict(zc, ip, iLimit, maxNbAttempts, mls);
+ return ZSTD_insertBtAndGetAllMatches(zc, ip, iLimit, maxNbAttempts, mls, 1, matches, minMatchLen);
+}
+
+static U32 ZSTD_BtGetAllMatches_selectMLS_extDict(ZSTD_CCtx *zc, /* Index table will be updated */
+ const BYTE *ip, const BYTE *const iHighLimit, const U32 maxNbAttempts, const U32 matchLengthSearch,
+ ZSTD_match_t *matches, const U32 minMatchLen)
+{
+ switch (matchLengthSearch) {
+ case 3: return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 3, matches, minMatchLen);
+ default:
+ case 4: return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 4, matches, minMatchLen);
+ case 5: return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 5, matches, minMatchLen);
+ case 7:
+ case 6: return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 6, matches, minMatchLen);
+ }
+}
+
+/*-*******************************
+* Optimal parser
+*********************************/
+FORCE_INLINE
+void ZSTD_compressBlock_opt_generic(ZSTD_CCtx *ctx, const void *src, size_t srcSize, const int ultra)
+{
+ seqStore_t *seqStorePtr = &(ctx->seqStore);
+ const BYTE *const istart = (const BYTE *)src;
+ const BYTE *ip = istart;
+ const BYTE *anchor = istart;
+ const BYTE *const iend = istart + srcSize;
+ const BYTE *const ilimit = iend - 8;
+ const BYTE *const base = ctx->base;
+ const BYTE *const prefixStart = base + ctx->dictLimit;
+
+ const U32 maxSearches = 1U << ctx->params.cParams.searchLog;
+ const U32 sufficient_len = ctx->params.cParams.targetLength;
+ const U32 mls = ctx->params.cParams.searchLength;
+ const U32 minMatch = (ctx->params.cParams.searchLength == 3) ? 3 : 4;
+
+ ZSTD_optimal_t *opt = seqStorePtr->priceTable;
+ ZSTD_match_t *matches = seqStorePtr->matchTable;
+ const BYTE *inr;
+ U32 offset, rep[ZSTD_REP_NUM];
+
+ /* init */
+ ctx->nextToUpdate3 = ctx->nextToUpdate;
+ ZSTD_rescaleFreqs(seqStorePtr, (const BYTE *)src, srcSize);
+ ip += (ip == prefixStart);
+ {
+ U32 i;
+ for (i = 0; i < ZSTD_REP_NUM; i++)
+ rep[i] = ctx->rep[i];
+ }
+
+ /* Match Loop */
+ while (ip < ilimit) {
+ U32 cur, match_num, last_pos, litlen, price;
+ U32 u, mlen, best_mlen, best_off, litLength;
+ memset(opt, 0, sizeof(ZSTD_optimal_t));
+ last_pos = 0;
+ litlen = (U32)(ip - anchor);
+
+ /* check repCode */
+ {
+ U32 i, last_i = ZSTD_REP_CHECK + (ip == anchor);
+ for (i = (ip == anchor); i < last_i; i++) {
+ const S32 repCur = (i == ZSTD_REP_MOVE_OPT) ? (rep[0] - 1) : rep[i];
+ if ((repCur > 0) && (repCur < (S32)(ip - prefixStart)) &&
+ (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(ip - repCur, minMatch))) {
+ mlen = (U32)ZSTD_count(ip + minMatch, ip + minMatch - repCur, iend) + minMatch;
+ if (mlen > sufficient_len || mlen >= ZSTD_OPT_NUM) {
+ best_mlen = mlen;
+ best_off = i;
+ cur = 0;
+ last_pos = 1;
+ goto _storeSequence;
+ }
+ best_off = i - (ip == anchor);
+ do {
+ price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra);
+ if (mlen > last_pos || price < opt[mlen].price)
+ SET_PRICE(mlen, mlen, i, litlen, price); /* note : macro modifies last_pos */
+ mlen--;
+ } while (mlen >= minMatch);
+ }
+ }
+ }
+
+ match_num = ZSTD_BtGetAllMatches_selectMLS(ctx, ip, iend, maxSearches, mls, matches, minMatch);
+
+ if (!last_pos && !match_num) {
+ ip++;
+ continue;
+ }
+
+ if (match_num && (matches[match_num - 1].len > sufficient_len || matches[match_num - 1].len >= ZSTD_OPT_NUM)) {
+ best_mlen = matches[match_num - 1].len;
+ best_off = matches[match_num - 1].off;
+ cur = 0;
+ last_pos = 1;
+ goto _storeSequence;
+ }
+
+ /* set prices using matches at position = 0 */
+ best_mlen = (last_pos) ? last_pos : minMatch;
+ for (u = 0; u < match_num; u++) {
+ mlen = (u > 0) ? matches[u - 1].len + 1 : best_mlen;
+ best_mlen = matches[u].len;
+ while (mlen <= best_mlen) {
+ price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off - 1, mlen - MINMATCH, ultra);
+ if (mlen > last_pos || price < opt[mlen].price)
+ SET_PRICE(mlen, mlen, matches[u].off, litlen, price); /* note : macro modifies last_pos */
+ mlen++;
+ }
+ }
+
+ if (last_pos < minMatch) {
+ ip++;
+ continue;
+ }
+
+ /* initialize opt[0] */
+ {
+ U32 i;
+ for (i = 0; i < ZSTD_REP_NUM; i++)
+ opt[0].rep[i] = rep[i];
+ }
+ opt[0].mlen = 1;
+ opt[0].litlen = litlen;
+
+ /* check further positions */
+ for (cur = 1; cur <= last_pos; cur++) {
+ inr = ip + cur;
+
+ if (opt[cur - 1].mlen == 1) {
+ litlen = opt[cur - 1].litlen + 1;
+ if (cur > litlen) {
+ price = opt[cur - litlen].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr - litlen);
+ } else
+ price = ZSTD_getLiteralPrice(seqStorePtr, litlen, anchor);
+ } else {
+ litlen = 1;
+ price = opt[cur - 1].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr - 1);
+ }
+
+ if (cur > last_pos || price <= opt[cur].price)
+ SET_PRICE(cur, 1, 0, litlen, price);
+
+ if (cur == last_pos)
+ break;
+
+ if (inr > ilimit) /* last match must start at a minimum distance of 8 from oend */
+ continue;
+
+ mlen = opt[cur].mlen;
+ if (opt[cur].off > ZSTD_REP_MOVE_OPT) {
+ opt[cur].rep[2] = opt[cur - mlen].rep[1];
+ opt[cur].rep[1] = opt[cur - mlen].rep[0];
+ opt[cur].rep[0] = opt[cur].off - ZSTD_REP_MOVE_OPT;
+ } else {
+ opt[cur].rep[2] = (opt[cur].off > 1) ? opt[cur - mlen].rep[1] : opt[cur - mlen].rep[2];
+ opt[cur].rep[1] = (opt[cur].off > 0) ? opt[cur - mlen].rep[0] : opt[cur - mlen].rep[1];
+ opt[cur].rep[0] =
+ ((opt[cur].off == ZSTD_REP_MOVE_OPT) && (mlen != 1)) ? (opt[cur - mlen].rep[0] - 1) : (opt[cur - mlen].rep[opt[cur].off]);
+ }
+
+ best_mlen = minMatch;
+ {
+ U32 i, last_i = ZSTD_REP_CHECK + (mlen != 1);
+ for (i = (opt[cur].mlen != 1); i < last_i; i++) { /* check rep */
+ const S32 repCur = (i == ZSTD_REP_MOVE_OPT) ? (opt[cur].rep[0] - 1) : opt[cur].rep[i];
+ if ((repCur > 0) && (repCur < (S32)(inr - prefixStart)) &&
+ (ZSTD_readMINMATCH(inr, minMatch) == ZSTD_readMINMATCH(inr - repCur, minMatch))) {
+ mlen = (U32)ZSTD_count(inr + minMatch, inr + minMatch - repCur, iend) + minMatch;
+
+ if (mlen > sufficient_len || cur + mlen >= ZSTD_OPT_NUM) {
+ best_mlen = mlen;
+ best_off = i;
+ last_pos = cur + 1;
+ goto _storeSequence;
+ }
+
+ best_off = i - (opt[cur].mlen != 1);
+ if (mlen > best_mlen)
+ best_mlen = mlen;
+
+ do {
+ if (opt[cur].mlen == 1) {
+ litlen = opt[cur].litlen;
+ if (cur > litlen) {
+ price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, inr - litlen,
+ best_off, mlen - MINMATCH, ultra);
+ } else
+ price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra);
+ } else {
+ litlen = 0;
+ price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, best_off, mlen - MINMATCH, ultra);
+ }
+
+ if (cur + mlen > last_pos || price <= opt[cur + mlen].price)
+ SET_PRICE(cur + mlen, mlen, i, litlen, price);
+ mlen--;
+ } while (mlen >= minMatch);
+ }
+ }
+ }
+
+ match_num = ZSTD_BtGetAllMatches_selectMLS(ctx, inr, iend, maxSearches, mls, matches, best_mlen);
+
+ if (match_num > 0 && (matches[match_num - 1].len > sufficient_len || cur + matches[match_num - 1].len >= ZSTD_OPT_NUM)) {
+ best_mlen = matches[match_num - 1].len;
+ best_off = matches[match_num - 1].off;
+ last_pos = cur + 1;
+ goto _storeSequence;
+ }
+
+ /* set prices using matches at position = cur */
+ for (u = 0; u < match_num; u++) {
+ mlen = (u > 0) ? matches[u - 1].len + 1 : best_mlen;
+ best_mlen = matches[u].len;
+
+ while (mlen <= best_mlen) {
+ if (opt[cur].mlen == 1) {
+ litlen = opt[cur].litlen;
+ if (cur > litlen)
+ price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, ip + cur - litlen,
+ matches[u].off - 1, mlen - MINMATCH, ultra);
+ else
+ price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off - 1, mlen - MINMATCH, ultra);
+ } else {
+ litlen = 0;
+ price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, matches[u].off - 1, mlen - MINMATCH, ultra);
+ }
+
+ if (cur + mlen > last_pos || (price < opt[cur + mlen].price))
+ SET_PRICE(cur + mlen, mlen, matches[u].off, litlen, price);
+
+ mlen++;
+ }
+ }
+ }
+
+ best_mlen = opt[last_pos].mlen;
+ best_off = opt[last_pos].off;
+ cur = last_pos - best_mlen;
+
+ /* store sequence */
+_storeSequence: /* cur, last_pos, best_mlen, best_off have to be set */
+ opt[0].mlen = 1;
+
+ while (1) {
+ mlen = opt[cur].mlen;
+ offset = opt[cur].off;
+ opt[cur].mlen = best_mlen;
+ opt[cur].off = best_off;
+ best_mlen = mlen;
+ best_off = offset;
+ if (mlen > cur)
+ break;
+ cur -= mlen;
+ }
+
+ for (u = 0; u <= last_pos;) {
+ u += opt[u].mlen;
+ }
+
+ for (cur = 0; cur < last_pos;) {
+ mlen = opt[cur].mlen;
+ if (mlen == 1) {
+ ip++;
+ cur++;
+ continue;
+ }
+ offset = opt[cur].off;
+ cur += mlen;
+ litLength = (U32)(ip - anchor);
+
+ if (offset > ZSTD_REP_MOVE_OPT) {
+ rep[2] = rep[1];
+ rep[1] = rep[0];
+ rep[0] = offset - ZSTD_REP_MOVE_OPT;
+ offset--;
+ } else {
+ if (offset != 0) {
+ best_off = (offset == ZSTD_REP_MOVE_OPT) ? (rep[0] - 1) : (rep[offset]);
+ if (offset != 1)
+ rep[2] = rep[1];
+ rep[1] = rep[0];
+ rep[0] = best_off;
+ }
+ if (litLength == 0)
+ offset--;
+ }
+
+ ZSTD_updatePrice(seqStorePtr, litLength, anchor, offset, mlen - MINMATCH);
+ ZSTD_storeSeq(seqStorePtr, litLength, anchor, offset, mlen - MINMATCH);
+ anchor = ip = ip + mlen;
+ }
+ } /* for (cur=0; cur < last_pos; ) */
+
+ /* Save reps for next block */
+ {
+ int i;
+ for (i = 0; i < ZSTD_REP_NUM; i++)
+ ctx->repToConfirm[i] = rep[i];
+ }
+
+ /* Last Literals */
+ {
+ size_t const lastLLSize = iend - anchor;
+ memcpy(seqStorePtr->lit, anchor, lastLLSize);
+ seqStorePtr->lit += lastLLSize;
+ }
+}
+
+FORCE_INLINE
+void ZSTD_compressBlock_opt_extDict_generic(ZSTD_CCtx *ctx, const void *src, size_t srcSize, const int ultra)
+{
+ seqStore_t *seqStorePtr = &(ctx->seqStore);
+ const BYTE *const istart = (const BYTE *)src;
+ const BYTE *ip = istart;
+ const BYTE *anchor = istart;
+ const BYTE *const iend = istart + srcSize;
+ const BYTE *const ilimit = iend - 8;
+ const BYTE *const base = ctx->base;
+ const U32 lowestIndex = ctx->lowLimit;
+ const U32 dictLimit = ctx->dictLimit;
+ const BYTE *const prefixStart = base + dictLimit;
+ const BYTE *const dictBase = ctx->dictBase;
+ const BYTE *const dictEnd = dictBase + dictLimit;
+
+ const U32 maxSearches = 1U << ctx->params.cParams.searchLog;
+ const U32 sufficient_len = ctx->params.cParams.targetLength;
+ const U32 mls = ctx->params.cParams.searchLength;
+ const U32 minMatch = (ctx->params.cParams.searchLength == 3) ? 3 : 4;
+
+ ZSTD_optimal_t *opt = seqStorePtr->priceTable;
+ ZSTD_match_t *matches = seqStorePtr->matchTable;
+ const BYTE *inr;
+
+ /* init */
+ U32 offset, rep[ZSTD_REP_NUM];
+ {
+ U32 i;
+ for (i = 0; i < ZSTD_REP_NUM; i++)
+ rep[i] = ctx->rep[i];
+ }
+
+ ctx->nextToUpdate3 = ctx->nextToUpdate;
+ ZSTD_rescaleFreqs(seqStorePtr, (const BYTE *)src, srcSize);
+ ip += (ip == prefixStart);
+
+ /* Match Loop */
+ while (ip < ilimit) {
+ U32 cur, match_num, last_pos, litlen, price;
+ U32 u, mlen, best_mlen, best_off, litLength;
+ U32 curr = (U32)(ip - base);
+ memset(opt, 0, sizeof(ZSTD_optimal_t));
+ last_pos = 0;
+ opt[0].litlen = (U32)(ip - anchor);
+
+ /* check repCode */
+ {
+ U32 i, last_i = ZSTD_REP_CHECK + (ip == anchor);
+ for (i = (ip == anchor); i < last_i; i++) {
+ const S32 repCur = (i == ZSTD_REP_MOVE_OPT) ? (rep[0] - 1) : rep[i];
+ const U32 repIndex = (U32)(curr - repCur);
+ const BYTE *const repBase = repIndex < dictLimit ? dictBase : base;
+ const BYTE *const repMatch = repBase + repIndex;
+ if ((repCur > 0 && repCur <= (S32)curr) &&
+ (((U32)((dictLimit - 1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
+ && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch))) {
+ /* repcode detected we should take it */
+ const BYTE *const repEnd = repIndex < dictLimit ? dictEnd : iend;
+ mlen = (U32)ZSTD_count_2segments(ip + minMatch, repMatch + minMatch, iend, repEnd, prefixStart) + minMatch;
+
+ if (mlen > sufficient_len || mlen >= ZSTD_OPT_NUM) {
+ best_mlen = mlen;
+ best_off = i;
+ cur = 0;
+ last_pos = 1;
+ goto _storeSequence;
+ }
+
+ best_off = i - (ip == anchor);
+ litlen = opt[0].litlen;
+ do {
+ price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra);
+ if (mlen > last_pos || price < opt[mlen].price)
+ SET_PRICE(mlen, mlen, i, litlen, price); /* note : macro modifies last_pos */
+ mlen--;
+ } while (mlen >= minMatch);
+ }
+ }
+ }
+
+ match_num = ZSTD_BtGetAllMatches_selectMLS_extDict(ctx, ip, iend, maxSearches, mls, matches, minMatch); /* first search (depth 0) */
+
+ if (!last_pos && !match_num) {
+ ip++;
+ continue;
+ }
+
+ {
+ U32 i;
+ for (i = 0; i < ZSTD_REP_NUM; i++)
+ opt[0].rep[i] = rep[i];
+ }
+ opt[0].mlen = 1;
+
+ if (match_num && (matches[match_num - 1].len > sufficient_len || matches[match_num - 1].len >= ZSTD_OPT_NUM)) {
+ best_mlen = matches[match_num - 1].len;
+ best_off = matches[match_num - 1].off;
+ cur = 0;
+ last_pos = 1;
+ goto _storeSequence;
+ }
+
+ best_mlen = (last_pos) ? last_pos : minMatch;
+
+ /* set prices using matches at position = 0 */
+ for (u = 0; u < match_num; u++) {
+ mlen = (u > 0) ? matches[u - 1].len + 1 : best_mlen;
+ best_mlen = matches[u].len;
+ litlen = opt[0].litlen;
+ while (mlen <= best_mlen) {
+ price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off - 1, mlen - MINMATCH, ultra);
+ if (mlen > last_pos || price < opt[mlen].price)
+ SET_PRICE(mlen, mlen, matches[u].off, litlen, price);
+ mlen++;
+ }
+ }
+
+ if (last_pos < minMatch) {
+ ip++;
+ continue;
+ }
+
+ /* check further positions */
+ for (cur = 1; cur <= last_pos; cur++) {
+ inr = ip + cur;
+
+ if (opt[cur - 1].mlen == 1) {
+ litlen = opt[cur - 1].litlen + 1;
+ if (cur > litlen) {
+ price = opt[cur - litlen].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr - litlen);
+ } else
+ price = ZSTD_getLiteralPrice(seqStorePtr, litlen, anchor);
+ } else {
+ litlen = 1;
+ price = opt[cur - 1].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr - 1);
+ }
+
+ if (cur > last_pos || price <= opt[cur].price)
+ SET_PRICE(cur, 1, 0, litlen, price);
+
+ if (cur == last_pos)
+ break;
+
+ if (inr > ilimit) /* last match must start at a minimum distance of 8 from oend */
+ continue;
+
+ mlen = opt[cur].mlen;
+ if (opt[cur].off > ZSTD_REP_MOVE_OPT) {
+ opt[cur].rep[2] = opt[cur - mlen].rep[1];
+ opt[cur].rep[1] = opt[cur - mlen].rep[0];
+ opt[cur].rep[0] = opt[cur].off - ZSTD_REP_MOVE_OPT;
+ } else {
+ opt[cur].rep[2] = (opt[cur].off > 1) ? opt[cur - mlen].rep[1] : opt[cur - mlen].rep[2];
+ opt[cur].rep[1] = (opt[cur].off > 0) ? opt[cur - mlen].rep[0] : opt[cur - mlen].rep[1];
+ opt[cur].rep[0] =
+ ((opt[cur].off == ZSTD_REP_MOVE_OPT) && (mlen != 1)) ? (opt[cur - mlen].rep[0] - 1) : (opt[cur - mlen].rep[opt[cur].off]);
+ }
+
+ best_mlen = minMatch;
+ {
+ U32 i, last_i = ZSTD_REP_CHECK + (mlen != 1);
+ for (i = (mlen != 1); i < last_i; i++) {
+ const S32 repCur = (i == ZSTD_REP_MOVE_OPT) ? (opt[cur].rep[0] - 1) : opt[cur].rep[i];
+ const U32 repIndex = (U32)(curr + cur - repCur);
+ const BYTE *const repBase = repIndex < dictLimit ? dictBase : base;
+ const BYTE *const repMatch = repBase + repIndex;
+ if ((repCur > 0 && repCur <= (S32)(curr + cur)) &&
+ (((U32)((dictLimit - 1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */
+ && (ZSTD_readMINMATCH(inr, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch))) {
+ /* repcode detected */
+ const BYTE *const repEnd = repIndex < dictLimit ? dictEnd : iend;
+ mlen = (U32)ZSTD_count_2segments(inr + minMatch, repMatch + minMatch, iend, repEnd, prefixStart) + minMatch;
+
+ if (mlen > sufficient_len || cur + mlen >= ZSTD_OPT_NUM) {
+ best_mlen = mlen;
+ best_off = i;
+ last_pos = cur + 1;
+ goto _storeSequence;
+ }
+
+ best_off = i - (opt[cur].mlen != 1);
+ if (mlen > best_mlen)
+ best_mlen = mlen;
+
+ do {
+ if (opt[cur].mlen == 1) {
+ litlen = opt[cur].litlen;
+ if (cur > litlen) {
+ price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, inr - litlen,
+ best_off, mlen - MINMATCH, ultra);
+ } else
+ price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra);
+ } else {
+ litlen = 0;
+ price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, best_off, mlen - MINMATCH, ultra);
+ }
+
+ if (cur + mlen > last_pos || price <= opt[cur + mlen].price)
+ SET_PRICE(cur + mlen, mlen, i, litlen, price);
+ mlen--;
+ } while (mlen >= minMatch);
+ }
+ }
+ }
+
+ match_num = ZSTD_BtGetAllMatches_selectMLS_extDict(ctx, inr, iend, maxSearches, mls, matches, minMatch);
+
+ if (match_num > 0 && (matches[match_num - 1].len > sufficient_len || cur + matches[match_num - 1].len >= ZSTD_OPT_NUM)) {
+ best_mlen = matches[match_num - 1].len;
+ best_off = matches[match_num - 1].off;
+ last_pos = cur + 1;
+ goto _storeSequence;
+ }
+
+ /* set prices using matches at position = cur */
+ for (u = 0; u < match_num; u++) {
+ mlen = (u > 0) ? matches[u - 1].len + 1 : best_mlen;
+ best_mlen = matches[u].len;
+
+ while (mlen <= best_mlen) {
+ if (opt[cur].mlen == 1) {
+ litlen = opt[cur].litlen;
+ if (cur > litlen)
+ price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, ip + cur - litlen,
+ matches[u].off - 1, mlen - MINMATCH, ultra);
+ else
+ price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off - 1, mlen - MINMATCH, ultra);
+ } else {
+ litlen = 0;
+ price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, matches[u].off - 1, mlen - MINMATCH, ultra);
+ }
+
+ if (cur + mlen > last_pos || (price < opt[cur + mlen].price))
+ SET_PRICE(cur + mlen, mlen, matches[u].off, litlen, price);
+
+ mlen++;
+ }
+ }
+ } /* for (cur = 1; cur <= last_pos; cur++) */
+
+ best_mlen = opt[last_pos].mlen;
+ best_off = opt[last_pos].off;
+ cur = last_pos - best_mlen;
+
+ /* store sequence */
+_storeSequence: /* cur, last_pos, best_mlen, best_off have to be set */
+ opt[0].mlen = 1;
+
+ while (1) {
+ mlen = opt[cur].mlen;
+ offset = opt[cur].off;
+ opt[cur].mlen = best_mlen;
+ opt[cur].off = best_off;
+ best_mlen = mlen;
+ best_off = offset;
+ if (mlen > cur)
+ break;
+ cur -= mlen;
+ }
+
+ for (u = 0; u <= last_pos;) {
+ u += opt[u].mlen;
+ }
+
+ for (cur = 0; cur < last_pos;) {
+ mlen = opt[cur].mlen;
+ if (mlen == 1) {
+ ip++;
+ cur++;
+ continue;
+ }
+ offset = opt[cur].off;
+ cur += mlen;
+ litLength = (U32)(ip - anchor);
+
+ if (offset > ZSTD_REP_MOVE_OPT) {
+ rep[2] = rep[1];
+ rep[1] = rep[0];
+ rep[0] = offset - ZSTD_REP_MOVE_OPT;
+ offset--;
+ } else {
+ if (offset != 0) {
+ best_off = (offset == ZSTD_REP_MOVE_OPT) ? (rep[0] - 1) : (rep[offset]);
+ if (offset != 1)
+ rep[2] = rep[1];
+ rep[1] = rep[0];
+ rep[0] = best_off;
+ }
+
+ if (litLength == 0)
+ offset--;
+ }
+
+ ZSTD_updatePrice(seqStorePtr, litLength, anchor, offset, mlen - MINMATCH);
+ ZSTD_storeSeq(seqStorePtr, litLength, anchor, offset, mlen - MINMATCH);
+ anchor = ip = ip + mlen;
+ }
+ } /* for (cur=0; cur < last_pos; ) */
+
+ /* Save reps for next block */
+ {
+ int i;
+ for (i = 0; i < ZSTD_REP_NUM; i++)
+ ctx->repToConfirm[i] = rep[i];
+ }
+
+ /* Last Literals */
+ {
+ size_t lastLLSize = iend - anchor;
+ memcpy(seqStorePtr->lit, anchor, lastLLSize);
+ seqStorePtr->lit += lastLLSize;
+ }
+}
+
+#endif /* ZSTD_OPT_H_91842398743 */
diff --git a/mm/filemap.c b/mm/filemap.c
index 6d2f561d517c..8f36d9ca687b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -437,19 +437,17 @@ static int __filemap_fdatawait_range(struct address_space *mapping,
goto out;
pagevec_init(&pvec, 0);
- while ((index <= end) &&
- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_WRITEBACK,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
+ while (index <= end) {
unsigned i;
+ nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
+ end, PAGECACHE_TAG_WRITEBACK);
+ if (!nr_pages)
+ break;
+
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- /* until radix tree lookup accepts end_index */
- if (page->index > end)
- continue;
-
wait_on_page_writeback(page);
if (TestClearPageError(page))
ret = -EIO;
@@ -1491,9 +1489,10 @@ repeat:
EXPORT_SYMBOL(find_get_pages_contig);
/**
- * find_get_pages_tag - find and return pages that match @tag
+ * find_get_pages_range_tag - find and return pages in given range matching @tag
* @mapping: the address_space to search
* @index: the starting page index
+ * @end: The final page index (inclusive)
* @tag: the tag index
* @nr_pages: the maximum number of pages
* @pages: where the resulting pages are placed
@@ -1501,8 +1500,9 @@ EXPORT_SYMBOL(find_get_pages_contig);
* Like find_get_pages, except we only return pages which are tagged with
* @tag. We update @index to index the next page for the traversal.
*/
-unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
- int tag, unsigned int nr_pages, struct page **pages)
+unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
+ pgoff_t end, int tag, unsigned int nr_pages,
+ struct page **pages)
{
struct radix_tree_iter iter;
void **slot;
@@ -1515,6 +1515,9 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, *index, tag) {
struct page *head, *page;
+
+ if (iter.index > end)
+ break;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1556,18 +1559,28 @@ repeat:
}
pages[ret] = page;
- if (++ret == nr_pages)
- break;
+ if (++ret == nr_pages) {
+ *index = pages[ret - 1]->index + 1;
+ goto out;
+ }
}
+ /*
+ * We come here when we got at @end. We take care to not overflow the
+ * index @index as it confuses some of the callers. This breaks the
+ * iteration when there is page at index -1 but that is already broken
+ * anyway.
+ */
+ if (end == (pgoff_t)-1)
+ *index = (pgoff_t)-1;
+ else
+ *index = end + 1;
+out:
rcu_read_unlock();
- if (ret)
- *index = pages[ret - 1]->index + 1;
-
return ret;
}
-EXPORT_SYMBOL(find_get_pages_tag);
+EXPORT_SYMBOL(find_get_pages_range_tag);
/**
* find_get_entries_tag - find and return entries that match @tag
@@ -2387,7 +2400,7 @@ static struct page *wait_on_page_read(struct page *page)
static struct page *do_read_cache_page(struct address_space *mapping,
pgoff_t index,
- int (*filler)(void *, struct page *),
+ int (*filler)(struct file *, struct page *),
void *data,
gfp_t gfp)
{
@@ -2494,7 +2507,7 @@ out:
*/
struct page *read_cache_page(struct address_space *mapping,
pgoff_t index,
- int (*filler)(void *, struct page *),
+ int (*filler)(struct file *, struct page *),
void *data)
{
return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
@@ -2516,7 +2529,7 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
pgoff_t index,
gfp_t gfp)
{
- filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+ filler_t *filler = mapping->a_ops->readpage;
return do_read_cache_page(mapping, index, filler, NULL, gfp);
}
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 4ce386c44cf1..627c6991d487 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -39,6 +39,16 @@
#include "kasan.h"
#include "../slab.h"
+void kasan_enable_current(void)
+{
+ current->kasan_depth++;
+}
+
+void kasan_disable_current(void)
+{
+ current->kasan_depth--;
+}
+
/*
* Poisons the shadow memory for 'size' bytes starting from 'addr'.
* Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE.
@@ -428,7 +438,7 @@ void kasan_cache_shrink(struct kmem_cache *cache)
quarantine_remove_cache(cache);
}
-void kasan_cache_destroy(struct kmem_cache *cache)
+void kasan_cache_shutdown(struct kmem_cache *cache)
{
quarantine_remove_cache(cache);
}
@@ -559,7 +569,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object)
shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object));
if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) {
- kasan_report_double_free(cache, object, shadow_byte);
+ kasan_report_double_free(cache, object,
+ __builtin_return_address(1));
return true;
}
@@ -784,6 +795,55 @@ void __asan_unpoison_stack_memory(const void *addr, size_t size)
}
EXPORT_SYMBOL(__asan_unpoison_stack_memory);
+/* Emitted by compiler to poison alloca()ed objects. */
+void __asan_alloca_poison(unsigned long addr, size_t size)
+{
+ size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
+ size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) -
+ rounded_up_size;
+ size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE);
+
+ const void *left_redzone = (const void *)(addr -
+ KASAN_ALLOCA_REDZONE_SIZE);
+ const void *right_redzone = (const void *)(addr + rounded_up_size);
+
+ WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
+
+ kasan_unpoison_shadow((const void *)(addr + rounded_down_size),
+ size - rounded_down_size);
+ kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
+ KASAN_ALLOCA_LEFT);
+ kasan_poison_shadow(right_redzone,
+ padding_size + KASAN_ALLOCA_REDZONE_SIZE,
+ KASAN_ALLOCA_RIGHT);
+}
+EXPORT_SYMBOL(__asan_alloca_poison);
+
+/* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */
+void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
+{
+ if (unlikely(!stack_top || stack_top > stack_bottom))
+ return;
+
+ kasan_unpoison_shadow(stack_top, stack_bottom - stack_top);
+}
+EXPORT_SYMBOL(__asan_allocas_unpoison);
+
+/* Emitted by the compiler to [un]poison local variables. */
+#define DEFINE_ASAN_SET_SHADOW(byte) \
+ void __asan_set_shadow_##byte(const void *addr, size_t size) \
+ { \
+ __memset((void *)addr, 0x##byte, size); \
+ } \
+ EXPORT_SYMBOL(__asan_set_shadow_##byte)
+
+DEFINE_ASAN_SET_SHADOW(00);
+DEFINE_ASAN_SET_SHADOW(f1);
+DEFINE_ASAN_SET_SHADOW(f2);
+DEFINE_ASAN_SET_SHADOW(f3);
+DEFINE_ASAN_SET_SHADOW(f5);
+DEFINE_ASAN_SET_SHADOW(f8);
+
#ifdef CONFIG_MEMORY_HOTPLUG
static int kasan_mem_notifier(struct notifier_block *nb,
unsigned long action, void *data)
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 1c260e6b3b3c..d9cf9e2cc782 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -23,6 +23,14 @@
#define KASAN_STACK_PARTIAL 0xF4
#define KASAN_USE_AFTER_SCOPE 0xF8
+/*
+ * alloca redzone shadow values
+ */
+#define KASAN_ALLOCA_LEFT 0xCA
+#define KASAN_ALLOCA_RIGHT 0xCB
+
+#define KASAN_ALLOCA_REDZONE_SIZE 32
+
/* Don't break randconfig/all*config builds */
#ifndef KASAN_ABI_VERSION
#define KASAN_ABI_VERSION 1
@@ -96,15 +104,10 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
<< KASAN_SHADOW_SCALE_SHIFT);
}
-static inline bool kasan_report_enabled(void)
-{
- return !current->kasan_depth;
-}
-
void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
void kasan_report_double_free(struct kmem_cache *cache, void *object,
- s8 shadow);
+ void *ip);
#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB)
void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
@@ -117,4 +120,48 @@ static inline void quarantine_reduce(void) { }
static inline void quarantine_remove_cache(struct kmem_cache *cache) { }
#endif
+/*
+ * Exported functions for interfaces called from assembly or from generated
+ * code. Declarations here to avoid warning about missing declarations.
+ */
+asmlinkage void kasan_unpoison_task_stack_below(const void *watermark);
+void __asan_register_globals(struct kasan_global *globals, size_t size);
+void __asan_unregister_globals(struct kasan_global *globals, size_t size);
+void __asan_loadN(unsigned long addr, size_t size);
+void __asan_storeN(unsigned long addr, size_t size);
+void __asan_handle_no_return(void);
+void __asan_poison_stack_memory(const void *addr, size_t size);
+void __asan_unpoison_stack_memory(const void *addr, size_t size);
+void __asan_alloca_poison(unsigned long addr, size_t size);
+void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom);
+
+void __asan_load1(unsigned long addr);
+void __asan_store1(unsigned long addr);
+void __asan_load2(unsigned long addr);
+void __asan_store2(unsigned long addr);
+void __asan_load4(unsigned long addr);
+void __asan_store4(unsigned long addr);
+void __asan_load8(unsigned long addr);
+void __asan_store8(unsigned long addr);
+void __asan_load16(unsigned long addr);
+void __asan_store16(unsigned long addr);
+
+void __asan_load1_noabort(unsigned long addr);
+void __asan_store1_noabort(unsigned long addr);
+void __asan_load2_noabort(unsigned long addr);
+void __asan_store2_noabort(unsigned long addr);
+void __asan_load4_noabort(unsigned long addr);
+void __asan_store4_noabort(unsigned long addr);
+void __asan_load8_noabort(unsigned long addr);
+void __asan_store8_noabort(unsigned long addr);
+void __asan_load16_noabort(unsigned long addr);
+void __asan_store16_noabort(unsigned long addr);
+
+void __asan_set_shadow_00(const void *addr, size_t size);
+void __asan_set_shadow_f1(const void *addr, size_t size);
+void __asan_set_shadow_f2(const void *addr, size_t size);
+void __asan_set_shadow_f3(const void *addr, size_t size);
+void __asan_set_shadow_f5(const void *addr, size_t size);
+void __asan_set_shadow_f8(const void *addr, size_t size);
+
#endif
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index baabaad4a4aa..3a8ddf8baf7d 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -25,6 +25,7 @@
#include <linux/printk.h>
#include <linux/shrinker.h>
#include <linux/slab.h>
+#include <linux/srcu.h>
#include <linux/string.h>
#include <linux/types.h>
@@ -86,24 +87,9 @@ static void qlist_move_all(struct qlist_head *from, struct qlist_head *to)
qlist_init(from);
}
-static void qlist_move(struct qlist_head *from, struct qlist_node *last,
- struct qlist_head *to, size_t size)
-{
- if (unlikely(last == from->tail)) {
- qlist_move_all(from, to);
- return;
- }
- if (qlist_empty(to))
- to->head = from->head;
- else
- to->tail->next = from->head;
- to->tail = last;
- from->head = last->next;
- last->next = NULL;
- from->bytes -= size;
- to->bytes += size;
-}
-
+#define QUARANTINE_PERCPU_SIZE (1 << 20)
+#define QUARANTINE_BATCHES \
+ (1024 > 4 * CONFIG_NR_CPUS ? 1024 : 4 * CONFIG_NR_CPUS)
/*
* The object quarantine consists of per-cpu queues and a global queue,
@@ -111,11 +97,23 @@ static void qlist_move(struct qlist_head *from, struct qlist_node *last,
*/
static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine);
-static struct qlist_head global_quarantine;
+/* Round-robin FIFO array of batches. */
+static struct qlist_head global_quarantine[QUARANTINE_BATCHES];
+static int quarantine_head;
+static int quarantine_tail;
+/* Total size of all objects in global_quarantine across all batches. */
+static unsigned long quarantine_size;
static DEFINE_SPINLOCK(quarantine_lock);
+DEFINE_STATIC_SRCU(remove_cache_srcu);
/* Maximum size of the global queue. */
-static unsigned long quarantine_size;
+static unsigned long quarantine_max_size;
+
+/*
+ * Target size of a batch in global_quarantine.
+ * Usually equal to QUARANTINE_PERCPU_SIZE unless we have too much RAM.
+ */
+static unsigned long quarantine_batch_size;
/*
* The fraction of physical memory the quarantine is allowed to occupy.
@@ -124,9 +122,6 @@ static unsigned long quarantine_size;
*/
#define QUARANTINE_FRACTION 32
-#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4)
-#define QUARANTINE_PERCPU_SIZE (1 << 20)
-
static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
{
return virt_to_head_page(qlink)->slab_cache;
@@ -180,62 +175,89 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
struct qlist_head *q;
struct qlist_head temp = QLIST_INIT;
+ /*
+ * Note: irq must be disabled until after we move the batch to the
+ * global quarantine. Otherwise quarantine_remove_cache() can miss
+ * some objects belonging to the cache if they are in our local temp
+ * list. quarantine_remove_cache() executes on_each_cpu() at the
+ * beginning which ensures that it either sees the objects in per-cpu
+ * lists or in the global quarantine.
+ */
local_irq_save(flags);
q = this_cpu_ptr(&cpu_quarantine);
qlist_put(q, &info->quarantine_link, cache->size);
- if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE))
+ if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) {
qlist_move_all(q, &temp);
- local_irq_restore(flags);
-
- if (unlikely(!qlist_empty(&temp))) {
- spin_lock_irqsave(&quarantine_lock, flags);
- qlist_move_all(&temp, &global_quarantine);
- spin_unlock_irqrestore(&quarantine_lock, flags);
+ spin_lock(&quarantine_lock);
+ WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes);
+ qlist_move_all(&temp, &global_quarantine[quarantine_tail]);
+ if (global_quarantine[quarantine_tail].bytes >=
+ READ_ONCE(quarantine_batch_size)) {
+ int new_tail;
+
+ new_tail = quarantine_tail + 1;
+ if (new_tail == QUARANTINE_BATCHES)
+ new_tail = 0;
+ if (new_tail != quarantine_head)
+ quarantine_tail = new_tail;
+ }
+ spin_unlock(&quarantine_lock);
}
+
+ local_irq_restore(flags);
}
void quarantine_reduce(void)
{
- size_t new_quarantine_size, percpu_quarantines;
+ size_t total_size, new_quarantine_size, percpu_quarantines;
unsigned long flags;
+ int srcu_idx;
struct qlist_head to_free = QLIST_INIT;
- size_t size_to_free = 0;
- struct qlist_node *last;
- if (likely(READ_ONCE(global_quarantine.bytes) <=
- READ_ONCE(quarantine_size)))
+ if (likely(READ_ONCE(quarantine_size) <=
+ READ_ONCE(quarantine_max_size)))
return;
+ /*
+ * srcu critical section ensures that quarantine_remove_cache()
+ * will not miss objects belonging to the cache while they are in our
+ * local to_free list. srcu is chosen because (1) it gives us private
+ * grace period domain that does not interfere with anything else,
+ * and (2) it allows synchronize_srcu() to return without waiting
+ * if there are no pending read critical sections (which is the
+ * expected case).
+ */
+ srcu_idx = srcu_read_lock(&remove_cache_srcu);
spin_lock_irqsave(&quarantine_lock, flags);
/*
* Update quarantine size in case of hotplug. Allocate a fraction of
* the installed memory to quarantine minus per-cpu queue limits.
*/
- new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
+ total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
QUARANTINE_FRACTION;
percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
- new_quarantine_size = (new_quarantine_size < percpu_quarantines) ?
- 0 : new_quarantine_size - percpu_quarantines;
- WRITE_ONCE(quarantine_size, new_quarantine_size);
-
- last = global_quarantine.head;
- while (last) {
- struct kmem_cache *cache = qlink_to_cache(last);
-
- size_to_free += cache->size;
- if (!last->next || size_to_free >
- global_quarantine.bytes - QUARANTINE_LOW_SIZE)
- break;
- last = last->next;
+ new_quarantine_size = (total_size < percpu_quarantines) ?
+ 0 : total_size - percpu_quarantines;
+ WRITE_ONCE(quarantine_max_size, new_quarantine_size);
+ /* Aim at consuming at most 1/2 of slots in quarantine. */
+ WRITE_ONCE(quarantine_batch_size, max((size_t)QUARANTINE_PERCPU_SIZE,
+ 2 * total_size / QUARANTINE_BATCHES));
+
+ if (likely(quarantine_size > quarantine_max_size)) {
+ qlist_move_all(&global_quarantine[quarantine_head], &to_free);
+ WRITE_ONCE(quarantine_size, quarantine_size - to_free.bytes);
+ quarantine_head++;
+ if (quarantine_head == QUARANTINE_BATCHES)
+ quarantine_head = 0;
}
- qlist_move(&global_quarantine, last, &to_free, size_to_free);
spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, NULL);
+ srcu_read_unlock(&remove_cache_srcu, srcu_idx);
}
static void qlist_move_cache(struct qlist_head *from,
@@ -273,16 +295,34 @@ static void per_cpu_remove_cache(void *arg)
qlist_free_all(&to_free, cache);
}
+/* Free all quarantined objects belonging to cache. */
void quarantine_remove_cache(struct kmem_cache *cache)
{
- unsigned long flags;
+ unsigned long flags, i;
struct qlist_head to_free = QLIST_INIT;
+ /*
+ * Must be careful to not miss any objects that are being moved from
+ * per-cpu list to the global quarantine in quarantine_put(),
+ * nor objects being freed in quarantine_reduce(). on_each_cpu()
+ * achieves the first goal, while synchronize_srcu() achieves the
+ * second.
+ */
on_each_cpu(per_cpu_remove_cache, cache, 1);
spin_lock_irqsave(&quarantine_lock, flags);
- qlist_move_cache(&global_quarantine, &to_free, cache);
+ for (i = 0; i < QUARANTINE_BATCHES; i++) {
+ if (qlist_empty(&global_quarantine[i]))
+ continue;
+ qlist_move_cache(&global_quarantine[i], &to_free, cache);
+ /* Scanning whole quarantine can take a while. */
+ spin_unlock_irqrestore(&quarantine_lock, flags);
+ cond_resched();
+ spin_lock_irqsave(&quarantine_lock, flags);
+ }
spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, cache);
+
+ synchronize_srcu(&remove_cache_srcu);
}
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 8ca412aebcf1..a3247975aaae 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -13,7 +13,9 @@
*
*/
+#include <linux/bitops.h>
#include <linux/ftrace.h>
+#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/printk.h>
@@ -49,7 +51,13 @@ static const void *find_first_bad_addr(const void *addr, size_t size)
return first_bad_addr;
}
-static void print_error_description(struct kasan_access_info *info)
+static bool addr_has_shadow(struct kasan_access_info *info)
+{
+ return (info->access_addr >=
+ kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
+}
+
+static const char *get_shadow_bug_type(struct kasan_access_info *info)
{
const char *bug_type = "unknown-crash";
u8 *shadow_addr;
@@ -94,14 +102,45 @@ static void print_error_description(struct kasan_access_info *info)
case KASAN_USE_AFTER_SCOPE:
bug_type = "use-after-scope";
break;
+ case KASAN_ALLOCA_LEFT:
+ case KASAN_ALLOCA_RIGHT:
+ bug_type = "alloca-out-of-bounds";
+ break;
}
- pr_err("BUG: KASAN: %s in %pS at addr %p\n",
- bug_type, (void *)info->ip,
- info->access_addr);
- pr_err("%s of size %zu by task %s/%d\n",
- info->is_write ? "Write" : "Read",
- info->access_size, current->comm, task_pid_nr(current));
+ return bug_type;
+}
+
+static const char *get_wild_bug_type(struct kasan_access_info *info)
+{
+ const char *bug_type = "unknown-crash";
+
+ if ((unsigned long)info->access_addr < PAGE_SIZE)
+ bug_type = "null-ptr-deref";
+ else if ((unsigned long)info->access_addr < TASK_SIZE)
+ bug_type = "user-memory-access";
+ else
+ bug_type = "wild-memory-access";
+
+ return bug_type;
+}
+
+static const char *get_bug_type(struct kasan_access_info *info)
+{
+ if (addr_has_shadow(info))
+ return get_shadow_bug_type(info);
+ return get_wild_bug_type(info);
+}
+
+static void print_error_description(struct kasan_access_info *info)
+{
+ const char *bug_type = get_bug_type(info);
+
+ pr_err("BUG: KASAN: %s in %pS\n",
+ bug_type, (void *)info->ip);
+ pr_err("%s of size %zu at addr %p by task %s/%d\n",
+ info->is_write ? "Write" : "Read", info->access_size,
+ info->access_addr, current->comm, task_pid_nr(current));
}
static inline bool kernel_or_module_addr(const void *addr)
@@ -137,12 +176,14 @@ static void kasan_end_report(unsigned long *flags)
pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, *flags);
+ if (panic_on_warn)
+ panic("panic_on_warn set ...\n");
kasan_enable_current();
}
-static void print_track(struct kasan_track *track)
+static void print_track(struct kasan_track *track, const char *prefix)
{
- pr_err("PID = %u\n", track->pid);
+ pr_err("%s by task %u:\n", prefix, track->pid);
if (track->stack) {
struct stack_trace trace;
@@ -153,59 +194,84 @@ static void print_track(struct kasan_track *track)
}
}
-static void kasan_object_err(struct kmem_cache *cache, void *object)
+static struct page *addr_to_page(const void *addr)
{
- struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
+ if ((addr >= (void *)PAGE_OFFSET) &&
+ (addr < high_memory))
+ return virt_to_head_page(addr);
+ return NULL;
+}
- dump_stack();
- pr_err("Object at %p, in cache %s size: %d\n", object, cache->name,
- cache->object_size);
+static void describe_object_addr(struct kmem_cache *cache, void *object,
+ const void *addr)
+{
+ unsigned long access_addr = (unsigned long)addr;
+ unsigned long object_addr = (unsigned long)object;
+ const char *rel_type;
+ int rel_bytes;
- if (!(cache->flags & SLAB_KASAN))
+ pr_err("The buggy address belongs to the object at %p\n"
+ " which belongs to the cache %s of size %d\n",
+ object, cache->name, cache->object_size);
+
+ if (!addr)
return;
- pr_err("Allocated:\n");
- print_track(&alloc_info->alloc_track);
- pr_err("Freed:\n");
- print_track(&alloc_info->free_track);
+ if (access_addr < object_addr) {
+ rel_type = "to the left";
+ rel_bytes = object_addr - access_addr;
+ } else if (access_addr >= object_addr + cache->object_size) {
+ rel_type = "to the right";
+ rel_bytes = access_addr - (object_addr + cache->object_size);
+ } else {
+ rel_type = "inside";
+ rel_bytes = access_addr - object_addr;
+ }
+
+ pr_err("The buggy address is located %d bytes %s of\n"
+ " %d-byte region [%p, %p)\n",
+ rel_bytes, rel_type, cache->object_size, (void *)object_addr,
+ (void *)(object_addr + cache->object_size));
}
-void kasan_report_double_free(struct kmem_cache *cache, void *object,
- s8 shadow)
+static void describe_object(struct kmem_cache *cache, void *object,
+ const void *addr)
{
- unsigned long flags;
+ struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
- kasan_start_report(&flags);
- pr_err("BUG: Double free or freeing an invalid pointer\n");
- pr_err("Unexpected shadow byte: 0x%hhX\n", shadow);
- kasan_object_err(cache, object);
- kasan_end_report(&flags);
+ if (cache->flags & SLAB_KASAN) {
+ print_track(&alloc_info->alloc_track, "Allocated");
+ pr_err("\n");
+ print_track(&alloc_info->free_track, "Freed");
+ pr_err("\n");
+ }
+
+ describe_object_addr(cache, object, addr);
}
-static void print_address_description(struct kasan_access_info *info)
+static void print_address_description(void *addr)
{
- const void *addr = info->access_addr;
+ struct page *page = addr_to_page(addr);
- if ((addr >= (void *)PAGE_OFFSET) &&
- (addr < high_memory)) {
- struct page *page = virt_to_head_page(addr);
-
- if (PageSlab(page)) {
- void *object;
- struct kmem_cache *cache = page->slab_cache;
- object = nearest_obj(cache, page,
- (void *)info->access_addr);
- kasan_object_err(cache, object);
- return;
- }
- dump_page(page, "kasan: bad access detected");
+ dump_stack();
+ pr_err("\n");
+
+ if (page && PageSlab(page)) {
+ struct kmem_cache *cache = page->slab_cache;
+ void *object = nearest_obj(cache, page, addr);
+
+ describe_object(cache, object, addr);
}
- if (kernel_or_module_addr(addr)) {
- if (!init_task_stack_addr(addr))
- pr_err("Address belongs to variable %pS\n", addr);
+ if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) {
+ pr_err("The buggy address belongs to the variable:\n");
+ pr_err(" %pS\n", addr);
+ }
+
+ if (page) {
+ pr_err("The buggy address belongs to the page:\n");
+ dump_page(page, "kasan: bad access detected");
}
- dump_stack();
}
static bool row_is_guilty(const void *row, const void *guilty)
@@ -260,37 +326,74 @@ static void print_shadow_for_address(const void *addr)
}
}
+void kasan_report_double_free(struct kmem_cache *cache, void *object,
+ void *ip)
+{
+ unsigned long flags;
+
+ kasan_start_report(&flags);
+ pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", ip);
+ pr_err("\n");
+ print_address_description(object);
+ pr_err("\n");
+ print_shadow_for_address(object);
+ kasan_end_report(&flags);
+}
+
static void kasan_report_error(struct kasan_access_info *info)
{
unsigned long flags;
- const char *bug_type;
kasan_start_report(&flags);
- if (info->access_addr <
- kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) {
- if ((unsigned long)info->access_addr < PAGE_SIZE)
- bug_type = "null-ptr-deref";
- else if ((unsigned long)info->access_addr < TASK_SIZE)
- bug_type = "user-memory-access";
- else
- bug_type = "wild-memory-access";
- pr_err("BUG: KASAN: %s on address %p\n",
- bug_type, info->access_addr);
- pr_err("%s of size %zu by task %s/%d\n",
- info->is_write ? "Write" : "Read",
- info->access_size, current->comm,
- task_pid_nr(current));
+ print_error_description(info);
+ pr_err("\n");
+
+ if (!addr_has_shadow(info)) {
dump_stack();
} else {
- print_error_description(info);
- print_address_description(info);
+ print_address_description((void *)info->access_addr);
+ pr_err("\n");
print_shadow_for_address(info->first_bad_addr);
}
kasan_end_report(&flags);
}
+static unsigned long kasan_flags;
+
+#define KASAN_BIT_REPORTED 0
+#define KASAN_BIT_MULTI_SHOT 1
+
+bool kasan_save_enable_multi_shot(void)
+{
+ return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
+}
+EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot);
+
+void kasan_restore_multi_shot(bool enabled)
+{
+ if (!enabled)
+ clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
+}
+EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
+
+static int __init kasan_set_multi_shot(char *str)
+{
+ set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
+ return 1;
+}
+__setup("kasan_multi_shot", kasan_set_multi_shot);
+
+static inline bool kasan_report_enabled(void)
+{
+ if (current->kasan_depth)
+ return false;
+ if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
+ return true;
+ return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
+}
+
void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip)
{
diff --git a/mm/madvise.c b/mm/madvise.c
index 4a01c4bd786c..6849e4e2be22 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -109,7 +109,7 @@ static long madvise_behavior(struct vm_area_struct *vma,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
if (*prev) {
vma = *prev;
goto success;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e21d9b44247b..d80898272227 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -752,7 +752,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
((vmstart - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
- new_pol, vma->vm_userfaultfd_ctx);
+ new_pol, vma->vm_userfaultfd_ctx,
+ vma_get_anon_name(vma));
if (prev) {
vma = prev;
next = vma->vm_next;
diff --git a/mm/mlock.c b/mm/mlock.c
index f0505692a5f4..9cdd063b7d32 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -529,7 +529,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
if (*prev) {
vma = *prev;
goto success;
diff --git a/mm/mmap.c b/mm/mmap.c
index 283755645d17..131641484303 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -970,7 +970,8 @@ again:
*/
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char __user *anon_name)
{
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -988,6 +989,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
return 0;
if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
return 0;
+ if (vma_get_anon_name(vma) != anon_name)
+ return 0;
return 1;
}
@@ -1020,9 +1023,10 @@ static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char __user *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
if (vma->vm_pgoff == vm_pgoff)
return 1;
@@ -1041,9 +1045,10 @@ static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char __user *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
vm_pglen = vma_pages(vma);
@@ -1054,9 +1059,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
}
/*
- * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
- * whether that can be merged with its predecessor or its successor.
- * Or both (it neatly fills a hole).
+ * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
+ * figure out whether that can be merged with its predecessor or its
+ * successor. Or both (it neatly fills a hole).
*
* In most cases - when called for mmap, brk or mremap - [addr,end) is
* certain not to be mapped by the time vma_merge is called; but when
@@ -1098,7 +1103,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char __user *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
@@ -1131,7 +1137,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff,
- vm_userfaultfd_ctx)) {
+ vm_userfaultfd_ctx,
+ anon_name)) {
/*
* OK, it can. Can we now merge in the successor as well?
*/
@@ -1140,7 +1147,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
can_vma_merge_before(next, vm_flags,
anon_vma, file,
pgoff+pglen,
- vm_userfaultfd_ctx) &&
+ vm_userfaultfd_ctx,
+ anon_name) &&
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
@@ -1163,7 +1171,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen,
- vm_userfaultfd_ctx)) {
+ vm_userfaultfd_ctx,
+ anon_name)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = __vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL, next);
@@ -1673,7 +1682,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
* Can we just expand an old mapping?
*/
vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
- NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
+ NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
if (vma)
goto out;
@@ -2733,6 +2742,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
return 0;
}
+EXPORT_SYMBOL(do_munmap);
int vm_munmap(unsigned long start, size_t len)
{
@@ -2922,7 +2932,7 @@ static int do_brk(unsigned long addr, unsigned long len)
/* Can we just expand an old private anonymous mapping? */
vma = vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
+ NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
if (vma)
goto out;
@@ -3090,7 +3100,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
if (new_vma) {
/*
* Source vma may have been merged into new_vma
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 6896f77be166..5471f35be825 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -352,7 +352,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
if (*pprev) {
vma = *pprev;
VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 807236aed275..0bd812d6c750 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2185,30 +2185,14 @@ retry:
while (!done && (index <= end)) {
int i;
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
+ tag);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- /*
- * At this point, the page may be truncated or
- * invalidated (changing page->mapping to NULL), or
- * even swizzled back from swapper_space to tmpfs file
- * mapping. However, page->index will not change
- * because we have a reference on the page.
- */
- if (page->index > end) {
- /*
- * can't be range_cyclic (1st pass) because
- * end == -1 in that case.
- */
- done = 1;
- break;
- }
-
done_index = page->index;
lock_page(page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3af727d95c17..cc23cdadf3eb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -253,10 +253,22 @@ compound_page_dtor * const compound_page_dtors[] = {
#endif
};
+/*
+ * Try to keep at least this much lowmem free. Do not allow normal
+ * allocations below this point, only high priority ones. Automatically
+ * tuned according to the amount of memory in the system.
+ */
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
int watermark_scale_factor = 10;
+/*
+ * Extra memory for the system to try freeing. Used to temporarily
+ * free memory, to make space for new workloads. Anyone can allocate
+ * down to the min watermarks controlled by min_free_kbytes above.
+ */
+int extra_free_kbytes = 0;
+
static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
static unsigned long __meminitdata dma_reserve;
@@ -2341,8 +2353,9 @@ static void drain_pages(unsigned int cpu)
* The CPU has to be pinned. When zone parameter is non-NULL, spill just
* the single zone's pages.
*/
-void drain_local_pages(struct zone *zone)
+void drain_local_pages(void *z)
{
+ struct zone *zone = (struct zone *)z;
int cpu = smp_processor_id();
if (zone)
@@ -2402,8 +2415,7 @@ void drain_all_pages(struct zone *zone)
else
cpumask_clear_cpu(cpu, &cpus_with_pcps);
}
- on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
- zone, 1);
+ on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, zone, 1);
}
#ifdef CONFIG_HIBERNATION
@@ -6653,6 +6665,7 @@ static void setup_per_zone_lowmem_reserve(void)
static void __setup_per_zone_wmarks(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+ unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
struct zone *zone;
unsigned long flags;
@@ -6664,11 +6677,14 @@ static void __setup_per_zone_wmarks(void)
}
for_each_zone(zone) {
- u64 tmp;
+ u64 min, low;
spin_lock_irqsave(&zone->lock, flags);
- tmp = (u64)pages_min * zone->managed_pages;
- do_div(tmp, lowmem_pages);
+ min = (u64)pages_min * zone->managed_pages;
+ do_div(min, lowmem_pages);
+ low = (u64)pages_low * zone->managed_pages;
+ do_div(low, vm_total_pages);
+
if (is_highmem(zone)) {
/*
* __GFP_HIGH and PF_MEMALLOC allocations usually don't
@@ -6689,7 +6705,7 @@ static void __setup_per_zone_wmarks(void)
* If it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
- zone->watermark[WMARK_MIN] = tmp;
+ zone->watermark[WMARK_MIN] = min;
}
/*
@@ -6697,12 +6713,14 @@ static void __setup_per_zone_wmarks(void)
* scale factor in proportion to available memory, but
* ensure a minimum size on small systems.
*/
- tmp = max_t(u64, tmp >> 2,
+ min = max_t(u64, min >> 2,
mult_frac(zone->managed_pages,
watermark_scale_factor, 10000));
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+ zone->watermark[WMARK_LOW] = min_wmark_pages(zone) +
+ low + min;
+ zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
+ low + min * 2;
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -6783,7 +6801,7 @@ core_initcall(init_per_zone_wmark_min)
/*
* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
* that we can call two helper functions whenever min_free_kbytes
- * changes.
+ * or extra_free_kbytes changes.
*/
int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
diff --git a/mm/readahead.c b/mm/readahead.c
index c8a955b1297e..3c1d2a4612c8 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -81,7 +81,7 @@ static void read_cache_pages_invalidate_pages(struct address_space *mapping,
* Hides the details of the LRU cache etc from the filesystems.
*/
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
- int (*filler)(void *, struct page *), void *data)
+ int (*filler)(struct file *, struct page *), void *data)
{
struct page *page;
int ret = 0;
diff --git a/mm/shmem.c b/mm/shmem.c
index 9b17bd4cbc5e..d15851d9c53c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -4083,6 +4083,14 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
}
EXPORT_SYMBOL_GPL(shmem_file_setup);
+void shmem_set_file(struct vm_area_struct *vma, struct file *file)
+{
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ vma->vm_file = file;
+ vma->vm_ops = &shmem_vm_ops;
+}
+
/**
* shmem_zero_setup - setup a shared anonymous mapping
* @vma: the vma to be mmapped is prepared by do_mmap_pgoff
@@ -4102,10 +4110,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
if (IS_ERR(file))
return PTR_ERR(file);
- if (vma->vm_file)
- fput(vma->vm_file);
- vma->vm_file = file;
- vma->vm_ops = &shmem_vm_ops;
+ shmem_set_file(vma, file);
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 13f1926f8fcd..467cf11f6990 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -458,6 +458,9 @@ EXPORT_SYMBOL(kmem_cache_create);
static int shutdown_cache(struct kmem_cache *s,
struct list_head *release, bool *need_rcu_barrier)
{
+ /* free asan quarantined objects */
+ kasan_cache_shutdown(s);
+
if (__kmem_cache_shutdown(s) != 0)
return -EBUSY;
@@ -741,7 +744,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
get_online_cpus();
get_online_mems();
- kasan_cache_destroy(s);
mutex_lock(&slab_mutex);
s->refcount--;
diff --git a/mm/slub.c b/mm/slub.c
index 131dee87a67c..ecaa780f543a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4667,6 +4667,22 @@ enum slab_stat_type {
#define SO_OBJECTS (1 << SL_OBJECTS)
#define SO_TOTAL (1 << SL_TOTAL)
+#ifdef CONFIG_MEMCG
+static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON);
+
+static int __init setup_slub_memcg_sysfs(char *str)
+{
+ int v;
+
+ if (get_option(&str, &v) > 0)
+ memcg_sysfs_enabled = v;
+
+ return 1;
+}
+
+__setup("slub_memcg_sysfs=", setup_slub_memcg_sysfs);
+#endif
+
static ssize_t show_slab_objects(struct kmem_cache *s,
char *buf, unsigned long flags)
{
@@ -5572,8 +5588,14 @@ static int sysfs_slab_add(struct kmem_cache *s)
{
int err;
const char *name;
+ struct kset *kset = cache_kset(s);
int unmergeable = slab_unmergeable(s);
+ if (!kset) {
+ kobject_init(&s->kobj, &slab_ktype);
+ return 0;
+ }
+
if (unmergeable) {
/*
* Slabcache can never be merged so we can use the name proper.
@@ -5590,7 +5612,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
name = create_unique_id(s);
}
- s->kobj.kset = cache_kset(s);
+ s->kobj.kset = kset;
err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
if (err)
goto out;
@@ -5600,7 +5622,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
goto out_del_kobj;
#ifdef CONFIG_MEMCG
- if (is_root_cache(s)) {
+ if (is_root_cache(s) && memcg_sysfs_enabled) {
s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
if (!s->memcg_kset) {
err = -ENOMEM;
diff --git a/mm/swap.c b/mm/swap.c
index 4dcf852e1e6d..e75bb400b2e5 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -954,15 +954,25 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
}
EXPORT_SYMBOL(pagevec_lookup);
-unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
- pgoff_t *index, int tag, unsigned nr_pages)
+unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
+ struct address_space *mapping, pgoff_t *index, pgoff_t end,
+ int tag)
{
- pvec->nr = find_get_pages_tag(mapping, index, tag,
- nr_pages, pvec->pages);
+ pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
+ PAGEVEC_SIZE, pvec->pages);
return pagevec_count(pvec);
}
-EXPORT_SYMBOL(pagevec_lookup_tag);
+EXPORT_SYMBOL(pagevec_lookup_range_tag);
+unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
+ struct address_space *mapping, pgoff_t *index, pgoff_t end,
+ int tag, unsigned max_pages)
+{
+ pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
+ min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages);
+ return pagevec_count(pvec);
+}
+EXPORT_SYMBOL(pagevec_lookup_range_nr_tag);
/*
* Perform any setup for the swap system
*/
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index d3548c48369f..6382bdae949d 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -188,6 +188,7 @@ static int zs_size_classes;
* (see: fix_fullness_group())
*/
static const int fullness_threshold_frac = 4;
+static size_t huge_class_size;
struct size_class {
spinlock_t lock;
@@ -1484,6 +1485,25 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
+/**
+ * zs_huge_class_size() - Returns the size (in bytes) of the first huge
+ * zsmalloc &size_class.
+ * @pool: zsmalloc pool to use
+ *
+ * The function returns the size of the first huge class - any object of equal
+ * or bigger size will be stored in zspage consisting of a single physical
+ * page.
+ *
+ * Context: Any context.
+ *
+ * Return: the size (in bytes) of the first huge zsmalloc &size_class.
+ */
+size_t zs_huge_class_size(struct zs_pool *pool)
+{
+ return huge_class_size;
+}
+EXPORT_SYMBOL_GPL(zs_huge_class_size);
+
static unsigned long obj_malloc(struct size_class *class,
struct zspage *zspage, unsigned long handle)
{
@@ -2440,6 +2460,27 @@ struct zs_pool *zs_create_pool(const char *name)
objs_per_zspage = pages_per_zspage * PAGE_SIZE / size;
/*
+ * We iterate from biggest down to smallest classes,
+ * so huge_class_size holds the size of the first huge
+ * class. Any object bigger than or equal to that will
+ * endup in the huge class.
+ */
+ if (pages_per_zspage != 1 && objs_per_zspage != 1 &&
+ !huge_class_size) {
+ huge_class_size = size;
+ /*
+ * The object uses ZS_HANDLE_SIZE bytes to store the
+ * handle. We need to subtract it, because zs_malloc()
+ * unconditionally adds handle size before it performs
+ * size class search - so object may be smaller than
+ * huge class size, yet it still can end up in the huge
+ * class because it grows by ZS_HANDLE_SIZE extra bytes
+ * right before class lookup.
+ */
+ huge_class_size -= (ZS_HANDLE_SIZE - 1);
+ }
+
+ /*
* size_class is used for normal zsmalloc operation such
* as alloc/free for that size. Although it is natural that we
* have one size_class for each size, there is a chance that we
diff --git a/net/Kconfig b/net/Kconfig
index 7b6cd340b72b..37757600171a 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -89,6 +89,12 @@ source "net/netlabel/Kconfig"
endif # if INET
+config ANDROID_PARANOID_NETWORK
+ bool "Only allow certain groups to create sockets"
+ default y
+ help
+ none
+
config NETWORK_SECMARK
bool "Security Marking"
help
@@ -258,10 +264,6 @@ config XPS
config HWBM
bool
-config SOCK_CGROUP_DATA
- bool
- default n
-
config CGROUP_NET_PRIO
bool "Network priority cgroup"
depends on CGROUPS
@@ -292,6 +294,7 @@ config BPF_JIT
bool "enable BPF Just In Time compiler"
depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT
depends on MODULES
+ depends on !CFI
---help---
Berkeley Packet Filter filtering capabilities are normally handled
by an interpreter. This option allows kernel to generate a native
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 5d3698170004..ec313c9dee6d 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -106,11 +106,40 @@ void bt_sock_unregister(int proto)
}
EXPORT_SYMBOL(bt_sock_unregister);
+#ifdef CONFIG_PARANOID_NETWORK
+static inline int current_has_bt_admin(void)
+{
+ return !current_euid();
+}
+
+static inline int current_has_bt(void)
+{
+ return current_has_bt_admin();
+}
+# else
+static inline int current_has_bt_admin(void)
+{
+ return 1;
+}
+
+static inline int current_has_bt(void)
+{
+ return 1;
+}
+#endif
+
static int bt_sock_create(struct net *net, struct socket *sock, int proto,
int kern)
{
int err;
+ if (proto == BTPROTO_RFCOMM || proto == BTPROTO_SCO ||
+ proto == BTPROTO_L2CAP) {
+ if (!current_has_bt())
+ return -EPERM;
+ } else if (!current_has_bt_admin())
+ return -EPERM;
+
if (net != &init_net)
return -EAFNOSUPPORT;
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 5f5e28f210e0..04eea2f4b80f 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -48,11 +48,6 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
}
- u64_stats_update_begin(&brstats->syncp);
- brstats->tx_packets++;
- brstats->tx_bytes += skb->len;
- u64_stats_update_end(&brstats->syncp);
-
#ifdef CONFIG_NET_SWITCHDEV
skb->offload_fwd_mark = 0;
#endif
@@ -61,6 +56,12 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
skb_reset_mac_header(skb);
skb_pull(skb, ETH_HLEN);
+ u64_stats_update_begin(&brstats->syncp);
+ brstats->tx_packets++;
+ /* Exclude ETH_HLEN from byte stats for consistency with Rx chain */
+ brstats->tx_bytes += skb->len;
+ u64_stats_update_end(&brstats->syncp);
+
if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid))
goto out;
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index be4629c344a6..31c4041f7586 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -18,6 +18,11 @@
#include <net/fib_rules.h>
#include <net/ip_tunnels.h>
+static const struct fib_kuid_range fib_kuid_range_unset = {
+ KUIDT_INIT(0),
+ KUIDT_INIT(~0),
+};
+
int fib_default_rule_add(struct fib_rules_ops *ops,
u32 pref, u32 table, u32 flags)
{
@@ -33,6 +38,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
r->table = table;
r->flags = flags;
r->fr_net = ops->fro_net;
+ r->uid_range = fib_kuid_range_unset;
r->suppress_prefixlen = -1;
r->suppress_ifgroup = -1;
@@ -172,6 +178,34 @@ void fib_rules_unregister(struct fib_rules_ops *ops)
}
EXPORT_SYMBOL_GPL(fib_rules_unregister);
+static int uid_range_set(struct fib_kuid_range *range)
+{
+ return uid_valid(range->start) && uid_valid(range->end);
+}
+
+static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb)
+{
+ struct fib_rule_uid_range *in;
+ struct fib_kuid_range out;
+
+ in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]);
+
+ out.start = make_kuid(current_user_ns(), in->start);
+ out.end = make_kuid(current_user_ns(), in->end);
+
+ return out;
+}
+
+static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range)
+{
+ struct fib_rule_uid_range out = {
+ from_kuid_munged(current_user_ns(), range->start),
+ from_kuid_munged(current_user_ns(), range->end)
+ };
+
+ return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out);
+}
+
static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
struct flowi *fl, int flags,
struct fib_lookup_arg *arg)
@@ -193,6 +227,10 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg))
goto out;
+ if (uid_lt(fl->flowi_uid, rule->uid_range.start) ||
+ uid_gt(fl->flowi_uid, rule->uid_range.end))
+ goto out;
+
ret = ops->match(rule, fl, flags);
out:
return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
@@ -305,6 +343,10 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
if (r->l3mdev != rule->l3mdev)
continue;
+ if (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
+ !uid_eq(r->uid_range.end, rule->uid_range.end))
+ continue;
+
if (!ops->compare(r, frh, tb))
continue;
return 1;
@@ -383,6 +425,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh)
if (tb[FRA_TUN_ID])
rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
+ err = -EINVAL;
if (tb[FRA_L3MDEV]) {
#ifdef CONFIG_NET_L3_MASTER_DEV
rule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]);
@@ -404,7 +447,6 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh)
else
rule->suppress_ifgroup = -1;
- err = -EINVAL;
if (tb[FRA_GOTO]) {
if (rule->action != FR_ACT_GOTO)
goto errout_free;
@@ -429,6 +471,21 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh)
if (rule->l3mdev && rule->table)
goto errout_free;
+ if (tb[FRA_UID_RANGE]) {
+ if (current_user_ns() != net->user_ns) {
+ err = -EPERM;
+ goto errout_free;
+ }
+
+ rule->uid_range = nla_get_kuid_range(tb);
+
+ if (!uid_range_set(&rule->uid_range) ||
+ !uid_lte(rule->uid_range.start, rule->uid_range.end))
+ goto errout_free;
+ } else {
+ rule->uid_range = fib_kuid_range_unset;
+ }
+
if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
rule_exists(ops, frh, tb, rule)) {
err = -EEXIST;
@@ -497,6 +554,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
struct fib_rules_ops *ops = NULL;
struct fib_rule *rule, *tmp;
struct nlattr *tb[FRA_MAX+1];
+ struct fib_kuid_range range;
int err = -EINVAL;
if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
@@ -516,6 +574,16 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err < 0)
goto errout;
+ if (tb[FRA_UID_RANGE]) {
+ range = nla_get_kuid_range(tb);
+ if (!uid_range_set(&range)) {
+ err = -EINVAL;
+ goto errout;
+ }
+ } else {
+ range = fib_kuid_range_unset;
+ }
+
list_for_each_entry(rule, &ops->rules_list, list) {
if (frh->action && (frh->action != rule->action))
continue;
@@ -552,6 +620,11 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
(rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV])))
continue;
+ if (uid_range_set(&range) &&
+ (!uid_eq(rule->uid_range.start, range.start) ||
+ !uid_eq(rule->uid_range.end, range.end)))
+ continue;
+
if (!ops->compare(rule, frh, tb))
continue;
@@ -619,7 +692,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
+ nla_total_size(4) /* FRA_FWMARK */
+ nla_total_size(4) /* FRA_FWMASK */
- + nla_total_size_64bit(8); /* FRA_TUN_ID */
+ + nla_total_size_64bit(8) /* FRA_TUN_ID */
+ + nla_total_size(sizeof(struct fib_kuid_range));
if (ops->nlmsg_payload)
payload += ops->nlmsg_payload(rule);
@@ -679,7 +753,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
(rule->tun_id &&
nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) ||
(rule->l3mdev &&
- nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)))
+ nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) ||
+ (uid_range_set(&rule->uid_range) &&
+ nla_put_uid_range(skb, &rule->uid_range)))
goto nla_put_failure;
if (rule->suppress_ifgroup != -1) {
diff --git a/net/core/filter.c b/net/core/filter.c
index e8c89d2d2bc0..c385c55af653 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -26,6 +26,7 @@
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
+#include <linux/sock_diag.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
@@ -78,6 +79,10 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
return -ENOMEM;
+ err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
+ if (err)
+ return err;
+
err = security_sock_rcv_skb(sk, skb);
if (err)
return err;
@@ -85,7 +90,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
rcu_read_lock();
filter = rcu_dereference(sk->sk_filter);
if (filter) {
- unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
+ struct sock *save_sk = skb->sk;
+ unsigned int pkt_len;
+
+ skb->sk = sk;
+ pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
+ skb->sk = save_sk;
err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
}
rcu_read_unlock();
@@ -2533,6 +2543,36 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
.arg5_type = ARG_CONST_STACK_SIZE,
};
+BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
+{
+ return skb->sk ? sock_gen_cookie(skb->sk) : 0;
+}
+
+static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
+ .func = bpf_get_socket_cookie,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
+{
+ struct sock *sk = sk_to_full_sk(skb->sk);
+ kuid_t kuid;
+
+ if (!sk || !sk_fullsock(sk))
+ return overflowuid;
+ kuid = sock_net_uid(sock_net(sk), sk);
+ return from_kuid_munged(sock_net(sk)->user_ns, kuid);
+}
+
+static const struct bpf_func_proto bpf_get_socket_uid_proto = {
+ .func = bpf_get_socket_uid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id)
{
@@ -2554,6 +2594,10 @@ sk_filter_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_trace_printk:
if (capable(CAP_SYS_ADMIN))
return bpf_get_trace_printk_proto();
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_proto;
+ case BPF_FUNC_get_socket_uid:
+ return &bpf_get_socket_uid_proto;
default:
return NULL;
}
@@ -2631,6 +2675,17 @@ xdp_func_proto(enum bpf_func_id func_id)
}
}
+static const struct bpf_func_proto *
+cg_skb_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ default:
+ return sk_filter_func_proto(func_id);
+ }
+}
+
static bool __is_valid_access(int off, int size, enum bpf_access_type type)
{
if (off < 0 || off >= sizeof(struct __sk_buff))
@@ -2993,6 +3048,12 @@ static const struct bpf_verifier_ops xdp_ops = {
.convert_ctx_access = xdp_convert_ctx_access,
};
+static const struct bpf_verifier_ops cg_skb_ops = {
+ .get_func_proto = cg_skb_func_proto,
+ .is_valid_access = sk_filter_is_valid_access,
+ .convert_ctx_access = sk_filter_convert_ctx_access,
+};
+
static struct bpf_prog_type_list sk_filter_type __read_mostly = {
.ops = &sk_filter_ops,
.type = BPF_PROG_TYPE_SOCKET_FILTER,
@@ -3013,12 +3074,18 @@ static struct bpf_prog_type_list xdp_type __read_mostly = {
.type = BPF_PROG_TYPE_XDP,
};
+static struct bpf_prog_type_list cg_skb_type __read_mostly = {
+ .ops = &cg_skb_ops,
+ .type = BPF_PROG_TYPE_CGROUP_SKB,
+};
+
static int __init register_sk_filter_ops(void)
{
bpf_register_prog_type(&sk_filter_type);
bpf_register_prog_type(&sched_cls_type);
bpf_register_prog_type(&sched_act_type);
bpf_register_prog_type(&xdp_type);
+ bpf_register_prog_type(&cg_skb_type);
return 0;
}
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 457f882b0f7b..9b2d61120c0d 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -666,7 +666,7 @@ int netpoll_setup(struct netpoll *np)
int err;
rtnl_lock();
- if (np->dev_name) {
+ if (np->dev_name[0]) {
struct net *net = current->nsproxy->net_ns;
ndev = __dev_get_by_name(net, np->dev_name);
}
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 306b8f0e03c1..fc7ec7a6343e 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2345,7 +2345,7 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET);
} else {
/* slow path: we dont already have xfrm_state */
- x = xfrm_stateonly_find(pn->net, DUMMY_MARK,
+ x = xfrm_stateonly_find(pn->net, DUMMY_MARK, 0,
(xfrm_address_t *)&pkt_dev->cur_daddr,
(xfrm_address_t *)&pkt_dev->cur_saddr,
AF_INET,
diff --git a/net/core/sock.c b/net/core/sock.c
index 68c831e1a5c0..7d42594fa71b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1031,6 +1031,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
union {
int val;
+ u64 val64;
struct linger ling;
struct timeval tm;
} v;
@@ -1261,6 +1262,13 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sk->sk_incoming_cpu;
break;
+
+ case SO_COOKIE:
+ lv = sizeof(u64);
+ if (len < lv)
+ return -EINVAL;
+ v.val64 = sock_gen_cookie(sk);
+ break;
default:
/* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7).
@@ -2441,8 +2449,11 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_type = sock->type;
sk->sk_wq = sock->wq;
sock->sk = sk;
- } else
+ sk->sk_uid = SOCK_INODE(sock)->i_uid;
+ } else {
sk->sk_wq = NULL;
+ sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
+ }
rwlock_init(&sk->sk_callback_lock);
lockdep_set_class_and_name(&sk->sk_callback_lock,
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index d1d9faf3046b..fb467db2344a 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -19,7 +19,7 @@ static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
static DEFINE_MUTEX(sock_diag_table_mutex);
static struct workqueue_struct *broadcast_wq;
-static u64 sock_gen_cookie(struct sock *sk)
+u64 sock_gen_cookie(struct sock *sk)
{
while (1) {
u64 res = atomic64_read(&sk->sk_cookie);
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
index ecc28cff08ab..d502c94b1a82 100644
--- a/net/dns_resolver/dns_query.c
+++ b/net/dns_resolver/dns_query.c
@@ -70,7 +70,7 @@ int dns_query(const char *type, const char *name, size_t namelen,
const char *options, char **_result, time64_t *_expiry)
{
struct key *rkey;
- const struct user_key_payload *upayload;
+ struct user_key_payload *upayload;
const struct cred *saved_cred;
size_t typelen, desclen;
char *desc, *cp;
@@ -141,7 +141,7 @@ int dns_query(const char *type, const char *name, size_t namelen,
if (ret)
goto put;
- upayload = user_key_payload(rkey);
+ upayload = user_key_payload_locked(rkey);
len = upayload->datalen;
ret = -ENOMEM;
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index bc6a6c8b9bcd..a8b934aa9d84 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -16,6 +16,7 @@ obj-y := route.o inetpeer.o protocol.o \
obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
+obj-$(CONFIG_SYSFS) += sysfs_net_ipv4.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
obj-$(CONFIG_IP_MROUTE) += ipmr.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 689246d079ad..8d7c8a841fad 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -89,6 +89,7 @@
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/slab.h>
+#include <linux/netfilter/xt_qtaguid.h>
#include <asm/uaccess.h>
@@ -121,6 +122,19 @@
#endif
#include <net/l3mdev.h>
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+#include <linux/android_aid.h>
+
+static inline int current_has_network(void)
+{
+ return in_egroup_p(AID_INET) || capable(CAP_NET_RAW);
+}
+#else
+static inline int current_has_network(void)
+{
+ return 1;
+}
+#endif
/* The inetsw table contains everything that inet_create needs to
* build a new socket.
@@ -255,6 +269,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
if (protocol < 0 || protocol >= IPPROTO_MAX)
return -EINVAL;
+ if (!current_has_network())
+ return -EACCES;
+
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
@@ -303,8 +320,7 @@ lookup_protocol:
}
err = -EPERM;
- if (sock->type == SOCK_RAW && !kern &&
- !ns_capable(net->user_ns, CAP_NET_RAW))
+ if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
goto out_rcu_unlock;
sock->ops = answer->ops;
@@ -397,6 +413,9 @@ int inet_release(struct socket *sock)
if (sk) {
long timeout;
+#ifdef CONFIG_NETFILTER_XT_MATCH_QTAGUID
+ qtaguid_untag(sock, true);
+#endif
/* Applications forget to leave groups before exiting */
ip_mc_drop_socket(sk);
@@ -566,13 +585,24 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int err;
long timeo;
- if (addr_len < sizeof(uaddr->sa_family))
- return -EINVAL;
+ /*
+ * uaddr can be NULL and addr_len can be 0 if:
+ * sk is a TCP fastopen active socket and
+ * TCP_FASTOPEN_CONNECT sockopt is set and
+ * we already have a valid cookie for this socket.
+ * In this case, user can call write() after connect().
+ * write() will invoke tcp_sendmsg_fastopen() which calls
+ * __inet_stream_connect().
+ */
+ if (uaddr) {
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
- if (uaddr->sa_family == AF_UNSPEC) {
- err = sk->sk_prot->disconnect(sk, flags);
- sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
- goto out;
+ if (uaddr->sa_family == AF_UNSPEC) {
+ err = sk->sk_prot->disconnect(sk, flags);
+ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+ goto out;
+ }
}
switch (sock->state) {
@@ -583,7 +613,10 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
err = -EISCONN;
goto out;
case SS_CONNECTING:
- err = -EALREADY;
+ if (inet_sk(sk)->defer_connect)
+ err = -EINPROGRESS;
+ else
+ err = -EALREADY;
/* Fall out of switch with err, set for this state */
break;
case SS_UNCONNECTED:
@@ -597,6 +630,9 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
sock->state = SS_CONNECTING;
+ if (!err && inet_sk(sk)->defer_connect)
+ goto out;
+
/* Just entered SS_CONNECTING state; the only
* difference is that return value in non-blocking
* case is EINPROGRESS, rather than EALREADY.
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 9364c39d0555..8e556fa75a3b 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -320,7 +320,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
int ret, no_addr;
struct fib_result res;
struct flowi4 fl4;
- struct net *net;
+ struct net *net = dev_net(dev);
bool dev_match;
fl4.flowi4_oif = 0;
@@ -333,6 +333,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_tun_key.tun_id = 0;
fl4.flowi4_flags = 0;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
no_addr = idev->ifa_list == NULL;
@@ -340,13 +341,12 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
trace_fib_validate_source(dev, &fl4);
- net = dev_net(dev);
if (fib_lookup(net, &fl4, &res, 0))
goto last_resort;
if (res.type != RTN_UNICAST &&
(res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
goto e_inval;
- if (!rpf && !fib_num_tclassid_users(dev_net(dev)) &&
+ if (!rpf && !fib_num_tclassid_users(net) &&
(dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev)))
goto last_resort;
fib_combine_itag(itag, &res);
@@ -622,6 +622,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_FLOW] = { .type = NLA_U32 },
[RTA_ENCAP_TYPE] = { .type = NLA_U16 },
[RTA_ENCAP] = { .type = NLA_NESTED },
+ [RTA_UID] = { .type = NLA_U32 },
};
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 31f17f0bbd1c..42a19fb4ae5f 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -425,6 +425,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
fl4.daddr = daddr;
fl4.saddr = saddr;
fl4.flowi4_mark = mark;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_proto = IPPROTO_ICMP;
fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
@@ -473,6 +474,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
param->replyopts.opt.opt.faddr : iph->saddr);
fl4->saddr = saddr;
fl4->flowi4_mark = mark;
+ fl4->flowi4_uid = sock_net_uid(net, NULL);
fl4->flowi4_tos = RT_TOS(tos);
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 528a6777cda0..b857ecccaedb 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -418,7 +418,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
- htons(ireq->ir_num));
+ htons(ireq->ir_num), sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
@@ -456,7 +456,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
- htons(ireq->ir_num));
+ htons(ireq->ir_num), sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 100c86f1f547..5fcafc839dad 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -74,6 +74,7 @@
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <net/lwtunnel.h>
+#include <linux/bpf-cgroup.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
@@ -287,6 +288,13 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
unsigned int mtu;
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ if (ret) {
+ kfree_skb(skb);
+ return ret;
+ }
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
@@ -305,6 +313,20 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
return ip_finish_output2(net, sk, skb);
}
+static int ip_mc_finish_output(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ if (ret) {
+ kfree_skb(skb);
+ return ret;
+ }
+
+ return dev_loopback_xmit(net, sk, skb);
+}
+
int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct rtable *rt = skb_rtable(skb);
@@ -342,7 +364,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, newskb, NULL, newskb->dev,
- dev_loopback_xmit);
+ ip_mc_finish_output);
}
/* Multicasts with ttl 0 must not go beyond the host */
@@ -358,7 +380,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, newskb, NULL, newskb->dev,
- dev_loopback_xmit);
+ ip_mc_finish_output);
}
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
@@ -1600,7 +1622,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
ip_reply_arg_flowi_flags(arg),
daddr, saddr,
- tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
+ tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
+ arg->uid);
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 9adcd4b1b3fd..8fa153c65e76 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -798,7 +798,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
- inet_sk_flowi_flags(sk), faddr, saddr, 0, 0);
+ inet_sk_flowi_flags(sk), faddr, saddr, 0, 0,
+ sk->sk_uid);
security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
rt = ip_route_output_flow(net, &fl4, sk);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 59d8770055ed..a860df28300d 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -612,7 +612,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
hdrincl ? IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk) |
(hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
- daddr, saddr, 0, 0);
+ daddr, saddr, 0, 0, sk->sk_uid);
if (!hdrincl) {
rfv.msg = msg;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 890141d32ab9..cec5e9e2a008 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -510,7 +510,8 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
}
EXPORT_SYMBOL(__ip_select_ident);
-static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
+static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
+ const struct sock *sk,
const struct iphdr *iph,
int oif, u8 tos,
u8 prot, u32 mark, int flow_flags)
@@ -526,19 +527,21 @@ static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
flowi4_init_output(fl4, oif, mark, tos,
RT_SCOPE_UNIVERSE, prot,
flow_flags,
- iph->daddr, iph->saddr, 0, 0);
+ iph->daddr, iph->saddr, 0, 0,
+ sock_net_uid(net, sk));
}
static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
const struct sock *sk)
{
+ const struct net *net = dev_net(skb->dev);
const struct iphdr *iph = ip_hdr(skb);
int oif = skb->dev->ifindex;
u8 tos = RT_TOS(iph->tos);
u8 prot = iph->protocol;
u32 mark = skb->mark;
- __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
+ __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
@@ -555,7 +558,7 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk),
- daddr, inet->inet_saddr, 0, 0);
+ daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
rcu_read_unlock();
}
@@ -807,6 +810,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
struct rtable *rt;
struct flowi4 fl4;
const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct net *net = dev_net(skb->dev);
int oif = skb->dev->ifindex;
u8 tos = RT_TOS(iph->tos);
u8 prot = iph->protocol;
@@ -814,7 +818,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
rt = (struct rtable *) dst;
- __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
+ __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
__ip_do_redirect(rt, skb, &fl4, true);
}
@@ -1035,7 +1039,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
if (!mark)
mark = IP4_REPLY_MARK(net, skb->mark);
- __build_flow_key(&fl4, NULL, iph, oif,
+ __build_flow_key(net, &fl4, NULL, iph, oif,
RT_TOS(iph->tos), protocol, mark, flow_flags);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
@@ -1051,7 +1055,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
struct flowi4 fl4;
struct rtable *rt;
- __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+ __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
if (!fl4.flowi4_mark)
fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
@@ -1070,6 +1074,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
struct rtable *rt;
struct dst_entry *odst = NULL;
bool new = false;
+ struct net *net = sock_net(sk);
bh_lock_sock(sk);
@@ -1083,7 +1088,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
goto out;
}
- __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+ __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
rt = (struct rtable *)odst;
if (odst->obsolete && !odst->ops->check(odst, 0)) {
@@ -1123,7 +1128,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,
struct flowi4 fl4;
struct rtable *rt;
- __build_flow_key(&fl4, NULL, iph, oif,
+ __build_flow_key(net, &fl4, NULL, iph, oif,
RT_TOS(iph->tos), protocol, mark, flow_flags);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
@@ -1138,9 +1143,10 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
const struct iphdr *iph = (const struct iphdr *) skb->data;
struct flowi4 fl4;
struct rtable *rt;
+ struct net *net = sock_net(sk);
- __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
- rt = __ip_route_output_key(sock_net(sk), &fl4);
+ __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
+ rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
__ip_do_redirect(rt, skb, &fl4, false);
ip_rt_put(rt);
@@ -1870,6 +1876,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
fl4.flowi4_flags = 0;
fl4.daddr = daddr;
fl4.saddr = saddr;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
err = fib_lookup(net, &fl4, &res, 0);
if (err != 0) {
if (!IN_DEV_FORWARD(in_dev))
@@ -2524,6 +2531,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
goto nla_put_failure;
+ if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
+ nla_put_u32(skb, RTA_UID,
+ from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
+ goto nla_put_failure;
+
error = rt->dst.error;
if (rt_is_input_route(rt)) {
@@ -2576,6 +2588,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
int mark;
struct sk_buff *skb;
u32 table_id = RT_TABLE_MAIN;
+ kuid_t uid;
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
if (err < 0)
@@ -2603,6 +2616,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
+ if (tb[RTA_UID])
+ uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
+ else
+ uid = (iif ? INVALID_UID : current_uid());
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = dst;
@@ -2610,6 +2627,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
fl4.flowi4_tos = rtm->rtm_tos;
fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
fl4.flowi4_mark = mark;
+ fl4.flowi4_uid = uid;
if (iif) {
struct net_device *dev;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 0597ad73a1fa..4487c71873db 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -373,7 +373,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
inet_sk_flowi_flags(sk),
opt->srr ? opt->faddr : ireq->ir_rmt_addr,
- ireq->ir_loc_addr, th->source, th->dest);
+ ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(sock_net(sk), &fl4);
if (IS_ERR(rt)) {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 024ab833557d..ce43dc7afc16 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -152,6 +152,21 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
return ret;
}
+/* Validate changes from /proc interface. */
+static int proc_tcp_default_init_rwnd(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int old_value = *(int *)ctl->data;
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ int new_value = *(int *)ctl->data;
+
+ if (write && ret == 0 && (new_value < 3 || new_value > 100))
+ *(int *)ctl->data = old_value;
+
+ return ret;
+}
+
static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -633,6 +648,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec_ms_jiffies,
},
{
+ .procname = "tcp_default_init_rwnd",
+ .data = &sysctl_tcp_default_init_rwnd,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_tcp_default_init_rwnd
+ },
+ {
.procname = "icmp_msgs_per_sec",
.data = &sysctl_icmp_msgs_per_sec,
.maxlen = sizeof(int),
diff --git a/net/ipv4/sysfs_net_ipv4.c b/net/ipv4/sysfs_net_ipv4.c
new file mode 100644
index 000000000000..0cbbf10026a6
--- /dev/null
+++ b/net/ipv4/sysfs_net_ipv4.c
@@ -0,0 +1,88 @@
+/*
+ * net/ipv4/sysfs_net_ipv4.c
+ *
+ * sysfs-based networking knobs (so we can, unlike with sysctl, control perms)
+ *
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * Robert Love <rlove@google.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <net/tcp.h>
+
+#define CREATE_IPV4_FILE(_name, _var) \
+static ssize_t _name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buf) \
+{ \
+ return sprintf(buf, "%d\n", _var); \
+} \
+static ssize_t _name##_store(struct kobject *kobj, \
+ struct kobj_attribute *attr, \
+ const char *buf, size_t count) \
+{ \
+ int val, ret; \
+ ret = sscanf(buf, "%d", &val); \
+ if (ret != 1) \
+ return -EINVAL; \
+ if (val < 0) \
+ return -EINVAL; \
+ _var = val; \
+ return count; \
+} \
+static struct kobj_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+CREATE_IPV4_FILE(tcp_wmem_min, sysctl_tcp_wmem[0]);
+CREATE_IPV4_FILE(tcp_wmem_def, sysctl_tcp_wmem[1]);
+CREATE_IPV4_FILE(tcp_wmem_max, sysctl_tcp_wmem[2]);
+
+CREATE_IPV4_FILE(tcp_rmem_min, sysctl_tcp_rmem[0]);
+CREATE_IPV4_FILE(tcp_rmem_def, sysctl_tcp_rmem[1]);
+CREATE_IPV4_FILE(tcp_rmem_max, sysctl_tcp_rmem[2]);
+
+static struct attribute *ipv4_attrs[] = {
+ &tcp_wmem_min_attr.attr,
+ &tcp_wmem_def_attr.attr,
+ &tcp_wmem_max_attr.attr,
+ &tcp_rmem_min_attr.attr,
+ &tcp_rmem_def_attr.attr,
+ &tcp_rmem_max_attr.attr,
+ NULL
+};
+
+static struct attribute_group ipv4_attr_group = {
+ .attrs = ipv4_attrs,
+};
+
+static __init int sysfs_ipv4_init(void)
+{
+ struct kobject *ipv4_kobject;
+ int ret;
+
+ ipv4_kobject = kobject_create_and_add("ipv4", kernel_kobj);
+ if (!ipv4_kobject)
+ return -ENOMEM;
+
+ ret = sysfs_create_group(ipv4_kobject, &ipv4_attr_group);
+ if (ret) {
+ kobject_put(ipv4_kobject);
+ return ret;
+ }
+
+ return 0;
+}
+
+subsys_initcall(sysfs_ipv4_init);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9de77d946f5a..ee9b3ac15efc 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -538,6 +538,12 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (tp->urg_data & TCP_URG_VALID)
mask |= POLLPRI;
+ } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
+ /* Active TCP fastopen socket with defer_connect
+ * Return POLLOUT so application can call write()
+ * in order for kernel to generate SYN+data
+ */
+ mask |= POLLOUT | POLLWRNORM;
}
/* This barrier is coupled with smp_wmb() in tcp_reset() */
smp_rmb();
@@ -1079,6 +1085,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
int *copied, size_t size)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
struct sockaddr *uaddr = msg->msg_name;
int err, flags;
@@ -1096,11 +1103,26 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
tp->fastopen_req->data = msg;
tp->fastopen_req->size = size;
+ if (inet->defer_connect) {
+ err = tcp_connect(sk);
+ /* Same failure procedure as in tcp_v4/6_connect */
+ if (err) {
+ tcp_set_state(sk, TCP_CLOSE);
+ inet->inet_dport = 0;
+ sk->sk_route_caps = 0;
+ }
+ }
flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
err = __inet_stream_connect(sk->sk_socket, uaddr,
msg->msg_namelen, flags);
- *copied = tp->fastopen_req->copied;
- tcp_free_fastopen_req(tp);
+ /* fastopen_req could already be freed in __inet_stream_connect
+ * if the connection times out or gets rst
+ */
+ if (tp->fastopen_req) {
+ *copied = tp->fastopen_req->copied;
+ tcp_free_fastopen_req(tp);
+ inet->defer_connect = 0;
+ }
return err;
}
@@ -1118,7 +1140,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
lock_sock(sk);
flags = msg->msg_flags;
- if ((flags & MSG_FASTOPEN) && !tp->repair) {
+ if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
+ !tp->repair) {
err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
if (err == -EINPROGRESS && copied_syn > 0)
goto out;
@@ -2314,6 +2337,10 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_rx_dst = NULL;
tcp_saved_syn_free(tp);
+ /* Clean up fastopen related fields */
+ tcp_free_fastopen_req(tp);
+ inet->defer_connect = 0;
+
WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
if (sk->sk_frag.page) {
@@ -2688,6 +2715,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
err = -EINVAL;
}
break;
+ case TCP_FASTOPEN_CONNECT:
+ if (val > 1 || val < 0) {
+ err = -EINVAL;
+ } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
+ if (sk->sk_state == TCP_CLOSE)
+ tp->fastopen_connect = val;
+ else
+ err = -EINVAL;
+ } else {
+ err = -EOPNOTSUPP;
+ }
+ break;
case TCP_TIMESTAMP:
if (!tp->repair)
err = -EPERM;
@@ -3001,6 +3040,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = icsk->icsk_accept_queue.fastopenq.max_qlen;
break;
+ case TCP_FASTOPEN_CONNECT:
+ val = tp->fastopen_connect;
+ break;
+
case TCP_TIMESTAMP:
val = tcp_time_stamp + tp->tsoffset;
break;
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index dd2560c83a85..8ea4e9787f82 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -326,3 +326,57 @@ fastopen:
*foc = valid_foc;
return NULL;
}
+
+bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
+ struct tcp_fastopen_cookie *cookie)
+{
+ unsigned long last_syn_loss = 0;
+ int syn_loss = 0;
+
+ tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);
+
+ /* Recurring FO SYN losses: no cookie or data in SYN */
+ if (syn_loss > 1 &&
+ time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
+ cookie->len = -1;
+ return false;
+ }
+ if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
+ cookie->len = -1;
+ return true;
+ }
+ return cookie->len > 0;
+}
+
+/* This function checks if we want to defer sending SYN until the first
+ * write(). We defer under the following conditions:
+ * 1. fastopen_connect sockopt is set
+ * 2. we have a valid cookie
+ * Return value: return true if we want to defer until application writes data
+ * return false if we want to send out SYN immediately
+ */
+bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
+{
+ struct tcp_fastopen_cookie cookie = { .len = 0 };
+ struct tcp_sock *tp = tcp_sk(sk);
+ u16 mss;
+
+ if (tp->fastopen_connect && !tp->fastopen_req) {
+ if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
+ inet_sk(sk)->defer_connect = 1;
+ return true;
+ }
+
+ /* Alloc fastopen_req in order for FO option to be included
+ * in SYN
+ */
+ tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req),
+ sk->sk_allocation);
+ if (tp->fastopen_req)
+ tp->fastopen_req->cookie = cookie;
+ else
+ *err = -ENOBUFS;
+ }
+ return false;
+}
+EXPORT_SYMBOL(tcp_fastopen_defer_connect);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index dbb153c6b21a..80ac4ec65f61 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -100,6 +100,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_early_retrans __read_mostly = 3;
int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
+int sysctl_tcp_default_init_rwnd __read_mostly = TCP_INIT_CWND * 2;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1ea0c91ba994..3738778a3444 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -232,6 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
/* OK, now commit destination to socket. */
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->dst);
+ rt = NULL;
if (!tp->write_seq && likely(!tp->repair))
tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
@@ -241,9 +242,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_id = tp->write_seq ^ jiffies;
+ if (tcp_fastopen_defer_connect(sk, &err))
+ return err;
+ if (err)
+ goto failure;
+
err = tcp_connect(sk);
- rt = NULL;
if (err)
goto failure;
@@ -695,6 +700,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
offsetof(struct inet_timewait_sock, tw_bound_dev_if));
arg.tos = ip_hdr(skb)->tos;
+ arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -715,7 +721,7 @@ out:
outside socket context is ugly, certainly. What can I do?
*/
-static void tcp_v4_send_ack(struct net *net,
+static void tcp_v4_send_ack(const struct sock *sk,
struct sk_buff *skb, u32 seq, u32 ack,
u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key,
@@ -730,6 +736,7 @@ static void tcp_v4_send_ack(struct net *net,
#endif
];
} rep;
+ struct net *net = sock_net(sk);
struct ip_reply_arg arg;
memset(&rep.th, 0, sizeof(struct tcphdr));
@@ -779,6 +786,7 @@ static void tcp_v4_send_ack(struct net *net,
if (oif)
arg.bound_dev_if = oif;
arg.tos = tos;
+ arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -794,7 +802,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
- tcp_v4_send_ack(sock_net(sk), skb,
+ tcp_v4_send_ack(sk, skb,
tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp + tcptw->tw_ts_offset,
@@ -822,7 +830,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
* exception of <SYN> segments, MUST be right-shifted by
* Rcv.Wind.Shift bits:
*/
- tcp_v4_send_ack(sock_net(sk), skb, seq,
+ tcp_v4_send_ack(sk, skb, seq,
tcp_rsk(req)->rcv_nxt,
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
tcp_time_stamp,
@@ -1676,7 +1684,9 @@ process:
*/
sock_hold(sk);
refcounted = true;
- nsk = tcp_check_req(sk, skb, req, false);
+ nsk = NULL;
+ if (!tcp_filter(sk, skb))
+ nsk = tcp_check_req(sk, skb, req, false);
if (!nsk) {
reqsk_put(req);
goto discard_and_relse;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6f35cdd5f2f0..8d31f60f22ed 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -193,7 +193,7 @@ u32 tcp_default_init_rwnd(u32 mss)
* (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
* limit when mss is larger than 1460.
*/
- u32 init_rwnd = TCP_INIT_CWND * 2;
+ u32 init_rwnd = sysctl_tcp_default_init_rwnd;
if (mss > 1460)
init_rwnd = max((1460 * init_rwnd) / mss, 2U);
@@ -3297,23 +3297,11 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
- int syn_loss = 0, space, err = 0;
- unsigned long last_syn_loss = 0;
+ int space, err = 0;
struct sk_buff *syn_data;
tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
- tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
- &syn_loss, &last_syn_loss);
- /* Recurring FO SYN losses: revert to regular handshake temporarily */
- if (syn_loss > 1 &&
- time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
- fo->cookie.len = -1;
- goto fallback;
- }
-
- if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
- fo->cookie.len = -1;
- else if (fo->cookie.len <= 0)
+ if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
goto fallback;
/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5d4b5e0f6b5e..bff17458e49b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1018,7 +1018,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
flow_flags,
- faddr, saddr, dport, inet->inet_sport);
+ faddr, saddr, dport, inet->inet_sport,
+ sk->sk_uid);
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 622e158a6fc4..e62d76c97e7f 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -22,7 +22,8 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
int tos, int oif,
const xfrm_address_t *saddr,
- const xfrm_address_t *daddr)
+ const xfrm_address_t *daddr,
+ u32 mark)
{
struct rtable *rt;
@@ -30,6 +31,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
fl4->daddr = daddr->a4;
fl4->flowi4_tos = tos;
fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif);
+ fl4->flowi4_mark = mark;
if (saddr)
fl4->saddr = saddr->a4;
@@ -44,20 +46,22 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif,
const xfrm_address_t *saddr,
- const xfrm_address_t *daddr)
+ const xfrm_address_t *daddr,
+ u32 mark)
{
struct flowi4 fl4;
- return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr);
+ return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr, mark);
}
static int xfrm4_get_saddr(struct net *net, int oif,
- xfrm_address_t *saddr, xfrm_address_t *daddr)
+ xfrm_address_t *saddr, xfrm_address_t *daddr,
+ u32 mark)
{
struct dst_entry *dst;
struct flowi4 fl4;
- dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr);
+ dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr, mark);
if (IS_ERR(dst))
return -EHOSTUNREACH;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8f79f0414bc3..40c3f6a157ed 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -223,9 +223,11 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.accept_ra_rtr_pref = 1,
.rtr_probe_interval = 60 * HZ,
#ifdef CONFIG_IPV6_ROUTE_INFO
+ .accept_ra_rt_info_min_plen = 0,
.accept_ra_rt_info_max_plen = 0,
#endif
#endif
+ .accept_ra_rt_table = 0,
.proxy_ndp = 0,
.accept_source_route = 0, /* we do not accept RH0 by default. */
.disable_ipv6 = 0,
@@ -269,9 +271,11 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.accept_ra_rtr_pref = 1,
.rtr_probe_interval = 60 * HZ,
#ifdef CONFIG_IPV6_ROUTE_INFO
+ .accept_ra_rt_info_min_plen = 0,
.accept_ra_rt_info_max_plen = 0,
#endif
#endif
+ .accept_ra_rt_table = 0,
.proxy_ndp = 0,
.accept_source_route = 0, /* we do not accept RH0 by default. */
.disable_ipv6 = 0,
@@ -2203,6 +2207,31 @@ static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad
ipv6_regen_rndid(idev);
}
+u32 addrconf_rt_table(const struct net_device *dev, u32 default_table) {
+ /* Determines into what table to put autoconf PIO/RIO/default routes
+ * learned on this device.
+ *
+ * - If 0, use the same table for every device. This puts routes into
+ * one of RT_TABLE_{PREFIX,INFO,DFLT} depending on the type of route
+ * (but note that these three are currently all equal to
+ * RT6_TABLE_MAIN).
+ * - If > 0, use the specified table.
+ * - If < 0, put routes into table dev->ifindex + (-rt_table).
+ */
+ struct inet6_dev *idev = in6_dev_get(dev);
+ u32 table;
+ int sysctl = idev->cnf.accept_ra_rt_table;
+ if (sysctl == 0) {
+ table = default_table;
+ } else if (sysctl > 0) {
+ table = (u32) sysctl;
+ } else {
+ table = (unsigned) dev->ifindex + (-sysctl);
+ }
+ in6_dev_put(idev);
+ return table;
+}
+
/*
* Add prefix route.
*/
@@ -2212,7 +2241,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
unsigned long expires, u32 flags)
{
struct fib6_config cfg = {
- .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX,
+ .fc_table = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_PREFIX),
.fc_metric = IP6_RT_PRIO_ADDRCONF,
.fc_ifindex = dev->ifindex,
.fc_expires = expires,
@@ -2245,7 +2274,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
struct fib6_node *fn;
struct rt6_info *rt = NULL;
struct fib6_table *table;
- u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX;
+ u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_PREFIX);
table = fib6_get_table(dev_net(dev), tb_id);
if (!table)
@@ -4957,9 +4986,11 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_RTR_PROBE_INTERVAL] =
jiffies_to_msecs(cnf->rtr_probe_interval);
#ifdef CONFIG_IPV6_ROUTE_INFO
+ array[DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN] = cnf->accept_ra_rt_info_min_plen;
array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen;
#endif
#endif
+ array[DEVCONF_ACCEPT_RA_RT_TABLE] = cnf->accept_ra_rt_table;
array[DEVCONF_PROXY_NDP] = cnf->proxy_ndp;
array[DEVCONF_ACCEPT_SOURCE_ROUTE] = cnf->accept_source_route;
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
@@ -5932,6 +5963,13 @@ static const struct ctl_table addrconf_sysctl[] = {
},
#ifdef CONFIG_IPV6_ROUTE_INFO
{
+ .procname = "accept_ra_rt_info_min_plen",
+ .data = &ipv6_devconf.accept_ra_rt_info_min_plen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "accept_ra_rt_info_max_plen",
.data = &ipv6_devconf.accept_ra_rt_info_max_plen,
.maxlen = sizeof(int),
@@ -5941,6 +5979,13 @@ static const struct ctl_table addrconf_sysctl[] = {
#endif
#endif
{
+ .procname = "accept_ra_rt_table",
+ .data = &ipv6_devconf.accept_ra_rt_table,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "proxy_ndp",
.data = &ipv6_devconf.proxy_ndp,
.maxlen = sizeof(int),
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index f7b425615c12..c789a3eef0e8 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -65,6 +65,20 @@
#include <asm/uaccess.h>
#include <linux/mroute6.h>
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+#include <linux/android_aid.h>
+
+static inline int current_has_network(void)
+{
+ return in_egroup_p(AID_INET) || capable(CAP_NET_RAW);
+}
+#else
+static inline int current_has_network(void)
+{
+ return 1;
+}
+#endif
+
#include "ip6_offload.h"
MODULE_AUTHOR("Cast of dozens");
@@ -121,6 +135,9 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
if (protocol < 0 || protocol >= IPPROTO_MAX)
return -EINVAL;
+ if (!current_has_network())
+ return -EACCES;
+
/* Look for the requested type/protocol pair. */
lookup_protocol:
err = -ESOCKTNOSUPPORT;
@@ -167,8 +184,7 @@ lookup_protocol:
}
err = -EPERM;
- if (sock->type == SOCK_RAW && !kern &&
- !ns_capable(net->user_ns, CAP_NET_RAW))
+ if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
goto out_rcu_unlock;
sock->ops = answer->ops;
@@ -680,6 +696,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
fl6.flowi6_mark = sk->sk_mark;
fl6.fl6_dport = inet->inet_dport;
fl6.fl6_sport = inet->inet_sport;
+ fl6.flowi6_uid = sk->sk_uid;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
rcu_read_lock();
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 0edc44cb254e..e742c4deb13d 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -666,9 +666,10 @@ static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 2d3c8fe27583..988b1c8dea5b 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -54,6 +54,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk)
fl6->fl6_dport = inet->inet_dport;
fl6->fl6_sport = inet->inet_sport;
fl6->flowlabel = np->flow_label;
+ fl6->flowi6_uid = sk->sk_uid;
if (!fl6->flowi6_oif)
fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 6a924be66e37..44a2010e2076 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -478,9 +478,10 @@ static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index 305e2ed730bf..477692f80f0d 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -166,15 +166,15 @@ EXPORT_SYMBOL_GPL(ipv6_find_tlv);
* to explore inner IPv6 header, eg. ICMPv6 error messages.
*
* If target header is found, its offset is set in *offset and return protocol
- * number. Otherwise, return -1.
+ * number. Otherwise, return -ENOENT or -EBADMSG.
*
* If the first fragment doesn't contain the final protocol header or
* NEXTHDR_NONE it is considered invalid.
*
* Note that non-1st fragment is special case that "the protocol number
* of last header" is "next header" field in Fragment header. In this case,
- * *offset is meaningless and fragment offset is stored in *fragoff if fragoff
- * isn't NULL.
+ * *offset is meaningless. If fragoff is not NULL, the fragment offset is
+ * stored in *fragoff; if it is NULL, return -EINVAL.
*
* if flags is not NULL and it's a fragment, then the frag flag
* IP6_FH_F_FRAG will be set. If it's an AH header, the
@@ -253,9 +253,12 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
if (target < 0 &&
((!ipv6_ext_hdr(hp->nexthdr)) ||
hp->nexthdr == NEXTHDR_NONE)) {
- if (fragoff)
+ if (fragoff) {
*fragoff = _frag_off;
- return hp->nexthdr;
+ return hp->nexthdr;
+ } else {
+ return -EINVAL;
+ }
}
if (!found)
return -ENOENT;
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 2772004ba5a1..17fa28f7a0ff 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -92,9 +92,10 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
struct net *net = dev_net(skb->dev);
if (type == ICMPV6_PKT_TOOBIG)
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
else if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
if (!(type & ICMPV6_INFOMSG_MASK))
if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST)
@@ -486,6 +487,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
fl6.flowi6_oif = iif;
fl6.fl6_icmp_type = type;
fl6.fl6_icmp_code = code;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
sk = icmpv6_xmit_lock(net);
@@ -660,6 +662,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
fl6.flowi6_oif = skb->dev->ifindex;
fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
fl6.flowi6_mark = mark;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
sk = icmpv6_xmit_lock(net);
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 798a0950e9a6..10d1deb30547 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -88,6 +88,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk,
fl6->flowi6_mark = ireq->ir_mark;
fl6->fl6_dport = ireq->ir_rmt_port;
fl6->fl6_sport = htons(ireq->ir_num);
+ fl6->flowi6_uid = sk->sk_uid;
security_req_classify_flow(req, flowi6_to_flowi(fl6));
dst = ip6_dst_lookup_flow(sk, fl6, final_p);
@@ -136,6 +137,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
fl6->flowi6_mark = sk->sk_mark;
fl6->fl6_sport = inet->inet_sport;
fl6->fl6_dport = inet->inet_dport;
+ fl6->flowi6_uid = sk->sk_uid;
security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
rcu_read_lock();
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 5da864997495..7ba9a6ebf19d 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -914,6 +914,7 @@ add:
fn->fn_flags |= RTN_RTINFO;
}
nsiblings = iter->rt6i_nsiblings;
+ iter->rt6i_node = NULL;
fib6_purge_rt(iter, fn, info->nl_net);
if (fn->rr_ptr == iter)
fn->rr_ptr = NULL;
@@ -928,6 +929,7 @@ add:
break;
if (rt6_qualify_for_ecmp(iter)) {
*ins = iter->dst.rt6_next;
+ iter->rt6i_node = NULL;
fib6_purge_rt(iter, fn, info->nl_net);
if (fn->rr_ptr == iter)
fn->rr_ptr = NULL;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index caee5530ae2c..a88aff02b579 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -562,6 +562,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
if (err)
return -1;
@@ -621,6 +623,8 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)))
return -1;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b723987761be..f8f64e0dcba3 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -39,6 +39,7 @@
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
@@ -66,9 +67,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
struct in6_addr *nexthop;
int ret;
- skb->protocol = htons(ETH_P_IPV6);
- skb->dev = dev;
-
if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
@@ -131,6 +129,14 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ if (ret) {
+ kfree_skb(skb);
+ return ret;
+ }
+
if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
dst_allfrag(skb_dst(skb)) ||
(IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
@@ -144,6 +150,9 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
struct net_device *dev = skb_dst(skb)->dev;
struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->dev = dev;
+
if (unlikely(idev->cnf.disable_ipv6)) {
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
kfree_skb(skb);
@@ -866,7 +875,6 @@ fail_toobig:
if (skb->sk && dst_allfrag(skb_dst(skb)))
sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
- skb->dev = skb_dst(skb)->dev;
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
err = -EMSGSIZE;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index f89516d04150..8855e89d207b 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1275,6 +1275,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
fl6.flowi6_mark = skb->mark;
}
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
return -1;
@@ -1362,6 +1364,8 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
fl6.flowi6_mark = skb->mark;
}
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
return -1;
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index c2b2ee71fc6c..c88505dad64c 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -618,9 +618,10 @@ static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 1b9316e1386a..54d165b9845a 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -74,9 +74,10 @@ static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 505d048ffff5..14750ba527cf 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1395,6 +1395,8 @@ skip_linkparms:
if (ri->prefix_len == 0 &&
!in6_dev->cnf.accept_ra_defrtr)
continue;
+ if (ri->prefix_len < in6_dev->cnf.accept_ra_rt_info_min_plen)
+ continue;
if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen)
continue;
rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3,
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index d11c46833d61..39970e212ad5 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -26,6 +26,7 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
struct flowi6 fl6 = {
.flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
.flowi6_mark = skb->mark,
+ .flowi6_uid = sock_net_uid(net, skb->sk),
.daddr = iph->daddr,
.saddr = iph->saddr,
};
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 982868193dbb..2a965d4c2421 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -113,6 +113,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.daddr = *daddr;
fl6.flowi6_oif = oif;
fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_uid = sk->sk_uid;
fl6.fl6_icmp_type = user_icmph.icmp6_type;
fl6.fl6_icmp_code = user_icmph.icmp6_code;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index a4f979ff31b9..55d284a2e266 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -791,6 +791,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_uid = sk->sk_uid;
ipc6.hlimit = -1;
ipc6.tclass = -1;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b0a72677b7e5..52892b871bdc 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1417,7 +1417,7 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
}
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
- int oif, u32 mark)
+ int oif, u32 mark, kuid_t uid)
{
const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
struct dst_entry *dst;
@@ -1429,6 +1429,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
fl6.daddr = iph->daddr;
fl6.saddr = iph->saddr;
fl6.flowlabel = ip6_flowinfo(iph);
+ fl6.flowi6_uid = uid;
dst = ip6_route_output(net, NULL, &fl6);
if (!dst->error)
@@ -1445,7 +1446,7 @@ void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
if (!oif && skb->dev)
oif = l3mdev_master_ifindex(skb->dev);
- ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark);
+ ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
dst = __sk_dst_get(sk);
if (!dst || !dst->obsolete ||
@@ -1537,7 +1538,8 @@ static struct dst_entry *ip6_route_redirect(struct net *net,
flags, __ip6_route_redirect);
}
-void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
+void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
+ kuid_t uid)
{
const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
struct dst_entry *dst;
@@ -1550,6 +1552,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
fl6.daddr = iph->daddr;
fl6.saddr = iph->saddr;
fl6.flowlabel = ip6_flowinfo(iph);
+ fl6.flowi6_uid = uid;
dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
rt6_do_redirect(dst, NULL, skb);
@@ -1571,6 +1574,7 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
fl6.flowi6_mark = mark;
fl6.daddr = msg->dest;
fl6.saddr = iph->daddr;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
dst = ip6_route_redirect(net, &fl6, &iph->saddr);
rt6_do_redirect(dst, NULL, skb);
@@ -1579,7 +1583,8 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
- ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
+ ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
+ sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
@@ -2354,8 +2359,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
const struct in6_addr *gwaddr,
struct net_device *dev)
{
- u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
- int ifindex = dev->ifindex;
+ u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_INFO);
struct fib6_node *fn;
struct rt6_info *rt = NULL;
struct fib6_table *table;
@@ -2370,7 +2374,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
goto out;
for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
- if (rt->dst.dev->ifindex != ifindex)
+ if (rt->dst.dev->ifindex != dev->ifindex)
continue;
if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
continue;
@@ -2401,7 +2405,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
.fc_nlinfo.nl_net = net,
};
- cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
+ cfg.fc_table = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_INFO),
cfg.fc_dst = *prefix;
cfg.fc_gateway = *gwaddr;
@@ -2417,7 +2421,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
- u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
+ u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_MAIN);
struct rt6_info *rt;
struct fib6_table *table;
@@ -2443,7 +2447,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
unsigned int pref)
{
struct fib6_config cfg = {
- .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
+ .fc_table = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_DFLT),
.fc_metric = IP6_RT_PRIO_USER,
.fc_ifindex = dev->ifindex,
.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
@@ -2466,43 +2470,16 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
return rt6_get_dflt_router(gwaddr, dev);
}
-static void __rt6_purge_dflt_routers(struct fib6_table *table)
-{
- struct rt6_info *rt;
-
-restart:
- read_lock_bh(&table->tb6_lock);
- for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
- if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
- (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
- dst_hold(&rt->dst);
- read_unlock_bh(&table->tb6_lock);
- ip6_del_rt(rt);
- goto restart;
- }
- }
- read_unlock_bh(&table->tb6_lock);
-
- table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
+int rt6_addrconf_purge(struct rt6_info *rt, void *arg) {
+ if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
+ (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2))
+ return -1;
+ return 0;
}
void rt6_purge_dflt_routers(struct net *net)
{
- struct fib6_table *table;
- struct hlist_head *head;
- unsigned int h;
-
- rcu_read_lock();
-
- for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
- head = &net->ipv6.fib_table_hash[h];
- hlist_for_each_entry_rcu(table, head, tb6_hlist) {
- if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
- __rt6_purge_dflt_routers(table);
- }
- }
-
- rcu_read_unlock();
+ fib6_clean_all(net, rt6_addrconf_purge, NULL);
}
static void rtmsg_to_fib6_config(struct net *net,
@@ -2821,6 +2798,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_ENCAP_TYPE] = { .type = NLA_U16 },
[RTA_ENCAP] = { .type = NLA_NESTED },
[RTA_EXPIRES] = { .type = NLA_U32 },
+ [RTA_UID] = { .type = NLA_U32 },
[RTA_TABLE] = { .type = NLA_U32 },
};
@@ -3401,6 +3379,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
if (tb[RTA_MARK])
fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
+ if (tb[RTA_UID])
+ fl6.flowi6_uid = make_kuid(current_user_ns(),
+ nla_get_u32(tb[RTA_UID]));
+ else
+ fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
+
if (iif) {
struct net_device *dev;
int flags = 0;
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 7a86433d8896..a67174e0b094 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -228,6 +228,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
fl6.flowi6_mark = ireq->ir_mark;
fl6.fl6_dport = ireq->ir_rmt_port;
fl6.fl6_sport = inet_sk(sk)->inet_sport;
+ fl6.flowi6_uid = sk->sk_uid;
security_req_classify_flow(req, flowi6_to_flowi(&fl6));
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0a69d39880f2..cb0813cda355 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -238,6 +238,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
fl6.flowi6_mark = sk->sk_mark;
fl6.fl6_dport = usin->sin6_port;
fl6.fl6_sport = inet->inet_sport;
+ fl6.flowi6_uid = sk->sk_uid;
opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
final_p = fl6_update_dst(&fl6, opt, &final);
@@ -289,6 +290,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
inet->inet_sport,
inet->inet_dport);
+ if (tcp_fastopen_defer_connect(sk, &err))
+ return err;
+ if (err)
+ goto late_failure;
+
err = tcp_connect(sk);
if (err)
goto late_failure;
@@ -297,7 +303,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
late_failure:
tcp_set_state(sk, TCP_CLOSE);
- __sk_dst_reset(sk);
failure:
inet->inet_dport = 0;
sk->sk_route_caps = 0;
@@ -835,6 +840,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
fl6.fl6_dport = t1->dest;
fl6.fl6_sport = t1->source;
+ fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
/* Pass a socket to ip6_dst_lookup either it is for RST
@@ -1240,9 +1246,6 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
if (skb->protocol == htons(ETH_P_IP))
return tcp_v4_do_rcv(sk, skb);
- if (tcp_filter(sk, skb))
- goto discard;
-
/*
* socket locking is here for SMP purposes as backlog rcv
* is currently called with bh processing disabled.
@@ -1443,7 +1446,9 @@ process:
}
sock_hold(sk);
refcounted = true;
- nsk = tcp_check_req(sk, skb, req, false);
+ nsk = NULL;
+ if (!tcp_filter(sk, skb))
+ nsk = tcp_check_req(sk, skb, req, false);
if (!nsk) {
reqsk_put(req);
goto discard_and_relse;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4db5f541bca6..c43ef0cdf8cb 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1162,6 +1162,7 @@ do_udp_sendmsg:
fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_uid = sk->sk_uid;
if (msg->msg_controllen) {
opt = &opt_space;
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 0c7f27a1725f..d82f42799da5 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -29,7 +29,8 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo;
static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
const xfrm_address_t *saddr,
- const xfrm_address_t *daddr)
+ const xfrm_address_t *daddr,
+ u32 mark)
{
struct flowi6 fl6;
struct dst_entry *dst;
@@ -38,6 +39,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_oif = l3mdev_master_ifindex_by_index(net, oif);
fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF;
+ fl6.flowi6_mark = mark;
memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr));
if (saddr)
memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr));
@@ -54,12 +56,13 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
}
static int xfrm6_get_saddr(struct net *net, int oif,
- xfrm_address_t *saddr, xfrm_address_t *daddr)
+ xfrm_address_t *saddr, xfrm_address_t *daddr,
+ u32 mark)
{
struct dst_entry *dst;
struct net_device *dev;
- dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr);
+ dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr, mark);
if (IS_ERR(dst))
return -EHOSTUNREACH;
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 3ba903ff2bb0..06501ed01ab8 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1390,7 +1390,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_
}
if (!x)
- x = xfrm_find_acq(net, &dummy_mark, mode, reqid, proto, xdaddr, xsaddr, 1, family);
+ x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, proto, xdaddr, xsaddr, 1, family);
if (x == NULL)
return -ENOENT;
@@ -2425,7 +2425,7 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa
return err;
}
- xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN,
+ xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, 0, XFRM_POLICY_TYPE_MAIN,
pol->sadb_x_policy_dir - 1, &sel, pol_ctx,
1, &err);
security_xfrm_policy_free(pol_ctx);
@@ -2676,7 +2676,7 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_
return -EINVAL;
delete = (hdr->sadb_msg_type == SADB_X_SPDDELETE2);
- xp = xfrm_policy_byid(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN,
+ xp = xfrm_policy_byid(net, DUMMY_MARK, 0, XFRM_POLICY_TYPE_MAIN,
dir, pol->sadb_x_policy_id, delete, &err);
if (xp == NULL)
return -ENOENT;
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 247097289fd0..86ad51a039c4 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -526,6 +526,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_uid = sk->sk_uid;
ipc6.hlimit = -1;
ipc6.tclass = -1;
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e8d56d9a4df2..177d3ae5fda8 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -1316,6 +1316,8 @@ config NETFILTER_XT_MATCH_OWNER
based on who created the socket: the user or group. It is also
possible to check whether a socket actually exists.
+ Conflicts with '"quota, tag, uid" match'
+
config NETFILTER_XT_MATCH_POLICY
tristate 'IPsec "policy" match support'
depends on XFRM
@@ -1349,6 +1351,22 @@ config NETFILTER_XT_MATCH_PKTTYPE
To compile it as a module, choose M here. If unsure, say N.
+config NETFILTER_XT_MATCH_QTAGUID
+ bool '"quota, tag, owner" match and stats support'
+ depends on NETFILTER_XT_MATCH_SOCKET
+ depends on NETFILTER_XT_MATCH_OWNER=n
+ help
+ This option replaces the `owner' match. In addition to matching
+ on uid, it keeps stats based on a tag assigned to a socket.
+ The full tag is comprised of a UID and an accounting tag.
+ The tags are assignable to sockets from user space (e.g. a download
+ manager can assign the socket to another UID for accounting).
+ Stats and control are done via /proc/net/xt_qtaguid/.
+ It replaces owner as it takes the same arguments, but should
+ really be recognized by the iptables tool.
+
+ If unsure, say `N'.
+
config NETFILTER_XT_MATCH_QUOTA
tristate '"quota" match support'
depends on NETFILTER_ADVANCED
@@ -1359,6 +1377,29 @@ config NETFILTER_XT_MATCH_QUOTA
If you want to compile it as a module, say M here and read
<file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+config NETFILTER_XT_MATCH_QUOTA2
+ tristate '"quota2" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `quota2' match, which allows to match on a
+ byte counter correctly and not per CPU.
+ It allows naming the quotas.
+ This is based on http://xtables-addons.git.sourceforge.net
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_QUOTA2_LOG
+ bool '"quota2" Netfilter LOG support'
+ depends on NETFILTER_XT_MATCH_QUOTA2
+ default n
+ help
+ This option allows `quota2' to log ONCE when a quota limit
+ is passed. It logs via NETLINK using the NETLINK_NFLOG family.
+ It logs similarly to how ipt_ULOG would without data.
+
+ If unsure, say `N'.
+
config NETFILTER_XT_MATCH_RATEEST
tristate '"rateest" match support'
depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index c23c3c84416f..54ba5aa1f9bf 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -167,7 +167,9 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_CGROUP) += xt_cgroup.o
obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o
obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o
obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_QTAGUID) += xt_qtaguid_print.o xt_qtaguid.o
obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA2) += xt_quota2.o
obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o
obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o
obj-$(CONFIG_NETFILTER_XT_MATCH_RECENT) += xt_recent.o
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 19b3f4fbea52..5fd310b7b7c1 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1026,7 +1026,7 @@ static void gc_worker(struct work_struct *work)
next_run = gc_work->next_gc_run;
gc_work->last_bucket = i;
- queue_delayed_work(system_long_wq, &gc_work->dwork, next_run);
+ queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
}
static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
@@ -1816,7 +1816,7 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
return 0;
}
-int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
+int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
{
unsigned int hashsize;
int rc;
@@ -1944,7 +1944,7 @@ int nf_conntrack_init_start(void)
nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
conntrack_gc_work_init(&conntrack_gc_work);
- queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, HZ);
+ queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);
return 0;
diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c
index e84a578dbe35..d76afafdc699 100644
--- a/net/netfilter/nf_nat_ftp.c
+++ b/net/netfilter/nf_nat_ftp.c
@@ -134,7 +134,7 @@ static int __init nf_nat_ftp_init(void)
}
/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
-static int warn_set(const char *val, struct kernel_param *kp)
+static int warn_set(const char *val, const struct kernel_param *kp)
{
printk(KERN_INFO KBUILD_MODNAME
": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c
index 1fb2258c3535..8039bcd60758 100644
--- a/net/netfilter/nf_nat_irc.c
+++ b/net/netfilter/nf_nat_irc.c
@@ -107,7 +107,7 @@ static int __init nf_nat_irc_init(void)
}
/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
-static int warn_set(const char *val, struct kernel_param *kp)
+static int warn_set(const char *val, const struct kernel_param *kp)
{
printk(KERN_INFO KBUILD_MODNAME
": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index 921c9bd7e1e7..44c42a7fa7bc 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -5,6 +5,7 @@
* After timer expires a kevent will be sent.
*
* Copyright (C) 2004, 2010 Nokia Corporation
+ *
* Written by Timo Teras <ext-timo.teras@nokia.com>
*
* Converted to x_tables and reworked for upstream inclusion
@@ -38,8 +39,17 @@
#include <linux/netfilter/xt_IDLETIMER.h>
#include <linux/kdev_t.h>
#include <linux/kobject.h>
+#include <linux/skbuff.h>
#include <linux/workqueue.h>
#include <linux/sysfs.h>
+#include <linux/rtc.h>
+#include <linux/time.h>
+#include <linux/math64.h>
+#include <linux/suspend.h>
+#include <linux/notifier.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
struct idletimer_tg_attr {
struct attribute attr;
@@ -55,14 +65,110 @@ struct idletimer_tg {
struct kobject *kobj;
struct idletimer_tg_attr attr;
+ struct timespec delayed_timer_trigger;
+ struct timespec last_modified_timer;
+ struct timespec last_suspend_time;
+ struct notifier_block pm_nb;
+
+ int timeout;
unsigned int refcnt;
+ bool work_pending;
+ bool send_nl_msg;
+ bool active;
+ uid_t uid;
};
static LIST_HEAD(idletimer_tg_list);
static DEFINE_MUTEX(list_mutex);
+static DEFINE_SPINLOCK(timestamp_lock);
static struct kobject *idletimer_tg_kobj;
+static bool check_for_delayed_trigger(struct idletimer_tg *timer,
+ struct timespec *ts)
+{
+ bool state;
+ struct timespec temp;
+ spin_lock_bh(&timestamp_lock);
+ timer->work_pending = false;
+ if ((ts->tv_sec - timer->last_modified_timer.tv_sec) > timer->timeout ||
+ timer->delayed_timer_trigger.tv_sec != 0) {
+ state = false;
+ temp.tv_sec = timer->timeout;
+ temp.tv_nsec = 0;
+ if (timer->delayed_timer_trigger.tv_sec != 0) {
+ temp = timespec_add(timer->delayed_timer_trigger, temp);
+ ts->tv_sec = temp.tv_sec;
+ ts->tv_nsec = temp.tv_nsec;
+ timer->delayed_timer_trigger.tv_sec = 0;
+ timer->work_pending = true;
+ schedule_work(&timer->work);
+ } else {
+ temp = timespec_add(timer->last_modified_timer, temp);
+ ts->tv_sec = temp.tv_sec;
+ ts->tv_nsec = temp.tv_nsec;
+ }
+ } else {
+ state = timer->active;
+ }
+ spin_unlock_bh(&timestamp_lock);
+ return state;
+}
+
+static void notify_netlink_uevent(const char *iface, struct idletimer_tg *timer)
+{
+ char iface_msg[NLMSG_MAX_SIZE];
+ char state_msg[NLMSG_MAX_SIZE];
+ char timestamp_msg[NLMSG_MAX_SIZE];
+ char uid_msg[NLMSG_MAX_SIZE];
+ char *envp[] = { iface_msg, state_msg, timestamp_msg, uid_msg, NULL };
+ int res;
+ struct timespec ts;
+ uint64_t time_ns;
+ bool state;
+
+ res = snprintf(iface_msg, NLMSG_MAX_SIZE, "INTERFACE=%s",
+ iface);
+ if (NLMSG_MAX_SIZE <= res) {
+ pr_err("message too long (%d)", res);
+ return;
+ }
+
+ get_monotonic_boottime(&ts);
+ state = check_for_delayed_trigger(timer, &ts);
+ res = snprintf(state_msg, NLMSG_MAX_SIZE, "STATE=%s",
+ state ? "active" : "inactive");
+
+ if (NLMSG_MAX_SIZE <= res) {
+ pr_err("message too long (%d)", res);
+ return;
+ }
+
+ if (state) {
+ res = snprintf(uid_msg, NLMSG_MAX_SIZE, "UID=%u", timer->uid);
+ if (NLMSG_MAX_SIZE <= res)
+ pr_err("message too long (%d)", res);
+ } else {
+ res = snprintf(uid_msg, NLMSG_MAX_SIZE, "UID=");
+ if (NLMSG_MAX_SIZE <= res)
+ pr_err("message too long (%d)", res);
+ }
+
+ time_ns = timespec_to_ns(&ts);
+ res = snprintf(timestamp_msg, NLMSG_MAX_SIZE, "TIME_NS=%llu", time_ns);
+ if (NLMSG_MAX_SIZE <= res) {
+ timestamp_msg[0] = '\0';
+ pr_err("message too long (%d)", res);
+ }
+
+ pr_debug("putting nlmsg: <%s> <%s> <%s> <%s>\n", iface_msg, state_msg,
+ timestamp_msg, uid_msg);
+ kobject_uevent_env(idletimer_tg_kobj, KOBJ_CHANGE, envp);
+ return;
+
+
+}
+
static
struct idletimer_tg *__idletimer_tg_find_by_label(const char *label)
{
@@ -83,6 +189,7 @@ static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr,
{
struct idletimer_tg *timer;
unsigned long expires = 0;
+ unsigned long now = jiffies;
mutex_lock(&list_mutex);
@@ -92,11 +199,15 @@ static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr,
mutex_unlock(&list_mutex);
- if (time_after(expires, jiffies))
+ if (time_after(expires, now))
return sprintf(buf, "%u\n",
- jiffies_to_msecs(expires - jiffies) / 1000);
+ jiffies_to_msecs(expires - now) / 1000);
- return sprintf(buf, "0\n");
+ if (timer->send_nl_msg)
+ return sprintf(buf, "0 %d\n",
+ jiffies_to_msecs(now - expires) / 1000);
+ else
+ return sprintf(buf, "0\n");
}
static void idletimer_tg_work(struct work_struct *work)
@@ -105,6 +216,9 @@ static void idletimer_tg_work(struct work_struct *work)
work);
sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name);
+
+ if (timer->send_nl_msg)
+ notify_netlink_uevent(timer->attr.attr.name, timer);
}
static void idletimer_tg_expired(unsigned long data)
@@ -112,8 +226,55 @@ static void idletimer_tg_expired(unsigned long data)
struct idletimer_tg *timer = (struct idletimer_tg *) data;
pr_debug("timer %s expired\n", timer->attr.attr.name);
-
+ spin_lock_bh(&timestamp_lock);
+ timer->active = false;
+ timer->work_pending = true;
schedule_work(&timer->work);
+ spin_unlock_bh(&timestamp_lock);
+}
+
+static int idletimer_resume(struct notifier_block *notifier,
+ unsigned long pm_event, void *unused)
+{
+ struct timespec ts;
+ unsigned long time_diff, now = jiffies;
+ struct idletimer_tg *timer = container_of(notifier,
+ struct idletimer_tg, pm_nb);
+ if (!timer)
+ return NOTIFY_DONE;
+ switch (pm_event) {
+ case PM_SUSPEND_PREPARE:
+ get_monotonic_boottime(&timer->last_suspend_time);
+ break;
+ case PM_POST_SUSPEND:
+ spin_lock_bh(&timestamp_lock);
+ if (!timer->active) {
+ spin_unlock_bh(&timestamp_lock);
+ break;
+ }
+ /* since jiffies are not updated when suspended now represents
+ * the time it would have suspended */
+ if (time_after(timer->timer.expires, now)) {
+ get_monotonic_boottime(&ts);
+ ts = timespec_sub(ts, timer->last_suspend_time);
+ time_diff = timespec_to_jiffies(&ts);
+ if (timer->timer.expires > (time_diff + now)) {
+ mod_timer_pending(&timer->timer,
+ (timer->timer.expires - time_diff));
+ } else {
+ del_timer(&timer->timer);
+ timer->timer.expires = 0;
+ timer->active = false;
+ timer->work_pending = true;
+ schedule_work(&timer->work);
+ }
+ }
+ spin_unlock_bh(&timestamp_lock);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
}
static int idletimer_check_sysfs_name(const char *name, unsigned int size)
@@ -166,6 +327,21 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
setup_timer(&info->timer->timer, idletimer_tg_expired,
(unsigned long) info->timer);
info->timer->refcnt = 1;
+ info->timer->send_nl_msg = (info->send_nl_msg == 0) ? false : true;
+ info->timer->active = true;
+ info->timer->timeout = info->timeout;
+
+ info->timer->delayed_timer_trigger.tv_sec = 0;
+ info->timer->delayed_timer_trigger.tv_nsec = 0;
+ info->timer->work_pending = false;
+ info->timer->uid = 0;
+ get_monotonic_boottime(&info->timer->last_modified_timer);
+
+ info->timer->pm_nb.notifier_call = idletimer_resume;
+ ret = register_pm_notifier(&info->timer->pm_nb);
+ if (ret)
+ printk(KERN_WARNING "[%s] Failed to register pm notifier %d\n",
+ __func__, ret);
INIT_WORK(&info->timer->work, idletimer_tg_work);
@@ -182,6 +358,42 @@ out:
return ret;
}
+static void reset_timer(const struct idletimer_tg_info *info,
+ struct sk_buff *skb)
+{
+ unsigned long now = jiffies;
+ struct idletimer_tg *timer = info->timer;
+ bool timer_prev;
+
+ spin_lock_bh(&timestamp_lock);
+ timer_prev = timer->active;
+ timer->active = true;
+ /* timer_prev is used to guard overflow problem in time_before*/
+ if (!timer_prev || time_before(timer->timer.expires, now)) {
+ pr_debug("Starting Checkentry timer (Expired, Jiffies): %lu, %lu\n",
+ timer->timer.expires, now);
+
+ /* Stores the uid resposible for waking up the radio */
+ if (skb && (skb->sk)) {
+ timer->uid = from_kuid_munged(current_user_ns(),
+ sock_i_uid(skb_to_full_sk(skb)));
+ }
+
+ /* checks if there is a pending inactive notification*/
+ if (timer->work_pending)
+ timer->delayed_timer_trigger = timer->last_modified_timer;
+ else {
+ timer->work_pending = true;
+ schedule_work(&timer->work);
+ }
+ }
+
+ get_monotonic_boottime(&timer->last_modified_timer);
+ mod_timer(&timer->timer,
+ msecs_to_jiffies(info->timeout * 1000) + now);
+ spin_unlock_bh(&timestamp_lock);
+}
+
/*
* The actual xt_tables plugin.
*/
@@ -189,15 +401,23 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb,
const struct xt_action_param *par)
{
const struct idletimer_tg_info *info = par->targinfo;
+ unsigned long now = jiffies;
pr_debug("resetting timer %s, timeout period %u\n",
info->label, info->timeout);
BUG_ON(!info->timer);
- mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ info->timer->active = true;
+
+ if (time_before(info->timer->timer.expires, now)) {
+ schedule_work(&info->timer->work);
+ pr_debug("Starting timer %s (Expired, Jiffies): %lu, %lu\n",
+ info->label, info->timer->timer.expires, now);
+ }
+ /* TODO: Avoid modifying timers on each packet */
+ reset_timer(info, skb);
return XT_CONTINUE;
}
@@ -206,7 +426,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
struct idletimer_tg_info *info = par->targinfo;
int ret;
- pr_debug("checkentry targinfo%s\n", info->label);
+ pr_debug("checkentry targinfo %s\n", info->label);
if (info->timeout == 0) {
pr_debug("timeout value is zero\n");
@@ -228,9 +448,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
info->timer = __idletimer_tg_find_by_label(info->label);
if (info->timer) {
info->timer->refcnt++;
- mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
-
+ reset_timer(info, NULL);
pr_debug("increased refcnt of timer %s to %u\n",
info->label, info->timer->refcnt);
} else {
@@ -243,6 +461,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
}
mutex_unlock(&list_mutex);
+
return 0;
}
@@ -259,13 +478,14 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
list_del(&info->timer->entry);
del_timer_sync(&info->timer->timer);
- cancel_work_sync(&info->timer->work);
sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+ unregister_pm_notifier(&info->timer->pm_nb);
+ cancel_work_sync(&info->timer->work);
kfree(info->timer->attr.attr.name);
kfree(info->timer);
} else {
pr_debug("decreased refcnt of timer %s to %u\n",
- info->label, info->timer->refcnt);
+ info->label, info->timer->refcnt);
}
mutex_unlock(&list_mutex);
@@ -273,6 +493,7 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
static struct xt_target idletimer_tg __read_mostly = {
.name = "IDLETIMER",
+ .revision = 1,
.family = NFPROTO_UNSPEC,
.target = idletimer_tg_target,
.targetsize = sizeof(struct idletimer_tg_info),
@@ -338,3 +559,4 @@ MODULE_DESCRIPTION("Xtables: idle time monitor");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS("ipt_IDLETIMER");
MODULE_ALIAS("ip6t_IDLETIMER");
+MODULE_ALIAS("arpt_IDLETIMER");
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index dffee9d47ec4..a8df0a9d68e6 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -8,8 +8,10 @@
*/
#include <linux/module.h>
+#include <linux/syscalls.h>
#include <linux/skbuff.h>
#include <linux/filter.h>
+#include <linux/bpf.h>
#include <linux/netfilter/xt_bpf.h>
#include <linux/netfilter/x_tables.h>
@@ -20,15 +22,18 @@ MODULE_LICENSE("GPL");
MODULE_ALIAS("ipt_bpf");
MODULE_ALIAS("ip6t_bpf");
-static int bpf_mt_check(const struct xt_mtchk_param *par)
+static int __bpf_mt_check_bytecode(struct sock_filter *insns, __u16 len,
+ struct bpf_prog **ret)
{
- struct xt_bpf_info *info = par->matchinfo;
struct sock_fprog_kern program;
- program.len = info->bpf_program_num_elem;
- program.filter = info->bpf_program;
+ if (len > XT_BPF_MAX_NUM_INSTR)
+ return -EINVAL;
- if (bpf_prog_create(&info->filter, &program)) {
+ program.len = len;
+ program.filter = insns;
+
+ if (bpf_prog_create(ret, &program)) {
pr_info("bpf: check failed: parse error\n");
return -EINVAL;
}
@@ -36,6 +41,53 @@ static int bpf_mt_check(const struct xt_mtchk_param *par)
return 0;
}
+static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret)
+{
+ struct bpf_prog *prog;
+
+ prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ *ret = prog;
+ return 0;
+}
+
+static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret)
+{
+ if (strnlen(path, XT_BPF_PATH_MAX) == XT_BPF_PATH_MAX)
+ return -EINVAL;
+
+ *ret = bpf_prog_get_type_path(path, BPF_PROG_TYPE_SOCKET_FILTER);
+ return PTR_ERR_OR_ZERO(*ret);
+
+}
+
+static int bpf_mt_check(const struct xt_mtchk_param *par)
+{
+ struct xt_bpf_info *info = par->matchinfo;
+
+ return __bpf_mt_check_bytecode(info->bpf_program,
+ info->bpf_program_num_elem,
+ &info->filter);
+}
+
+static int bpf_mt_check_v1(const struct xt_mtchk_param *par)
+{
+ struct xt_bpf_info_v1 *info = par->matchinfo;
+
+ if (info->mode == XT_BPF_MODE_BYTECODE)
+ return __bpf_mt_check_bytecode(info->bpf_program,
+ info->bpf_program_num_elem,
+ &info->filter);
+ else if (info->mode == XT_BPF_MODE_FD_ELF)
+ return __bpf_mt_check_fd(info->fd, &info->filter);
+ else if (info->mode == XT_BPF_MODE_PATH_PINNED)
+ return __bpf_mt_check_path(info->path, &info->filter);
+ else
+ return -EINVAL;
+}
+
static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_bpf_info *info = par->matchinfo;
@@ -43,31 +95,58 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
return BPF_PROG_RUN(info->filter, skb);
}
+static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_bpf_info_v1 *info = par->matchinfo;
+
+ return !!bpf_prog_run_save_cb(info->filter, (struct sk_buff *) skb);
+}
+
static void bpf_mt_destroy(const struct xt_mtdtor_param *par)
{
const struct xt_bpf_info *info = par->matchinfo;
+
+ bpf_prog_destroy(info->filter);
+}
+
+static void bpf_mt_destroy_v1(const struct xt_mtdtor_param *par)
+{
+ const struct xt_bpf_info_v1 *info = par->matchinfo;
+
bpf_prog_destroy(info->filter);
}
-static struct xt_match bpf_mt_reg __read_mostly = {
- .name = "bpf",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = bpf_mt_check,
- .match = bpf_mt,
- .destroy = bpf_mt_destroy,
- .matchsize = sizeof(struct xt_bpf_info),
- .me = THIS_MODULE,
+static struct xt_match bpf_mt_reg[] __read_mostly = {
+ {
+ .name = "bpf",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = bpf_mt_check,
+ .match = bpf_mt,
+ .destroy = bpf_mt_destroy,
+ .matchsize = sizeof(struct xt_bpf_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "bpf",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = bpf_mt_check_v1,
+ .match = bpf_mt_v1,
+ .destroy = bpf_mt_destroy_v1,
+ .matchsize = sizeof(struct xt_bpf_info_v1),
+ .me = THIS_MODULE,
+ },
};
static int __init bpf_mt_init(void)
{
- return xt_register_match(&bpf_mt_reg);
+ return xt_register_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg));
}
static void __exit bpf_mt_exit(void)
{
- xt_unregister_match(&bpf_mt_reg);
+ xt_unregister_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg));
}
module_init(bpf_mt_init);
diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c
new file mode 100644
index 000000000000..d6779099d453
--- /dev/null
+++ b/net/netfilter/xt_qtaguid.c
@@ -0,0 +1,3021 @@
+/*
+ * Kernel iptables module to track stats for packets based on user tags.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * There are run-time debug flags enabled via the debug_mask module param, or
+ * via the DEFAULT_DEBUG_MASK. See xt_qtaguid_internal.h.
+ */
+#define DEBUG
+
+#include <linux/file.h>
+#include <linux/inetdevice.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_qtaguid.h>
+#include <linux/ratelimit.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
+#include <linux/workqueue.h>
+#include <net/addrconf.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#endif
+
+#include <linux/netfilter/xt_socket.h>
+#include "xt_qtaguid_internal.h"
+#include "xt_qtaguid_print.h"
+#include "../../fs/proc/internal.h"
+
+/*
+ * We only use the xt_socket funcs within a similar context to avoid unexpected
+ * return values.
+ */
+#define XT_SOCKET_SUPPORTED_HOOKS \
+ ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN))
+
+
+static const char *module_procdirname = "xt_qtaguid";
+static struct proc_dir_entry *xt_qtaguid_procdir;
+
+static unsigned int proc_iface_perms = S_IRUGO;
+module_param_named(iface_perms, proc_iface_perms, uint, S_IRUGO | S_IWUSR);
+
+static struct proc_dir_entry *xt_qtaguid_stats_file;
+static unsigned int proc_stats_perms = S_IRUGO;
+module_param_named(stats_perms, proc_stats_perms, uint, S_IRUGO | S_IWUSR);
+
+static struct proc_dir_entry *xt_qtaguid_ctrl_file;
+
+/* Everybody can write. But proc_ctrl_write_limited is true by default which
+ * limits what can be controlled. See the can_*() functions.
+ */
+static unsigned int proc_ctrl_perms = S_IRUGO | S_IWUGO;
+module_param_named(ctrl_perms, proc_ctrl_perms, uint, S_IRUGO | S_IWUSR);
+
+/* Limited by default, so the gid of the ctrl and stats proc entries
+ * will limit what can be done. See the can_*() functions.
+ */
+static bool proc_stats_readall_limited = true;
+static bool proc_ctrl_write_limited = true;
+
+module_param_named(stats_readall_limited, proc_stats_readall_limited, bool,
+ S_IRUGO | S_IWUSR);
+module_param_named(ctrl_write_limited, proc_ctrl_write_limited, bool,
+ S_IRUGO | S_IWUSR);
+
+/*
+ * Limit the number of active tags (via socket tags) for a given UID.
+ * Multiple processes could share the UID.
+ */
+static int max_sock_tags = DEFAULT_MAX_SOCK_TAGS;
+module_param(max_sock_tags, int, S_IRUGO | S_IWUSR);
+
+/*
+ * After the kernel has initiallized this module, it is still possible
+ * to make it passive.
+ * Setting passive to Y:
+ * - the iface stats handling will not act on notifications.
+ * - iptables matches will never match.
+ * - ctrl commands silently succeed.
+ * - stats are always empty.
+ * This is mostly usefull when a bug is suspected.
+ */
+static bool module_passive;
+module_param_named(passive, module_passive, bool, S_IRUGO | S_IWUSR);
+
+/*
+ * Control how qtaguid data is tracked per proc/uid.
+ * Setting tag_tracking_passive to Y:
+ * - don't create proc specific structs to track tags
+ * - don't check that active tag stats exceed some limits.
+ * - don't clean up socket tags on process exits.
+ * This is mostly usefull when a bug is suspected.
+ */
+static bool qtu_proc_handling_passive;
+module_param_named(tag_tracking_passive, qtu_proc_handling_passive, bool,
+ S_IRUGO | S_IWUSR);
+
+#define QTU_DEV_NAME "xt_qtaguid"
+
+uint qtaguid_debug_mask = DEFAULT_DEBUG_MASK;
+module_param_named(debug_mask, qtaguid_debug_mask, uint, S_IRUGO | S_IWUSR);
+
+/*---------------------------------------------------------------------------*/
+static const char *iface_stat_procdirname = "iface_stat";
+static struct proc_dir_entry *iface_stat_procdir;
+/*
+ * The iface_stat_all* will go away once userspace gets use to the new fields
+ * that have a format line.
+ */
+static const char *iface_stat_all_procfilename = "iface_stat_all";
+static struct proc_dir_entry *iface_stat_all_procfile;
+static const char *iface_stat_fmt_procfilename = "iface_stat_fmt";
+static struct proc_dir_entry *iface_stat_fmt_procfile;
+
+
+static LIST_HEAD(iface_stat_list);
+static DEFINE_SPINLOCK(iface_stat_list_lock);
+
+static struct rb_root sock_tag_tree = RB_ROOT;
+static DEFINE_SPINLOCK(sock_tag_list_lock);
+
+static struct rb_root tag_counter_set_tree = RB_ROOT;
+static DEFINE_SPINLOCK(tag_counter_set_list_lock);
+
+static struct rb_root uid_tag_data_tree = RB_ROOT;
+static DEFINE_SPINLOCK(uid_tag_data_tree_lock);
+
+static struct rb_root proc_qtu_data_tree = RB_ROOT;
+/* No proc_qtu_data_tree_lock; use uid_tag_data_tree_lock */
+
+static struct qtaguid_event_counts qtu_events;
+/*----------------------------------------------*/
+static bool can_manipulate_uids(void)
+{
+ /* root pwnd */
+ return in_egroup_p(xt_qtaguid_ctrl_file->gid)
+ || unlikely(!from_kuid(&init_user_ns, current_fsuid())) || unlikely(!proc_ctrl_write_limited)
+ || unlikely(uid_eq(current_fsuid(), xt_qtaguid_ctrl_file->uid));
+}
+
+static bool can_impersonate_uid(kuid_t uid)
+{
+ return uid_eq(uid, current_fsuid()) || can_manipulate_uids();
+}
+
+static bool can_read_other_uid_stats(kuid_t uid)
+{
+ /* root pwnd */
+ return in_egroup_p(xt_qtaguid_stats_file->gid)
+ || unlikely(!from_kuid(&init_user_ns, current_fsuid())) || uid_eq(uid, current_fsuid())
+ || unlikely(!proc_stats_readall_limited)
+ || unlikely(uid_eq(current_fsuid(), xt_qtaguid_ctrl_file->uid));
+}
+
+static inline void dc_add_byte_packets(struct data_counters *counters, int set,
+ enum ifs_tx_rx direction,
+ enum ifs_proto ifs_proto,
+ int bytes,
+ int packets)
+{
+ counters->bpc[set][direction][ifs_proto].bytes += bytes;
+ counters->bpc[set][direction][ifs_proto].packets += packets;
+}
+
+static struct tag_node *tag_node_tree_search(struct rb_root *root, tag_t tag)
+{
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct tag_node *data = rb_entry(node, struct tag_node, node);
+ int result;
+ RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): "
+ " node=%p data=%p\n", tag, node, data);
+ result = tag_compare(tag, data->tag);
+ RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): "
+ " data.tag=0x%llx (uid=%u) res=%d\n",
+ tag, data->tag, get_uid_from_tag(data->tag), result);
+ if (result < 0)
+ node = node->rb_left;
+ else if (result > 0)
+ node = node->rb_right;
+ else
+ return data;
+ }
+ return NULL;
+}
+
+static void tag_node_tree_insert(struct tag_node *data, struct rb_root *root)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+ /* Figure out where to put new node */
+ while (*new) {
+ struct tag_node *this = rb_entry(*new, struct tag_node,
+ node);
+ int result = tag_compare(data->tag, this->tag);
+ RB_DEBUG("qtaguid: %s(): tag=0x%llx"
+ " (uid=%u)\n", __func__,
+ this->tag,
+ get_uid_from_tag(this->tag));
+ parent = *new;
+ if (result < 0)
+ new = &((*new)->rb_left);
+ else if (result > 0)
+ new = &((*new)->rb_right);
+ else
+ BUG();
+ }
+
+ /* Add new node and rebalance tree. */
+ rb_link_node(&data->node, parent, new);
+ rb_insert_color(&data->node, root);
+}
+
+static void tag_stat_tree_insert(struct tag_stat *data, struct rb_root *root)
+{
+ tag_node_tree_insert(&data->tn, root);
+}
+
+static struct tag_stat *tag_stat_tree_search(struct rb_root *root, tag_t tag)
+{
+ struct tag_node *node = tag_node_tree_search(root, tag);
+ if (!node)
+ return NULL;
+ return rb_entry(&node->node, struct tag_stat, tn.node);
+}
+
+static void tag_counter_set_tree_insert(struct tag_counter_set *data,
+ struct rb_root *root)
+{
+ tag_node_tree_insert(&data->tn, root);
+}
+
+static struct tag_counter_set *tag_counter_set_tree_search(struct rb_root *root,
+ tag_t tag)
+{
+ struct tag_node *node = tag_node_tree_search(root, tag);
+ if (!node)
+ return NULL;
+ return rb_entry(&node->node, struct tag_counter_set, tn.node);
+
+}
+
+static void tag_ref_tree_insert(struct tag_ref *data, struct rb_root *root)
+{
+ tag_node_tree_insert(&data->tn, root);
+}
+
+static struct tag_ref *tag_ref_tree_search(struct rb_root *root, tag_t tag)
+{
+ struct tag_node *node = tag_node_tree_search(root, tag);
+ if (!node)
+ return NULL;
+ return rb_entry(&node->node, struct tag_ref, tn.node);
+}
+
+static struct sock_tag *sock_tag_tree_search(struct rb_root *root,
+ const struct sock *sk)
+{
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct sock_tag *data = rb_entry(node, struct sock_tag,
+ sock_node);
+ if (sk < data->sk)
+ node = node->rb_left;
+ else if (sk > data->sk)
+ node = node->rb_right;
+ else
+ return data;
+ }
+ return NULL;
+}
+
+static void sock_tag_tree_insert(struct sock_tag *data, struct rb_root *root)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+ /* Figure out where to put new node */
+ while (*new) {
+ struct sock_tag *this = rb_entry(*new, struct sock_tag,
+ sock_node);
+ parent = *new;
+ if (data->sk < this->sk)
+ new = &((*new)->rb_left);
+ else if (data->sk > this->sk)
+ new = &((*new)->rb_right);
+ else
+ BUG();
+ }
+
+ /* Add new node and rebalance tree. */
+ rb_link_node(&data->sock_node, parent, new);
+ rb_insert_color(&data->sock_node, root);
+}
+
+static void sock_tag_tree_erase(struct rb_root *st_to_free_tree)
+{
+ struct rb_node *node;
+ struct sock_tag *st_entry;
+
+ node = rb_first(st_to_free_tree);
+ while (node) {
+ st_entry = rb_entry(node, struct sock_tag, sock_node);
+ node = rb_next(node);
+ CT_DEBUG("qtaguid: %s(): "
+ "erase st: sk=%p tag=0x%llx (uid=%u)\n", __func__,
+ st_entry->sk,
+ st_entry->tag,
+ get_uid_from_tag(st_entry->tag));
+ rb_erase(&st_entry->sock_node, st_to_free_tree);
+ sock_put(st_entry->sk);
+ kfree(st_entry);
+ }
+}
+
+static struct proc_qtu_data *proc_qtu_data_tree_search(struct rb_root *root,
+ const pid_t pid)
+{
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct proc_qtu_data *data = rb_entry(node,
+ struct proc_qtu_data,
+ node);
+ if (pid < data->pid)
+ node = node->rb_left;
+ else if (pid > data->pid)
+ node = node->rb_right;
+ else
+ return data;
+ }
+ return NULL;
+}
+
+static void proc_qtu_data_tree_insert(struct proc_qtu_data *data,
+ struct rb_root *root)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+ /* Figure out where to put new node */
+ while (*new) {
+ struct proc_qtu_data *this = rb_entry(*new,
+ struct proc_qtu_data,
+ node);
+ parent = *new;
+ if (data->pid < this->pid)
+ new = &((*new)->rb_left);
+ else if (data->pid > this->pid)
+ new = &((*new)->rb_right);
+ else
+ BUG();
+ }
+
+ /* Add new node and rebalance tree. */
+ rb_link_node(&data->node, parent, new);
+ rb_insert_color(&data->node, root);
+}
+
+static void uid_tag_data_tree_insert(struct uid_tag_data *data,
+ struct rb_root *root)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+ /* Figure out where to put new node */
+ while (*new) {
+ struct uid_tag_data *this = rb_entry(*new,
+ struct uid_tag_data,
+ node);
+ parent = *new;
+ if (data->uid < this->uid)
+ new = &((*new)->rb_left);
+ else if (data->uid > this->uid)
+ new = &((*new)->rb_right);
+ else
+ BUG();
+ }
+
+ /* Add new node and rebalance tree. */
+ rb_link_node(&data->node, parent, new);
+ rb_insert_color(&data->node, root);
+}
+
+static struct uid_tag_data *uid_tag_data_tree_search(struct rb_root *root,
+ uid_t uid)
+{
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct uid_tag_data *data = rb_entry(node,
+ struct uid_tag_data,
+ node);
+ if (uid < data->uid)
+ node = node->rb_left;
+ else if (uid > data->uid)
+ node = node->rb_right;
+ else
+ return data;
+ }
+ return NULL;
+}
+
+/*
+ * Allocates a new uid_tag_data struct if needed.
+ * Returns a pointer to the found or allocated uid_tag_data.
+ * Returns a PTR_ERR on failures, and lock is not held.
+ * If found is not NULL:
+ * sets *found to true if not allocated.
+ * sets *found to false if allocated.
+ */
+struct uid_tag_data *get_uid_data(uid_t uid, bool *found_res)
+{
+ struct uid_tag_data *utd_entry;
+
+ /* Look for top level uid_tag_data for the UID */
+ utd_entry = uid_tag_data_tree_search(&uid_tag_data_tree, uid);
+ DR_DEBUG("qtaguid: get_uid_data(%u) utd=%p\n", uid, utd_entry);
+
+ if (found_res)
+ *found_res = utd_entry;
+ if (utd_entry)
+ return utd_entry;
+
+ utd_entry = kzalloc(sizeof(*utd_entry), GFP_ATOMIC);
+ if (!utd_entry) {
+ pr_err("qtaguid: get_uid_data(%u): "
+ "tag data alloc failed\n", uid);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ utd_entry->uid = uid;
+ utd_entry->tag_ref_tree = RB_ROOT;
+ uid_tag_data_tree_insert(utd_entry, &uid_tag_data_tree);
+ DR_DEBUG("qtaguid: get_uid_data(%u) new utd=%p\n", uid, utd_entry);
+ return utd_entry;
+}
+
+/* Never returns NULL. Either PTR_ERR or a valid ptr. */
+static struct tag_ref *new_tag_ref(tag_t new_tag,
+ struct uid_tag_data *utd_entry)
+{
+ struct tag_ref *tr_entry;
+ int res;
+
+ if (utd_entry->num_active_tags + 1 > max_sock_tags) {
+ pr_info("qtaguid: new_tag_ref(0x%llx): "
+ "tag ref alloc quota exceeded. max=%d\n",
+ new_tag, max_sock_tags);
+ res = -EMFILE;
+ goto err_res;
+
+ }
+
+ tr_entry = kzalloc(sizeof(*tr_entry), GFP_ATOMIC);
+ if (!tr_entry) {
+ pr_err("qtaguid: new_tag_ref(0x%llx): "
+ "tag ref alloc failed\n",
+ new_tag);
+ res = -ENOMEM;
+ goto err_res;
+ }
+ tr_entry->tn.tag = new_tag;
+ /* tr_entry->num_sock_tags handled by caller */
+ utd_entry->num_active_tags++;
+ tag_ref_tree_insert(tr_entry, &utd_entry->tag_ref_tree);
+ DR_DEBUG("qtaguid: new_tag_ref(0x%llx): "
+ " inserted new tag ref %p\n",
+ new_tag, tr_entry);
+ return tr_entry;
+
+err_res:
+ return ERR_PTR(res);
+}
+
+static struct tag_ref *lookup_tag_ref(tag_t full_tag,
+ struct uid_tag_data **utd_res)
+{
+ struct uid_tag_data *utd_entry;
+ struct tag_ref *tr_entry;
+ bool found_utd;
+ uid_t uid = get_uid_from_tag(full_tag);
+
+ DR_DEBUG("qtaguid: lookup_tag_ref(tag=0x%llx (uid=%u))\n",
+ full_tag, uid);
+
+ utd_entry = get_uid_data(uid, &found_utd);
+ if (IS_ERR_OR_NULL(utd_entry)) {
+ if (utd_res)
+ *utd_res = utd_entry;
+ return NULL;
+ }
+
+ tr_entry = tag_ref_tree_search(&utd_entry->tag_ref_tree, full_tag);
+ if (utd_res)
+ *utd_res = utd_entry;
+ DR_DEBUG("qtaguid: lookup_tag_ref(0x%llx) utd_entry=%p tr_entry=%p\n",
+ full_tag, utd_entry, tr_entry);
+ return tr_entry;
+}
+
+/* Never returns NULL. Either PTR_ERR or a valid ptr. */
+static struct tag_ref *get_tag_ref(tag_t full_tag,
+ struct uid_tag_data **utd_res)
+{
+ struct uid_tag_data *utd_entry;
+ struct tag_ref *tr_entry;
+
+ DR_DEBUG("qtaguid: get_tag_ref(0x%llx)\n",
+ full_tag);
+ tr_entry = lookup_tag_ref(full_tag, &utd_entry);
+ BUG_ON(IS_ERR_OR_NULL(utd_entry));
+ if (!tr_entry)
+ tr_entry = new_tag_ref(full_tag, utd_entry);
+
+ if (utd_res)
+ *utd_res = utd_entry;
+ DR_DEBUG("qtaguid: get_tag_ref(0x%llx) utd=%p tr=%p\n",
+ full_tag, utd_entry, tr_entry);
+ return tr_entry;
+}
+
+/* Checks and maybe frees the UID Tag Data entry */
+static void put_utd_entry(struct uid_tag_data *utd_entry)
+{
+ /* Are we done with the UID tag data entry? */
+ if (RB_EMPTY_ROOT(&utd_entry->tag_ref_tree) &&
+ !utd_entry->num_pqd) {
+ DR_DEBUG("qtaguid: %s(): "
+ "erase utd_entry=%p uid=%u "
+ "by pid=%u tgid=%u uid=%u\n", __func__,
+ utd_entry, utd_entry->uid,
+ current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+ BUG_ON(utd_entry->num_active_tags);
+ rb_erase(&utd_entry->node, &uid_tag_data_tree);
+ kfree(utd_entry);
+ } else {
+ DR_DEBUG("qtaguid: %s(): "
+ "utd_entry=%p still has %d tags %d proc_qtu_data\n",
+ __func__, utd_entry, utd_entry->num_active_tags,
+ utd_entry->num_pqd);
+ BUG_ON(!(utd_entry->num_active_tags ||
+ utd_entry->num_pqd));
+ }
+}
+
+/*
+ * If no sock_tags are using this tag_ref,
+ * decrements refcount of utd_entry, removes tr_entry
+ * from utd_entry->tag_ref_tree and frees.
+ */
+static void free_tag_ref_from_utd_entry(struct tag_ref *tr_entry,
+ struct uid_tag_data *utd_entry)
+{
+ DR_DEBUG("qtaguid: %s(): %p tag=0x%llx (uid=%u)\n", __func__,
+ tr_entry, tr_entry->tn.tag,
+ get_uid_from_tag(tr_entry->tn.tag));
+ if (!tr_entry->num_sock_tags) {
+ BUG_ON(!utd_entry->num_active_tags);
+ utd_entry->num_active_tags--;
+ rb_erase(&tr_entry->tn.node, &utd_entry->tag_ref_tree);
+ DR_DEBUG("qtaguid: %s(): erased %p\n", __func__, tr_entry);
+ kfree(tr_entry);
+ }
+}
+
+static void put_tag_ref_tree(tag_t full_tag, struct uid_tag_data *utd_entry)
+{
+ struct rb_node *node;
+ struct tag_ref *tr_entry;
+ tag_t acct_tag;
+
+ DR_DEBUG("qtaguid: %s(tag=0x%llx (uid=%u))\n", __func__,
+ full_tag, get_uid_from_tag(full_tag));
+ acct_tag = get_atag_from_tag(full_tag);
+ node = rb_first(&utd_entry->tag_ref_tree);
+ while (node) {
+ tr_entry = rb_entry(node, struct tag_ref, tn.node);
+ node = rb_next(node);
+ if (!acct_tag || tr_entry->tn.tag == full_tag)
+ free_tag_ref_from_utd_entry(tr_entry, utd_entry);
+ }
+}
+
+static ssize_t read_proc_u64(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ uint64_t *valuep = PDE_DATA(file_inode(file));
+ char tmp[24];
+ size_t tmp_size;
+
+ tmp_size = scnprintf(tmp, sizeof(tmp), "%llu\n", *valuep);
+ return simple_read_from_buffer(buf, size, ppos, tmp, tmp_size);
+}
+
+static ssize_t read_proc_bool(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ bool *valuep = PDE_DATA(file_inode(file));
+ char tmp[24];
+ size_t tmp_size;
+
+ tmp_size = scnprintf(tmp, sizeof(tmp), "%u\n", *valuep);
+ return simple_read_from_buffer(buf, size, ppos, tmp, tmp_size);
+}
+
+static int get_active_counter_set(tag_t tag)
+{
+ int active_set = 0;
+ struct tag_counter_set *tcs;
+
+ MT_DEBUG("qtaguid: get_active_counter_set(tag=0x%llx)"
+ " (uid=%u)\n",
+ tag, get_uid_from_tag(tag));
+ /* For now we only handle UID tags for active sets */
+ tag = get_utag_from_tag(tag);
+ spin_lock_bh(&tag_counter_set_list_lock);
+ tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
+ if (tcs)
+ active_set = tcs->active_set;
+ spin_unlock_bh(&tag_counter_set_list_lock);
+ return active_set;
+}
+
+/*
+ * Find the entry for tracking the specified interface.
+ * Caller must hold iface_stat_list_lock
+ */
+static struct iface_stat *get_iface_entry(const char *ifname)
+{
+ struct iface_stat *iface_entry;
+
+ /* Find the entry for tracking the specified tag within the interface */
+ if (ifname == NULL) {
+ pr_info("qtaguid: iface_stat: get() NULL device name\n");
+ return NULL;
+ }
+
+ /* Iterate over interfaces */
+ list_for_each_entry(iface_entry, &iface_stat_list, list) {
+ if (!strcmp(ifname, iface_entry->ifname))
+ goto done;
+ }
+ iface_entry = NULL;
+done:
+ return iface_entry;
+}
+
+/* This is for fmt2 only */
+static void pp_iface_stat_header(struct seq_file *m)
+{
+ seq_puts(m,
+ "ifname "
+ "total_skb_rx_bytes total_skb_rx_packets "
+ "total_skb_tx_bytes total_skb_tx_packets "
+ "rx_tcp_bytes rx_tcp_packets "
+ "rx_udp_bytes rx_udp_packets "
+ "rx_other_bytes rx_other_packets "
+ "tx_tcp_bytes tx_tcp_packets "
+ "tx_udp_bytes tx_udp_packets "
+ "tx_other_bytes tx_other_packets\n"
+ );
+}
+
+static void pp_iface_stat_line(struct seq_file *m,
+ struct iface_stat *iface_entry)
+{
+ struct data_counters *cnts;
+ int cnt_set = 0; /* We only use one set for the device */
+ cnts = &iface_entry->totals_via_skb;
+ seq_printf(m, "%s %llu %llu %llu %llu %llu %llu %llu %llu "
+ "%llu %llu %llu %llu %llu %llu %llu %llu\n",
+ iface_entry->ifname,
+ dc_sum_bytes(cnts, cnt_set, IFS_RX),
+ dc_sum_packets(cnts, cnt_set, IFS_RX),
+ dc_sum_bytes(cnts, cnt_set, IFS_TX),
+ dc_sum_packets(cnts, cnt_set, IFS_TX),
+ cnts->bpc[cnt_set][IFS_RX][IFS_TCP].bytes,
+ cnts->bpc[cnt_set][IFS_RX][IFS_TCP].packets,
+ cnts->bpc[cnt_set][IFS_RX][IFS_UDP].bytes,
+ cnts->bpc[cnt_set][IFS_RX][IFS_UDP].packets,
+ cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].bytes,
+ cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].packets,
+ cnts->bpc[cnt_set][IFS_TX][IFS_TCP].bytes,
+ cnts->bpc[cnt_set][IFS_TX][IFS_TCP].packets,
+ cnts->bpc[cnt_set][IFS_TX][IFS_UDP].bytes,
+ cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets,
+ cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes,
+ cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets);
+}
+
+struct proc_iface_stat_fmt_info {
+ int fmt;
+};
+
+static void *iface_stat_fmt_proc_start(struct seq_file *m, loff_t *pos)
+{
+ struct proc_iface_stat_fmt_info *p = m->private;
+ loff_t n = *pos;
+
+ /*
+ * This lock will prevent iface_stat_update() from changing active,
+ * and in turn prevent an interface from unregistering itself.
+ */
+ spin_lock_bh(&iface_stat_list_lock);
+
+ if (unlikely(module_passive))
+ return NULL;
+
+ if (!n && p->fmt == 2)
+ pp_iface_stat_header(m);
+
+ return seq_list_start(&iface_stat_list, n);
+}
+
+static void *iface_stat_fmt_proc_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &iface_stat_list, pos);
+}
+
+static void iface_stat_fmt_proc_stop(struct seq_file *m, void *p)
+{
+ spin_unlock_bh(&iface_stat_list_lock);
+}
+
+static int iface_stat_fmt_proc_show(struct seq_file *m, void *v)
+{
+ struct proc_iface_stat_fmt_info *p = m->private;
+ struct iface_stat *iface_entry;
+ struct rtnl_link_stats64 dev_stats, *stats;
+ struct rtnl_link_stats64 no_dev_stats = {0};
+
+
+ CT_DEBUG("qtaguid:proc iface_stat_fmt pid=%u tgid=%u uid=%u\n",
+ current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+
+ iface_entry = list_entry(v, struct iface_stat, list);
+
+ if (iface_entry->active) {
+ stats = dev_get_stats(iface_entry->net_dev,
+ &dev_stats);
+ } else {
+ stats = &no_dev_stats;
+ }
+ /*
+ * If the meaning of the data changes, then update the fmtX
+ * string.
+ */
+ if (p->fmt == 1) {
+ seq_printf(m, "%s %d %llu %llu %llu %llu %llu %llu %llu %llu\n",
+ iface_entry->ifname,
+ iface_entry->active,
+ iface_entry->totals_via_dev[IFS_RX].bytes,
+ iface_entry->totals_via_dev[IFS_RX].packets,
+ iface_entry->totals_via_dev[IFS_TX].bytes,
+ iface_entry->totals_via_dev[IFS_TX].packets,
+ stats->rx_bytes, stats->rx_packets,
+ stats->tx_bytes, stats->tx_packets
+ );
+ } else {
+ pp_iface_stat_line(m, iface_entry);
+ }
+ return 0;
+}
+
+static const struct file_operations read_u64_fops = {
+ .read = read_proc_u64,
+ .llseek = default_llseek,
+};
+
+static const struct file_operations read_bool_fops = {
+ .read = read_proc_bool,
+ .llseek = default_llseek,
+};
+
+static void iface_create_proc_worker(struct work_struct *work)
+{
+ struct proc_dir_entry *proc_entry;
+ struct iface_stat_work *isw = container_of(work, struct iface_stat_work,
+ iface_work);
+ struct iface_stat *new_iface = isw->iface_entry;
+
+ /* iface_entries are not deleted, so safe to manipulate. */
+ proc_entry = proc_mkdir(new_iface->ifname, iface_stat_procdir);
+ if (IS_ERR_OR_NULL(proc_entry)) {
+ pr_err("qtaguid: iface_stat: create_proc(): alloc failed.\n");
+ kfree(isw);
+ return;
+ }
+
+ new_iface->proc_ptr = proc_entry;
+
+ proc_create_data("tx_bytes", proc_iface_perms, proc_entry,
+ &read_u64_fops,
+ &new_iface->totals_via_dev[IFS_TX].bytes);
+ proc_create_data("rx_bytes", proc_iface_perms, proc_entry,
+ &read_u64_fops,
+ &new_iface->totals_via_dev[IFS_RX].bytes);
+ proc_create_data("tx_packets", proc_iface_perms, proc_entry,
+ &read_u64_fops,
+ &new_iface->totals_via_dev[IFS_TX].packets);
+ proc_create_data("rx_packets", proc_iface_perms, proc_entry,
+ &read_u64_fops,
+ &new_iface->totals_via_dev[IFS_RX].packets);
+ proc_create_data("active", proc_iface_perms, proc_entry,
+ &read_bool_fops, &new_iface->active);
+
+ IF_DEBUG("qtaguid: iface_stat: create_proc(): done "
+ "entry=%p dev=%s\n", new_iface, new_iface->ifname);
+ kfree(isw);
+}
+
+/*
+ * Will set the entry's active state, and
+ * update the net_dev accordingly also.
+ */
+static void _iface_stat_set_active(struct iface_stat *entry,
+ struct net_device *net_dev,
+ bool activate)
+{
+ if (activate) {
+ entry->net_dev = net_dev;
+ entry->active = true;
+ IF_DEBUG("qtaguid: %s(%s): "
+ "enable tracking. rfcnt=%d\n", __func__,
+ entry->ifname,
+ __this_cpu_read(*net_dev->pcpu_refcnt));
+ } else {
+ entry->active = false;
+ entry->net_dev = NULL;
+ IF_DEBUG("qtaguid: %s(%s): "
+ "disable tracking. rfcnt=%d\n", __func__,
+ entry->ifname,
+ __this_cpu_read(*net_dev->pcpu_refcnt));
+
+ }
+}
+
+/* Caller must hold iface_stat_list_lock */
+static struct iface_stat *iface_alloc(struct net_device *net_dev)
+{
+ struct iface_stat *new_iface;
+ struct iface_stat_work *isw;
+
+ new_iface = kzalloc(sizeof(*new_iface), GFP_ATOMIC);
+ if (new_iface == NULL) {
+ pr_err("qtaguid: iface_stat: create(%s): "
+ "iface_stat alloc failed\n", net_dev->name);
+ return NULL;
+ }
+ new_iface->ifname = kstrdup(net_dev->name, GFP_ATOMIC);
+ if (new_iface->ifname == NULL) {
+ pr_err("qtaguid: iface_stat: create(%s): "
+ "ifname alloc failed\n", net_dev->name);
+ kfree(new_iface);
+ return NULL;
+ }
+ spin_lock_init(&new_iface->tag_stat_list_lock);
+ new_iface->tag_stat_tree = RB_ROOT;
+ _iface_stat_set_active(new_iface, net_dev, true);
+
+ /*
+ * ipv6 notifier chains are atomic :(
+ * No create_proc_read_entry() for you!
+ */
+ isw = kmalloc(sizeof(*isw), GFP_ATOMIC);
+ if (!isw) {
+ pr_err("qtaguid: iface_stat: create(%s): "
+ "work alloc failed\n", new_iface->ifname);
+ _iface_stat_set_active(new_iface, net_dev, false);
+ kfree(new_iface->ifname);
+ kfree(new_iface);
+ return NULL;
+ }
+ isw->iface_entry = new_iface;
+ INIT_WORK(&isw->iface_work, iface_create_proc_worker);
+ schedule_work(&isw->iface_work);
+ list_add(&new_iface->list, &iface_stat_list);
+ return new_iface;
+}
+
+static void iface_check_stats_reset_and_adjust(struct net_device *net_dev,
+ struct iface_stat *iface)
+{
+ struct rtnl_link_stats64 dev_stats, *stats;
+ bool stats_rewound;
+
+ stats = dev_get_stats(net_dev, &dev_stats);
+ /* No empty packets */
+ stats_rewound =
+ (stats->rx_bytes < iface->last_known[IFS_RX].bytes)
+ || (stats->tx_bytes < iface->last_known[IFS_TX].bytes);
+
+ IF_DEBUG("qtaguid: %s(%s): iface=%p netdev=%p "
+ "bytes rx/tx=%llu/%llu "
+ "active=%d last_known=%d "
+ "stats_rewound=%d\n", __func__,
+ net_dev ? net_dev->name : "?",
+ iface, net_dev,
+ stats->rx_bytes, stats->tx_bytes,
+ iface->active, iface->last_known_valid, stats_rewound);
+
+ if (iface->active && iface->last_known_valid && stats_rewound) {
+ pr_warn_once("qtaguid: iface_stat: %s(%s): "
+ "iface reset its stats unexpectedly\n", __func__,
+ net_dev->name);
+
+ iface->totals_via_dev[IFS_TX].bytes +=
+ iface->last_known[IFS_TX].bytes;
+ iface->totals_via_dev[IFS_TX].packets +=
+ iface->last_known[IFS_TX].packets;
+ iface->totals_via_dev[IFS_RX].bytes +=
+ iface->last_known[IFS_RX].bytes;
+ iface->totals_via_dev[IFS_RX].packets +=
+ iface->last_known[IFS_RX].packets;
+ iface->last_known_valid = false;
+ IF_DEBUG("qtaguid: %s(%s): iface=%p "
+ "used last known bytes rx/tx=%llu/%llu\n", __func__,
+ iface->ifname, iface, iface->last_known[IFS_RX].bytes,
+ iface->last_known[IFS_TX].bytes);
+ }
+}
+
+/*
+ * Create a new entry for tracking the specified interface.
+ * Do nothing if the entry already exists.
+ * Called when an interface is configured with a valid IP address.
+ */
+static void iface_stat_create(struct net_device *net_dev,
+ struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = NULL;
+ const char *ifname;
+ struct iface_stat *entry;
+ __be32 ipaddr = 0;
+ struct iface_stat *new_iface;
+
+ IF_DEBUG("qtaguid: iface_stat: create(%s): ifa=%p netdev=%p\n",
+ net_dev ? net_dev->name : "?",
+ ifa, net_dev);
+ if (!net_dev) {
+ pr_err("qtaguid: iface_stat: create(): no net dev\n");
+ return;
+ }
+
+ ifname = net_dev->name;
+ if (!ifa) {
+ in_dev = in_dev_get(net_dev);
+ if (!in_dev) {
+ pr_err("qtaguid: iface_stat: create(%s): no inet dev\n",
+ ifname);
+ return;
+ }
+ IF_DEBUG("qtaguid: iface_stat: create(%s): in_dev=%p\n",
+ ifname, in_dev);
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ IF_DEBUG("qtaguid: iface_stat: create(%s): "
+ "ifa=%p ifa_label=%s\n",
+ ifname, ifa, ifa->ifa_label);
+ if (!strcmp(ifname, ifa->ifa_label))
+ break;
+ }
+ }
+
+ if (!ifa) {
+ IF_DEBUG("qtaguid: iface_stat: create(%s): no matching IP\n",
+ ifname);
+ goto done_put;
+ }
+ ipaddr = ifa->ifa_local;
+
+ spin_lock_bh(&iface_stat_list_lock);
+ entry = get_iface_entry(ifname);
+ if (entry != NULL) {
+ IF_DEBUG("qtaguid: iface_stat: create(%s): entry=%p\n",
+ ifname, entry);
+ iface_check_stats_reset_and_adjust(net_dev, entry);
+ _iface_stat_set_active(entry, net_dev, true);
+ IF_DEBUG("qtaguid: %s(%s): "
+ "tracking now %d on ip=%pI4\n", __func__,
+ entry->ifname, true, &ipaddr);
+ goto done_unlock_put;
+ }
+
+ new_iface = iface_alloc(net_dev);
+ IF_DEBUG("qtaguid: iface_stat: create(%s): done "
+ "entry=%p ip=%pI4\n", ifname, new_iface, &ipaddr);
+done_unlock_put:
+ spin_unlock_bh(&iface_stat_list_lock);
+done_put:
+ if (in_dev)
+ in_dev_put(in_dev);
+}
+
+static void iface_stat_create_ipv6(struct net_device *net_dev,
+ struct inet6_ifaddr *ifa)
+{
+ struct in_device *in_dev;
+ const char *ifname;
+ struct iface_stat *entry;
+ struct iface_stat *new_iface;
+ int addr_type;
+
+ IF_DEBUG("qtaguid: iface_stat: create6(): ifa=%p netdev=%p->name=%s\n",
+ ifa, net_dev, net_dev ? net_dev->name : "");
+ if (!net_dev) {
+ pr_err("qtaguid: iface_stat: create6(): no net dev!\n");
+ return;
+ }
+ ifname = net_dev->name;
+
+ in_dev = in_dev_get(net_dev);
+ if (!in_dev) {
+ pr_err("qtaguid: iface_stat: create6(%s): no inet dev\n",
+ ifname);
+ return;
+ }
+
+ IF_DEBUG("qtaguid: iface_stat: create6(%s): in_dev=%p\n",
+ ifname, in_dev);
+
+ if (!ifa) {
+ IF_DEBUG("qtaguid: iface_stat: create6(%s): no matching IP\n",
+ ifname);
+ goto done_put;
+ }
+ addr_type = ipv6_addr_type(&ifa->addr);
+
+ spin_lock_bh(&iface_stat_list_lock);
+ entry = get_iface_entry(ifname);
+ if (entry != NULL) {
+ IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__,
+ ifname, entry);
+ iface_check_stats_reset_and_adjust(net_dev, entry);
+ _iface_stat_set_active(entry, net_dev, true);
+ IF_DEBUG("qtaguid: %s(%s): "
+ "tracking now %d on ip=%pI6c\n", __func__,
+ entry->ifname, true, &ifa->addr);
+ goto done_unlock_put;
+ }
+
+ new_iface = iface_alloc(net_dev);
+ IF_DEBUG("qtaguid: iface_stat: create6(%s): done "
+ "entry=%p ip=%pI6c\n", ifname, new_iface, &ifa->addr);
+
+done_unlock_put:
+ spin_unlock_bh(&iface_stat_list_lock);
+done_put:
+ in_dev_put(in_dev);
+}
+
+static struct sock_tag *get_sock_stat_nl(const struct sock *sk)
+{
+ MT_DEBUG("qtaguid: get_sock_stat_nl(sk=%p)\n", sk);
+ return sock_tag_tree_search(&sock_tag_tree, sk);
+}
+
+static struct sock_tag *get_sock_stat(const struct sock *sk)
+{
+ struct sock_tag *sock_tag_entry;
+ MT_DEBUG("qtaguid: get_sock_stat(sk=%p)\n", sk);
+ if (!sk)
+ return NULL;
+ spin_lock_bh(&sock_tag_list_lock);
+ sock_tag_entry = get_sock_stat_nl(sk);
+ spin_unlock_bh(&sock_tag_list_lock);
+ return sock_tag_entry;
+}
+
+static int ipx_proto(const struct sk_buff *skb,
+ struct xt_action_param *par)
+{
+ int thoff = 0, tproto;
+
+ switch (par->family) {
+ case NFPROTO_IPV6:
+ tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
+ if (tproto < 0)
+ MT_DEBUG("%s(): transport header not found in ipv6"
+ " skb=%p\n", __func__, skb);
+ break;
+ case NFPROTO_IPV4:
+ tproto = ip_hdr(skb)->protocol;
+ break;
+ default:
+ tproto = IPPROTO_RAW;
+ }
+ return tproto;
+}
+
+static void
+data_counters_update(struct data_counters *dc, int set,
+ enum ifs_tx_rx direction, int proto, int bytes)
+{
+ switch (proto) {
+ case IPPROTO_TCP:
+ dc_add_byte_packets(dc, set, direction, IFS_TCP, bytes, 1);
+ break;
+ case IPPROTO_UDP:
+ dc_add_byte_packets(dc, set, direction, IFS_UDP, bytes, 1);
+ break;
+ case IPPROTO_IP:
+ default:
+ dc_add_byte_packets(dc, set, direction, IFS_PROTO_OTHER, bytes,
+ 1);
+ break;
+ }
+}
+
+/*
+ * Update stats for the specified interface. Do nothing if the entry
+ * does not exist (when a device was never configured with an IP address).
+ * Called when an device is being unregistered.
+ */
+static void iface_stat_update(struct net_device *net_dev, bool stash_only)
+{
+ struct rtnl_link_stats64 dev_stats, *stats;
+ struct iface_stat *entry;
+
+ stats = dev_get_stats(net_dev, &dev_stats);
+ spin_lock_bh(&iface_stat_list_lock);
+ entry = get_iface_entry(net_dev->name);
+ if (entry == NULL) {
+ IF_DEBUG("qtaguid: iface_stat: update(%s): not tracked\n",
+ net_dev->name);
+ spin_unlock_bh(&iface_stat_list_lock);
+ return;
+ }
+
+ IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__,
+ net_dev->name, entry);
+ if (!entry->active) {
+ IF_DEBUG("qtaguid: %s(%s): already disabled\n", __func__,
+ net_dev->name);
+ spin_unlock_bh(&iface_stat_list_lock);
+ return;
+ }
+
+ if (stash_only) {
+ entry->last_known[IFS_TX].bytes = stats->tx_bytes;
+ entry->last_known[IFS_TX].packets = stats->tx_packets;
+ entry->last_known[IFS_RX].bytes = stats->rx_bytes;
+ entry->last_known[IFS_RX].packets = stats->rx_packets;
+ entry->last_known_valid = true;
+ IF_DEBUG("qtaguid: %s(%s): "
+ "dev stats stashed rx/tx=%llu/%llu\n", __func__,
+ net_dev->name, stats->rx_bytes, stats->tx_bytes);
+ spin_unlock_bh(&iface_stat_list_lock);
+ return;
+ }
+ entry->totals_via_dev[IFS_TX].bytes += stats->tx_bytes;
+ entry->totals_via_dev[IFS_TX].packets += stats->tx_packets;
+ entry->totals_via_dev[IFS_RX].bytes += stats->rx_bytes;
+ entry->totals_via_dev[IFS_RX].packets += stats->rx_packets;
+ /* We don't need the last_known[] anymore */
+ entry->last_known_valid = false;
+ _iface_stat_set_active(entry, net_dev, false);
+ IF_DEBUG("qtaguid: %s(%s): "
+ "disable tracking. rx/tx=%llu/%llu\n", __func__,
+ net_dev->name, stats->rx_bytes, stats->tx_bytes);
+ spin_unlock_bh(&iface_stat_list_lock);
+}
+
+/* Guarantied to return a net_device that has a name */
+static void get_dev_and_dir(const struct sk_buff *skb,
+ struct xt_action_param *par,
+ enum ifs_tx_rx *direction,
+ const struct net_device **el_dev)
+{
+ BUG_ON(!direction || !el_dev);
+
+ if (par->in) {
+ *el_dev = par->in;
+ *direction = IFS_RX;
+ } else if (par->out) {
+ *el_dev = par->out;
+ *direction = IFS_TX;
+ } else {
+ pr_err("qtaguid[%d]: %s(): no par->in/out?!!\n",
+ par->hooknum, __func__);
+ BUG();
+ }
+ if (skb->dev && *el_dev != skb->dev) {
+ MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs par->%s=%p %s\n",
+ par->hooknum, skb->dev, skb->dev->name,
+ *direction == IFS_RX ? "in" : "out", *el_dev,
+ (*el_dev)->name);
+ }
+}
+
+/*
+ * Update stats for the specified interface from the skb.
+ * Do nothing if the entry
+ * does not exist (when a device was never configured with an IP address).
+ * Called on each sk.
+ */
+static void iface_stat_update_from_skb(const struct sk_buff *skb,
+ struct xt_action_param *par)
+{
+ struct iface_stat *entry;
+ const struct net_device *el_dev;
+ enum ifs_tx_rx direction;
+ int bytes = skb->len;
+ int proto;
+
+ get_dev_and_dir(skb, par, &direction, &el_dev);
+ proto = ipx_proto(skb, par);
+ MT_DEBUG("qtaguid[%d]: iface_stat: %s(%s): "
+ "type=%d fam=%d proto=%d dir=%d\n",
+ par->hooknum, __func__, el_dev->name, el_dev->type,
+ par->family, proto, direction);
+
+ spin_lock_bh(&iface_stat_list_lock);
+ entry = get_iface_entry(el_dev->name);
+ if (entry == NULL) {
+ IF_DEBUG("qtaguid[%d]: iface_stat: %s(%s): not tracked\n",
+ par->hooknum, __func__, el_dev->name);
+ spin_unlock_bh(&iface_stat_list_lock);
+ return;
+ }
+
+ IF_DEBUG("qtaguid[%d]: %s(%s): entry=%p\n", par->hooknum, __func__,
+ el_dev->name, entry);
+
+ data_counters_update(&entry->totals_via_skb, 0, direction, proto,
+ bytes);
+ spin_unlock_bh(&iface_stat_list_lock);
+}
+
+static void tag_stat_update(struct tag_stat *tag_entry,
+ enum ifs_tx_rx direction, int proto, int bytes)
+{
+ int active_set;
+ active_set = get_active_counter_set(tag_entry->tn.tag);
+ MT_DEBUG("qtaguid: tag_stat_update(tag=0x%llx (uid=%u) set=%d "
+ "dir=%d proto=%d bytes=%d)\n",
+ tag_entry->tn.tag, get_uid_from_tag(tag_entry->tn.tag),
+ active_set, direction, proto, bytes);
+ data_counters_update(&tag_entry->counters, active_set, direction,
+ proto, bytes);
+ if (tag_entry->parent_counters)
+ data_counters_update(tag_entry->parent_counters, active_set,
+ direction, proto, bytes);
+}
+
+/*
+ * Create a new entry for tracking the specified {acct_tag,uid_tag} within
+ * the interface.
+ * iface_entry->tag_stat_list_lock should be held.
+ */
+static struct tag_stat *create_if_tag_stat(struct iface_stat *iface_entry,
+ tag_t tag)
+{
+ struct tag_stat *new_tag_stat_entry = NULL;
+ IF_DEBUG("qtaguid: iface_stat: %s(): ife=%p tag=0x%llx"
+ " (uid=%u)\n", __func__,
+ iface_entry, tag, get_uid_from_tag(tag));
+ new_tag_stat_entry = kzalloc(sizeof(*new_tag_stat_entry), GFP_ATOMIC);
+ if (!new_tag_stat_entry) {
+ pr_err("qtaguid: iface_stat: tag stat alloc failed\n");
+ goto done;
+ }
+ new_tag_stat_entry->tn.tag = tag;
+ tag_stat_tree_insert(new_tag_stat_entry, &iface_entry->tag_stat_tree);
+done:
+ return new_tag_stat_entry;
+}
+
+static void if_tag_stat_update(const char *ifname, uid_t uid,
+ const struct sock *sk, enum ifs_tx_rx direction,
+ int proto, int bytes)
+{
+ struct tag_stat *tag_stat_entry;
+ tag_t tag, acct_tag;
+ tag_t uid_tag;
+ struct data_counters *uid_tag_counters;
+ struct sock_tag *sock_tag_entry;
+ struct iface_stat *iface_entry;
+ struct tag_stat *new_tag_stat = NULL;
+ MT_DEBUG("qtaguid: if_tag_stat_update(ifname=%s "
+ "uid=%u sk=%p dir=%d proto=%d bytes=%d)\n",
+ ifname, uid, sk, direction, proto, bytes);
+
+ spin_lock_bh(&iface_stat_list_lock);
+ iface_entry = get_iface_entry(ifname);
+ if (!iface_entry) {
+ pr_err_ratelimited("qtaguid: tag_stat: stat_update() "
+ "%s not found\n", ifname);
+ spin_unlock_bh(&iface_stat_list_lock);
+ return;
+ }
+ /* It is ok to process data when an iface_entry is inactive */
+
+ MT_DEBUG("qtaguid: tag_stat: stat_update() dev=%s entry=%p\n",
+ ifname, iface_entry);
+
+ /*
+ * Look for a tagged sock.
+ * It will have an acct_uid.
+ */
+ sock_tag_entry = get_sock_stat(sk);
+ if (sock_tag_entry) {
+ tag = sock_tag_entry->tag;
+ acct_tag = get_atag_from_tag(tag);
+ uid_tag = get_utag_from_tag(tag);
+ } else {
+ acct_tag = make_atag_from_value(0);
+ tag = combine_atag_with_uid(acct_tag, uid);
+ uid_tag = make_tag_from_uid(uid);
+ }
+ MT_DEBUG("qtaguid: tag_stat: stat_update(): "
+ " looking for tag=0x%llx (uid=%u) in ife=%p\n",
+ tag, get_uid_from_tag(tag), iface_entry);
+ /* Loop over tag list under this interface for {acct_tag,uid_tag} */
+ spin_lock_bh(&iface_entry->tag_stat_list_lock);
+
+ tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree,
+ tag);
+ if (tag_stat_entry) {
+ /*
+ * Updating the {acct_tag, uid_tag} entry handles both stats:
+ * {0, uid_tag} will also get updated.
+ */
+ tag_stat_update(tag_stat_entry, direction, proto, bytes);
+ goto unlock;
+ }
+
+ /* Loop over tag list under this interface for {0,uid_tag} */
+ tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree,
+ uid_tag);
+ if (!tag_stat_entry) {
+ /* Here: the base uid_tag did not exist */
+ /*
+ * No parent counters. So
+ * - No {0, uid_tag} stats and no {acc_tag, uid_tag} stats.
+ */
+ new_tag_stat = create_if_tag_stat(iface_entry, uid_tag);
+ if (!new_tag_stat)
+ goto unlock;
+ uid_tag_counters = &new_tag_stat->counters;
+ } else {
+ uid_tag_counters = &tag_stat_entry->counters;
+ }
+
+ if (acct_tag) {
+ /* Create the child {acct_tag, uid_tag} and hook up parent. */
+ new_tag_stat = create_if_tag_stat(iface_entry, tag);
+ if (!new_tag_stat)
+ goto unlock;
+ new_tag_stat->parent_counters = uid_tag_counters;
+ } else {
+ /*
+ * For new_tag_stat to be still NULL here would require:
+ * {0, uid_tag} exists
+ * and {acct_tag, uid_tag} doesn't exist
+ * AND acct_tag == 0.
+ * Impossible. This reassures us that new_tag_stat
+ * below will always be assigned.
+ */
+ BUG_ON(!new_tag_stat);
+ }
+ tag_stat_update(new_tag_stat, direction, proto, bytes);
+unlock:
+ spin_unlock_bh(&iface_entry->tag_stat_list_lock);
+ spin_unlock_bh(&iface_stat_list_lock);
+}
+
+static int iface_netdev_event_handler(struct notifier_block *nb,
+ unsigned long event, void *ptr) {
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (unlikely(module_passive))
+ return NOTIFY_DONE;
+
+ IF_DEBUG("qtaguid: iface_stat: netdev_event(): "
+ "ev=0x%lx/%s netdev=%p->name=%s\n",
+ event, netdev_evt_str(event), dev, dev ? dev->name : "");
+
+ switch (event) {
+ case NETDEV_UP:
+ iface_stat_create(dev, NULL);
+ atomic64_inc(&qtu_events.iface_events);
+ break;
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ iface_stat_update(dev, event == NETDEV_DOWN);
+ atomic64_inc(&qtu_events.iface_events);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static int iface_inet6addr_event_handler(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct inet6_ifaddr *ifa = ptr;
+ struct net_device *dev;
+
+ if (unlikely(module_passive))
+ return NOTIFY_DONE;
+
+ IF_DEBUG("qtaguid: iface_stat: inet6addr_event(): "
+ "ev=0x%lx/%s ifa=%p\n",
+ event, netdev_evt_str(event), ifa);
+
+ switch (event) {
+ case NETDEV_UP:
+ BUG_ON(!ifa || !ifa->idev);
+ dev = (struct net_device *)ifa->idev->dev;
+ iface_stat_create_ipv6(dev, ifa);
+ atomic64_inc(&qtu_events.iface_events);
+ break;
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ BUG_ON(!ifa || !ifa->idev);
+ dev = (struct net_device *)ifa->idev->dev;
+ iface_stat_update(dev, event == NETDEV_DOWN);
+ atomic64_inc(&qtu_events.iface_events);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static int iface_inetaddr_event_handler(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct in_ifaddr *ifa = ptr;
+ struct net_device *dev;
+
+ if (unlikely(module_passive))
+ return NOTIFY_DONE;
+
+ IF_DEBUG("qtaguid: iface_stat: inetaddr_event(): "
+ "ev=0x%lx/%s ifa=%p\n",
+ event, netdev_evt_str(event), ifa);
+
+ switch (event) {
+ case NETDEV_UP:
+ BUG_ON(!ifa || !ifa->ifa_dev);
+ dev = ifa->ifa_dev->dev;
+ iface_stat_create(dev, ifa);
+ atomic64_inc(&qtu_events.iface_events);
+ break;
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ BUG_ON(!ifa || !ifa->ifa_dev);
+ dev = ifa->ifa_dev->dev;
+ iface_stat_update(dev, event == NETDEV_DOWN);
+ atomic64_inc(&qtu_events.iface_events);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block iface_netdev_notifier_blk = {
+ .notifier_call = iface_netdev_event_handler,
+};
+
+static struct notifier_block iface_inetaddr_notifier_blk = {
+ .notifier_call = iface_inetaddr_event_handler,
+};
+
+static struct notifier_block iface_inet6addr_notifier_blk = {
+ .notifier_call = iface_inet6addr_event_handler,
+};
+
+static const struct seq_operations iface_stat_fmt_proc_seq_ops = {
+ .start = iface_stat_fmt_proc_start,
+ .next = iface_stat_fmt_proc_next,
+ .stop = iface_stat_fmt_proc_stop,
+ .show = iface_stat_fmt_proc_show,
+};
+
+static int proc_iface_stat_fmt_open(struct inode *inode, struct file *file)
+{
+ struct proc_iface_stat_fmt_info *s;
+
+ s = __seq_open_private(file, &iface_stat_fmt_proc_seq_ops,
+ sizeof(struct proc_iface_stat_fmt_info));
+ if (!s)
+ return -ENOMEM;
+
+ s->fmt = (uintptr_t)PDE_DATA(inode);
+ return 0;
+}
+
+static const struct file_operations proc_iface_stat_fmt_fops = {
+ .open = proc_iface_stat_fmt_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static int __init iface_stat_init(struct proc_dir_entry *parent_procdir)
+{
+ int err;
+
+ iface_stat_procdir = proc_mkdir(iface_stat_procdirname, parent_procdir);
+ if (!iface_stat_procdir) {
+ pr_err("qtaguid: iface_stat: init failed to create proc entry\n");
+ err = -1;
+ goto err;
+ }
+
+ iface_stat_all_procfile = proc_create_data(iface_stat_all_procfilename,
+ proc_iface_perms,
+ parent_procdir,
+ &proc_iface_stat_fmt_fops,
+ (void *)1 /* fmt1 */);
+ if (!iface_stat_all_procfile) {
+ pr_err("qtaguid: iface_stat: init "
+ " failed to create stat_old proc entry\n");
+ err = -1;
+ goto err_zap_entry;
+ }
+
+ iface_stat_fmt_procfile = proc_create_data(iface_stat_fmt_procfilename,
+ proc_iface_perms,
+ parent_procdir,
+ &proc_iface_stat_fmt_fops,
+ (void *)2 /* fmt2 */);
+ if (!iface_stat_fmt_procfile) {
+ pr_err("qtaguid: iface_stat: init "
+ " failed to create stat_all proc entry\n");
+ err = -1;
+ goto err_zap_all_stats_entry;
+ }
+
+
+ err = register_netdevice_notifier(&iface_netdev_notifier_blk);
+ if (err) {
+ pr_err("qtaguid: iface_stat: init "
+ "failed to register dev event handler\n");
+ goto err_zap_all_stats_entries;
+ }
+ err = register_inetaddr_notifier(&iface_inetaddr_notifier_blk);
+ if (err) {
+ pr_err("qtaguid: iface_stat: init "
+ "failed to register ipv4 dev event handler\n");
+ goto err_unreg_nd;
+ }
+
+ err = register_inet6addr_notifier(&iface_inet6addr_notifier_blk);
+ if (err) {
+ pr_err("qtaguid: iface_stat: init "
+ "failed to register ipv6 dev event handler\n");
+ goto err_unreg_ip4_addr;
+ }
+ return 0;
+
+err_unreg_ip4_addr:
+ unregister_inetaddr_notifier(&iface_inetaddr_notifier_blk);
+err_unreg_nd:
+ unregister_netdevice_notifier(&iface_netdev_notifier_blk);
+err_zap_all_stats_entries:
+ remove_proc_entry(iface_stat_fmt_procfilename, parent_procdir);
+err_zap_all_stats_entry:
+ remove_proc_entry(iface_stat_all_procfilename, parent_procdir);
+err_zap_entry:
+ remove_proc_entry(iface_stat_procdirname, parent_procdir);
+err:
+ return err;
+}
+
+static struct sock *qtaguid_find_sk(const struct sk_buff *skb,
+ struct xt_action_param *par)
+{
+ struct sock *sk;
+ unsigned int hook_mask = (1 << par->hooknum);
+
+ MT_DEBUG("qtaguid[%d]: find_sk(skb=%p) family=%d\n",
+ par->hooknum, skb, par->family);
+
+ /*
+ * Let's not abuse the the xt_socket_get*_sk(), or else it will
+ * return garbage SKs.
+ */
+ if (!(hook_mask & XT_SOCKET_SUPPORTED_HOOKS))
+ return NULL;
+
+ switch (par->family) {
+ case NFPROTO_IPV6:
+ sk = xt_socket_lookup_slow_v6(dev_net(skb->dev), skb, par->in);
+ break;
+ case NFPROTO_IPV4:
+ sk = xt_socket_lookup_slow_v4(dev_net(skb->dev), skb, par->in);
+ break;
+ default:
+ return NULL;
+ }
+
+ if (sk) {
+ MT_DEBUG("qtaguid[%d]: %p->sk_proto=%u->sk_state=%d\n",
+ par->hooknum, sk, sk->sk_protocol, sk->sk_state);
+ }
+ return sk;
+}
+
+static void account_for_uid(const struct sk_buff *skb,
+ const struct sock *alternate_sk, uid_t uid,
+ struct xt_action_param *par)
+{
+ const struct net_device *el_dev;
+ enum ifs_tx_rx direction;
+ int proto;
+
+ get_dev_and_dir(skb, par, &direction, &el_dev);
+ proto = ipx_proto(skb, par);
+ MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d dir=%d\n",
+ par->hooknum, el_dev->name, el_dev->type,
+ par->family, proto, direction);
+
+ if_tag_stat_update(el_dev->name, uid,
+ skb->sk ? skb->sk : alternate_sk,
+ direction,
+ proto, skb->len);
+}
+
+static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_qtaguid_match_info *info = par->matchinfo;
+ const struct file *filp;
+ bool got_sock = false;
+ struct sock *sk;
+ kuid_t sock_uid;
+ bool res;
+ bool set_sk_callback_lock = false;
+ /*
+ * TODO: unhack how to force just accounting.
+ * For now we only do tag stats when the uid-owner is not requested
+ */
+ bool do_tag_stat = !(info->match & XT_QTAGUID_UID);
+
+ if (unlikely(module_passive))
+ return (info->match ^ info->invert) == 0;
+
+ MT_DEBUG("qtaguid[%d]: entered skb=%p par->in=%p/out=%p fam=%d\n",
+ par->hooknum, skb, par->in, par->out, par->family);
+
+ atomic64_inc(&qtu_events.match_calls);
+ if (skb == NULL) {
+ res = (info->match ^ info->invert) == 0;
+ goto ret_res;
+ }
+
+ switch (par->hooknum) {
+ case NF_INET_PRE_ROUTING:
+ case NF_INET_POST_ROUTING:
+ atomic64_inc(&qtu_events.match_calls_prepost);
+ iface_stat_update_from_skb(skb, par);
+ /*
+ * We are done in pre/post. The skb will get processed
+ * further alter.
+ */
+ res = (info->match ^ info->invert);
+ goto ret_res;
+ break;
+ /* default: Fall through and do UID releated work */
+ }
+
+ sk = skb_to_full_sk(skb);
+ /*
+ * When in TCP_TIME_WAIT the sk is not a "struct sock" but
+ * "struct inet_timewait_sock" which is missing fields.
+ * So we ignore it.
+ */
+ if (sk && sk->sk_state == TCP_TIME_WAIT)
+ sk = NULL;
+ if (sk == NULL) {
+ /*
+ * A missing sk->sk_socket happens when packets are in-flight
+ * and the matching socket is already closed and gone.
+ */
+ sk = qtaguid_find_sk(skb, par);
+ /*
+ * TCP_NEW_SYN_RECV are not "struct sock" but "struct request_sock"
+ * where we can get a pointer to a full socket to retrieve uid/gid.
+ * When in TCP_TIME_WAIT, sk is a struct inet_timewait_sock
+ * which is missing fields and does not contain any reference
+ * to a full socket, so just ignore the socket.
+ */
+ if (sk && sk->sk_state == TCP_NEW_SYN_RECV) {
+ sock_gen_put(sk);
+ sk = sk_to_full_sk(sk);
+ } else if (sk && (!sk_fullsock(sk) || sk->sk_state == TCP_TIME_WAIT)) {
+ sock_gen_put(sk);
+ sk = NULL;
+ } else {
+ /*
+ * If we got the socket from the find_sk(), we will need to put
+ * it back, as nf_tproxy_get_sock_v4() got it.
+ */
+ got_sock = sk;
+ }
+ if (sk)
+ atomic64_inc(&qtu_events.match_found_sk_in_ct);
+ else
+ atomic64_inc(&qtu_events.match_found_no_sk_in_ct);
+ } else {
+ atomic64_inc(&qtu_events.match_found_sk);
+ }
+ MT_DEBUG("qtaguid[%d]: sk=%p got_sock=%d fam=%d proto=%d\n",
+ par->hooknum, sk, got_sock, par->family, ipx_proto(skb, par));
+
+ if (!sk) {
+ /*
+ * Here, the qtaguid_find_sk() using connection tracking
+ * couldn't find the owner, so for now we just count them
+ * against the system.
+ */
+ if (do_tag_stat)
+ account_for_uid(skb, sk, 0, par);
+ MT_DEBUG("qtaguid[%d]: leaving (sk=NULL)\n", par->hooknum);
+ res = (info->match ^ info->invert) == 0;
+ atomic64_inc(&qtu_events.match_no_sk);
+ goto put_sock_ret_res;
+ } else if (info->match & info->invert & XT_QTAGUID_SOCKET) {
+ res = false;
+ goto put_sock_ret_res;
+ }
+ sock_uid = sk->sk_uid;
+ if (do_tag_stat)
+ account_for_uid(skb, sk, from_kuid(&init_user_ns, sock_uid),
+ par);
+
+ /*
+ * The following two tests fail the match when:
+ * id not in range AND no inverted condition requested
+ * or id in range AND inverted condition requested
+ * Thus (!a && b) || (a && !b) == a ^ b
+ */
+ if (info->match & XT_QTAGUID_UID) {
+ kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min);
+ kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max);
+
+ if ((uid_gte(sock_uid, uid_min) &&
+ uid_lte(sock_uid, uid_max)) ^
+ !(info->invert & XT_QTAGUID_UID)) {
+ MT_DEBUG("qtaguid[%d]: leaving uid not matching\n",
+ par->hooknum);
+ res = false;
+ goto put_sock_ret_res;
+ }
+ }
+ if (info->match & XT_QTAGUID_GID) {
+ kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min);
+ kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max);
+ set_sk_callback_lock = true;
+ read_lock_bh(&sk->sk_callback_lock);
+ MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n",
+ par->hooknum, sk, sk->sk_socket,
+ sk->sk_socket ? sk->sk_socket->file : (void *)-1LL);
+ filp = sk->sk_socket ? sk->sk_socket->file : NULL;
+ if (!filp) {
+ res = ((info->match ^ info->invert) &
+ XT_QTAGUID_GID) == 0;
+ atomic64_inc(&qtu_events.match_no_sk_gid);
+ goto put_sock_ret_res;
+ }
+ MT_DEBUG("qtaguid[%d]: filp...uid=%u\n",
+ par->hooknum, filp ?
+ from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1);
+ if ((gid_gte(filp->f_cred->fsgid, gid_min) &&
+ gid_lte(filp->f_cred->fsgid, gid_max)) ^
+ !(info->invert & XT_QTAGUID_GID)) {
+ MT_DEBUG("qtaguid[%d]: leaving gid not matching\n",
+ par->hooknum);
+ res = false;
+ goto put_sock_ret_res;
+ }
+ }
+ MT_DEBUG("qtaguid[%d]: leaving matched\n", par->hooknum);
+ res = true;
+
+put_sock_ret_res:
+ if (got_sock)
+ sock_gen_put(sk);
+ if (set_sk_callback_lock)
+ read_unlock_bh(&sk->sk_callback_lock);
+ret_res:
+ MT_DEBUG("qtaguid[%d]: left %d\n", par->hooknum, res);
+ return res;
+}
+
+#ifdef DDEBUG
+/*
+ * This function is not in xt_qtaguid_print.c because of locks visibility.
+ * The lock of sock_tag_list must be aquired before calling this function
+ */
+static void prdebug_full_state_locked(int indent_level, const char *fmt, ...)
+{
+ va_list args;
+ char *fmt_buff;
+ char *buff;
+
+ if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+ return;
+
+ fmt_buff = kasprintf(GFP_ATOMIC,
+ "qtaguid: %s(): %s {\n", __func__, fmt);
+ BUG_ON(!fmt_buff);
+ va_start(args, fmt);
+ buff = kvasprintf(GFP_ATOMIC,
+ fmt_buff, args);
+ BUG_ON(!buff);
+ pr_debug("%s", buff);
+ kfree(fmt_buff);
+ kfree(buff);
+ va_end(args);
+
+ prdebug_sock_tag_tree(indent_level, &sock_tag_tree);
+
+ spin_lock_bh(&uid_tag_data_tree_lock);
+ prdebug_uid_tag_data_tree(indent_level, &uid_tag_data_tree);
+ prdebug_proc_qtu_data_tree(indent_level, &proc_qtu_data_tree);
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+
+ spin_lock_bh(&iface_stat_list_lock);
+ prdebug_iface_stat_list(indent_level, &iface_stat_list);
+ spin_unlock_bh(&iface_stat_list_lock);
+
+ pr_debug("qtaguid: %s(): }\n", __func__);
+}
+#else
+static void prdebug_full_state_locked(int indent_level, const char *fmt, ...) {}
+#endif
+
+struct proc_ctrl_print_info {
+ struct sock *sk; /* socket found by reading to sk_pos */
+ loff_t sk_pos;
+};
+
+static void *qtaguid_ctrl_proc_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct proc_ctrl_print_info *pcpi = m->private;
+ struct sock_tag *sock_tag_entry = v;
+ struct rb_node *node;
+
+ (*pos)++;
+
+ if (!v || v == SEQ_START_TOKEN)
+ return NULL;
+
+ node = rb_next(&sock_tag_entry->sock_node);
+ if (!node) {
+ pcpi->sk = NULL;
+ sock_tag_entry = SEQ_START_TOKEN;
+ } else {
+ sock_tag_entry = rb_entry(node, struct sock_tag, sock_node);
+ pcpi->sk = sock_tag_entry->sk;
+ }
+ pcpi->sk_pos = *pos;
+ return sock_tag_entry;
+}
+
+static void *qtaguid_ctrl_proc_start(struct seq_file *m, loff_t *pos)
+{
+ struct proc_ctrl_print_info *pcpi = m->private;
+ struct sock_tag *sock_tag_entry;
+ struct rb_node *node;
+
+ spin_lock_bh(&sock_tag_list_lock);
+
+ if (unlikely(module_passive))
+ return NULL;
+
+ if (*pos == 0) {
+ pcpi->sk_pos = 0;
+ node = rb_first(&sock_tag_tree);
+ if (!node) {
+ pcpi->sk = NULL;
+ return SEQ_START_TOKEN;
+ }
+ sock_tag_entry = rb_entry(node, struct sock_tag, sock_node);
+ pcpi->sk = sock_tag_entry->sk;
+ } else {
+ sock_tag_entry = (pcpi->sk ? get_sock_stat_nl(pcpi->sk) :
+ NULL) ?: SEQ_START_TOKEN;
+ if (*pos != pcpi->sk_pos) {
+ /* seq_read skipped a next call */
+ *pos = pcpi->sk_pos;
+ return qtaguid_ctrl_proc_next(m, sock_tag_entry, pos);
+ }
+ }
+ return sock_tag_entry;
+}
+
+static void qtaguid_ctrl_proc_stop(struct seq_file *m, void *v)
+{
+ spin_unlock_bh(&sock_tag_list_lock);
+}
+
+/*
+ * Procfs reader to get all active socket tags using style "1)" as described in
+ * fs/proc/generic.c
+ */
+static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v)
+{
+ struct sock_tag *sock_tag_entry = v;
+ uid_t uid;
+
+ CT_DEBUG("qtaguid: proc ctrl pid=%u tgid=%u uid=%u\n",
+ current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+
+ if (sock_tag_entry != SEQ_START_TOKEN) {
+ int sk_ref_count;
+ uid = get_uid_from_tag(sock_tag_entry->tag);
+ CT_DEBUG("qtaguid: proc_read(): sk=%p tag=0x%llx (uid=%u) "
+ "pid=%u\n",
+ sock_tag_entry->sk,
+ sock_tag_entry->tag,
+ uid,
+ sock_tag_entry->pid
+ );
+ sk_ref_count = atomic_read(
+ &sock_tag_entry->sk->sk_refcnt);
+ seq_printf(m, "sock=%pK tag=0x%llx (uid=%u) pid=%u "
+ "f_count=%d\n",
+ sock_tag_entry->sk,
+ sock_tag_entry->tag, uid,
+ sock_tag_entry->pid, sk_ref_count);
+ } else {
+ seq_printf(m, "events: sockets_tagged=%llu "
+ "sockets_untagged=%llu "
+ "counter_set_changes=%llu "
+ "delete_cmds=%llu "
+ "iface_events=%llu "
+ "match_calls=%llu "
+ "match_calls_prepost=%llu "
+ "match_found_sk=%llu "
+ "match_found_sk_in_ct=%llu "
+ "match_found_no_sk_in_ct=%llu "
+ "match_no_sk=%llu "
+ "match_no_sk_gid=%llu\n",
+ (u64)atomic64_read(&qtu_events.sockets_tagged),
+ (u64)atomic64_read(&qtu_events.sockets_untagged),
+ (u64)atomic64_read(&qtu_events.counter_set_changes),
+ (u64)atomic64_read(&qtu_events.delete_cmds),
+ (u64)atomic64_read(&qtu_events.iface_events),
+ (u64)atomic64_read(&qtu_events.match_calls),
+ (u64)atomic64_read(&qtu_events.match_calls_prepost),
+ (u64)atomic64_read(&qtu_events.match_found_sk),
+ (u64)atomic64_read(&qtu_events.match_found_sk_in_ct),
+ (u64)atomic64_read(&qtu_events.match_found_no_sk_in_ct),
+ (u64)atomic64_read(&qtu_events.match_no_sk),
+ (u64)atomic64_read(&qtu_events.match_no_sk_gid));
+
+ /* Count the following as part of the last item_index. No need
+ * to lock the sock_tag_list here since it is already locked when
+ * starting the seq_file operation
+ */
+ prdebug_full_state_locked(0, "proc ctrl");
+ }
+
+ return 0;
+}
+
+/*
+ * Delete socket tags, and stat tags associated with a given
+ * accouting tag and uid.
+ */
+static int ctrl_cmd_delete(const char *input)
+{
+ char cmd;
+ int uid_int;
+ kuid_t uid;
+ uid_t entry_uid;
+ tag_t acct_tag;
+ tag_t tag;
+ int res, argc;
+ struct iface_stat *iface_entry;
+ struct rb_node *node;
+ struct sock_tag *st_entry;
+ struct rb_root st_to_free_tree = RB_ROOT;
+ struct tag_stat *ts_entry;
+ struct tag_counter_set *tcs_entry;
+ struct tag_ref *tr_entry;
+ struct uid_tag_data *utd_entry;
+
+ argc = sscanf(input, "%c %llu %u", &cmd, &acct_tag, &uid_int);
+ uid = make_kuid(&init_user_ns, uid_int);
+ CT_DEBUG("qtaguid: ctrl_delete(%s): argc=%d cmd=%c "
+ "user_tag=0x%llx uid=%u\n", input, argc, cmd,
+ acct_tag, uid_int);
+ if (argc < 2) {
+ res = -EINVAL;
+ goto err;
+ }
+ if (!valid_atag(acct_tag)) {
+ pr_info("qtaguid: ctrl_delete(%s): invalid tag\n", input);
+ res = -EINVAL;
+ goto err;
+ }
+ if (argc < 3) {
+ uid = current_fsuid();
+ uid_int = from_kuid(&init_user_ns, uid);
+ } else if (!can_impersonate_uid(uid)) {
+ pr_info("qtaguid: ctrl_delete(%s): "
+ "insufficient priv from pid=%u tgid=%u uid=%u\n",
+ input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+ res = -EPERM;
+ goto err;
+ }
+
+ tag = combine_atag_with_uid(acct_tag, uid_int);
+ CT_DEBUG("qtaguid: ctrl_delete(%s): "
+ "looking for tag=0x%llx (uid=%u)\n",
+ input, tag, uid_int);
+
+ /* Delete socket tags */
+ spin_lock_bh(&sock_tag_list_lock);
+ spin_lock_bh(&uid_tag_data_tree_lock);
+ node = rb_first(&sock_tag_tree);
+ while (node) {
+ st_entry = rb_entry(node, struct sock_tag, sock_node);
+ entry_uid = get_uid_from_tag(st_entry->tag);
+ node = rb_next(node);
+ if (entry_uid != uid_int)
+ continue;
+
+ CT_DEBUG("qtaguid: ctrl_delete(%s): st tag=0x%llx (uid=%u)\n",
+ input, st_entry->tag, entry_uid);
+
+ if (!acct_tag || st_entry->tag == tag) {
+ rb_erase(&st_entry->sock_node, &sock_tag_tree);
+ /* Can't sockfd_put() within spinlock, do it later. */
+ sock_tag_tree_insert(st_entry, &st_to_free_tree);
+ tr_entry = lookup_tag_ref(st_entry->tag, NULL);
+ BUG_ON(tr_entry->num_sock_tags <= 0);
+ tr_entry->num_sock_tags--;
+ /*
+ * TODO: remove if, and start failing.
+ * This is a hack to work around the fact that in some
+ * places we have "if (IS_ERR_OR_NULL(pqd_entry))"
+ * and are trying to work around apps
+ * that didn't open the /dev/xt_qtaguid.
+ */
+ if (st_entry->list.next && st_entry->list.prev)
+ list_del(&st_entry->list);
+ }
+ }
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+ spin_unlock_bh(&sock_tag_list_lock);
+
+ sock_tag_tree_erase(&st_to_free_tree);
+
+ /* Delete tag counter-sets */
+ spin_lock_bh(&tag_counter_set_list_lock);
+ /* Counter sets are only on the uid tag, not full tag */
+ tcs_entry = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
+ if (tcs_entry) {
+ CT_DEBUG("qtaguid: ctrl_delete(%s): "
+ "erase tcs: tag=0x%llx (uid=%u) set=%d\n",
+ input,
+ tcs_entry->tn.tag,
+ get_uid_from_tag(tcs_entry->tn.tag),
+ tcs_entry->active_set);
+ rb_erase(&tcs_entry->tn.node, &tag_counter_set_tree);
+ kfree(tcs_entry);
+ }
+ spin_unlock_bh(&tag_counter_set_list_lock);
+
+ /*
+ * If acct_tag is 0, then all entries belonging to uid are
+ * erased.
+ */
+ spin_lock_bh(&iface_stat_list_lock);
+ list_for_each_entry(iface_entry, &iface_stat_list, list) {
+ spin_lock_bh(&iface_entry->tag_stat_list_lock);
+ node = rb_first(&iface_entry->tag_stat_tree);
+ while (node) {
+ ts_entry = rb_entry(node, struct tag_stat, tn.node);
+ entry_uid = get_uid_from_tag(ts_entry->tn.tag);
+ node = rb_next(node);
+
+ CT_DEBUG("qtaguid: ctrl_delete(%s): "
+ "ts tag=0x%llx (uid=%u)\n",
+ input, ts_entry->tn.tag, entry_uid);
+
+ if (entry_uid != uid_int)
+ continue;
+ if (!acct_tag || ts_entry->tn.tag == tag) {
+ CT_DEBUG("qtaguid: ctrl_delete(%s): "
+ "erase ts: %s 0x%llx %u\n",
+ input, iface_entry->ifname,
+ get_atag_from_tag(ts_entry->tn.tag),
+ entry_uid);
+ rb_erase(&ts_entry->tn.node,
+ &iface_entry->tag_stat_tree);
+ kfree(ts_entry);
+ }
+ }
+ spin_unlock_bh(&iface_entry->tag_stat_list_lock);
+ }
+ spin_unlock_bh(&iface_stat_list_lock);
+
+ /* Cleanup the uid_tag_data */
+ spin_lock_bh(&uid_tag_data_tree_lock);
+ node = rb_first(&uid_tag_data_tree);
+ while (node) {
+ utd_entry = rb_entry(node, struct uid_tag_data, node);
+ entry_uid = utd_entry->uid;
+ node = rb_next(node);
+
+ CT_DEBUG("qtaguid: ctrl_delete(%s): "
+ "utd uid=%u\n",
+ input, entry_uid);
+
+ if (entry_uid != uid_int)
+ continue;
+ /*
+ * Go over the tag_refs, and those that don't have
+ * sock_tags using them are freed.
+ */
+ put_tag_ref_tree(tag, utd_entry);
+ put_utd_entry(utd_entry);
+ }
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+
+ atomic64_inc(&qtu_events.delete_cmds);
+ res = 0;
+
+err:
+ return res;
+}
+
+static int ctrl_cmd_counter_set(const char *input)
+{
+ char cmd;
+ uid_t uid = 0;
+ tag_t tag;
+ int res, argc;
+ struct tag_counter_set *tcs;
+ int counter_set;
+
+ argc = sscanf(input, "%c %d %u", &cmd, &counter_set, &uid);
+ CT_DEBUG("qtaguid: ctrl_counterset(%s): argc=%d cmd=%c "
+ "set=%d uid=%u\n", input, argc, cmd,
+ counter_set, uid);
+ if (argc != 3) {
+ res = -EINVAL;
+ goto err;
+ }
+ if (counter_set < 0 || counter_set >= IFS_MAX_COUNTER_SETS) {
+ pr_info("qtaguid: ctrl_counterset(%s): invalid counter_set range\n",
+ input);
+ res = -EINVAL;
+ goto err;
+ }
+ if (!can_manipulate_uids()) {
+ pr_info("qtaguid: ctrl_counterset(%s): "
+ "insufficient priv from pid=%u tgid=%u uid=%u\n",
+ input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+ res = -EPERM;
+ goto err;
+ }
+
+ tag = make_tag_from_uid(uid);
+ spin_lock_bh(&tag_counter_set_list_lock);
+ tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
+ if (!tcs) {
+ tcs = kzalloc(sizeof(*tcs), GFP_ATOMIC);
+ if (!tcs) {
+ spin_unlock_bh(&tag_counter_set_list_lock);
+ pr_err("qtaguid: ctrl_counterset(%s): "
+ "failed to alloc counter set\n",
+ input);
+ res = -ENOMEM;
+ goto err;
+ }
+ tcs->tn.tag = tag;
+ tag_counter_set_tree_insert(tcs, &tag_counter_set_tree);
+ CT_DEBUG("qtaguid: ctrl_counterset(%s): added tcs tag=0x%llx "
+ "(uid=%u) set=%d\n",
+ input, tag, get_uid_from_tag(tag), counter_set);
+ }
+ tcs->active_set = counter_set;
+ spin_unlock_bh(&tag_counter_set_list_lock);
+ atomic64_inc(&qtu_events.counter_set_changes);
+ res = 0;
+
+err:
+ return res;
+}
+
+static int ctrl_cmd_tag(const char *input)
+{
+ char cmd;
+ int sock_fd = 0;
+ kuid_t uid;
+ unsigned int uid_int = 0;
+ tag_t acct_tag = make_atag_from_value(0);
+ tag_t full_tag;
+ struct socket *el_socket;
+ int res, argc;
+ struct sock_tag *sock_tag_entry;
+ struct tag_ref *tag_ref_entry;
+ struct uid_tag_data *uid_tag_data_entry;
+ struct proc_qtu_data *pqd_entry;
+
+ /* Unassigned args will get defaulted later. */
+ argc = sscanf(input, "%c %d %llu %u", &cmd, &sock_fd, &acct_tag, &uid_int);
+ uid = make_kuid(&init_user_ns, uid_int);
+ CT_DEBUG("qtaguid: ctrl_tag(%s): argc=%d cmd=%c sock_fd=%d "
+ "acct_tag=0x%llx uid=%u\n", input, argc, cmd, sock_fd,
+ acct_tag, uid_int);
+ if (argc < 2) {
+ res = -EINVAL;
+ goto err;
+ }
+ el_socket = sockfd_lookup(sock_fd, &res); /* This locks the file */
+ if (!el_socket) {
+ pr_info("qtaguid: ctrl_tag(%s): failed to lookup"
+ " sock_fd=%d err=%d pid=%u tgid=%u uid=%u\n",
+ input, sock_fd, res, current->pid, current->tgid,
+ from_kuid(&init_user_ns, current_fsuid()));
+ goto err;
+ }
+ CT_DEBUG("qtaguid: ctrl_tag(%s): socket->...->sk_refcnt=%d ->sk=%p\n",
+ input, atomic_read(&el_socket->sk->sk_refcnt),
+ el_socket->sk);
+ if (argc < 3) {
+ acct_tag = make_atag_from_value(0);
+ } else if (!valid_atag(acct_tag)) {
+ pr_info("qtaguid: ctrl_tag(%s): invalid tag\n", input);
+ res = -EINVAL;
+ goto err_put;
+ }
+ CT_DEBUG("qtaguid: ctrl_tag(%s): "
+ "pid=%u tgid=%u uid=%u euid=%u fsuid=%u "
+ "ctrl.gid=%u in_group()=%d in_egroup()=%d\n",
+ input, current->pid, current->tgid,
+ from_kuid(&init_user_ns, current_uid()),
+ from_kuid(&init_user_ns, current_euid()),
+ from_kuid(&init_user_ns, current_fsuid()),
+ from_kgid(&init_user_ns, xt_qtaguid_ctrl_file->gid),
+ in_group_p(xt_qtaguid_ctrl_file->gid),
+ in_egroup_p(xt_qtaguid_ctrl_file->gid));
+ if (argc < 4) {
+ uid = current_fsuid();
+ uid_int = from_kuid(&init_user_ns, uid);
+ } else if (!can_impersonate_uid(uid)) {
+ pr_info("qtaguid: ctrl_tag(%s): "
+ "insufficient priv from pid=%u tgid=%u uid=%u\n",
+ input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+ res = -EPERM;
+ goto err_put;
+ }
+ full_tag = combine_atag_with_uid(acct_tag, uid_int);
+
+ spin_lock_bh(&sock_tag_list_lock);
+ spin_lock_bh(&uid_tag_data_tree_lock);
+ sock_tag_entry = get_sock_stat_nl(el_socket->sk);
+ tag_ref_entry = get_tag_ref(full_tag, &uid_tag_data_entry);
+ if (IS_ERR(tag_ref_entry)) {
+ res = PTR_ERR(tag_ref_entry);
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+ spin_unlock_bh(&sock_tag_list_lock);
+ goto err_put;
+ }
+ tag_ref_entry->num_sock_tags++;
+ if (sock_tag_entry) {
+ struct tag_ref *prev_tag_ref_entry;
+
+ CT_DEBUG("qtaguid: ctrl_tag(%s): retag for sk=%p "
+ "st@%p ...->sk_refcnt=%d\n",
+ input, el_socket->sk, sock_tag_entry,
+ atomic_read(&el_socket->sk->sk_refcnt));
+ prev_tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag,
+ &uid_tag_data_entry);
+ BUG_ON(IS_ERR_OR_NULL(prev_tag_ref_entry));
+ BUG_ON(prev_tag_ref_entry->num_sock_tags <= 0);
+ prev_tag_ref_entry->num_sock_tags--;
+ sock_tag_entry->tag = full_tag;
+ } else {
+ CT_DEBUG("qtaguid: ctrl_tag(%s): newtag for sk=%p\n",
+ input, el_socket->sk);
+ sock_tag_entry = kzalloc(sizeof(*sock_tag_entry),
+ GFP_ATOMIC);
+ if (!sock_tag_entry) {
+ pr_err("qtaguid: ctrl_tag(%s): "
+ "socket tag alloc failed\n",
+ input);
+ BUG_ON(tag_ref_entry->num_sock_tags <= 0);
+ tag_ref_entry->num_sock_tags--;
+ free_tag_ref_from_utd_entry(tag_ref_entry,
+ uid_tag_data_entry);
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+ spin_unlock_bh(&sock_tag_list_lock);
+ res = -ENOMEM;
+ goto err_put;
+ }
+ /*
+ * Hold the sk refcount here to make sure the sk pointer cannot
+ * be freed and reused
+ */
+ sock_hold(el_socket->sk);
+ sock_tag_entry->sk = el_socket->sk;
+ sock_tag_entry->pid = current->tgid;
+ sock_tag_entry->tag = combine_atag_with_uid(acct_tag, uid_int);
+ pqd_entry = proc_qtu_data_tree_search(
+ &proc_qtu_data_tree, current->tgid);
+ /*
+ * TODO: remove if, and start failing.
+ * At first, we want to catch user-space code that is not
+ * opening the /dev/xt_qtaguid.
+ */
+ if (IS_ERR_OR_NULL(pqd_entry))
+ pr_warn_once(
+ "qtaguid: %s(): "
+ "User space forgot to open /dev/xt_qtaguid? "
+ "pid=%u tgid=%u uid=%u\n", __func__,
+ current->pid, current->tgid,
+ from_kuid(&init_user_ns, current_fsuid()));
+ else
+ list_add(&sock_tag_entry->list,
+ &pqd_entry->sock_tag_list);
+
+ sock_tag_tree_insert(sock_tag_entry, &sock_tag_tree);
+ atomic64_inc(&qtu_events.sockets_tagged);
+ }
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+ spin_unlock_bh(&sock_tag_list_lock);
+ /* We keep the ref to the sk until it is untagged */
+ CT_DEBUG("qtaguid: ctrl_tag(%s): done st@%p ...->sk_refcnt=%d\n",
+ input, sock_tag_entry,
+ atomic_read(&el_socket->sk->sk_refcnt));
+ sockfd_put(el_socket);
+ return 0;
+
+err_put:
+ CT_DEBUG("qtaguid: ctrl_tag(%s): done. ...->sk_refcnt=%d\n",
+ input, atomic_read(&el_socket->sk->sk_refcnt) - 1);
+ /* Release the sock_fd that was grabbed by sockfd_lookup(). */
+ sockfd_put(el_socket);
+ return res;
+
+err:
+ CT_DEBUG("qtaguid: ctrl_tag(%s): done.\n", input);
+ return res;
+}
+
+static int ctrl_cmd_untag(const char *input)
+{
+ char cmd;
+ int sock_fd = 0;
+ struct socket *el_socket;
+ int res, argc;
+
+ argc = sscanf(input, "%c %d", &cmd, &sock_fd);
+ CT_DEBUG("qtaguid: ctrl_untag(%s): argc=%d cmd=%c sock_fd=%d\n",
+ input, argc, cmd, sock_fd);
+ if (argc < 2) {
+ res = -EINVAL;
+ return res;
+ }
+ el_socket = sockfd_lookup(sock_fd, &res); /* This locks the file */
+ if (!el_socket) {
+ pr_info("qtaguid: ctrl_untag(%s): failed to lookup"
+ " sock_fd=%d err=%d pid=%u tgid=%u uid=%u\n",
+ input, sock_fd, res, current->pid, current->tgid,
+ from_kuid(&init_user_ns, current_fsuid()));
+ return res;
+ }
+ CT_DEBUG("qtaguid: ctrl_untag(%s): socket->...->f_count=%ld ->sk=%p\n",
+ input, atomic_long_read(&el_socket->file->f_count),
+ el_socket->sk);
+ res = qtaguid_untag(el_socket, false);
+ sockfd_put(el_socket);
+ return res;
+}
+
+int qtaguid_untag(struct socket *el_socket, bool kernel)
+{
+ int res;
+ pid_t pid;
+ struct sock_tag *sock_tag_entry;
+ struct tag_ref *tag_ref_entry;
+ struct uid_tag_data *utd_entry;
+ struct proc_qtu_data *pqd_entry;
+
+ spin_lock_bh(&sock_tag_list_lock);
+ sock_tag_entry = get_sock_stat_nl(el_socket->sk);
+ if (!sock_tag_entry) {
+ spin_unlock_bh(&sock_tag_list_lock);
+ res = -EINVAL;
+ return res;
+ }
+ /*
+ * The socket already belongs to the current process
+ * so it can do whatever it wants to it.
+ */
+ rb_erase(&sock_tag_entry->sock_node, &sock_tag_tree);
+
+ tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag, &utd_entry);
+ BUG_ON(!tag_ref_entry);
+ BUG_ON(tag_ref_entry->num_sock_tags <= 0);
+ spin_lock_bh(&uid_tag_data_tree_lock);
+ if (kernel)
+ pid = sock_tag_entry->pid;
+ else
+ pid = current->tgid;
+ pqd_entry = proc_qtu_data_tree_search(
+ &proc_qtu_data_tree, pid);
+ /*
+ * TODO: remove if, and start failing.
+ * At first, we want to catch user-space code that is not
+ * opening the /dev/xt_qtaguid.
+ */
+ if (IS_ERR_OR_NULL(pqd_entry) || !sock_tag_entry->list.next) {
+ pr_warn_once("qtaguid: %s(): "
+ "User space forgot to open /dev/xt_qtaguid? "
+ "pid=%u tgid=%u sk_pid=%u, uid=%u\n", __func__,
+ current->pid, current->tgid, sock_tag_entry->pid,
+ from_kuid(&init_user_ns, current_fsuid()));
+ } else {
+ list_del(&sock_tag_entry->list);
+ }
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+ /*
+ * We don't free tag_ref from the utd_entry here,
+ * only during a cmd_delete().
+ */
+ tag_ref_entry->num_sock_tags--;
+ spin_unlock_bh(&sock_tag_list_lock);
+ /*
+ * Release the sock_fd that was grabbed at tag time.
+ */
+ sock_put(sock_tag_entry->sk);
+ CT_DEBUG("qtaguid: done. st@%p ...->sk_refcnt=%d\n",
+ sock_tag_entry,
+ atomic_read(&el_socket->sk->sk_refcnt));
+
+ kfree(sock_tag_entry);
+ atomic64_inc(&qtu_events.sockets_untagged);
+
+ return 0;
+}
+
+static ssize_t qtaguid_ctrl_parse(const char *input, size_t count)
+{
+ char cmd;
+ ssize_t res;
+
+ CT_DEBUG("qtaguid: ctrl(%s): pid=%u tgid=%u uid=%u\n",
+ input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+
+ cmd = input[0];
+ /* Collect params for commands */
+ switch (cmd) {
+ case 'd':
+ res = ctrl_cmd_delete(input);
+ break;
+
+ case 's':
+ res = ctrl_cmd_counter_set(input);
+ break;
+
+ case 't':
+ res = ctrl_cmd_tag(input);
+ break;
+
+ case 'u':
+ res = ctrl_cmd_untag(input);
+ break;
+
+ default:
+ res = -EINVAL;
+ goto err;
+ }
+ if (!res)
+ res = count;
+err:
+ CT_DEBUG("qtaguid: ctrl(%s): res=%zd\n", input, res);
+ return res;
+}
+
+#define MAX_QTAGUID_CTRL_INPUT_LEN 255
+static ssize_t qtaguid_ctrl_proc_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *offp)
+{
+ char input_buf[MAX_QTAGUID_CTRL_INPUT_LEN];
+
+ if (unlikely(module_passive))
+ return count;
+
+ if (count >= MAX_QTAGUID_CTRL_INPUT_LEN)
+ return -EINVAL;
+
+ if (copy_from_user(input_buf, buffer, count))
+ return -EFAULT;
+
+ input_buf[count] = '\0';
+ return qtaguid_ctrl_parse(input_buf, count);
+}
+
+struct proc_print_info {
+ struct iface_stat *iface_entry;
+ int item_index;
+ tag_t tag; /* tag found by reading to tag_pos */
+ off_t tag_pos;
+ int tag_item_index;
+};
+
+static void pp_stats_header(struct seq_file *m)
+{
+ seq_puts(m,
+ "idx iface acct_tag_hex uid_tag_int cnt_set "
+ "rx_bytes rx_packets "
+ "tx_bytes tx_packets "
+ "rx_tcp_bytes rx_tcp_packets "
+ "rx_udp_bytes rx_udp_packets "
+ "rx_other_bytes rx_other_packets "
+ "tx_tcp_bytes tx_tcp_packets "
+ "tx_udp_bytes tx_udp_packets "
+ "tx_other_bytes tx_other_packets\n");
+}
+
+static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry,
+ int cnt_set)
+{
+ struct data_counters *cnts;
+ tag_t tag = ts_entry->tn.tag;
+ uid_t stat_uid = get_uid_from_tag(tag);
+ struct proc_print_info *ppi = m->private;
+ /* Detailed tags are not available to everybody */
+ if (!can_read_other_uid_stats(make_kuid(&init_user_ns,stat_uid))) {
+ CT_DEBUG("qtaguid: stats line: "
+ "%s 0x%llx %u: insufficient priv "
+ "from pid=%u tgid=%u uid=%u stats.gid=%u\n",
+ ppi->iface_entry->ifname,
+ get_atag_from_tag(tag), stat_uid,
+ current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()),
+ from_kgid(&init_user_ns,xt_qtaguid_stats_file->gid));
+ return 0;
+ }
+ ppi->item_index++;
+ cnts = &ts_entry->counters;
+ seq_printf(m, "%d %s 0x%llx %u %u "
+ "%llu %llu "
+ "%llu %llu "
+ "%llu %llu "
+ "%llu %llu "
+ "%llu %llu "
+ "%llu %llu "
+ "%llu %llu "
+ "%llu %llu\n",
+ ppi->item_index,
+ ppi->iface_entry->ifname,
+ get_atag_from_tag(tag),
+ stat_uid,
+ cnt_set,
+ dc_sum_bytes(cnts, cnt_set, IFS_RX),
+ dc_sum_packets(cnts, cnt_set, IFS_RX),
+ dc_sum_bytes(cnts, cnt_set, IFS_TX),
+ dc_sum_packets(cnts, cnt_set, IFS_TX),
+ cnts->bpc[cnt_set][IFS_RX][IFS_TCP].bytes,
+ cnts->bpc[cnt_set][IFS_RX][IFS_TCP].packets,
+ cnts->bpc[cnt_set][IFS_RX][IFS_UDP].bytes,
+ cnts->bpc[cnt_set][IFS_RX][IFS_UDP].packets,
+ cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].bytes,
+ cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].packets,
+ cnts->bpc[cnt_set][IFS_TX][IFS_TCP].bytes,
+ cnts->bpc[cnt_set][IFS_TX][IFS_TCP].packets,
+ cnts->bpc[cnt_set][IFS_TX][IFS_UDP].bytes,
+ cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets,
+ cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes,
+ cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets);
+ return seq_has_overflowed(m) ? -ENOSPC : 1;
+}
+
+static bool pp_sets(struct seq_file *m, struct tag_stat *ts_entry)
+{
+ int ret;
+ int counter_set;
+ for (counter_set = 0; counter_set < IFS_MAX_COUNTER_SETS;
+ counter_set++) {
+ ret = pp_stats_line(m, ts_entry, counter_set);
+ if (ret < 0)
+ return false;
+ }
+ return true;
+}
+
+static int qtaguid_stats_proc_iface_stat_ptr_valid(struct iface_stat *ptr)
+{
+ struct iface_stat *iface_entry;
+
+ if (!ptr)
+ return false;
+
+ list_for_each_entry(iface_entry, &iface_stat_list, list)
+ if (iface_entry == ptr)
+ return true;
+ return false;
+}
+
+static void qtaguid_stats_proc_next_iface_entry(struct proc_print_info *ppi)
+{
+ spin_unlock_bh(&ppi->iface_entry->tag_stat_list_lock);
+ list_for_each_entry_continue(ppi->iface_entry, &iface_stat_list, list) {
+ spin_lock_bh(&ppi->iface_entry->tag_stat_list_lock);
+ return;
+ }
+ ppi->iface_entry = NULL;
+}
+
+static void *qtaguid_stats_proc_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct proc_print_info *ppi = m->private;
+ struct tag_stat *ts_entry;
+ struct rb_node *node;
+
+ if (!v) {
+ pr_err("qtaguid: %s(): unexpected v: NULL\n", __func__);
+ return NULL;
+ }
+
+ (*pos)++;
+
+ if (!ppi->iface_entry || unlikely(module_passive))
+ return NULL;
+
+ if (v == SEQ_START_TOKEN)
+ node = rb_first(&ppi->iface_entry->tag_stat_tree);
+ else
+ node = rb_next(&((struct tag_stat *)v)->tn.node);
+
+ while (!node) {
+ qtaguid_stats_proc_next_iface_entry(ppi);
+ if (!ppi->iface_entry)
+ return NULL;
+ node = rb_first(&ppi->iface_entry->tag_stat_tree);
+ }
+
+ ts_entry = rb_entry(node, struct tag_stat, tn.node);
+ ppi->tag = ts_entry->tn.tag;
+ ppi->tag_pos = *pos;
+ ppi->tag_item_index = ppi->item_index;
+ return ts_entry;
+}
+
+static void *qtaguid_stats_proc_start(struct seq_file *m, loff_t *pos)
+{
+ struct proc_print_info *ppi = m->private;
+ struct tag_stat *ts_entry = NULL;
+
+ spin_lock_bh(&iface_stat_list_lock);
+
+ if (*pos == 0) {
+ ppi->item_index = 1;
+ ppi->tag_pos = 0;
+ if (list_empty(&iface_stat_list)) {
+ ppi->iface_entry = NULL;
+ } else {
+ ppi->iface_entry = list_first_entry(&iface_stat_list,
+ struct iface_stat,
+ list);
+ spin_lock_bh(&ppi->iface_entry->tag_stat_list_lock);
+ }
+ return SEQ_START_TOKEN;
+ }
+ if (!qtaguid_stats_proc_iface_stat_ptr_valid(ppi->iface_entry)) {
+ if (ppi->iface_entry) {
+ pr_err("qtaguid: %s(): iface_entry %p not found\n",
+ __func__, ppi->iface_entry);
+ ppi->iface_entry = NULL;
+ }
+ return NULL;
+ }
+
+ spin_lock_bh(&ppi->iface_entry->tag_stat_list_lock);
+
+ if (!ppi->tag_pos) {
+ /* seq_read skipped first next call */
+ ts_entry = SEQ_START_TOKEN;
+ } else {
+ ts_entry = tag_stat_tree_search(
+ &ppi->iface_entry->tag_stat_tree, ppi->tag);
+ if (!ts_entry) {
+ pr_info("qtaguid: %s(): tag_stat.tag 0x%llx not found. Abort.\n",
+ __func__, ppi->tag);
+ return NULL;
+ }
+ }
+
+ if (*pos == ppi->tag_pos) { /* normal resume */
+ ppi->item_index = ppi->tag_item_index;
+ } else {
+ /* seq_read skipped a next call */
+ *pos = ppi->tag_pos;
+ ts_entry = qtaguid_stats_proc_next(m, ts_entry, pos);
+ }
+
+ return ts_entry;
+}
+
+static void qtaguid_stats_proc_stop(struct seq_file *m, void *v)
+{
+ struct proc_print_info *ppi = m->private;
+ if (ppi->iface_entry)
+ spin_unlock_bh(&ppi->iface_entry->tag_stat_list_lock);
+ spin_unlock_bh(&iface_stat_list_lock);
+}
+
+/*
+ * Procfs reader to get all tag stats using style "1)" as described in
+ * fs/proc/generic.c
+ * Groups all protocols tx/rx bytes.
+ */
+static int qtaguid_stats_proc_show(struct seq_file *m, void *v)
+{
+ struct tag_stat *ts_entry = v;
+
+ if (v == SEQ_START_TOKEN)
+ pp_stats_header(m);
+ else
+ pp_sets(m, ts_entry);
+
+ return 0;
+}
+
+/*------------------------------------------*/
+static int qtudev_open(struct inode *inode, struct file *file)
+{
+ struct uid_tag_data *utd_entry;
+ struct proc_qtu_data *pqd_entry;
+ struct proc_qtu_data *new_pqd_entry;
+ int res;
+ bool utd_entry_found;
+
+ if (unlikely(qtu_proc_handling_passive))
+ return 0;
+
+ DR_DEBUG("qtaguid: qtudev_open(): pid=%u tgid=%u uid=%u\n",
+ current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+
+ spin_lock_bh(&uid_tag_data_tree_lock);
+
+ /* Look for existing uid data, or alloc one. */
+ utd_entry = get_uid_data(from_kuid(&init_user_ns, current_fsuid()), &utd_entry_found);
+ if (IS_ERR_OR_NULL(utd_entry)) {
+ res = PTR_ERR(utd_entry);
+ goto err_unlock;
+ }
+
+ /* Look for existing PID based proc_data */
+ pqd_entry = proc_qtu_data_tree_search(&proc_qtu_data_tree,
+ current->tgid);
+ if (pqd_entry) {
+ pr_err("qtaguid: qtudev_open(): %u/%u %u "
+ "%s already opened\n",
+ current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()),
+ QTU_DEV_NAME);
+ res = -EBUSY;
+ goto err_unlock_free_utd;
+ }
+
+ new_pqd_entry = kzalloc(sizeof(*new_pqd_entry), GFP_ATOMIC);
+ if (!new_pqd_entry) {
+ pr_err("qtaguid: qtudev_open(): %u/%u %u: "
+ "proc data alloc failed\n",
+ current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+ res = -ENOMEM;
+ goto err_unlock_free_utd;
+ }
+ new_pqd_entry->pid = current->tgid;
+ INIT_LIST_HEAD(&new_pqd_entry->sock_tag_list);
+ new_pqd_entry->parent_tag_data = utd_entry;
+ utd_entry->num_pqd++;
+
+ proc_qtu_data_tree_insert(new_pqd_entry,
+ &proc_qtu_data_tree);
+
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+ DR_DEBUG("qtaguid: tracking data for uid=%u in pqd=%p\n",
+ from_kuid(&init_user_ns, current_fsuid()), new_pqd_entry);
+ file->private_data = new_pqd_entry;
+ return 0;
+
+err_unlock_free_utd:
+ if (!utd_entry_found) {
+ rb_erase(&utd_entry->node, &uid_tag_data_tree);
+ kfree(utd_entry);
+ }
+err_unlock:
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+ return res;
+}
+
+static int qtudev_release(struct inode *inode, struct file *file)
+{
+ struct proc_qtu_data *pqd_entry = file->private_data;
+ struct uid_tag_data *utd_entry = pqd_entry->parent_tag_data;
+ struct sock_tag *st_entry;
+ struct rb_root st_to_free_tree = RB_ROOT;
+ struct list_head *entry, *next;
+ struct tag_ref *tr;
+
+ if (unlikely(qtu_proc_handling_passive))
+ return 0;
+
+ /*
+ * Do not trust the current->pid, it might just be a kworker cleaning
+ * up after a dead proc.
+ */
+ DR_DEBUG("qtaguid: qtudev_release(): "
+ "pid=%u tgid=%u uid=%u "
+ "pqd_entry=%p->pid=%u utd_entry=%p->active_tags=%d\n",
+ current->pid, current->tgid, pqd_entry->parent_tag_data->uid,
+ pqd_entry, pqd_entry->pid, utd_entry,
+ utd_entry->num_active_tags);
+
+ spin_lock_bh(&sock_tag_list_lock);
+ spin_lock_bh(&uid_tag_data_tree_lock);
+
+ list_for_each_safe(entry, next, &pqd_entry->sock_tag_list) {
+ st_entry = list_entry(entry, struct sock_tag, list);
+ DR_DEBUG("qtaguid: %s(): "
+ "erase sock_tag=%p->sk=%p pid=%u tgid=%u uid=%u\n",
+ __func__,
+ st_entry, st_entry->sk,
+ current->pid, current->tgid,
+ pqd_entry->parent_tag_data->uid);
+
+ utd_entry = uid_tag_data_tree_search(
+ &uid_tag_data_tree,
+ get_uid_from_tag(st_entry->tag));
+ BUG_ON(IS_ERR_OR_NULL(utd_entry));
+ DR_DEBUG("qtaguid: %s(): "
+ "looking for tag=0x%llx in utd_entry=%p\n", __func__,
+ st_entry->tag, utd_entry);
+ tr = tag_ref_tree_search(&utd_entry->tag_ref_tree,
+ st_entry->tag);
+ BUG_ON(!tr);
+ BUG_ON(tr->num_sock_tags <= 0);
+ tr->num_sock_tags--;
+ free_tag_ref_from_utd_entry(tr, utd_entry);
+
+ rb_erase(&st_entry->sock_node, &sock_tag_tree);
+ list_del(&st_entry->list);
+ /* Can't sockfd_put() within spinlock, do it later. */
+ sock_tag_tree_insert(st_entry, &st_to_free_tree);
+
+ /*
+ * Try to free the utd_entry if no other proc_qtu_data is
+ * using it (num_pqd is 0) and it doesn't have active tags
+ * (num_active_tags is 0).
+ */
+ put_utd_entry(utd_entry);
+ }
+
+ rb_erase(&pqd_entry->node, &proc_qtu_data_tree);
+ BUG_ON(pqd_entry->parent_tag_data->num_pqd < 1);
+ pqd_entry->parent_tag_data->num_pqd--;
+ put_utd_entry(pqd_entry->parent_tag_data);
+ kfree(pqd_entry);
+ file->private_data = NULL;
+
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+ spin_unlock_bh(&sock_tag_list_lock);
+
+
+ sock_tag_tree_erase(&st_to_free_tree);
+
+ spin_lock_bh(&sock_tag_list_lock);
+ prdebug_full_state_locked(0, "%s(): pid=%u tgid=%u", __func__,
+ current->pid, current->tgid);
+ spin_unlock_bh(&sock_tag_list_lock);
+ return 0;
+}
+
+/*------------------------------------------*/
+static const struct file_operations qtudev_fops = {
+ .owner = THIS_MODULE,
+ .open = qtudev_open,
+ .release = qtudev_release,
+};
+
+static struct miscdevice qtu_device = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = QTU_DEV_NAME,
+ .fops = &qtudev_fops,
+ /* How sad it doesn't allow for defaults: .mode = S_IRUGO | S_IWUSR */
+};
+
+static const struct seq_operations proc_qtaguid_ctrl_seqops = {
+ .start = qtaguid_ctrl_proc_start,
+ .next = qtaguid_ctrl_proc_next,
+ .stop = qtaguid_ctrl_proc_stop,
+ .show = qtaguid_ctrl_proc_show,
+};
+
+static int proc_qtaguid_ctrl_open(struct inode *inode, struct file *file)
+{
+ return seq_open_private(file, &proc_qtaguid_ctrl_seqops,
+ sizeof(struct proc_ctrl_print_info));
+}
+
+static const struct file_operations proc_qtaguid_ctrl_fops = {
+ .open = proc_qtaguid_ctrl_open,
+ .read = seq_read,
+ .write = qtaguid_ctrl_proc_write,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static const struct seq_operations proc_qtaguid_stats_seqops = {
+ .start = qtaguid_stats_proc_start,
+ .next = qtaguid_stats_proc_next,
+ .stop = qtaguid_stats_proc_stop,
+ .show = qtaguid_stats_proc_show,
+};
+
+static int proc_qtaguid_stats_open(struct inode *inode, struct file *file)
+{
+ return seq_open_private(file, &proc_qtaguid_stats_seqops,
+ sizeof(struct proc_print_info));
+}
+
+static const struct file_operations proc_qtaguid_stats_fops = {
+ .open = proc_qtaguid_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+/*------------------------------------------*/
+static int __init qtaguid_proc_register(struct proc_dir_entry **res_procdir)
+{
+ int ret;
+ *res_procdir = proc_mkdir(module_procdirname, init_net.proc_net);
+ if (!*res_procdir) {
+ pr_err("qtaguid: failed to create proc/.../xt_qtaguid\n");
+ ret = -ENOMEM;
+ goto no_dir;
+ }
+
+ xt_qtaguid_ctrl_file = proc_create_data("ctrl", proc_ctrl_perms,
+ *res_procdir,
+ &proc_qtaguid_ctrl_fops,
+ NULL);
+ if (!xt_qtaguid_ctrl_file) {
+ pr_err("qtaguid: failed to create xt_qtaguid/ctrl "
+ " file\n");
+ ret = -ENOMEM;
+ goto no_ctrl_entry;
+ }
+
+ xt_qtaguid_stats_file = proc_create_data("stats", proc_stats_perms,
+ *res_procdir,
+ &proc_qtaguid_stats_fops,
+ NULL);
+ if (!xt_qtaguid_stats_file) {
+ pr_err("qtaguid: failed to create xt_qtaguid/stats "
+ "file\n");
+ ret = -ENOMEM;
+ goto no_stats_entry;
+ }
+ /*
+ * TODO: add support counter hacking
+ * xt_qtaguid_stats_file->write_proc = qtaguid_stats_proc_write;
+ */
+ return 0;
+
+no_stats_entry:
+ remove_proc_entry("ctrl", *res_procdir);
+no_ctrl_entry:
+ remove_proc_entry("xt_qtaguid", NULL);
+no_dir:
+ return ret;
+}
+
+static struct xt_match qtaguid_mt_reg __read_mostly = {
+ /*
+ * This module masquerades as the "owner" module so that iptables
+ * tools can deal with it.
+ */
+ .name = "owner",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .match = qtaguid_mt,
+ .matchsize = sizeof(struct xt_qtaguid_match_info),
+ .me = THIS_MODULE,
+};
+
+static int __init qtaguid_mt_init(void)
+{
+ if (qtaguid_proc_register(&xt_qtaguid_procdir)
+ || iface_stat_init(xt_qtaguid_procdir)
+ || xt_register_match(&qtaguid_mt_reg)
+ || misc_register(&qtu_device))
+ return -1;
+ return 0;
+}
+
+/*
+ * TODO: allow unloading of the module.
+ * For now stats are permanent.
+ * Kconfig forces'y/n' and never an 'm'.
+ */
+
+module_init(qtaguid_mt_init);
+MODULE_AUTHOR("jpa <jpa@google.com>");
+MODULE_DESCRIPTION("Xtables: socket owner+tag matching and associated stats");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_owner");
+MODULE_ALIAS("ip6t_owner");
+MODULE_ALIAS("ipt_qtaguid");
+MODULE_ALIAS("ip6t_qtaguid");
diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h
new file mode 100644
index 000000000000..c7052707a6a4
--- /dev/null
+++ b/net/netfilter/xt_qtaguid_internal.h
@@ -0,0 +1,350 @@
+/*
+ * Kernel iptables module to track stats for packets based on user tags.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __XT_QTAGUID_INTERNAL_H__
+#define __XT_QTAGUID_INTERNAL_H__
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock_types.h>
+#include <linux/workqueue.h>
+
+/* Iface handling */
+#define IDEBUG_MASK (1<<0)
+/* Iptable Matching. Per packet. */
+#define MDEBUG_MASK (1<<1)
+/* Red-black tree handling. Per packet. */
+#define RDEBUG_MASK (1<<2)
+/* procfs ctrl/stats handling */
+#define CDEBUG_MASK (1<<3)
+/* dev and resource tracking */
+#define DDEBUG_MASK (1<<4)
+
+/* E.g (IDEBUG_MASK | CDEBUG_MASK | DDEBUG_MASK) */
+#define DEFAULT_DEBUG_MASK 0
+
+/*
+ * (Un)Define these *DEBUG to compile out/in the pr_debug calls.
+ * All undef: text size ~ 0x3030; all def: ~ 0x4404.
+ */
+#define IDEBUG
+#define MDEBUG
+#define RDEBUG
+#define CDEBUG
+#define DDEBUG
+
+#define MSK_DEBUG(mask, ...) do { \
+ if (unlikely(qtaguid_debug_mask & (mask))) \
+ pr_debug(__VA_ARGS__); \
+ } while (0)
+#ifdef IDEBUG
+#define IF_DEBUG(...) MSK_DEBUG(IDEBUG_MASK, __VA_ARGS__)
+#else
+#define IF_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef MDEBUG
+#define MT_DEBUG(...) MSK_DEBUG(MDEBUG_MASK, __VA_ARGS__)
+#else
+#define MT_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef RDEBUG
+#define RB_DEBUG(...) MSK_DEBUG(RDEBUG_MASK, __VA_ARGS__)
+#else
+#define RB_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef CDEBUG
+#define CT_DEBUG(...) MSK_DEBUG(CDEBUG_MASK, __VA_ARGS__)
+#else
+#define CT_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef DDEBUG
+#define DR_DEBUG(...) MSK_DEBUG(DDEBUG_MASK, __VA_ARGS__)
+#else
+#define DR_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+
+extern uint qtaguid_debug_mask;
+
+/*---------------------------------------------------------------------------*/
+/*
+ * Tags:
+ *
+ * They represent what the data usage counters will be tracked against.
+ * By default a tag is just based on the UID.
+ * The UID is used as the base for policing, and can not be ignored.
+ * So a tag will always at least represent a UID (uid_tag).
+ *
+ * A tag can be augmented with an "accounting tag" which is associated
+ * with a UID.
+ * User space can set the acct_tag portion of the tag which is then used
+ * with sockets: all data belonging to that socket will be counted against the
+ * tag. The policing is then based on the tag's uid_tag portion,
+ * and stats are collected for the acct_tag portion separately.
+ *
+ * There could be
+ * a: {acct_tag=1, uid_tag=10003}
+ * b: {acct_tag=2, uid_tag=10003}
+ * c: {acct_tag=3, uid_tag=10003}
+ * d: {acct_tag=0, uid_tag=10003}
+ * a, b, and c represent tags associated with specific sockets.
+ * d is for the totals for that uid, including all untagged traffic.
+ * Typically d is used with policing/quota rules.
+ *
+ * We want tag_t big enough to distinguish uid_t and acct_tag.
+ * It might become a struct if needed.
+ * Nothing should be using it as an int.
+ */
+typedef uint64_t tag_t; /* Only used via accessors */
+
+#define TAG_UID_MASK 0xFFFFFFFFULL
+#define TAG_ACCT_MASK (~0xFFFFFFFFULL)
+
+static inline int tag_compare(tag_t t1, tag_t t2)
+{
+ return t1 < t2 ? -1 : t1 == t2 ? 0 : 1;
+}
+
+static inline tag_t combine_atag_with_uid(tag_t acct_tag, uid_t uid)
+{
+ return acct_tag | uid;
+}
+static inline tag_t make_tag_from_uid(uid_t uid)
+{
+ return uid;
+}
+static inline uid_t get_uid_from_tag(tag_t tag)
+{
+ return tag & TAG_UID_MASK;
+}
+static inline tag_t get_utag_from_tag(tag_t tag)
+{
+ return tag & TAG_UID_MASK;
+}
+static inline tag_t get_atag_from_tag(tag_t tag)
+{
+ return tag & TAG_ACCT_MASK;
+}
+
+static inline bool valid_atag(tag_t tag)
+{
+ return !(tag & TAG_UID_MASK);
+}
+static inline tag_t make_atag_from_value(uint32_t value)
+{
+ return (uint64_t)value << 32;
+}
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Maximum number of socket tags that a UID is allowed to have active.
+ * Multiple processes belonging to the same UID contribute towards this limit.
+ * Special UIDs that can impersonate a UID also contribute (e.g. download
+ * manager, ...)
+ */
+#define DEFAULT_MAX_SOCK_TAGS 1024
+
+/*
+ * For now we only track 2 sets of counters.
+ * The default set is 0.
+ * Userspace can activate another set for a given uid being tracked.
+ */
+#define IFS_MAX_COUNTER_SETS 2
+
+enum ifs_tx_rx {
+ IFS_TX,
+ IFS_RX,
+ IFS_MAX_DIRECTIONS
+};
+
+/* For now, TCP, UDP, the rest */
+enum ifs_proto {
+ IFS_TCP,
+ IFS_UDP,
+ IFS_PROTO_OTHER,
+ IFS_MAX_PROTOS
+};
+
+struct byte_packet_counters {
+ uint64_t bytes;
+ uint64_t packets;
+};
+
+struct data_counters {
+ struct byte_packet_counters bpc[IFS_MAX_COUNTER_SETS][IFS_MAX_DIRECTIONS][IFS_MAX_PROTOS];
+};
+
+static inline uint64_t dc_sum_bytes(struct data_counters *counters,
+ int set,
+ enum ifs_tx_rx direction)
+{
+ return counters->bpc[set][direction][IFS_TCP].bytes
+ + counters->bpc[set][direction][IFS_UDP].bytes
+ + counters->bpc[set][direction][IFS_PROTO_OTHER].bytes;
+}
+
+static inline uint64_t dc_sum_packets(struct data_counters *counters,
+ int set,
+ enum ifs_tx_rx direction)
+{
+ return counters->bpc[set][direction][IFS_TCP].packets
+ + counters->bpc[set][direction][IFS_UDP].packets
+ + counters->bpc[set][direction][IFS_PROTO_OTHER].packets;
+}
+
+
+/* Generic X based nodes used as a base for rb_tree ops */
+struct tag_node {
+ struct rb_node node;
+ tag_t tag;
+};
+
+struct tag_stat {
+ struct tag_node tn;
+ struct data_counters counters;
+ /*
+ * If this tag is acct_tag based, we need to count against the
+ * matching parent uid_tag.
+ */
+ struct data_counters *parent_counters;
+};
+
+struct iface_stat {
+ struct list_head list; /* in iface_stat_list */
+ char *ifname;
+ bool active;
+ /* net_dev is only valid for active iface_stat */
+ struct net_device *net_dev;
+
+ struct byte_packet_counters totals_via_dev[IFS_MAX_DIRECTIONS];
+ struct data_counters totals_via_skb;
+ /*
+ * We keep the last_known, because some devices reset their counters
+ * just before NETDEV_UP, while some will reset just before
+ * NETDEV_REGISTER (which is more normal).
+ * So now, if the device didn't do a NETDEV_UNREGISTER and we see
+ * its current dev stats smaller that what was previously known, we
+ * assume an UNREGISTER and just use the last_known.
+ */
+ struct byte_packet_counters last_known[IFS_MAX_DIRECTIONS];
+ /* last_known is usable when last_known_valid is true */
+ bool last_known_valid;
+
+ struct proc_dir_entry *proc_ptr;
+
+ struct rb_root tag_stat_tree;
+ spinlock_t tag_stat_list_lock;
+};
+
+/* This is needed to create proc_dir_entries from atomic context. */
+struct iface_stat_work {
+ struct work_struct iface_work;
+ struct iface_stat *iface_entry;
+};
+
+/*
+ * Track tag that this socket is transferring data for, and not necessarily
+ * the uid that owns the socket.
+ * This is the tag against which tag_stat.counters will be billed.
+ * These structs need to be looked up by sock and pid.
+ */
+struct sock_tag {
+ struct rb_node sock_node;
+ struct sock *sk; /* Only used as a number, never dereferenced */
+ /* Used to associate with a given pid */
+ struct list_head list; /* in proc_qtu_data.sock_tag_list */
+ pid_t pid;
+
+ tag_t tag;
+};
+
+struct qtaguid_event_counts {
+ /* Various successful events */
+ atomic64_t sockets_tagged;
+ atomic64_t sockets_untagged;
+ atomic64_t counter_set_changes;
+ atomic64_t delete_cmds;
+ atomic64_t iface_events; /* Number of NETDEV_* events handled */
+
+ atomic64_t match_calls; /* Number of times iptables called mt */
+ /* Number of times iptables called mt from pre or post routing hooks */
+ atomic64_t match_calls_prepost;
+ /*
+ * match_found_sk_*: numbers related to the netfilter matching
+ * function finding a sock for the sk_buff.
+ * Total skbs processed is sum(match_found*).
+ */
+ atomic64_t match_found_sk; /* An sk was already in the sk_buff. */
+ /* The connection tracker had or didn't have the sk. */
+ atomic64_t match_found_sk_in_ct;
+ atomic64_t match_found_no_sk_in_ct;
+ /*
+ * No sk could be found. No apparent owner. Could happen with
+ * unsolicited traffic.
+ */
+ atomic64_t match_no_sk;
+ /*
+ * The file ptr in the sk_socket wasn't there and we couldn't get GID.
+ * This might happen for traffic while the socket is being closed.
+ */
+ atomic64_t match_no_sk_gid;
+};
+
+/* Track the set active_set for the given tag. */
+struct tag_counter_set {
+ struct tag_node tn;
+ int active_set;
+};
+
+/*----------------------------------------------*/
+/*
+ * The qtu uid data is used to track resources that are created directly or
+ * indirectly by processes (uid tracked).
+ * It is shared by the processes with the same uid.
+ * Some of the resource will be counted to prevent further rogue allocations,
+ * some will need freeing once the owner process (uid) exits.
+ */
+struct uid_tag_data {
+ struct rb_node node;
+ uid_t uid;
+
+ /*
+ * For the uid, how many accounting tags have been set.
+ */
+ int num_active_tags;
+ /* Track the number of proc_qtu_data that reference it */
+ int num_pqd;
+ struct rb_root tag_ref_tree;
+ /* No tag_node_tree_lock; use uid_tag_data_tree_lock */
+};
+
+struct tag_ref {
+ struct tag_node tn;
+
+ /*
+ * This tracks the number of active sockets that have a tag on them
+ * which matches this tag_ref.tn.tag.
+ * A tag ref can live on after the sockets are untagged.
+ * A tag ref can only be removed during a tag delete command.
+ */
+ int num_sock_tags;
+};
+
+struct proc_qtu_data {
+ struct rb_node node;
+ pid_t pid;
+
+ struct uid_tag_data *parent_tag_data;
+
+ /* Tracks the sock_tags that need freeing upon this proc's death */
+ struct list_head sock_tag_list;
+ /* No spinlock_t sock_tag_list_lock; use the global one. */
+};
+
+/*----------------------------------------------*/
+#endif /* ifndef __XT_QTAGUID_INTERNAL_H__ */
diff --git a/net/netfilter/xt_qtaguid_print.c b/net/netfilter/xt_qtaguid_print.c
new file mode 100644
index 000000000000..2a7190d285e6
--- /dev/null
+++ b/net/netfilter/xt_qtaguid_print.c
@@ -0,0 +1,566 @@
+/*
+ * Pretty printing Support for iptables xt_qtaguid module.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * Most of the functions in this file just waste time if DEBUG is not defined.
+ * The matching xt_qtaguid_print.h will static inline empty funcs if the needed
+ * debug flags ore not defined.
+ * Those funcs that fail to allocate memory will panic as there is no need to
+ * hobble allong just pretending to do the requested work.
+ */
+
+#define DEBUG
+
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/net.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/spinlock_types.h>
+#include <net/sock.h>
+
+#include "xt_qtaguid_internal.h"
+#include "xt_qtaguid_print.h"
+
+#ifdef DDEBUG
+
+static void _bug_on_err_or_null(void *ptr)
+{
+ if (IS_ERR_OR_NULL(ptr)) {
+ pr_err("qtaguid: kmalloc failed\n");
+ BUG();
+ }
+}
+
+char *pp_tag_t(tag_t *tag)
+{
+ char *res;
+
+ if (!tag)
+ res = kasprintf(GFP_ATOMIC, "tag_t@null{}");
+ else
+ res = kasprintf(GFP_ATOMIC,
+ "tag_t@%p{tag=0x%llx, uid=%u}",
+ tag, *tag, get_uid_from_tag(*tag));
+ _bug_on_err_or_null(res);
+ return res;
+}
+
+char *pp_data_counters(struct data_counters *dc, bool showValues)
+{
+ char *res;
+
+ if (!dc)
+ res = kasprintf(GFP_ATOMIC, "data_counters@null{}");
+ else if (showValues)
+ res = kasprintf(
+ GFP_ATOMIC, "data_counters@%p{"
+ "set0{"
+ "rx{"
+ "tcp{b=%llu, p=%llu}, "
+ "udp{b=%llu, p=%llu},"
+ "other{b=%llu, p=%llu}}, "
+ "tx{"
+ "tcp{b=%llu, p=%llu}, "
+ "udp{b=%llu, p=%llu},"
+ "other{b=%llu, p=%llu}}}, "
+ "set1{"
+ "rx{"
+ "tcp{b=%llu, p=%llu}, "
+ "udp{b=%llu, p=%llu},"
+ "other{b=%llu, p=%llu}}, "
+ "tx{"
+ "tcp{b=%llu, p=%llu}, "
+ "udp{b=%llu, p=%llu},"
+ "other{b=%llu, p=%llu}}}}",
+ dc,
+ dc->bpc[0][IFS_RX][IFS_TCP].bytes,
+ dc->bpc[0][IFS_RX][IFS_TCP].packets,
+ dc->bpc[0][IFS_RX][IFS_UDP].bytes,
+ dc->bpc[0][IFS_RX][IFS_UDP].packets,
+ dc->bpc[0][IFS_RX][IFS_PROTO_OTHER].bytes,
+ dc->bpc[0][IFS_RX][IFS_PROTO_OTHER].packets,
+ dc->bpc[0][IFS_TX][IFS_TCP].bytes,
+ dc->bpc[0][IFS_TX][IFS_TCP].packets,
+ dc->bpc[0][IFS_TX][IFS_UDP].bytes,
+ dc->bpc[0][IFS_TX][IFS_UDP].packets,
+ dc->bpc[0][IFS_TX][IFS_PROTO_OTHER].bytes,
+ dc->bpc[0][IFS_TX][IFS_PROTO_OTHER].packets,
+ dc->bpc[1][IFS_RX][IFS_TCP].bytes,
+ dc->bpc[1][IFS_RX][IFS_TCP].packets,
+ dc->bpc[1][IFS_RX][IFS_UDP].bytes,
+ dc->bpc[1][IFS_RX][IFS_UDP].packets,
+ dc->bpc[1][IFS_RX][IFS_PROTO_OTHER].bytes,
+ dc->bpc[1][IFS_RX][IFS_PROTO_OTHER].packets,
+ dc->bpc[1][IFS_TX][IFS_TCP].bytes,
+ dc->bpc[1][IFS_TX][IFS_TCP].packets,
+ dc->bpc[1][IFS_TX][IFS_UDP].bytes,
+ dc->bpc[1][IFS_TX][IFS_UDP].packets,
+ dc->bpc[1][IFS_TX][IFS_PROTO_OTHER].bytes,
+ dc->bpc[1][IFS_TX][IFS_PROTO_OTHER].packets);
+ else
+ res = kasprintf(GFP_ATOMIC, "data_counters@%p{...}", dc);
+ _bug_on_err_or_null(res);
+ return res;
+}
+
+char *pp_tag_node(struct tag_node *tn)
+{
+ char *tag_str;
+ char *res;
+
+ if (!tn) {
+ res = kasprintf(GFP_ATOMIC, "tag_node@null{}");
+ _bug_on_err_or_null(res);
+ return res;
+ }
+ tag_str = pp_tag_t(&tn->tag);
+ res = kasprintf(GFP_ATOMIC,
+ "tag_node@%p{tag=%s}",
+ tn, tag_str);
+ _bug_on_err_or_null(res);
+ kfree(tag_str);
+ return res;
+}
+
+char *pp_tag_ref(struct tag_ref *tr)
+{
+ char *tn_str;
+ char *res;
+
+ if (!tr) {
+ res = kasprintf(GFP_ATOMIC, "tag_ref@null{}");
+ _bug_on_err_or_null(res);
+ return res;
+ }
+ tn_str = pp_tag_node(&tr->tn);
+ res = kasprintf(GFP_ATOMIC,
+ "tag_ref@%p{%s, num_sock_tags=%d}",
+ tr, tn_str, tr->num_sock_tags);
+ _bug_on_err_or_null(res);
+ kfree(tn_str);
+ return res;
+}
+
+char *pp_tag_stat(struct tag_stat *ts)
+{
+ char *tn_str;
+ char *counters_str;
+ char *parent_counters_str;
+ char *res;
+
+ if (!ts) {
+ res = kasprintf(GFP_ATOMIC, "tag_stat@null{}");
+ _bug_on_err_or_null(res);
+ return res;
+ }
+ tn_str = pp_tag_node(&ts->tn);
+ counters_str = pp_data_counters(&ts->counters, true);
+ parent_counters_str = pp_data_counters(ts->parent_counters, false);
+ res = kasprintf(GFP_ATOMIC,
+ "tag_stat@%p{%s, counters=%s, parent_counters=%s}",
+ ts, tn_str, counters_str, parent_counters_str);
+ _bug_on_err_or_null(res);
+ kfree(tn_str);
+ kfree(counters_str);
+ kfree(parent_counters_str);
+ return res;
+}
+
+char *pp_iface_stat(struct iface_stat *is)
+{
+ char *res;
+ if (!is) {
+ res = kasprintf(GFP_ATOMIC, "iface_stat@null{}");
+ } else {
+ struct data_counters *cnts = &is->totals_via_skb;
+ res = kasprintf(GFP_ATOMIC, "iface_stat@%p{"
+ "list=list_head{...}, "
+ "ifname=%s, "
+ "total_dev={rx={bytes=%llu, "
+ "packets=%llu}, "
+ "tx={bytes=%llu, "
+ "packets=%llu}}, "
+ "total_skb={rx={bytes=%llu, "
+ "packets=%llu}, "
+ "tx={bytes=%llu, "
+ "packets=%llu}}, "
+ "last_known_valid=%d, "
+ "last_known={rx={bytes=%llu, "
+ "packets=%llu}, "
+ "tx={bytes=%llu, "
+ "packets=%llu}}, "
+ "active=%d, "
+ "net_dev=%p, "
+ "proc_ptr=%p, "
+ "tag_stat_tree=rb_root{...}}",
+ is,
+ is->ifname,
+ is->totals_via_dev[IFS_RX].bytes,
+ is->totals_via_dev[IFS_RX].packets,
+ is->totals_via_dev[IFS_TX].bytes,
+ is->totals_via_dev[IFS_TX].packets,
+ dc_sum_bytes(cnts, 0, IFS_RX),
+ dc_sum_packets(cnts, 0, IFS_RX),
+ dc_sum_bytes(cnts, 0, IFS_TX),
+ dc_sum_packets(cnts, 0, IFS_TX),
+ is->last_known_valid,
+ is->last_known[IFS_RX].bytes,
+ is->last_known[IFS_RX].packets,
+ is->last_known[IFS_TX].bytes,
+ is->last_known[IFS_TX].packets,
+ is->active,
+ is->net_dev,
+ is->proc_ptr);
+ }
+ _bug_on_err_or_null(res);
+ return res;
+}
+
+char *pp_sock_tag(struct sock_tag *st)
+{
+ char *tag_str;
+ char *res;
+
+ if (!st) {
+ res = kasprintf(GFP_ATOMIC, "sock_tag@null{}");
+ _bug_on_err_or_null(res);
+ return res;
+ }
+ tag_str = pp_tag_t(&st->tag);
+ res = kasprintf(GFP_ATOMIC, "sock_tag@%p{"
+ "sock_node=rb_node{...}, "
+ "sk=%p (f_count=%d), list=list_head{...}, "
+ "pid=%u, tag=%s}",
+ st, st->sk, atomic_read(
+ &st->sk->sk_refcnt),
+ st->pid, tag_str);
+ _bug_on_err_or_null(res);
+ kfree(tag_str);
+ return res;
+}
+
+char *pp_uid_tag_data(struct uid_tag_data *utd)
+{
+ char *res;
+
+ if (!utd)
+ res = kasprintf(GFP_ATOMIC, "uid_tag_data@null{}");
+ else
+ res = kasprintf(GFP_ATOMIC, "uid_tag_data@%p{"
+ "uid=%u, num_active_acct_tags=%d, "
+ "num_pqd=%d, "
+ "tag_node_tree=rb_root{...}, "
+ "proc_qtu_data_tree=rb_root{...}}",
+ utd, utd->uid,
+ utd->num_active_tags, utd->num_pqd);
+ _bug_on_err_or_null(res);
+ return res;
+}
+
+char *pp_proc_qtu_data(struct proc_qtu_data *pqd)
+{
+ char *parent_tag_data_str;
+ char *res;
+
+ if (!pqd) {
+ res = kasprintf(GFP_ATOMIC, "proc_qtu_data@null{}");
+ _bug_on_err_or_null(res);
+ return res;
+ }
+ parent_tag_data_str = pp_uid_tag_data(pqd->parent_tag_data);
+ res = kasprintf(GFP_ATOMIC, "proc_qtu_data@%p{"
+ "node=rb_node{...}, pid=%u, "
+ "parent_tag_data=%s, "
+ "sock_tag_list=list_head{...}}",
+ pqd, pqd->pid, parent_tag_data_str
+ );
+ _bug_on_err_or_null(res);
+ kfree(parent_tag_data_str);
+ return res;
+}
+
+/*------------------------------------------*/
+void prdebug_sock_tag_tree(int indent_level,
+ struct rb_root *sock_tag_tree)
+{
+ struct rb_node *node;
+ struct sock_tag *sock_tag_entry;
+ char *str;
+
+ if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+ return;
+
+ if (RB_EMPTY_ROOT(sock_tag_tree)) {
+ str = "sock_tag_tree=rb_root{}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ return;
+ }
+
+ str = "sock_tag_tree=rb_root{";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ indent_level++;
+ for (node = rb_first(sock_tag_tree);
+ node;
+ node = rb_next(node)) {
+ sock_tag_entry = rb_entry(node, struct sock_tag, sock_node);
+ str = pp_sock_tag(sock_tag_entry);
+ pr_debug("%*d: %s,\n", indent_level*2, indent_level, str);
+ kfree(str);
+ }
+ indent_level--;
+ str = "}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_sock_tag_list(int indent_level,
+ struct list_head *sock_tag_list)
+{
+ struct sock_tag *sock_tag_entry;
+ char *str;
+
+ if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+ return;
+
+ if (list_empty(sock_tag_list)) {
+ str = "sock_tag_list=list_head{}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ return;
+ }
+
+ str = "sock_tag_list=list_head{";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ indent_level++;
+ list_for_each_entry(sock_tag_entry, sock_tag_list, list) {
+ str = pp_sock_tag(sock_tag_entry);
+ pr_debug("%*d: %s,\n", indent_level*2, indent_level, str);
+ kfree(str);
+ }
+ indent_level--;
+ str = "}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_proc_qtu_data_tree(int indent_level,
+ struct rb_root *proc_qtu_data_tree)
+{
+ char *str;
+ struct rb_node *node;
+ struct proc_qtu_data *proc_qtu_data_entry;
+
+ if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+ return;
+
+ if (RB_EMPTY_ROOT(proc_qtu_data_tree)) {
+ str = "proc_qtu_data_tree=rb_root{}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ return;
+ }
+
+ str = "proc_qtu_data_tree=rb_root{";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ indent_level++;
+ for (node = rb_first(proc_qtu_data_tree);
+ node;
+ node = rb_next(node)) {
+ proc_qtu_data_entry = rb_entry(node,
+ struct proc_qtu_data,
+ node);
+ str = pp_proc_qtu_data(proc_qtu_data_entry);
+ pr_debug("%*d: %s,\n", indent_level*2, indent_level,
+ str);
+ kfree(str);
+ indent_level++;
+ prdebug_sock_tag_list(indent_level,
+ &proc_qtu_data_entry->sock_tag_list);
+ indent_level--;
+
+ }
+ indent_level--;
+ str = "}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree)
+{
+ char *str;
+ struct rb_node *node;
+ struct tag_ref *tag_ref_entry;
+
+ if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+ return;
+
+ if (RB_EMPTY_ROOT(tag_ref_tree)) {
+ str = "tag_ref_tree{}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ return;
+ }
+
+ str = "tag_ref_tree{";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ indent_level++;
+ for (node = rb_first(tag_ref_tree);
+ node;
+ node = rb_next(node)) {
+ tag_ref_entry = rb_entry(node,
+ struct tag_ref,
+ tn.node);
+ str = pp_tag_ref(tag_ref_entry);
+ pr_debug("%*d: %s,\n", indent_level*2, indent_level,
+ str);
+ kfree(str);
+ }
+ indent_level--;
+ str = "}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_uid_tag_data_tree(int indent_level,
+ struct rb_root *uid_tag_data_tree)
+{
+ char *str;
+ struct rb_node *node;
+ struct uid_tag_data *uid_tag_data_entry;
+
+ if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+ return;
+
+ if (RB_EMPTY_ROOT(uid_tag_data_tree)) {
+ str = "uid_tag_data_tree=rb_root{}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ return;
+ }
+
+ str = "uid_tag_data_tree=rb_root{";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ indent_level++;
+ for (node = rb_first(uid_tag_data_tree);
+ node;
+ node = rb_next(node)) {
+ uid_tag_data_entry = rb_entry(node, struct uid_tag_data,
+ node);
+ str = pp_uid_tag_data(uid_tag_data_entry);
+ pr_debug("%*d: %s,\n", indent_level*2, indent_level, str);
+ kfree(str);
+ if (!RB_EMPTY_ROOT(&uid_tag_data_entry->tag_ref_tree)) {
+ indent_level++;
+ prdebug_tag_ref_tree(indent_level,
+ &uid_tag_data_entry->tag_ref_tree);
+ indent_level--;
+ }
+ }
+ indent_level--;
+ str = "}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_tag_stat_tree(int indent_level,
+ struct rb_root *tag_stat_tree)
+{
+ char *str;
+ struct rb_node *node;
+ struct tag_stat *ts_entry;
+
+ if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+ return;
+
+ if (RB_EMPTY_ROOT(tag_stat_tree)) {
+ str = "tag_stat_tree{}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ return;
+ }
+
+ str = "tag_stat_tree{";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ indent_level++;
+ for (node = rb_first(tag_stat_tree);
+ node;
+ node = rb_next(node)) {
+ ts_entry = rb_entry(node, struct tag_stat, tn.node);
+ str = pp_tag_stat(ts_entry);
+ pr_debug("%*d: %s\n", indent_level*2, indent_level,
+ str);
+ kfree(str);
+ }
+ indent_level--;
+ str = "}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_iface_stat_list(int indent_level,
+ struct list_head *iface_stat_list)
+{
+ char *str;
+ struct iface_stat *iface_entry;
+
+ if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+ return;
+
+ if (list_empty(iface_stat_list)) {
+ str = "iface_stat_list=list_head{}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ return;
+ }
+
+ str = "iface_stat_list=list_head{";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ indent_level++;
+ list_for_each_entry(iface_entry, iface_stat_list, list) {
+ str = pp_iface_stat(iface_entry);
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ kfree(str);
+
+ spin_lock_bh(&iface_entry->tag_stat_list_lock);
+ if (!RB_EMPTY_ROOT(&iface_entry->tag_stat_tree)) {
+ indent_level++;
+ prdebug_tag_stat_tree(indent_level,
+ &iface_entry->tag_stat_tree);
+ indent_level--;
+ }
+ spin_unlock_bh(&iface_entry->tag_stat_list_lock);
+ }
+ indent_level--;
+ str = "}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+#endif /* ifdef DDEBUG */
+/*------------------------------------------*/
+static const char * const netdev_event_strings[] = {
+ "netdev_unknown",
+ "NETDEV_UP",
+ "NETDEV_DOWN",
+ "NETDEV_REBOOT",
+ "NETDEV_CHANGE",
+ "NETDEV_REGISTER",
+ "NETDEV_UNREGISTER",
+ "NETDEV_CHANGEMTU",
+ "NETDEV_CHANGEADDR",
+ "NETDEV_GOING_DOWN",
+ "NETDEV_CHANGENAME",
+ "NETDEV_FEAT_CHANGE",
+ "NETDEV_BONDING_FAILOVER",
+ "NETDEV_PRE_UP",
+ "NETDEV_PRE_TYPE_CHANGE",
+ "NETDEV_POST_TYPE_CHANGE",
+ "NETDEV_POST_INIT",
+ "NETDEV_UNREGISTER_BATCH",
+ "NETDEV_RELEASE",
+ "NETDEV_NOTIFY_PEERS",
+ "NETDEV_JOIN",
+};
+
+const char *netdev_evt_str(int netdev_event)
+{
+ if (netdev_event < 0
+ || netdev_event >= ARRAY_SIZE(netdev_event_strings))
+ return "bad event num";
+ return netdev_event_strings[netdev_event];
+}
diff --git a/net/netfilter/xt_qtaguid_print.h b/net/netfilter/xt_qtaguid_print.h
new file mode 100644
index 000000000000..b63871a0be5a
--- /dev/null
+++ b/net/netfilter/xt_qtaguid_print.h
@@ -0,0 +1,120 @@
+/*
+ * Pretty printing Support for iptables xt_qtaguid module.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __XT_QTAGUID_PRINT_H__
+#define __XT_QTAGUID_PRINT_H__
+
+#include "xt_qtaguid_internal.h"
+
+#ifdef DDEBUG
+
+char *pp_tag_t(tag_t *tag);
+char *pp_data_counters(struct data_counters *dc, bool showValues);
+char *pp_tag_node(struct tag_node *tn);
+char *pp_tag_ref(struct tag_ref *tr);
+char *pp_tag_stat(struct tag_stat *ts);
+char *pp_iface_stat(struct iface_stat *is);
+char *pp_sock_tag(struct sock_tag *st);
+char *pp_uid_tag_data(struct uid_tag_data *qtd);
+char *pp_proc_qtu_data(struct proc_qtu_data *pqd);
+
+/*------------------------------------------*/
+void prdebug_sock_tag_list(int indent_level,
+ struct list_head *sock_tag_list);
+void prdebug_sock_tag_tree(int indent_level,
+ struct rb_root *sock_tag_tree);
+void prdebug_proc_qtu_data_tree(int indent_level,
+ struct rb_root *proc_qtu_data_tree);
+void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree);
+void prdebug_uid_tag_data_tree(int indent_level,
+ struct rb_root *uid_tag_data_tree);
+void prdebug_tag_stat_tree(int indent_level,
+ struct rb_root *tag_stat_tree);
+void prdebug_iface_stat_list(int indent_level,
+ struct list_head *iface_stat_list);
+
+#else
+
+/*------------------------------------------*/
+static inline char *pp_tag_t(tag_t *tag)
+{
+ return NULL;
+}
+static inline char *pp_data_counters(struct data_counters *dc, bool showValues)
+{
+ return NULL;
+}
+static inline char *pp_tag_node(struct tag_node *tn)
+{
+ return NULL;
+}
+static inline char *pp_tag_ref(struct tag_ref *tr)
+{
+ return NULL;
+}
+static inline char *pp_tag_stat(struct tag_stat *ts)
+{
+ return NULL;
+}
+static inline char *pp_iface_stat(struct iface_stat *is)
+{
+ return NULL;
+}
+static inline char *pp_sock_tag(struct sock_tag *st)
+{
+ return NULL;
+}
+static inline char *pp_uid_tag_data(struct uid_tag_data *qtd)
+{
+ return NULL;
+}
+static inline char *pp_proc_qtu_data(struct proc_qtu_data *pqd)
+{
+ return NULL;
+}
+
+/*------------------------------------------*/
+static inline
+void prdebug_sock_tag_list(int indent_level,
+ struct list_head *sock_tag_list)
+{
+}
+static inline
+void prdebug_sock_tag_tree(int indent_level,
+ struct rb_root *sock_tag_tree)
+{
+}
+static inline
+void prdebug_proc_qtu_data_tree(int indent_level,
+ struct rb_root *proc_qtu_data_tree)
+{
+}
+static inline
+void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree)
+{
+}
+static inline
+void prdebug_uid_tag_data_tree(int indent_level,
+ struct rb_root *uid_tag_data_tree)
+{
+}
+static inline
+void prdebug_tag_stat_tree(int indent_level,
+ struct rb_root *tag_stat_tree)
+{
+}
+static inline
+void prdebug_iface_stat_list(int indent_level,
+ struct list_head *iface_stat_list)
+{
+}
+#endif
+/*------------------------------------------*/
+const char *netdev_evt_str(int netdev_event);
+#endif /* ifndef __XT_QTAGUID_PRINT_H__ */
diff --git a/net/netfilter/xt_quota2.c b/net/netfilter/xt_quota2.c
new file mode 100644
index 000000000000..834594aa0085
--- /dev/null
+++ b/net/netfilter/xt_quota2.c
@@ -0,0 +1,401 @@
+/*
+ * xt_quota2 - enhanced xt_quota that can count upwards and in packets
+ * as a minimal accounting match.
+ * by Jan Engelhardt <jengelh@medozas.de>, 2008
+ *
+ * Originally based on xt_quota.c:
+ * netfilter module to enforce network quotas
+ * Sam Johnston <samj@samj.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License; either
+ * version 2 of the License, as published by the Free Software Foundation.
+ */
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_quota2.h>
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+/* For compatibility, these definitions are copied from the
+ * deprecated header file <linux/netfilter_ipv4/ipt_ULOG.h> */
+#define ULOG_MAC_LEN 80
+#define ULOG_PREFIX_LEN 32
+
+/* Format of the ULOG packets passed through netlink */
+typedef struct ulog_packet_msg {
+ unsigned long mark;
+ long timestamp_sec;
+ long timestamp_usec;
+ unsigned int hook;
+ char indev_name[IFNAMSIZ];
+ char outdev_name[IFNAMSIZ];
+ size_t data_len;
+ char prefix[ULOG_PREFIX_LEN];
+ unsigned char mac_len;
+ unsigned char mac[ULOG_MAC_LEN];
+ unsigned char payload[0];
+} ulog_packet_msg_t;
+#endif
+
+/**
+ * @lock: lock to protect quota writers from each other
+ */
+struct xt_quota_counter {
+ u_int64_t quota;
+ spinlock_t lock;
+ struct list_head list;
+ atomic_t ref;
+ char name[sizeof(((struct xt_quota_mtinfo2 *)NULL)->name)];
+ struct proc_dir_entry *procfs_entry;
+};
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+/* Harald's favorite number +1 :D From ipt_ULOG.C */
+static int qlog_nl_event = 112;
+module_param_named(event_num, qlog_nl_event, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(event_num,
+ "Event number for NETLINK_NFLOG message. 0 disables log."
+ "111 is what ipt_ULOG uses.");
+static struct sock *nflognl;
+#endif
+
+static LIST_HEAD(counter_list);
+static DEFINE_SPINLOCK(counter_list_lock);
+
+static struct proc_dir_entry *proc_xt_quota;
+static unsigned int quota_list_perms = S_IRUGO | S_IWUSR;
+static kuid_t quota_list_uid = KUIDT_INIT(0);
+static kgid_t quota_list_gid = KGIDT_INIT(0);
+module_param_named(perms, quota_list_perms, uint, S_IRUGO | S_IWUSR);
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+static void quota2_log(unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const char *prefix)
+{
+ ulog_packet_msg_t *pm;
+ struct sk_buff *log_skb;
+ size_t size;
+ struct nlmsghdr *nlh;
+
+ if (!qlog_nl_event)
+ return;
+
+ size = NLMSG_SPACE(sizeof(*pm));
+ size = max(size, (size_t)NLMSG_GOODSIZE);
+ log_skb = alloc_skb(size, GFP_ATOMIC);
+ if (!log_skb) {
+ pr_err("xt_quota2: cannot alloc skb for logging\n");
+ return;
+ }
+
+ nlh = nlmsg_put(log_skb, /*pid*/0, /*seq*/0, qlog_nl_event,
+ sizeof(*pm), 0);
+ if (!nlh) {
+ pr_err("xt_quota2: nlmsg_put failed\n");
+ kfree_skb(log_skb);
+ return;
+ }
+ pm = nlmsg_data(nlh);
+ if (skb->tstamp.tv64 == 0)
+ __net_timestamp((struct sk_buff *)skb);
+ pm->data_len = 0;
+ pm->hook = hooknum;
+ if (prefix != NULL)
+ strlcpy(pm->prefix, prefix, sizeof(pm->prefix));
+ else
+ *(pm->prefix) = '\0';
+ if (in)
+ strlcpy(pm->indev_name, in->name, sizeof(pm->indev_name));
+ else
+ pm->indev_name[0] = '\0';
+
+ if (out)
+ strlcpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
+ else
+ pm->outdev_name[0] = '\0';
+
+ NETLINK_CB(log_skb).dst_group = 1;
+ pr_debug("throwing 1 packets to netlink group 1\n");
+ netlink_broadcast(nflognl, log_skb, 0, 1, GFP_ATOMIC);
+}
+#else
+static void quota2_log(unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const char *prefix)
+{
+}
+#endif /* if+else CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG */
+
+static ssize_t quota_proc_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct xt_quota_counter *e = PDE_DATA(file_inode(file));
+ char tmp[24];
+ size_t tmp_size;
+
+ spin_lock_bh(&e->lock);
+ tmp_size = scnprintf(tmp, sizeof(tmp), "%llu\n", e->quota);
+ spin_unlock_bh(&e->lock);
+ return simple_read_from_buffer(buf, size, ppos, tmp, tmp_size);
+}
+
+static ssize_t quota_proc_write(struct file *file, const char __user *input,
+ size_t size, loff_t *ppos)
+{
+ struct xt_quota_counter *e = PDE_DATA(file_inode(file));
+ char buf[sizeof("18446744073709551616")];
+
+ if (size > sizeof(buf))
+ size = sizeof(buf);
+ if (copy_from_user(buf, input, size) != 0)
+ return -EFAULT;
+ buf[sizeof(buf)-1] = '\0';
+
+ spin_lock_bh(&e->lock);
+ e->quota = simple_strtoull(buf, NULL, 0);
+ spin_unlock_bh(&e->lock);
+ return size;
+}
+
+static const struct file_operations q2_counter_fops = {
+ .read = quota_proc_read,
+ .write = quota_proc_write,
+ .llseek = default_llseek,
+};
+
+static struct xt_quota_counter *
+q2_new_counter(const struct xt_quota_mtinfo2 *q, bool anon)
+{
+ struct xt_quota_counter *e;
+ unsigned int size;
+
+ /* Do not need all the procfs things for anonymous counters. */
+ size = anon ? offsetof(typeof(*e), list) : sizeof(*e);
+ e = kmalloc(size, GFP_KERNEL);
+ if (e == NULL)
+ return NULL;
+
+ e->quota = q->quota;
+ spin_lock_init(&e->lock);
+ if (!anon) {
+ INIT_LIST_HEAD(&e->list);
+ atomic_set(&e->ref, 1);
+ strlcpy(e->name, q->name, sizeof(e->name));
+ }
+ return e;
+}
+
+/**
+ * q2_get_counter - get ref to counter or create new
+ * @name: name of counter
+ */
+static struct xt_quota_counter *
+q2_get_counter(const struct xt_quota_mtinfo2 *q)
+{
+ struct proc_dir_entry *p;
+ struct xt_quota_counter *e = NULL;
+ struct xt_quota_counter *new_e;
+
+ if (*q->name == '\0')
+ return q2_new_counter(q, true);
+
+ /* No need to hold a lock while getting a new counter */
+ new_e = q2_new_counter(q, false);
+ if (new_e == NULL)
+ goto out;
+
+ spin_lock_bh(&counter_list_lock);
+ list_for_each_entry(e, &counter_list, list)
+ if (strcmp(e->name, q->name) == 0) {
+ atomic_inc(&e->ref);
+ spin_unlock_bh(&counter_list_lock);
+ kfree(new_e);
+ pr_debug("xt_quota2: old counter name=%s", e->name);
+ return e;
+ }
+ e = new_e;
+ pr_debug("xt_quota2: new_counter name=%s", e->name);
+ list_add_tail(&e->list, &counter_list);
+ /* The entry having a refcount of 1 is not directly destructible.
+ * This func has not yet returned the new entry, thus iptables
+ * has not references for destroying this entry.
+ * For another rule to try to destroy it, it would 1st need for this
+ * func* to be re-invoked, acquire a new ref for the same named quota.
+ * Nobody will access the e->procfs_entry either.
+ * So release the lock. */
+ spin_unlock_bh(&counter_list_lock);
+
+ /* create_proc_entry() is not spin_lock happy */
+ p = e->procfs_entry = proc_create_data(e->name, quota_list_perms,
+ proc_xt_quota, &q2_counter_fops, e);
+
+ if (IS_ERR_OR_NULL(p)) {
+ spin_lock_bh(&counter_list_lock);
+ list_del(&e->list);
+ spin_unlock_bh(&counter_list_lock);
+ goto out;
+ }
+ proc_set_user(p, quota_list_uid, quota_list_gid);
+ return e;
+
+ out:
+ kfree(e);
+ return NULL;
+}
+
+static int quota_mt2_check(const struct xt_mtchk_param *par)
+{
+ struct xt_quota_mtinfo2 *q = par->matchinfo;
+
+ pr_debug("xt_quota2: check() flags=0x%04x", q->flags);
+
+ if (q->flags & ~XT_QUOTA_MASK)
+ return -EINVAL;
+
+ q->name[sizeof(q->name)-1] = '\0';
+ if (*q->name == '.' || strchr(q->name, '/') != NULL) {
+ printk(KERN_ERR "xt_quota.3: illegal name\n");
+ return -EINVAL;
+ }
+
+ q->master = q2_get_counter(q);
+ if (q->master == NULL) {
+ printk(KERN_ERR "xt_quota.3: memory alloc failure\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void quota_mt2_destroy(const struct xt_mtdtor_param *par)
+{
+ struct xt_quota_mtinfo2 *q = par->matchinfo;
+ struct xt_quota_counter *e = q->master;
+
+ if (*q->name == '\0') {
+ kfree(e);
+ return;
+ }
+
+ spin_lock_bh(&counter_list_lock);
+ if (!atomic_dec_and_test(&e->ref)) {
+ spin_unlock_bh(&counter_list_lock);
+ return;
+ }
+
+ list_del(&e->list);
+ remove_proc_entry(e->name, proc_xt_quota);
+ spin_unlock_bh(&counter_list_lock);
+ kfree(e);
+}
+
+static bool
+quota_mt2(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct xt_quota_mtinfo2 *q = (void *)par->matchinfo;
+ struct xt_quota_counter *e = q->master;
+ bool ret = q->flags & XT_QUOTA_INVERT;
+
+ spin_lock_bh(&e->lock);
+ if (q->flags & XT_QUOTA_GROW) {
+ /*
+ * While no_change is pointless in "grow" mode, we will
+ * implement it here simply to have a consistent behavior.
+ */
+ if (!(q->flags & XT_QUOTA_NO_CHANGE)) {
+ e->quota += (q->flags & XT_QUOTA_PACKET) ? 1 : skb->len;
+ }
+ ret = true;
+ } else {
+ if (e->quota >= skb->len) {
+ if (!(q->flags & XT_QUOTA_NO_CHANGE))
+ e->quota -= (q->flags & XT_QUOTA_PACKET) ? 1 : skb->len;
+ ret = !ret;
+ } else {
+ /* We are transitioning, log that fact. */
+ if (e->quota) {
+ quota2_log(par->hooknum,
+ skb,
+ par->in,
+ par->out,
+ q->name);
+ }
+ /* we do not allow even small packets from now on */
+ e->quota = 0;
+ }
+ }
+ spin_unlock_bh(&e->lock);
+ return ret;
+}
+
+static struct xt_match quota_mt2_reg[] __read_mostly = {
+ {
+ .name = "quota2",
+ .revision = 3,
+ .family = NFPROTO_IPV4,
+ .checkentry = quota_mt2_check,
+ .match = quota_mt2,
+ .destroy = quota_mt2_destroy,
+ .matchsize = sizeof(struct xt_quota_mtinfo2),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "quota2",
+ .revision = 3,
+ .family = NFPROTO_IPV6,
+ .checkentry = quota_mt2_check,
+ .match = quota_mt2,
+ .destroy = quota_mt2_destroy,
+ .matchsize = sizeof(struct xt_quota_mtinfo2),
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init quota_mt2_init(void)
+{
+ int ret;
+ pr_debug("xt_quota2: init()");
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+ nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, NULL);
+ if (!nflognl)
+ return -ENOMEM;
+#endif
+
+ proc_xt_quota = proc_mkdir("xt_quota", init_net.proc_net);
+ if (proc_xt_quota == NULL)
+ return -EACCES;
+
+ ret = xt_register_matches(quota_mt2_reg, ARRAY_SIZE(quota_mt2_reg));
+ if (ret < 0)
+ remove_proc_entry("xt_quota", init_net.proc_net);
+ pr_debug("xt_quota2: init() %d", ret);
+ return ret;
+}
+
+static void __exit quota_mt2_exit(void)
+{
+ xt_unregister_matches(quota_mt2_reg, ARRAY_SIZE(quota_mt2_reg));
+ remove_proc_entry("xt_quota", init_net.proc_net);
+}
+
+module_init(quota_mt2_init);
+module_exit(quota_mt2_exit);
+MODULE_DESCRIPTION("Xtables: countdown quota match; up counter");
+MODULE_AUTHOR("Sam Johnston <samj@samj.net>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_quota2");
+MODULE_ALIAS("ip6t_quota2");
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index b10ade272b50..a52fbaf52691 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -144,13 +144,14 @@ static bool xt_socket_sk_is_transparent(struct sock *sk)
}
}
-static struct sock *xt_socket_lookup_slow_v4(struct net *net,
+struct sock *xt_socket_lookup_slow_v4(struct net *net,
const struct sk_buff *skb,
const struct net_device *indev)
{
const struct iphdr *iph = ip_hdr(skb);
struct sk_buff *data_skb = NULL;
int doff = 0;
+ struct sock *sk = skb->sk;
__be32 uninitialized_var(daddr), uninitialized_var(saddr);
__be16 uninitialized_var(dport), uninitialized_var(sport);
u8 uninitialized_var(protocol);
@@ -205,9 +206,16 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net,
}
#endif
- return xt_socket_get_sock_v4(net, data_skb, doff, protocol, saddr,
- daddr, sport, dport, indev);
+ if (sk)
+ atomic_inc(&sk->sk_refcnt);
+ else
+ sk = xt_socket_get_sock_v4(dev_net(skb->dev), data_skb, doff,
+ protocol, saddr, daddr, sport,
+ dport, indev);
+
+ return sk;
}
+EXPORT_SYMBOL(xt_socket_lookup_slow_v4);
static bool
socket_match(const struct sk_buff *skb, struct xt_action_param *par,
@@ -239,8 +247,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
transparent)
pskb->mark = sk->sk_mark;
- if (sk != skb->sk)
- sock_gen_put(sk);
+ sock_gen_put(sk);
if (wildcard || !transparent)
sk = NULL;
@@ -344,10 +351,11 @@ xt_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
return NULL;
}
-static struct sock *xt_socket_lookup_slow_v6(struct net *net,
+struct sock *xt_socket_lookup_slow_v6(struct net *net,
const struct sk_buff *skb,
const struct net_device *indev)
{
+ struct sock *sk = skb->sk;
__be16 uninitialized_var(dport), uninitialized_var(sport);
const struct in6_addr *daddr = NULL, *saddr = NULL;
struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -387,9 +395,16 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net,
return NULL;
}
- return xt_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr,
- sport, dport, indev);
+ if (sk)
+ atomic_inc(&sk->sk_refcnt);
+ else
+ sk = xt_socket_get_sock_v6(dev_net(skb->dev), data_skb, doff,
+ tproto, saddr, daddr, sport, dport,
+ indev);
+
+ return sk;
}
+EXPORT_SYMBOL(xt_socket_lookup_slow_v6);
static bool
socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig
index 868f1ad0415a..8463a6d4d508 100644
--- a/net/rfkill/Kconfig
+++ b/net/rfkill/Kconfig
@@ -10,6 +10,11 @@ menuconfig RFKILL
To compile this driver as a module, choose M here: the
module will be called rfkill.
+config RFKILL_PM
+ bool "Power off on suspend"
+ depends on RFKILL && PM
+ default y
+
# LED trigger support
config RFKILL_LEDS
bool
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 884027f62783..a9a7128f039e 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -796,8 +796,7 @@ void rfkill_resume_polling(struct rfkill *rfkill)
}
EXPORT_SYMBOL(rfkill_resume_polling);
-#ifdef CONFIG_PM_SLEEP
-static int rfkill_suspend(struct device *dev)
+static __maybe_unused int rfkill_suspend(struct device *dev)
{
struct rfkill *rfkill = to_rfkill(dev);
@@ -807,7 +806,7 @@ static int rfkill_suspend(struct device *dev)
return 0;
}
-static int rfkill_resume(struct device *dev)
+static __maybe_unused int rfkill_resume(struct device *dev)
{
struct rfkill *rfkill = to_rfkill(dev);
bool cur;
@@ -827,17 +826,13 @@ static int rfkill_resume(struct device *dev)
}
static SIMPLE_DEV_PM_OPS(rfkill_pm_ops, rfkill_suspend, rfkill_resume);
-#define RFKILL_PM_OPS (&rfkill_pm_ops)
-#else
-#define RFKILL_PM_OPS NULL
-#endif
static struct class rfkill_class = {
.name = "rfkill",
.dev_release = rfkill_release,
.dev_groups = rfkill_dev_groups,
.dev_uevent = rfkill_dev_uevent,
- .pm = RFKILL_PM_OPS,
+ .pm = IS_ENABLED(CONFIG_RFKILL_PM) ? &rfkill_pm_ops : NULL,
};
bool rfkill_blocked(struct rfkill *rfkill)
diff --git a/net/socket.c b/net/socket.c
index d9e2989c10c4..0bc041cb5906 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -534,8 +534,25 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
return used;
}
+static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+ int err = simple_setattr(dentry, iattr);
+
+ if (!err && (iattr->ia_valid & ATTR_UID)) {
+ struct socket *sock = SOCKET_I(d_inode(dentry));
+
+ if (sock->sk)
+ sock->sk->sk_uid = iattr->ia_uid;
+ else
+ err = -ENOENT;
+ }
+
+ return err;
+}
+
static const struct inode_operations sockfs_inode_ops = {
.listxattr = sockfs_listxattr,
+ .setattr = sockfs_setattr,
};
/**
@@ -578,12 +595,16 @@ EXPORT_SYMBOL(sock_alloc);
* an inode not a file.
*/
-void sock_release(struct socket *sock)
+static void __sock_release(struct socket *sock, struct inode *inode)
{
if (sock->ops) {
struct module *owner = sock->ops->owner;
+ if (inode)
+ inode_lock(inode);
sock->ops->release(sock);
+ if (inode)
+ inode_unlock(inode);
sock->ops = NULL;
module_put(owner);
}
@@ -598,6 +619,11 @@ void sock_release(struct socket *sock)
}
sock->file = NULL;
}
+
+void sock_release(struct socket *sock)
+{
+ __sock_release(sock, NULL);
+}
EXPORT_SYMBOL(sock_release);
void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags)
@@ -1030,7 +1056,7 @@ static int sock_mmap(struct file *file, struct vm_area_struct *vma)
static int sock_close(struct inode *inode, struct file *filp)
{
- sock_release(SOCKET_I(inode));
+ __sock_release(SOCKET_I(inode), inode);
return 0;
}
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index eea18a124e4f..f984ff25cdb2 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -50,7 +50,7 @@ EXPORT_SYMBOL_GPL(svc_pool_map);
static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */
static int
-param_set_pool_mode(const char *val, struct kernel_param *kp)
+param_set_pool_mode(const char *val, const struct kernel_param *kp)
{
int *ip = (int *)kp->arg;
struct svc_pool_map *m = &svc_pool_map;
@@ -80,7 +80,7 @@ out:
}
static int
-param_get_pool_mode(char *buf, struct kernel_param *kp)
+param_get_pool_mode(char *buf, const struct kernel_param *kp)
{
int *ip = (int *)kp->arg;
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 435f904c1be5..38caa6ab1fa7 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -70,7 +70,7 @@ module_param(bss_entries_limit, int, 0644);
MODULE_PARM_DESC(bss_entries_limit,
"limit to number of scan BSS entries (per wiphy, default 1000)");
-#define IEEE80211_SCAN_RESULT_EXPIRE (30 * HZ)
+#define IEEE80211_SCAN_RESULT_EXPIRE (7 * HZ)
static void bss_free(struct cfg80211_internal_bss *bss)
{
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index bda1a13628a8..779579c9d07c 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -20,6 +20,14 @@ config XFRM_USER
If unsure, say Y.
+config XFRM_INTERFACE
+ tristate "Transformation virtual interface"
+ depends on XFRM && IPV6
+ ---help---
+ This provides a virtual interface to route IPsec traffic.
+
+ If unsure, say N.
+
config XFRM_SUB_POLICY
bool "Transformation sub policy support"
depends on XFRM
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index c0e961983f17..5f038f336d39 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o
obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o
obj-$(CONFIG_XFRM_USER) += xfrm_user.o
obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
+obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o
diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
index 44ac85fe2bc9..d0ca0dbf494e 100644
--- a/net/xfrm/xfrm_algo.c
+++ b/net/xfrm/xfrm_algo.c
@@ -241,7 +241,7 @@ static struct xfrm_algo_desc aalg_list[] = {
.uinfo = {
.auth = {
- .icv_truncbits = 96,
+ .icv_truncbits = 128,
.icv_fullbits = 256,
}
},
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 6e3f0254d8a1..87b9c0d30f3c 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -234,23 +234,28 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
seq = 0;
if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) {
+ secpath_reset(skb);
XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
goto drop;
}
do {
if (skb->sp->len == XFRM_MAX_DEPTH) {
+ secpath_reset(skb);
XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
goto drop;
}
x = xfrm_state_lookup(net, mark, daddr, spi, nexthdr, family);
if (x == NULL) {
+ secpath_reset(skb);
XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES);
xfrm_audit_state_notfound(skb, family, spi, seq);
goto drop;
}
+ skb->mark = xfrm_smark_get(skb->mark, x);
+
skb->sp->xvec[skb->sp->len++] = x;
spin_lock(&x->lock);
diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
new file mode 100644
index 000000000000..0b6818dc971c
--- /dev/null
+++ b/net/xfrm/xfrm_interface.c
@@ -0,0 +1,974 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * XFRM virtual interface
+ *
+ * Copyright (C) 2018 secunet Security Networks AG
+ *
+ * Author:
+ * Steffen Klassert <steffen.klassert@secunet.com>
+ */
+
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/sockios.h>
+#include <linux/icmp.h>
+#include <linux/if.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_link.h>
+#include <linux/if_arp.h>
+#include <linux/icmpv6.h>
+#include <linux/init.h>
+#include <linux/route.h>
+#include <linux/rtnetlink.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+
+#include <linux/uaccess.h>
+#include <linux/atomic.h>
+
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <linux/etherdevice.h>
+
+static int xfrmi_dev_init(struct net_device *dev);
+static void xfrmi_dev_setup(struct net_device *dev);
+static struct rtnl_link_ops xfrmi_link_ops __read_mostly;
+static unsigned int xfrmi_net_id __read_mostly;
+
+struct xfrmi_net {
+ /* lists for storing interfaces in use */
+ struct xfrm_if __rcu *xfrmi[1];
+};
+
+#define for_each_xfrmi_rcu(start, xi) \
+ for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next))
+
+static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x)
+{
+ struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+ struct xfrm_if *xi;
+
+ for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) {
+ if (x->if_id == xi->p.if_id &&
+ (xi->dev->flags & IFF_UP))
+ return xi;
+ }
+
+ return NULL;
+}
+
+static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb)
+{
+ struct xfrmi_net *xfrmn;
+ int ifindex;
+ struct xfrm_if *xi;
+
+ if (!skb->dev)
+ return NULL;
+
+ xfrmn = net_generic(dev_net(skb->dev), xfrmi_net_id);
+ ifindex = skb->dev->ifindex;
+
+ for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) {
+ if (ifindex == xi->dev->ifindex &&
+ (xi->dev->flags & IFF_UP))
+ return xi;
+ }
+
+ return NULL;
+}
+
+static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi)
+{
+ struct xfrm_if __rcu **xip = &xfrmn->xfrmi[0];
+
+ rcu_assign_pointer(xi->next , rtnl_dereference(*xip));
+ rcu_assign_pointer(*xip, xi);
+}
+
+static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi)
+{
+ struct xfrm_if __rcu **xip;
+ struct xfrm_if *iter;
+
+ for (xip = &xfrmn->xfrmi[0];
+ (iter = rtnl_dereference(*xip)) != NULL;
+ xip = &iter->next) {
+ if (xi == iter) {
+ rcu_assign_pointer(*xip, xi->next);
+ break;
+ }
+ }
+}
+
+static void xfrmi_dev_free(struct net_device *dev)
+{
+ free_percpu(dev->tstats);
+}
+
+static int xfrmi_create2(struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct net *net = dev_net(dev);
+ struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+ int err;
+
+ dev->rtnl_link_ops = &xfrmi_link_ops;
+ err = register_netdevice(dev);
+ if (err < 0)
+ goto out;
+
+ strcpy(xi->p.name, dev->name);
+
+ dev_hold(dev);
+ xfrmi_link(xfrmn, xi);
+
+ return 0;
+
+out:
+ return err;
+}
+
+static struct xfrm_if *xfrmi_create(struct net *net, struct xfrm_if_parms *p)
+{
+ struct net_device *dev;
+ struct xfrm_if *xi;
+ char name[IFNAMSIZ];
+ int err;
+
+ if (p->name[0]) {
+ strlcpy(name, p->name, IFNAMSIZ);
+ } else {
+ err = -EINVAL;
+ goto failed;
+ }
+
+ dev = alloc_netdev(sizeof(*xi), name, NET_NAME_UNKNOWN, xfrmi_dev_setup);
+ if (!dev) {
+ err = -EAGAIN;
+ goto failed;
+ }
+
+ dev_net_set(dev, net);
+
+ xi = netdev_priv(dev);
+ xi->p = *p;
+ xi->net = net;
+ xi->dev = dev;
+ xi->phydev = dev_get_by_index(net, p->link);
+ if (!xi->phydev) {
+ err = -ENODEV;
+ goto failed_free;
+ }
+
+ err = xfrmi_create2(dev);
+ if (err < 0)
+ goto failed_dev_put;
+
+ return xi;
+
+failed_dev_put:
+ dev_put(xi->phydev);
+failed_free:
+ free_netdev(dev);
+failed:
+ return ERR_PTR(err);
+}
+
+static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p,
+ int create)
+{
+ struct xfrm_if __rcu **xip;
+ struct xfrm_if *xi;
+ struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+
+ for (xip = &xfrmn->xfrmi[0];
+ (xi = rtnl_dereference(*xip)) != NULL;
+ xip = &xi->next) {
+ if (xi->p.if_id == p->if_id) {
+ if (create)
+ return ERR_PTR(-EEXIST);
+
+ return xi;
+ }
+ }
+ if (!create)
+ return ERR_PTR(-ENODEV);
+ return xfrmi_create(net, p);
+}
+
+static void xfrmi_dev_uninit(struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id);
+
+ xfrmi_unlink(xfrmn, xi);
+ dev_put(xi->phydev);
+ dev_put(dev);
+}
+
+static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet)
+{
+ skb->tstamp.tv64 = 0;
+ skb->pkt_type = PACKET_HOST;
+ skb->skb_iif = 0;
+ skb->ignore_df = 0;
+ skb_dst_drop(skb);
+ nf_reset(skb);
+ nf_reset_trace(skb);
+
+ if (!xnet)
+ return;
+
+ ipvs_reset(skb);
+ secpath_reset(skb);
+ skb_orphan(skb);
+ skb->mark = 0;
+}
+
+static int xfrmi_rcv_cb(struct sk_buff *skb, int err)
+{
+ struct pcpu_sw_netstats *tstats;
+ struct xfrm_mode *inner_mode;
+ struct net_device *dev;
+ struct xfrm_state *x;
+ struct xfrm_if *xi;
+ bool xnet;
+
+ if (err && !skb->sp)
+ return 0;
+
+ x = xfrm_input_state(skb);
+
+ xi = xfrmi_lookup(xs_net(x), x);
+ if (!xi)
+ return 1;
+
+ dev = xi->dev;
+ skb->dev = dev;
+
+ if (err) {
+ dev->stats.rx_errors++;
+ dev->stats.rx_dropped++;
+
+ return 0;
+ }
+
+ xnet = !net_eq(xi->net, dev_net(skb->dev));
+
+ if (xnet) {
+ inner_mode = x->inner_mode;
+
+ if (x->sel.family == AF_UNSPEC) {
+ inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
+ if (inner_mode == NULL) {
+ XFRM_INC_STATS(dev_net(skb->dev),
+ LINUX_MIB_XFRMINSTATEMODEERROR);
+ return -EINVAL;
+ }
+ }
+
+ if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb,
+ inner_mode->afinfo->family))
+ return -EPERM;
+ }
+
+ xfrmi_scrub_packet(skb, xnet);
+
+ tstats = this_cpu_ptr(dev->tstats);
+
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
+
+ return 0;
+}
+
+static int
+xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct net_device_stats *stats = &xi->dev->stats;
+ struct dst_entry *dst = skb_dst(skb);
+ unsigned int length = skb->len;
+ struct net_device *tdev;
+ struct xfrm_state *x;
+ int err = -1;
+ int mtu;
+
+ if (!dst)
+ goto tx_err_link_failure;
+
+ fl->flowi_xfrm.if_id = xi->p.if_id;
+
+ dst_hold(dst);
+ dst = xfrm_lookup(xi->net, dst, fl, NULL, 0);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
+ goto tx_err_link_failure;
+ }
+
+ x = dst->xfrm;
+ if (!x)
+ goto tx_err_link_failure;
+
+ if (x->if_id != xi->p.if_id)
+ goto tx_err_link_failure;
+
+ tdev = dst->dev;
+
+ if (tdev == dev) {
+ stats->collisions++;
+ net_warn_ratelimited("%s: Local routing loop detected!\n",
+ xi->p.name);
+ goto tx_err_dst_release;
+ }
+
+ mtu = dst_mtu(dst);
+ if (!skb->ignore_df && skb->len > mtu) {
+ if (dst && dst->ops->update_pmtu)
+ dst->ops->update_pmtu(dst, NULL, skb, mtu);
+
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ } else {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ }
+
+ dst_release(dst);
+ return -EMSGSIZE;
+ }
+
+ xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev)));
+ skb_dst_set(skb, dst);
+ skb->dev = tdev;
+
+ err = dst_output(xi->net, skb->sk, skb);
+ if (net_xmit_eval(err) == 0) {
+ struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
+
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->tx_bytes += length;
+ tstats->tx_packets++;
+ u64_stats_update_end(&tstats->syncp);
+ } else {
+ stats->tx_errors++;
+ stats->tx_aborted_errors++;
+ }
+
+ return 0;
+tx_err_link_failure:
+ stats->tx_carrier_errors++;
+ dst_link_failure(skb);
+tx_err_dst_release:
+ dst_release(dst);
+ return err;
+}
+
+static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct net_device_stats *stats = &xi->dev->stats;
+ struct flowi fl;
+ int ret;
+
+ memset(&fl, 0, sizeof(fl));
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IPV6):
+ xfrm_decode_session(skb, &fl, AF_INET6);
+ memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+ break;
+ case htons(ETH_P_IP):
+ xfrm_decode_session(skb, &fl, AF_INET);
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ break;
+ default:
+ goto tx_err;
+ }
+
+ fl.flowi_oif = xi->phydev->ifindex;
+
+ ret = xfrmi_xmit2(skb, dev, &fl);
+ if (ret < 0)
+ goto tx_err;
+
+ return NETDEV_TX_OK;
+
+tx_err:
+ stats->tx_errors++;
+ stats->tx_dropped++;
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static int xfrmi4_err(struct sk_buff *skb, u32 info)
+{
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct net *net = dev_net(skb->dev);
+ int protocol = iph->protocol;
+ struct ip_comp_hdr *ipch;
+ struct ip_esp_hdr *esph;
+ struct ip_auth_hdr *ah ;
+ struct xfrm_state *x;
+ struct xfrm_if *xi;
+ __be32 spi;
+
+ switch (protocol) {
+ case IPPROTO_ESP:
+ esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
+ spi = esph->spi;
+ break;
+ case IPPROTO_AH:
+ ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
+ spi = ah->spi;
+ break;
+ case IPPROTO_COMP:
+ ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
+ spi = htonl(ntohs(ipch->cpi));
+ break;
+ default:
+ return 0;
+ }
+
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
+
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ spi, protocol, AF_INET);
+ if (!x)
+ return 0;
+
+ xi = xfrmi_lookup(net, x);
+ if (!xi) {
+ xfrm_state_put(x);
+ return -1;
+ }
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0);
+ else
+ ipv4_redirect(skb, net, 0, 0, protocol, 0);
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
+ struct net *net = dev_net(skb->dev);
+ int protocol = iph->nexthdr;
+ struct ip_comp_hdr *ipch;
+ struct ip_esp_hdr *esph;
+ struct ip_auth_hdr *ah;
+ struct xfrm_state *x;
+ struct xfrm_if *xi;
+ __be32 spi;
+
+ switch (protocol) {
+ case IPPROTO_ESP:
+ esph = (struct ip_esp_hdr *)(skb->data + offset);
+ spi = esph->spi;
+ break;
+ case IPPROTO_AH:
+ ah = (struct ip_auth_hdr *)(skb->data + offset);
+ spi = ah->spi;
+ break;
+ case IPPROTO_COMP:
+ ipch = (struct ip_comp_hdr *)(skb->data + offset);
+ spi = htonl(ntohs(ipch->cpi));
+ break;
+ default:
+ return 0;
+ }
+
+ if (type != ICMPV6_PKT_TOOBIG &&
+ type != NDISC_REDIRECT)
+ return 0;
+
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ spi, protocol, AF_INET6);
+ if (!x)
+ return 0;
+
+ xi = xfrmi_lookup(net, x);
+ if (!xi) {
+ xfrm_state_put(x);
+ return -1;
+ }
+
+ if (type == NDISC_REDIRECT)
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
+ else
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p)
+{
+ if (xi->p.link != p->link)
+ return -EINVAL;
+
+ xi->p.if_id = p->if_id;
+
+ return 0;
+}
+
+static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p)
+{
+ struct net *net = dev_net(xi->dev);
+ struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+ int err;
+
+ xfrmi_unlink(xfrmn, xi);
+ synchronize_net();
+ err = xfrmi_change(xi, p);
+ xfrmi_link(xfrmn, xi);
+ netdev_state_change(xi->dev);
+ return err;
+}
+
+static struct rtnl_link_stats64 *xfrmi_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *s)
+{
+ int cpu;
+
+ if (!dev->tstats)
+ return s;
+
+ for_each_possible_cpu(cpu) {
+ struct pcpu_sw_netstats *stats;
+ struct pcpu_sw_netstats tmp;
+ int start;
+
+ stats = per_cpu_ptr(dev->tstats, cpu);
+ do {
+ start = u64_stats_fetch_begin_irq(&stats->syncp);
+ tmp.rx_packets = stats->rx_packets;
+ tmp.rx_bytes = stats->rx_bytes;
+ tmp.tx_packets = stats->tx_packets;
+ tmp.tx_bytes = stats->tx_bytes;
+ } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+
+ s->rx_packets += tmp.rx_packets;
+ s->rx_bytes += tmp.rx_bytes;
+ s->tx_packets += tmp.tx_packets;
+ s->tx_bytes += tmp.tx_bytes;
+ }
+
+ s->rx_dropped = dev->stats.rx_dropped;
+ s->tx_dropped = dev->stats.tx_dropped;
+
+ return s;
+}
+
+static int xfrmi_get_iflink(const struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+
+ return xi->phydev->ifindex;
+}
+
+
+static const struct net_device_ops xfrmi_netdev_ops = {
+ .ndo_init = xfrmi_dev_init,
+ .ndo_uninit = xfrmi_dev_uninit,
+ .ndo_start_xmit = xfrmi_xmit,
+ .ndo_get_stats64 = xfrmi_get_stats64,
+ .ndo_get_iflink = xfrmi_get_iflink,
+};
+
+static void xfrmi_dev_setup(struct net_device *dev)
+{
+ dev->netdev_ops = &xfrmi_netdev_ops;
+ dev->type = ARPHRD_NONE;
+ dev->hard_header_len = ETH_HLEN;
+ dev->min_header_len = ETH_HLEN;
+ dev->mtu = ETH_DATA_LEN;
+ dev->addr_len = ETH_ALEN;
+ dev->flags = IFF_NOARP;
+ dev->destructor = xfrmi_dev_free;
+ netif_keep_dst(dev);
+}
+
+static int xfrmi_dev_init(struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct net_device *phydev = xi->phydev;
+ int err;
+
+ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ err = gro_cells_init(&xi->gro_cells, dev);
+ if (err) {
+ free_percpu(dev->tstats);
+ return err;
+ }
+
+ dev->features |= NETIF_F_LLTX;
+
+ dev->needed_headroom = phydev->needed_headroom;
+ dev->needed_tailroom = phydev->needed_tailroom;
+
+ if (is_zero_ether_addr(dev->dev_addr))
+ eth_hw_addr_inherit(dev, phydev);
+ if (is_zero_ether_addr(dev->broadcast))
+ memcpy(dev->broadcast, phydev->broadcast, dev->addr_len);
+
+ return 0;
+}
+
+static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ return 0;
+}
+
+static void xfrmi_netlink_parms(struct nlattr *data[],
+ struct xfrm_if_parms *parms)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ if (!data)
+ return;
+
+ if (data[IFLA_XFRM_LINK])
+ parms->link = nla_get_u32(data[IFLA_XFRM_LINK]);
+
+ if (data[IFLA_XFRM_IF_ID])
+ parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]);
+}
+
+static int xfrmi_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct net *net = dev_net(dev);
+ struct xfrm_if_parms *p;
+ struct xfrm_if *xi;
+
+ xi = netdev_priv(dev);
+ p = &xi->p;
+
+ xfrmi_netlink_parms(data, p);
+
+ if (!tb[IFLA_IFNAME])
+ return -EINVAL;
+
+ nla_strlcpy(p->name, tb[IFLA_IFNAME], IFNAMSIZ);
+
+ xi = xfrmi_locate(net, p, 1);
+ return PTR_ERR_OR_ZERO(xi);
+}
+
+static void xfrmi_dellink(struct net_device *dev, struct list_head *head)
+{
+ unregister_netdevice_queue(dev, head);
+}
+
+static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct net *net = dev_net(dev);
+
+ xfrmi_netlink_parms(data, &xi->p);
+
+ xi = xfrmi_locate(net, &xi->p, 0);
+
+ if (IS_ERR_OR_NULL(xi)) {
+ xi = netdev_priv(dev);
+ } else {
+ if (xi->dev != dev)
+ return -EEXIST;
+ }
+
+ return xfrmi_update(xi, &xi->p);
+}
+
+static size_t xfrmi_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_XFRM_LINK */
+ nla_total_size(4) +
+ /* IFLA_XFRM_IF_ID */
+ nla_total_size(4) +
+ 0;
+}
+
+static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct xfrm_if_parms *parm = &xi->p;
+
+ if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) ||
+ nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+struct net *xfrmi_get_link_net(const struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+
+ return dev_net(xi->phydev);
+}
+
+static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = {
+ [IFLA_XFRM_LINK] = { .type = NLA_U32 },
+ [IFLA_XFRM_IF_ID] = { .type = NLA_U32 },
+};
+
+static struct rtnl_link_ops xfrmi_link_ops __read_mostly = {
+ .kind = "xfrm",
+ .maxtype = IFLA_XFRM_MAX,
+ .policy = xfrmi_policy,
+ .priv_size = sizeof(struct xfrm_if),
+ .setup = xfrmi_dev_setup,
+ .validate = xfrmi_validate,
+ .newlink = xfrmi_newlink,
+ .dellink = xfrmi_dellink,
+ .changelink = xfrmi_changelink,
+ .get_size = xfrmi_get_size,
+ .fill_info = xfrmi_fill_info,
+ .get_link_net = xfrmi_get_link_net,
+};
+
+static void __net_exit xfrmi_destroy_interfaces(struct xfrmi_net *xfrmn)
+{
+ struct xfrm_if *xi;
+ LIST_HEAD(list);
+
+ xi = rtnl_dereference(xfrmn->xfrmi[0]);
+ if (!xi)
+ return;
+
+ unregister_netdevice_queue(xi->dev, &list);
+ unregister_netdevice_many(&list);
+}
+
+static int __net_init xfrmi_init_net(struct net *net)
+{
+ return 0;
+}
+
+static void __net_exit xfrmi_exit_net(struct net *net)
+{
+ struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+
+ rtnl_lock();
+ xfrmi_destroy_interfaces(xfrmn);
+ rtnl_unlock();
+}
+
+static struct pernet_operations xfrmi_net_ops = {
+ .init = xfrmi_init_net,
+ .exit = xfrmi_exit_net,
+ .id = &xfrmi_net_id,
+ .size = sizeof(struct xfrmi_net),
+};
+
+static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = {
+ .handler = xfrm6_rcv,
+ .cb_handler = xfrmi_rcv_cb,
+ .err_handler = xfrmi6_err,
+ .priority = 10,
+};
+
+static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = {
+ .handler = xfrm6_rcv,
+ .cb_handler = xfrmi_rcv_cb,
+ .err_handler = xfrmi6_err,
+ .priority = 10,
+};
+
+static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = {
+ .handler = xfrm6_rcv,
+ .cb_handler = xfrmi_rcv_cb,
+ .err_handler = xfrmi6_err,
+ .priority = 10,
+};
+
+static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = {
+ .handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = xfrmi_rcv_cb,
+ .err_handler = xfrmi4_err,
+ .priority = 10,
+};
+
+static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = {
+ .handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = xfrmi_rcv_cb,
+ .err_handler = xfrmi4_err,
+ .priority = 10,
+};
+
+static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = {
+ .handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = xfrmi_rcv_cb,
+ .err_handler = xfrmi4_err,
+ .priority = 10,
+};
+
+static int __init xfrmi4_init(void)
+{
+ int err;
+
+ err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP);
+ if (err < 0)
+ goto xfrm_proto_esp_failed;
+ err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH);
+ if (err < 0)
+ goto xfrm_proto_ah_failed;
+ err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP);
+ if (err < 0)
+ goto xfrm_proto_comp_failed;
+
+ return 0;
+
+xfrm_proto_comp_failed:
+ xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH);
+xfrm_proto_ah_failed:
+ xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP);
+xfrm_proto_esp_failed:
+ return err;
+}
+
+static void xfrmi4_fini(void)
+{
+ xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP);
+ xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH);
+ xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP);
+}
+
+static int __init xfrmi6_init(void)
+{
+ int err;
+
+ err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP);
+ if (err < 0)
+ goto xfrm_proto_esp_failed;
+ err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH);
+ if (err < 0)
+ goto xfrm_proto_ah_failed;
+ err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP);
+ if (err < 0)
+ goto xfrm_proto_comp_failed;
+
+ return 0;
+
+xfrm_proto_comp_failed:
+ xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH);
+xfrm_proto_ah_failed:
+ xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP);
+xfrm_proto_esp_failed:
+ return err;
+}
+
+static void xfrmi6_fini(void)
+{
+ xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP);
+ xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH);
+ xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP);
+}
+
+static const struct xfrm_if_cb xfrm_if_cb = {
+ .decode_session = xfrmi_decode_session,
+};
+
+static int __init xfrmi_init(void)
+{
+ const char *msg;
+ int err;
+
+ pr_info("IPsec XFRM device driver\n");
+
+ msg = "tunnel device";
+ err = register_pernet_device(&xfrmi_net_ops);
+ if (err < 0)
+ goto pernet_dev_failed;
+
+ msg = "xfrm4 protocols";
+ err = xfrmi4_init();
+ if (err < 0)
+ goto xfrmi4_failed;
+
+ msg = "xfrm6 protocols";
+ err = xfrmi6_init();
+ if (err < 0)
+ goto xfrmi6_failed;
+
+
+ msg = "netlink interface";
+ err = rtnl_link_register(&xfrmi_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+ xfrm_if_register_cb(&xfrm_if_cb);
+
+ return err;
+
+rtnl_link_failed:
+ xfrmi6_fini();
+xfrmi6_failed:
+ xfrmi4_fini();
+xfrmi4_failed:
+ unregister_pernet_device(&xfrmi_net_ops);
+pernet_dev_failed:
+ pr_err("xfrmi init: failed to register %s\n", msg);
+ return err;
+}
+
+static void __exit xfrmi_fini(void)
+{
+ xfrm_if_unregister_cb();
+ rtnl_link_unregister(&xfrmi_link_ops);
+ xfrmi4_fini();
+ xfrmi6_fini();
+ unregister_pernet_device(&xfrmi_net_ops);
+}
+
+module_init(xfrmi_init);
+module_exit(xfrmi_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("xfrm");
+MODULE_ALIAS_NETDEV("xfrm0");
+MODULE_AUTHOR("Steffen Klassert");
+MODULE_DESCRIPTION("XFRM virtual interface");
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 637387bbaaea..bbe611a999b2 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -66,6 +66,8 @@ static int xfrm_output_one(struct sk_buff *skb, int err)
goto error_nolock;
}
+ skb->mark = xfrm_smark_get(skb->mark, x);
+
err = x->outer_mode->output(x, skb);
if (err) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 155b1591b17a..79866a9b09a3 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -44,6 +44,9 @@ struct xfrm_flo {
u8 flags;
};
+static DEFINE_SPINLOCK(xfrm_if_cb_lock);
+static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly;
+
static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO]
__read_mostly;
@@ -116,7 +119,13 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
return afinfo;
}
-static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
+/* Called with rcu_read_lock(). */
+static const struct xfrm_if_cb *xfrm_if_get_cb(void)
+{
+ return rcu_dereference(xfrm_if_cb);
+}
+
+static void xfrm_policy_put_afinfo(const struct xfrm_policy_afinfo *afinfo)
{
rcu_read_unlock();
}
@@ -125,7 +134,7 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net,
int tos, int oif,
const xfrm_address_t *saddr,
const xfrm_address_t *daddr,
- int family)
+ int family, u32 mark)
{
struct xfrm_policy_afinfo *afinfo;
struct dst_entry *dst;
@@ -134,7 +143,7 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net,
if (unlikely(afinfo == NULL))
return ERR_PTR(-EAFNOSUPPORT);
- dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr);
+ dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr, mark);
xfrm_policy_put_afinfo(afinfo);
@@ -145,7 +154,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x,
int tos, int oif,
xfrm_address_t *prev_saddr,
xfrm_address_t *prev_daddr,
- int family)
+ int family, u32 mark)
{
struct net *net = xs_net(x);
xfrm_address_t *saddr = &x->props.saddr;
@@ -161,7 +170,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x,
daddr = x->coaddr;
}
- dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family);
+ dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family, mark);
if (!IS_ERR(dst)) {
if (prev_saddr != saddr)
@@ -781,6 +790,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
newpos = NULL;
hlist_for_each_entry(pol, chain, bydst) {
if (pol->type == policy->type &&
+ pol->if_id == policy->if_id &&
!selector_cmp(&pol->selector, &policy->selector) &&
xfrm_policy_mark_match(policy, pol) &&
xfrm_sec_ctx_match(pol->security, policy->security) &&
@@ -833,8 +843,9 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
}
EXPORT_SYMBOL(xfrm_policy_insert);
-struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
- int dir, struct xfrm_selector *sel,
+struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id,
+ u8 type, int dir,
+ struct xfrm_selector *sel,
struct xfrm_sec_ctx *ctx, int delete,
int *err)
{
@@ -847,6 +858,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
ret = NULL;
hlist_for_each_entry(pol, chain, bydst) {
if (pol->type == type &&
+ pol->if_id == if_id &&
(mark & pol->mark.m) == pol->mark.v &&
!selector_cmp(sel, &pol->selector) &&
xfrm_sec_ctx_match(ctx, pol->security)) {
@@ -872,8 +884,9 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
}
EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
-struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
- int dir, u32 id, int delete, int *err)
+struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u32 if_id,
+ u8 type, int dir, u32 id, int delete,
+ int *err)
{
struct xfrm_policy *pol, *ret;
struct hlist_head *chain;
@@ -888,6 +901,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
ret = NULL;
hlist_for_each_entry(pol, chain, byidx) {
if (pol->type == type && pol->index == id &&
+ pol->if_id == if_id &&
(mark & pol->mark.m) == pol->mark.v) {
xfrm_pol_hold(pol);
if (delete) {
@@ -1098,6 +1112,7 @@ static int xfrm_policy_match(const struct xfrm_policy *pol,
bool match;
if (pol->family != family ||
+ pol->if_id != fl->flowi_xfrm.if_id ||
(fl->flowi_mark & pol->mark.m) != pol->mark.v ||
pol->type != type)
return ret;
@@ -1267,7 +1282,8 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
match = xfrm_selector_match(&pol->selector, fl, family);
if (match) {
- if ((sk->sk_mark & pol->mark.m) != pol->mark.v) {
+ if ((sk->sk_mark & pol->mark.m) != pol->mark.v ||
+ pol->if_id != fl->flowi_xfrm.if_id) {
pol = NULL;
goto out;
}
@@ -1395,6 +1411,7 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
newp->lft = old->lft;
newp->curlft = old->curlft;
newp->mark = old->mark;
+ newp->if_id = old->if_id;
newp->action = old->action;
newp->flags = old->flags;
newp->xfrm_nr = old->xfrm_nr;
@@ -1435,14 +1452,14 @@ int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
static int
xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local,
- xfrm_address_t *remote, unsigned short family)
+ xfrm_address_t *remote, unsigned short family, u32 mark)
{
int err;
struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
if (unlikely(afinfo == NULL))
return -EINVAL;
- err = afinfo->get_saddr(net, oif, local, remote);
+ err = afinfo->get_saddr(net, oif, local, remote, mark);
xfrm_policy_put_afinfo(afinfo);
return err;
}
@@ -1473,7 +1490,7 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
if (xfrm_addr_any(local, tmpl->encap_family)) {
error = xfrm_get_saddr(net, fl->flowi_oif,
&tmp, remote,
- tmpl->encap_family);
+ tmpl->encap_family, 0);
if (error)
goto fail;
local = &tmp;
@@ -1750,9 +1767,11 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
dst_copy_metrics(dst1, dst);
if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
+ __u32 mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]);
+
family = xfrm[i]->props.family;
dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif,
- &saddr, &daddr, family);
+ &saddr, &daddr, family, mark);
err = PTR_ERR(dst);
if (IS_ERR(dst))
goto put_states;
@@ -2122,6 +2141,10 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
xflo->dst_orig);
if (IS_ERR(new_xdst)) {
err = PTR_ERR(new_xdst);
+ if (err == -EREMOTE) {
+ xfrm_pols_put(pols, num_pols);
+ return NULL;
+ }
if (err != -EAGAIN)
goto error;
if (oldflo == NULL)
@@ -2234,6 +2257,9 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
if (IS_ERR(xdst)) {
xfrm_pols_put(pols, num_pols);
err = PTR_ERR(xdst);
+ if (err == -EREMOTE)
+ goto nopol;
+
goto dropdst;
} else if (xdst == NULL) {
num_xfrms = 0;
@@ -2432,13 +2458,21 @@ xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int star
int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
unsigned int family, int reverse)
{
- struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+ const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+ const struct xfrm_if_cb *ifcb = xfrm_if_get_cb();
+ struct xfrm_if *xi;
int err;
if (unlikely(afinfo == NULL))
return -EAFNOSUPPORT;
afinfo->decode_session(skb, fl, reverse);
+ if (ifcb) {
+ xi = ifcb->decode_session(skb);
+ if (xi)
+ fl->flowi_xfrm.if_id = xi->p.if_id;
+ }
+
err = security_xfrm_decode_session(skb, &fl->flowi_secid);
xfrm_policy_put_afinfo(afinfo);
return err;
@@ -2903,6 +2937,20 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void
static struct notifier_block xfrm_dev_notifier = {
.notifier_call = xfrm_dev_event,
};
+void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb)
+{
+ spin_lock(&xfrm_if_cb_lock);
+ rcu_assign_pointer(xfrm_if_cb, ifcb);
+ spin_unlock(&xfrm_if_cb_lock);
+}
+EXPORT_SYMBOL(xfrm_if_register_cb);
+
+void xfrm_if_unregister_cb(void)
+{
+ RCU_INIT_POINTER(xfrm_if_cb, NULL);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL(xfrm_if_unregister_cb);
#ifdef CONFIG_XFRM_STATISTICS
static int __net_init xfrm_statistics_init(struct net *net)
@@ -3082,6 +3130,9 @@ void __init xfrm_init(void)
register_pernet_subsys(&xfrm_net_ops);
seqcount_init(&xfrm_policy_hash_generation);
xfrm_input_init();
+
+ RCU_INIT_POINTER(xfrm_if_cb, NULL);
+ synchronize_rcu();
}
#ifdef CONFIG_AUDITSYSCALL
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 884f2136b34b..a4a0bd7a2585 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -794,6 +794,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
int error = 0;
struct xfrm_state *best = NULL;
u32 mark = pol->mark.v & pol->mark.m;
+ u32 if_id = fl->flowi_xfrm.if_id;
unsigned short encap_family = tmpl->encap_family;
unsigned int sequence;
struct km_event c;
@@ -808,6 +809,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
if (x->props.family == encap_family &&
x->props.reqid == tmpl->reqid &&
(mark & x->mark.m) == x->mark.v &&
+ x->if_id == if_id &&
!(x->props.flags & XFRM_STATE_WILDRECV) &&
xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
tmpl->mode == x->props.mode &&
@@ -824,6 +826,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
if (x->props.family == encap_family &&
x->props.reqid == tmpl->reqid &&
(mark & x->mark.m) == x->mark.v &&
+ x->if_id == if_id &&
!(x->props.flags & XFRM_STATE_WILDRECV) &&
xfrm_addr_equal(&x->id.daddr, daddr, encap_family) &&
tmpl->mode == x->props.mode &&
@@ -863,6 +866,7 @@ found:
* to current session. */
xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family);
memcpy(&x->mark, &pol->mark, sizeof(x->mark));
+ x->if_id = if_id;
error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid);
if (error) {
@@ -920,7 +924,7 @@ out:
}
struct xfrm_state *
-xfrm_stateonly_find(struct net *net, u32 mark,
+xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id,
xfrm_address_t *daddr, xfrm_address_t *saddr,
unsigned short family, u8 mode, u8 proto, u32 reqid)
{
@@ -933,6 +937,7 @@ xfrm_stateonly_find(struct net *net, u32 mark,
if (x->props.family == family &&
x->props.reqid == reqid &&
(mark & x->mark.m) == x->mark.v &&
+ x->if_id == if_id &&
!(x->props.flags & XFRM_STATE_WILDRECV) &&
xfrm_state_addr_check(x, daddr, saddr, family) &&
mode == x->props.mode &&
@@ -1013,11 +1018,13 @@ static void __xfrm_state_bump_genids(struct xfrm_state *xnew)
struct xfrm_state *x;
unsigned int h;
u32 mark = xnew->mark.v & xnew->mark.m;
+ u32 if_id = xnew->if_id;
h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family);
hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) {
if (x->props.family == family &&
x->props.reqid == reqid &&
+ x->if_id == if_id &&
(mark & x->mark.m) == x->mark.v &&
xfrm_addr_equal(&x->id.daddr, &xnew->id.daddr, family) &&
xfrm_addr_equal(&x->props.saddr, &xnew->props.saddr, family))
@@ -1040,7 +1047,7 @@ EXPORT_SYMBOL(xfrm_state_insert);
static struct xfrm_state *__find_acq_core(struct net *net,
const struct xfrm_mark *m,
unsigned short family, u8 mode,
- u32 reqid, u8 proto,
+ u32 reqid, u32 if_id, u8 proto,
const xfrm_address_t *daddr,
const xfrm_address_t *saddr,
int create)
@@ -1095,6 +1102,7 @@ static struct xfrm_state *__find_acq_core(struct net *net,
x->props.family = family;
x->props.mode = mode;
x->props.reqid = reqid;
+ x->if_id = if_id;
x->mark.v = m->v;
x->mark.m = m->m;
x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
@@ -1149,7 +1157,7 @@ int xfrm_state_add(struct xfrm_state *x)
if (use_spi && !x1)
x1 = __find_acq_core(net, &x->mark, family, x->props.mode,
- x->props.reqid, x->id.proto,
+ x->props.reqid, x->if_id, x->id.proto,
&x->id.daddr, &x->props.saddr, 0);
__xfrm_state_bump_genids(x);
@@ -1241,6 +1249,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig)
x->props.flags = orig->props.flags;
x->props.extra_flags = orig->props.extra_flags;
+ x->if_id = orig->if_id;
x->tfcpad = orig->tfcpad;
x->replay_maxdiff = orig->replay_maxdiff;
x->replay_maxage = orig->replay_maxage;
@@ -1395,6 +1404,19 @@ out:
if (x1->curlft.use_time)
xfrm_state_check_expire(x1);
+ if (x->props.smark.m || x->props.smark.v || x->if_id) {
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+
+ if (x->props.smark.m || x->props.smark.v)
+ x1->props.smark = x->props.smark;
+
+ if (x->if_id)
+ x1->if_id = x->if_id;
+
+ __xfrm_state_bump_genids(x1);
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ }
+
err = 0;
x->km.state = XFRM_STATE_DEAD;
__xfrm_state_put(x);
@@ -1458,13 +1480,13 @@ EXPORT_SYMBOL(xfrm_state_lookup_byaddr);
struct xfrm_state *
xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid,
- u8 proto, const xfrm_address_t *daddr,
+ u32 if_id, u8 proto, const xfrm_address_t *daddr,
const xfrm_address_t *saddr, int create, unsigned short family)
{
struct xfrm_state *x;
spin_lock_bh(&net->xfrm.xfrm_state_lock);
- x = __find_acq_core(net, mark, family, mode, reqid, proto, daddr, saddr, create);
+ x = __find_acq_core(net, mark, family, mode, reqid, if_id, proto, daddr, saddr, create);
spin_unlock_bh(&net->xfrm.xfrm_state_lock);
return x;
@@ -1922,6 +1944,7 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen
if (err >= 0) {
xfrm_sk_policy_insert(sk, err, pol);
xfrm_pol_put(pol);
+ __sk_dst_reset(sk);
err = 0;
}
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 026770884d46..0d7dd77fe0e9 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -533,6 +533,19 @@ static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs,
x->replay_maxdiff = nla_get_u32(rt);
}
+static void xfrm_smark_init(struct nlattr **attrs, struct xfrm_mark *m)
+{
+ if (attrs[XFRMA_SET_MARK]) {
+ m->v = nla_get_u32(attrs[XFRMA_SET_MARK]);
+ if (attrs[XFRMA_SET_MARK_MASK])
+ m->m = nla_get_u32(attrs[XFRMA_SET_MARK_MASK]);
+ else
+ m->m = 0xffffffff;
+ } else {
+ m->v = m->m = 0;
+ }
+}
+
static struct xfrm_state *xfrm_state_construct(struct net *net,
struct xfrm_usersa_info *p,
struct nlattr **attrs,
@@ -585,6 +598,11 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
xfrm_mark_get(attrs, &x->mark);
+ xfrm_smark_init(attrs, &x->props.smark);
+
+ if (attrs[XFRMA_IF_ID])
+ x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
err = __xfrm_init_state(x, false);
if (err)
goto error;
@@ -798,6 +816,18 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb)
return 0;
}
+static int xfrm_smark_put(struct sk_buff *skb, struct xfrm_mark *m)
+{
+ int ret = 0;
+
+ if (m->v | m->m) {
+ ret = nla_put_u32(skb, XFRMA_SET_MARK, m->v);
+ if (!ret)
+ ret = nla_put_u32(skb, XFRMA_SET_MARK_MASK, m->m);
+ }
+ return ret;
+}
+
/* Don't change this without updating xfrm_sa_len! */
static int copy_to_user_state_extra(struct xfrm_state *x,
struct xfrm_usersa_info *p,
@@ -861,6 +891,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
ret = xfrm_mark_put(skb, &x->mark);
if (ret)
goto out;
+
+ ret = xfrm_smark_put(skb, &x->props.smark);
+ if (ret)
+ goto out;
+
if (x->replay_esn)
ret = nla_put(skb, XFRMA_REPLAY_ESN_VAL,
xfrm_replay_state_esn_len(x->replay_esn),
@@ -870,6 +905,13 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
&x->replay);
if (ret)
goto out;
+
+ if (x->if_id) {
+ ret = nla_put_u32(skb, XFRMA_IF_ID, x->if_id);
+ if (ret)
+ goto out;
+ }
+
if (x->security)
ret = copy_sec_ctx(x->security, skb);
out:
@@ -1218,6 +1260,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
int err;
u32 mark;
struct xfrm_mark m;
+ u32 if_id = 0;
p = nlmsg_data(nlh);
err = verify_spi_info(p->info.id.proto, p->min, p->max);
@@ -1230,6 +1273,10 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
x = NULL;
mark = xfrm_mark_get(attrs, &m);
+
+ if (attrs[XFRMA_IF_ID])
+ if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
if (p->info.seq) {
x = xfrm_find_acq_byseq(net, mark, p->info.seq);
if (x && !xfrm_addr_equal(&x->id.daddr, daddr, family)) {
@@ -1240,7 +1287,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
if (!x)
x = xfrm_find_acq(net, &m, p->info.mode, p->info.reqid,
- p->info.id.proto, daddr,
+ if_id, p->info.id.proto, daddr,
&p->info.saddr, 1,
family);
err = -ENOENT;
@@ -1537,6 +1584,9 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_us
xfrm_mark_get(attrs, &xp->mark);
+ if (attrs[XFRMA_IF_ID])
+ xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
return xp;
error:
*errp = err;
@@ -1684,6 +1734,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr
err = copy_to_user_policy_type(xp->type, skb);
if (!err)
err = xfrm_mark_put(skb, &xp->mark);
+ if (!err)
+ err = xfrm_if_id_put(skb, xp->if_id);
if (err) {
nlmsg_cancel(skb, nlh);
return err;
@@ -1765,6 +1817,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
int delete;
struct xfrm_mark m;
u32 mark = xfrm_mark_get(attrs, &m);
+ u32 if_id = 0;
p = nlmsg_data(nlh);
delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY;
@@ -1777,8 +1830,11 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err)
return err;
+ if (attrs[XFRMA_IF_ID])
+ if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
if (p->index)
- xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, delete, &err);
+ xp = xfrm_policy_byid(net, mark, if_id, type, p->dir, p->index, delete, &err);
else {
struct nlattr *rt = attrs[XFRMA_SEC_CTX];
struct xfrm_sec_ctx *ctx;
@@ -1795,7 +1851,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err)
return err;
}
- xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir, &p->sel,
+ xp = xfrm_policy_bysel_ctx(net, mark, if_id, type, p->dir, &p->sel,
ctx, delete, &err);
security_xfrm_policy_free(ctx);
}
@@ -1919,6 +1975,10 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct
if (err)
goto out_cancel;
+ err = xfrm_if_id_put(skb, x->if_id);
+ if (err)
+ goto out_cancel;
+
nlmsg_end(skb, nlh);
return 0;
@@ -2060,6 +2120,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
int err = -ENOENT;
struct xfrm_mark m;
u32 mark = xfrm_mark_get(attrs, &m);
+ u32 if_id = 0;
err = copy_from_user_policy_type(&type, attrs);
if (err)
@@ -2069,8 +2130,11 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err)
return err;
+ if (attrs[XFRMA_IF_ID])
+ if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
if (p->index)
- xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, 0, &err);
+ xp = xfrm_policy_byid(net, mark, if_id, type, p->dir, p->index, 0, &err);
else {
struct nlattr *rt = attrs[XFRMA_SEC_CTX];
struct xfrm_sec_ctx *ctx;
@@ -2087,7 +2151,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err)
return err;
}
- xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir,
+ xp = xfrm_policy_bysel_ctx(net, mark, if_id, type, p->dir,
&p->sel, ctx, 0, &err);
security_xfrm_policy_free(ctx);
}
@@ -2446,6 +2510,9 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
[XFRMA_SA_EXTRA_FLAGS] = { .type = NLA_U32 },
[XFRMA_PROTO] = { .type = NLA_U8 },
[XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) },
+ [XFRMA_SET_MARK] = { .type = NLA_U32 },
+ [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 },
+ [XFRMA_IF_ID] = { .type = NLA_U32 },
};
static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {
@@ -2574,6 +2641,10 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct
if (err)
return err;
+ err = xfrm_if_id_put(skb, x->if_id);
+ if (err)
+ return err;
+
nlmsg_end(skb, nlh);
return 0;
}
@@ -2665,6 +2736,12 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x)
l += nla_total_size(sizeof(*x->coaddr));
if (x->props.extra_flags)
l += nla_total_size(sizeof(x->props.extra_flags));
+ if (x->props.smark.v | x->props.smark.m) {
+ l += nla_total_size(sizeof(x->props.smark.v));
+ l += nla_total_size(sizeof(x->props.smark.m));
+ }
+ if (x->if_id)
+ l += nla_total_size(sizeof(x->if_id));
/* Must count x->lastused as it may become non-zero behind our back. */
l += nla_total_size_64bit(sizeof(u64));
@@ -2792,6 +2869,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
err = copy_to_user_policy_type(xp->type, skb);
if (!err)
err = xfrm_mark_put(skb, &xp->mark);
+ if (!err)
+ err = xfrm_if_id_put(skb, xp->if_id);
if (err) {
nlmsg_cancel(skb, nlh);
return err;
@@ -2907,6 +2986,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
err = copy_to_user_policy_type(xp->type, skb);
if (!err)
err = xfrm_mark_put(skb, &xp->mark);
+ if (!err)
+ err = xfrm_if_id_put(skb, xp->if_id);
if (err) {
nlmsg_cancel(skb, nlh);
return err;
@@ -2986,6 +3067,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_e
err = copy_to_user_policy_type(xp->type, skb);
if (!err)
err = xfrm_mark_put(skb, &xp->mark);
+ if (!err)
+ err = xfrm_if_id_put(skb, xp->if_id);
if (err)
goto out_free_skb;
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 72c58675973e..b2cdced61090 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -22,6 +22,7 @@ hostprogs-y += spintest
hostprogs-y += map_perf_test
hostprogs-y += test_overhead
hostprogs-y += test_cgrp2_array_pin
+hostprogs-y += test_cgrp2_attach
hostprogs-y += xdp1
hostprogs-y += xdp2
hostprogs-y += test_current_task_under_cgroup
@@ -50,6 +51,7 @@ spintest-objs := bpf_load.o libbpf.o spintest_user.o
map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o
test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o
test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o
+test_cgrp2_attach-objs := libbpf.o test_cgrp2_attach.o
xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
# reuse xdp1 source intentionally
xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
index 9969e35550c3..9cbc786e48e5 100644
--- a/samples/bpf/libbpf.c
+++ b/samples/bpf/libbpf.c
@@ -104,6 +104,29 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}
+int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type,
+ unsigned int flags)
+{
+ union bpf_attr attr = {
+ .target_fd = target_fd,
+ .attach_bpf_fd = prog_fd,
+ .attach_type = type,
+ .attach_flags = flags;
+ };
+
+ return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
+}
+
+int bpf_prog_detach(int target_fd, enum bpf_attach_type type)
+{
+ union bpf_attr attr = {
+ .target_fd = target_fd,
+ .attach_type = type,
+ };
+
+ return syscall(__NR_bpf, BPF_PROG_DETACH, &attr, sizeof(attr));
+}
+
int bpf_obj_pin(int fd, const char *pathname)
{
union bpf_attr attr = {
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index ac6edb61b64a..b06cf5aea097 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -15,6 +15,10 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
const struct bpf_insn *insns, int insn_len,
const char *license, int kern_version);
+int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type,
+ unsigned int flags);
+int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type);
+
int bpf_obj_pin(int fd, const char *pathname);
int bpf_obj_get(const char *pathname);
diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c
new file mode 100644
index 000000000000..9de4896edeef
--- /dev/null
+++ b/samples/bpf/test_cgrp2_attach.c
@@ -0,0 +1,147 @@
+/* eBPF example program:
+ *
+ * - Creates arraymap in kernel with 4 bytes keys and 8 byte values
+ *
+ * - Loads eBPF program
+ *
+ * The eBPF program accesses the map passed in to store two pieces of
+ * information. The number of invocations of the program, which maps
+ * to the number of packets received, is stored to key 0. Key 1 is
+ * incremented on each iteration by the number of bytes stored in
+ * the skb.
+ *
+ * - Detaches any eBPF program previously attached to the cgroup
+ *
+ * - Attaches the new program to a cgroup using BPF_PROG_ATTACH
+ *
+ * - Every second, reads map[0] and map[1] to see how many bytes and
+ * packets were seen on any socket of tasks in the given cgroup.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+
+#include <linux/bpf.h>
+
+#include "libbpf.h"
+
+enum {
+ MAP_KEY_PACKETS,
+ MAP_KEY_BYTES,
+};
+
+static int prog_load(int map_fd, int verdict)
+{
+ struct bpf_insn prog[] = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* save r6 so it's not clobbered by BPF_CALL */
+
+ /* Count packets */
+ BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd), /* load map fd to r1 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+
+ /* Count bytes */
+ BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+
+ BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */
+ BPF_EXIT_INSN(),
+ };
+
+ return bpf_prog_load(BPF_PROG_TYPE_CGROUP_SKB,
+ prog, sizeof(prog), "GPL", 0);
+}
+
+static int usage(const char *argv0)
+{
+ printf("Usage: %s <cg-path> <egress|ingress> [drop]\n", argv0);
+ return EXIT_FAILURE;
+}
+
+int main(int argc, char **argv)
+{
+ int cg_fd, map_fd, prog_fd, key, ret;
+ long long pkt_cnt, byte_cnt;
+ enum bpf_attach_type type;
+ int verdict = 1;
+
+ if (argc < 3)
+ return usage(argv[0]);
+
+ if (strcmp(argv[2], "ingress") == 0)
+ type = BPF_CGROUP_INET_INGRESS;
+ else if (strcmp(argv[2], "egress") == 0)
+ type = BPF_CGROUP_INET_EGRESS;
+ else
+ return usage(argv[0]);
+
+ if (argc > 3 && strcmp(argv[3], "drop") == 0)
+ verdict = 0;
+
+ cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY);
+ if (cg_fd < 0) {
+ printf("Failed to open cgroup path: '%s'\n", strerror(errno));
+ return EXIT_FAILURE;
+ }
+
+ map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY,
+ sizeof(key), sizeof(byte_cnt),
+ 256, 0);
+ if (map_fd < 0) {
+ printf("Failed to create map: '%s'\n", strerror(errno));
+ return EXIT_FAILURE;
+ }
+
+ prog_fd = prog_load(map_fd, verdict);
+ printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf);
+
+ if (prog_fd < 0) {
+ printf("Failed to load prog: '%s'\n", strerror(errno));
+ return EXIT_FAILURE;
+ }
+
+ ret = bpf_prog_detach(cg_fd, type);
+ printf("bpf_prog_detach() returned '%s' (%d)\n", strerror(errno), errno);
+
+ ret = bpf_prog_attach(prog_fd, cg_fd, type, 0);
+ if (ret < 0) {
+ printf("Failed to attach prog to cgroup: '%s'\n",
+ strerror(errno));
+ return EXIT_FAILURE;
+ }
+
+ while (1) {
+ key = MAP_KEY_PACKETS;
+ assert(bpf_lookup_elem(map_fd, &key, &pkt_cnt) == 0);
+
+ key = MAP_KEY_BYTES;
+ assert(bpf_lookup_elem(map_fd, &key, &byte_cnt) == 0);
+
+ printf("cgroup received %lld packets, %lld bytes\n",
+ pkt_cnt, byte_cnt);
+ sleep(1);
+ }
+
+ return EXIT_SUCCESS;
+}
diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include
index 7f430778f418..512cec516939 100644
--- a/scripts/Kbuild.include
+++ b/scripts/Kbuild.include
@@ -148,6 +148,37 @@ cc-disable-warning = $(call try-run,\
# Expands to either gcc or clang
cc-name = $(shell $(CC) -v 2>&1 | grep -q "clang version" && echo clang || echo gcc)
+# __cc-version
+# Returns compiler version
+__cc-version = $(shell $(CONFIG_SHELL) $(srctree)/scripts/$(cc-name)-version.sh $(CC))
+
+# __cc-fullversion
+# Returns full compiler version
+__cc-fullversion = $(shell $(CONFIG_SHELL) \
+ $(srctree)/scripts/$(cc-name)-version.sh -p $(CC))
+
+# __cc-ifversion
+# Matches compiler name and version
+# Usage: EXTRA_CFLAGS += $(call cc-if-name-version, gcc, -lt, 0402, -O1)
+__cc-ifversion = $(shell [ $(cc-name) = $(1) ] && [ $(__cc-version) $(2) $(3) ] && echo $(4) || echo $(5))
+
+# __cc-if-fullversion
+# Matches compiler name and full version
+# Usage: EXTRA_CFLAGS += $(call cc-if-name-fullversion, gcc, -lt, 040502, -O1)
+__cc-if-fullversion = $(shell [ $(cc-name) = $(1) ] && [ $(__cc-fullversion) $(2) $(3) ] && echo $(4) || echo $(5))
+
+# gcc-ifversion
+gcc-ifversion = $(call __cc-ifversion, gcc, $(1), $(2), $(3), $(4))
+
+# gcc-if-fullversion
+gcc-if-fullversion = (call __cc-if-fullversion, gcc, $(1), $(2), $(3), $(4))
+
+# clang-ifversion
+clang-ifversion = $(call __cc-ifversion, clang, $(1), $(2), $(3), $(4))
+
+# clang-if-fullversion
+clang-if-fullversion = (call __cc-if-fullversion, clang, $(1), $(2), $(3), $(4))
+
# cc-version
cc-version = $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-version.sh $(CC))
@@ -155,9 +186,9 @@ cc-version = $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-version.sh $(CC))
cc-fullversion = $(shell $(CONFIG_SHELL) \
$(srctree)/scripts/gcc-version.sh -p $(CC))
-# cc-ifversion
-# Usage: EXTRA_CFLAGS += $(call cc-ifversion, -lt, 0402, -O1)
-cc-ifversion = $(shell [ $(cc-version) $(1) $(2) ] && echo $(3) || echo $(4))
+# backward compatibility
+cc-ifversion = $(gcc-ifversion)
+cc-if-fullversion = $(gcc-if-fullversion)
# cc-ldoption
# Usage: ldflags += $(call cc-ldoption, -Wl$(comma)--hash-style=both)
@@ -175,6 +206,10 @@ ld-option = $(call try-run,\
# Important: no spaces around options
ar-option = $(call try-run, $(AR) rc$(1) "$$TMP",$(1),$(2))
+# ld-name
+# Expands to either bfd or gold
+ld-name = $(shell $(LD) -v 2>&1 | grep -q "GNU gold" && echo gold || echo bfd)
+
# ld-version
# Note this is mainly for HJ Lu's 3 number binutil versions
ld-version = $(shell $(LD) --version | $(srctree)/scripts/ld-version.sh)
@@ -183,6 +218,18 @@ ld-version = $(shell $(LD) --version | $(srctree)/scripts/ld-version.sh)
# Usage: $(call ld-ifversion, -ge, 22252, y)
ld-ifversion = $(shell [ $(ld-version) $(1) $(2) ] && echo $(3) || echo $(4))
+# __ld-ifversion
+# Usage: $(call __ld-ifversion, gold, -ge, 112000000, y)
+__ld-ifversion = $(shell [ $(ld-name) = $(1) ] && [ $(ld-version) $(2) $(3) ] && echo $(4) || echo $(5))
+
+# bfd-ifversion
+# Usage: $(call bfd-ifversion, -ge, 227000000, y)
+bfd-ifversion = $(call __ld-ifversion, bfd, $(1), $(2), $(3), $(4))
+
+# gold-ifversion
+# Usage: $(call gold-ifversion, -ge, 112000000, y)
+gold-ifversion = $(call __ld-ifversion, gold, $(1), $(2), $(3), $(4))
+
######
###
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index 6228a83156ea..d41c72a51043 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -208,6 +208,23 @@ else
cmd_cc_o_c = $(CC) $(c_flags) -c -o $(@D)/.tmp_$(@F) $<
+ifdef CONFIG_LTO_CLANG
+# Generate .o.symversions files for each .o with exported symbols, and link these
+# to the kernel and/or modules at the end.
+cmd_modversions_c = \
+ if $(OBJDUMP) -h $(@D)/.tmp_$(@F) >/dev/null 2>/dev/null; then \
+ if $(OBJDUMP) -h $(@D)/.tmp_$(@F) | grep -q __ksymtab; then \
+ $(call cmd_gensymtypes_c,$(KBUILD_SYMTYPES),$(@:.o=.symtypes)) \
+ > $(@D)/$(@F).symversions; \
+ fi; \
+ else \
+ if $(LLVM_DIS) -o=- $(@D)/.tmp_$(@F) | grep -q __ksymtab; then \
+ $(call cmd_gensymtypes_c,$(KBUILD_SYMTYPES),$(@:.o=.symtypes)) \
+ > $(@D)/$(@F).symversions; \
+ fi; \
+ fi; \
+ mv -f $(@D)/.tmp_$(@F) $@;
+else
cmd_modversions_c = \
if $(OBJDUMP) -h $(@D)/.tmp_$(@F) | grep -q __ksymtab; then \
$(call cmd_gensymtypes_c,$(KBUILD_SYMTYPES),$(@:.o=.symtypes)) \
@@ -220,12 +237,19 @@ cmd_modversions_c = \
mv -f $(@D)/.tmp_$(@F) $@; \
fi;
endif
+endif
ifdef CONFIG_FTRACE_MCOUNT_RECORD
ifdef BUILD_C_RECORDMCOUNT
ifeq ("$(origin RECORDMCOUNT_WARN)", "command line")
RECORDMCOUNT_FLAGS = -w
endif
+
+ifdef CONFIG_LTO_CLANG
+# With LTO, we postpone running recordmcount until after the LTO link step, so
+# let's export the parameters for the link script.
+export RECORDMCOUNT_FLAGS
+else
# Due to recursion, we must skip empty.o.
# The empty.o file is created in the make process in order to determine
# the target endianness and word size. It is made before all other C
@@ -234,23 +258,29 @@ sub_cmd_record_mcount = \
if [ $(@) != "scripts/mod/empty.o" ]; then \
$(objtree)/scripts/recordmcount $(RECORDMCOUNT_FLAGS) "$(@)"; \
fi;
+endif
+
recordmcount_source := $(srctree)/scripts/recordmcount.c \
$(srctree)/scripts/recordmcount.h
-else
+else # !BUILD_C_RECORDMCOUNT
sub_cmd_record_mcount = set -e ; perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \
"$(if $(CONFIG_CPU_BIG_ENDIAN),big,little)" \
"$(if $(CONFIG_64BIT),64,32)" \
"$(OBJDUMP)" "$(OBJCOPY)" "$(CC) $(KBUILD_CFLAGS)" \
"$(LD)" "$(NM)" "$(RM)" "$(MV)" \
"$(if $(part-of-module),1,0)" "$(@)";
+
recordmcount_source := $(srctree)/scripts/recordmcount.pl
-endif
+endif # BUILD_C_RECORDMCOUNT
+
+ifndef CONFIG_LTO_CLANG
cmd_record_mcount = \
if [ "$(findstring $(CC_FLAGS_FTRACE),$(_c_flags))" = \
"$(CC_FLAGS_FTRACE)" ]; then \
$(sub_cmd_record_mcount) \
fi;
endif
+endif # CONFIG_FTRACE_MCOUNT_RECORD
ifdef CONFIG_STACK_VALIDATION
ifneq ($(SKIP_STACK_VALIDATION),1)
@@ -437,9 +467,30 @@ $(sort $(subdir-obj-y)): $(subdir-ym) ;
#
ifdef builtin-target
+ifdef CONFIG_LTO_CLANG
+ ifdef CONFIG_MODVERSIONS
+ # combine symversions for later processing
+ update_lto_symversions = \
+ rm -f $@.symversions; \
+ for i in $(filter-out FORCE,$^); do \
+ if [ -f $$i.symversions ]; then \
+ cat $$i.symversions \
+ >> $@.symversions; \
+ fi; \
+ done;
+ endif
+ # rebuild the symbol table with llvm-ar to include IR files
+ update_lto_symtable = ; \
+ mv -f $@ $@.tmp; \
+ $(LLVM_AR) rcsT$(KBUILD_ARFLAGS) $@ \
+ $$($(AR) t $@.tmp); \
+ rm -f $@.tmp
+endif
+
ifdef CONFIG_THIN_ARCHIVES
- cmd_make_builtin = rm -f $@; $(AR) rcST$(KBUILD_ARFLAGS)
- cmd_make_empty_builtin = rm -f $@; $(AR) rcST$(KBUILD_ARFLAGS)
+ cmd_make_builtin = $(update_lto_symversions) \
+ rm -f $@; $(AR) rcSTP$(KBUILD_ARFLAGS)
+ cmd_make_empty_builtin = rm -f $@; $(AR) rcSTP$(KBUILD_ARFLAGS)
quiet_cmd_link_o_target = AR $@
else
cmd_make_builtin = $(LD) $(ld_flags) -r -o
@@ -479,7 +530,11 @@ ifdef lib-target
quiet_cmd_link_l_target = AR $@
ifdef CONFIG_THIN_ARCHIVES
- cmd_link_l_target = rm -f $@; $(AR) rcsT$(KBUILD_ARFLAGS) $@ $(lib-y)
+ cmd_link_l_target = \
+ $(update_lto_symversions) \
+ rm -f $@; \
+ $(AR) rcsTP$(KBUILD_ARFLAGS) $@ $(lib-y) \
+ $(update_lto_symtable)
else
cmd_link_l_target = rm -f $@; $(AR) rcs$(KBUILD_ARFLAGS) $@ $(lib-y)
endif
@@ -497,14 +552,36 @@ else
ref_prefix = EXTERN(
endif
-quiet_cmd_export_list = EXPORTS $@
-cmd_export_list = $(OBJDUMP) -h $< | \
- sed -ne '/___ksymtab/{s/.*+/$(ref_prefix)/;s/ .*/)/;p}' >$(ksyms-lds);\
- rm -f $(dummy-object);\
- $(AR) rcs$(KBUILD_ARFLAGS) $(dummy-object);\
+filter_export_list = sed -ne '/___ksymtab/s/.*+\([^ "]*\).*/$(ref_prefix)\1)/p'
+link_export_list = rm -f $(dummy-object);\
+ echo | $(CC) $(a_flags) -c -o $(dummy-object) -x assembler -;\
$(LD) $(ld_flags) -r -o $@ -T $(ksyms-lds) $(dummy-object);\
rm $(dummy-object) $(ksyms-lds)
+quiet_cmd_export_list = EXPORTS $@
+
+ifdef CONFIG_LTO_CLANG
+# objdump doesn't understand IR files and llvm-dis doesn't support archives,
+# so we'll walk through each file in the archive separately
+cmd_export_list = \
+ rm -f $(ksyms-lds); \
+ for o in $$($(AR) t $<); do \
+ if $(OBJDUMP) -h $$o >/dev/null 2>/dev/null; then \
+ $(OBJDUMP) -h $$o | \
+ $(filter_export_list) \
+ >>$(ksyms-lds); \
+ else \
+ $(LLVM_DIS) -o=- $$o | \
+ $(filter_export_list) \
+ >>$(ksyms-lds); \
+ fi; \
+ done; \
+ $(link_export_list)
+else
+cmd_export_list = $(OBJDUMP) -h $< | $(filter_export_list) >$(ksyms-lds); \
+ $(link_export_list)
+endif
+
$(obj)/lib-ksyms.o: $(lib-target) FORCE
$(call if_changed,export_list)
@@ -528,20 +605,36 @@ $($(subst $(obj)/,,$(@:.o=-objs))) \
$($(subst $(obj)/,,$(@:.o=-y))) \
$($(subst $(obj)/,,$(@:.o=-m)))), $^)
-quiet_cmd_link_multi-y = LD $@
-cmd_link_multi-y = $(LD) $(ld_flags) -r -o $@ $(link_multi_deps) $(cmd_secanalysis)
+cmd_link_multi-link = $(LD) $(ld_flags) -r -o $@ $(link_multi_deps) $(cmd_secanalysis)
+
+ifdef CONFIG_THIN_ARCHIVES
+ quiet_cmd_link_multi-y = AR $@
+ cmd_link_multi-y = $(update_lto_symversions) \
+ rm -f $@; $(AR) rcSTP$(KBUILD_ARFLAGS) $@ $(link_multi_deps) \
+ $(update_lto_symtable)
+else
+ quiet_cmd_link_multi-y = LD $@
+ cmd_link_multi-y = $(cmd_link_multi-link)
+endif
quiet_cmd_link_multi-m = LD [M] $@
-cmd_link_multi-m = $(cmd_link_multi-y)
+
+ifdef CONFIG_LTO_CLANG
+ # don't compile IR until needed
+ cmd_link_multi-m = $(cmd_link_multi-y)
+else
+ cmd_link_multi-m = $(cmd_link_multi-link)
+endif
$(multi-used-y): FORCE
$(call if_changed,link_multi-y)
-$(call multi_depend, $(multi-used-y), .o, -objs -y)
$(multi-used-m): FORCE
$(call if_changed,link_multi-m)
@{ echo $(@:.o=.ko); echo $(link_multi_deps); \
$(cmd_undef_syms); } > $(MODVERDIR)/$(@F:.o=.mod)
+
+$(call multi_depend, $(multi-used-y), .o, -objs -y)
$(call multi_depend, $(multi-used-m), .o, -objs -y -m)
targets += $(multi-used-y) $(multi-used-m)
diff --git a/scripts/Makefile.clean b/scripts/Makefile.clean
index 50616ea25131..2e70c6f06354 100644
--- a/scripts/Makefile.clean
+++ b/scripts/Makefile.clean
@@ -11,7 +11,7 @@ include scripts/Kbuild.include
# The filename Kbuild has precedence over Makefile
kbuild-dir := $(if $(filter /%,$(src)),$(src),$(srctree)/$(src))
-include $(if $(wildcard $(kbuild-dir)/Kbuild), $(kbuild-dir)/Kbuild, $(kbuild-dir)/Makefile)
+-include $(if $(wildcard $(kbuild-dir)/Kbuild), $(kbuild-dir)/Kbuild, $(kbuild-dir)/Makefile)
# Figure out what we need to build from the various variables
# ==========================================================================
diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan
index 2624d4bf9a45..d809e00d6b61 100644
--- a/scripts/Makefile.kasan
+++ b/scripts/Makefile.kasan
@@ -9,10 +9,7 @@ KASAN_SHADOW_OFFSET ?= $(CONFIG_KASAN_SHADOW_OFFSET)
CFLAGS_KASAN_MINIMAL := -fsanitize=kernel-address
-CFLAGS_KASAN := $(call cc-option, -fsanitize=kernel-address \
- -fasan-shadow-offset=$(KASAN_SHADOW_OFFSET) \
- --param asan-stack=1 --param asan-globals=1 \
- --param asan-instrumentation-with-call-threshold=$(call_threshold))
+cc-param = $(call cc-option, -mllvm -$(1), $(call cc-option, --param $(1)))
ifeq ($(call cc-option, $(CFLAGS_KASAN_MINIMAL) -Werror),)
ifneq ($(CONFIG_COMPILE_TEST),y)
@@ -20,13 +17,24 @@ ifeq ($(call cc-option, $(CFLAGS_KASAN_MINIMAL) -Werror),)
-fsanitize=kernel-address is not supported by compiler)
endif
else
- ifeq ($(CFLAGS_KASAN),)
- ifneq ($(CONFIG_COMPILE_TEST),y)
- $(warning CONFIG_KASAN: compiler does not support all options.\
- Trying minimal configuration)
- endif
- CFLAGS_KASAN := $(CFLAGS_KASAN_MINIMAL)
- endif
+ # -fasan-shadow-offset fails without -fsanitize
+ CFLAGS_KASAN_SHADOW := $(call cc-option, -fsanitize=kernel-address \
+ -fasan-shadow-offset=$(KASAN_SHADOW_OFFSET), \
+ $(call cc-option, -fsanitize=kernel-address \
+ -mllvm -asan-mapping-offset=$(KASAN_SHADOW_OFFSET)))
+
+ ifeq ($(strip $(CFLAGS_KASAN_SHADOW)),)
+ CFLAGS_KASAN := $(CFLAGS_KASAN_MINIMAL)
+ else
+ # Now add all the compiler specific options that are valid standalone
+ CFLAGS_KASAN := $(CFLAGS_KASAN_SHADOW) \
+ $(call cc-param,asan-globals=1) \
+ $(call cc-param,asan-instrumentation-with-call-threshold=$(call_threshold)) \
+ $(call cc-param,asan-stack=1) \
+ $(call cc-param,asan-use-after-scope=1) \
+ $(call cc-param,asan-instrument-allocas=1)
+ endif
+
endif
CFLAGS_KASAN_NOSANITIZE := -fno-builtin
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 4e02d51dfc62..45b5f5589a7b 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -314,6 +314,12 @@ $(obj)/%.dtb: $(src)/%.dts FORCE
dtc-tmp = $(subst $(comma),_,$(dot-target).dts.tmp)
+# cat
+# ---------------------------------------------------------------------------
+# Concatentate multiple files together
+quiet_cmd_cat = CAT $@
+cmd_cat = (cat $(filter-out FORCE,$^) > $@) || (rm -f $@; false)
+
# Bzip2
# ---------------------------------------------------------------------------
diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst
index 07650eeaaf06..6f4c3f5a7ae3 100644
--- a/scripts/Makefile.modinst
+++ b/scripts/Makefile.modinst
@@ -29,7 +29,7 @@ quiet_cmd_modules_install = INSTALL $@
INSTALL_MOD_DIR ?= extra
ext-mod-dir = $(INSTALL_MOD_DIR)$(subst $(patsubst %/,%,$(KBUILD_EXTMOD)),,$(@D))
-modinst_dir = $(if $(KBUILD_EXTMOD),$(ext-mod-dir),kernel/$(@D))
+modinst_dir ?= $(if $(KBUILD_EXTMOD),$(ext-mod-dir),kernel/$(@D))
$(modules):
$(call cmd,modules_install,$(MODLIB)/$(modinst_dir))
diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost
index 16923ba4b5b1..b7c50cb48ba4 100644
--- a/scripts/Makefile.modpost
+++ b/scripts/Makefile.modpost
@@ -82,12 +82,28 @@ modpost = scripts/mod/modpost \
MODPOST_OPT=$(subst -i,-n,$(filter -i,$(MAKEFLAGS)))
+# If CONFIG_LTO_CLANG is enabled, .o files are either LLVM IR, or empty, so we
+# need to link them into actual objects before passing them to modpost
+modpost-ext = $(if $(CONFIG_LTO_CLANG),.lto,)
+
+ifdef CONFIG_LTO_CLANG
+quiet_cmd_cc_lto_link_modules = LD [M] $@
+cmd_cc_lto_link_modules = \
+ $(LD) $(ld_flags) -r -o $(@) \
+ $(shell [ -s $(@:$(modpost-ext).o=.o.symversions) ] && \
+ echo -T $(@:$(modpost-ext).o=.o.symversions)) \
+ --whole-archive $(filter-out FORCE,$^)
+
+$(modules:.ko=$(modpost-ext).o): %$(modpost-ext).o: %.o FORCE
+ $(call if_changed,cc_lto_link_modules)
+endif
+
# We can go over command line length here, so be careful.
quiet_cmd_modpost = MODPOST $(words $(filter-out vmlinux FORCE, $^)) modules
- cmd_modpost = $(MODLISTCMD) | sed 's/\.ko$$/.o/' | $(modpost) $(MODPOST_OPT) -s -T -
+ cmd_modpost = $(MODLISTCMD) | sed 's/\.ko$$/$(modpost-ext)\.o/' | $(modpost) $(MODPOST_OPT) -s -T -
PHONY += __modpost
-__modpost: $(modules:.ko=.o) FORCE
+__modpost: $(modules:.ko=$(modpost-ext).o) FORCE
$(call cmd,modpost) $(wildcard vmlinux)
quiet_cmd_kernel-mod = MODPOST $@
@@ -98,8 +114,7 @@ vmlinux.o: FORCE
# Declare generated files as targets for modpost
$(symverfile): __modpost ;
-$(modules:.ko=.mod.c): __modpost ;
-
+$(modules:.ko=$(modpost-ext).mod.c): __modpost ;
# Step 5), compile all *.mod.c files
@@ -110,22 +125,37 @@ quiet_cmd_cc_o_c = CC $@
cmd_cc_o_c = $(CC) $(c_flags) $(KBUILD_CFLAGS_MODULE) $(CFLAGS_MODULE) \
-c -o $@ $<
-$(modules:.ko=.mod.o): %.mod.o: %.mod.c FORCE
+$(modules:.ko=.mod.o): %.mod.o: %$(modpost-ext).mod.c FORCE
$(call if_changed_dep,cc_o_c)
-targets += $(modules:.ko=.mod.o)
+targets += $(modules:.ko=$(modpost-ext).mod.o)
ARCH_POSTLINK := $(wildcard $(srctree)/arch/$(SRCARCH)/Makefile.postlink)
# Step 6), final link of the modules with optional arch pass after final link
quiet_cmd_ld_ko_o = LD [M] $@
+
+ifdef CONFIG_LTO_CLANG
+ cmd_ld_ko_o = \
+ $(LD) -r $(LDFLAGS) \
+ $(KBUILD_LDFLAGS_MODULE) $(LDFLAGS_MODULE) \
+ $(shell [ -s $(@:.ko=.o.symversions) ] && \
+ echo -T $(@:.ko=.o.symversions)) \
+ -o $@ --whole-archive \
+ $(filter-out FORCE,$(^:$(modpost-ext).o=.o))
+
+ ifdef CONFIG_FTRACE_MCOUNT_RECORD
+ cmd_ld_ko_o += ; $(objtree)/scripts/recordmcount $(RECORDMCOUNT_FLAGS) $@
+ endif
+else
cmd_ld_ko_o = \
$(LD) -r $(LDFLAGS) \
$(KBUILD_LDFLAGS_MODULE) $(LDFLAGS_MODULE) \
-o $@ $(filter-out FORCE,$^) ; \
$(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) $@, true)
+endif
-$(modules): %.ko :%.o %.mod.o FORCE
+$(modules): %.ko: %$(modpost-ext).o %.mod.o FORCE
+$(call if_changed,ld_ko_o)
targets += $(modules)
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 55171647f516..77bc36817c2e 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2529,6 +2529,7 @@ sub process {
# Check for git id commit length and improperly formed commit descriptions
if ($in_commit_log && !$commit_log_possible_stack_dump &&
$line !~ /^\s*(?:Link|Patchwork|http|https|BugLink):/i &&
+ $line !~ /^This reverts commit [0-9a-f]{7,40}/ &&
($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i ||
($line =~ /(?:\s|^)[0-9a-f]{12,40}(?:[\s"'\(\[]|$)/i &&
$line !~ /[\<\[][0-9a-f]{12,40}[\>\]]/i &&
diff --git a/scripts/clang-android.sh b/scripts/clang-android.sh
new file mode 100755
index 000000000000..9186c4f48576
--- /dev/null
+++ b/scripts/clang-android.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+$* -dM -E - </dev/null 2>&1 | grep -q __ANDROID__ && echo "y"
diff --git a/scripts/clang-version.sh b/scripts/clang-version.sh
new file mode 100755
index 000000000000..9780efa56980
--- /dev/null
+++ b/scripts/clang-version.sh
@@ -0,0 +1,33 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# clang-version [-p] clang-command
+#
+# Prints the compiler version of `clang-command' in a canonical 4-digit form
+# such as `0500' for clang-5.0 etc.
+#
+# With the -p option, prints the patchlevel as well, for example `050001' for
+# clang-5.0.1 etc.
+#
+
+if [ "$1" = "-p" ] ; then
+ with_patchlevel=1;
+ shift;
+fi
+
+compiler="$*"
+
+if [ ${#compiler} -eq 0 ]; then
+ echo "Error: No compiler specified."
+ printf "Usage:\n\t$0 <clang-command>\n"
+ exit 1
+fi
+
+MAJOR=$(echo __clang_major__ | $compiler -E -x c - | tail -n 1)
+MINOR=$(echo __clang_minor__ | $compiler -E -x c - | tail -n 1)
+if [ "x$with_patchlevel" != "x" ] ; then
+ PATCHLEVEL=$(echo __clang_patchlevel__ | $compiler -E -x c - | tail -n 1)
+ printf "%02d%02d%02d\\n" $MAJOR $MINOR $PATCHLEVEL
+else
+ printf "%02d%02d\\n" $MAJOR $MINOR
+fi
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index f742c65108b9..8b4329c33181 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -53,7 +53,38 @@ archive_builtin()
${AR} rcsT${KBUILD_ARFLAGS} built-in.o \
${KBUILD_VMLINUX_INIT} \
${KBUILD_VMLINUX_MAIN}
+
+ if [ -n "${CONFIG_LTO_CLANG}" ]; then
+ mv -f built-in.o built-in.o.tmp
+ ${LLVM_AR} rcsT${KBUILD_ARFLAGS} built-in.o $(${AR} t built-in.o.tmp)
+ rm -f built-in.o.tmp
+ fi
+ fi
+}
+
+# If CONFIG_LTO_CLANG is selected, collect generated symbol versions into
+# .tmp_symversions
+modversions()
+{
+ if [ -z "${CONFIG_LTO_CLANG}" ]; then
+ return
fi
+
+ if [ -z "${CONFIG_MODVERSIONS}" ]; then
+ return
+ fi
+
+ rm -f .tmp_symversions
+
+ for a in built-in.o ${KBUILD_VMLINUX_LIBS}; do
+ for o in $(${AR} t $a); do
+ if [ -f ${o}.symversions ]; then
+ cat ${o}.symversions >> .tmp_symversions
+ fi
+ done
+ done
+
+ echo "-T .tmp_symversions"
}
# Link of vmlinux.o used for section mismatch analysis
@@ -70,7 +101,29 @@ modpost_link()
${KBUILD_VMLINUX_MAIN} \
--end-group"
fi
- ${LD} ${LDFLAGS} -r -o ${1} ${objects}
+
+ if [ -n "${CONFIG_LTO_CLANG}" ]; then
+ # This might take a while, so indicate that we're doing
+ # an LTO link
+ info LTO vmlinux.o
+ else
+ info LD vmlinux.o
+ fi
+
+ ${LD} ${LDFLAGS} -r -o ${1} $(modversions) ${objects}
+}
+
+# If CONFIG_LTO_CLANG is selected, we postpone running recordmcount until
+# we have compiled LLVM IR to an object file.
+recordmcount()
+{
+ if [ -z "${CONFIG_LTO_CLANG}" ]; then
+ return
+ fi
+
+ if [ -n "${CONFIG_FTRACE_MCOUNT_RECORD}" ]; then
+ scripts/recordmcount ${RECORDMCOUNT_FLAGS} $*
+ fi
}
# Link of vmlinux
@@ -82,7 +135,15 @@ vmlinux_link()
local objects
if [ "${SRCARCH}" != "um" ]; then
- if [ -n "${CONFIG_THIN_ARCHIVES}" ]; then
+ local ld=${LD}
+ local ldflags="${LDFLAGS} ${LDFLAGS_vmlinux}"
+
+ if [ -n "${LDFINAL_vmlinux}" ]; then
+ ld=${LDFINAL_vmlinux}
+ ldflags="${LDFLAGS_FINAL_vmlinux} ${LDFLAGS_vmlinux}"
+ fi
+
+ if [[ -n "${CONFIG_THIN_ARCHIVES}" && -z "${CONFIG_LTO_CLANG}" ]]; then
objects="--whole-archive built-in.o ${1}"
else
objects="${KBUILD_VMLINUX_INIT} \
@@ -92,8 +153,7 @@ vmlinux_link()
${1}"
fi
- ${LD} ${LDFLAGS} ${LDFLAGS_vmlinux} -o ${2} \
- -T ${lds} ${objects}
+ ${ld} ${ldflags} -o ${2} -T ${lds} ${objects}
else
if [ -n "${CONFIG_THIN_ARCHIVES}" ]; then
objects="-Wl,--whole-archive built-in.o ${1}"
@@ -113,7 +173,6 @@ vmlinux_link()
fi
}
-
# Create ${2} .o file with all symbols from the ${1} object file
kallsyms()
{
@@ -164,6 +223,7 @@ cleanup()
rm -f .tmp_System.map
rm -f .tmp_kallsyms*
rm -f .tmp_version
+ rm -f .tmp_symversions
rm -f .tmp_vmlinux*
rm -f built-in.o
rm -f System.map
@@ -209,15 +269,6 @@ case "${KCONFIG_CONFIG}" in
. "./${KCONFIG_CONFIG}"
esac
-archive_builtin
-
-#link vmlinux.o
-info LD vmlinux.o
-modpost_link vmlinux.o
-
-# modpost vmlinux.o to check for section mismatches
-${MAKE} -f "${srctree}/scripts/Makefile.modpost" vmlinux.o
-
# Update version
info GEN .version
if [ ! -r .version ]; then
@@ -228,9 +279,27 @@ else
expr 0$(cat .old_version) + 1 >.version;
fi;
+archive_builtin
+
+#link vmlinux.o
+modpost_link vmlinux.o
+
+# modpost vmlinux.o to check for section mismatches
+${MAKE} -f "${srctree}/scripts/Makefile.modpost" vmlinux.o
+
# final build of init/
${MAKE} -f "${srctree}/scripts/Makefile.build" obj=init GCC_PLUGINS_CFLAGS="${GCC_PLUGINS_CFLAGS}"
+if [ -n "${CONFIG_LTO_CLANG}" ]; then
+ # Re-use vmlinux.o, so we can avoid the slow LTO link step in
+ # vmlinux_link
+ KBUILD_VMLINUX_INIT=
+ KBUILD_VMLINUX_MAIN=vmlinux.o
+
+ # Call recordmcount if needed
+ recordmcount vmlinux.o
+fi
+
kallsymso=""
kallsyms_vmlinux=""
if [ -n "${CONFIG_KALLSYMS}" ]; then
diff --git a/scripts/mod/Makefile b/scripts/mod/Makefile
index b497d9764dcf..247cf1fb5a4e 100644
--- a/scripts/mod/Makefile
+++ b/scripts/mod/Makefile
@@ -1,4 +1,5 @@
OBJECT_FILES_NON_STANDARD := y
+CFLAGS_empty.o += $(DISABLE_LTO)
hostprogs-y := modpost mk_elfconfig
always := $(hostprogs-y) empty.o
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 88b3dc19bbae..e5c411433011 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -936,6 +936,7 @@ static const char *const head_sections[] = { ".head.text*", NULL };
static const char *const linker_symbols[] =
{ "__init_begin", "_sinittext", "_einittext", NULL };
static const char *const optim_symbols[] = { "*.constprop.*", NULL };
+static const char *const cfi_symbols[] = { "*.cfi", NULL };
enum mismatch {
TEXT_TO_ANY_INIT,
@@ -1157,6 +1158,16 @@ static const struct sectioncheck *section_mismatch(
* fromsec = text section
* refsymname = *.constprop.*
*
+ * Pattern 6:
+ * With CONFIG_CFI_CLANG, clang appends .cfi to all indirectly called
+ * functions and creates a function stub with the original name. This
+ * stub is always placed in .text, even if the actual function with the
+ * .cfi postfix is in .init.text or .exit.text.
+ * This pattern is identified by
+ * tosec = init or exit section
+ * fromsec = text section
+ * tosym = *.cfi
+ *
**/
static int secref_whitelist(const struct sectioncheck *mismatch,
const char *fromsec, const char *fromsym,
@@ -1195,9 +1206,39 @@ static int secref_whitelist(const struct sectioncheck *mismatch,
match(fromsym, optim_symbols))
return 0;
+ /* Check for pattern 6 */
+ if (match(fromsec, text_sections) &&
+ match(tosec, init_exit_sections) &&
+ match(tosym, cfi_symbols))
+ return 0;
+
return 1;
}
+static inline int is_arm_mapping_symbol(const char *str)
+{
+ return str[0] == '$' && strchr("axtd", str[1])
+ && (str[2] == '\0' || str[2] == '.');
+}
+
+/*
+ * If there's no name there, ignore it; likewise, ignore it if it's
+ * one of the magic symbols emitted used by current ARM tools.
+ *
+ * Otherwise if find_symbols_between() returns those symbols, they'll
+ * fail the whitelist tests and cause lots of false alarms ... fixable
+ * only by merging __exit and __init sections into __text, bloating
+ * the kernel (which is especially evil on embedded platforms).
+ */
+static inline int is_valid_name(struct elf_info *elf, Elf_Sym *sym)
+{
+ const char *name = elf->strtab + sym->st_name;
+
+ if (!name || !strlen(name))
+ return 0;
+ return !is_arm_mapping_symbol(name);
+}
+
/**
* Find symbol based on relocation record info.
* In some cases the symbol supplied is a valid symbol so
@@ -1223,6 +1264,8 @@ static Elf_Sym *find_elf_symbol(struct elf_info *elf, Elf64_Sword addr,
continue;
if (ELF_ST_TYPE(sym->st_info) == STT_SECTION)
continue;
+ if (!is_valid_name(elf, sym))
+ continue;
if (sym->st_value == addr)
return sym;
/* Find a symbol nearby - addr are maybe negative */
@@ -1241,30 +1284,6 @@ static Elf_Sym *find_elf_symbol(struct elf_info *elf, Elf64_Sword addr,
return NULL;
}
-static inline int is_arm_mapping_symbol(const char *str)
-{
- return str[0] == '$' && strchr("axtd", str[1])
- && (str[2] == '\0' || str[2] == '.');
-}
-
-/*
- * If there's no name there, ignore it; likewise, ignore it if it's
- * one of the magic symbols emitted used by current ARM tools.
- *
- * Otherwise if find_symbols_between() returns those symbols, they'll
- * fail the whitelist tests and cause lots of false alarms ... fixable
- * only by merging __exit and __init sections into __text, bloating
- * the kernel (which is especially evil on embedded platforms).
- */
-static inline int is_valid_name(struct elf_info *elf, Elf_Sym *sym)
-{
- const char *name = elf->strtab + sym->st_name;
-
- if (!name || !strlen(name))
- return 0;
- return !is_arm_mapping_symbol(name);
-}
-
/*
* Find symbols before or equal addr and after addr - in the section sec.
* If we find two symbols with equal offset prefer one with a valid name.
diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c
index 5423a58d1b06..e0c508203c9c 100644
--- a/scripts/recordmcount.c
+++ b/scripts/recordmcount.c
@@ -366,7 +366,8 @@ is_mcounted_section_name(char const *const txtname)
strcmp(".softirqentry.text", txtname) == 0 ||
strcmp(".kprobes.text", txtname) == 0 ||
strcmp(".cpuidle.text", txtname) == 0 ||
- strcmp(".text.unlikely", txtname) == 0;
+ (strncmp(".text.", txtname, 6) == 0 &&
+ strcmp(".text..ftrace", txtname) != 0);
}
/* 32 bit and 64 bit are very similar */
diff --git a/security/Kconfig b/security/Kconfig
index 32f36b40e9f0..80a2934d3110 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -18,6 +18,15 @@ config SECURITY_DMESG_RESTRICT
If you are unsure how to answer this question, answer N.
+config SECURITY_PERF_EVENTS_RESTRICT
+ bool "Restrict unprivileged use of performance events"
+ depends on PERF_EVENTS
+ help
+ If you say Y here, the kernel.perf_event_paranoid sysctl
+ will be set to 3 by default, and no unprivileged use of the
+ perf_event_open syscall will be permitted unless it is
+ changed.
+
config SECURITY
bool "Enable different security models"
depends on SYSFS
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 02cc952b86aa..1d3b8abaec1c 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -656,11 +656,11 @@ static const struct kernel_param_ops param_ops_aalockpolicy = {
.get = param_get_aalockpolicy
};
-static int param_set_audit(const char *val, struct kernel_param *kp);
-static int param_get_audit(char *buffer, struct kernel_param *kp);
+static int param_set_audit(const char *val, const struct kernel_param *kp);
+static int param_get_audit(char *buffer, const struct kernel_param *kp);
-static int param_set_mode(const char *val, struct kernel_param *kp);
-static int param_get_mode(char *buffer, struct kernel_param *kp);
+static int param_set_mode(const char *val, const struct kernel_param *kp);
+static int param_get_mode(char *buffer, const struct kernel_param *kp);
/* Flag values, also controllable via /sys/module/apparmor/parameters
* We define special types as we want to do additional mediation.
@@ -774,7 +774,7 @@ static int param_get_aauint(char *buffer, const struct kernel_param *kp)
return param_get_uint(buffer, kp);
}
-static int param_get_audit(char *buffer, struct kernel_param *kp)
+static int param_get_audit(char *buffer, const struct kernel_param *kp)
{
if (!policy_view_capable())
return -EPERM;
@@ -785,7 +785,7 @@ static int param_get_audit(char *buffer, struct kernel_param *kp)
return sprintf(buffer, "%s", audit_mode_names[aa_g_audit]);
}
-static int param_set_audit(const char *val, struct kernel_param *kp)
+static int param_set_audit(const char *val, const struct kernel_param *kp)
{
int i;
if (!policy_admin_capable())
@@ -807,7 +807,7 @@ static int param_set_audit(const char *val, struct kernel_param *kp)
return -EINVAL;
}
-static int param_get_mode(char *buffer, struct kernel_param *kp)
+static int param_get_mode(char *buffer, const struct kernel_param *kp)
{
if (!policy_admin_capable())
return -EPERM;
@@ -818,7 +818,7 @@ static int param_get_mode(char *buffer, struct kernel_param *kp)
return sprintf(buffer, "%s", aa_profile_mode_names[aa_g_profile_mode]);
}
-static int param_set_mode(const char *val, struct kernel_param *kp)
+static int param_set_mode(const char *val, const struct kernel_param *kp)
{
int i;
if (!policy_admin_capable())
diff --git a/security/commoncap.c b/security/commoncap.c
index 8df676fbd393..1076608ff38f 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -31,6 +31,10 @@
#include <linux/binfmts.h>
#include <linux/personality.h>
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+#include <linux/android_aid.h>
+#endif
+
/*
* If a non-root user executes a setuid-root binary in
* !secure(SECURE_NOROOT) mode, then we raise capabilities.
@@ -54,7 +58,7 @@ static void warn_setuid_and_fcaps_mixed(const char *fname)
}
/**
- * cap_capable - Determine whether a task has a particular effective capability
+ * __cap_capable - Determine whether a task has a particular effective capability
* @cred: The credentials to use
* @ns: The user namespace in which we need the capability
* @cap: The capability to check for
@@ -68,7 +72,7 @@ static void warn_setuid_and_fcaps_mixed(const char *fname)
* cap_has_capability() returns 0 when a task has a capability, but the
* kernel's capable() and has_capability() returns 1 for this case.
*/
-int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
+int __cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
int cap, int audit)
{
struct user_namespace *ns = targ_ns;
@@ -103,6 +107,27 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
/* We never get here */
}
+int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
+ int cap, int audit)
+{
+ int ret = __cap_capable(cred, targ_ns, cap, audit);
+
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+ if (ret != 0 && cap == CAP_NET_RAW && in_egroup_p(AID_NET_RAW)) {
+ printk("Process %s granted CAP_NET_RAW from Android group net_raw.\n", current->comm);
+ printk(" Please update the .rc file to explictly set 'capabilities NET_RAW'\n");
+ printk(" Implicit grants are deprecated and will be removed in the future.\n");
+ return 0;
+ }
+ if (ret != 0 && cap == CAP_NET_ADMIN && in_egroup_p(AID_NET_ADMIN)) {
+ printk("Process %s granted CAP_NET_ADMIN from Android group net_admin.\n", current->comm);
+ printk(" Please update the .rc file to explictly set 'capabilities NET_ADMIN'\n");
+ printk(" Implicit grants are deprecated and will be removed in the future.\n");
+ return 0;
+ }
+#endif
+ return ret;
+}
/**
* cap_settime - Determine whether the current process may set the system clock
* @ts: The time to set
diff --git a/security/inode.c b/security/inode.c
index c83db05c15ab..b4531f2be0f1 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -100,7 +100,7 @@ struct dentry *securityfs_create_file(const char *name, umode_t mode,
dir = d_inode(parent);
inode_lock(dir);
- dentry = lookup_one_len(name, parent, strlen(name));
+ dentry = lookup_one_len2(name, mount, parent, strlen(name));
if (IS_ERR(dentry))
goto out;
diff --git a/security/keys/dh.c b/security/keys/dh.c
index 531ed2ec132f..893af4c45038 100644
--- a/security/keys/dh.c
+++ b/security/keys/dh.c
@@ -55,7 +55,7 @@ static ssize_t mpi_from_key(key_serial_t keyid, size_t maxlen, MPI *mpi)
if (status == 0) {
const struct user_key_payload *payload;
- payload = user_key_payload(key);
+ payload = user_key_payload_locked(key);
if (maxlen == 0) {
*mpi = NULL;
diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c
index ead2fd60244d..1f3d60021e05 100644
--- a/security/keys/encrypted-keys/encrypted.c
+++ b/security/keys/encrypted-keys/encrypted.c
@@ -313,7 +313,7 @@ static struct key *request_user_key(const char *master_desc, const u8 **master_k
goto error;
down_read(&ukey->sem);
- upayload = user_key_payload(ukey);
+ upayload = user_key_payload_locked(ukey);
if (!upayload) {
/* key was revoked before we acquired its semaphore */
up_read(&ukey->sem);
@@ -933,7 +933,7 @@ static long encrypted_read(const struct key *key, char __user *buffer,
size_t asciiblob_len;
int ret;
- epayload = rcu_dereference_key(key);
+ epayload = dereference_key_locked(key);
/* returns the hex encoded iv, encrypted-data, and hmac as ascii */
asciiblob_len = epayload->datablob_len + ivsize + 1
diff --git a/security/keys/trusted.c b/security/keys/trusted.c
index 4ba2f6b91242..e0fcb17068f5 100644
--- a/security/keys/trusted.c
+++ b/security/keys/trusted.c
@@ -1139,12 +1139,12 @@ out:
static long trusted_read(const struct key *key, char __user *buffer,
size_t buflen)
{
- struct trusted_key_payload *p;
+ const struct trusted_key_payload *p;
char *ascii_buf;
char *bufp;
int i;
- p = rcu_dereference_key(key);
+ p = dereference_key_locked(key);
if (!p)
return -EINVAL;
diff --git a/security/keys/user_defined.c b/security/keys/user_defined.c
index 3dc2607211cc..b4c170af7532 100644
--- a/security/keys/user_defined.c
+++ b/security/keys/user_defined.c
@@ -107,7 +107,7 @@ int user_update(struct key *key, struct key_preparsed_payload *prep)
/* attach the new data, displacing the old */
key->expiry = prep->expiry;
if (key_is_positive(key))
- zap = rcu_dereference_key(key);
+ zap = dereference_key_locked(key);
rcu_assign_keypointer(key, prep->payload.data[0]);
prep->payload.data[0] = NULL;
@@ -123,7 +123,7 @@ EXPORT_SYMBOL_GPL(user_update);
*/
void user_revoke(struct key *key)
{
- struct user_key_payload *upayload = key->payload.data[0];
+ struct user_key_payload *upayload = user_key_payload_locked(key);
/* clear the quota */
key_payload_reserve(key, 0);
@@ -169,7 +169,7 @@ long user_read(const struct key *key, char __user *buffer, size_t buflen)
const struct user_key_payload *upayload;
long ret;
- upayload = user_key_payload(key);
+ upayload = user_key_payload_locked(key);
ret = upayload->datalen;
/* we can return the data as is */
diff --git a/security/security.c b/security/security.c
index f825304f04a7..99890aa11652 100644
--- a/security/security.c
+++ b/security/security.c
@@ -11,6 +11,7 @@
* (at your option) any later version.
*/
+#include <linux/bpf.h>
#include <linux/capability.h>
#include <linux/dcache.h>
#include <linux/module.h>
@@ -508,6 +509,7 @@ int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
return 0;
return call_int_hook(path_chown, 0, path, uid, gid);
}
+EXPORT_SYMBOL(security_path_chown);
int security_path_chroot(const struct path *path)
{
@@ -1589,6 +1591,37 @@ int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule,
}
#endif /* CONFIG_AUDIT */
+#ifdef CONFIG_BPF_SYSCALL
+int security_bpf(int cmd, union bpf_attr *attr, unsigned int size)
+{
+ return call_int_hook(bpf, 0, cmd, attr, size);
+}
+int security_bpf_map(struct bpf_map *map, fmode_t fmode)
+{
+ return call_int_hook(bpf_map, 0, map, fmode);
+}
+int security_bpf_prog(struct bpf_prog *prog)
+{
+ return call_int_hook(bpf_prog, 0, prog);
+}
+int security_bpf_map_alloc(struct bpf_map *map)
+{
+ return call_int_hook(bpf_map_alloc_security, 0, map);
+}
+int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
+{
+ return call_int_hook(bpf_prog_alloc_security, 0, aux);
+}
+void security_bpf_map_free(struct bpf_map *map)
+{
+ call_void_hook(bpf_map_free_security, map);
+}
+void security_bpf_prog_free(struct bpf_prog_aux *aux)
+{
+ call_void_hook(bpf_prog_free_security, aux);
+}
+#endif /* CONFIG_BPF_SYSCALL */
+
struct security_hook_heads security_hook_heads = {
.binder_set_context_mgr =
LIST_HEAD_INIT(security_hook_heads.binder_set_context_mgr),
@@ -1940,4 +1973,20 @@ struct security_hook_heads security_hook_heads = {
.audit_rule_free =
LIST_HEAD_INIT(security_hook_heads.audit_rule_free),
#endif /* CONFIG_AUDIT */
+#ifdef CONFIG_BPF_SYSCALL
+ .bpf =
+ LIST_HEAD_INIT(security_hook_heads.bpf),
+ .bpf_map =
+ LIST_HEAD_INIT(security_hook_heads.bpf_map),
+ .bpf_prog =
+ LIST_HEAD_INIT(security_hook_heads.bpf_prog),
+ .bpf_map_alloc_security =
+ LIST_HEAD_INIT(security_hook_heads.bpf_map_alloc_security),
+ .bpf_map_free_security =
+ LIST_HEAD_INIT(security_hook_heads.bpf_map_free_security),
+ .bpf_prog_alloc_security =
+ LIST_HEAD_INIT(security_hook_heads.bpf_prog_alloc_security),
+ .bpf_prog_free_security =
+ LIST_HEAD_INIT(security_hook_heads.bpf_prog_free_security),
+#endif /* CONFIG_BPF_SYSCALL */
};
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 8ded80867b92..70b8ac1893e4 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -83,6 +83,7 @@
#include <linux/export.h>
#include <linux/msg.h>
#include <linux/shm.h>
+#include <linux/bpf.h>
#include "avc.h"
#include "objsec.h"
@@ -479,6 +480,7 @@ static int selinux_is_sblabel_mnt(struct super_block *sb)
!strcmp(sb->s_type->name, "sysfs") ||
!strcmp(sb->s_type->name, "pstore") ||
!strcmp(sb->s_type->name, "debugfs") ||
+ !strcmp(sb->s_type->name, "tracefs") ||
!strcmp(sb->s_type->name, "rootfs");
}
@@ -797,6 +799,7 @@ static int selinux_set_mnt_opts(struct super_block *sb,
sbsec->flags |= SE_SBPROC | SE_SBGENFS;
if (!strcmp(sb->s_type->name, "debugfs") ||
+ !strcmp(sb->s_type->name, "tracefs") ||
!strcmp(sb->s_type->name, "sysfs") ||
!strcmp(sb->s_type->name, "pstore"))
sbsec->flags |= SE_SBGENFS;
@@ -1745,6 +1748,10 @@ static inline int file_path_has_perm(const struct cred *cred,
return inode_has_perm(cred, file_inode(file), av, &ad);
}
+#ifdef CONFIG_BPF_SYSCALL
+static int bpf_fd_pass(struct file *file, u32 sid);
+#endif
+
/* Check whether a task can use an open file descriptor to
access an inode in a given way. Check access to the
descriptor itself, and then use dentry_has_perm to
@@ -1775,6 +1782,12 @@ static int file_has_perm(const struct cred *cred,
goto out;
}
+#ifdef CONFIG_BPF_SYSCALL
+ rc = bpf_fd_pass(file, cred_sid(cred));
+ if (rc)
+ return rc;
+#endif
+
/* av is zero if only checking access to the descriptor. */
rc = 0;
if (av)
@@ -2104,6 +2117,12 @@ static int selinux_binder_transfer_file(struct task_struct *from,
return rc;
}
+#ifdef CONFIG_BPF_SYSCALL
+ rc = bpf_fd_pass(file, sid);
+ if (rc)
+ return rc;
+#endif
+
if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
return 0;
@@ -6073,6 +6092,139 @@ static int selinux_key_getsecurity(struct key *key, char **_buffer)
#endif
+#ifdef CONFIG_BPF_SYSCALL
+static int selinux_bpf(int cmd, union bpf_attr *attr,
+ unsigned int size)
+{
+ u32 sid = current_sid();
+ int ret;
+
+ switch (cmd) {
+ case BPF_MAP_CREATE:
+ ret = avc_has_perm(sid, sid, SECCLASS_BPF, BPF__MAP_CREATE,
+ NULL);
+ break;
+ case BPF_PROG_LOAD:
+ ret = avc_has_perm(sid, sid, SECCLASS_BPF, BPF__PROG_LOAD,
+ NULL);
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+
+ return ret;
+}
+
+static u32 bpf_map_fmode_to_av(fmode_t fmode)
+{
+ u32 av = 0;
+
+ if (fmode & FMODE_READ)
+ av |= BPF__MAP_READ;
+ if (fmode & FMODE_WRITE)
+ av |= BPF__MAP_WRITE;
+ return av;
+}
+
+/* This function will check the file pass through unix socket or binder to see
+ * if it is a bpf related object. And apply correspinding checks on the bpf
+ * object based on the type. The bpf maps and programs, not like other files and
+ * socket, are using a shared anonymous inode inside the kernel as their inode.
+ * So checking that inode cannot identify if the process have privilege to
+ * access the bpf object and that's why we have to add this additional check in
+ * selinux_file_receive and selinux_binder_transfer_files.
+ */
+static int bpf_fd_pass(struct file *file, u32 sid)
+{
+ struct bpf_security_struct *bpfsec;
+ struct bpf_prog *prog;
+ struct bpf_map *map;
+ int ret;
+
+ if (file->f_op == &bpf_map_fops) {
+ map = file->private_data;
+ bpfsec = map->security;
+ ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
+ bpf_map_fmode_to_av(file->f_mode), NULL);
+ if (ret)
+ return ret;
+ } else if (file->f_op == &bpf_prog_fops) {
+ prog = file->private_data;
+ bpfsec = prog->aux->security;
+ ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
+ BPF__PROG_RUN, NULL);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static int selinux_bpf_map(struct bpf_map *map, fmode_t fmode)
+{
+ u32 sid = current_sid();
+ struct bpf_security_struct *bpfsec;
+
+ bpfsec = map->security;
+ return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
+ bpf_map_fmode_to_av(fmode), NULL);
+}
+
+static int selinux_bpf_prog(struct bpf_prog *prog)
+{
+ u32 sid = current_sid();
+ struct bpf_security_struct *bpfsec;
+
+ bpfsec = prog->aux->security;
+ return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
+ BPF__PROG_RUN, NULL);
+}
+
+static int selinux_bpf_map_alloc(struct bpf_map *map)
+{
+ struct bpf_security_struct *bpfsec;
+
+ bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL);
+ if (!bpfsec)
+ return -ENOMEM;
+
+ bpfsec->sid = current_sid();
+ map->security = bpfsec;
+
+ return 0;
+}
+
+static void selinux_bpf_map_free(struct bpf_map *map)
+{
+ struct bpf_security_struct *bpfsec = map->security;
+
+ map->security = NULL;
+ kfree(bpfsec);
+}
+
+static int selinux_bpf_prog_alloc(struct bpf_prog_aux *aux)
+{
+ struct bpf_security_struct *bpfsec;
+
+ bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL);
+ if (!bpfsec)
+ return -ENOMEM;
+
+ bpfsec->sid = current_sid();
+ aux->security = bpfsec;
+
+ return 0;
+}
+
+static void selinux_bpf_prog_free(struct bpf_prog_aux *aux)
+{
+ struct bpf_security_struct *bpfsec = aux->security;
+
+ aux->security = NULL;
+ kfree(bpfsec);
+}
+#endif
+
static struct security_hook_list selinux_hooks[] = {
LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr),
LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction),
@@ -6287,6 +6439,16 @@ static struct security_hook_list selinux_hooks[] = {
LSM_HOOK_INIT(audit_rule_match, selinux_audit_rule_match),
LSM_HOOK_INIT(audit_rule_free, selinux_audit_rule_free),
#endif
+
+#ifdef CONFIG_BPF_SYSCALL
+ LSM_HOOK_INIT(bpf, selinux_bpf),
+ LSM_HOOK_INIT(bpf_map, selinux_bpf_map),
+ LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog),
+ LSM_HOOK_INIT(bpf_map_alloc_security, selinux_bpf_map_alloc),
+ LSM_HOOK_INIT(bpf_prog_alloc_security, selinux_bpf_prog_alloc),
+ LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free),
+ LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free),
+#endif
};
static __init int selinux_init(void)
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 1f1f4b2f6018..963ff8009dcc 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -165,5 +165,7 @@ struct security_class_mapping secclass_map[] = {
{ COMMON_CAP_PERMS, NULL } },
{ "cap2_userns",
{ COMMON_CAP2_PERMS, NULL } },
+ { "bpf",
+ {"map_create", "map_read", "map_write", "prog_load", "prog_run"} },
{ NULL }
};
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index c21e135460a5..43535cdd4c37 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -128,6 +128,10 @@ struct key_security_struct {
u32 sid; /* SID of key */
};
+struct bpf_security_struct {
+ u32 sid; /*SID of bpf obj creater*/
+};
+
extern unsigned int selinux_checkreqprot;
#endif /* _SELINUX_OBJSEC_H_ */
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f09c70b97eca..a339bea1f4c8 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -73,6 +73,8 @@ enum bpf_cmd {
BPF_PROG_LOAD,
BPF_OBJ_PIN,
BPF_OBJ_GET,
+ BPF_PROG_ATTACH,
+ BPF_PROG_DETACH,
};
enum bpf_map_type {
@@ -96,8 +98,23 @@ enum bpf_prog_type {
BPF_PROG_TYPE_TRACEPOINT,
BPF_PROG_TYPE_XDP,
BPF_PROG_TYPE_PERF_EVENT,
+ BPF_PROG_TYPE_CGROUP_SKB,
};
+enum bpf_attach_type {
+ BPF_CGROUP_INET_INGRESS,
+ BPF_CGROUP_INET_EGRESS,
+ __MAX_BPF_ATTACH_TYPE
+};
+
+#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
+
+/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command
+ * to the given target_fd cgroup the descendent cgroup will be able to
+ * override effective bpf program that was inherited from this cgroup
+ */
+#define BPF_F_ALLOW_OVERRIDE (1U << 0)
+
#define BPF_PSEUDO_MAP_FD 1
/* flags for BPF_MAP_UPDATE_ELEM command */
@@ -107,6 +124,10 @@ enum bpf_prog_type {
#define BPF_F_NO_PREALLOC (1U << 0)
+/* Flags for accessing BPF object */
+#define BPF_F_RDONLY (1U << 3)
+#define BPF_F_WRONLY (1U << 4)
+
union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */
__u32 map_type; /* one of enum bpf_map_type */
@@ -140,6 +161,14 @@ union bpf_attr {
struct { /* anonymous struct used by BPF_OBJ_* commands */
__aligned_u64 pathname;
__u32 bpf_fd;
+ __u32 file_flags;
+ };
+
+ struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
+ __u32 target_fd; /* container object to attach to */
+ __u32 attach_bpf_fd; /* eBPF program to attach */
+ __u32 attach_type;
+ __u32 attach_flags;
};
} __attribute__((aligned(8)));
@@ -426,6 +455,67 @@ enum bpf_func_id {
*/
BPF_FUNC_set_hash_invalid,
+ /**
+ * int bpf_get_numa_node_id()
+ * Return: Id of current NUMA node.
+ */
+ BPF_FUNC_get_numa_node_id,
+
+ /**
+ * int bpf_skb_change_head()
+ * Grows headroom of skb and adjusts MAC header offset accordingly.
+ * Will extends/reallocae as required automatically.
+ * May change skb data pointer and will thus invalidate any check
+ * performed for direct packet access.
+ * @skb: pointer to skb
+ * @len: length of header to be pushed in front
+ * @flags: Flags (unused for now)
+ * Return: 0 on success or negative error
+ */
+ BPF_FUNC_skb_change_head,
+
+ /**
+ * int bpf_xdp_adjust_head(xdp_md, delta)
+ * Adjust the xdp_md.data by delta
+ * @xdp_md: pointer to xdp_md
+ * @delta: An positive/negative integer to be added to xdp_md.data
+ * Return: 0 on success or negative on error
+ */
+ BPF_FUNC_xdp_adjust_head,
+
+ /**
+ * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
+ * Copy a NUL terminated string from unsafe address. In case the string
+ * length is smaller than size, the target is not padded with further NUL
+ * bytes. In case the string length is larger than size, just count-1
+ * bytes are copied and the last byte is set to NUL.
+ * @dst: destination address
+ * @size: maximum number of bytes to copy, including the trailing NUL
+ * @unsafe_ptr: unsafe address
+ * Return:
+ * > 0 length of the string including the trailing NUL on success
+ * < 0 error
+ */
+ BPF_FUNC_probe_read_str,
+
+ /**
+ * u64 bpf_bpf_get_socket_cookie(skb)
+ * Get the cookie for the socket stored inside sk_buff.
+ * @skb: pointer to skb
+ * Return: 8 Bytes non-decreasing number on success or 0 if the socket
+ * field is missing inside sk_buff
+ */
+ BPF_FUNC_get_socket_cookie,
+
+ /**
+ * u32 bpf_get_socket_uid(skb)
+ * Get the owner uid of the socket stored inside sk_buff.
+ * @skb: pointer to skb
+ * Return: uid of the socket owner on success or 0 if the socket pointer
+ * inside sk_buff is NULL
+ */
+ BPF_FUNC_get_socket_uid,
+
__BPF_FUNC_MAX_ID,
};
diff --git a/tools/include/uapi/linux/hw_breakpoint.h b/tools/include/uapi/linux/hw_breakpoint.h
index b04000a2296a..2b65efd19a46 100644
--- a/tools/include/uapi/linux/hw_breakpoint.h
+++ b/tools/include/uapi/linux/hw_breakpoint.h
@@ -4,7 +4,11 @@
enum {
HW_BREAKPOINT_LEN_1 = 1,
HW_BREAKPOINT_LEN_2 = 2,
+ HW_BREAKPOINT_LEN_3 = 3,
HW_BREAKPOINT_LEN_4 = 4,
+ HW_BREAKPOINT_LEN_5 = 5,
+ HW_BREAKPOINT_LEN_6 = 6,
+ HW_BREAKPOINT_LEN_7 = 7,
HW_BREAKPOINT_LEN_8 = 8,
};
diff --git a/verity_dev_keys.x509 b/verity_dev_keys.x509
new file mode 100644
index 000000000000..86399c3c1dd7
--- /dev/null
+++ b/verity_dev_keys.x509
@@ -0,0 +1,24 @@
+-----BEGIN CERTIFICATE-----
+MIID/TCCAuWgAwIBAgIJAJcPmDkJqolJMA0GCSqGSIb3DQEBBQUAMIGUMQswCQYD
+VQQGEwJVUzETMBEGA1UECAwKQ2FsaWZvcm5pYTEWMBQGA1UEBwwNTW91bnRhaW4g
+VmlldzEQMA4GA1UECgwHQW5kcm9pZDEQMA4GA1UECwwHQW5kcm9pZDEQMA4GA1UE
+AwwHQW5kcm9pZDEiMCAGCSqGSIb3DQEJARYTYW5kcm9pZEBhbmRyb2lkLmNvbTAe
+Fw0xNDExMDYxOTA3NDBaFw00MjAzMjQxOTA3NDBaMIGUMQswCQYDVQQGEwJVUzET
+MBEGA1UECAwKQ2FsaWZvcm5pYTEWMBQGA1UEBwwNTW91bnRhaW4gVmlldzEQMA4G
+A1UECgwHQW5kcm9pZDEQMA4GA1UECwwHQW5kcm9pZDEQMA4GA1UEAwwHQW5kcm9p
+ZDEiMCAGCSqGSIb3DQEJARYTYW5kcm9pZEBhbmRyb2lkLmNvbTCCASIwDQYJKoZI
+hvcNAQEBBQADggEPADCCAQoCggEBAOjreE0vTVSRenuzO9vnaWfk0eQzYab0gqpi
+6xAzi6dmD+ugoEKJmbPiuE5Dwf21isZ9uhUUu0dQM46dK4ocKxMRrcnmGxydFn6o
+fs3ODJMXOkv2gKXL/FdbEPdDbxzdu8z3yk+W67udM/fW7WbaQ3DO0knu+izKak/3
+T41c5uoXmQ81UNtAzRGzGchNVXMmWuTGOkg6U+0I2Td7K8yvUMWhAWPPpKLtVH9r
+AL5TzjYNR92izdKcz3AjRsI3CTjtpiVABGeX0TcjRSuZB7K9EK56HV+OFNS6I1NP
+jdD7FIShyGlqqZdUOkAUZYanbpgeT5N7QL6uuqcGpoTOkalu6kkCAwEAAaNQME4w
+HQYDVR0OBBYEFH5DM/m7oArf4O3peeKO0ZIEkrQPMB8GA1UdIwQYMBaAFH5DM/m7
+oArf4O3peeKO0ZIEkrQPMAwGA1UdEwQFMAMBAf8wDQYJKoZIhvcNAQEFBQADggEB
+AHO3NSvDE5jFvMehGGtS8BnFYdFKRIglDMc4niWSzhzOVYRH4WajxdtBWc5fx0ix
+NF/+hVKVhP6AIOQa+++sk+HIi7RvioPPbhjcsVlZe7cUEGrLSSveGouQyc+j0+m6
+JF84kszIl5GGNMTnx0XRPO+g8t6h5LWfnVydgZfpGRRg+WHewk1U2HlvTjIceb0N
+dcoJ8WKJAFWdcuE7VIm4w+vF/DYX/A2Oyzr2+QRhmYSv1cusgAeC1tvH4ap+J1Lg
+UnOu5Kh/FqPLLSwNVQp4Bu7b9QFfqK8Moj84bj88NqRGZgDyqzuTrFxn6FW7dmyA
+yttuAJAEAymk1mipd9+zp38=
+-----END CERTIFICATE-----