-rw-r--r--Documentation/DocBook/kernel-hacking.tmpl2
-rw-r--r--Documentation/RCU/00-INDEX2
-rw-r--r--Documentation/RCU/Design/Requirements/Requirements.html34
-rw-r--r--Documentation/RCU/checklist.txt8
-rw-r--r--Documentation/RCU/trace.txt535
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt57
-rw-r--r--Documentation/block/data-integrity.txt6
-rw-r--r--Documentation/core-api/atomic_ops.rst5
-rw-r--r--Documentation/dev-tools/sparse.rst6
-rw-r--r--Documentation/devicetree/bindings/clock/sunxi-ccu.txt7
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-mvebu.txt6
-rw-r--r--Documentation/devicetree/bindings/mfd/stm32-timers.txt2
-rw-r--r--Documentation/devicetree/bindings/net/dsa/b53.txt2
-rw-r--r--Documentation/devicetree/bindings/net/smsc911x.txt1
-rw-r--r--Documentation/filesystems/autofs4.txt12
-rw-r--r--Documentation/kernel-per-CPU-kthreads.txt31
-rw-r--r--Documentation/memory-barriers.txt2
-rw-r--r--Documentation/networking/scaling.txt2
-rw-r--r--Documentation/scheduler/sched-deadline.txt168
-rw-r--r--Documentation/timers/NO_HZ.txt29
-rw-r--r--Documentation/trace/ftrace.txt2
-rw-r--r--MAINTAINERS13
-rw-r--r--Makefile4
-rw-r--r--arch/Kconfig9
-rw-r--r--arch/arc/include/asm/processor.h2
-rw-r--r--arch/arc/mm/mmap.c2
-rw-r--r--arch/arm/Kconfig20
-rw-r--r--arch/arm/boot/compressed/efi-header.S3
-rw-r--r--arch/arm/boot/dts/am335x-sl50.dts8
-rw-r--r--arch/arm/boot/dts/sunxi-h3-h5.dtsi7
-rw-r--r--arch/arm/include/asm/dmi.h19
-rw-r--r--arch/arm/kernel/setup.c2
-rw-r--r--arch/arm/kernel/smp.c3
-rw-r--r--arch/arm/mm/mmap.c4
-rw-r--r--arch/arm/mm/mmu.c8
-rw-r--r--arch/arm64/Kconfig2
-rw-r--r--arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi5
-rw-r--r--arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi2
l---------arch/arm64/boot/dts/allwinner/sunxi-h3-h5.dtsi1
-rw-r--r--arch/arm64/kernel/efi.c15
-rw-r--r--arch/arm64/kernel/smp.c3
-rw-r--r--arch/arm64/kernel/vdso.c5
-rw-r--r--arch/arm64/kernel/vdso/gettimeofday.S1
-rw-r--r--arch/arm64/net/bpf_jit_comp.c7
-rw-r--r--arch/blackfin/include/asm/processor.h5
-rw-r--r--arch/c6x/include/asm/processor.h5
-rw-r--r--arch/cris/arch-v10/kernel/process.c8
-rw-r--r--arch/cris/arch-v32/kernel/process.c8
-rw-r--r--arch/cris/include/asm/processor.h2
-rw-r--r--arch/frv/include/asm/processor.h5
-rw-r--r--arch/frv/kernel/process.c9
-rw-r--r--arch/frv/mm/elf-fdpic.c2
-rw-r--r--arch/h8300/include/asm/processor.h4
-rw-r--r--arch/h8300/kernel/process.c5
-rw-r--r--arch/hexagon/include/asm/processor.h3
-rw-r--r--arch/hexagon/kernel/process.c8
-rw-r--r--arch/ia64/include/asm/processor.h17
-rw-r--r--arch/m32r/include/asm/processor.h2
-rw-r--r--arch/m32r/kernel/process.c8
-rw-r--r--arch/m68k/include/asm/processor.h2
-rw-r--r--arch/m68k/kernel/process.c14
-rw-r--r--arch/metag/kernel/smp.c3
-rw-r--r--arch/microblaze/include/asm/processor.h6
-rw-r--r--arch/microblaze/kernel/process.c17
-rw-r--r--arch/mips/boot/Makefile10
-rw-r--r--arch/mips/include/asm/highmem.h5
-rw-r--r--arch/mips/include/asm/kprobes.h3
-rw-r--r--arch/mips/include/asm/pgtable-32.h7
-rw-r--r--arch/mips/kernel/branch.c4
-rw-r--r--arch/mips/kernel/entry.S3
-rw-r--r--arch/mips/kernel/ftrace.c24
-rw-r--r--arch/mips/kernel/head.S2
-rw-r--r--arch/mips/kernel/perf_event_mipsxx.c6
-rw-r--r--arch/mips/kernel/pm-cps.c9
-rw-r--r--arch/mips/kernel/traps.c2
-rw-r--r--arch/mips/kvm/tlb.c6
-rw-r--r--arch/mips/math-emu/dp_maddf.c5
-rw-r--r--arch/mips/math-emu/sp_maddf.c5
-rw-r--r--arch/mips/mm/dma-default.c23
-rw-r--r--arch/mips/mm/mmap.c2
-rw-r--r--arch/mips/mm/pgtable-32.c6
-rw-r--r--arch/mn10300/include/asm/processor.h5
-rw-r--r--arch/mn10300/kernel/process.c8
-rw-r--r--arch/nios2/include/asm/processor.h3
-rw-r--r--arch/openrisc/include/asm/processor.h5
-rw-r--r--arch/openrisc/kernel/process.c5
-rw-r--r--arch/parisc/include/asm/processor.h5
-rw-r--r--arch/parisc/kernel/process.c5
-rw-r--r--arch/parisc/kernel/sys_parisc.c15
-rw-r--r--arch/powerpc/Kconfig2
-rw-r--r--arch/powerpc/include/asm/bug.h2
-rw-r--r--arch/powerpc/include/asm/kprobes.h1
-rw-r--r--arch/powerpc/include/asm/processor.h6
-rw-r--r--arch/powerpc/include/asm/uaccess.h8
-rw-r--r--arch/powerpc/include/asm/xive.h12
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S11
-rw-r--r--arch/powerpc/kernel/kprobes.c17
-rw-r--r--arch/powerpc/kernel/setup_64.c31
-rw-r--r--arch/powerpc/kernel/smp.c2
-rw-r--r--arch/powerpc/kernel/trace/ftrace_64_mprofile.S59
-rw-r--r--arch/powerpc/kvm/book3s_hv.c51
-rw-r--r--arch/powerpc/kvm/book3s_hv_interrupts.S12
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S75
-rw-r--r--arch/powerpc/kvm/book3s_xive_template.c4
-rw-r--r--arch/powerpc/mm/hugetlbpage-radix.c2
-rw-r--r--arch/powerpc/mm/mmap.c4
-rw-r--r--arch/powerpc/mm/slice.c2
-rw-r--r--arch/powerpc/perf/perf_regs.c3
-rw-r--r--arch/powerpc/platforms/powernv/npu-dma.c97
-rw-r--r--arch/powerpc/sysdev/xive/common.c2
-rw-r--r--arch/s390/include/asm/processor.h5
-rw-r--r--arch/s390/kernel/ipl.c7
-rw-r--r--arch/s390/kernel/process.c25
-rw-r--r--arch/s390/kvm/gaccess.c15
-rw-r--r--arch/s390/mm/mmap.c4
-rw-r--r--arch/score/include/asm/processor.h1
-rw-r--r--arch/score/kernel/process.c5
-rw-r--r--arch/sh/mm/mmap.c4
-rw-r--r--arch/sparc/include/asm/processor_32.h3
-rw-r--r--arch/sparc/include/asm/processor_64.h2
-rw-r--r--arch/sparc/kernel/process_32.c8
-rw-r--r--arch/sparc/kernel/process_64.c19
-rw-r--r--arch/sparc/kernel/sys_sparc_64.c4
-rw-r--r--arch/sparc/mm/hugetlbpage.c2
-rw-r--r--arch/tile/include/asm/processor.h7
-rw-r--r--arch/tile/lib/atomic_asm_32.S3
-rw-r--r--arch/tile/mm/hugetlbpage.c2
-rw-r--r--arch/um/include/asm/processor-generic.h2
-rw-r--r--arch/um/kernel/um_arch.c6
-rw-r--r--arch/x86/Kconfig5
-rw-r--r--arch/x86/boot/compressed/cmdline.c2
-rw-r--r--arch/x86/boot/compressed/eboot.c73
-rw-r--r--arch/x86/boot/compressed/head_64.S86
-rw-r--r--arch/x86/boot/compressed/kaslr.c194
-rw-r--r--arch/x86/boot/compressed/misc.c6
-rw-r--r--arch/x86/boot/compressed/misc.h2
-rw-r--r--arch/x86/boot/compressed/pagetable.c18
-rw-r--r--arch/x86/boot/copy.S20
-rw-r--r--arch/x86/boot/string.c8
-rw-r--r--arch/x86/boot/string.h1
-rw-r--r--arch/x86/crypto/Makefile2
-rw-r--r--arch/x86/crypto/sha1-mb/Makefile2
-rw-r--r--arch/x86/crypto/sha256-mb/Makefile2
-rw-r--r--arch/x86/entry/entry_64.S3
-rw-r--r--arch/x86/events/core.c27
-rw-r--r--arch/x86/events/intel/core.c67
-rw-r--r--arch/x86/events/intel/lbr.c4
-rw-r--r--arch/x86/events/intel/uncore.c2
-rw-r--r--arch/x86/events/perf_event.h3
-rw-r--r--arch/x86/include/asm/atomic.h13
-rw-r--r--arch/x86/include/asm/efi.h2
-rw-r--r--arch/x86/include/asm/extable.h1
-rw-r--r--arch/x86/include/asm/hardirq.h2
-rw-r--r--arch/x86/include/asm/kvm_emulate.h1
-rw-r--r--arch/x86/include/asm/mmu.h6
-rw-r--r--arch/x86/include/asm/mmu_context.h63
-rw-r--r--arch/x86/include/asm/mshyperv.h3
-rw-r--r--arch/x86/include/asm/msr-index.h2
-rw-r--r--arch/x86/include/asm/paravirt.h10
-rw-r--r--arch/x86/include/asm/paravirt_types.h5
-rw-r--r--arch/x86/include/asm/pgtable-3level.h47
-rw-r--r--arch/x86/include/asm/pgtable.h55
-rw-r--r--arch/x86/include/asm/pgtable_64.h22
-rw-r--r--arch/x86/include/asm/processor-flags.h36
-rw-r--r--arch/x86/include/asm/processor.h10
-rw-r--r--arch/x86/include/asm/special_insns.h10
-rw-r--r--arch/x86/include/asm/timer.h8
-rw-r--r--arch/x86/include/asm/tlbbatch.h14
-rw-r--r--arch/x86/include/asm/tlbflush.h114
-rw-r--r--arch/x86/include/asm/uv/uv.h11
-rw-r--r--arch/x86/include/uapi/asm/hyperv.h15
-rw-r--r--arch/x86/include/uapi/asm/processor-flags.h2
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/acpi/Makefile2
-rw-r--r--arch/x86/kernel/apic/htirq.c2
-rw-r--r--arch/x86/kernel/apic/io_apic.c22
-rw-r--r--arch/x86/kernel/apic/msi.c2
-rw-r--r--arch/x86/kernel/apic/vector.c4
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_rdtgroup.c4
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c2
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c11
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c27
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c18
-rw-r--r--arch/x86/kernel/espfix_64.c2
-rw-r--r--arch/x86/kernel/head64.c145
-rw-r--r--arch/x86/kernel/head_64.S131
-rw-r--r--arch/x86/kernel/kprobes/opt.c9
-rw-r--r--arch/x86/kernel/ldt.c56
-rw-r--r--arch/x86/kernel/machine_kexec_64.c2
-rw-r--r--arch/x86/kernel/nmi_selftest.c2
-rw-r--r--arch/x86/kernel/paravirt.c2
-rw-r--r--arch/x86/kernel/process.c11
-rw-r--r--arch/x86/kernel/process_32.c2
-rw-r--r--arch/x86/kernel/process_64.c4
-rw-r--r--arch/x86/kernel/reboot.c2
-rw-r--r--arch/x86/kernel/setup.c2
-rw-r--r--arch/x86/kernel/smpboot.c3
-rw-r--r--arch/x86/kernel/step.c2
-rw-r--r--arch/x86/kernel/sys_x86_64.c4
-rw-r--r--arch/x86/kernel/tboot.c2
-rw-r--r--arch/x86/kernel/traps.c2
-rw-r--r--arch/x86/kernel/tsc.c206
-rw-r--r--arch/x86/kvm/emulate.c1
-rw-r--r--arch/x86/kvm/svm.c2
-rw-r--r--arch/x86/kvm/vmx.c24
-rw-r--r--arch/x86/kvm/x86.c62
-rw-r--r--arch/x86/lib/copy_user_64.S7
-rw-r--r--arch/x86/lib/msr-reg.S8
-rw-r--r--arch/x86/lib/x86-opcode-map.txt2
-rw-r--r--arch/x86/math-emu/fpu_system.h2
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/dump_pagetables.c2
-rw-r--r--arch/x86/mm/extable.c3
-rw-r--r--arch/x86/mm/fault.c10
-rw-r--r--arch/x86/mm/gup.c496
-rw-r--r--arch/x86/mm/hugetlbpage.c2
-rw-r--r--arch/x86/mm/init.c10
-rw-r--r--arch/x86/mm/init_64.c116
-rw-r--r--arch/x86/mm/ioremap.c2
-rw-r--r--arch/x86/mm/kasan_init_64.c12
-rw-r--r--arch/x86/mm/kaslr.c81
-rw-r--r--arch/x86/mm/mmap.c3
-rw-r--r--arch/x86/mm/tlb.c458
-rw-r--r--arch/x86/net/Makefile2
-rw-r--r--arch/x86/platform/efi/Makefile1
-rw-r--r--arch/x86/platform/efi/efi.c3
-rw-r--r--arch/x86/platform/efi/efi_32.c9
-rw-r--r--arch/x86/platform/efi/efi_64.c9
-rw-r--r--arch/x86/platform/efi/quirks.c137
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-pm.c2
-rw-r--r--arch/x86/platform/uv/tlb_uv.c24
-rw-r--r--arch/x86/power/Makefile2
-rw-r--r--arch/x86/power/cpu.c2
-rw-r--r--arch/x86/power/hibernate_64.c3
-rw-r--r--arch/x86/realmode/init.c2
-rw-r--r--arch/x86/xen/Makefile3
-rw-r--r--arch/x86/xen/efi.c45
-rw-r--r--arch/x86/xen/mmu_pv.c83
-rw-r--r--arch/x86/xen/xen-pvh.S2
-rw-r--r--arch/xtensa/include/asm/processor.h2
-rw-r--r--arch/xtensa/kernel/syscall.c2
-rw-r--r--block/bfq-iosched.c14
-rw-r--r--block/bfq-iosched.h3
-rw-r--r--block/bfq-wf2q.c39
-rw-r--r--block/bio-integrity.c165
-rw-r--r--block/bio.c23
-rw-r--r--block/blk-core.c5
-rw-r--r--block/blk-lib.c23
-rw-r--r--block/blk-mq-sched.c8
-rw-r--r--block/blk-mq.c8
-rw-r--r--block/blk-wbt.c4
-rw-r--r--block/blk.h11
-rw-r--r--block/kyber-iosched.c16
-rw-r--r--block/t10-pi.c9
-rw-r--r--drivers/acpi/acpica/tbutils.c34
-rw-r--r--drivers/acpi/acpica/utresrc.c9
-rw-r--r--drivers/acpi/pci_root.c2
-rw-r--r--drivers/acpi/scan.c67
-rw-r--r--drivers/base/node.c2
-rw-r--r--drivers/block/cciss.c8
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c4
-rw-r--r--drivers/block/mtip32xx/mtip32xx.h1
-rw-r--r--drivers/block/null_blk.c18
-rw-r--r--drivers/bluetooth/btmrvl_main.c2
-rw-r--r--drivers/char/ipmi/ipmi_watchdog.c2
-rw-r--r--drivers/char/random.c12
-rw-r--r--drivers/clk/meson/Kconfig1
-rw-r--r--drivers/clk/sunxi-ng/Kconfig1
-rw-r--r--drivers/clk/sunxi-ng/ccu-sun50i-a64.h4
-rw-r--r--drivers/clk/sunxi-ng/ccu-sun5i.c2
-rw-r--r--drivers/clk/sunxi-ng/ccu-sun6i-a31.c2
-rw-r--r--drivers/clk/sunxi-ng/ccu-sun8i-h3.h4
-rw-r--r--drivers/clk/sunxi-ng/ccu-sun8i-v3s.c2
-rw-r--r--drivers/clocksource/arm_arch_timer.c4
-rw-r--r--drivers/clocksource/cadence_ttc_timer.c1
-rw-r--r--drivers/clocksource/timer-sun5i.c1
-rw-r--r--drivers/cpufreq/cpufreq_conservative.c4
-rw-r--r--drivers/cpufreq/pasemi-cpufreq.c2
-rw-r--r--drivers/cpuidle/cpuidle.c1
-rw-r--r--drivers/cpuidle/dt_idle_states.c4
-rw-r--r--drivers/devfreq/event/exynos-nocp.c6
-rw-r--r--drivers/devfreq/event/exynos-ppmu.c8
-rw-r--r--drivers/edac/altera_edac.c26
-rw-r--r--drivers/edac/i5000_edac.c6
-rw-r--r--drivers/edac/i5400_edac.c4
-rw-r--r--drivers/edac/ie31200_edac.c13
-rw-r--r--drivers/edac/mce_amd.c2
-rw-r--r--drivers/edac/mv64x60_edac.c88
-rw-r--r--drivers/edac/pnd2_edac.c20
-rw-r--r--drivers/edac/sb_edac.c682
-rw-r--r--drivers/edac/thunderx_edac.c2
-rw-r--r--drivers/firmware/dmi-id.c4
-rw-r--r--drivers/firmware/dmi_scan.c49
-rw-r--r--drivers/firmware/efi/Kconfig9
-rw-r--r--drivers/firmware/efi/arm-runtime.c16
-rw-r--r--drivers/firmware/efi/capsule-loader.c117
-rw-r--r--drivers/firmware/efi/capsule.c11
-rw-r--r--drivers/firmware/efi/efi.c3
-rw-r--r--drivers/firmware/efi/test/efi_test.c11
-rw-r--r--drivers/gpio/gpio-mvebu.c6
-rw-r--r--drivers/gpio/gpiolib-acpi.c2
-rw-r--r--drivers/gpio/gpiolib.c3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c4
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c1
-rw-r--r--drivers/gpu/drm/amd/amdgpu/atombios_crtc.c4
-rw-r--r--drivers/gpu/drm/amd/amdgpu/dce_v10_0.c7
-rw-r--r--drivers/gpu/drm/amd/amdgpu/dce_v11_0.c7
-rw-r--r--drivers/gpu/drm/amd/amdgpu/dce_v6_0.c7
-rw-r--r--drivers/gpu/drm/amd/amdgpu/dce_v8_0.c7
-rw-r--r--drivers/gpu/drm/bridge/synopsys/Kconfig1
-rw-r--r--drivers/gpu/drm/drm_connector.c38
-rw-r--r--drivers/gpu/drm/etnaviv/etnaviv_gem.h3
-rw-r--r--drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c2
-rw-r--r--drivers/gpu/drm/i915/i915_debugfs.c6
-rw-r--r--drivers/gpu/drm/i915/i915_gem.c63
-rw-r--r--drivers/gpu/drm/i915/i915_gem_execbuffer.c17
-rw-r--r--drivers/gpu/drm/i915/i915_gem_request.c2
-rw-r--r--drivers/gpu/drm/i915/i915_gem_request.h2
-rw-r--r--drivers/gpu/drm/i915/i915_guc_submission.c4
-rw-r--r--drivers/gpu/drm/i915/i915_pvinfo.h8
-rw-r--r--drivers/gpu/drm/i915/i915_sw_fence.c35
-rw-r--r--drivers/gpu/drm/i915/i915_sw_fence.h2
-rw-r--r--drivers/gpu/drm/i915/i915_vgpu.c10
-rw-r--r--drivers/gpu/drm/i915/i915_vma.c5
-rw-r--r--drivers/gpu/drm/i915/intel_display.c44
-rw-r--r--drivers/gpu/drm/i915/intel_dp_aux_backlight.c2
-rw-r--r--drivers/gpu/drm/i915/intel_lrc.c6
-rw-r--r--drivers/gpu/drm/i915/intel_pm.c36
-rw-r--r--drivers/gpu/drm/i915/intel_ringbuffer.c41
-rw-r--r--drivers/gpu/drm/i915/intel_ringbuffer.h19
-rw-r--r--drivers/gpu/drm/mgag200/mgag200_mode.c9
-rw-r--r--drivers/gpu/drm/mxsfb/mxsfb_crtc.c42
-rw-r--r--drivers/gpu/drm/radeon/cik.c7
-rw-r--r--drivers/gpu/drm/radeon/evergreen.c7
-rw-r--r--drivers/gpu/drm/radeon/radeon.h2
-rw-r--r--drivers/gpu/drm/radeon/radeon_combios.c7
-rw-r--r--drivers/gpu/drm/radeon/radeon_device.c4
-rw-r--r--drivers/gpu/drm/radeon/radeon_fence.c2
-rw-r--r--drivers/gpu/drm/radeon/radeon_uvd.c2
-rw-r--r--drivers/gpu/drm/radeon/si.c7
-rw-r--r--drivers/gpu/drm/tegra/drm.c22
-rw-r--r--drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c1
-rw-r--r--drivers/gpu/host1x/dev.c2
-rw-r--r--drivers/gpu/vga/vgaarb.c2
-rw-r--r--drivers/hid/hid-core.c282
-rw-r--r--drivers/hid/hid-ids.h3
-rw-r--r--drivers/hid/hid-magicmouse.c15
-rw-r--r--drivers/hid/usbhid/hid-quirks.c1
-rw-r--r--drivers/hsi/clients/ssi_protocol.c2
-rw-r--r--drivers/i2c/busses/i2c-imx.c8
-rw-r--r--drivers/i2c/busses/i2c-ismt.c2
-rw-r--r--drivers/i2c/busses/i2c-rcar.c2
-rw-r--r--drivers/iio/adc/meson_saradc.c4
-rw-r--r--drivers/iio/adc/mxs-lradc-adc.c7
-rw-r--r--drivers/iio/buffer/industrialio-buffer-dma.c1
-rw-r--r--drivers/iio/buffer/industrialio-buffer-dmaengine.c1
-rw-r--r--drivers/iio/imu/inv_mpu6050/inv_mpu_core.c39
-rw-r--r--drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h3
-rw-r--r--drivers/infiniband/core/addr.c10
-rw-r--r--drivers/infiniband/hw/bnxt_re/bnxt_re.h4
-rw-r--r--drivers/infiniband/hw/bnxt_re/ib_verbs.c471
-rw-r--r--drivers/infiniband/hw/bnxt_re/ib_verbs.h22
-rw-r--r--drivers/infiniband/hw/bnxt_re/main.c4
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_fp.c384
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_fp.h18
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_rcfw.c314
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_rcfw.h61
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_res.h4
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_sp.c333
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_sp.h2
-rw-r--r--drivers/infiniband/hw/cxgb4/device.c10
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_main.c2
-rw-r--r--drivers/infiniband/hw/mlx5/main.c6
-rw-r--r--drivers/infiniband/hw/qedr/qedr.h5
-rw-r--r--drivers/infiniband/hw/qedr/verbs.c68
-rw-r--r--drivers/infiniband/sw/rxe/rxe.h5
-rw-r--r--drivers/infiniband/sw/rxe/rxe_verbs.c9
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_ib.c1
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_main.c15
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_vlan.c11
-rw-r--r--drivers/input/misc/soc_button_array.c20
-rw-r--r--drivers/input/rmi4/rmi_f54.c17
-rw-r--r--drivers/input/serio/i8042-x86ia64io.h7
-rw-r--r--drivers/iommu/amd_iommu.c6
-rw-r--r--drivers/iommu/intel-iommu.c4
-rw-r--r--drivers/iommu/of_iommu.c2
-rw-r--r--drivers/irqchip/irq-mips-gic.c6
-rw-r--r--drivers/leds/leds-bcm6328.c4
-rw-r--r--drivers/leds/trigger/ledtrig-heartbeat.c31
-rw-r--r--drivers/lightnvm/pblk-core.c61
-rw-r--r--drivers/lightnvm/pblk-recovery.c31
-rw-r--r--drivers/lightnvm/pblk-write.c26
-rw-r--r--drivers/lightnvm/pblk.h2
-rw-r--r--drivers/md/bcache/btree.h2
-rw-r--r--drivers/md/dm-integrity.c12
-rw-r--r--drivers/md/dm-io.c4
-rw-r--r--drivers/md/dm-raid.c17
-rw-r--r--drivers/md/dm-raid1.c21
-rw-r--r--drivers/md/dm-thin.c26
-rw-r--r--drivers/md/dm.c2
-rw-r--r--drivers/media/cec/Kconfig1
-rw-r--r--drivers/media/cec/cec-api.c8
-rw-r--r--drivers/media/i2c/tc358743.c2
-rw-r--r--drivers/media/rc/sir_ir.c6
-rw-r--r--drivers/media/usb/rainshadow-cec/rainshadow-cec.c1
-rw-r--r--drivers/media/v4l2-core/videobuf2-core.c2
-rw-r--r--drivers/mfd/arizona-core.c3
-rw-r--r--drivers/misc/cxl/context.c6
-rw-r--r--drivers/misc/cxl/cxl.h18
-rw-r--r--drivers/misc/cxl/fault.c23
-rw-r--r--drivers/misc/cxl/main.c17
-rw-r--r--drivers/misc/cxl/native.c29
-rw-r--r--drivers/misc/cxl/pci.c11
-rw-r--r--drivers/mmc/host/meson-gx-mmc.c9
-rw-r--r--drivers/mmc/host/sdhci-pci-core.c2
-rw-r--r--drivers/net/arcnet/arcnet.c7
-rw-r--r--drivers/net/arcnet/capmode.c2
-rw-r--r--drivers/net/arcnet/com20020-pci.c6
-rw-r--r--drivers/net/arcnet/com20020.c2
-rw-r--r--drivers/net/bonding/bond_3ad.c27
-rw-r--r--drivers/net/bonding/bond_main.c6
-rw-r--r--drivers/net/caif/caif_hsi.c2
-rw-r--r--drivers/net/caif/caif_serial.c2
-rw-r--r--drivers/net/caif/caif_spi.c2
-rw-r--r--drivers/net/caif/caif_virtio.c2
-rw-r--r--drivers/net/can/dev.c3
-rw-r--r--drivers/net/can/peak_canfd/peak_canfd.c2
-rw-r--r--drivers/net/can/slcan.c7
-rw-r--r--drivers/net/can/usb/gs_usb.c2
-rw-r--r--drivers/net/can/usb/peak_usb/pcan_usb_core.c4
-rw-r--r--drivers/net/can/vcan.c4
-rw-r--r--drivers/net/can/vxcan.c4
-rw-r--r--drivers/net/dummy.c4
-rw-r--r--drivers/net/ethernet/amazon/ena/ena_com.c35
-rw-r--r--drivers/net/ethernet/amazon/ena/ena_ethtool.c2
-rw-r--r--drivers/net/ethernet/amazon/ena/ena_netdev.c179
-rw-r--r--drivers/net/ethernet/amazon/ena/ena_netdev.h18
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h3
-rw-r--r--drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c19
-rw-r--r--drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c2
-rw-r--r--drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c15
-rw-r--r--drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h1
-rw-r--r--drivers/net/ethernet/broadcom/bnxt/bnxt.c61
-rw-r--r--drivers/net/ethernet/broadcom/bnxt/bnxt.h6
-rw-r--r--drivers/net/ethernet/cavium/liquidio/octeon_main.h4
-rw-r--r--drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c12
-rw-r--r--drivers/net/ethernet/freescale/dpaa/dpaa_eth.c2
-rw-r--r--drivers/net/ethernet/freescale/fman/Kconfig1
-rw-r--r--drivers/net/ethernet/freescale/fman/mac.c2
-rw-r--r--drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c2
-rw-r--r--drivers/net/ethernet/hisilicon/hns/hns_ethtool.c16
-rw-r--r--drivers/net/ethernet/ibm/emac/core.c67
-rw-r--r--drivers/net/ethernet/ibm/ibmvnic.c6
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e.h1
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_ethtool.c4
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_main.c7
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_txrx.c4
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c2
-rw-r--r--drivers/net/ethernet/marvell/mvpp2.c76
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en.h8
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c8
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_main.c3
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_rep.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c45
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_stats.h11
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_tc.c1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c77
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/fs_core.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/health.c11
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/main.c20
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c3
-rw-r--r--drivers/net/ethernet/qlogic/qed/qed_debug.c2
-rw-r--r--drivers/net/ethernet/rocker/rocker_ofdpa.c2
-rw-r--r--drivers/net/ethernet/sfc/ef10.c15
-rw-r--r--drivers/net/ethernet/sfc/ef10_sriov.c2
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c11
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac_main.c45
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h3
-rw-r--r--drivers/net/ethernet/ti/cpsw-common.c2
-rw-r--r--drivers/net/geneve.c2
-rw-r--r--drivers/net/gtp.c2
-rw-r--r--drivers/net/hamradio/6pack.c2
-rw-r--r--drivers/net/hamradio/bpqether.c2
-rw-r--r--drivers/net/hyperv/hyperv_net.h5
-rw-r--r--drivers/net/hyperv/netvsc_drv.c58
-rw-r--r--drivers/net/hyperv/rndis_filter.c30
-rw-r--r--drivers/net/ifb.c4
-rw-r--r--drivers/net/ipvlan/ipvlan_main.c2
-rw-r--r--drivers/net/loopback.c4
-rw-r--r--drivers/net/macsec.c4
-rw-r--r--drivers/net/macvlan.c87
-rw-r--r--drivers/net/netconsole.c2
-rw-r--r--drivers/net/nlmon.c2
-rw-r--r--drivers/net/phy/Kconfig1
-rw-r--r--drivers/net/phy/dp83640.c2
-rw-r--r--drivers/net/phy/micrel.c2
-rw-r--r--drivers/net/phy/phy.c2
-rw-r--r--drivers/net/slip/slip.c7
-rw-r--r--drivers/net/team/team.c4
-rw-r--r--drivers/net/tun.c4
-rw-r--r--drivers/net/usb/ax88179_178a.c16
-rw-r--r--drivers/net/usb/cdc-phonet.c2
-rw-r--r--drivers/net/usb/qmi_wwan.c6
-rw-r--r--drivers/net/usb/r8152.c2
-rw-r--r--drivers/net/veth.c8
-rw-r--r--drivers/net/virtio_net.c1
-rw-r--r--drivers/net/vrf.c38
-rw-r--r--drivers/net/vsockmon.c2
-rw-r--r--drivers/net/vxlan.c2
-rw-r--r--drivers/net/wan/dlci.c2
-rw-r--r--drivers/net/wan/hdlc_fr.c2
-rw-r--r--drivers/net/wan/lapbether.c2
-rw-r--r--drivers/net/wireless/ath/ath6kl/main.c2
-rw-r--r--drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c1
-rw-r--r--drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c3
-rw-r--r--drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.c35
-rw-r--r--drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.h4
-rw-r--r--drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwsignal.c2
-rw-r--r--drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c17
-rw-r--r--drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c18
-rw-r--r--drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c9
-rw-r--r--drivers/net/wireless/cisco/airo.c2
-rw-r--r--drivers/net/wireless/intersil/hostap/hostap_ioctl.c2
-rw-r--r--drivers/net/wireless/intersil/hostap/hostap_main.c2
-rw-r--r--drivers/net/wireless/mac80211_hwsim.c2
-rw-r--r--drivers/net/wireless/marvell/libertas/main.c2
-rw-r--r--drivers/net/wireless/marvell/mwifiex/main.c2
-rw-r--r--drivers/net/xen-netback/common.h1
-rw-r--r--drivers/net/xen-netback/interface.c6
-rw-r--r--drivers/net/xen-netback/netback.c6
-rw-r--r--drivers/ntb/hw/intel/ntb_hw_intel.c2
-rw-r--r--drivers/ntb/ntb_transport.c58
-rw-r--r--drivers/ntb/test/ntb_perf.c4
-rw-r--r--drivers/nvdimm/blk.c16
-rw-r--r--drivers/nvdimm/btt.c16
-rw-r--r--drivers/nvme/host/pci.c3
-rw-r--r--drivers/pci/access.c12
-rw-r--r--drivers/pci/endpoint/functions/Kconfig1
-rw-r--r--drivers/pinctrl/pinctrl-amd.c91
-rw-r--r--drivers/pinctrl/pinctrl-rockchip.c44
-rw-r--r--drivers/pinctrl/stm32/pinctrl-stm32.c2
-rw-r--r--drivers/platform/x86/intel_telemetry_debugfs.c16
-rw-r--r--drivers/rtc/rtc-imxdi.c2
-rw-r--r--drivers/s390/net/netiucv.c4
-rw-r--r--drivers/scsi/dpt/dpti_i2o.h2
-rw-r--r--drivers/scsi/ips.c12
-rw-r--r--drivers/scsi/ips.h4
-rw-r--r--drivers/scsi/lpfc/lpfc_scsi.c5
-rw-r--r--drivers/scsi/qedi/qedi_fw.c1
-rw-r--r--drivers/scsi/qedi/qedi_main.c4
-rw-r--r--drivers/scsi/qla2xxx/qla_isr.c8
-rw-r--r--drivers/staging/iio/cdc/ad7152.c6
-rw-r--r--drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c6
-rw-r--r--drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c4
-rw-r--r--drivers/staging/lustre/lnet/libcfs/debug.c2
-rw-r--r--drivers/staging/lustre/lnet/libcfs/tracefile.c2
-rw-r--r--drivers/staging/lustre/lnet/lnet/lib-eq.c2
-rw-r--r--drivers/staging/lustre/lnet/lnet/lib-socket.c2
-rw-r--r--drivers/staging/lustre/lustre/fid/fid_request.c6
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_lib.h4
-rw-r--r--drivers/staging/lustre/lustre/llite/lcommon_cl.c2
-rw-r--r--drivers/staging/lustre/lustre/lov/lov_cl_internal.h2
-rw-r--r--drivers/staging/lustre/lustre/lov/lov_object.c2
-rw-r--r--drivers/staging/lustre/lustre/obdclass/lu_object.c6
-rw-r--r--drivers/staging/rtl8188eu/os_dep/mon.c2
-rw-r--r--drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c3
-rw-r--r--drivers/staging/rtl8723bs/os_dep/os_intfs.c2
-rw-r--r--drivers/staging/rtl8723bs/os_dep/osdep_service.c2
-rw-r--r--drivers/target/iscsi/iscsi_target.c22
-rw-r--r--drivers/target/target_core_internal.h2
-rw-r--r--drivers/target/target_core_sbc.c2
-rw-r--r--drivers/target/target_core_tmr.c16
-rw-r--r--drivers/target/target_core_transport.c9
-rw-r--r--drivers/tty/synclink_gt.c2
-rw-r--r--drivers/usb/gadget/composite.c11
-rw-r--r--drivers/usb/gadget/function/f_phonet.c2
-rw-r--r--drivers/usb/gadget/legacy/inode.c9
-rw-r--r--drivers/usb/gadget/udc/dummy_hcd.c13
-rw-r--r--drivers/usb/gadget/udc/net2280.c9
-rw-r--r--drivers/usb/host/xhci-mem.c7
-rw-r--r--drivers/usb/host/xhci-pci.c3
-rw-r--r--drivers/vfio/virqfd.c2
-rw-r--r--drivers/vhost/vhost.c2
-rw-r--r--drivers/vhost/vhost.h2
-rw-r--r--drivers/video/fbdev/core/fbmon.c2
-rw-r--r--drivers/video/fbdev/smscufx.c5
-rw-r--r--drivers/video/fbdev/udlfb.c9
-rw-r--r--drivers/video/fbdev/via/viafbdev.c8
-rw-r--r--drivers/virtio/virtio_balloon.c7
-rw-r--r--drivers/xen/manage.c1
-rw-r--r--fs/autofs4/autofs_i.h2
-rw-r--r--fs/autofs4/dev-ioctl.c2
-rw-r--r--fs/autofs4/waitq.c18
-rw-r--r--fs/block_dev.c5
-rw-r--r--fs/btrfs/hash.c5
-rw-r--r--fs/cachefiles/internal.h4
-rw-r--r--fs/cachefiles/namei.c2
-rw-r--r--fs/cachefiles/rdwr.c4
-rw-r--r--fs/ceph/acl.c1
-rw-r--r--fs/ceph/export.c4
-rw-r--r--fs/ceph/inode.c5
-rw-r--r--fs/ceph/mds_client.c4
-rw-r--r--fs/cifs/file.c2
-rw-r--r--fs/cifs/inode.c1
-rw-r--r--fs/cifs/misc.c2
-rw-r--r--fs/cifs/smb1ops.c9
-rw-r--r--fs/cifs/smb2ops.c8
-rw-r--r--fs/cifs/xattr.c2
-rw-r--r--fs/configfs/item.c8
-rw-r--r--fs/configfs/symlink.c3
-rw-r--r--fs/dax.c5
-rw-r--r--fs/dcache.c10
-rw-r--r--fs/eventfd.c2
-rw-r--r--fs/eventpoll.c12
-rw-r--r--fs/exec.c28
-rw-r--r--fs/f2fs/f2fs.h5
-rw-r--r--fs/fs_pin.c4
-rw-r--r--fs/hugetlbfs/inode.c2
-rw-r--r--fs/inode.c8
-rw-r--r--fs/jbd2/journal.c4
-rw-r--r--fs/namespace.c2
-rw-r--r--fs/nfs/callback_xdr.c1
-rw-r--r--fs/nfs/dir.c51
-rw-r--r--fs/nfs/internal.h1
-rw-r--r--fs/nfs/nfs4proc.c9
-rw-r--r--fs/nfs/nfs4state.c2
-rw-r--r--fs/nilfs2/segment.c5
-rw-r--r--fs/ocfs2/dlmglue.c4
-rw-r--r--fs/ocfs2/xattr.c23
-rw-r--r--fs/orangefs/orangefs-bufmap.c12
-rw-r--r--fs/overlayfs/copy_up.c33
-rw-r--r--fs/proc/task_mmu.c4
-rw-r--r--fs/read_write.c2
-rw-r--r--fs/reiserfs/journal.c2
-rw-r--r--fs/select.c4
-rw-r--r--fs/signalfd.c2
-rw-r--r--fs/ufs/balloc.c44
-rw-r--r--fs/ufs/inode.c74
-rw-r--r--fs/ufs/super.c73
-rw-r--r--fs/ufs/ufs_fs.h9
-rw-r--r--fs/ufs/util.c17
-rw-r--r--fs/ufs/util.h9
-rw-r--r--fs/userfaultfd.c59
-rw-r--r--fs/xfs/xfs_aops.c7
-rw-r--r--fs/xfs/xfs_buf.c2
-rw-r--r--fs/xfs/xfs_icache.c9
-rw-r--r--fs/xfs/xfs_inode.c8
-rw-r--r--include/acpi/acpi_bus.h3
-rw-r--r--include/acpi/actbl.h14
-rw-r--r--include/dt-bindings/clock/sun50i-a64-ccu.h2
-rw-r--r--include/dt-bindings/clock/sun8i-h3-ccu.h2
-rw-r--r--include/linux/bcm47xx_nvram.h1
-rw-r--r--include/linux/bio.h48
-rw-r--r--include/linux/blk-mq.h2
-rw-r--r--include/linux/bvec.h41
-rw-r--r--include/linux/clocksource.h1
-rw-r--r--include/linux/compiler.h4
-rw-r--r--include/linux/configfs.h3
-rw-r--r--include/linux/cpumask.h28
-rw-r--r--include/linux/dmi.h2
-rw-r--r--include/linux/efi.h14
-rw-r--r--include/linux/eventfd.h4
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/hashtable.h1
-rw-r--r--include/linux/kernel.h6
-rw-r--r--include/linux/kvm_irqfd.h2
-rw-r--r--include/linux/llist.h19
-rw-r--r--include/linux/mm.h53
-rw-r--r--include/linux/mm_types_task.h15
-rw-r--r--include/linux/moduleparam.h2
-rw-r--r--include/linux/netdevice.h15
-rw-r--r--include/linux/pagemap.h2
-rw-r--r--include/linux/poll.h2
-rw-r--r--include/linux/rcu_node_tree.h4
-rw-r--r--include/linux/rcu_segcblist.h4
-rw-r--r--include/linux/rcupdate.h318
-rw-r--r--include/linux/rcutiny.h167
-rw-r--r--include/linux/rcutree.h21
-rw-r--r--include/linux/refcount.h38
-rw-r--r--include/linux/rtmutex.h25
-rw-r--r--include/linux/sched.h22
-rw-r--r--include/linux/sched/clock.h11
-rw-r--r--include/linux/sched/nohz.h8
-rw-r--r--include/linux/sched/task.h2
-rw-r--r--include/linux/slub_def.h1
-rw-r--r--include/linux/spinlock.h20
-rw-r--r--include/linux/srcu.h25
-rw-r--r--include/linux/srcuclassic.h115
-rw-r--r--include/linux/srcutiny.h47
-rw-r--r--include/linux/srcutree.h13
-rw-r--r--include/linux/sunrpc/sched.h2
-rw-r--r--include/linux/t10-pi.h2
-rw-r--r--include/linux/timekeeper_internal.h5
-rw-r--r--include/linux/uuid.h4
-rw-r--r--include/linux/vfio.h2
-rw-r--r--include/linux/vm_event_item.h2
-rw-r--r--include/linux/wait.h1000
-rw-r--r--include/linux/wait_bit.h261
-rw-r--r--include/media/cec-notifier.h10
-rw-r--r--include/media/cec.h2
-rw-r--r--include/net/af_unix.h2
-rw-r--r--include/net/wext.h4
-rw-r--r--include/net/xfrm.h7
-rw-r--r--include/trace/events/rcu.h1
-rw-r--r--include/uapi/linux/a.out.h26
-rw-r--r--include/uapi/linux/auto_fs.h4
-rw-r--r--include/uapi/linux/auto_fs4.h4
-rw-r--r--include/uapi/linux/ethtool.h6
-rw-r--r--include/uapi/linux/openvswitch.h1
-rw-r--r--include/uapi/linux/sched.h1
-rw-r--r--init/Kconfig350
-rw-r--r--init/main.c27
-rw-r--r--kernel/async.c8
-rw-r--r--kernel/bpf/verifier.c5
-rw-r--r--kernel/events/core.c30
-rw-r--r--kernel/events/ring_buffer.c2
-rw-r--r--kernel/exit.c17
-rw-r--r--kernel/extable.c2
-rw-r--r--kernel/futex.c2
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/kexec_core.c4
-rw-r--r--kernel/livepatch/patch.c8
-rw-r--r--kernel/livepatch/transition.c36
-rw-r--r--kernel/locking/lockdep.c176
-rw-r--r--kernel/locking/rtmutex-debug.c6
-rw-r--r--kernel/locking/rtmutex-debug.h2
-rw-r--r--kernel/locking/rtmutex.c37
-rw-r--r--kernel/locking/rtmutex.h2
-rw-r--r--kernel/printk/printk.c2
-rw-r--r--kernel/rcu/Kconfig242
-rw-r--r--kernel/rcu/Kconfig.debug82
-rw-r--r--kernel/rcu/Makefile2
-rw-r--r--kernel/rcu/rcu.h277
-rw-r--r--kernel/rcu/rcuperf.c129
-rw-r--r--kernel/rcu/rcutorture.c21
-rw-r--r--kernel/rcu/srcu.c661
-rw-r--r--kernel/rcu/srcutiny.c86
-rw-r--r--kernel/rcu/srcutree.c187
-rw-r--r--kernel/rcu/tiny.c54
-rw-r--r--kernel/rcu/tiny_plugin.h123
-rw-r--r--kernel/rcu/tree.c195
-rw-r--r--kernel/rcu/tree.h109
-rw-r--r--kernel/rcu/tree_exp.h2
-rw-r--r--kernel/rcu/tree_plugin.h573
-rw-r--r--kernel/rcu/tree_trace.c494
-rw-r--r--kernel/rcu/update.c77
-rw-r--r--kernel/sched/Makefile6
-rw-r--r--kernel/sched/clock.c128
-rw-r--r--kernel/sched/completion.c2
-rw-r--r--kernel/sched/core.c782
-rw-r--r--kernel/sched/cpufreq_schedutil.c3
-rw-r--r--kernel/sched/cputime.c16
-rw-r--r--kernel/sched/deadline.c894
-rw-r--r--kernel/sched/debug.c17
-rw-r--r--kernel/sched/fair.c453
-rw-r--r--kernel/sched/features.h2
-rw-r--r--kernel/sched/idle.c1
-rw-r--r--kernel/sched/loadavg.c51
-rw-r--r--kernel/sched/rt.c323
-rw-r--r--kernel/sched/sched.h113
-rw-r--r--kernel/sched/topology.c430
-rw-r--r--kernel/sched/wait.c441
-rw-r--r--kernel/sched/wait_bit.c286
-rw-r--r--kernel/signal.c20
-rw-r--r--kernel/smp.c16
-rw-r--r--kernel/time/Kconfig50
-rw-r--r--kernel/time/alarmtimer.c14
-rw-r--r--kernel/time/clocksource.c3
-rw-r--r--kernel/time/tick-broadcast.c4
-rw-r--r--kernel/time/tick-internal.h2
-rw-r--r--kernel/time/tick-sched.c74
-rw-r--r--kernel/time/tick-sched.h2
-rw-r--r--kernel/time/timekeeping.c71
-rw-r--r--kernel/trace/ftrace.c3
-rw-r--r--kernel/trace/trace.c3
-rw-r--r--kernel/trace/trace_functions.c12
-rw-r--r--kernel/trace/trace_kprobe.c14
-rw-r--r--kernel/trace/trace_stack.c6
-rw-r--r--kernel/workqueue.c4
-rw-r--r--lib/Kconfig.debug187
-rw-r--r--lib/Makefile3
-rw-r--r--lib/cmdline.c6
-rw-r--r--lib/cpumask.c32
-rw-r--r--lib/libcrc32c.c6
-rw-r--r--lib/locking-selftest-rtmutex.h11
-rw-r--r--lib/locking-selftest.c133
-rw-r--r--lib/refcount.c3
-rw-r--r--lib/smp_processor_id.c2
-rw-r--r--mm/Kconfig2
-rw-r--r--mm/filemap.c12
-rw-r--r--mm/gup.c15
-rw-r--r--mm/huge_memory.c8
-rw-r--r--mm/khugepaged.c1
-rw-r--r--mm/memcontrol.c10
-rw-r--r--mm/memory-failure.c5
-rw-r--r--mm/memory.c38
-rw-r--r--mm/mempool.c2
-rw-r--r--mm/mmap.c160
-rw-r--r--mm/rmap.c16
-rw-r--r--mm/shmem.c6
-rw-r--r--mm/slub.c40
-rw-r--r--mm/swap_cgroup.c3
-rw-r--r--mm/vmalloc.c15
-rw-r--r--mm/vmpressure.c6
-rw-r--r--mm/vmscan.c2
-rw-r--r--net/8021q/vlan.c3
-rw-r--r--net/8021q/vlan_dev.c4
-rw-r--r--net/9p/trans_fd.c4
-rw-r--r--net/batman-adv/distributed-arp-table.c5
-rw-r--r--net/batman-adv/routing.c2
-rw-r--r--net/batman-adv/soft-interface.c5
-rw-r--r--net/bluetooth/6lowpan.c2
-rw-r--r--net/bluetooth/bnep/core.c2
-rw-r--r--net/bluetooth/cmtp/core.c2
-rw-r--r--net/bluetooth/hidp/core.c2
-rw-r--r--net/bridge/br_device.c2
-rw-r--r--net/caif/caif_socket.c4
-rw-r--r--net/caif/cfpkt_skbuff.c6
-rw-r--r--net/caif/chnl_net.c4
-rw-r--r--net/can/af_can.c3
-rw-r--r--net/core/datagram.c2
-rw-r--r--net/core/dev.c74
-rw-r--r--net/core/dev_ioctl.c19
-rw-r--r--net/core/dst.c14
-rw-r--r--net/core/fib_rules.c21
-rw-r--r--net/core/rtnetlink.c5
-rw-r--r--net/decnet/dn_route.c14
-rw-r--r--net/decnet/netfilter/dn_rtmsg.c4
-rw-r--r--net/hsr/hsr_device.c4
-rw-r--r--net/hsr/hsr_forward.c3
-rw-r--r--net/hsr/hsr_framereg.c9
-rw-r--r--net/hsr/hsr_framereg.h2
-rw-r--r--net/ieee802154/6lowpan/core.c2
-rw-r--r--net/ipv4/icmp.c8
-rw-r--r--net/ipv4/igmp.c22
-rw-r--r--net/ipv4/ip_output.c3
-rw-r--r--net/ipv4/ip_tunnel.c6
-rw-r--r--net/ipv4/ipmr.c34
-rw-r--r--net/ipv4/tcp.c2
-rw-r--r--net/ipv6/addrconf.c11
-rw-r--r--net/ipv6/datagram.c8
-rw-r--r--net/ipv6/esp6_offload.c25
-rw-r--r--net/ipv6/fib6_rules.c22
-rw-r--r--net/ipv6/icmp.c2
-rw-r--r--net/ipv6/ila/ila_xlat.c1
-rw-r--r--net/ipv6/ip6_fib.c3
-rw-r--r--net/ipv6/ip6_gre.c9
-rw-r--r--net/ipv6/ip6_output.c2
-rw-r--r--net/ipv6/ip6_tunnel.c14
-rw-r--r--net/ipv6/ip6_vti.c8
-rw-r--r--net/ipv6/ip6mr.c2
-rw-r--r--net/ipv6/proc.c2
-rw-r--r--net/ipv6/route.c7
-rw-r--r--net/ipv6/sit.c8
-rw-r--r--net/ipv6/udp.c3
-rw-r--r--net/ipv6/xfrm6_input.c2
-rw-r--r--net/irda/irlan/irlan_eth.c2
-rw-r--r--net/key/af_key.c19
-rw-r--r--net/l2tp/l2tp_eth.c15
-rw-r--r--net/mac80211/cfg.c2
-rw-r--r--net/mac80211/ieee80211_i.h2
-rw-r--r--net/mac80211/iface.c7
-rw-r--r--net/mac80211/mlme.c62
-rw-r--r--net/mac80211/rx.c6
-rw-r--r--net/mac80211/wpa.c9
-rw-r--r--net/mac802154/iface.c7
-rw-r--r--net/openvswitch/vport-internal_dev.c4
-rw-r--r--net/phonet/pep-gprs.c2
-rw-r--r--net/rxrpc/key.c64
-rw-r--r--net/sched/act_pedit.c4
-rw-r--r--net/sched/act_police.c8
-rw-r--r--net/sched/sch_api.c3
-rw-r--r--net/sctp/endpointola.c1
-rw-r--r--net/sctp/sctp_diag.c5
-rw-r--r--net/sctp/socket.c9
-rw-r--r--net/tipc/msg.c2
-rw-r--r--net/unix/af_unix.c11
-rw-r--r--net/wireless/wext-core.c22
-rw-r--r--net/xfrm/Makefile3
-rw-r--r--net/xfrm/xfrm_device.c2
-rw-r--r--net/xfrm/xfrm_policy.c4
-rw-r--r--net/xfrm/xfrm_user.c1
-rw-r--r--scripts/Makefile.headersinst10
-rwxr-xr-xscripts/checkpatch.pl17
-rw-r--r--scripts/genksyms/genksyms.h2
-rw-r--r--scripts/kconfig/Makefile2
-rw-r--r--scripts/kconfig/nconf.c12
-rw-r--r--scripts/kconfig/nconf.gui.c4
-rwxr-xr-xscripts/tags.sh1
-rw-r--r--security/keys/internal.h1
-rw-r--r--security/selinux/hooks.c5
-rw-r--r--sound/core/control.c2
-rw-r--r--sound/core/hwdep.c2
-rw-r--r--sound/core/init.c2
-rw-r--r--sound/core/oss/pcm_oss.c4
-rw-r--r--sound/core/pcm_lib.c6
-rw-r--r--sound/core/pcm_native.c4
-rw-r--r--sound/core/rawmidi.c8
-rw-r--r--sound/core/seq/seq_fifo.c2
-rw-r--r--sound/core/seq/seq_memory.c2
-rw-r--r--sound/core/timer.c2
-rw-r--r--sound/firewire/amdtp-stream.c8
-rw-r--r--sound/firewire/amdtp-stream.h2
-rw-r--r--sound/isa/wavefront/wavefront_synth.c2
-rw-r--r--sound/pci/hda/hda_codec.h2
-rw-r--r--sound/pci/hda/hda_controller.c8
-rw-r--r--sound/pci/hda/hda_generic.c1
-rw-r--r--sound/pci/hda/hda_intel.c11
-rw-r--r--sound/pci/mixart/mixart_core.c4
-rw-r--r--sound/pci/ymfpci/ymfpci_main.c2
-rw-r--r--tools/Makefile8
-rw-r--r--tools/include/asm/sections.h4
-rw-r--r--tools/include/linux/bitops.h10
-rw-r--r--tools/include/linux/compiler-gcc.h10
-rw-r--r--tools/include/linux/compiler.h16
-rw-r--r--tools/include/linux/debug_locks.h (renamed from tools/lib/lockdep/uinclude/linux/debug_locks.h)3
-rw-r--r--tools/include/linux/delay.h4
-rw-r--r--tools/include/linux/err.h5
-rw-r--r--tools/include/linux/ftrace.h4
-rw-r--r--tools/include/linux/gfp.h4
-rw-r--r--tools/include/linux/hardirq.h (renamed from tools/lib/lockdep/uinclude/linux/hardirq.h)0
-rw-r--r--tools/include/linux/interrupt.h4
-rw-r--r--tools/include/linux/irqflags.h (renamed from tools/lib/lockdep/uinclude/linux/irqflags.h)8
-rw-r--r--tools/include/linux/jhash.h175
-rw-r--r--tools/include/linux/kallsyms.h (renamed from tools/lib/lockdep/uinclude/linux/kallsyms.h)3
-rw-r--r--tools/include/linux/kern_levels.h (renamed from tools/lib/lockdep/uinclude/linux/kern_levels.h)0
-rw-r--r--tools/include/linux/kernel.h39
-rw-r--r--tools/include/linux/kmemcheck.h (renamed from tools/lib/lockdep/uinclude/linux/kmemcheck.h)0
-rw-r--r--tools/include/linux/linkage.h4
-rw-r--r--tools/include/linux/lockdep.h (renamed from tools/lib/lockdep/uinclude/linux/lockdep.h)34
-rw-r--r--tools/include/linux/module.h (renamed from tools/lib/lockdep/uinclude/linux/module.h)5
-rw-r--r--tools/include/linux/mutex.h4
-rw-r--r--tools/include/linux/proc_fs.h4
-rw-r--r--tools/include/linux/rcu.h (renamed from tools/lib/lockdep/uinclude/linux/rcu.h)3
-rw-r--r--tools/include/linux/sched/clock.h4
-rw-r--r--tools/include/linux/sched/mm.h4
-rw-r--r--tools/include/linux/sched/task.h4
-rw-r--r--tools/include/linux/seq_file.h4
-rw-r--r--tools/include/linux/spinlock.h26
-rw-r--r--tools/include/linux/stacktrace.h (renamed from tools/lib/lockdep/uinclude/linux/stacktrace.h)0
-rw-r--r--tools/include/linux/unaligned/packed_struct.h46
-rw-r--r--tools/include/trace/events/lock.h4
-rw-r--r--tools/lib/api/fs/fs.c30
-rw-r--r--tools/lib/api/fs/fs.h4
-rw-r--r--tools/lib/lockdep/Makefile9
-rw-r--r--tools/lib/lockdep/lockdep.c19
-rw-r--r--tools/lib/lockdep/preload.c5
-rw-r--r--tools/lib/lockdep/rbtree.c2
-rwxr-xr-xtools/lib/lockdep/run_tests.sh8
-rw-r--r--tools/lib/lockdep/uinclude/asm/hash.h6
-rw-r--r--tools/lib/lockdep/uinclude/asm/hweight.h3
-rw-r--r--tools/lib/lockdep/uinclude/asm/sections.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/bitops.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/compiler.h10
-rw-r--r--tools/lib/lockdep/uinclude/linux/delay.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/ftrace.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/gfp.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/hash.h1
-rw-r--r--tools/lib/lockdep/uinclude/linux/interrupt.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/kernel.h47
-rw-r--r--tools/lib/lockdep/uinclude/linux/linkage.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/list.h1
-rw-r--r--tools/lib/lockdep/uinclude/linux/mutex.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/poison.h1
-rw-r--r--tools/lib/lockdep/uinclude/linux/prefetch.h6
-rw-r--r--tools/lib/lockdep/uinclude/linux/proc_fs.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/rbtree_augmented.h2
-rw-r--r--tools/lib/lockdep/uinclude/linux/seq_file.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/spinlock.h25
-rw-r--r--tools/lib/lockdep/uinclude/linux/stringify.h7
-rw-r--r--tools/lib/lockdep/uinclude/trace/events/lock.h3
-rw-r--r--tools/objtool/Build1
-rw-r--r--tools/objtool/Documentation/stack-validation.txt153
-rw-r--r--tools/objtool/Makefile2
-rw-r--r--tools/objtool/arch.h64
-rw-r--r--tools/objtool/arch/x86/decode.c400
-rw-r--r--tools/objtool/arch/x86/insn/x86-opcode-map.txt2
-rw-r--r--tools/objtool/builtin-check.c1280
-rw-r--r--tools/objtool/cfi.h55
-rw-r--r--tools/objtool/check.c1655
-rw-r--r--tools/objtool/check.h66
-rw-r--r--tools/objtool/elf.c59
-rw-r--r--tools/objtool/elf.h6
-rw-r--r--tools/objtool/special.c6
-rw-r--r--tools/objtool/warn.h10
-rw-r--r--tools/perf/Documentation/intel-pt.txt78
-rw-r--r--tools/perf/Documentation/itrace.txt8
-rw-r--r--tools/perf/Documentation/perf-ftrace.txt33
-rw-r--r--tools/perf/Documentation/perf-script.txt18
-rw-r--r--tools/perf/Documentation/perf-stat.txt14
-rw-r--r--tools/perf/Makefile.config38
-rw-r--r--tools/perf/Makefile.perf2
-rw-r--r--tools/perf/arch/Build2
-rw-r--r--tools/perf/arch/arm/util/cs-etm.c29
-rw-r--r--tools/perf/arch/powerpc/util/Build2
-rw-r--r--tools/perf/arch/powerpc/util/unwind-libdw.c73
-rw-r--r--tools/perf/arch/x86/tests/insn-x86-dat-32.c12
-rw-r--r--tools/perf/arch/x86/tests/insn-x86-dat-64.c30
-rw-r--r--tools/perf/arch/x86/tests/insn-x86-dat-src.c30
-rw-r--r--tools/perf/arch/x86/util/intel-bts.c4
-rw-r--r--tools/perf/arch/x86/util/intel-pt.c9
-rw-r--r--tools/perf/bench/numa.c2
-rw-r--r--tools/perf/builtin-c2c.c4
-rw-r--r--tools/perf/builtin-config.c67
-rw-r--r--tools/perf/builtin-diff.c5
-rw-r--r--tools/perf/builtin-ftrace.c159
-rw-r--r--tools/perf/builtin-help.c48
-rw-r--r--tools/perf/builtin-kmem.c4
-rw-r--r--tools/perf/builtin-record.c4
-rw-r--r--tools/perf/builtin-report.c8
-rw-r--r--tools/perf/builtin-sched.c2
-rw-r--r--tools/perf/builtin-script.c353
-rw-r--r--tools/perf/builtin-stat.c53
-rw-r--r--tools/perf/builtin-top.c4
-rw-r--r--tools/perf/jvmti/jvmti_agent.c2
-rw-r--r--tools/perf/jvmti/jvmti_agent.h2
-rw-r--r--tools/perf/jvmti/libjvmti.c5
-rw-r--r--tools/perf/pmu-events/Build4
-rw-r--r--tools/perf/pmu-events/jevents.c4
-rw-r--r--tools/perf/scripts/python/bin/intel-pt-events-record13
-rw-r--r--tools/perf/scripts/python/bin/intel-pt-events-report3
-rw-r--r--tools/perf/scripts/python/intel-pt-events.py128
-rw-r--r--tools/perf/tests/Build2
-rw-r--r--tools/perf/tests/attr.c10
-rw-r--r--tools/perf/tests/attr.py48
-rw-r--r--tools/perf/tests/bp_signal.c3
-rw-r--r--tools/perf/tests/bp_signal_overflow.c3
-rw-r--r--tools/perf/tests/bpf-script-test-prologue.c9
-rw-r--r--tools/perf/tests/dwarf-unwind.c15
-rw-r--r--tools/perf/tests/parse-events.c13
-rw-r--r--tools/perf/tests/task-exit.c2
-rw-r--r--tools/perf/ui/browsers/annotate.c54
-rw-r--r--tools/perf/ui/gtk/annotate.c3
-rw-r--r--tools/perf/util/annotate.c10
-rw-r--r--tools/perf/util/annotate.h4
-rw-r--r--tools/perf/util/auxtrace.c18
-rw-r--r--tools/perf/util/auxtrace.h6
-rw-r--r--tools/perf/util/cache.h3
-rw-r--r--tools/perf/util/config.c43
-rw-r--r--tools/perf/util/config.h4
-rw-r--r--tools/perf/util/data-convert-bt.c6
-rw-r--r--tools/perf/util/debug.h11
-rw-r--r--tools/perf/util/event.h121
-rw-r--r--tools/perf/util/evlist.h3
-rw-r--r--tools/perf/util/evsel.c54
-rw-r--r--tools/perf/util/genelf_debug.c5
-rw-r--r--tools/perf/util/header.c5
-rw-r--r--tools/perf/util/help-unknown-cmd.c2
-rw-r--r--tools/perf/util/intel-bts.c2
-rw-r--r--tools/perf/util/intel-pt-decoder/intel-pt-decoder.c304
-rw-r--r--tools/perf/util/intel-pt-decoder/intel-pt-decoder.h13
-rw-r--r--tools/perf/util/intel-pt-decoder/intel-pt-log.h4
-rw-r--r--tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c110
-rw-r--r--tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h7
-rw-r--r--tools/perf/util/intel-pt-decoder/x86-opcode-map.txt2
-rw-r--r--tools/perf/util/intel-pt.c642
-rw-r--r--tools/perf/util/machine.c10
-rw-r--r--tools/perf/util/pmu.h4
-rw-r--r--tools/perf/util/probe-event.c2
-rw-r--r--tools/perf/util/probe-event.h4
-rw-r--r--tools/perf/util/scripting-engines/trace-event-python.c3
-rw-r--r--tools/perf/util/session.c2
-rw-r--r--tools/perf/util/sort.c22
-rw-r--r--tools/perf/util/stat-shadow.c33
-rw-r--r--tools/perf/util/stat.c2
-rw-r--r--tools/perf/util/stat.h2
-rw-r--r--tools/perf/util/strbuf.h4
-rw-r--r--tools/perf/util/trace-event-parse.c4
-rw-r--r--tools/perf/util/unwind-libdw.c8
-rw-r--r--tools/perf/util/usage.c62
-rw-r--r--tools/perf/util/util.c52
-rw-r--r--tools/perf/util/util.h21
-rw-r--r--tools/testing/selftests/bpf/bpf_endian.h41
-rw-r--r--tools/testing/selftests/bpf/test_verifier.c66
-rwxr-xr-xtools/testing/selftests/ntb/ntb_test.sh2
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/configcheck.sh2
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-build.sh2
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm.sh5
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/CFLIST2
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/SRCU-C.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/SRCU-N2
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/SRCU-P6
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/SRCU-t10
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/SRCU-u9
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TINY025
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE015
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot4
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE025
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE034
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot4
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE044
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE054
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot3
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE064
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot3
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE076
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE081
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE08-T21
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcuperf/TINY (renamed from tools/testing/selftests/rcutorture/configs/rcu/TREE02-T)19
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcuperf/TREE1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcuperf/TREE541
-rw-r--r--tools/testing/selftests/rcutorture/doc/TINY_RCU.txt1
-rw-r--r--tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt34
-rwxr-xr-xtools/testing/selftests/rcutorture/formal/srcu-cbmc/modify_srcu.awk2
-rw-r--r--virt/kvm/eventfd.c2
1108 files changed, 17755 insertions, 14649 deletions
diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl
index da5c087462b1..c3c705591532 100644
--- a/Documentation/DocBook/kernel-hacking.tmpl
+++ b/Documentation/DocBook/kernel-hacking.tmpl
@@ -819,7 +819,7 @@ printk(KERN_INFO "my ip: %pI4\n", &ipaddress);
certain condition is true. They must be used carefully to ensure
there is no race condition. You declare a
<type>wait_queue_head_t</type>, and then processes which want to
- wait for that condition declare a <type>wait_queue_t</type>
+ wait for that condition declare a <type>wait_queue_entry_t</type>
referring to themselves, and place that in the queue.
</para>
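
A minimal sketch of the interface described above, using the renamed
wait_queue_entry_t type; the my_wq, my_condition, and function names below
are hypothetical illustrations and not part of this patch:

	#include <linux/wait.h>

	static DECLARE_WAIT_QUEUE_HEAD(my_wq);	/* the wait_queue_head_t */
	static int my_condition;

	static int my_wait_for_event(void)
	{
		/*
		 * wait_event_interruptible() declares a wait_queue_entry_t
		 * for the calling task, places it on my_wq, and sleeps until
		 * my_condition becomes true or a signal arrives.
		 */
		return wait_event_interruptible(my_wq, my_condition != 0);
	}

	static void my_post_event(void)
	{
		my_condition = 1;
		wake_up_interruptible(&my_wq);	/* wake entries queued on my_wq */
	}
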
diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX
index 1672573b037a..f46980c060aa 100644
--- a/Documentation/RCU/00-INDEX
+++ b/Documentation/RCU/00-INDEX
@@ -28,8 +28,6 @@ stallwarn.txt
- RCU CPU stall warnings (module parameter rcu_cpu_stall_suppress)
torture.txt
- RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST)
-trace.txt
- - CONFIG_RCU_TRACE debugfs files and formats
UP.txt
- RCU on Uniprocessor Systems
whatisRCU.txt
diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html
index f60adf112663..95b30fa25d56 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -559,9 +559,7 @@ The <tt>rcu_access_pointer()</tt> on line&nbsp;6 is similar to
For <tt>remove_gp_synchronous()</tt>, as long as all modifications
to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>,
the above optimizations are harmless.
- However,
- with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>,
- <tt>sparse</tt> will complain if you
+ However, <tt>sparse</tt> will complain if you
define <tt>gp</tt> with <tt>__rcu</tt> and then
access it without using
either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>.
@@ -1849,7 +1847,8 @@ mass storage, or user patience, whichever comes first.
If the nesting is not visible to the compiler, as is the case with
mutually recursive functions each in its own translation unit,
stack overflow will result.
-If the nesting takes the form of loops, either the control variable
+If the nesting takes the form of loops, perhaps in the guise of tail
+recursion, either the control variable
will overflow or (in the Linux kernel) you will get an RCU CPU stall warning.
Nevertheless, this class of RCU implementations is one
of the most composable constructs in existence.
@@ -1977,9 +1976,8 @@ guard against mishaps and misuse:
and <tt>rcu_dereference()</tt>, perhaps (incorrectly)
substituting a simple assignment.
To catch this sort of error, a given RCU-protected pointer may be
- tagged with <tt>__rcu</tt>, after which running sparse
- with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain
- about simple-assignment accesses to that pointer.
+ tagged with <tt>__rcu</tt>, after which sparse
+ will complain about simple-assignment accesses to that pointer.
Arnd Bergmann made me aware of this requirement, and also
supplied the needed
<a href="https://lwn.net/Articles/376011/">patch series</a>.
@@ -2036,7 +2034,7 @@ guard against mishaps and misuse:
some other synchronization mechanism, for example, reference
counting.
<li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related
- information is provided via both debugfs and event tracing.
+ information is provided via event tracing.
<li> Open-coded use of <tt>rcu_assign_pointer()</tt> and
<tt>rcu_dereference()</tt> to create typical linked
data structures can be surprisingly error-prone.
@@ -2519,11 +2517,7 @@ It is similarly socially unacceptable to interrupt an
<tt>nohz_full</tt> CPU running in userspace.
RCU must therefore track <tt>nohz_full</tt> userspace
execution.
-And in
-<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a>
-kernels, RCU must separately track idle CPUs on the one hand and
-CPUs that are either idle or executing in userspace on the other.
-In both cases, RCU must be able to sample state at two points in
+RCU must therefore be able to sample state at two points in
time, and be able to determine whether or not some other CPU spent
any time idle and/or executing in userspace.
@@ -2936,6 +2930,20 @@ to whether or not a CPU is online, which means that <tt>srcu_barrier()</tt>
need not exclude CPU-hotplug operations.
<p>
+SRCU also differs from other RCU flavors in that SRCU's expedited and
+non-expedited grace periods are implemented by the same mechanism.
+This means that in the current SRCU implementation, expediting a
+future grace period has the side effect of expediting all prior
+grace periods that have not yet completed.
+(But please note that this is a property of the current implementation,
+not necessarily of future implementations.)
+In addition, if SRCU has been idle for longer than the interval
+specified by the <tt>srcutree.exp_holdoff</tt> kernel boot parameter
+(25&nbsp;microseconds by default),
+and if a <tt>synchronize_srcu()</tt> invocation ends this idle period,
+that invocation will be automatically expedited.
+
+<p>
As of v4.12, SRCU's callbacks are maintained per-CPU, eliminating
a locking bottleneck present in prior kernel versions.
Although this will allow users to put much heavier stress on
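
A minimal sketch of the synchronize_srcu() usage the added paragraph above
refers to; the my_srcu and my_data identifiers below are hypothetical
illustrations and not part of this patch:

	#include <linux/slab.h>
	#include <linux/srcu.h>

	DEFINE_STATIC_SRCU(my_srcu);
	static void __rcu *my_data;

	static void my_reader(void)
	{
		int idx = srcu_read_lock(&my_srcu);
		void *p = srcu_dereference(my_data, &my_srcu);

		/* ... use p; it remains valid until srcu_read_unlock() ... */
		srcu_read_unlock(&my_srcu, idx);
	}

	static void my_updater(void *new_data)
	{
		void *old = rcu_dereference_protected(my_data, 1);

		rcu_assign_pointer(my_data, new_data);
		/*
		 * Waits for all pre-existing SRCU readers; if my_srcu has
		 * been idle longer than srcutree.exp_holdoff, this call is
		 * automatically expedited.
		 */
		synchronize_srcu(&my_srcu);
		kfree(old);
	}
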
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 877947130ebe..6beda556faf3 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -413,11 +413,11 @@ over a rather long period of time, but improvements are always welcome!
read-side critical sections. It is the responsibility of the
RCU update-side primitives to deal with this.
-17. Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
- __rcu sparse checks (enabled by CONFIG_SPARSE_RCU_POINTER) to
- validate your RCU code. These can help find problems as follows:
+17. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
+ __rcu sparse checks to validate your RCU code. These can help
+ find problems as follows:
- CONFIG_PROVE_RCU: check that accesses to RCU-protected data
+ CONFIG_PROVE_LOCKING: check that accesses to RCU-protected data
structures are carried out under the proper RCU
read-side critical section, while holding the right
combination of locks, or whatever other conditions
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
deleted file mode 100644
index 6549012033f9..000000000000
--- a/Documentation/RCU/trace.txt
+++ /dev/null
@@ -1,535 +0,0 @@
-CONFIG_RCU_TRACE debugfs Files and Formats
-
-
-The rcutree and rcutiny implementations of RCU provide debugfs trace
-output that summarizes counters and state. This information is useful for
-debugging RCU itself, and can sometimes also help to debug abuses of RCU.
-The following sections describe the debugfs files and formats, first
-for rcutree and next for rcutiny.
-
-
-CONFIG_TREE_RCU and CONFIG_PREEMPT_RCU debugfs Files and Formats
-
-These implementations of RCU provide several debugfs directories under the
-top-level directory "rcu":
-
-rcu/rcu_bh
-rcu/rcu_preempt
-rcu/rcu_sched
-
-Each directory contains files for the corresponding flavor of RCU.
-Note that rcu/rcu_preempt is only present for CONFIG_PREEMPT_RCU.
-For CONFIG_TREE_RCU, the RCU flavor maps onto the RCU-sched flavor,
-so that activity for both appears in rcu/rcu_sched.
-
-In addition, the following file appears in the top-level directory:
-rcu/rcutorture. This file displays rcutorture test progress. The output
-of "cat rcu/rcutorture" looks as follows:
-
-rcutorture test sequence: 0 (test in progress)
-rcutorture update version number: 615
-
-The first line shows the number of rcutorture tests that have completed
-since boot. If a test is currently running, the "(test in progress)"
-string will appear as shown above. The second line shows the number of
-update cycles that the current test has started, or zero if there is
-no test in progress.
-
-
-Within each flavor directory (rcu/rcu_bh, rcu/rcu_sched, and possibly
-also rcu/rcu_preempt) the following files will be present:
-
-rcudata:
- Displays fields in struct rcu_data.
-rcuexp:
- Displays statistics for expedited grace periods.
-rcugp:
- Displays grace-period counters.
-rcuhier:
- Displays the struct rcu_node hierarchy.
-rcu_pending:
- Displays counts of the reasons rcu_pending() decided that RCU had
- work to do.
-rcuboost:
- Displays RCU boosting statistics. Only present if
- CONFIG_RCU_BOOST=y.
-
-The output of "cat rcu/rcu_preempt/rcudata" looks as follows:
-
- 0!c=30455 g=30456 cnq=1/0:1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716
- 1!c=30719 g=30720 cnq=1/0:0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982
- 2!c=30150 g=30151 cnq=1/1:1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458
- 3 c=31249 g=31250 cnq=1/1:0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622
- 4!c=29502 g=29503 cnq=1/0:1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521
- 5 c=31201 g=31202 cnq=1/0:1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698
- 6!c=30253 g=30254 cnq=1/0:1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353
- 7 c=31178 g=31178 cnq=1/0:0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969
-
-This file has one line per CPU, or eight for this 8-CPU system.
-The fields are as follows:
-
-o The number at the beginning of each line is the CPU number.
- CPUs numbers followed by an exclamation mark are offline,
- but have been online at least once since boot. There will be
- no output for CPUs that have never been online, which can be
- a good thing in the surprisingly common case where NR_CPUS is
- substantially larger than the number of actual CPUs.
-
-o "c" is the count of grace periods that this CPU believes have
- completed. Offlined CPUs and CPUs in dynticks idle mode may lag
- quite a ways behind, for example, CPU 4 under "rcu_sched" above,
- which has been offline through 16 RCU grace periods. It is not
- unusual to see offline CPUs lagging by thousands of grace periods.
- Note that although the grace-period number is an unsigned long,
- it is printed out as a signed long to allow more human-friendly
- representation near boot time.
-
-o "g" is the count of grace periods that this CPU believes have
- started. Again, offlined CPUs and CPUs in dynticks idle mode
- may lag behind. If the "c" and "g" values are equal, this CPU
- has already reported a quiescent state for the last RCU grace
- period that it is aware of, otherwise, the CPU believes that it
- owes RCU a quiescent state.
-
-o "pq" indicates that this CPU has passed through a quiescent state
- for the current grace period. It is possible for "pq" to be
- "1" and "c" different than "g", which indicates that although
- the CPU has passed through a quiescent state, either (1) this
- CPU has not yet reported that fact, (2) some other CPU has not
- yet reported for this grace period, or (3) both.
-
-o "qp" indicates that RCU still expects a quiescent state from
- this CPU. Offlined CPUs and CPUs in dyntick idle mode might
- well have qp=1, which is OK: RCU is still ignoring them.
-
-o "dt" is the current value of the dyntick counter that is incremented
- when entering or leaving idle, either due to a context switch or
- due to an interrupt. This number is even if the CPU is in idle
- from RCU's viewpoint and odd otherwise. The number after the
- first "/" is the interrupt nesting depth when in idle state,
- or a large number added to the interrupt-nesting depth when
- running a non-idle task. Some architectures do not accurately
- count interrupt nesting when running in non-idle kernel context,
- which can result in interesting anomalies such as negative
- interrupt-nesting levels. The number after the second "/"
- is the NMI nesting depth.
-
-o "df" is the number of times that some other CPU has forced a
- quiescent state on behalf of this CPU due to this CPU being in
- idle state.
-
-o "of" is the number of times that some other CPU has forced a
- quiescent state on behalf of this CPU due to this CPU being
- offline. In a perfect world, this might never happen, but it
- turns out that offlining and onlining a CPU can take several grace
- periods, and so there is likely to be an extended period of time
- when RCU believes that the CPU is online when it really is not.
- Please note that erring in the other direction (RCU believing a
- CPU is offline when it is really alive and kicking) is a fatal
- error, so it makes sense to err conservatively.
-
-o "ql" is the number of RCU callbacks currently residing on
- this CPU. The first number is the number of "lazy" callbacks
- that are known to RCU to only be freeing memory, and the number
- after the "/" is the total number of callbacks, lazy or not.
- These counters count callbacks regardless of what phase of
- grace-period processing that they are in (new, waiting for
- grace period to start, waiting for grace period to end, ready
- to invoke).
-
-o "qs" gives an indication of the state of the callback queue
- with four characters:
-
- "N" Indicates that there are callbacks queued that are not
- ready to be handled by the next grace period, and thus
- will be handled by the grace period following the next
- one.
-
- "R" Indicates that there are callbacks queued that are
- ready to be handled by the next grace period.
-
- "W" Indicates that there are callbacks queued that are
- waiting on the current grace period.
-
- "D" Indicates that there are callbacks queued that have
- already been handled by a prior grace period, and are
- thus waiting to be invoked. Note that callbacks in
- the process of being invoked are not counted here.
- Callbacks in the process of being invoked are those
- that have been removed from the rcu_data structures
- queues by rcu_do_batch(), but which have not yet been
- invoked.
-
- If there are no callbacks in a given one of the above states,
- the corresponding character is replaced by ".".
-
-o "b" is the batch limit for this CPU. If more than this number
- of RCU callbacks is ready to invoke, then the remainder will
- be deferred.
-
-o "ci" is the number of RCU callbacks that have been invoked for
- this CPU. Note that ci+nci+ql is the number of callbacks that have
- been registered in absence of CPU-hotplug activity.
-
-o "nci" is the number of RCU callbacks that have been offloaded from
- this CPU. This will always be zero unless the kernel was built
- with CONFIG_RCU_NOCB_CPU=y and the "rcu_nocbs=" kernel boot
- parameter was specified.
-
-o "co" is the number of RCU callbacks that have been orphaned due to
- this CPU going offline. These orphaned callbacks have been moved
- to an arbitrarily chosen online CPU.
-
-o "ca" is the number of RCU callbacks that have been adopted by this
- CPU due to other CPUs going offline. Note that ci+co-ca+ql is
- the number of RCU callbacks registered on this CPU.
-
-
-Kernels compiled with CONFIG_RCU_BOOST=y display the following from
-/debug/rcu/rcu_preempt/rcudata:
-
- 0!c=12865 g=12866 cnq=1/0:1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871
- 1 c=14407 g=14408 cnq=1/0:0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485
- 2 c=14407 g=14408 cnq=1/0:0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490
- 3 c=14407 g=14408 cnq=1/0:0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290
- 4 c=14405 g=14406 cnq=1/0:1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114
- 5!c=14168 g=14169 cnq=1/0:0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722
- 6 c=14404 g=14405 cnq=1/0:0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811
- 7 c=14407 g=14408 cnq=1/0:1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042
-
-This is similar to the output discussed above, but contains the following
-additional fields:
-
-o "kt" is the per-CPU kernel-thread state. The digit preceding
- the first slash is zero if there is no work pending and 1
- otherwise. The character between the first pair of slashes is
- as follows:
-
- "S" The kernel thread is stopped, in other words, all
- CPUs corresponding to this rcu_node structure are
- offline.
-
- "R" The kernel thread is running.
-
- "W" The kernel thread is waiting because there is no work
- for it to do.
-
- "O" The kernel thread is waiting because it has been
- forced off of its designated CPU or because its
- ->cpus_allowed mask permits it to run on other than
- its designated CPU.
-
- "Y" The kernel thread is yielding to avoid hogging CPU.
-
- "?" Unknown value, indicates a bug.
-
- The number after the final slash is the CPU that the kthread
- is actually running on.
-
- This field is displayed only for CONFIG_RCU_BOOST kernels.
-
-o "ktl" is the low-order 16 bits (in hexadecimal) of the count of
- the number of times that this CPU's per-CPU kthread has gone
- through its loop servicing invoke_rcu_cpu_kthread() requests.
-
- This field is displayed only for CONFIG_RCU_BOOST kernels.
-
-
-The output of "cat rcu/rcu_preempt/rcuexp" looks as follows:
-
-s=21872 wd1=0 wd2=0 wd3=5 enq=0 sc=21872
-
-These fields are as follows:
-
-o "s" is the sequence number, with an odd number indicating that
- an expedited grace period is in progress.
-
-o "wd1", "wd2", and "wd3" are the number of times that an attempt
- to start an expedited grace period found that someone else had
- completed an expedited grace period that satisfies the attempted
- request. "Our work is done."
-
-o "enq" is the number of quiescent states still outstanding.
-
-o "sc" is the number of times that the attempt to start a
- new expedited grace period succeeded.
-
-
-The output of "cat rcu/rcu_preempt/rcugp" looks as follows:
-
-completed=31249 gpnum=31250 age=1 max=18
-
-These fields are taken from the rcu_state structure, and are as follows:
-
-o "completed" is the number of grace periods that have completed.
- It is comparable to the "c" field from rcu/rcudata in that a
- CPU whose "c" field matches the value of "completed" is aware
- that the corresponding RCU grace period has completed.
-
-o "gpnum" is the number of grace periods that have started. It is
- similarly comparable to the "g" field from rcu/rcudata in that
- a CPU whose "g" field matches the value of "gpnum" is aware that
- the corresponding RCU grace period has started.
-
- If these two fields are equal, then there is no grace period
- in progress, in other words, RCU is idle. On the other hand,
- if the two fields differ (as they are above), then an RCU grace
- period is in progress.
-
-o "age" is the number of jiffies that the current grace period
- has extended for, or zero if there is no grace period currently
- in effect.
-
-o "max" is the age in jiffies of the longest-duration grace period
- thus far.
-
-The output of "cat rcu/rcu_preempt/rcuhier" looks as follows:
-
-c=14407 g=14408 s=0 jfq=2 j=c863 nfqs=12040/nfqsng=0(12040) fqlh=1051 oqlen=0/0
-3/3 ..>. 0:7 ^0
-e/e ..>. 0:3 ^0 d/d ..>. 4:7 ^1
-
-The fields are as follows:
-
-o "c" is exactly the same as "completed" under rcu/rcu_preempt/rcugp.
-
-o "g" is exactly the same as "gpnum" under rcu/rcu_preempt/rcugp.
-
-o "s" is the current state of the force_quiescent_state()
- state machine.
-
-o "jfq" is the number of jiffies remaining for this grace period
- before force_quiescent_state() is invoked to help push things
- along. Note that CPUs in idle mode throughout the grace period
-	will not report on their own, but rather must be checked by some
- other CPU via force_quiescent_state().
-
-o "j" is the low-order four hex digits of the jiffies counter.
- Yes, Paul did run into a number of problems that turned out to
- be due to the jiffies counter no longer counting. Why do you ask?
-
-o "nfqs" is the number of calls to force_quiescent_state() since
- boot.
-
-o "nfqsng" is the number of useless calls to force_quiescent_state(),
- where there wasn't actually a grace period active. This can
- no longer happen due to grace-period processing being pushed
- into a kthread. The number in parentheses is the difference
- between "nfqs" and "nfqsng", or the number of times that
- force_quiescent_state() actually did some real work.
-
-o "fqlh" is the number of calls to force_quiescent_state() that
- exited immediately (without even being counted in nfqs above)
- due to contention on ->fqslock.
-
-o Each element of the form "3/3 ..>. 0:7 ^0" represents one rcu_node
- structure. Each line represents one level of the hierarchy,
- from root to leaves. It is best to think of the rcu_data
- structures as forming yet another level after the leaves.
- Note that there might be either one, two, three, or even four
- levels of rcu_node structures, depending on the relationship
- between CONFIG_RCU_FANOUT, CONFIG_RCU_FANOUT_LEAF (possibly
- adjusted using the rcu_fanout_leaf kernel boot parameter), and
- CONFIG_NR_CPUS (possibly adjusted using the nr_cpu_ids count of
- possible CPUs for the booting hardware).
-
- o The numbers separated by the "/" are the qsmask followed
- by the qsmaskinit. The qsmask will have one bit
- set for each entity in the next lower level that has
- not yet checked in for the current grace period ("e"
- indicating CPUs 5, 6, and 7 in the example above).
- The qsmaskinit will have one bit for each entity that is
- currently expected to check in during each grace period.
- The value of qsmaskinit is assigned to that of qsmask
- at the beginning of each grace period.
-
- o The characters separated by the ">" indicate the state
- of the blocked-tasks lists. A "G" preceding the ">"
- indicates that at least one task blocked in an RCU
- read-side critical section blocks the current grace
- period, while a "E" preceding the ">" indicates that
- at least one task blocked in an RCU read-side critical
- section blocks the current expedited grace period.
- A "T" character following the ">" indicates that at
- least one task is blocked within an RCU read-side
- critical section, regardless of whether any current
- grace period (expedited or normal) is inconvenienced.
- A "." character appears if the corresponding condition
- does not hold, so that "..>." indicates that no tasks
- are blocked. In contrast, "GE>T" indicates maximal
- inconvenience from blocked tasks. CONFIG_TREE_RCU
- builds of the kernel will always show "..>.".
-
- o The numbers separated by the ":" are the range of CPUs
- served by this struct rcu_node. This can be helpful
- in working out how the hierarchy is wired together.
-
- For example, the example rcu_node structure shown above
- has "0:7", indicating that it covers CPUs 0 through 7.
-
- o The number after the "^" indicates the bit in the
- next higher level rcu_node structure that this rcu_node
- structure corresponds to. For example, the "d/d ..>. 4:7
- ^1" has a "1" in this position, indicating that it
- corresponds to the "1" bit in the "3" shown in the
- "3/3 ..>. 0:7 ^0" entry on the next level up.
-
-
-The output of "cat rcu/rcu_sched/rcu_pending" looks as follows:
-
- 0!np=26111 qsp=29 rpq=5386 cbr=1 cng=570 gpc=3674 gps=577 nn=15903 ndw=0
- 1!np=28913 qsp=35 rpq=6097 cbr=1 cng=448 gpc=3700 gps=554 nn=18113 ndw=0
- 2!np=32740 qsp=37 rpq=6202 cbr=0 cng=476 gpc=4627 gps=546 nn=20889 ndw=0
- 3 np=23679 qsp=22 rpq=5044 cbr=1 cng=415 gpc=3403 gps=347 nn=14469 ndw=0
- 4!np=30714 qsp=4 rpq=5574 cbr=0 cng=528 gpc=3931 gps=639 nn=20042 ndw=0
- 5 np=28910 qsp=2 rpq=5246 cbr=0 cng=428 gpc=4105 gps=709 nn=18422 ndw=0
- 6!np=38648 qsp=5 rpq=7076 cbr=0 cng=840 gpc=4072 gps=961 nn=25699 ndw=0
- 7 np=37275 qsp=2 rpq=6873 cbr=0 cng=868 gpc=3416 gps=971 nn=25147 ndw=0
-
-The fields are as follows:
-
-o The leading number is the CPU number, with "!" indicating
- an offline CPU.
-
-o "np" is the number of times that __rcu_pending() has been invoked
- for the corresponding flavor of RCU.
-
-o "qsp" is the number of times that the RCU was waiting for a
- quiescent state from this CPU.
-
-o "rpq" is the number of times that the CPU had passed through
- a quiescent state, but not yet reported it to RCU.
-
-o "cbr" is the number of times that this CPU had RCU callbacks
- that had passed through a grace period, and were thus ready
- to be invoked.
-
-o "cng" is the number of times that this CPU needed another
- grace period while RCU was idle.
-
-o "gpc" is the number of times that an old grace period had
- completed, but this CPU was not yet aware of it.
-
-o "gps" is the number of times that a new grace period had started,
- but this CPU was not yet aware of it.
-
-o "ndw" is the number of times that a wakeup of an rcuo
- callback-offload kthread had to be deferred in order to avoid
- deadlock.
-
-o "nn" is the number of times that this CPU needed nothing.
-
-
-The output of "cat rcu/rcuboost" looks as follows:
-
-0:3 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894
- balk: nt=0 egt=4695 bt=0 nb=0 ny=56 nos=0
-4:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894
- balk: nt=0 egt=6541 bt=0 nb=0 ny=126 nos=0
-
-This information is output only for rcu_preempt. Each two-line entry
-corresponds to a leaf rcu_node structure. The fields are as follows:
-
-o "n:m" is the CPU-number range for the corresponding two-line
- entry. In the sample output above, the first entry covers
- CPUs zero through three and the second entry covers CPUs four
- through seven.
-
-o "tasks=TNEB" gives the state of the various segments of the
- rnp->blocked_tasks list:
-
- "T" This indicates that there are some tasks that blocked
- while running on one of the corresponding CPUs while
- in an RCU read-side critical section.
-
- "N" This indicates that some of the blocked tasks are preventing
- the current normal (non-expedited) grace period from
- completing.
-
- "E" This indicates that some of the blocked tasks are preventing
- the current expedited grace period from completing.
-
- "B" This indicates that some of the blocked tasks are in
- need of RCU priority boosting.
-
- Each character is replaced with "." if the corresponding
- condition does not hold.
-
-o "kt" is the state of the RCU priority-boosting kernel
- thread associated with the corresponding rcu_node structure.
- The state can be one of the following:
-
- "S" The kernel thread is stopped, in other words, all
- CPUs corresponding to this rcu_node structure are
- offline.
-
- "R" The kernel thread is running.
-
- "W" The kernel thread is waiting because there is no work
- for it to do.
-
- "Y" The kernel thread is yielding to avoid hogging CPU.
-
- "?" Unknown value, indicates a bug.
-
-o "ntb" is the number of tasks boosted.
-
-o "neb" is the number of tasks boosted in order to complete an
- expedited grace period.
-
-o "nnb" is the number of tasks boosted in order to complete a
- normal (non-expedited) grace period. When boosting a task
- that was blocking both an expedited and a normal grace period,
- it is counted against the expedited total above.
-
-o "j" is the low-order 16 bits of the jiffies counter in
- hexadecimal.
-
-o "bt" is the low-order 16 bits of the value that the jiffies
- counter will have when we next start boosting, assuming that
- the current grace period does not end beforehand. This is
- also in hexadecimal.
-
-o "balk: nt" counts the number of times we didn't boost (in
- other words, we balked) even though it was time to boost because
- there were no blocked tasks to boost. This situation occurs
- when there is one blocked task on one rcu_node structure and
- none on some other rcu_node structure.
-
-o "egt" counts the number of times we balked because although
- there were blocked tasks, none of them were blocking the
- current grace period, whether expedited or otherwise.
-
-o "bt" counts the number of times we balked because boosting
- had already been initiated for the current grace period.
-
-o "nb" counts the number of times we balked because there
- was at least one task blocking the current non-expedited grace
- period that never had blocked. If it is already running, it
- just won't help to boost its priority!
-
-o "ny" counts the number of times we balked because it was
- not yet time to start boosting.
-
-o "nos" counts the number of times we balked for other
- reasons, e.g., the grace period ended first.
-
-
-CONFIG_TINY_RCU debugfs Files and Formats
-
-These implementations of RCU provide a single debugfs file under the
-top-level directory RCU, namely rcu/rcudata, which displays fields in
-rcu_bh_ctrlblk and rcu_sched_ctrlblk.
-
-The output of "cat rcu/rcudata" is as follows:
-
-rcu_sched: qlen: 0
-rcu_bh: qlen: 0
-
-This is split into rcu_sched and rcu_bh sections. The field is as
-follows:
-
-o "qlen" is the number of RCU callbacks currently waiting either
- for an RCU grace period or waiting to be invoked. This is the
- only field present for rcu_sched and rcu_bh, due to the
- short-circuiting of grace period in those two cases.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 0f5c3b4347c6..9b0b3dea6326 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2136,6 +2136,12 @@
memmap=nn[KMG]@ss[KMG]
[KNL] Force usage of a specific region of memory.
Region of memory to be used is from ss to ss+nn.
+ If @ss[KMG] is omitted, it is equivalent to mem=nn[KMG],
+ which limits max address to nn[KMG].
+ Multiple different regions can be specified,
+ comma delimited.
+ Example:
+ memmap=100M@2G,100M#3G,1G!1024G
memmap=nn[KMG]#ss[KMG]
[KNL,ACPI] Mark specific memory as ACPI data.
@@ -2148,6 +2154,9 @@
memmap=64K$0x18690000
or
memmap=0x10000$0x18690000
+ Some bootloaders, such as GRUB 2, may need an escape
+ character before '$'; otherwise '$' and the following
+ number will be eaten.
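(For example, in a GRUB 2 configuration the entry above might need to be
written as memmap=64K\$0x18690000; the exact escape sequence is
bootloader-specific, and the form shown here is only illustrative.)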
memmap=nn[KMG]!ss[KMG]
[KNL,X86] Mark specific memory as protected.
@@ -3238,21 +3247,17 @@
rcutree.gp_cleanup_delay= [KNL]
Set the number of jiffies to delay each step of
- RCU grace-period cleanup. This only has effect
- when CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP is set.
+ RCU grace-period cleanup.
rcutree.gp_init_delay= [KNL]
Set the number of jiffies to delay each step of
- RCU grace-period initialization. This only has
- effect when CONFIG_RCU_TORTURE_TEST_SLOW_INIT
- is set.
+ RCU grace-period initialization.
rcutree.gp_preinit_delay= [KNL]
Set the number of jiffies to delay each step of
RCU grace-period pre-initialization, that is,
the propagation of recent CPU-hotplug changes up
- the rcu_node combining tree. This only has effect
- when CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT is set.
+ the rcu_node combining tree.
rcutree.rcu_fanout_exact= [KNL]
Disable autobalancing of the rcu_node combining
@@ -3328,6 +3333,17 @@
This wake_up() will be accompanied by a
WARN_ONCE() splat and an ftrace_dump().
+ rcuperf.gp_async= [KNL]
+ Measure performance of asynchronous
+ grace-period primitives such as call_rcu().
+
+ rcuperf.gp_async_max= [KNL]
+ Specify the maximum number of outstanding
+ callbacks per writer thread. When a writer
+ thread exceeds this limit, it invokes the
+ corresponding flavor of rcu_barrier() to allow
+ previously posted callbacks to drain.
+
rcuperf.gp_exp= [KNL]
Measure performance of expedited synchronous
grace-period primitives.
@@ -3355,17 +3371,22 @@
rcuperf.perf_runnable= [BOOT]
Start rcuperf running at boot time.
+ rcuperf.perf_type= [KNL]
+ Specify the RCU implementation to test.
+
rcuperf.shutdown= [KNL]
Shut the system down after performance tests
complete. This is useful for hands-off automated
testing.
- rcuperf.perf_type= [KNL]
- Specify the RCU implementation to test.
-
rcuperf.verbose= [KNL]
Enable additional printk() statements.
+ rcuperf.writer_holdoff= [KNL]
+ Write-side holdoff between grace periods,
+ in microseconds. The default of zero says
+ no holdoff.
+
rcutorture.cbflood_inter_holdoff= [KNL]
Set holdoff time (jiffies) between successive
callback-flood tests.
@@ -3803,6 +3824,15 @@
spia_pedr=
spia_peddr=
+ srcutree.counter_wrap_check [KNL]
+ Specifies how frequently to check for
+ grace-period sequence counter wrap for the
+ srcu_data structure's ->srcu_gp_seq_needed field.
+ The greater the number of bits set in this kernel
+ parameter, the less frequently counter wrap will
+ be checked for. Note that the bottom two bits
+ are ignored.
+
srcutree.exp_holdoff [KNL]
Specifies how many nanoseconds must elapse
since the end of the last SRCU grace period for
@@ -3811,6 +3841,13 @@
expediting. Set to zero to disable automatic
expediting.
+ stack_guard_gap= [MM]
+ Override the default stack gap protection. The value
+ is in page units and defines how many pages prior
+ to (for stacks growing down) or after (for stacks
+ growing up) the main stack are reserved for no other
+ mapping. The default value is 256 pages.
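(With 4 KiB pages the default of 256 pages corresponds to a 1 MiB gap;
booting with, for example, stack_guard_gap=1024 would widen the reserved
gap to 4 MiB.)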
+
stacktrace [FTRACE]
Enabled the stack tracer on boot up.
diff --git a/Documentation/block/data-integrity.txt b/Documentation/block/data-integrity.txt
index f56ec97f0d14..934c44ea0c57 100644
--- a/Documentation/block/data-integrity.txt
+++ b/Documentation/block/data-integrity.txt
@@ -192,7 +192,7 @@ will require extra work due to the application tag.
supported by the block device.
- int bio_integrity_prep(bio);
+ bool bio_integrity_prep(bio);
To generate IMD for WRITE and to set up buffers for READ, the
filesystem must call bio_integrity_prep(bio).
@@ -201,9 +201,7 @@ will require extra work due to the application tag.
sector must be set, and the bio should have all data pages
added. It is up to the caller to ensure that the bio does not
change while I/O is in progress.
-
- bio_integrity_prep() should only be called if
- bio_integrity_enabled() returned 1.
+ Complete the bio with an error if the prepare failed for some reason.
5.3 PASSING EXISTING INTEGRITY METADATA
diff --git a/Documentation/core-api/atomic_ops.rst b/Documentation/core-api/atomic_ops.rst
index 55e43f1c80de..fce929144ccd 100644
--- a/Documentation/core-api/atomic_ops.rst
+++ b/Documentation/core-api/atomic_ops.rst
@@ -303,6 +303,11 @@ defined which accomplish this::
void smp_mb__before_atomic(void);
void smp_mb__after_atomic(void);
+Preceding a non-value-returning read-modify-write atomic operation with
+smp_mb__before_atomic() and following it with smp_mb__after_atomic()
+provides the same full ordering that is provided by value-returning
+read-modify-write atomic operations.
+
For example, smp_mb__before_atomic() can be used like so::
obj->dead = 1;
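As a minimal sketch of the guarantee described in the paragraph added above
(the variable and counter names are illustrative, not taken from the
document), pairing the two barriers around a non-value-returning atomic
looks like::

	WRITE_ONCE(x, 1);
	smp_mb__before_atomic();
	atomic_inc(&cnt);		/* non-value-returning RMW atomic */
	smp_mb__after_atomic();
	WRITE_ONCE(y, 1);

The stores to x and y are then ordered around the atomic_inc() just as they
would be around a fully ordered atomic_inc_return(&cnt).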
diff --git a/Documentation/dev-tools/sparse.rst b/Documentation/dev-tools/sparse.rst
index ffdcc97f6f5a..78aa00a604a0 100644
--- a/Documentation/dev-tools/sparse.rst
+++ b/Documentation/dev-tools/sparse.rst
@@ -103,9 +103,3 @@ have already built it.
The optional make variable CF can be used to pass arguments to sparse. The
build system passes -Wbitwise to sparse automatically.
-
-Checking RCU annotations
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-RCU annotations are not checked by default. To enable RCU annotation
-checks, include -DCONFIG_SPARSE_RCU_POINTER in your CF flags.
diff --git a/Documentation/devicetree/bindings/clock/sunxi-ccu.txt b/Documentation/devicetree/bindings/clock/sunxi-ccu.txt
index e9c5a1d9834a..f465647a4dd2 100644
--- a/Documentation/devicetree/bindings/clock/sunxi-ccu.txt
+++ b/Documentation/devicetree/bindings/clock/sunxi-ccu.txt
@@ -22,7 +22,8 @@ Required properties :
- #clock-cells : must contain 1
- #reset-cells : must contain 1
-For the PRCM CCUs on H3/A64, one more clock is needed:
+For the PRCM CCUs on H3/A64, two more clocks are needed:
+- "pll-periph": the SoC's peripheral PLL from the main CCU
- "iosc": the SoC's internal frequency oscillator
Example for generic CCU:
@@ -39,8 +40,8 @@ Example for PRCM CCU:
r_ccu: clock@01f01400 {
compatible = "allwinner,sun50i-a64-r-ccu";
reg = <0x01f01400 0x100>;
- clocks = <&osc24M>, <&osc32k>, <&iosc>;
- clock-names = "hosc", "losc", "iosc";
+ clocks = <&osc24M>, <&osc32k>, <&iosc>, <&ccu CLK_PLL_PERIPH0>;
+ clock-names = "hosc", "losc", "iosc", "pll-periph";
#clock-cells = <1>;
#reset-cells = <1>;
};
diff --git a/Documentation/devicetree/bindings/gpio/gpio-mvebu.txt b/Documentation/devicetree/bindings/gpio/gpio-mvebu.txt
index 42c3bb2d53e8..01e331a5f3e7 100644
--- a/Documentation/devicetree/bindings/gpio/gpio-mvebu.txt
+++ b/Documentation/devicetree/bindings/gpio/gpio-mvebu.txt
@@ -41,9 +41,9 @@ Required properties:
Optional properties:
In order to use the GPIO lines in PWM mode, some additional optional
-properties are required. Only Armada 370 and XP support these properties.
+properties are required.
-- compatible: Must contain "marvell,armada-370-xp-gpio"
+- compatible: Must contain "marvell,armada-370-gpio"
- reg: an additional register set is needed, for the GPIO Blink
Counter on/off registers.
@@ -71,7 +71,7 @@ Example:
};
gpio1: gpio@18140 {
- compatible = "marvell,armada-370-xp-gpio";
+ compatible = "marvell,armada-370-gpio";
reg = <0x18140 0x40>, <0x181c8 0x08>;
reg-names = "gpio", "pwm";
ngpios = <17>;
diff --git a/Documentation/devicetree/bindings/mfd/stm32-timers.txt b/Documentation/devicetree/bindings/mfd/stm32-timers.txt
index bbd083f5600a..1db6e0057a63 100644
--- a/Documentation/devicetree/bindings/mfd/stm32-timers.txt
+++ b/Documentation/devicetree/bindings/mfd/stm32-timers.txt
@@ -31,7 +31,7 @@ Example:
compatible = "st,stm32-timers";
reg = <0x40010000 0x400>;
clocks = <&rcc 0 160>;
- clock-names = "clk_int";
+ clock-names = "int";
pwm {
compatible = "st,stm32-pwm";
diff --git a/Documentation/devicetree/bindings/net/dsa/b53.txt b/Documentation/devicetree/bindings/net/dsa/b53.txt
index d6c6e41648d4..8ec2ca21adeb 100644
--- a/Documentation/devicetree/bindings/net/dsa/b53.txt
+++ b/Documentation/devicetree/bindings/net/dsa/b53.txt
@@ -34,7 +34,7 @@ Required properties:
"brcm,bcm6328-switch"
"brcm,bcm6368-switch" and the mandatory "brcm,bcm63xx-switch"
-See Documentation/devicetree/bindings/dsa/dsa.txt for a list of additional
+See Documentation/devicetree/bindings/net/dsa/dsa.txt for a list of additional
required and optional properties.
Examples:
diff --git a/Documentation/devicetree/bindings/net/smsc911x.txt b/Documentation/devicetree/bindings/net/smsc911x.txt
index 16c3a9501f5d..acfafc8e143c 100644
--- a/Documentation/devicetree/bindings/net/smsc911x.txt
+++ b/Documentation/devicetree/bindings/net/smsc911x.txt
@@ -27,6 +27,7 @@ Optional properties:
of the device. On many systems this is wired high so the device goes
out of reset at power-on, but if it is under program control, this
optional GPIO can wake up in response to it.
+- vdd33a-supply, vddvario-supply : 3.3V analog and IO logic power supplies
Examples:
diff --git a/Documentation/filesystems/autofs4.txt b/Documentation/filesystems/autofs4.txt
index f10dd590f69f..8444dc3d57e8 100644
--- a/Documentation/filesystems/autofs4.txt
+++ b/Documentation/filesystems/autofs4.txt
@@ -316,7 +316,7 @@ For version 5, the format of the message is:
struct autofs_v5_packet {
int proto_version; /* Protocol version */
int type; /* Type of packet */
- autofs_wqt_t wait_queue_token;
+ autofs_wqt_t wait_queue_entry_token;
__u32 dev;
__u64 ino;
__u32 uid;
@@ -341,12 +341,12 @@ The pipe will be set to "packet mode" (equivalent to passing
`O_DIRECT`) to _pipe2(2)_ so that a read from the pipe will return at
most one packet, and any unread portion of a packet will be discarded.
-The `wait_queue_token` is a unique number which can identify a
+The `wait_queue_entry_token` is a unique number which can identify a
particular request to be acknowledged. When a message is sent over
the pipe the affected dentry is marked as either "active" or
"expiring" and other accesses to it block until the message is
acknowledged using one of the ioctls below and the relevant
-`wait_queue_token`.
+`wait_queue_entry_token`.
Communicating with autofs: root directory ioctls
------------------------------------------------
@@ -358,7 +358,7 @@ capability, or must be the automount daemon.
The available ioctl commands are:
- **AUTOFS_IOC_READY**: a notification has been handled. The argument
- to the ioctl command is the "wait_queue_token" number
+ to the ioctl command is the "wait_queue_entry_token" number
corresponding to the notification being acknowledged.
- **AUTOFS_IOC_FAIL**: similar to above, but indicates failure with
the error code `ENOENT`.
@@ -382,14 +382,14 @@ The available ioctl commands are:
struct autofs_packet_expire_multi {
int proto_version; /* Protocol version */
int type; /* Type of packet */
- autofs_wqt_t wait_queue_token;
+ autofs_wqt_t wait_queue_entry_token;
int len;
char name[NAME_MAX+1];
};
is required. This is filled in with the name of something
that can be unmounted or removed. If nothing can be expired,
- `errno` is set to `EAGAIN`. Even though a `wait_queue_token`
+ `errno` is set to `EAGAIN`. Even though a `wait_queue_entry_token`
is present in the structure, no "wait queue" is established
and no acknowledgment is needed.
- **AUTOFS_IOC_EXPIRE_MULTI**: This is similar to
diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
index df31e30b6a02..2cb7dc5c0e0d 100644
--- a/Documentation/kernel-per-CPU-kthreads.txt
+++ b/Documentation/kernel-per-CPU-kthreads.txt
@@ -109,13 +109,12 @@ SCHED_SOFTIRQ: Do all of the following:
on that CPU. If a thread that expects to run on the de-jittered
CPU awakens, the scheduler will send an IPI that can result in
a subsequent SCHED_SOFTIRQ.
-2. Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y,
- CONFIG_NO_HZ_FULL=y, and, in addition, ensure that the CPU
- to be de-jittered is marked as an adaptive-ticks CPU using the
- "nohz_full=" boot parameter. This reduces the number of
- scheduler-clock interrupts that the de-jittered CPU receives,
- minimizing its chances of being selected to do the load balancing
- work that runs in SCHED_SOFTIRQ context.
+2. Build with CONFIG_NO_HZ_FULL=y and ensure that the CPU to be de-jittered
+ is marked as an adaptive-ticks CPU using the "nohz_full="
+ boot parameter. This reduces the number of scheduler-clock
+ interrupts that the de-jittered CPU receives, minimizing its
+ chances of being selected to do the load balancing work that
+ runs in SCHED_SOFTIRQ context.
3. To the extent possible, keep the CPU out of the kernel when it
is non-idle, for example, by avoiding system calls and by
forcing both kernel threads and interrupts to execute elsewhere.
@@ -135,11 +134,10 @@ HRTIMER_SOFTIRQ: Do all of the following:
RCU_SOFTIRQ: Do at least one of the following:
1. Offload callbacks and keep the CPU in either dyntick-idle or
adaptive-ticks state by doing all of the following:
- a. Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y,
- CONFIG_NO_HZ_FULL=y, and, in addition ensure that the CPU
- to be de-jittered is marked as an adaptive-ticks CPU using
- the "nohz_full=" boot parameter. Bind the rcuo kthreads
- to housekeeping CPUs, which can tolerate OS jitter.
+ a. Build with CONFIG_NO_HZ_FULL=y and ensure that the CPU to be
+ de-jittered is marked as an adaptive-ticks CPU using the
+ "nohz_full=" boot parameter. Bind the rcuo kthreads to
+ housekeeping CPUs, which can tolerate OS jitter.
b. To the extent possible, keep the CPU out of the kernel
when it is non-idle, for example, by avoiding system
calls and by forcing both kernel threads and interrupts
@@ -236,11 +234,10 @@ To reduce its OS jitter, do at least one of the following:
is feasible only if your workload never requires RCU priority
boosting, for example, if you ensure frequent idle time on all
CPUs that might execute within the kernel.
-3. Build with CONFIG_RCU_NOCB_CPU=y and CONFIG_RCU_NOCB_CPU_ALL=y,
- which offloads all RCU callbacks to kthreads that can be moved
- off of CPUs susceptible to OS jitter. This approach prevents the
- rcuc/%u kthreads from having any work to do, so that they are
- never awakened.
+3. Build with CONFIG_RCU_NOCB_CPU=y and boot with the rcu_nocbs=
+ boot parameter offloading RCU callbacks from all CPUs susceptible
+ to OS jitter. This approach prevents the rcuc/%u kthreads from
+ having any work to do, so that they are never awakened.
4. Ensure that the CPU never enters the kernel, and, in particular,
avoid initiating any CPU hotplug operations on this CPU. This is
another way of preventing any callbacks from being queued on the
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 732f10ea382e..9d5e0f853f08 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -27,7 +27,7 @@ The purpose of this document is twofold:
(2) to provide a guide as to how to use the barriers that are available.
Note that an architecture can provide more than the minimum requirement
-for any particular barrier, but if the architecure provides less than
+for any particular barrier, but if the architecture provides less than
that, that architecture is incorrect.
Note also that it is possible that a barrier may be a no-op for an
diff --git a/Documentation/networking/scaling.txt b/Documentation/networking/scaling.txt
index 59f4db2a0c85..f55639d71d35 100644
--- a/Documentation/networking/scaling.txt
+++ b/Documentation/networking/scaling.txt
@@ -122,7 +122,7 @@ associated flow of the packet. The hash is either provided by hardware
or will be computed in the stack. Capable hardware can pass the hash in
the receive descriptor for the packet; this would usually be the same
hash used for RSS (e.g. computed Toeplitz hash). The hash is saved in
-skb->rx_hash and can be used elsewhere in the stack as a hash of the
+skb->hash and can be used elsewhere in the stack as a hash of the
packet’s flow.
Each receive hardware queue has an associated list of CPUs to which
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt
index cbc1b46cbf70..e89e36ec15a5 100644
--- a/Documentation/scheduler/sched-deadline.txt
+++ b/Documentation/scheduler/sched-deadline.txt
@@ -7,6 +7,8 @@ CONTENTS
0. WARNING
1. Overview
2. Scheduling algorithm
+ 2.1 Main algorithm
+ 2.2 Bandwidth reclaiming
3. Scheduling Real-Time Tasks
3.1 Definitions
3.2 Schedulability Analysis for Uniprocessor Systems
@@ -44,6 +46,9 @@ CONTENTS
2. Scheduling algorithm
==================
+2.1 Main algorithm
+------------------
+
SCHED_DEADLINE uses three parameters, named "runtime", "period", and
"deadline", to schedule tasks. A SCHED_DEADLINE task should receive
"runtime" microseconds of execution time every "period" microseconds, and
@@ -113,6 +118,160 @@ CONTENTS
remaining runtime = remaining runtime + runtime
+2.2 Bandwidth reclaiming
+------------------------
+
+ Bandwidth reclaiming for deadline tasks is based on the GRUB (Greedy
+ Reclamation of Unused Bandwidth) algorithm [15, 16, 17] and it is enabled
+ when the SCHED_FLAG_RECLAIM flag is set.
+
+ The following diagram illustrates the state names for tasks handled by GRUB:
+
+ ------------
+ (d) | Active |
+ ------------->| |
+ | | Contending |
+ | ------------
+ | A |
+ ---------- | |
+ | | | |
+ | Inactive | |(b) | (a)
+ | | | |
+ ---------- | |
+ A | V
+ | ------------
+ | | Active |
+ --------------| Non |
+ (c) | Contending |
+ ------------
+
+ A task can be in one of the following states:
+
+ - ActiveContending: if it is ready for execution (or executing);
+
+ - ActiveNonContending: if it just blocked and has not yet surpassed the 0-lag
+ time;
+
+ - Inactive: if it is blocked and has surpassed the 0-lag time.
+
+ State transitions:
+
+ (a) When a task blocks, it does not become immediately inactive since its
+ bandwidth cannot be immediately reclaimed without breaking the
+ real-time guarantees. It therefore enters a transitional state called
+ ActiveNonContending. The scheduler arms the "inactive timer" to fire at
+ the 0-lag time, when the task's bandwidth can be reclaimed without
+ breaking the real-time guarantees.
+
+ The 0-lag time for a task entering the ActiveNonContending state is
+ computed as
+
+ (runtime * dl_period)
+ deadline - ---------------------
+ dl_runtime
+
+ where runtime is the remaining runtime, while dl_runtime and dl_period
+ are the reservation parameters.
+
+ (b) If the task wakes up before the inactive timer fires, the task re-enters
+ the ActiveContending state and the "inactive timer" is canceled.
+ In addition, if the task wakes up on a different runqueue, then
+ the task's utilization must be removed from the previous runqueue's active
+ utilization and must be added to the new runqueue's active utilization.
+ In order to avoid races between a task waking up on a runqueue while the
+ "inactive timer" is running on a different CPU, the "dl_non_contending"
+ flag is used to indicate that a task is not on a runqueue but is active
+ (so, the flag is set when the task blocks and is cleared when the
+ "inactive timer" fires or when the task wakes up).
+
+ (c) When the "inactive timer" fires, the task enters the Inactive state and
+ its utilization is removed from the runqueue's active utilization.
+
+ (d) When an inactive task wakes up, it enters the ActiveContending state and
+ its utilization is added to the active utilization of the runqueue where
+ it has been enqueued.
+
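(As a worked instance of the 0-lag formula in transition (a): in the
two-task example later in this section, Task T1 has dl_runtime = 4 and
dl_period = 8 and blocks at t = 2 with remaining runtime 2 and absolute
deadline 8, so its 0-lag time is 8 - (2 * 8) / 4 = 4.)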
+ For each runqueue, the GRUB algorithm keeps track of two different bandwidths:
+
+ - Active bandwidth (running_bw): this is the sum of the bandwidths of all
+ tasks in active state (i.e., ActiveContending or ActiveNonContending);
+
+ - Total bandwidth (this_bw): this is the sum of the bandwidths of all the
+ tasks "belonging" to the runqueue, including the tasks in Inactive state.
+
+
+ The algorithm reclaims the bandwidth of the tasks in Inactive state.
+ It does so by decrementing the runtime of the executing task Ti at a pace equal
+ to
+
+ dq = -max{ Ui, (1 - Uinact) } dt
+
+ where Uinact is the inactive utilization, computed as (this_bw - running_bw),
+ and Ui is the bandwidth of task Ti.
+
+
+ Let's now see a trivial example of two deadline tasks with runtime equal
+ to 4 and period equal to 8 (i.e., bandwidth equal to 0.5):
+
+ A Task T1
+ |
+ | |
+ | |
+ |-------- |----
+ | | V
+ |---|---|---|---|---|---|---|---|--------->t
+ 0 1 2 3 4 5 6 7 8
+
+
+ A Task T2
+ |
+ | |
+ | |
+ | ------------------------|
+ | | V
+ |---|---|---|---|---|---|---|---|--------->t
+ 0 1 2 3 4 5 6 7 8
+
+
+ A running_bw
+ |
+ 1 ----------------- ------
+ | | |
+ 0.5- -----------------
+ | |
+ |---|---|---|---|---|---|---|---|--------->t
+ 0 1 2 3 4 5 6 7 8
+
+
+ - Time t = 0:
+
+ Both tasks are ready for execution and therefore in ActiveContending state.
+ Suppose Task T1 is the first task to start execution.
+ Since there are no inactive tasks, its runtime is decreased as dq = -1 dt.
+
+ - Time t = 2:
+
+ Suppose that Task T1 blocks.
+ Task T1 therefore enters the ActiveNonContending state. Since its remaining
+ runtime is equal to 2, its 0-lag time is equal to t = 4.
+ Task T2 starts execution, with runtime still decreased as dq = -1 dt since
+ there are no inactive tasks.
+
+ - Time t = 4:
+
+ This is the 0-lag time for Task T1. Since it has not woken up in the
+ meantime, it enters the Inactive state. Its bandwidth is removed from
+ running_bw.
+ Task T2 continues its execution. However, its runtime is now decreased as
+ dq = - 0.5 dt because Uinact = 0.5.
+ Task T2 therefore reclaims the bandwidth unused by Task T1.
+
+ - Time t = 8:
+
+ Task T1 wakes up. It enters the ActiveContending state again, and the
+ running_bw is incremented.
+
+
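To make the reclaiming rule of this section concrete, here is a minimal C
sketch of the per-tick charge it implies (purely illustrative: it is not the
kernel/sched/deadline.c implementation, which uses fixed-point arithmetic
rather than floating point, and the function and parameter names are made
up)::

	/*
	 * Charge 'delta' time units of execution to a SCHED_DEADLINE task
	 * with utilization u_i, running on a runqueue whose active and
	 * total bandwidths are running_bw and this_bw.  Implements
	 * dq = -max{ Ui, (1 - Uinact) } dt, with Uinact = this_bw - running_bw.
	 */
	static double grub_charge(double delta, double u_i,
				  double running_bw, double this_bw)
	{
		double u_inact = this_bw - running_bw;
		double factor = 1.0 - u_inact;

		if (u_i > factor)
			factor = u_i;

		return factor * delta;	/* amount to subtract from the runtime */
	}

In the two-task example above, while Task T1 is Inactive (Uinact = 0.5), the
running Task T2 (Ui = 0.5) is charged max{0.5, 1 - 0.5} = 0.5 of the elapsed
time, matching the dq = -0.5 dt rate described at t = 4.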
3. Scheduling Real-Time Tasks
=============================
@@ -330,6 +489,15 @@ CONTENTS
14 - J. Erickson, U. Devi and S. Baruah. Improved tardiness bounds for
Global EDF. Proceedings of the 22nd Euromicro Conference on
Real-Time Systems, 2010.
+ 15 - G. Lipari, S. Baruah, Greedy reclamation of unused bandwidth in
+ constant-bandwidth servers, 12th IEEE Euromicro Conference on Real-Time
+ Systems, 2000.
+ 16 - L. Abeni, J. Lelli, C. Scordino, L. Palopoli, Greedy CPU reclaiming for
+ SCHED DEADLINE. In Proceedings of the Real-Time Linux Workshop (RTLWS),
+ Dusseldorf, Germany, 2014.
+ 17 - L. Abeni, G. Lipari, A. Parri, Y. Sun, Multicore CPU reclaiming: parallel
+ or sequential?. In Proceedings of the 31st Annual ACM Symposium on Applied
+ Computing, 2016.
4. Bandwidth management
diff --git a/Documentation/timers/NO_HZ.txt b/Documentation/timers/NO_HZ.txt
index 6eaf576294f3..2dcaf9adb7a7 100644
--- a/Documentation/timers/NO_HZ.txt
+++ b/Documentation/timers/NO_HZ.txt
@@ -194,32 +194,9 @@ that the RCU callbacks are processed in a timely fashion.
Another approach is to offload RCU callback processing to "rcuo" kthreads
using the CONFIG_RCU_NOCB_CPU=y Kconfig option. The specific CPUs to
-offload may be selected via several methods:
-
-1. One of three mutually exclusive Kconfig options specify a
- build-time default for the CPUs to offload:
-
- a. The CONFIG_RCU_NOCB_CPU_NONE=y Kconfig option results in
- no CPUs being offloaded.
-
- b. The CONFIG_RCU_NOCB_CPU_ZERO=y Kconfig option causes
- CPU 0 to be offloaded.
-
- c. The CONFIG_RCU_NOCB_CPU_ALL=y Kconfig option causes all
- CPUs to be offloaded. Note that the callbacks will be
- offloaded to "rcuo" kthreads, and that those kthreads
- will in fact run on some CPU. However, this approach
- gives fine-grained control on exactly which CPUs the
- callbacks run on, along with their scheduling priority
- (including the default of SCHED_OTHER), and it further
- allows this control to be varied dynamically at runtime.
-
-2. The "rcu_nocbs=" kernel boot parameter, which takes a comma-separated
- list of CPUs and CPU ranges, for example, "1,3-5" selects CPUs 1,
- 3, 4, and 5. The specified CPUs will be offloaded in addition to
- any CPUs specified as offloaded by CONFIG_RCU_NOCB_CPU_ZERO=y or
- CONFIG_RCU_NOCB_CPU_ALL=y. This means that the "rcu_nocbs=" boot
- parameter has no effect for kernels built with RCU_NOCB_CPU_ALL=y.
+offload may be selected using the "rcu_nocbs=" kernel boot parameter,
+which takes a comma-separated list of CPUs and CPU ranges, for example,
+"1,3-5" selects CPUs 1, 3, 4, and 5.
The offloaded CPUs will never queue RCU callbacks, and therefore RCU
never prevents offloaded CPUs from entering either dyntick-idle mode
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 94a987bd2bc5..fff8ff6d4893 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -1609,7 +1609,7 @@ Doing the same with chrt -r 5 and function-trace set.
<idle>-0 3dN.2 14us : sched_avg_update <-__cpu_load_update
<idle>-0 3dN.2 14us : _raw_spin_unlock <-cpu_load_update_nohz
<idle>-0 3dN.2 14us : sub_preempt_count <-_raw_spin_unlock
- <idle>-0 3dN.1 15us : calc_load_exit_idle <-tick_nohz_idle_exit
+ <idle>-0 3dN.1 15us : calc_load_nohz_stop <-tick_nohz_idle_exit
<idle>-0 3dN.1 15us : touch_softlockup_watchdog <-tick_nohz_idle_exit
<idle>-0 3dN.1 15us : hrtimer_cancel <-tick_nohz_idle_exit
<idle>-0 3dN.1 15us : hrtimer_try_to_cancel <-hrtimer_cancel
diff --git a/MAINTAINERS b/MAINTAINERS
index 8b9b56d58065..d357695ee4fe 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2322,6 +2322,15 @@ F: Documentation/devicetree/bindings/input/atmel,maxtouch.txt
F: drivers/input/touchscreen/atmel_mxt_ts.c
F: include/linux/platform_data/atmel_mxt_ts.h
+ATOMIC INFRASTRUCTURE
+M: Will Deacon <will.deacon@arm.com>
+M: Peter Zijlstra <peterz@infradead.org>
+R: Boqun Feng <boqun.feng@gmail.com>
+L: linux-kernel@vger.kernel.org
+S: Maintained
+F: arch/*/include/asm/atomic*.h
+F: include/*/atomic*.h
+
ATTO EXPRESSSAS SAS/SATA RAID SCSI DRIVER
M: Bradley Grove <linuxdrivers@attotech.com>
L: linux-scsi@vger.kernel.org
@@ -2964,7 +2973,7 @@ F: sound/pci/oxygen/
C6X ARCHITECTURE
M: Mark Salter <msalter@redhat.com>
-M: Aurelien Jacquiot <a-jacquiot@ti.com>
+M: Aurelien Jacquiot <jacquiot.aurelien@gmail.com>
L: linux-c6x-dev@linux-c6x.org
W: http://www.linux-c6x.org/wiki/index.php/Main_Page
S: Maintained
@@ -7555,7 +7564,7 @@ S: Maintained
F: drivers/ata/sata_promise.*
LIBLOCKDEP
-M: Sasha Levin <sasha.levin@oracle.com>
+M: Sasha Levin <alexander.levin@verizon.com>
S: Maintained
F: tools/lib/lockdep/
diff --git a/Makefile b/Makefile
index 83f6d9972cab..283c6236438e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
VERSION = 4
PATCHLEVEL = 12
SUBLEVEL = 0
-EXTRAVERSION = -rc5
+EXTRAVERSION =
NAME = Fearless Coyote
# *DOCUMENTATION*
@@ -1437,7 +1437,7 @@ help:
@echo ' make V=0|1 [targets] 0 => quiet build (default), 1 => verbose build'
@echo ' make V=2 [targets] 2 => give reason for rebuild of target'
@echo ' make O=dir [targets] Locate all output files in "dir", including .config'
- @echo ' make C=1 [targets] Check all c source with $$CHECK (sparse by default)'
+ @echo ' make C=1 [targets] Check re-compiled c source with $$CHECK (sparse by default)'
@echo ' make C=2 [targets] Force check of all c source with $$CHECK'
@echo ' make RECORDMCOUNT_WARN=1 [targets] Warn about ignored mcount sections'
@echo ' make W=n [targets] Enable extra gcc checks, n=1,2,3 where'
diff --git a/arch/Kconfig b/arch/Kconfig
index 6c00e5b00f8b..f76b214cf7ad 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -867,4 +867,13 @@ config STRICT_MODULE_RWX
config ARCH_WANT_RELAX_ORDER
bool
+config REFCOUNT_FULL
+ bool "Perform full reference count validation at the expense of speed"
+ help
+ Enabling this switches the refcounting infrastructure from a fast
+ unchecked atomic_t implementation to a fully state checked
+ implementation, which can be (slightly) slower but provides protections
+ against various use-after-free conditions that can be used in
+ security flaw exploits.
+
source "kernel/gcov/Kconfig"
diff --git a/arch/arc/include/asm/processor.h b/arch/arc/include/asm/processor.h
index 6e1242da0159..4104a0839214 100644
--- a/arch/arc/include/asm/processor.h
+++ b/arch/arc/include/asm/processor.h
@@ -86,8 +86,6 @@ struct task_struct;
#define TSK_K_BLINK(tsk) TSK_K_REG(tsk, 4)
#define TSK_K_FP(tsk) TSK_K_REG(tsk, 0)
-#define thread_saved_pc(tsk) TSK_K_BLINK(tsk)
-
extern void start_thread(struct pt_regs * regs, unsigned long pc,
unsigned long usp);
diff --git a/arch/arc/mm/mmap.c b/arch/arc/mm/mmap.c
index 3e25e8d6486b..2e13683dfb24 100644
--- a/arch/arc/mm/mmap.c
+++ b/arch/arc/mm/mmap.c
@@ -65,7 +65,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 4c1a35f15838..6491be556ddc 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1416,6 +1416,7 @@ choice
config VMSPLIT_3G
bool "3G/1G user/kernel split"
config VMSPLIT_3G_OPT
+ depends on !ARM_LPAE
bool "3G/1G user/kernel split (for full 1G low memory)"
config VMSPLIT_2G
bool "2G/2G user/kernel split"
@@ -1637,7 +1638,7 @@ config ARCH_SELECT_MEMORY_MODEL
config HAVE_ARCH_PFN_VALID
def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM
-config HAVE_GENERIC_RCU_GUP
+config HAVE_GENERIC_GUP
def_bool y
depends on ARM_LPAE
@@ -2061,6 +2062,23 @@ config EFI
is only useful for kernels that may run on systems that have
UEFI firmware.
+config DMI
+ bool "Enable support for SMBIOS (DMI) tables"
+ depends on EFI
+ default y
+ help
+ This enables support for SMBIOS/DMI tables.
+
+ This option is only useful on systems that have UEFI firmware.
+ However, even with this option, the resultant kernel should
+ continue to boot on existing non-UEFI platforms.
+
+ NOTE: This does *NOT* enable or encourage the use of DMI quirks,
+ i.e., the practice of identifying the platform via DMI to
+ decide whether certain workarounds for buggy hardware and/or
+ firmware need to be enabled. This would require the DMI subsystem
+ to be enabled much earlier than we do on ARM, which is non-trivial.
+
endmenu
menu "CPU Power Management"
diff --git a/arch/arm/boot/compressed/efi-header.S b/arch/arm/boot/compressed/efi-header.S
index 3f7d1b74c5e0..a17ca8d78656 100644
--- a/arch/arm/boot/compressed/efi-header.S
+++ b/arch/arm/boot/compressed/efi-header.S
@@ -17,7 +17,8 @@
@ there.
.inst 'M' | ('Z' << 8) | (0x1310 << 16) @ tstne r0, #0x4d000
#else
- W(mov) r0, r0
+ AR_CLASS( mov r0, r0 )
+ M_CLASS( nop.w )
#endif
.endm
diff --git a/arch/arm/boot/dts/am335x-sl50.dts b/arch/arm/boot/dts/am335x-sl50.dts
index c5d2589c55fc..fc864a855991 100644
--- a/arch/arm/boot/dts/am335x-sl50.dts
+++ b/arch/arm/boot/dts/am335x-sl50.dts
@@ -220,7 +220,7 @@
mmc1_pins: pinmux_mmc1_pins {
pinctrl-single,pins = <
- AM33XX_IOPAD(0x960, PIN_INPUT | MUX_MODE7) /* spi0_cs1.gpio0_6 */
+ AM33XX_IOPAD(0x96c, PIN_INPUT | MUX_MODE7) /* uart0_rtsn.gpio1_9 */
>;
};
@@ -280,10 +280,6 @@
AM33XX_IOPAD(0x834, PIN_INPUT_PULLUP | MUX_MODE7) /* nKbdReset - gpmc_ad13.gpio1_13 */
AM33XX_IOPAD(0x838, PIN_INPUT_PULLUP | MUX_MODE7) /* nDispReset - gpmc_ad14.gpio1_14 */
AM33XX_IOPAD(0x844, PIN_INPUT_PULLUP | MUX_MODE7) /* USB1_enPower - gpmc_a1.gpio1_17 */
- /* AVR Programming - SPI Bus (bit bang) - Screen and Keyboard */
- AM33XX_IOPAD(0x954, PIN_INPUT_PULLUP | MUX_MODE7) /* Kbd/Disp/BattMOSI spi0_d0.gpio0_3 */
- AM33XX_IOPAD(0x958, PIN_INPUT_PULLUP | MUX_MODE7) /* Kbd/Disp/BattMISO spi0_d1.gpio0_4 */
- AM33XX_IOPAD(0x950, PIN_INPUT_PULLUP | MUX_MODE7) /* Kbd/Disp/BattSCLK spi0_clk.gpio0_2 */
/* PDI Bus - Battery system */
AM33XX_IOPAD(0x840, PIN_INPUT_PULLUP | MUX_MODE7) /* nBattReset gpmc_a0.gpio1_16 */
AM33XX_IOPAD(0x83c, PIN_INPUT_PULLUP | MUX_MODE7) /* BattPDIData gpmc_ad15.gpio1_15 */
@@ -384,7 +380,7 @@
pinctrl-names = "default";
pinctrl-0 = <&mmc1_pins>;
bus-width = <4>;
- cd-gpios = <&gpio0 6 GPIO_ACTIVE_LOW>;
+ cd-gpios = <&gpio1 9 GPIO_ACTIVE_LOW>;
vmmc-supply = <&vmmcsd_fixed>;
};
diff --git a/arch/arm/boot/dts/sunxi-h3-h5.dtsi b/arch/arm/boot/dts/sunxi-h3-h5.dtsi
index 1aeeacb3a884..d4f600dbb7eb 100644
--- a/arch/arm/boot/dts/sunxi-h3-h5.dtsi
+++ b/arch/arm/boot/dts/sunxi-h3-h5.dtsi
@@ -558,10 +558,11 @@
};
r_ccu: clock@1f01400 {
- compatible = "allwinner,sun50i-a64-r-ccu";
+ compatible = "allwinner,sun8i-h3-r-ccu";
reg = <0x01f01400 0x100>;
- clocks = <&osc24M>, <&osc32k>, <&iosc>;
- clock-names = "hosc", "losc", "iosc";
+ clocks = <&osc24M>, <&osc32k>, <&iosc>,
+ <&ccu 9>;
+ clock-names = "hosc", "losc", "iosc", "pll-periph";
#clock-cells = <1>;
#reset-cells = <1>;
};
diff --git a/arch/arm/include/asm/dmi.h b/arch/arm/include/asm/dmi.h
new file mode 100644
index 000000000000..df2d2ff06f5b
--- /dev/null
+++ b/arch/arm/include/asm/dmi.h
@@ -0,0 +1,19 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __ASM_DMI_H
+#define __ASM_DMI_H
+
+#include <linux/io.h>
+#include <linux/slab.h>
+
+#define dmi_early_remap(x, l) memremap(x, l, MEMREMAP_WB)
+#define dmi_early_unmap(x, l) memunmap(x)
+#define dmi_remap(x, l) memremap(x, l, MEMREMAP_WB)
+#define dmi_unmap(x) memunmap(x)
+#define dmi_alloc(l) kzalloc(l, GFP_KERNEL)
+
+#endif
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 32e1a9513dc7..4e80bf7420d4 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -315,7 +315,7 @@ static void __init cacheid_init(void)
if (arch >= CPU_ARCH_ARMv6) {
unsigned int cachetype = read_cpuid_cachetype();
- if ((arch == CPU_ARCH_ARMv7M) && !cachetype) {
+ if ((arch == CPU_ARCH_ARMv7M) && !(cachetype & 0xf000f)) {
cacheid = 0;
} else if ((cachetype & (7 << 29)) == 4 << 29) {
/* ARMv7 register format */
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 572a8df1b766..c9a0a5299827 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -555,8 +555,7 @@ static DEFINE_RAW_SPINLOCK(stop_lock);
*/
static void ipi_cpu_stop(unsigned int cpu)
{
- if (system_state == SYSTEM_BOOTING ||
- system_state == SYSTEM_RUNNING) {
+ if (system_state <= SYSTEM_RUNNING) {
raw_spin_lock(&stop_lock);
pr_crit("CPU%u: stopping\n", cpu);
dump_stack();
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 2239fde10b80..f0701d8d24df 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -90,7 +90,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
@@ -141,7 +141,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 31af3cb59a60..e46a6a446cdd 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -1218,15 +1218,15 @@ void __init adjust_lowmem_bounds(void)
high_memory = __va(arm_lowmem_limit - 1) + 1;
+ if (!memblock_limit)
+ memblock_limit = arm_lowmem_limit;
+
/*
* Round the memblock limit down to a pmd size. This
* helps to ensure that we will allocate memory from the
* last full pmd, which should be mapped.
*/
- if (memblock_limit)
- memblock_limit = round_down(memblock_limit, PMD_SIZE);
- if (!memblock_limit)
- memblock_limit = arm_lowmem_limit;
+ memblock_limit = round_down(memblock_limit, PMD_SIZE);
if (!IS_ENABLED(CONFIG_HIGHMEM) || cache_is_vipt_aliasing()) {
if (memblock_end_of_DRAM() > arm_lowmem_limit) {
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b2024db225a9..95c7ed392003 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -205,7 +205,7 @@ config GENERIC_CALIBRATE_DELAY
config ZONE_DMA
def_bool y
-config HAVE_GENERIC_RCU_GUP
+config HAVE_GENERIC_GUP
def_bool y
config ARCH_DMA_ADDR_T_64BIT
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
index c7f669f5884f..166c9ef884dc 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
@@ -406,8 +406,9 @@
r_ccu: clock@1f01400 {
compatible = "allwinner,sun50i-a64-r-ccu";
reg = <0x01f01400 0x100>;
- clocks = <&osc24M>, <&osc32k>, <&iosc>;
- clock-names = "hosc", "losc", "iosc";
+ clocks = <&osc24M>, <&osc32k>, <&iosc>,
+ <&ccu 11>;
+ clock-names = "hosc", "losc", "iosc", "pll-periph";
#clock-cells = <1>;
#reset-cells = <1>;
};
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi
index 4d314a253fd9..732e2e06f503 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi
@@ -40,7 +40,7 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "sunxi-h3-h5.dtsi"
+#include <arm/sunxi-h3-h5.dtsi>
/ {
cpus {
diff --git a/arch/arm64/boot/dts/allwinner/sunxi-h3-h5.dtsi b/arch/arm64/boot/dts/allwinner/sunxi-h3-h5.dtsi
deleted file mode 120000
index 036f01dc2b9b..000000000000
--- a/arch/arm64/boot/dts/allwinner/sunxi-h3-h5.dtsi
+++ /dev/null
@@ -1 +0,0 @@
-../../../../arm/boot/dts/sunxi-h3-h5.dtsi
\ No newline at end of file
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index 5d17f377d905..82cd07592519 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -11,7 +11,6 @@
*
*/
-#include <linux/dmi.h>
#include <linux/efi.h>
#include <linux/init.h>
@@ -117,20 +116,6 @@ int __init efi_set_mapping_permissions(struct mm_struct *mm,
set_permissions, md);
}
-static int __init arm64_dmi_init(void)
-{
- /*
- * On arm64, DMI depends on UEFI, and dmi_scan_machine() needs to
- * be called early because dmi_id_init(), which is an arch_initcall
- * itself, depends on dmi_scan_machine() having been called already.
- */
- dmi_scan_machine();
- if (dmi_available)
- dmi_set_dump_stack_arch_desc();
- return 0;
-}
-core_initcall(arm64_dmi_init);
-
/*
* UpdateCapsule() depends on the system being shutdown via
* ResetSystem().
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 6e0e16a3a7d4..321119881abf 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -961,8 +961,7 @@ void smp_send_stop(void)
cpumask_copy(&mask, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), &mask);
- if (system_state == SYSTEM_BOOTING ||
- system_state == SYSTEM_RUNNING)
+ if (system_state <= SYSTEM_RUNNING)
pr_crit("SMP: stopping secondary CPUs\n");
smp_cross_call(&mask, IPI_CPU_STOP);
}
diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index 41b6e31f8f55..d0cb007fa482 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -221,10 +221,11 @@ void update_vsyscall(struct timekeeper *tk)
/* tkr_mono.cycle_last == tkr_raw.cycle_last */
vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last;
vdso_data->raw_time_sec = tk->raw_time.tv_sec;
- vdso_data->raw_time_nsec = tk->raw_time.tv_nsec;
+ vdso_data->raw_time_nsec = (tk->raw_time.tv_nsec <<
+ tk->tkr_raw.shift) +
+ tk->tkr_raw.xtime_nsec;
vdso_data->xtime_clock_sec = tk->xtime_sec;
vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec;
- /* tkr_raw.xtime_nsec == 0 */
vdso_data->cs_mono_mult = tk->tkr_mono.mult;
vdso_data->cs_raw_mult = tk->tkr_raw.mult;
/* tkr_mono.shift == tkr_raw.shift */
diff --git a/arch/arm64/kernel/vdso/gettimeofday.S b/arch/arm64/kernel/vdso/gettimeofday.S
index e00b4671bd7c..76320e920965 100644
--- a/arch/arm64/kernel/vdso/gettimeofday.S
+++ b/arch/arm64/kernel/vdso/gettimeofday.S
@@ -256,7 +256,6 @@ monotonic_raw:
seqcnt_check fail=monotonic_raw
/* All computations are done with left-shifted nsecs. */
- lsl x14, x14, x12
get_nsec_per_sec res=x9
lsl x9, x9, x12
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 71f930501ade..c870d6f01ac2 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -36,6 +36,7 @@ int bpf_jit_enable __read_mostly;
#define TMP_REG_1 (MAX_BPF_JIT_REG + 0)
#define TMP_REG_2 (MAX_BPF_JIT_REG + 1)
#define TCALL_CNT (MAX_BPF_JIT_REG + 2)
+#define TMP_REG_3 (MAX_BPF_JIT_REG + 3)
/* Map BPF registers to A64 registers */
static const int bpf2a64[] = {
@@ -57,6 +58,7 @@ static const int bpf2a64[] = {
/* temporary registers for internal BPF JIT */
[TMP_REG_1] = A64_R(10),
[TMP_REG_2] = A64_R(11),
+ [TMP_REG_3] = A64_R(12),
/* tail_call_cnt */
[TCALL_CNT] = A64_R(26),
/* temporary register for blinding constants */
@@ -319,6 +321,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
const u8 src = bpf2a64[insn->src_reg];
const u8 tmp = bpf2a64[TMP_REG_1];
const u8 tmp2 = bpf2a64[TMP_REG_2];
+ const u8 tmp3 = bpf2a64[TMP_REG_3];
const s16 off = insn->off;
const s32 imm = insn->imm;
const int i = insn - ctx->prog->insnsi;
@@ -689,10 +692,10 @@ emit_cond_jmp:
emit(A64_PRFM(tmp, PST, L1, STRM), ctx);
emit(A64_LDXR(isdw, tmp2, tmp), ctx);
emit(A64_ADD(isdw, tmp2, tmp2, src), ctx);
- emit(A64_STXR(isdw, tmp2, tmp, tmp2), ctx);
+ emit(A64_STXR(isdw, tmp2, tmp, tmp3), ctx);
jmp_offset = -3;
check_imm19(jmp_offset);
- emit(A64_CBNZ(0, tmp2, jmp_offset), ctx);
+ emit(A64_CBNZ(0, tmp3, jmp_offset), ctx);
break;
/* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + imm)) */
diff --git a/arch/blackfin/include/asm/processor.h b/arch/blackfin/include/asm/processor.h
index 85d4af97c986..dbdbb8a558df 100644
--- a/arch/blackfin/include/asm/processor.h
+++ b/arch/blackfin/include/asm/processor.h
@@ -75,11 +75,6 @@ static inline void release_thread(struct task_struct *dead_task)
{
}
-/*
- * Return saved PC of a blocked thread.
- */
-#define thread_saved_pc(tsk) (tsk->thread.pc)
-
unsigned long get_wchan(struct task_struct *p);
#define KSTK_EIP(tsk) \
diff --git a/arch/c6x/include/asm/processor.h b/arch/c6x/include/asm/processor.h
index b9eb3da7f278..7c87b5be53b5 100644
--- a/arch/c6x/include/asm/processor.h
+++ b/arch/c6x/include/asm/processor.h
@@ -96,11 +96,6 @@ static inline void release_thread(struct task_struct *dead_task)
#define release_segments(mm) do { } while (0)
/*
- * saved PC of a blocked thread.
- */
-#define thread_saved_pc(tsk) (task_pt_regs(tsk)->pc)
-
-/*
* saved kernel SP and DP of a blocked thread.
*/
#ifdef _BIG_ENDIAN
diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c
index e299d30105b5..a2cdb1521aca 100644
--- a/arch/cris/arch-v10/kernel/process.c
+++ b/arch/cris/arch-v10/kernel/process.c
@@ -69,14 +69,6 @@ void hard_reset_now (void)
while(1) /* waiting for RETRIBUTION! */ ;
}
-/*
- * Return saved PC of a blocked thread.
- */
-unsigned long thread_saved_pc(struct task_struct *t)
-{
- return task_pt_regs(t)->irp;
-}
-
/* setup the child's kernel stack with a pt_regs and switch_stack on it.
* it will be un-nested during _resume and _ret_from_sys_call when the
* new thread is scheduled.
diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c
index c530a8fa87ce..fe87b383fbf3 100644
--- a/arch/cris/arch-v32/kernel/process.c
+++ b/arch/cris/arch-v32/kernel/process.c
@@ -85,14 +85,6 @@ hard_reset_now(void)
}
/*
- * Return saved PC of a blocked thread.
- */
-unsigned long thread_saved_pc(struct task_struct *t)
-{
- return task_pt_regs(t)->erp;
-}
-
-/*
* Setup the child's kernel stack with a pt_regs and call switch_stack() on it.
* It will be unnested during _resume and _ret_from_sys_call when the new thread
* is scheduled.
diff --git a/arch/cris/include/asm/processor.h b/arch/cris/include/asm/processor.h
index 15b815df29c1..bc2729e4b2c9 100644
--- a/arch/cris/include/asm/processor.h
+++ b/arch/cris/include/asm/processor.h
@@ -52,8 +52,6 @@ unsigned long get_wchan(struct task_struct *p);
#define KSTK_ESP(tsk) ((tsk) == current ? rdusp() : (tsk)->thread.usp)
-extern unsigned long thread_saved_pc(struct task_struct *tsk);
-
/* Free all resources held by a thread. */
static inline void release_thread(struct task_struct *dead_task)
{
diff --git a/arch/frv/include/asm/processor.h b/arch/frv/include/asm/processor.h
index ddaeb9cc9143..e4d08d74ed9f 100644
--- a/arch/frv/include/asm/processor.h
+++ b/arch/frv/include/asm/processor.h
@@ -96,11 +96,6 @@ extern asmlinkage void *restore_user_regs(const struct user_context *target, ...
#define release_segments(mm) do { } while (0)
#define forget_segments() do { } while (0)
-/*
- * Return saved PC of a blocked thread.
- */
-extern unsigned long thread_saved_pc(struct task_struct *tsk);
-
unsigned long get_wchan(struct task_struct *p);
#define KSTK_EIP(tsk) ((tsk)->thread.frame0->pc)
diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c
index 5a4c92abc99e..a957b374e3a6 100644
--- a/arch/frv/kernel/process.c
+++ b/arch/frv/kernel/process.c
@@ -198,15 +198,6 @@ unsigned long get_wchan(struct task_struct *p)
return 0;
}
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- /* Check whether the thread is blocked in resume() */
- if (in_sched_functions(tsk->thread.pc))
- return ((unsigned long *)tsk->thread.fp)[2];
- else
- return tsk->thread.pc;
-}
-
int elf_check_arch(const struct elf32_hdr *hdr)
{
unsigned long hsr0 = __get_HSR(0);
diff --git a/arch/frv/mm/elf-fdpic.c b/arch/frv/mm/elf-fdpic.c
index da82c25301e7..46aa289c5102 100644
--- a/arch/frv/mm/elf-fdpic.c
+++ b/arch/frv/mm/elf-fdpic.c
@@ -75,7 +75,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
addr = PAGE_ALIGN(addr);
vma = find_vma(current->mm, addr);
if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
goto success;
}
diff --git a/arch/h8300/include/asm/processor.h b/arch/h8300/include/asm/processor.h
index 65132d7ae9e5..afa53147e66a 100644
--- a/arch/h8300/include/asm/processor.h
+++ b/arch/h8300/include/asm/processor.h
@@ -110,10 +110,6 @@ static inline void release_thread(struct task_struct *dead_task)
{
}
-/*
- * Return saved PC of a blocked thread.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk);
unsigned long get_wchan(struct task_struct *p);
#define KSTK_EIP(tsk) \
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index 0f5db5bb561b..d1ddcabbbe83 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -129,11 +129,6 @@ int copy_thread(unsigned long clone_flags,
return 0;
}
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- return ((struct pt_regs *)tsk->thread.esp0)->pc;
-}
-
unsigned long get_wchan(struct task_struct *p)
{
unsigned long fp, pc;
diff --git a/arch/hexagon/include/asm/processor.h b/arch/hexagon/include/asm/processor.h
index 45a825402f63..ce67940860a5 100644
--- a/arch/hexagon/include/asm/processor.h
+++ b/arch/hexagon/include/asm/processor.h
@@ -33,9 +33,6 @@
/* task_struct, defined elsewhere, is the "process descriptor" */
struct task_struct;
-/* this is defined in arch/process.c */
-extern unsigned long thread_saved_pc(struct task_struct *tsk);
-
extern void start_thread(struct pt_regs *, unsigned long, unsigned long);
/*
diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c
index de715bab7956..656050c2e6a0 100644
--- a/arch/hexagon/kernel/process.c
+++ b/arch/hexagon/kernel/process.c
@@ -61,14 +61,6 @@ void arch_cpu_idle(void)
}
/*
- * Return saved PC of a blocked thread
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- return 0;
-}
-
-/*
* Copy architecture-specific thread state
*/
int copy_thread(unsigned long clone_flags, unsigned long usp,
diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h
index 26a63d69c599..ab982f07ea68 100644
--- a/arch/ia64/include/asm/processor.h
+++ b/arch/ia64/include/asm/processor.h
@@ -602,23 +602,6 @@ ia64_set_unat (__u64 *unat, void *spill_addr, unsigned long nat)
}
/*
- * Return saved PC of a blocked thread.
- * Note that the only way T can block is through a call to schedule() -> switch_to().
- */
-static inline unsigned long
-thread_saved_pc (struct task_struct *t)
-{
- struct unw_frame_info info;
- unsigned long ip;
-
- unw_init_from_blocked_task(&info, t);
- if (unw_unwind(&info) < 0)
- return 0;
- unw_get_ip(&info, &ip);
- return ip;
-}
-
-/*
* Get the current instruction/program counter value.
*/
#define current_text_addr() \
diff --git a/arch/m32r/include/asm/processor.h b/arch/m32r/include/asm/processor.h
index 5767367550c6..657874eeeccc 100644
--- a/arch/m32r/include/asm/processor.h
+++ b/arch/m32r/include/asm/processor.h
@@ -122,8 +122,6 @@ extern void release_thread(struct task_struct *);
extern void copy_segments(struct task_struct *p, struct mm_struct * mm);
extern void release_segments(struct mm_struct * mm);
-extern unsigned long thread_saved_pc(struct task_struct *);
-
/* Copy and release all segment info associated with a VM */
#define copy_segments(p, mm) do { } while (0)
#define release_segments(mm) do { } while (0)
diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c
index d8ffcfec599c..8cd7e03f4370 100644
--- a/arch/m32r/kernel/process.c
+++ b/arch/m32r/kernel/process.c
@@ -39,14 +39,6 @@
#include <linux/err.h>
-/*
- * Return saved PC of a blocked thread.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- return tsk->thread.lr;
-}
-
void (*pm_power_off)(void) = NULL;
EXPORT_SYMBOL(pm_power_off);
diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h
index 77239e81379b..94c36030440c 100644
--- a/arch/m68k/include/asm/processor.h
+++ b/arch/m68k/include/asm/processor.h
@@ -130,8 +130,6 @@ static inline void release_thread(struct task_struct *dead_task)
{
}
-extern unsigned long thread_saved_pc(struct task_struct *tsk);
-
unsigned long get_wchan(struct task_struct *p);
#define KSTK_EIP(tsk) \
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index e475c945c8b2..7df92f8b0781 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -40,20 +40,6 @@
asmlinkage void ret_from_fork(void);
asmlinkage void ret_from_kernel_thread(void);
-
-/*
- * Return saved PC from a blocked thread
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- struct switch_stack *sw = (struct switch_stack *)tsk->thread.ksp;
- /* Check whether the thread is blocked in resume() */
- if (in_sched_functions(sw->retpc))
- return ((unsigned long *)sw->a6)[1];
- else
- return sw->retpc;
-}
-
void arch_cpu_idle(void)
{
#if defined(MACH_ATARI_ONLY)
diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c
index 232a12bf3f99..2dbbb7c66043 100644
--- a/arch/metag/kernel/smp.c
+++ b/arch/metag/kernel/smp.c
@@ -567,8 +567,7 @@ static void stop_this_cpu(void *data)
{
unsigned int cpu = smp_processor_id();
- if (system_state == SYSTEM_BOOTING ||
- system_state == SYSTEM_RUNNING) {
+ if (system_state <= SYSTEM_RUNNING) {
spin_lock(&stop_lock);
pr_crit("CPU%u: stopping\n", cpu);
dump_stack();
diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h
index 37ef196e4519..330d556860ba 100644
--- a/arch/microblaze/include/asm/processor.h
+++ b/arch/microblaze/include/asm/processor.h
@@ -69,8 +69,6 @@ static inline void release_thread(struct task_struct *dead_task)
{
}
-extern unsigned long thread_saved_pc(struct task_struct *t);
-
extern unsigned long get_wchan(struct task_struct *p);
# define KSTK_EIP(tsk) (0)
@@ -121,10 +119,6 @@ static inline void release_thread(struct task_struct *dead_task)
{
}
-/* Return saved (kernel) PC of a blocked thread. */
-# define thread_saved_pc(tsk) \
- ((tsk)->thread.regs ? (tsk)->thread.regs->r15 : 0)
-
unsigned long get_wchan(struct task_struct *p);
/* The size allocated for kernel stacks. This _must_ be a power of two! */
diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c
index e92a817e645f..6527ec22f158 100644
--- a/arch/microblaze/kernel/process.c
+++ b/arch/microblaze/kernel/process.c
@@ -119,23 +119,6 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
return 0;
}
-#ifndef CONFIG_MMU
-/*
- * Return saved PC of a blocked thread.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- struct cpu_context *ctx =
- &(((struct thread_info *)(tsk->stack))->cpu_context);
-
- /* Check whether the thread is blocked in resume() */
- if (in_sched_functions(ctx->r15))
- return (unsigned long)ctx->r15;
- else
- return ctx->r14;
-}
-#endif
-
unsigned long get_wchan(struct task_struct *p)
{
/* TBD (used by procfs) */
diff --git a/arch/mips/boot/Makefile b/arch/mips/boot/Makefile
index 2728a9a9c7c5..145b5ce8eb7e 100644
--- a/arch/mips/boot/Makefile
+++ b/arch/mips/boot/Makefile
@@ -128,19 +128,19 @@ quiet_cmd_cpp_its_S = ITS $@
-DADDR_BITS=$(ADDR_BITS) \
-DADDR_CELLS=$(itb_addr_cells)
-$(obj)/vmlinux.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S FORCE
+$(obj)/vmlinux.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S $(VMLINUX) FORCE
$(call if_changed_dep,cpp_its_S,none,vmlinux.bin)
-$(obj)/vmlinux.gz.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S FORCE
+$(obj)/vmlinux.gz.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S $(VMLINUX) FORCE
$(call if_changed_dep,cpp_its_S,gzip,vmlinux.bin.gz)
-$(obj)/vmlinux.bz2.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S FORCE
+$(obj)/vmlinux.bz2.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S $(VMLINUX) FORCE
$(call if_changed_dep,cpp_its_S,bzip2,vmlinux.bin.bz2)
-$(obj)/vmlinux.lzma.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S FORCE
+$(obj)/vmlinux.lzma.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S $(VMLINUX) FORCE
$(call if_changed_dep,cpp_its_S,lzma,vmlinux.bin.lzma)
-$(obj)/vmlinux.lzo.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S FORCE
+$(obj)/vmlinux.lzo.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S $(VMLINUX) FORCE
$(call if_changed_dep,cpp_its_S,lzo,vmlinux.bin.lzo)
quiet_cmd_itb-image = ITB $@
diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h
index d34536e7653f..279b6d14ffeb 100644
--- a/arch/mips/include/asm/highmem.h
+++ b/arch/mips/include/asm/highmem.h
@@ -35,7 +35,12 @@ extern pte_t *pkmap_page_table;
* easily, subsequent pte tables have to be allocated in one physical
* chunk of RAM.
*/
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+#define LAST_PKMAP 512
+#else
#define LAST_PKMAP 1024
+#endif
+
#define LAST_PKMAP_MASK (LAST_PKMAP-1)
#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
diff --git a/arch/mips/include/asm/kprobes.h b/arch/mips/include/asm/kprobes.h
index 291846d9ba83..ad1a99948f27 100644
--- a/arch/mips/include/asm/kprobes.h
+++ b/arch/mips/include/asm/kprobes.h
@@ -43,7 +43,8 @@ typedef union mips_instruction kprobe_opcode_t;
#define flush_insn_slot(p) \
do { \
- flush_icache_range((unsigned long)p->addr, \
+ if (p->addr) \
+ flush_icache_range((unsigned long)p->addr, \
(unsigned long)p->addr + \
(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))); \
} while (0)
diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h
index 6f94bed571c4..74afe8c76bdd 100644
--- a/arch/mips/include/asm/pgtable-32.h
+++ b/arch/mips/include/asm/pgtable-32.h
@@ -19,6 +19,10 @@
#define __ARCH_USE_5LEVEL_HACK
#include <asm-generic/pgtable-nopmd.h>
+#ifdef CONFIG_HIGHMEM
+#include <asm/highmem.h>
+#endif
+
extern int temp_tlb_entry;
/*
@@ -62,7 +66,8 @@ extern int add_temporary_entry(unsigned long entrylo0, unsigned long entrylo1,
#define VMALLOC_START MAP_BASE
-#define PKMAP_BASE (0xfe000000UL)
+#define PKMAP_END ((FIXADDR_START) & ~((LAST_PKMAP << PAGE_SHIFT)-1))
+#define PKMAP_BASE (PKMAP_END - PAGE_SIZE * LAST_PKMAP)
#ifdef CONFIG_HIGHMEM
# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c
index b11facd11c9d..f702a459a830 100644
--- a/arch/mips/kernel/branch.c
+++ b/arch/mips/kernel/branch.c
@@ -804,8 +804,10 @@ int __compute_return_epc_for_insn(struct pt_regs *regs,
break;
}
/* Compact branch: BNEZC || JIALC */
- if (insn.i_format.rs)
+ if (!insn.i_format.rs) {
+ /* JIALC: set $31/ra */
regs->regs[31] = epc + 4;
+ }
regs->cp0_epc += 8;
break;
#endif
diff --git a/arch/mips/kernel/entry.S b/arch/mips/kernel/entry.S
index 8d83fc2a96b7..38a302919e6b 100644
--- a/arch/mips/kernel/entry.S
+++ b/arch/mips/kernel/entry.S
@@ -11,6 +11,7 @@
#include <asm/asm.h>
#include <asm/asmmacro.h>
#include <asm/compiler.h>
+#include <asm/irqflags.h>
#include <asm/regdef.h>
#include <asm/mipsregs.h>
#include <asm/stackframe.h>
@@ -119,6 +120,7 @@ work_pending:
andi t0, a2, _TIF_NEED_RESCHED # a2 is preloaded with TI_FLAGS
beqz t0, work_notifysig
work_resched:
+ TRACE_IRQS_OFF
jal schedule
local_irq_disable # make sure need_resched and
@@ -155,6 +157,7 @@ syscall_exit_work:
beqz t0, work_pending # trace bit set?
local_irq_enable # could let syscall_trace_leave()
# call schedule() instead
+ TRACE_IRQS_ON
move a0, sp
jal syscall_trace_leave
b resume_userspace
diff --git a/arch/mips/kernel/ftrace.c b/arch/mips/kernel/ftrace.c
index 30a3b75e88eb..9d9b8fbae202 100644
--- a/arch/mips/kernel/ftrace.c
+++ b/arch/mips/kernel/ftrace.c
@@ -38,20 +38,6 @@ void arch_ftrace_update_code(int command)
#endif
-/*
- * Check if the address is in kernel space
- *
- * Clone core_kernel_text() from kernel/extable.c, but doesn't call
- * init_kernel_text() for Ftrace doesn't trace functions in init sections.
- */
-static inline int in_kernel_space(unsigned long ip)
-{
- if (ip >= (unsigned long)_stext &&
- ip <= (unsigned long)_etext)
- return 1;
- return 0;
-}
-
#ifdef CONFIG_DYNAMIC_FTRACE
#define JAL 0x0c000000 /* jump & link: ip --> ra, jump to target */
@@ -198,7 +184,7 @@ int ftrace_make_nop(struct module *mod,
* If ip is in kernel space, no long call, otherwise, long call is
* needed.
*/
- new = in_kernel_space(ip) ? INSN_NOP : INSN_B_1F;
+ new = core_kernel_text(ip) ? INSN_NOP : INSN_B_1F;
#ifdef CONFIG_64BIT
return ftrace_modify_code(ip, new);
#else
@@ -218,12 +204,12 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
unsigned int new;
unsigned long ip = rec->ip;
- new = in_kernel_space(ip) ? insn_jal_ftrace_caller : insn_la_mcount[0];
+ new = core_kernel_text(ip) ? insn_jal_ftrace_caller : insn_la_mcount[0];
#ifdef CONFIG_64BIT
return ftrace_modify_code(ip, new);
#else
- return ftrace_modify_code_2r(ip, new, in_kernel_space(ip) ?
+ return ftrace_modify_code_2r(ip, new, core_kernel_text(ip) ?
INSN_NOP : insn_la_mcount[1]);
#endif
}
@@ -289,7 +275,7 @@ unsigned long ftrace_get_parent_ra_addr(unsigned long self_ra, unsigned long
* instruction "lui v1, hi_16bit_of_mcount"(offset is 24), but for
* kernel, move after the instruction "move ra, at"(offset is 16)
*/
- ip = self_ra - (in_kernel_space(self_ra) ? 16 : 24);
+ ip = self_ra - (core_kernel_text(self_ra) ? 16 : 24);
/*
* search the text until finding the non-store instruction or "s{d,w}
@@ -394,7 +380,7 @@ void prepare_ftrace_return(unsigned long *parent_ra_addr, unsigned long self_ra,
* entries configured through the tracing/set_graph_function interface.
*/
- insns = in_kernel_space(self_ra) ? 2 : MCOUNT_OFFSET_INSNS + 1;
+ insns = core_kernel_text(self_ra) ? 2 : MCOUNT_OFFSET_INSNS + 1;
trace.func = self_ra - (MCOUNT_INSN_SIZE * insns);
/* Only trace if the calling function expects to */
diff --git a/arch/mips/kernel/head.S b/arch/mips/kernel/head.S
index cf052204eb0a..d1bb506adc10 100644
--- a/arch/mips/kernel/head.S
+++ b/arch/mips/kernel/head.S
@@ -106,8 +106,8 @@ NESTED(kernel_entry, 16, sp) # kernel entry point
beq t0, t1, dtb_found
#endif
li t1, -2
- beq a0, t1, dtb_found
move t2, a1
+ beq a0, t1, dtb_found
li t2, 0
dtb_found:
diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c
index 313a88b2973f..f3e301f95aef 100644
--- a/arch/mips/kernel/perf_event_mipsxx.c
+++ b/arch/mips/kernel/perf_event_mipsxx.c
@@ -1597,7 +1597,6 @@ static const struct mips_perf_event *mipsxx_pmu_map_raw_event(u64 config)
break;
case CPU_P5600:
case CPU_P6600:
- case CPU_I6400:
/* 8-bit event numbers */
raw_id = config & 0x1ff;
base_id = raw_id & 0xff;
@@ -1610,6 +1609,11 @@ static const struct mips_perf_event *mipsxx_pmu_map_raw_event(u64 config)
raw_event.range = P;
#endif
break;
+ case CPU_I6400:
+ /* 8-bit event numbers */
+ base_id = config & 0xff;
+ raw_event.cntr_mask = CNTR_EVEN | CNTR_ODD;
+ break;
case CPU_1004K:
if (IS_BOTH_COUNTERS_1004K_EVENT(base_id))
raw_event.cntr_mask = CNTR_EVEN | CNTR_ODD;
diff --git a/arch/mips/kernel/pm-cps.c b/arch/mips/kernel/pm-cps.c
index 5f928c34c148..d99416094ba9 100644
--- a/arch/mips/kernel/pm-cps.c
+++ b/arch/mips/kernel/pm-cps.c
@@ -56,7 +56,6 @@ DECLARE_BITMAP(state_support, CPS_PM_STATE_COUNT);
* state. Actually per-core rather than per-CPU.
*/
static DEFINE_PER_CPU_ALIGNED(u32*, ready_count);
-static DEFINE_PER_CPU_ALIGNED(void*, ready_count_alloc);
/* Indicates online CPUs coupled with the current CPU */
static DEFINE_PER_CPU_ALIGNED(cpumask_t, online_coupled);
@@ -642,7 +641,6 @@ static int cps_pm_online_cpu(unsigned int cpu)
{
enum cps_pm_state state;
unsigned core = cpu_data[cpu].core;
- unsigned dlinesz = cpu_data[cpu].dcache.linesz;
void *entry_fn, *core_rc;
for (state = CPS_PM_NC_WAIT; state < CPS_PM_STATE_COUNT; state++) {
@@ -662,16 +660,11 @@ static int cps_pm_online_cpu(unsigned int cpu)
}
if (!per_cpu(ready_count, core)) {
- core_rc = kmalloc(dlinesz * 2, GFP_KERNEL);
+ core_rc = kmalloc(sizeof(u32), GFP_KERNEL);
if (!core_rc) {
pr_err("Failed allocate core %u ready_count\n", core);
return -ENOMEM;
}
- per_cpu(ready_count_alloc, core) = core_rc;
-
- /* Ensure ready_count is aligned to a cacheline boundary */
- core_rc += dlinesz - 1;
- core_rc = (void *)((unsigned long)core_rc & ~(dlinesz - 1));
per_cpu(ready_count, core) = core_rc;
}
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 9681b5877140..38dfa27730ff 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -201,6 +201,8 @@ void show_stack(struct task_struct *task, unsigned long *sp)
{
struct pt_regs regs;
mm_segment_t old_fs = get_fs();
+
+ regs.cp0_status = KSU_KERNEL;
if (sp) {
regs.regs[29] = (unsigned long)sp;
regs.regs[31] = 0;
diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index 7c6336dd2638..7cd92166a0b9 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -166,7 +166,11 @@ static int _kvm_mips_host_tlb_inv(unsigned long entryhi)
int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va,
bool user, bool kernel)
{
- int idx_user, idx_kernel;
+ /*
+ * Initialize idx_user and idx_kernel to work around a bogus
+ * maybe-initialized warning when using GCC 6.
+ */
+ int idx_user = 0, idx_kernel = 0;
unsigned long flags, old_entryhi;
local_irq_save(flags);
diff --git a/arch/mips/math-emu/dp_maddf.c b/arch/mips/math-emu/dp_maddf.c
index 4a2d03c72959..caa62f20a888 100644
--- a/arch/mips/math-emu/dp_maddf.c
+++ b/arch/mips/math-emu/dp_maddf.c
@@ -54,7 +54,7 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x,
return ieee754dp_nanxcpt(z);
case IEEE754_CLASS_DNORM:
DPDNORMZ;
- /* QNAN is handled separately below */
+ /* QNAN and ZERO cases are handled separately below */
}
switch (CLPAIR(xc, yc)) {
@@ -210,6 +210,9 @@ static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x,
}
assert(rm & (DP_HIDDEN_BIT << 3));
+ if (zc == IEEE754_CLASS_ZERO)
+ return ieee754dp_format(rs, re, rm);
+
/* And now the addition */
assert(zm & DP_HIDDEN_BIT);
diff --git a/arch/mips/math-emu/sp_maddf.c b/arch/mips/math-emu/sp_maddf.c
index a8cd8b4f235e..c91d5e5d9b5f 100644
--- a/arch/mips/math-emu/sp_maddf.c
+++ b/arch/mips/math-emu/sp_maddf.c
@@ -54,7 +54,7 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x,
return ieee754sp_nanxcpt(z);
case IEEE754_CLASS_DNORM:
SPDNORMZ;
- /* QNAN is handled separately below */
+ /* QNAN and ZERO cases are handled separately below */
}
switch (CLPAIR(xc, yc)) {
@@ -203,6 +203,9 @@ static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x,
}
assert(rm & (SP_HIDDEN_BIT << 3));
+ if (zc == IEEE754_CLASS_ZERO)
+ return ieee754sp_format(rs, re, rm);
+
/* And now the addition */
assert(zm & SP_HIDDEN_BIT);
diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c
index fe8df14b6169..e08598c70b3e 100644
--- a/arch/mips/mm/dma-default.c
+++ b/arch/mips/mm/dma-default.c
@@ -68,12 +68,25 @@ static inline struct page *dma_addr_to_page(struct device *dev,
* systems and only the R10000 and R12000 are used in such systems, the
* SGI IP28 Indigo² resp. SGI IP32 aka O2.
*/
-static inline int cpu_needs_post_dma_flush(struct device *dev)
+static inline bool cpu_needs_post_dma_flush(struct device *dev)
{
- return !plat_device_is_coherent(dev) &&
- (boot_cpu_type() == CPU_R10000 ||
- boot_cpu_type() == CPU_R12000 ||
- boot_cpu_type() == CPU_BMIPS5000);
+ if (plat_device_is_coherent(dev))
+ return false;
+
+ switch (boot_cpu_type()) {
+ case CPU_R10000:
+ case CPU_R12000:
+ case CPU_BMIPS5000:
+ return true;
+
+ default:
+ /*
+ * Presence of MAARs suggests that the CPU supports
+ * speculatively prefetching data, and therefore requires
+ * the post-DMA flush/invalidate.
+ */
+ return cpu_has_maar;
+ }
}
static gfp_t massage_gfp_flags(const struct device *dev, gfp_t gfp)
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index 64dd8bdd92c3..28adeabe851f 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -93,7 +93,7 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
diff --git a/arch/mips/mm/pgtable-32.c b/arch/mips/mm/pgtable-32.c
index adc6911ba748..b19a3c506b1e 100644
--- a/arch/mips/mm/pgtable-32.c
+++ b/arch/mips/mm/pgtable-32.c
@@ -51,15 +51,15 @@ void __init pagetable_init(void)
/*
* Fixed mappings:
*/
- vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
- fixrange_init(vaddr, vaddr + FIXADDR_SIZE, pgd_base);
+ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1);
+ fixrange_init(vaddr & PMD_MASK, vaddr + FIXADDR_SIZE, pgd_base);
#ifdef CONFIG_HIGHMEM
/*
* Permanent kmaps:
*/
vaddr = PKMAP_BASE;
- fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
+ fixrange_init(vaddr & PMD_MASK, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
pgd = swapper_pg_dir + __pgd_offset(vaddr);
pud = pud_offset(pgd, vaddr);
diff --git a/arch/mn10300/include/asm/processor.h b/arch/mn10300/include/asm/processor.h
index 18e17abf7664..3ae479117b42 100644
--- a/arch/mn10300/include/asm/processor.h
+++ b/arch/mn10300/include/asm/processor.h
@@ -132,11 +132,6 @@ static inline void start_thread(struct pt_regs *regs,
/* Free all resources held by a thread. */
extern void release_thread(struct task_struct *);
-/*
- * Return saved PC of a blocked thread.
- */
-extern unsigned long thread_saved_pc(struct task_struct *tsk);
-
unsigned long get_wchan(struct task_struct *p);
#define task_pt_regs(task) ((task)->thread.uregs)
diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c
index c9fa42619c6a..89e8027e07fb 100644
--- a/arch/mn10300/kernel/process.c
+++ b/arch/mn10300/kernel/process.c
@@ -40,14 +40,6 @@
#include "internal.h"
/*
- * return saved PC of a blocked thread.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- return ((unsigned long *) tsk->thread.sp)[3];
-}
-
-/*
* power off function, if any
*/
void (*pm_power_off)(void);
diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h
index 3bbbc3d798e5..4944e2e1d8b0 100644
--- a/arch/nios2/include/asm/processor.h
+++ b/arch/nios2/include/asm/processor.h
@@ -75,9 +75,6 @@ static inline void release_thread(struct task_struct *dead_task)
{
}
-/* Return saved PC of a blocked thread. */
-#define thread_saved_pc(tsk) ((tsk)->thread.kregs->ea)
-
extern unsigned long get_wchan(struct task_struct *p);
#define task_pt_regs(p) \
diff --git a/arch/openrisc/include/asm/processor.h b/arch/openrisc/include/asm/processor.h
index a908e6c30a00..396d8f306c21 100644
--- a/arch/openrisc/include/asm/processor.h
+++ b/arch/openrisc/include/asm/processor.h
@@ -84,11 +84,6 @@ void start_thread(struct pt_regs *regs, unsigned long nip, unsigned long sp);
void release_thread(struct task_struct *);
unsigned long get_wchan(struct task_struct *p);
-/*
- * Return saved PC of a blocked thread. For now, this is the "user" PC
- */
-extern unsigned long thread_saved_pc(struct task_struct *t);
-
#define init_stack (init_thread_union.stack)
#define cpu_relax() barrier()
diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c
index 106859ae27ff..f9b77003f113 100644
--- a/arch/openrisc/kernel/process.c
+++ b/arch/openrisc/kernel/process.c
@@ -110,11 +110,6 @@ void show_regs(struct pt_regs *regs)
show_registers(regs);
}
-unsigned long thread_saved_pc(struct task_struct *t)
-{
- return (unsigned long)user_regs(t->stack)->pc;
-}
-
void release_thread(struct task_struct *dead_task)
{
}
diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h
index a3661ee6b060..4c6694b4e77e 100644
--- a/arch/parisc/include/asm/processor.h
+++ b/arch/parisc/include/asm/processor.h
@@ -163,12 +163,7 @@ struct thread_struct {
.flags = 0 \
}
-/*
- * Return saved PC of a blocked thread. This is used by ps mostly.
- */
-
struct task_struct;
-unsigned long thread_saved_pc(struct task_struct *t);
void show_trace(struct task_struct *task, unsigned long *stack);
/*
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index 4516a5b53f38..b64d7d21646e 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -239,11 +239,6 @@ copy_thread(unsigned long clone_flags, unsigned long usp,
return 0;
}
-unsigned long thread_saved_pc(struct task_struct *t)
-{
- return t->thread.regs.kpc;
-}
-
unsigned long
get_wchan(struct task_struct *p)
{
diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c
index e5288638a1d9..378a754ca186 100644
--- a/arch/parisc/kernel/sys_parisc.c
+++ b/arch/parisc/kernel/sys_parisc.c
@@ -90,7 +90,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma, *prev;
unsigned long task_size = TASK_SIZE;
int do_color_align, last_mmap;
struct vm_unmapped_area_info info;
@@ -117,9 +117,10 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
else
addr = PAGE_ALIGN(addr);
- vma = find_vma(mm, addr);
+ vma = find_vma_prev(mm, addr, &prev);
if (task_size - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)) &&
+ (!prev || addr >= vm_end_gap(prev)))
goto found_addr;
}
@@ -143,7 +144,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
const unsigned long len, const unsigned long pgoff,
const unsigned long flags)
{
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma, *prev;
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
int do_color_align, last_mmap;
@@ -177,9 +178,11 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
addr = COLOR_ALIGN(addr, last_mmap, pgoff);
else
addr = PAGE_ALIGN(addr);
- vma = find_vma(mm, addr);
+
+ vma = find_vma_prev(mm, addr, &prev);
if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)) &&
+ (!prev || addr >= vm_end_gap(prev)))
goto found_addr;
}
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index bf4391d18923..6189238e69f8 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -184,7 +184,7 @@ config PPC
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
select HAVE_GCC_PLUGINS
- select HAVE_GENERIC_RCU_GUP
+ select HAVE_GENERIC_GUP
select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx)
select HAVE_IDE
select HAVE_IOREMAP_PROT
diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
index f2c562a0a427..0151af6c2a50 100644
--- a/arch/powerpc/include/asm/bug.h
+++ b/arch/powerpc/include/asm/bug.h
@@ -104,7 +104,7 @@
"1: "PPC_TLNEI" %4,0\n" \
_EMIT_BUG_ENTRY \
: : "i" (__FILE__), "i" (__LINE__), \
- "i" (BUGFLAG_TAINT(TAINT_WARN)), \
+ "i" (BUGFLAG_WARNING|BUGFLAG_TAINT(TAINT_WARN)),\
"i" (sizeof(struct bug_entry)), \
"r" (__ret_warn_on)); \
} \
diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h
index a83821f33ea3..8814a7249ceb 100644
--- a/arch/powerpc/include/asm/kprobes.h
+++ b/arch/powerpc/include/asm/kprobes.h
@@ -103,6 +103,7 @@ extern int kprobe_exceptions_notify(struct notifier_block *self,
extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
extern int kprobe_handler(struct pt_regs *regs);
extern int kprobe_post_handler(struct pt_regs *regs);
+extern int is_current_kprobe_addr(unsigned long addr);
#ifdef CONFIG_KPROBES_ON_FTRACE
extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
struct kprobe_ctlblk *kcb);
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index bb99b651085a..1189d04f3bd1 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -378,12 +378,6 @@ struct thread_struct {
}
#endif
-/*
- * Return saved PC of a blocked thread. For now, this is the "user" PC
- */
-#define thread_saved_pc(tsk) \
- ((tsk)->thread.regs? (tsk)->thread.regs->nip: 0)
-
#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.regs)
unsigned long get_wchan(struct task_struct *p);
diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 5c0d8a8cdae5..41e88d3ce36b 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -267,13 +267,7 @@ do { \
extern unsigned long __copy_tofrom_user(void __user *to,
const void __user *from, unsigned long size);
-#ifndef __powerpc64__
-
-#define INLINE_COPY_FROM_USER
-#define INLINE_COPY_TO_USER
-
-#else /* __powerpc64__ */
-
+#ifdef __powerpc64__
static inline unsigned long
raw_copy_in_user(void __user *to, const void __user *from, unsigned long n)
{
diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index c8a822acf962..c23ff4389ca2 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -94,11 +94,13 @@ struct xive_q {
* store at 0 and some ESBs support doing a trigger via a
* separate trigger page.
*/
-#define XIVE_ESB_GET 0x800
-#define XIVE_ESB_SET_PQ_00 0xc00
-#define XIVE_ESB_SET_PQ_01 0xd00
-#define XIVE_ESB_SET_PQ_10 0xe00
-#define XIVE_ESB_SET_PQ_11 0xf00
+#define XIVE_ESB_STORE_EOI 0x400 /* Store */
+#define XIVE_ESB_LOAD_EOI 0x000 /* Load */
+#define XIVE_ESB_GET 0x800 /* Load */
+#define XIVE_ESB_SET_PQ_00 0xc00 /* Load */
+#define XIVE_ESB_SET_PQ_01 0xd00 /* Load */
+#define XIVE_ESB_SET_PQ_10 0xe00 /* Load */
+#define XIVE_ESB_SET_PQ_11 0xf00 /* Load */
#define XIVE_ESB_VAL_P 0x2
#define XIVE_ESB_VAL_Q 0x1
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index ae418b85c17c..b886795060fd 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1411,10 +1411,8 @@ USE_TEXT_SECTION()
.balign IFETCH_ALIGN_BYTES
do_hash_page:
#ifdef CONFIG_PPC_STD_MMU_64
- andis. r0,r4,0xa410 /* weird error? */
+ andis. r0,r4,0xa450 /* weird error? */
bne- handle_page_fault /* if not, try to insert a HPTE */
- andis. r0,r4,DSISR_DABRMATCH@h
- bne- handle_dabr_fault
CURRENT_THREAD_INFO(r11, r1)
lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */
andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */
@@ -1438,11 +1436,16 @@ do_hash_page:
/* Error */
blt- 13f
+
+ /* Reload DSISR into r4 for the DABR check below */
+ ld r4,_DSISR(r1)
#endif /* CONFIG_PPC_STD_MMU_64 */
/* Here we have a page fault that hash_page can't handle. */
handle_page_fault:
-11: ld r4,_DAR(r1)
+11: andis. r0,r4,DSISR_DABRMATCH@h
+ bne- handle_dabr_fault
+ ld r4,_DAR(r1)
ld r5,_DSISR(r1)
addi r3,r1,STACK_FRAME_OVERHEAD
bl do_page_fault
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index fc4343514bed..01addfb0ed0a 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -43,6 +43,12 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}};
+int is_current_kprobe_addr(unsigned long addr)
+{
+ struct kprobe *p = kprobe_running();
+ return (p && (unsigned long)p->addr == addr) ? 1 : 0;
+}
+
bool arch_within_kprobe_blacklist(unsigned long addr)
{
return (addr >= (unsigned long)__kprobes_text_start &&
@@ -617,6 +623,15 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
regs->gpr[2] = (unsigned long)(((func_descr_t *)jp->entry)->toc);
#endif
+ /*
+ * jprobes use jprobe_return() which skips the normal return
+ * path of the function, and this messes up the accounting of the
+ * function graph tracer.
+ *
+ * Pause function graph tracing while performing the jprobe function.
+ */
+ pause_graph_tracing();
+
return 1;
}
NOKPROBE_SYMBOL(setjmp_pre_handler);
@@ -642,6 +657,8 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
* saved regs...
*/
memcpy(regs, &kcb->jprobe_saved_regs, sizeof(struct pt_regs));
+ /* It's OK to start function graph tracing again */
+ unpause_graph_tracing();
preempt_enable_no_resched();
return 1;
}
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index a8c1f99e9607..4640f6d64f8b 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -616,6 +616,24 @@ void __init exc_lvl_early_init(void)
#endif
/*
+ * Emergency stacks are used for a range of things, from asynchronous
+ * NMIs (system reset, machine check) to synchronous, process context.
+ * We set preempt_count to zero, even though that isn't necessarily correct. To
+ * get the right value we'd need to copy it from the previous thread_info, but
+ * doing that might fault, causing more problems.
+ * TODO: what to do with accounting?
+ */
+static void emerg_stack_init_thread_info(struct thread_info *ti, int cpu)
+{
+ ti->task = NULL;
+ ti->cpu = cpu;
+ ti->preempt_count = 0;
+ ti->local_flags = 0;
+ ti->flags = 0;
+ klp_init_thread_info(ti);
+}
+
+/*
* Stack space used when we detect a bad kernel stack pointer, and
* early in SMP boots before relocation is enabled. Exclusive emergency
* stack for machine checks.
@@ -633,24 +651,31 @@ void __init emergency_stack_init(void)
* Since we use these as temporary stacks during secondary CPU
* bringup, we need to get at them in real mode. This means they
* must also be within the RMO region.
+ *
+ * The IRQ stacks allocated elsewhere in this file are zeroed and
+ * initialized in kernel/irq.c. The emergency stacks are initialized
+ * here so that they are available as early as possible.
*/
limit = min(safe_stack_limit(), ppc64_rma_size);
for_each_possible_cpu(i) {
struct thread_info *ti;
ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit));
- klp_init_thread_info(ti);
+ memset(ti, 0, THREAD_SIZE);
+ emerg_stack_init_thread_info(ti, i);
paca[i].emergency_sp = (void *)ti + THREAD_SIZE;
#ifdef CONFIG_PPC_BOOK3S_64
/* emergency stack for NMI exception handling. */
ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit));
- klp_init_thread_info(ti);
+ memset(ti, 0, THREAD_SIZE);
+ emerg_stack_init_thread_info(ti, i);
paca[i].nmi_emergency_sp = (void *)ti + THREAD_SIZE;
/* emergency stack for machine check exception handling. */
ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit));
- klp_init_thread_info(ti);
+ memset(ti, 0, THREAD_SIZE);
+ emerg_stack_init_thread_info(ti, i);
paca[i].mc_emergency_sp = (void *)ti + THREAD_SIZE;
#endif
}
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index df2a41647d8e..1069f74fca47 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -97,7 +97,7 @@ int smp_generic_cpu_bootable(unsigned int nr)
/* Special case - we inhibit secondary thread startup
* during boot if the user requests it.
*/
- if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) {
+ if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) {
if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
return 0;
if (smt_enabled_at_boot
diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
index 7c933a99f5d5..c98e90b4ea7b 100644
--- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
+++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
@@ -45,10 +45,14 @@ _GLOBAL(ftrace_caller)
stdu r1,-SWITCH_FRAME_SIZE(r1)
/* Save all gprs to pt_regs */
- SAVE_8GPRS(0,r1)
- SAVE_8GPRS(8,r1)
- SAVE_8GPRS(16,r1)
- SAVE_8GPRS(24,r1)
+ SAVE_GPR(0, r1)
+ SAVE_10GPRS(2, r1)
+ SAVE_10GPRS(12, r1)
+ SAVE_10GPRS(22, r1)
+
+ /* Save previous stack pointer (r1) */
+ addi r8, r1, SWITCH_FRAME_SIZE
+ std r8, GPR1(r1)
/* Load special regs for save below */
mfmsr r8
@@ -95,18 +99,44 @@ ftrace_call:
bl ftrace_stub
nop
- /* Load ctr with the possibly modified NIP */
- ld r3, _NIP(r1)
- mtctr r3
+ /* Load the possibly modified NIP */
+ ld r15, _NIP(r1)
+
#ifdef CONFIG_LIVEPATCH
- cmpd r14,r3 /* has NIP been altered? */
+ cmpd r14, r15 /* has NIP been altered? */
+#endif
+
+#if defined(CONFIG_LIVEPATCH) && defined(CONFIG_KPROBES_ON_FTRACE)
+ /* NIP has not been altered, skip over further checks */
+ beq 1f
+
+ /* Check if there is an active kprobe on us */
+ subi r3, r14, 4
+ bl is_current_kprobe_addr
+ nop
+
+ /*
+ * If r3 == 1, then this is a kprobe/jprobe.
+ * Otherwise, this is a livepatched function.
+ *
+ * The conditional branch for livepatch_handler below will use the
+ * result of this comparison. For kprobe/jprobe, we just need to branch to
+ * the new NIP, not call livepatch_handler. The branch below is bne, so we
+ * want CR0[EQ] to be true if this is a kprobe/jprobe, which means we
+ * want CR0[EQ] = (r3 == 1).
+ */
+ cmpdi r3, 1
+1:
#endif
+ /* Load CTR with the possibly modified NIP */
+ mtctr r15
+
/* Restore gprs */
- REST_8GPRS(0,r1)
- REST_8GPRS(8,r1)
- REST_8GPRS(16,r1)
- REST_8GPRS(24,r1)
+ REST_GPR(0,r1)
+ REST_10GPRS(2,r1)
+ REST_10GPRS(12,r1)
+ REST_10GPRS(22,r1)
/* Restore possibly modified LR */
ld r0, _LINK(r1)
@@ -119,7 +149,10 @@ ftrace_call:
addi r1, r1, SWITCH_FRAME_SIZE
#ifdef CONFIG_LIVEPATCH
- /* Based on the cmpd above, if the NIP was altered handle livepatch */
+ /*
+ * Based on the cmpd or cmpdi above, if the NIP was altered and we're
+ * not on a kprobe/jprobe, then handle livepatch.
+ */
bne- livepatch_handler
#endif
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 42b7a4fd57d9..8d1a365b8edc 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1486,6 +1486,14 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
break;
case KVM_REG_PPC_TB_OFFSET:
+ /*
+ * POWER9 DD1 has an erratum where writing TBU40 causes
+ * the timebase to lose ticks. So we don't let the
+ * timebase offset be changed on P9 DD1. (It is
+ * initialized to zero.)
+ */
+ if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+ break;
/* round up to multiple of 2^24 */
vcpu->arch.vcore->tb_offset =
ALIGN(set_reg_val(id, *val), 1UL << 24);
@@ -2907,12 +2915,36 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
{
int r;
int srcu_idx;
+ unsigned long ebb_regs[3] = {}; /* shut up GCC */
+ unsigned long user_tar = 0;
+ unsigned int user_vrsave;
if (!vcpu->arch.sane) {
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
return -EINVAL;
}
+ /*
+ * Don't allow entry with a suspended transaction, because
+ * the guest entry/exit code will lose it.
+ * If the guest has TM enabled, save away their TM-related SPRs
+ * (they will get restored by the TM unavailable interrupt).
+ */
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ if (cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
+ (current->thread.regs->msr & MSR_TM)) {
+ if (MSR_TM_ACTIVE(current->thread.regs->msr)) {
+ run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ run->fail_entry.hardware_entry_failure_reason = 0;
+ return -EINVAL;
+ }
+ current->thread.tm_tfhar = mfspr(SPRN_TFHAR);
+ current->thread.tm_tfiar = mfspr(SPRN_TFIAR);
+ current->thread.tm_texasr = mfspr(SPRN_TEXASR);
+ current->thread.regs->msr &= ~MSR_TM;
+ }
+#endif
+
kvmppc_core_prepare_to_enter(vcpu);
/* No need to go into the guest when all we'll do is come back out */
@@ -2934,6 +2966,15 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
flush_all_to_thread(current);
+ /* Save userspace EBB and other register values */
+ if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+ ebb_regs[0] = mfspr(SPRN_EBBHR);
+ ebb_regs[1] = mfspr(SPRN_EBBRR);
+ ebb_regs[2] = mfspr(SPRN_BESCR);
+ user_tar = mfspr(SPRN_TAR);
+ }
+ user_vrsave = mfspr(SPRN_VRSAVE);
+
vcpu->arch.wqp = &vcpu->arch.vcore->wq;
vcpu->arch.pgdir = current->mm->pgd;
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
@@ -2960,6 +3001,16 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
}
} while (is_kvmppc_resume_guest(r));
+ /* Restore userspace EBB and other register values */
+ if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+ mtspr(SPRN_EBBHR, ebb_regs[0]);
+ mtspr(SPRN_EBBRR, ebb_regs[1]);
+ mtspr(SPRN_BESCR, ebb_regs[2]);
+ mtspr(SPRN_TAR, user_tar);
+ mtspr(SPRN_FSCR, current->thread.fscr);
+ }
+ mtspr(SPRN_VRSAVE, user_vrsave);
+
out:
vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
atomic_dec(&vcpu->kvm->arch.vcpus_running);
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
index 0fdc4a28970b..404deb512844 100644
--- a/arch/powerpc/kvm/book3s_hv_interrupts.S
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -121,10 +121,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
* Put whatever is in the decrementer into the
* hypervisor decrementer.
*/
+BEGIN_FTR_SECTION
+ ld r5, HSTATE_KVM_VCORE(r13)
+ ld r6, VCORE_KVM(r5)
+ ld r9, KVM_HOST_LPCR(r6)
+ andis. r9, r9, LPCR_LD@h
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
mfspr r8,SPRN_DEC
mftb r7
- mtspr SPRN_HDEC,r8
+BEGIN_FTR_SECTION
+ /* On POWER9, don't sign-extend if host LPCR[LD] bit is set */
+ bne 32f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
extsw r8,r8
+32: mtspr SPRN_HDEC,r8
add r8,r8,r7
std r8,HSTATE_DECEXP(r13)
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index bdb3f76ceb6b..4888dd494604 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -32,12 +32,29 @@
#include <asm/opal.h>
#include <asm/xive-regs.h>
+/* Sign-extend HDEC if not on POWER9 */
+#define EXTEND_HDEC(reg) \
+BEGIN_FTR_SECTION; \
+ extsw reg, reg; \
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+
#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
/* Values in HSTATE_NAPPING(r13) */
#define NAPPING_CEDE 1
#define NAPPING_NOVCPU 2
+/* Stack frame offsets for kvmppc_hv_entry */
+#define SFS 144
+#define STACK_SLOT_TRAP (SFS-4)
+#define STACK_SLOT_TID (SFS-16)
+#define STACK_SLOT_PSSCR (SFS-24)
+#define STACK_SLOT_PID (SFS-32)
+#define STACK_SLOT_IAMR (SFS-40)
+#define STACK_SLOT_CIABR (SFS-48)
+#define STACK_SLOT_DAWR (SFS-56)
+#define STACK_SLOT_DAWRX (SFS-64)
+
/*
* Call kvmppc_hv_entry in real mode.
* Must be called with interrupts hard-disabled.
@@ -214,6 +231,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
kvmppc_primary_no_guest:
/* We handle this much like a ceded vcpu */
/* put the HDEC into the DEC, since HDEC interrupts don't wake us */
+ /* HDEC may be larger than DEC for arch >= v3.00, but since the */
+ /* HDEC value came from DEC in the first place, it will fit */
mfspr r3, SPRN_HDEC
mtspr SPRN_DEC, r3
/*
@@ -295,8 +314,9 @@ kvm_novcpu_wakeup:
/* See if our timeslice has expired (HDEC is negative) */
mfspr r0, SPRN_HDEC
+ EXTEND_HDEC(r0)
li r12, BOOK3S_INTERRUPT_HV_DECREMENTER
- cmpwi r0, 0
+ cmpdi r0, 0
blt kvm_novcpu_exit
/* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */
@@ -319,10 +339,10 @@ kvm_novcpu_exit:
bl kvmhv_accumulate_time
#endif
13: mr r3, r12
- stw r12, 112-4(r1)
+ stw r12, STACK_SLOT_TRAP(r1)
bl kvmhv_commence_exit
nop
- lwz r12, 112-4(r1)
+ lwz r12, STACK_SLOT_TRAP(r1)
b kvmhv_switch_to_host
/*
@@ -390,8 +410,8 @@ kvm_secondary_got_guest:
lbz r4, HSTATE_PTID(r13)
cmpwi r4, 0
bne 63f
- lis r6, 0x7fff
- ori r6, r6, 0xffff
+ LOAD_REG_ADDR(r6, decrementer_max)
+ ld r6, 0(r6)
mtspr SPRN_HDEC, r6
/* and set per-LPAR registers, if doing dynamic micro-threading */
ld r6, HSTATE_SPLIT_MODE(r13)
@@ -545,11 +565,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
* *
*****************************************************************************/
-/* Stack frame offsets */
-#define STACK_SLOT_TID (112-16)
-#define STACK_SLOT_PSSCR (112-24)
-#define STACK_SLOT_PID (112-32)
-
.global kvmppc_hv_entry
kvmppc_hv_entry:
@@ -565,7 +580,7 @@ kvmppc_hv_entry:
*/
mflr r0
std r0, PPC_LR_STKOFF(r1)
- stdu r1, -112(r1)
+ stdu r1, -SFS(r1)
/* Save R1 in the PACA */
std r1, HSTATE_HOST_R1(r13)
@@ -749,10 +764,20 @@ BEGIN_FTR_SECTION
mfspr r5, SPRN_TIDR
mfspr r6, SPRN_PSSCR
mfspr r7, SPRN_PID
+ mfspr r8, SPRN_IAMR
std r5, STACK_SLOT_TID(r1)
std r6, STACK_SLOT_PSSCR(r1)
std r7, STACK_SLOT_PID(r1)
+ std r8, STACK_SLOT_IAMR(r1)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+BEGIN_FTR_SECTION
+ mfspr r5, SPRN_CIABR
+ mfspr r6, SPRN_DAWR
+ mfspr r7, SPRN_DAWRX
+ std r5, STACK_SLOT_CIABR(r1)
+ std r6, STACK_SLOT_DAWR(r1)
+ std r7, STACK_SLOT_DAWRX(r1)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
BEGIN_FTR_SECTION
/* Set partition DABR */
@@ -968,7 +993,8 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
/* Check if HDEC expires soon */
mfspr r3, SPRN_HDEC
- cmpwi r3, 512 /* 1 microsecond */
+ EXTEND_HDEC(r3)
+ cmpdi r3, 512 /* 1 microsecond */
blt hdec_soon
#ifdef CONFIG_KVM_XICS
@@ -1505,11 +1531,10 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
* set by the guest could disrupt the host.
*/
li r0, 0
- mtspr SPRN_IAMR, r0
- mtspr SPRN_CIABR, r0
- mtspr SPRN_DAWRX, r0
+ mtspr SPRN_PSPB, r0
mtspr SPRN_WORT, r0
BEGIN_FTR_SECTION
+ mtspr SPRN_IAMR, r0
mtspr SPRN_TCSCR, r0
/* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */
li r0, 1
@@ -1525,6 +1550,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
std r6,VCPU_UAMOR(r9)
li r6,0
mtspr SPRN_AMR,r6
+ mtspr SPRN_UAMOR, r6
/* Switch DSCR back to host value */
mfspr r8, SPRN_DSCR
@@ -1670,12 +1696,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
/* Restore host values of some registers */
BEGIN_FTR_SECTION
+ ld r5, STACK_SLOT_CIABR(r1)
+ ld r6, STACK_SLOT_DAWR(r1)
+ ld r7, STACK_SLOT_DAWRX(r1)
+ mtspr SPRN_CIABR, r5
+ mtspr SPRN_DAWR, r6
+ mtspr SPRN_DAWRX, r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+BEGIN_FTR_SECTION
ld r5, STACK_SLOT_TID(r1)
ld r6, STACK_SLOT_PSSCR(r1)
ld r7, STACK_SLOT_PID(r1)
+ ld r8, STACK_SLOT_IAMR(r1)
mtspr SPRN_TIDR, r5
mtspr SPRN_PSSCR, r6
mtspr SPRN_PID, r7
+ mtspr SPRN_IAMR, r8
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
BEGIN_FTR_SECTION
PPC_INVALIDATE_ERAT
@@ -1819,8 +1855,8 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
li r0, KVM_GUEST_MODE_NONE
stb r0, HSTATE_IN_GUEST(r13)
- ld r0, 112+PPC_LR_STKOFF(r1)
- addi r1, r1, 112
+ ld r0, SFS+PPC_LR_STKOFF(r1)
+ addi r1, r1, SFS
mtlr r0
blr
@@ -2366,12 +2402,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
mfspr r3, SPRN_DEC
mfspr r4, SPRN_HDEC
mftb r5
- cmpw r3, r4
+ extsw r3, r3
+ EXTEND_HDEC(r4)
+ cmpd r3, r4
ble 67f
mtspr SPRN_DEC, r4
67:
/* save expiry time of guest decrementer */
- extsw r3, r3
add r3, r3, r5
ld r4, HSTATE_KVM_VCPU(r13)
ld r5, HSTATE_KVM_VCORE(r13)
diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c
index 023a31133c37..4636ca6e7d38 100644
--- a/arch/powerpc/kvm/book3s_xive_template.c
+++ b/arch/powerpc/kvm/book3s_xive_template.c
@@ -69,7 +69,7 @@ static void GLUE(X_PFX,source_eoi)(u32 hw_irq, struct xive_irq_data *xd)
{
/* If the XIVE supports the new "store EOI" facility, use it */
if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
- __x_writeq(0, __x_eoi_page(xd));
+ __x_writeq(0, __x_eoi_page(xd) + XIVE_ESB_STORE_EOI);
else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) {
opal_int_eoi(hw_irq);
} else {
@@ -89,7 +89,7 @@ static void GLUE(X_PFX,source_eoi)(u32 hw_irq, struct xive_irq_data *xd)
* properly.
*/
if (xd->flags & XIVE_IRQ_FLAG_LSI)
- __x_readq(__x_eoi_page(xd));
+ __x_readq(__x_eoi_page(xd) + XIVE_ESB_LOAD_EOI);
else {
eoi_val = GLUE(X_PFX,esb_load)(xd, XIVE_ESB_SET_PQ_00);
diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c
index 6575b9aabef4..a12e86395025 100644
--- a/arch/powerpc/mm/hugetlbpage-radix.c
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -68,7 +68,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
if (mm->task_size - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
/*
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index 9dbd2a733d6b..0ee6be4f1ba4 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -112,7 +112,7 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (mm->task_size - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
@@ -157,7 +157,7 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (mm->task_size - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 966b9fccfa66..45f6740dd407 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -99,7 +99,7 @@ static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
if ((mm->task_size - len) < addr)
return 0;
vma = find_vma(mm, addr);
- return (!vma || (addr + len) <= vma->vm_start);
+ return (!vma || (addr + len) <= vm_start_gap(vma));
}
static int slice_low_has_vma(struct mm_struct *mm, unsigned long slice)
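
The repeated "addr + len <= vm_start_gap(vma)" changes in these hunks account for the stack guard gap when checking a candidate mapping. A hedged C sketch of the idea; the helper name, flag and gap size below are assumptions for illustration, not the kernel's definitions:

/* Hypothetical names; the real vm_start_gap() and stack_guard_gap differ
 * in detail, this only shows the shape of the check. */
#define GUARD_GAP_BYTES	(256UL << 12)	/* e.g. 256 pages of 4 KiB */
#define VMA_GROWSDOWN	0x1UL

struct vma_like {
	unsigned long vm_start;
	unsigned long vm_flags;
};

static unsigned long vma_start_with_gap(const struct vma_like *vma)
{
	unsigned long start = vma->vm_start;

	/* A stack VMA that grows down keeps a guard gap below its start. */
	if (vma->vm_flags & VMA_GROWSDOWN)
		start = (start > GUARD_GAP_BYTES) ? start - GUARD_GAP_BYTES : 0;

	return start;
}

/* The get_unmapped_area() checks then read:
 *	if (!vma || addr + len <= vma_start_with_gap(vma))
 *		return addr;
 */
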
diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c
index cbd82fde5770..09ceea6175ba 100644
--- a/arch/powerpc/perf/perf_regs.c
+++ b/arch/powerpc/perf/perf_regs.c
@@ -101,5 +101,6 @@ void perf_get_regs_user(struct perf_regs *regs_user,
struct pt_regs *regs_user_copy)
{
regs_user->regs = task_pt_regs(current);
- regs_user->abi = perf_reg_abi(current);
+ regs_user->abi = (regs_user->regs) ? perf_reg_abi(current) :
+ PERF_SAMPLE_REGS_ABI_NONE;
}
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 78fa9395b8c5..b5d960d6db3d 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -75,7 +75,8 @@ struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
if (WARN_ON(!gpdev))
return NULL;
- if (WARN_ON(!gpdev->dev.of_node))
+ /* Not all PCI devices have device-tree nodes */
+ if (!gpdev->dev.of_node)
return NULL;
/* Get associated PCI device */
@@ -448,7 +449,7 @@ static int mmio_launch_invalidate(struct npu *npu, unsigned long launch,
return mmio_atsd_reg;
}
-static int mmio_invalidate_pid(struct npu *npu, unsigned long pid)
+static int mmio_invalidate_pid(struct npu *npu, unsigned long pid, bool flush)
{
unsigned long launch;
@@ -464,12 +465,15 @@ static int mmio_invalidate_pid(struct npu *npu, unsigned long pid)
/* PID */
launch |= pid << PPC_BITLSHIFT(38);
+ /* No flush */
+ launch |= !flush << PPC_BITLSHIFT(39);
+
/* Invalidating the entire process doesn't use a va */
return mmio_launch_invalidate(npu, launch, 0);
}
static int mmio_invalidate_va(struct npu *npu, unsigned long va,
- unsigned long pid)
+ unsigned long pid, bool flush)
{
unsigned long launch;
@@ -485,26 +489,60 @@ static int mmio_invalidate_va(struct npu *npu, unsigned long va,
/* PID */
launch |= pid << PPC_BITLSHIFT(38);
+ /* No flush */
+ launch |= !flush << PPC_BITLSHIFT(39);
+
return mmio_launch_invalidate(npu, launch, va);
}
#define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
+struct mmio_atsd_reg {
+ struct npu *npu;
+ int reg;
+};
+
+static void mmio_invalidate_wait(
+ struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], bool flush)
+{
+ struct npu *npu;
+ int i, reg;
+
+ /* Wait for all invalidations to complete */
+ for (i = 0; i <= max_npu2_index; i++) {
+ if (mmio_atsd_reg[i].reg < 0)
+ continue;
+
+ /* Wait for completion */
+ npu = mmio_atsd_reg[i].npu;
+ reg = mmio_atsd_reg[i].reg;
+ while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
+ cpu_relax();
+
+ put_mmio_atsd_reg(npu, reg);
+
+ /*
+ * The GPU requires two flush ATSDs to ensure all entries have
+ * been flushed. We use PID 0 as it will never be used for a
+ * process on the GPU.
+ */
+ if (flush)
+ mmio_invalidate_pid(npu, 0, true);
+ }
+}
+
/*
* Invalidate either a single address or an entire PID depending on
* the value of va.
*/
static void mmio_invalidate(struct npu_context *npu_context, int va,
- unsigned long address)
+ unsigned long address, bool flush)
{
- int i, j, reg;
+ int i, j;
struct npu *npu;
struct pnv_phb *nphb;
struct pci_dev *npdev;
- struct {
- struct npu *npu;
- int reg;
- } mmio_atsd_reg[NV_MAX_NPUS];
+ struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
unsigned long pid = npu_context->mm->context.id;
/*
@@ -524,10 +562,11 @@ static void mmio_invalidate(struct npu_context *npu_context, int va,
if (va)
mmio_atsd_reg[i].reg =
- mmio_invalidate_va(npu, address, pid);
+ mmio_invalidate_va(npu, address, pid,
+ flush);
else
mmio_atsd_reg[i].reg =
- mmio_invalidate_pid(npu, pid);
+ mmio_invalidate_pid(npu, pid, flush);
/*
* The NPU hardware forwards the shootdown to all GPUs
@@ -543,18 +582,10 @@ static void mmio_invalidate(struct npu_context *npu_context, int va,
*/
flush_tlb_mm(npu_context->mm);
- /* Wait for all invalidations to complete */
- for (i = 0; i <= max_npu2_index; i++) {
- if (mmio_atsd_reg[i].reg < 0)
- continue;
-
- /* Wait for completion */
- npu = mmio_atsd_reg[i].npu;
- reg = mmio_atsd_reg[i].reg;
- while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
- cpu_relax();
- put_mmio_atsd_reg(npu, reg);
- }
+ mmio_invalidate_wait(mmio_atsd_reg, flush);
+ if (flush)
+ /* Wait for the flush to complete */
+ mmio_invalidate_wait(mmio_atsd_reg, false);
}
static void pnv_npu2_mn_release(struct mmu_notifier *mn,
@@ -570,7 +601,7 @@ static void pnv_npu2_mn_release(struct mmu_notifier *mn,
* There should be no more translation requests for this PID, but we
* need to ensure any entries for it are removed from the TLB.
*/
- mmio_invalidate(npu_context, 0, 0);
+ mmio_invalidate(npu_context, 0, 0, true);
}
static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
@@ -580,7 +611,7 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
{
struct npu_context *npu_context = mn_to_npu_context(mn);
- mmio_invalidate(npu_context, 1, address);
+ mmio_invalidate(npu_context, 1, address, true);
}
static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn,
@@ -589,7 +620,7 @@ static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn,
{
struct npu_context *npu_context = mn_to_npu_context(mn);
- mmio_invalidate(npu_context, 1, address);
+ mmio_invalidate(npu_context, 1, address, true);
}
static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
@@ -599,8 +630,11 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
struct npu_context *npu_context = mn_to_npu_context(mn);
unsigned long address;
- for (address = start; address <= end; address += PAGE_SIZE)
- mmio_invalidate(npu_context, 1, address);
+ for (address = start; address < end; address += PAGE_SIZE)
+ mmio_invalidate(npu_context, 1, address, false);
+
+ /* Do the flush only on the final address == end */
+ mmio_invalidate(npu_context, 1, address, true);
}
static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
@@ -650,8 +684,11 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
/* No nvlink associated with this GPU device */
return ERR_PTR(-ENODEV);
- if (!mm) {
- /* kernel thread contexts are not supported */
+ if (!mm || mm->context.id == 0) {
+ /*
+ * Kernel thread contexts are not supported and context id 0 is
+ * reserved on the GPU.
+ */
return ERR_PTR(-EINVAL);
}
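
A minimal sketch of the invalidate-then-flush pattern introduced in mmio_invalidate()/mmio_invalidate_wait() above: intermediate launches are issued with the no-flush bit set and only the final one requests the flush. All names below are illustrative stand-ins, not the driver's API:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SZ 4096UL

/* Stand-in for the MMIO ATSD launch; purely illustrative. */
static void launch_invalidate(unsigned long addr, bool flush)
{
	printf("invalidate 0x%lx%s\n", addr, flush ? " (flush)" : "");
}

static void range_invalidate(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SZ)
		launch_invalidate(addr, false);

	/* Only the final ATSD (addr == end) requests the flush. */
	launch_invalidate(addr, true);
}

int main(void)
{
	range_invalidate(0x10000, 0x13000);
	return 0;
}
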
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index 913825086b8d..8f5e3035483b 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -297,7 +297,7 @@ void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
{
/* If the XIVE supports the new "store EOI" facility, use it */
if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
- out_be64(xd->eoi_mmio, 0);
+ out_be64(xd->eoi_mmio + XIVE_ESB_STORE_EOI, 0);
else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) {
/*
* The FW told us to call it. This happens for some
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 60d395fdc864..aeac013968f2 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -221,11 +221,6 @@ extern void release_thread(struct task_struct *);
/* Free guarded storage control block for current */
void exit_thread_gs(void);
-/*
- * Return saved PC of a blocked thread.
- */
-extern unsigned long thread_saved_pc(struct task_struct *t);
-
unsigned long get_wchan(struct task_struct *p);
#define task_pt_regs(tsk) ((struct pt_regs *) \
(task_stack_page(tsk) + THREAD_SIZE) - 1)
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index e545ffe5155a..8e622bb52f7a 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -564,8 +564,6 @@ static struct kset *ipl_kset;
static void __ipl_run(void *unused)
{
- if (MACHINE_IS_LPAR && ipl_info.type == IPL_TYPE_CCW)
- diag308(DIAG308_LOAD_NORMAL_DUMP, NULL);
diag308(DIAG308_LOAD_CLEAR, NULL);
if (MACHINE_IS_VM)
__cpcmd("IPL", NULL, 0, NULL);
@@ -1088,10 +1086,7 @@ static void __reipl_run(void *unused)
break;
case REIPL_METHOD_CCW_DIAG:
diag308(DIAG308_SET, reipl_block_ccw);
- if (MACHINE_IS_LPAR)
- diag308(DIAG308_LOAD_NORMAL_DUMP, NULL);
- else
- diag308(DIAG308_LOAD_CLEAR, NULL);
+ diag308(DIAG308_LOAD_CLEAR, NULL);
break;
case REIPL_METHOD_FCP_RW_DIAG:
diag308(DIAG308_SET, reipl_block_fcp);
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 999d7154bbdc..bb32b8618bf6 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -41,31 +41,6 @@
asmlinkage void ret_from_fork(void) asm ("ret_from_fork");
-/*
- * Return saved PC of a blocked thread. used in kernel/sched.
- * resume in entry.S does not create a new stack frame, it
- * just stores the registers %r6-%r15 to the frame given by
- * schedule. We want to return the address of the caller of
- * schedule, so we have to walk the backchain one time to
- * find the frame schedule() store its return address.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- struct stack_frame *sf, *low, *high;
-
- if (!tsk || !task_stack_page(tsk))
- return 0;
- low = task_stack_page(tsk);
- high = (struct stack_frame *) task_pt_regs(tsk);
- sf = (struct stack_frame *) tsk->thread.ksp;
- if (sf <= low || sf > high)
- return 0;
- sf = (struct stack_frame *) sf->back_chain;
- if (sf <= low || sf > high)
- return 0;
- return sf->gprs[8];
-}
-
extern void kernel_thread_starter(void);
/*
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 9da243d94cc3..3b297fa3aa67 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -977,11 +977,12 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
ptr = asce.origin * 4096;
if (asce.r) {
*fake = 1;
+ ptr = 0;
asce.dt = ASCE_TYPE_REGION1;
}
switch (asce.dt) {
case ASCE_TYPE_REGION1:
- if (vaddr.rfx01 > asce.tl && !asce.r)
+ if (vaddr.rfx01 > asce.tl && !*fake)
return PGM_REGION_FIRST_TRANS;
break;
case ASCE_TYPE_REGION2:
@@ -1009,8 +1010,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
union region1_table_entry rfte;
if (*fake) {
- /* offset in 16EB guest memory block */
- ptr = ptr + ((unsigned long) vaddr.rsx << 53UL);
+ ptr += (unsigned long) vaddr.rfx << 53;
rfte.val = ptr;
goto shadow_r2t;
}
@@ -1036,8 +1036,7 @@ shadow_r2t:
union region2_table_entry rste;
if (*fake) {
- /* offset in 8PB guest memory block */
- ptr = ptr + ((unsigned long) vaddr.rtx << 42UL);
+ ptr += (unsigned long) vaddr.rsx << 42;
rste.val = ptr;
goto shadow_r3t;
}
@@ -1064,8 +1063,7 @@ shadow_r3t:
union region3_table_entry rtte;
if (*fake) {
- /* offset in 4TB guest memory block */
- ptr = ptr + ((unsigned long) vaddr.sx << 31UL);
+ ptr += (unsigned long) vaddr.rtx << 31;
rtte.val = ptr;
goto shadow_sgt;
}
@@ -1101,8 +1099,7 @@ shadow_sgt:
union segment_table_entry ste;
if (*fake) {
- /* offset in 2G guest memory block */
- ptr = ptr + ((unsigned long) vaddr.sx << 20UL);
+ ptr += (unsigned long) vaddr.sx << 20;
ste.val = ptr;
goto shadow_pgt;
}
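
For the real-space ("fake") case above, the corrected behaviour is that each level adds the guest-address index for that level, shifted back to its byte position, to the running origin. A hedged stand-alone C illustration (index widths assumed from the z/Architecture address layout):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t gaddr = 0x0123456789abcdefULL;	/* arbitrary guest address */

	uint64_t rfx = (gaddr >> 53) & 0x7ff;	/* region-first index  */
	uint64_t rsx = (gaddr >> 42) & 0x7ff;	/* region-second index */
	uint64_t rtx = (gaddr >> 31) & 0x7ff;	/* region-third index  */
	uint64_t sx  = (gaddr >> 20) & 0x7ff;	/* segment index       */

	printf("region-1 level adds: 0x%llx\n", (unsigned long long)(rfx << 53));
	printf("region-2 level adds: 0x%llx\n", (unsigned long long)(rsx << 42));
	printf("region-3 level adds: 0x%llx\n", (unsigned long long)(rtx << 31));
	printf("segment level adds:  0x%llx\n", (unsigned long long)(sx  << 20));
	return 0;
}
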
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index b017daed6887..b854b1da281a 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -101,7 +101,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
goto check_asce_limit;
}
@@ -151,7 +151,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
goto check_asce_limit;
}
diff --git a/arch/score/include/asm/processor.h b/arch/score/include/asm/processor.h
index d9a922d8711b..299274581968 100644
--- a/arch/score/include/asm/processor.h
+++ b/arch/score/include/asm/processor.h
@@ -13,7 +13,6 @@ struct task_struct;
*/
extern void (*cpu_wait)(void);
-extern unsigned long thread_saved_pc(struct task_struct *tsk);
extern void start_thread(struct pt_regs *regs,
unsigned long pc, unsigned long sp);
extern unsigned long get_wchan(struct task_struct *p);
diff --git a/arch/score/kernel/process.c b/arch/score/kernel/process.c
index eb64d7a677cb..6e20241a1ed4 100644
--- a/arch/score/kernel/process.c
+++ b/arch/score/kernel/process.c
@@ -101,11 +101,6 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *r)
return 1;
}
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- return task_pt_regs(tsk)->cp0_epc;
-}
-
unsigned long get_wchan(struct task_struct *task)
{
if (!task || task == current || task->state == TASK_RUNNING)
diff --git a/arch/sh/mm/mmap.c b/arch/sh/mm/mmap.c
index 08e7af0be4a7..6a1a1297baae 100644
--- a/arch/sh/mm/mmap.c
+++ b/arch/sh/mm/mmap.c
@@ -64,7 +64,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
@@ -114,7 +114,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
diff --git a/arch/sparc/include/asm/processor_32.h b/arch/sparc/include/asm/processor_32.h
index dd27159819eb..b395e5620c0b 100644
--- a/arch/sparc/include/asm/processor_32.h
+++ b/arch/sparc/include/asm/processor_32.h
@@ -67,9 +67,6 @@ struct thread_struct {
.current_ds = KERNEL_DS, \
}
-/* Return saved PC of a blocked thread. */
-unsigned long thread_saved_pc(struct task_struct *t);
-
/* Do necessary setup to start up a newly executed thread. */
static inline void start_thread(struct pt_regs * regs, unsigned long pc,
unsigned long sp)
diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h
index b58ee9018433..f04dc5a43062 100644
--- a/arch/sparc/include/asm/processor_64.h
+++ b/arch/sparc/include/asm/processor_64.h
@@ -89,9 +89,7 @@ struct thread_struct {
#include <linux/types.h>
#include <asm/fpumacro.h>
-/* Return saved PC of a blocked thread. */
struct task_struct;
-unsigned long thread_saved_pc(struct task_struct *);
/* On Uniprocessor, even in RMO processes see TSO semantics */
#ifdef CONFIG_SMP
diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c
index b6dac8e980f0..9245f93398c7 100644
--- a/arch/sparc/kernel/process_32.c
+++ b/arch/sparc/kernel/process_32.c
@@ -177,14 +177,6 @@ void show_stack(struct task_struct *tsk, unsigned long *_ksp)
}
/*
- * Note: sparc64 has a pretty intricated thread_saved_pc, check it out.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- return task_thread_info(tsk)->kpc;
-}
-
-/*
* Free current thread data structures etc..
*/
void exit_thread(struct task_struct *tsk)
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 1badc493e62e..b96104da5bd6 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -400,25 +400,6 @@ core_initcall(sparc_sysrq_init);
#endif
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- struct thread_info *ti = task_thread_info(tsk);
- unsigned long ret = 0xdeadbeefUL;
-
- if (ti && ti->ksp) {
- unsigned long *sp;
- sp = (unsigned long *)(ti->ksp + STACK_BIAS);
- if (((unsigned long)sp & (sizeof(long) - 1)) == 0UL &&
- sp[14]) {
- unsigned long *fp;
- fp = (unsigned long *)(sp[14] + STACK_BIAS);
- if (((unsigned long)fp & (sizeof(long) - 1)) == 0UL)
- ret = fp[15];
- }
- }
- return ret;
-}
-
/* Free current thread data structures etc.. */
void exit_thread(struct task_struct *tsk)
{
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index ef4520efc813..043544d0cda3 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -120,7 +120,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
vma = find_vma(mm, addr);
if (task_size - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
@@ -183,7 +183,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
vma = find_vma(mm, addr);
if (task_size - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 7c29d38e6b99..88855e383b34 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -120,7 +120,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
if (task_size - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
if (mm->get_unmapped_area == arch_get_unmapped_area)
diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h
index 0bc9968b97a1..f71e5206650b 100644
--- a/arch/tile/include/asm/processor.h
+++ b/arch/tile/include/asm/processor.h
@@ -214,13 +214,6 @@ static inline void release_thread(struct task_struct *dead_task)
extern void prepare_exit_to_usermode(struct pt_regs *regs, u32 flags);
-
-/*
- * Return saved (kernel) PC of a blocked thread.
- * Only used in a printk() in kernel/sched/core.c, so don't work too hard.
- */
-#define thread_saved_pc(t) ((t)->thread.pc)
-
unsigned long get_wchan(struct task_struct *p);
/* Return initial ksp value for given task. */
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S
index 1a70e6c0f259..94709ab41ed8 100644
--- a/arch/tile/lib/atomic_asm_32.S
+++ b/arch/tile/lib/atomic_asm_32.S
@@ -24,8 +24,7 @@
* has an opportunity to return -EFAULT to the user if needed.
* The 64-bit routines just return a "long long" with the value,
* since they are only used from kernel space and don't expect to fault.
- * Support for 16-bit ops is included in the framework but we don't provide
- * any (x86_64 has an atomic_inc_short(), so we might want to some day).
+ * Support for 16-bit ops is included in the framework but we don't provide any.
*
* Note that the caller is advised to issue a suitable L1 or L2
* prefetch on the address being manipulated to avoid extra stalls.
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index cb10153b5c9f..03e5cc4e76e4 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -233,7 +233,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
if (current->mm->get_unmapped_area == arch_get_unmapped_area)
diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h
index 2d1e0dd5bb0b..f6d1a3f747a9 100644
--- a/arch/um/include/asm/processor-generic.h
+++ b/arch/um/include/asm/processor-generic.h
@@ -58,8 +58,6 @@ static inline void release_thread(struct task_struct *task)
{
}
-extern unsigned long thread_saved_pc(struct task_struct *t);
-
static inline void mm_copy_segments(struct mm_struct *from_mm,
struct mm_struct *new_mm)
{
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index 64a1fd06f3fd..7b5640117325 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -56,12 +56,6 @@ union thread_union cpu0_irqstack
__attribute__((__section__(".data..init_irqstack"))) =
{ INIT_THREAD_INFO(init_task) };
-unsigned long thread_saved_pc(struct task_struct *task)
-{
- /* FIXME: Need to look up userspace_pid by cpu */
- return os_process_pc(userspace_pid[0]);
-}
-
/* Changed in setup_arch, which is called in early boot */
static char host_info[(__NEW_UTS_LEN + 1) * 5];
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0efb4c9497bc..737212c0333e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -69,7 +69,7 @@ config X86
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
- select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
+ select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select BUILDTIME_EXTABLE_SORT
@@ -2793,6 +2793,9 @@ config X86_DMA_REMAP
bool
depends on STA2X11
+config HAVE_GENERIC_GUP
+ def_bool y
+
source "net/Kconfig"
source "drivers/Kconfig"
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index 73ccf63b0f48..9dc1ce6ba3c0 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -13,7 +13,7 @@ static inline char rdfs8(addr_t addr)
return *((char *)(fs + addr));
}
#include "../cmdline.c"
-static unsigned long get_cmd_line_ptr(void)
+unsigned long get_cmd_line_ptr(void)
{
unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr;
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index cbf4b87f55b9..c3e869eaef0c 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1046,9 +1046,31 @@ struct boot_params *efi_main(struct efi_config *c,
memset((char *)gdt->address, 0x0, gdt->size);
desc = (struct desc_struct *)gdt->address;
- /* The first GDT is a dummy and the second is unused. */
- desc += 2;
+ /* The first GDT is a dummy. */
+ desc++;
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /* __KERNEL32_CS */
+ desc->limit0 = 0xffff;
+ desc->base0 = 0x0000;
+ desc->base1 = 0x0000;
+ desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
+ desc->s = DESC_TYPE_CODE_DATA;
+ desc->dpl = 0;
+ desc->p = 1;
+ desc->limit = 0xf;
+ desc->avl = 0;
+ desc->l = 0;
+ desc->d = SEG_OP_SIZE_32BIT;
+ desc->g = SEG_GRANULARITY_4KB;
+ desc->base2 = 0x00;
+ desc++;
+ } else {
+ /* Second entry is unused on 32-bit */
+ desc++;
+ }
+ /* __KERNEL_CS */
desc->limit0 = 0xffff;
desc->base0 = 0x0000;
desc->base1 = 0x0000;
@@ -1058,12 +1080,18 @@ struct boot_params *efi_main(struct efi_config *c,
desc->p = 1;
desc->limit = 0xf;
desc->avl = 0;
- desc->l = 0;
- desc->d = SEG_OP_SIZE_32BIT;
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ desc->l = 1;
+ desc->d = 0;
+ } else {
+ desc->l = 0;
+ desc->d = SEG_OP_SIZE_32BIT;
+ }
desc->g = SEG_GRANULARITY_4KB;
desc->base2 = 0x00;
-
desc++;
+
+ /* __KERNEL_DS */
desc->limit0 = 0xffff;
desc->base0 = 0x0000;
desc->base1 = 0x0000;
@@ -1077,24 +1105,25 @@ struct boot_params *efi_main(struct efi_config *c,
desc->d = SEG_OP_SIZE_32BIT;
desc->g = SEG_GRANULARITY_4KB;
desc->base2 = 0x00;
-
-#ifdef CONFIG_X86_64
- /* Task segment value */
desc++;
- desc->limit0 = 0x0000;
- desc->base0 = 0x0000;
- desc->base1 = 0x0000;
- desc->type = SEG_TYPE_TSS;
- desc->s = 0;
- desc->dpl = 0;
- desc->p = 1;
- desc->limit = 0x0;
- desc->avl = 0;
- desc->l = 0;
- desc->d = 0;
- desc->g = SEG_GRANULARITY_4KB;
- desc->base2 = 0x00;
-#endif /* CONFIG_X86_64 */
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /* Task segment value */
+ desc->limit0 = 0x0000;
+ desc->base0 = 0x0000;
+ desc->base1 = 0x0000;
+ desc->type = SEG_TYPE_TSS;
+ desc->s = 0;
+ desc->dpl = 0;
+ desc->p = 1;
+ desc->limit = 0x0;
+ desc->avl = 0;
+ desc->l = 0;
+ desc->d = 0;
+ desc->g = SEG_GRANULARITY_4KB;
+ desc->base2 = 0x00;
+ desc++;
+ }
asm volatile("cli");
asm volatile ("lgdt %0" : : "m" (*gdt));
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index d2ae1f821e0c..fbf4c32d0b62 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -346,6 +346,48 @@ preferred_addr:
/* Set up the stack */
leaq boot_stack_end(%rbx), %rsp
+#ifdef CONFIG_X86_5LEVEL
+ /* Check if 5-level paging has already been enabled */
+ movq %cr4, %rax
+ testl $X86_CR4_LA57, %eax
+ jnz lvl5
+
+ /*
+ * At this point we are in long mode with 4-level paging enabled,
+ * but we want to enable 5-level paging.
+ *
+ * The problem is that we cannot do it directly. Setting LA57 in
+ * long mode would trigger #GP. So we need to switch off long mode
+ * first.
+ *
+ * NOTE: This is not going to work if the bootloader put us above
+ * the 4G limit.
+ *
+ * The first step is go into compatibility mode.
+ */
+
+ /* Clear additional page table */
+ leaq lvl5_pgtable(%rbx), %rdi
+ xorq %rax, %rax
+ movq $(PAGE_SIZE/8), %rcx
+ rep stosq
+
+ /*
+ * Set up the current CR3 value as the first and only entry in a
+ * new top-level page table.
+ */
+ movq %cr3, %rdi
+ leaq 0x7 (%rdi), %rax
+ movq %rax, lvl5_pgtable(%rbx)
+
+ /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
+ pushq $__KERNEL32_CS
+ leaq compatible_mode(%rip), %rax
+ pushq %rax
+ lretq
+lvl5:
+#endif
+
/* Zero EFLAGS */
pushq $0
popfq
@@ -429,6 +471,44 @@ relocated:
jmp *%rax
.code32
+#ifdef CONFIG_X86_5LEVEL
+compatible_mode:
+ /* Setup data and stack segments */
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ movl %eax, %ss
+
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /* Point CR3 to 5-level paging */
+ leal lvl5_pgtable(%ebx), %eax
+ movl %eax, %cr3
+
+ /* Enable PAE and LA57 mode */
+ movl %cr4, %eax
+ orl $(X86_CR4_PAE | X86_CR4_LA57), %eax
+ movl %eax, %cr4
+
+ /* Calculate address we are running at */
+ call 1f
+1: popl %edi
+ subl $1b, %edi
+
+ /* Prepare stack for far return to Long Mode */
+ pushl $__KERNEL_CS
+ leal lvl5(%edi), %eax
+ push %eax
+
+ /* Re-enable paging */
+ movl $(X86_CR0_PG | X86_CR0_PE), %eax
+ movl %eax, %cr0
+
+ lret
+#endif
+
no_longmode:
/* This isn't an x86-64 CPU so hang */
1:
@@ -442,7 +522,7 @@ gdt:
.word gdt_end - gdt
.long gdt
.word 0
- .quad 0x0000000000000000 /* NULL descriptor */
+ .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
.quad 0x00af9a000000ffff /* __KERNEL_CS */
.quad 0x00cf92000000ffff /* __KERNEL_DS */
.quad 0x0080890000000000 /* TS descriptor */
@@ -486,3 +566,7 @@ boot_stack_end:
.balign 4096
pgtable:
.fill BOOT_PGT_SIZE, 1, 0
+#ifdef CONFIG_X86_5LEVEL
+lvl5_pgtable:
+ .fill PAGE_SIZE, 1, 0
+#endif
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 54c24f0a43d3..91f27ab970ef 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -9,16 +9,42 @@
* contain the entire properly aligned running kernel image.
*
*/
+
+/*
+ * isspace() in linux/ctype.h is needed by next_arg() to filter
+ * out spaces, tabs and newlines. However, boot/ctype.h conflicts
+ * with linux/ctype.h, since isdigit() is implemented in both of
+ * them, so boot/ctype.h is disabled here.
+ */
+#define BOOT_CTYPE_H
+
+/*
+ * _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h.
+ * However, both lib/ctype.c and lib/cmdline.c pull in EXPORT_SYMBOL,
+ * which is meaningless here and can cause compile errors in some
+ * cases. So do not include linux/export.h and define EXPORT_SYMBOL(sym)
+ * as empty.
+ */
+#define _LINUX_EXPORT_H
+#define EXPORT_SYMBOL(sym)
+
#include "misc.h"
#include "error.h"
-#include "../boot.h"
+#include "../string.h"
#include <generated/compile.h>
#include <linux/module.h>
#include <linux/uts.h>
#include <linux/utsname.h>
+#include <linux/ctype.h>
#include <generated/utsrelease.h>
+/* Macros used by the included decompressor code below. */
+#define STATIC
+#include <linux/decompress/mm.h>
+
+extern unsigned long get_cmd_line_ptr(void);
+
/* Simplified build-specific string for starting entropy. */
static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
@@ -62,6 +88,11 @@ struct mem_vector {
static bool memmap_too_large;
+
+/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */
+unsigned long long mem_limit = ULLONG_MAX;
+
+
enum mem_avoid_index {
MEM_AVOID_ZO_RANGE = 0,
MEM_AVOID_INITRD,
@@ -85,49 +116,14 @@ static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
return true;
}
-/**
- * _memparse - Parse a string with mem suffixes into a number
- * @ptr: Where parse begins
- * @retptr: (output) Optional pointer to next char after parse completes
- *
- * Parses a string into a number. The number stored at @ptr is
- * potentially suffixed with K, M, G, T, P, E.
- */
-static unsigned long long _memparse(const char *ptr, char **retptr)
+char *skip_spaces(const char *str)
{
- char *endptr; /* Local pointer to end of parsed string */
-
- unsigned long long ret = simple_strtoull(ptr, &endptr, 0);
-
- switch (*endptr) {
- case 'E':
- case 'e':
- ret <<= 10;
- case 'P':
- case 'p':
- ret <<= 10;
- case 'T':
- case 't':
- ret <<= 10;
- case 'G':
- case 'g':
- ret <<= 10;
- case 'M':
- case 'm':
- ret <<= 10;
- case 'K':
- case 'k':
- ret <<= 10;
- endptr++;
- default:
- break;
- }
-
- if (retptr)
- *retptr = endptr;
-
- return ret;
+ while (isspace(*str))
+ ++str;
+ return (char *)str;
}
+#include "../../../../lib/ctype.c"
+#include "../../../../lib/cmdline.c"
static int
parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
@@ -142,40 +138,41 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
return -EINVAL;
oldp = p;
- *size = _memparse(p, &p);
+ *size = memparse(p, &p);
if (p == oldp)
return -EINVAL;
switch (*p) {
- case '@':
- /* Skip this region, usable */
- *start = 0;
- *size = 0;
- return 0;
case '#':
case '$':
case '!':
- *start = _memparse(p + 1, &p);
+ *start = memparse(p + 1, &p);
+ return 0;
+ case '@':
+ /* memmap=nn@ss specifies a usable region and should be skipped */
+ *size = 0;
+ /* Fall through */
+ default:
+ /*
+ * If no offset is given and only a size is specified, memmap=nn[KMG]
+ * behaves the same as mem=nn[KMG]: it limits the maximum address the
+ * system can use, and the region above the limit should be avoided.
+ */
+ *start = 0;
return 0;
}
return -EINVAL;
}
-static void mem_avoid_memmap(void)
+static void mem_avoid_memmap(char *str)
{
- char arg[128];
+ static int i;
int rc;
- int i;
- char *str;
- /* See if we have any memmap areas */
- rc = cmdline_find_option("memmap", arg, sizeof(arg));
- if (rc <= 0)
+ if (i >= MAX_MEMMAP_REGIONS)
return;
- i = 0;
- str = arg;
while (str && (i < MAX_MEMMAP_REGIONS)) {
int rc;
unsigned long long start, size;
@@ -188,9 +185,14 @@ static void mem_avoid_memmap(void)
if (rc < 0)
break;
str = k;
- /* A usable region that should not be skipped */
- if (size == 0)
+
+ if (start == 0) {
+ /* Store the specified memory limit if size > 0 */
+ if (size > 0)
+ mem_limit = size;
+
continue;
+ }
mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start;
mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size;
@@ -202,6 +204,57 @@ static void mem_avoid_memmap(void)
memmap_too_large = true;
}
+static int handle_mem_memmap(void)
+{
+ char *args = (char *)get_cmd_line_ptr();
+ size_t len = strlen(args);
+ char *tmp_cmdline;
+ char *param, *val;
+ u64 mem_size;
+
+ if (!strstr(args, "memmap=") && !strstr(args, "mem="))
+ return 0;
+
+ tmp_cmdline = malloc(len + 1);
+ if (!tmp_cmdline)
+ error("Failed to allocate space for tmp_cmdline");
+
+ memcpy(tmp_cmdline, args, len);
+ tmp_cmdline[len] = 0;
+ args = tmp_cmdline;
+
+ /* Chew leading spaces */
+ args = skip_spaces(args);
+
+ while (*args) {
+ args = next_arg(args, &param, &val);
+ /* Stop at -- */
+ if (!val && strcmp(param, "--") == 0) {
+ warn("Only '--' specified in cmdline");
+ free(tmp_cmdline);
+ return -1;
+ }
+
+ if (!strcmp(param, "memmap")) {
+ mem_avoid_memmap(val);
+ } else if (!strcmp(param, "mem")) {
+ char *p = val;
+
+ if (!strcmp(p, "nopentium"))
+ continue;
+ mem_size = memparse(p, &p);
+ if (mem_size == 0) {
+ free(tmp_cmdline);
+ return -EINVAL;
+ }
+ mem_limit = mem_size;
+ }
+ }
+
+ free(tmp_cmdline);
+ return 0;
+}
+
/*
* In theory, KASLR can put the kernel anywhere in the range of [16M, 64T).
* The mem_avoid array is used to store the ranges that need to be avoided
@@ -323,7 +376,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
/* We don't need to set a mapping for setup_data. */
/* Mark the memmap regions we need to avoid */
- mem_avoid_memmap();
+ handle_mem_memmap();
#ifdef CONFIG_X86_VERBOSE_BOOTUP
/* Make sure video RAM can be used. */
@@ -432,7 +485,8 @@ static void process_e820_entry(struct boot_e820_entry *entry,
{
struct mem_vector region, overlap;
struct slot_area slot_area;
- unsigned long start_orig;
+ unsigned long start_orig, end;
+ struct boot_e820_entry cur_entry;
/* Skip non-RAM entries. */
if (entry->type != E820_TYPE_RAM)
@@ -446,8 +500,15 @@ static void process_e820_entry(struct boot_e820_entry *entry,
if (entry->addr + entry->size < minimum)
return;
- region.start = entry->addr;
- region.size = entry->size;
+ /* Ignore entries above memory limit */
+ end = min(entry->size + entry->addr, mem_limit);
+ if (entry->addr >= end)
+ return;
+ cur_entry.addr = entry->addr;
+ cur_entry.size = end - entry->addr;
+
+ region.start = cur_entry.addr;
+ region.size = cur_entry.size;
/* Give up if slot area array is full. */
while (slot_area_index < MAX_SLOT_AREA) {
@@ -461,7 +522,7 @@ static void process_e820_entry(struct boot_e820_entry *entry,
region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
/* Did we raise the address above this e820 region? */
- if (region.start > entry->addr + entry->size)
+ if (region.start > cur_entry.addr + cur_entry.size)
return;
/* Reduce size by any delta from the original address. */
@@ -564,9 +625,6 @@ void choose_random_location(unsigned long input,
{
unsigned long random_addr, min_addr;
- /* By default, keep output position unchanged. */
- *virt_addr = *output;
-
if (cmdline_find_option_bool("nokaslr")) {
warn("KASLR disabled: 'nokaslr' on cmdline.");
return;
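
The "mem=" and "memmap=" handling above relies on memparse()-style suffix parsing pulled in from lib/cmdline.c. A self-contained sketch of that behaviour (simplified to K/M/G only; the function name is invented):

#include <stdio.h>
#include <stdlib.h>

static unsigned long long memparse_sketch(const char *s, char **retp)
{
	char *end;
	unsigned long long v = strtoull(s, &end, 0);

	switch (*end) {
	case 'G': case 'g':
		v <<= 10;
		/* fall through */
	case 'M': case 'm':
		v <<= 10;
		/* fall through */
	case 'K': case 'k':
		v <<= 10;
		end++;
		break;
	default:
		break;
	}
	if (retp)
		*retp = end;
	return v;
}

int main(void)
{
	char *p;
	unsigned long long limit = memparse_sketch("512M", &p);

	printf("mem= limit: %llu bytes\n", limit);	/* 536870912 */
	return 0;
}
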
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index b3c5a5f030ce..00241c815524 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -338,7 +338,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
unsigned long output_len)
{
const unsigned long kernel_total_size = VO__end - VO__text;
- unsigned long virt_addr = (unsigned long)output;
+ unsigned long virt_addr = LOAD_PHYSICAL_ADDR;
/* Retain x86 boot parameters pointer passed from startup_32/64. */
boot_params = rmode;
@@ -390,6 +390,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
#ifdef CONFIG_X86_64
if (heap > 0x3fffffffffffUL)
error("Destination address too large");
+ if (virt_addr + max(output_len, kernel_total_size) > KERNEL_IMAGE_SIZE)
+ error("Destination virtual address is beyond the kernel mapping area");
#else
if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))
error("Destination address too large");
@@ -397,7 +399,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
#ifndef CONFIG_RELOCATABLE
if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
error("Destination address does not match LOAD_PHYSICAL_ADDR");
- if ((unsigned long)output != virt_addr)
+ if (virt_addr != LOAD_PHYSICAL_ADDR)
error("Destination virtual address changed when not relocatable");
#endif
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 1c8355eadbd1..766a5211f827 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -81,8 +81,6 @@ static inline void choose_random_location(unsigned long input,
unsigned long output_size,
unsigned long *virt_addr)
{
- /* No change from existing output location. */
- *virt_addr = *output;
}
#endif
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 1d78f1739087..28029be47fbb 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -63,7 +63,7 @@ static void *alloc_pgt_page(void *context)
static struct alloc_pgt_data pgt_data;
/* The top level page table entry pointer. */
-static unsigned long level4p;
+static unsigned long top_level_pgt;
/*
* Mapping information structure passed to kernel_ident_mapping_init().
@@ -91,9 +91,15 @@ void initialize_identity_maps(void)
* If we came here via startup_32(), cr3 will be _pgtable already
* and we must append to the existing area instead of entirely
* overwriting it.
+ *
+ * With 5-level paging, '_pgtable' is used to allocate the p4d page
+ * table; the top-level page table is allocated separately.
+ *
+ * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
+ * cases. On 4-level paging it's equal to 'top_level_pgt'.
*/
- level4p = read_cr3();
- if (level4p == (unsigned long)_pgtable) {
+ top_level_pgt = read_cr3_pa();
+ if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
debug_putstr("booted via startup_32()\n");
pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
@@ -103,7 +109,7 @@ void initialize_identity_maps(void)
pgt_data.pgt_buf = _pgtable;
pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
- level4p = (unsigned long)alloc_pgt_page(&pgt_data);
+ top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
}
}
@@ -123,7 +129,7 @@ void add_identity_map(unsigned long start, unsigned long size)
return;
/* Build the mapping. */
- kernel_ident_mapping_init(&mapping_info, (pgd_t *)level4p,
+ kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt,
start, end);
}
@@ -134,5 +140,5 @@ void add_identity_map(unsigned long start, unsigned long size)
*/
void finalize_identity_maps(void)
{
- write_cr3(level4p);
+ write_cr3(top_level_pgt);
}
diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S
index 1eb7d298b47d..15d9f74b0008 100644
--- a/arch/x86/boot/copy.S
+++ b/arch/x86/boot/copy.S
@@ -65,23 +65,3 @@ GLOBAL(copy_to_fs)
popw %es
retl
ENDPROC(copy_to_fs)
-
-#if 0 /* Not currently used, but can be enabled as needed */
-GLOBAL(copy_from_gs)
- pushw %ds
- pushw %gs
- popw %ds
- calll memcpy
- popw %ds
- retl
-ENDPROC(copy_from_gs)
-
-GLOBAL(copy_to_gs)
- pushw %es
- pushw %gs
- popw %es
- calll memcpy
- popw %es
- retl
-ENDPROC(copy_to_gs)
-#endif
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 5457b02fc050..630e3664906b 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -122,6 +122,14 @@ unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int bas
return result;
}
+long simple_strtol(const char *cp, char **endp, unsigned int base)
+{
+ if (*cp == '-')
+ return -simple_strtoull(cp + 1, endp, base);
+
+ return simple_strtoull(cp, endp, base);
+}
+
/**
* strlen - Find the length of a string
* @s: The string to be sized
diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h
index 113588ddb43f..f274a50db5fa 100644
--- a/arch/x86/boot/string.h
+++ b/arch/x86/boot/string.h
@@ -22,6 +22,7 @@ extern int strcmp(const char *str1, const char *str2);
extern int strncmp(const char *cs, const char *ct, size_t count);
extern size_t strlen(const char *s);
extern char *strstr(const char *s1, const char *s2);
+extern char *strchr(const char *s, int c);
extern size_t strnlen(const char *s, size_t maxlen);
extern unsigned int atou(const char *s);
extern unsigned long long simple_strtoull(const char *cp, char **endp,
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 34b3fa2889d1..9e32d40d71bd 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,6 +2,8 @@
# Arch-specific CryptoAPI modules.
#
+OBJECT_FILES_NON_STANDARD := y
+
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
diff --git a/arch/x86/crypto/sha1-mb/Makefile b/arch/x86/crypto/sha1-mb/Makefile
index 2f8756375df5..2e14acc3da25 100644
--- a/arch/x86/crypto/sha1-mb/Makefile
+++ b/arch/x86/crypto/sha1-mb/Makefile
@@ -2,6 +2,8 @@
# Arch-specific CryptoAPI modules.
#
+OBJECT_FILES_NON_STANDARD := y
+
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
ifeq ($(avx2_supported),yes)
diff --git a/arch/x86/crypto/sha256-mb/Makefile b/arch/x86/crypto/sha256-mb/Makefile
index 41089e7c400c..45b4fca6c4a8 100644
--- a/arch/x86/crypto/sha256-mb/Makefile
+++ b/arch/x86/crypto/sha256-mb/Makefile
@@ -2,6 +2,8 @@
# Arch-specific CryptoAPI modules.
#
+OBJECT_FILES_NON_STANDARD := y
+
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
ifeq ($(avx2_supported),yes)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 4a4c0834f965..a9a8027a6c0e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -265,7 +265,8 @@ return_from_SYSCALL_64:
* If width of "canonical tail" ever becomes variable, this will need
* to be updated to remain correct on both old and new CPUs.
*
- * Change top 16 bits to be the sign-extension of 47th bit
+ * Change top bits to match most significant bit (47th or 56th bit
+ * depending on paging mode) in the address.
*/
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
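
A small C sketch of what the shl/sar pair does: replicate the most significant implemented virtual-address bit so the value stays canonical. Pass 48 for 4-level paging (bit 47 is the top bit) or 57 for 5-level (bit 56); the function name is made up for illustration:

#include <stdint.h>
#include <stdio.h>

static uint64_t canonicalize(uint64_t addr, int va_bits)
{
	int shift = 64 - va_bits;

	/* shl then sar: replicate bit (va_bits - 1) into the upper bits. */
	return (uint64_t)(((int64_t)(addr << shift)) >> shift);
}

int main(void)
{
	/* Bit 47 set: with 48 implemented bits the upper half must be all 1s. */
	printf("%#llx\n",
	       (unsigned long long)canonicalize(0x0000800000000000ULL, 48));
	/* prints 0xffff800000000000 */
	return 0;
}
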
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 580b60f5ac83..2de0dd73830a 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1750,6 +1750,8 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
return ret;
}
+static struct attribute_group x86_pmu_attr_group;
+
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
@@ -1813,6 +1815,14 @@ static int __init init_hw_perf_events(void)
x86_pmu_events_group.attrs = tmp;
}
+ if (x86_pmu.attrs) {
+ struct attribute **tmp;
+
+ tmp = merge_attr(x86_pmu_attr_group.attrs, x86_pmu.attrs);
+ if (!WARN_ON(!tmp))
+ x86_pmu_attr_group.attrs = tmp;
+ }
+
pr_info("... version: %d\n", x86_pmu.version);
pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
pr_info("... generic registers: %d\n", x86_pmu.num_counters);
@@ -2101,8 +2111,7 @@ static int x86_pmu_event_init(struct perf_event *event)
static void refresh_pce(void *ignored)
{
- if (current->active_mm)
- load_mm_cr4(current->active_mm);
+ load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
}
static void x86_pmu_event_mapped(struct perf_event *event)
@@ -2255,7 +2264,7 @@ static struct pmu pmu = {
void arch_perf_update_userpage(struct perf_event *event,
struct perf_event_mmap_page *userpg, u64 now)
{
- struct cyc2ns_data *data;
+ struct cyc2ns_data data;
u64 offset;
userpg->cap_user_time = 0;
@@ -2267,17 +2276,17 @@ void arch_perf_update_userpage(struct perf_event *event,
if (!using_native_sched_clock() || !sched_clock_stable())
return;
- data = cyc2ns_read_begin();
+ cyc2ns_read_begin(&data);
- offset = data->cyc2ns_offset + __sched_clock_offset;
+ offset = data.cyc2ns_offset + __sched_clock_offset;
/*
* Internal timekeeping for enabled/running/stopped times
* is always in the local_clock domain.
*/
userpg->cap_user_time = 1;
- userpg->time_mult = data->cyc2ns_mul;
- userpg->time_shift = data->cyc2ns_shift;
+ userpg->time_mult = data.cyc2ns_mul;
+ userpg->time_shift = data.cyc2ns_shift;
userpg->time_offset = offset - now;
/*
@@ -2289,7 +2298,7 @@ void arch_perf_update_userpage(struct perf_event *event,
userpg->time_zero = offset;
}
- cyc2ns_read_end(data);
+ cyc2ns_read_end();
}
void
@@ -2334,7 +2343,7 @@ static unsigned long get_segment_base(unsigned int segment)
/* IRQs are off, so this synchronizes with smp_store_release */
ldt = lockless_dereference(current->active_mm->context.ldt);
- if (!ldt || idx > ldt->size)
+ if (!ldt || idx > ldt->nr_entries)
return 0;
desc = &ldt->entries[idx];
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index a6d91d4e37a1..31acf2a98394 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -431,11 +431,11 @@ static __initconst const u64 skl_hw_cache_event_ids
[ C(DTLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
- [ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */
+ [ C(RESULT_MISS) ] = 0xe08, /* DTLB_LOAD_MISSES.WALK_COMPLETED */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
- [ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */
+ [ C(RESULT_MISS) ] = 0xe49, /* DTLB_STORE_MISSES.WALK_COMPLETED */
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0x0,
@@ -3160,6 +3160,19 @@ err:
return -ENOMEM;
}
+static void flip_smm_bit(void *data)
+{
+ unsigned long set = *(unsigned long *)data;
+
+ if (set > 0) {
+ msr_set_bit(MSR_IA32_DEBUGCTLMSR,
+ DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
+ } else {
+ msr_clear_bit(MSR_IA32_DEBUGCTLMSR,
+ DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
+ }
+}
+
static void intel_pmu_cpu_starting(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
@@ -3174,6 +3187,8 @@ static void intel_pmu_cpu_starting(int cpu)
cpuc->lbr_sel = NULL;
+ flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
+
if (!cpuc->shared_regs)
return;
@@ -3595,6 +3610,52 @@ static struct attribute *hsw_events_attrs[] = {
NULL
};
+static ssize_t freeze_on_smi_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%lu\n", x86_pmu.attr_freeze_on_smi);
+}
+
+static DEFINE_MUTEX(freeze_on_smi_mutex);
+
+static ssize_t freeze_on_smi_store(struct device *cdev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long val;
+ ssize_t ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val > 1)
+ return -EINVAL;
+
+ mutex_lock(&freeze_on_smi_mutex);
+
+ if (x86_pmu.attr_freeze_on_smi == val)
+ goto done;
+
+ x86_pmu.attr_freeze_on_smi = val;
+
+ get_online_cpus();
+ on_each_cpu(flip_smm_bit, &val, 1);
+ put_online_cpus();
+done:
+ mutex_unlock(&freeze_on_smi_mutex);
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(freeze_on_smi);
+
+static struct attribute *intel_pmu_attrs[] = {
+ &dev_attr_freeze_on_smi.attr,
+ NULL,
+};
+
__init int intel_pmu_init(void)
{
union cpuid10_edx edx;
@@ -3641,6 +3702,8 @@ __init int intel_pmu_init(void)
x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
+
+ x86_pmu.attrs = intel_pmu_attrs;
/*
* Quirk: v2 perfmon does not report fixed-purpose events, so
* assume at least 3 events, when not running in a hypervisor:
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index f924629836a8..eb261656a320 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -18,7 +18,7 @@ enum {
LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME,
};
-static enum {
+static const enum {
LBR_EIP_FLAGS = 1,
LBR_TSX = 2,
} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
@@ -287,7 +287,7 @@ inline u64 lbr_from_signext_quirk_wr(u64 val)
/*
* If quirk is needed, ensure sign extension is 61 bits:
*/
-u64 lbr_from_signext_quirk_rd(u64 val)
+static u64 lbr_from_signext_quirk_rd(u64 val)
{
if (static_branch_unlikely(&lbr_from_quirk_key)) {
/*
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 758c1aa5009d..44ec523287f6 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1170,7 +1170,7 @@ static int uncore_event_cpu_online(unsigned int cpu)
pmu = type->pmus;
for (i = 0; i < type->num_boxes; i++, pmu++) {
box = pmu->boxes[pkg];
- if (!box && atomic_inc_return(&box->refcnt) == 1)
+ if (box && atomic_inc_return(&box->refcnt) == 1)
uncore_box_init(box);
}
}
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index be3d36254040..53728eea1bed 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -562,6 +562,9 @@ struct x86_pmu {
ssize_t (*events_sysfs_show)(char *page, u64 config);
struct attribute **cpu_events;
+ unsigned long attr_freeze_on_smi;
+ struct attribute **attrs;
+
/*
* CPU Hotplug hooks
*/
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index caa5798c92f4..33380b871463 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -246,19 +246,6 @@ static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
return c;
}
-/**
- * atomic_inc_short - increment of a short integer
- * @v: pointer to type int
- *
- * Atomically adds 1 to @v
- * Returns the new value of @u
- */
-static __always_inline short int atomic_inc_short(short int *v)
-{
- asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
- return *v;
-}
-
#ifdef CONFIG_X86_32
# include <asm/atomic64_32.h>
#else
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 2f77bcefe6b4..d2ff779f347e 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -74,7 +74,7 @@ struct efi_scratch {
__kernel_fpu_begin(); \
\
if (efi_scratch.use_pgd) { \
- efi_scratch.prev_cr3 = read_cr3(); \
+ efi_scratch.prev_cr3 = __read_cr3(); \
write_cr3((unsigned long)efi_scratch.efi_pgt); \
__flush_tlb_all(); \
} \
diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h
index b8ad261d11dc..c66d19e3c23e 100644
--- a/arch/x86/include/asm/extable.h
+++ b/arch/x86/include/asm/extable.h
@@ -29,6 +29,7 @@ struct pt_regs;
} while (0)
extern int fixup_exception(struct pt_regs *regs, int trapnr);
+extern int fixup_bug(struct pt_regs *regs, int trapnr);
extern bool ex_has_fault_handler(unsigned long ip);
extern void early_fixup_exception(struct pt_regs *regs, int trapnr);
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 59405a248fc2..9b76cd331990 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -22,8 +22,8 @@ typedef struct {
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
- unsigned int irq_tlb_count;
#endif
+ unsigned int irq_tlb_count;
#ifdef CONFIG_X86_THERMAL_VECTOR
unsigned int irq_thermal_count;
#endif
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 055962615779..722d0e568863 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -296,6 +296,7 @@ struct x86_emulate_ctxt {
bool perm_ok; /* do not check permissions if true */
bool ud; /* inject an #UD if host doesn't support insn */
+ bool tf; /* TF value before instruction (after for syscall/sysret) */
bool have_exception;
struct x86_exception exception;
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index f9813b6d8b80..79b647a7ebd0 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -37,12 +37,6 @@ typedef struct {
#endif
} mm_context_t;
-#ifdef CONFIG_SMP
void leave_mm(int cpu);
-#else
-static inline void leave_mm(int cpu)
-{
-}
-#endif
#endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 68b329d77b3a..ecfcb6643c9b 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -47,7 +47,7 @@ struct ldt_struct {
* allocations, but it's not worth trying to optimize.
*/
struct desc_struct *entries;
- unsigned int size;
+ unsigned int nr_entries;
};
/*
@@ -87,22 +87,46 @@ static inline void load_mm_ldt(struct mm_struct *mm)
*/
if (unlikely(ldt))
- set_ldt(ldt->entries, ldt->size);
+ set_ldt(ldt->entries, ldt->nr_entries);
else
clear_LDT();
#else
clear_LDT();
#endif
+}
+
+static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
+{
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ /*
+ * Load the LDT if either the old or new mm had an LDT.
+ *
+ * An mm will never go from having an LDT to not having an LDT. Two
+ * mms never share an LDT, so we don't gain anything by checking to
+ * see whether the LDT changed. There's also no guarantee that
+ * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
+ * then prev->context.ldt will also be non-NULL.
+ *
+ * If we really cared, we could optimize the case where prev == next
+ * and we're exiting lazy mode. Most of the time, if this happens,
+ * we don't actually need to reload LDTR, but modify_ldt() is mostly
+ * used by legacy code and emulators where we don't need this level of
+ * performance.
+ *
+ * This uses | instead of || because it generates better code.
+ */
+ if (unlikely((unsigned long)prev->context.ldt |
+ (unsigned long)next->context.ldt))
+ load_mm_ldt(next);
+#endif
DEBUG_LOCKS_WARN_ON(preemptible());
}
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
-#ifdef CONFIG_SMP
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
-#endif
}
static inline int init_new_context(struct task_struct *tsk,
@@ -220,18 +244,6 @@ static inline int vma_pkey(struct vm_area_struct *vma)
}
#endif
-static inline bool __pkru_allows_pkey(u16 pkey, bool write)
-{
- u32 pkru = read_pkru();
-
- if (!__pkru_allows_read(pkru, pkey))
- return false;
- if (write && !__pkru_allows_write(pkru, pkey))
- return false;
-
- return true;
-}
-
/*
* We only want to enforce protection keys on the current process
* because we effectively have no access to PKRU for other
@@ -268,4 +280,23 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return __pkru_allows_pkey(vma_pkey(vma), write);
}
+
+/*
+ * This can be used from process context to figure out what the value of
+ * CR3 is without needing to do a (slow) __read_cr3().
+ *
+ * It's intended to be used for code like KVM that sneakily changes CR3
+ * and needs to restore it. It needs to be used very carefully.
+ */
+static inline unsigned long __get_current_cr3_fast(void)
+{
+ unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
+
+ /* For now, be very restrictive about when this can be called. */
+ VM_WARN_ON(in_nmi() || !in_atomic());
+
+ VM_BUG_ON(cr3 != __read_cr3());
+ return cr3;
+}
+
#endif /* _ASM_X86_MMU_CONTEXT_H */
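
A tiny illustration of the "| instead of ||" note in switch_ldt() above: OR-ing the two pointer values gives a single non-zero test, so the compiler emits one branch rather than two short-circuited ones. User-space sketch with invented names:

#include <stddef.h>
#include <stdio.h>

static void reload_if_either(void *prev_ldt, void *next_ldt)
{
	/* One test, one branch -- equivalent to (prev_ldt || next_ldt). */
	if ((unsigned long)prev_ldt | (unsigned long)next_ldt)
		puts("reload LDT");
}

int main(void)
{
	reload_if_either(NULL, (void *)0x1000);	/* reloads */
	reload_if_either(NULL, NULL);		/* skipped */
	return 0;
}
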
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index fba100713924..d5acc27ed1cc 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -2,8 +2,7 @@
#define _ASM_X86_MSHYPER_H
#include <linux/types.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
+#include <linux/atomic.h>
#include <asm/hyperv.h>
/*
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 673f9ac50f6d..18b162322eff 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -137,6 +137,8 @@
#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9)
#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10)
#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11)
+#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
+#define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
#define MSR_PEBS_FRONTEND 0x000003f7
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 55fa56fe4e45..9ccac1926587 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x)
PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
}
-static inline unsigned long read_cr3(void)
+static inline unsigned long __read_cr3(void)
{
return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
}
@@ -118,7 +118,7 @@ static inline u64 paravirt_read_msr(unsigned msr)
static inline void paravirt_write_msr(unsigned msr,
unsigned low, unsigned high)
{
- return PVOP_VCALL3(pv_cpu_ops.write_msr, msr, low, high);
+ PVOP_VCALL3(pv_cpu_ops.write_msr, msr, low, high);
}
static inline u64 paravirt_read_msr_safe(unsigned msr, int *err)
@@ -312,11 +312,9 @@ static inline void __flush_tlb_single(unsigned long addr)
}
static inline void flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
+ const struct flush_tlb_info *info)
{
- PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end);
+ PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info);
}
static inline int paravirt_pgd_alloc(struct mm_struct *mm)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 7465d6fe336f..cb976bab6299 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -51,6 +51,7 @@ struct mm_struct;
struct desc_struct;
struct task_struct;
struct cpumask;
+struct flush_tlb_info;
/*
* Wrapper type for pointers to code which uses the non-standard
@@ -223,9 +224,7 @@ struct pv_mmu_ops {
void (*flush_tlb_kernel)(void);
void (*flush_tlb_single)(unsigned long addr);
void (*flush_tlb_others)(const struct cpumask *cpus,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end);
+ const struct flush_tlb_info *info);
/* Hooks for allocating and freeing a pagetable top-level */
int (*pgd_alloc)(struct mm_struct *mm);
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 50d35e3185f5..c8821bab938f 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -212,4 +212,51 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
+#define gup_get_pte gup_get_pte
+/*
+ * WARNING: only to be used in the get_user_pages_fast() implementation.
+ *
+ * With get_user_pages_fast(), we walk down the pagetables without taking
+ * any locks. For this we would like to load the pointers atomically,
+ * but that is not possible (without expensive cmpxchg8b) on PAE. What
+ * we do have is the guarantee that a PTE will only either go from not
+ * present to present, or present to not present or both -- it will not
+ * switch to a completely different present page without a TLB flush in
+ * between; something that we are blocking by holding interrupts off.
+ *
+ * Setting ptes from not present to present goes:
+ *
+ * ptep->pte_high = h;
+ * smp_wmb();
+ * ptep->pte_low = l;
+ *
+ * And present to not present goes:
+ *
+ * ptep->pte_low = 0;
+ * smp_wmb();
+ * ptep->pte_high = 0;
+ *
+ * We must ensure here that the load of pte_low sees 'l' iff pte_high
+ * sees 'h'. We load pte_high *after* loading pte_low, which ensures we
+ * don't see an older value of pte_high. *Then* we recheck pte_low,
+ * which ensures that we haven't picked up a changed pte high. We might
+ * have gotten rubbish values from pte_low and pte_high, but we are
+ * guaranteed that pte_low will not have the present bit set *unless*
+ * it is 'l'. Because get_user_pages_fast() only operates on present ptes
+ * we're safe.
+ */
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+ pte_t pte;
+
+ do {
+ pte.pte_low = ptep->pte_low;
+ smp_rmb();
+ pte.pte_high = ptep->pte_high;
+ smp_rmb();
+ } while (unlikely(pte.pte_low != ptep->pte_low));
+
+ return pte;
+}
+
#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
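The ordering argument in the comment above assumes the store side writes the two pte halves in the opposite order; the pre-existing PAE set/clear helpers in this header do exactly that, roughly as sketched below (shown for reference, not added by this patch; names and signatures are simplified):

/* Simplified store side relied upon by gup_get_pte() above. */
static inline void example_set_pte_present(pte_t *ptep, pte_t pte)
{
	ptep->pte_high = pte.pte_high;	/* publish 'h' first ...                */
	smp_wmb();
	ptep->pte_low = pte.pte_low;	/* ... then the half with _PAGE_PRESENT */
}

static inline void example_clear_pte(pte_t *ptep)
{
	ptep->pte_low = 0;		/* drop the present half first */
	smp_wmb();
	ptep->pte_high = 0;
}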
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f5af95a0c6b8..77037b6f1caa 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -244,6 +244,11 @@ static inline int pud_devmap(pud_t pud)
return 0;
}
#endif
+
+static inline int pgd_devmap(pgd_t pgd)
+{
+ return 0;
+}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -917,7 +922,7 @@ extern pgd_t trampoline_pgd_entry;
static inline void __meminit init_trampoline_default(void)
{
/* Default trampoline pgd value */
- trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)];
+ trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
}
# ifdef CONFIG_RANDOMIZE_MEMORY
void __meminit init_trampoline(void);
@@ -1185,6 +1190,54 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags)
#endif
}
+static inline bool __pkru_allows_pkey(u16 pkey, bool write)
+{
+ u32 pkru = read_pkru();
+
+ if (!__pkru_allows_read(pkru, pkey))
+ return false;
+ if (write && !__pkru_allows_write(pkru, pkey))
+ return false;
+
+ return true;
+}
+
+/*
+ * 'pteval' can come from a PTE, PMD or PUD. We only check
+ * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
+ * same value on all 3 types.
+ */
+static inline bool __pte_access_permitted(unsigned long pteval, bool write)
+{
+ unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
+
+ if (write)
+ need_pte_bits |= _PAGE_RW;
+
+ if ((pteval & need_pte_bits) != need_pte_bits)
+ return 0;
+
+ return __pkru_allows_pkey(pte_flags_pkey(pteval), write);
+}
+
+#define pte_access_permitted pte_access_permitted
+static inline bool pte_access_permitted(pte_t pte, bool write)
+{
+ return __pte_access_permitted(pte_val(pte), write);
+}
+
+#define pmd_access_permitted pmd_access_permitted
+static inline bool pmd_access_permitted(pmd_t pmd, bool write)
+{
+ return __pte_access_permitted(pmd_val(pmd), write);
+}
+
+#define pud_access_permitted pud_access_permitted
+static inline bool pud_access_permitted(pud_t pud, bool write)
+{
+ return __pte_access_permitted(pud_val(pud), write);
+}
+
#include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */
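A sketch of how a lockless fast-GUP walker can combine gup_get_pte() (defined for PAE above; a plain load on 64-bit) with the new *_access_permitted() helpers; the function and variable names here are hypothetical:

/* Hypothetical fast-GUP step, for illustration only. */
static int example_gup_pte(pte_t *ptep, bool write, struct page **pagep)
{
	pte_t pte = gup_get_pte(ptep);		/* torn-read safe on PAE  */

	if (!pte_access_permitted(pte, write))	/* present/RW/pkey checks */
		return 0;			/* punt to the slow path  */

	*pagep = pte_page(pte);
	return 1;
}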
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 9991224f6238..2160c1fee920 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -14,15 +14,17 @@
#include <linux/bitops.h>
#include <linux/threads.h>
+extern p4d_t level4_kernel_pgt[512];
+extern p4d_t level4_ident_pgt[512];
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512];
extern pte_t level1_fixmap_pgt[512];
-extern pgd_t init_level4_pgt[];
+extern pgd_t init_top_pgt[];
-#define swapper_pg_dir init_level4_pgt
+#define swapper_pg_dir init_top_pgt
extern void paging_init(void);
@@ -227,6 +229,20 @@ extern void cleanup_highmap(void);
extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
-#endif /* !__ASSEMBLY__ */
+#define gup_fast_permitted gup_fast_permitted
+static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
+ int write)
+{
+ unsigned long len, end;
+
+ len = (unsigned long)nr_pages << PAGE_SHIFT;
+ end = start + len;
+ if (end < start)
+ return false;
+ if (end >> __VIRTUAL_MASK_SHIFT)
+ return false;
+ return true;
+}
+#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */
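gup_fast_permitted() is the hook the generic fast-GUP entry point consults before walking anything; the calling pattern is roughly the sketch below (the walk itself is elided and the function name is hypothetical):

static int example_fast_gup(unsigned long start, int nr_pages, int write,
			    struct page **pages)
{
	unsigned long flags;
	int nr = 0;

	if (!gup_fast_permitted(start, nr_pages, write))
		return 0;		/* not a plausible user range */

	local_irq_save(flags);		/* holds off TLB-flush IPIs   */
	/* ... walk pgd/p4d/pud/pmd/pte and fill pages[], bumping nr ... */
	local_irq_restore(flags);

	return nr;
}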
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 39fb618e2211..79aa2f98398d 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -8,4 +8,40 @@
#else
#define X86_VM_MASK 0 /* No VM86 support */
#endif
+
+/*
+ * CR3's layout varies depending on several things.
+ *
+ * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
+ * If PAE is enabled, then CR3[11:5] is part of the PDPT address
+ * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
+ * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
+ * CR3[2:0] and CR3[11:5] are ignored.
+ *
+ * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
+ *
+ * CR3[63] is always read as zero. If CR4.PCIDE is set, then CR3[63] may be
+ * written as 1 to prevent the write to CR3 from flushing the TLB.
+ *
+ * On systems with SME, one bit (in a variable position!) is stolen to indicate
+ * that the top-level paging structure is encrypted.
+ *
+ * All of the remaining bits indicate the physical address of the top-level
+ * paging structure.
+ *
+ * CR3_ADDR_MASK is the mask used by read_cr3_pa().
+ */
+#ifdef CONFIG_X86_64
+/* Mask off the address space ID bits. */
+#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
+#define CR3_PCID_MASK 0xFFFull
+#else
+/*
+ * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
+ * a tiny bit of code size by setting all the bits.
+ */
+#define CR3_ADDR_MASK 0xFFFFFFFFull
+#define CR3_PCID_MASK 0ull
+#endif
+
#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
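A worked example of the masks with a hypothetical 64-bit value and CR4.PCIDE=1; read_cr3_pa(), added to processor.h just below, applies CR3_ADDR_MASK for exactly this purpose:

/*
 *	cr3 = 0x800000012345f001
 *	cr3 & CR3_ADDR_MASK = 0x12345f000   top-level page-table address
 *	cr3 & CR3_PCID_MASK = 0x001         address space ID (PCID)
 *	bit 63 set                          this write does not flush the TLB
 */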
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3cada998a402..2e1696294af5 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -231,6 +231,14 @@ native_cpuid_reg(ebx)
native_cpuid_reg(ecx)
native_cpuid_reg(edx)
+/*
+ * Friendlier CR3 helpers.
+ */
+static inline unsigned long read_cr3_pa(void)
+{
+ return __read_cr3() & CR3_ADDR_MASK;
+}
+
static inline void load_cr3(pgd_t *pgdir)
{
write_cr3(__pa(pgdir));
@@ -860,8 +868,6 @@ extern unsigned long KSTK_ESP(struct task_struct *task);
#endif /* CONFIG_X86_64 */
-extern unsigned long thread_saved_pc(struct task_struct *tsk);
-
extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
unsigned long new_sp);
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 12af3e35edfa..9efaabf5b54b 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val)
asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
}
-static inline unsigned long native_read_cr3(void)
+static inline unsigned long __native_read_cr3(void)
{
unsigned long val;
asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
@@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x)
native_write_cr2(x);
}
-static inline unsigned long read_cr3(void)
+/*
+ * Careful! CR3 contains more than just an address. You probably want
+ * read_cr3_pa() instead.
+ */
+static inline unsigned long __read_cr3(void)
{
- return native_read_cr3();
+ return __native_read_cr3();
}
static inline void write_cr3(unsigned long x)
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 27e9f9d769b8..2016962103df 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -29,11 +29,9 @@ struct cyc2ns_data {
u32 cyc2ns_mul;
u32 cyc2ns_shift;
u64 cyc2ns_offset;
- u32 __count;
- /* u32 hole */
-}; /* 24 bytes -- do not grow */
+}; /* 16 bytes */
-extern struct cyc2ns_data *cyc2ns_read_begin(void);
-extern void cyc2ns_read_end(struct cyc2ns_data *);
+extern void cyc2ns_read_begin(struct cyc2ns_data *);
+extern void cyc2ns_read_end(void);
#endif /* _ASM_X86_TIMER_H */
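The reworked interface copies a snapshot into a caller-provided buffer instead of returning a pointer into the per-CPU data, so there is no reference count to drop in cyc2ns_read_end(); a usage sketch, mirroring cycles_2_ns() in the tsc.c hunk further down (the wrapper name is hypothetical):

static inline u64 example_cycles_to_ns(u64 cyc)
{
	struct cyc2ns_data data;
	u64 ns;

	cyc2ns_read_begin(&data);	/* consistent mul/shift/offset snapshot */
	ns = data.cyc2ns_offset +
	     mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
	cyc2ns_read_end();		/* just re-enables preemption now */

	return ns;
}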
diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h
new file mode 100644
index 000000000000..f4a6ff352a0e
--- /dev/null
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -0,0 +1,14 @@
+#ifndef _ARCH_X86_TLBBATCH_H
+#define _ARCH_X86_TLBBATCH_H
+
+#include <linux/cpumask.h>
+
+struct arch_tlbflush_unmap_batch {
+ /*
+ * Each bit set is a CPU that potentially has a TLB entry for one of
+ * the PFNs being flushed.
+ */
+ struct cpumask cpumask;
+};
+
+#endif /* _ARCH_X86_TLBBATCH_H */
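This batch is driven by the two helpers added to tlbflush.h below, arch_tlbbatch_add_mm() and arch_tlbbatch_flush(): CPUs are accumulated per unmapped mm and then flushed in one go. Roughly (the caller shown here is hypothetical; the real users live in the core mm unmap path):

static void example_unmap_batch_flush(struct mm_struct **mms, int nr)
{
	struct arch_tlbflush_unmap_batch batch;
	int i;

	cpumask_clear(&batch.cpumask);

	for (i = 0; i < nr; i++)
		arch_tlbbatch_add_mm(&batch, mms[i]);	/* OR in mm_cpumask(mm) */

	arch_tlbbatch_flush(&batch);	/* one IPI round for all of them */
}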
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6ed9ea469b48..50ea3482e1d1 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -7,6 +7,7 @@
#include <asm/processor.h>
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
+#include <asm/smp.h>
static inline void __invpcid(unsigned long pcid, unsigned long addr,
unsigned long type)
@@ -65,10 +66,14 @@ static inline void invpcid_flush_all_nonglobals(void)
#endif
struct tlb_state {
-#ifdef CONFIG_SMP
- struct mm_struct *active_mm;
+ /*
+ * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
+ * are on. This means that it may not match current->active_mm,
+ * which will contain the previous user mm when we're in lazy TLB
+ * mode even if we've already switched back to swapper_pg_dir.
+ */
+ struct mm_struct *loaded_mm;
int state;
-#endif
/*
* Access to this CR4 shadow and to H/W CR4 is protected by
@@ -151,7 +156,7 @@ static inline void __native_flush_tlb(void)
* back:
*/
preempt_disable();
- native_write_cr3(native_read_cr3());
+ native_write_cr3(__native_read_cr3());
preempt_enable();
}
@@ -220,84 +225,16 @@ static inline void __flush_tlb_one(unsigned long addr)
* - flush_tlb_page(vma, vmaddr) flushes one page
* - flush_tlb_range(vma, start, end) flushes a range of pages
* - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
- * - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus
+ * - flush_tlb_others(cpumask, info) flushes TLBs on other cpus
*
* ..but the i386 has somewhat limited tlb flushing capabilities,
* and page-granular flushes are available only on i486 and up.
*/
-
-#ifndef CONFIG_SMP
-
-/* "_up" is for UniProcessor.
- *
- * This is a helper for other header functions. *Not* intended to be called
- * directly. All global TLB flushes need to either call this, or to bump the
- * vm statistics themselves.
- */
-static inline void __flush_tlb_up(void)
-{
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- __flush_tlb();
-}
-
-static inline void flush_tlb_all(void)
-{
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- __flush_tlb_all();
-}
-
-static inline void local_flush_tlb(void)
-{
- __flush_tlb_up();
-}
-
-static inline void flush_tlb_mm(struct mm_struct *mm)
-{
- if (mm == current->active_mm)
- __flush_tlb_up();
-}
-
-static inline void flush_tlb_page(struct vm_area_struct *vma,
- unsigned long addr)
-{
- if (vma->vm_mm == current->active_mm)
- __flush_tlb_one(addr);
-}
-
-static inline void flush_tlb_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
-{
- if (vma->vm_mm == current->active_mm)
- __flush_tlb_up();
-}
-
-static inline void flush_tlb_mm_range(struct mm_struct *mm,
- unsigned long start, unsigned long end, unsigned long vmflag)
-{
- if (mm == current->active_mm)
- __flush_tlb_up();
-}
-
-static inline void native_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
-{
-}
-
-static inline void reset_lazy_tlbstate(void)
-{
-}
-
-static inline void flush_tlb_kernel_range(unsigned long start,
- unsigned long end)
-{
- flush_tlb_all();
-}
-
-#else /* SMP */
-
-#include <asm/smp.h>
+struct flush_tlb_info {
+ struct mm_struct *mm;
+ unsigned long start;
+ unsigned long end;
+};
#define local_flush_tlb() __flush_tlb()
@@ -307,29 +244,32 @@ static inline void flush_tlb_kernel_range(unsigned long start,
flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
extern void flush_tlb_all(void);
-extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
+static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
+{
+ flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
+}
+
void native_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long start, unsigned long end);
+ const struct flush_tlb_info *info);
#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2
-static inline void reset_lazy_tlbstate(void)
+static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm)
{
- this_cpu_write(cpu_tlbstate.state, 0);
- this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
+ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
}
-#endif /* SMP */
+extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
#ifndef CONFIG_PARAVIRT
-#define flush_tlb_others(mask, mm, start, end) \
- native_flush_tlb_others(mask, mm, start, end)
+#define flush_tlb_others(mask, info) \
+ native_flush_tlb_others(mask, info)
#endif
#endif /* _ASM_X86_TLBFLUSH_H */
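With the signature change, callers pack the mm/start/end triple into one struct flush_tlb_info and hand the same object to every remote CPU; this is roughly what flush_tlb_mm_range() ends up doing in arch/x86/mm/tlb.c after this series (a sketch, details omitted):

static void example_flush_user_range(struct mm_struct *mm,
				     unsigned long start, unsigned long end)
{
	struct flush_tlb_info info = {
		.mm	= mm,
		.start	= start,
		.end	= end,
	};

	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), &info);
}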
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 6686820feae9..b5a32231abd8 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_UV_UV_H
#define _ASM_X86_UV_UV_H
+#include <asm/tlbflush.h>
+
enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
struct cpumask;
@@ -15,10 +17,7 @@ extern void uv_cpu_init(void);
extern void uv_nmi_init(void);
extern void uv_system_init(void);
extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
- unsigned int cpu);
+ const struct flush_tlb_info *info);
#else /* X86_UV */
@@ -28,8 +27,8 @@ static inline int is_uv_hubless(void) { return 0; }
static inline void uv_cpu_init(void) { }
static inline void uv_system_init(void) { }
static inline const struct cpumask *
-uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
- unsigned long start, unsigned long end, unsigned int cpu)
+uv_flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
{ return cpumask; }
#endif /* X86_UV */
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index 432df4b1baec..f4fef5a24ebd 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -34,16 +34,10 @@
#define HV_X64_MSR_REFERENCE_TSC 0x40000021
/*
- * There is a single feature flag that signifies the presence of the MSR
- * that can be used to retrieve both the local APIC Timer frequency as
- * well as the TSC frequency.
+ * There is a single feature flag that signifies if the partition has access
+ * to MSRs with local APIC and TSC frequencies.
*/
-
-/* Local APIC timer frequency MSR (HV_X64_MSR_APIC_FREQUENCY) is available */
-#define HV_X64_MSR_APIC_FREQUENCY_AVAILABLE (1 << 11)
-
-/* TSC frequency MSR (HV_X64_MSR_TSC_FREQUENCY) is available */
-#define HV_X64_MSR_TSC_FREQUENCY_AVAILABLE (1 << 11)
+#define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11)
/*
* Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
@@ -73,6 +67,9 @@
*/
#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8)
+/* Frequency MSRs available */
+#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE (1 << 8)
+
/* Crash MSR available */
#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE (1 << 10)
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 567de50a4c2a..185f3d10c194 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -104,6 +104,8 @@
#define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT)
#define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */
#define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT)
+#define X86_CR4_LA57_BIT 12 /* enable 5-level page tables */
+#define X86_CR4_LA57 _BITUL(X86_CR4_LA57_BIT)
#define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */
#define X86_CR4_VMXE _BITUL(X86_CR4_VMXE_BIT)
#define X86_CR4_SMXE_BIT 14 /* enable safer mode (TXT) */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4b994232cb57..a01892bdd61a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -18,6 +18,7 @@ CFLAGS_REMOVE_pvclock.o = -pg
CFLAGS_REMOVE_kvmclock.o = -pg
CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
+CFLAGS_REMOVE_head64.o = -pg
endif
KASAN_SANITIZE_head$(BITS).o := n
@@ -29,6 +30,7 @@ OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_test_nx.o := y
+OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y
# If instrumentation of this dir is enabled, boot hangs during first second.
# Probably could be more selective here, but note that files related to irqs,
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 26b78d86f25a..85a9e17e0dbc 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,3 +1,5 @@
+OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y
+
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o
diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c
index ae50d3454d78..81ff48905623 100644
--- a/arch/x86/kernel/apic/htirq.c
+++ b/arch/x86/kernel/apic/htirq.c
@@ -150,7 +150,7 @@ static const struct irq_domain_ops htirq_domain_ops = {
.deactivate = htirq_domain_deactivate,
};
-void arch_init_htirq_domain(struct irq_domain *parent)
+void __init arch_init_htirq_domain(struct irq_domain *parent)
{
if (disable_apic)
return;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 347bb9f65737..247880fc29f9 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1200,28 +1200,6 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
static struct irq_chip ioapic_chip, ioapic_ir_chip;
-#ifdef CONFIG_X86_32
-static inline int IO_APIC_irq_trigger(int irq)
-{
- int apic, idx, pin;
-
- for_each_ioapic_pin(apic, pin) {
- idx = find_irq_entry(apic, pin, mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin, 0)))
- return irq_trigger(idx);
- }
- /*
- * nonexistent IRQs are edge default
- */
- return 0;
-}
-#else
-static inline int IO_APIC_irq_trigger(int irq)
-{
- return 1;
-}
-#endif
-
static void __init setup_IO_APIC_irqs(void)
{
unsigned int ioapic, pin;
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index c61aec7e65f4..4c5d1882bec5 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -136,7 +136,7 @@ static struct msi_domain_info pci_msi_domain_info = {
.handler_name = "edge",
};
-void arch_init_msi_domain(struct irq_domain *parent)
+void __init arch_init_msi_domain(struct irq_domain *parent)
{
if (disable_apic)
return;
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index f3557a1eb562..e66d8e48e456 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -405,7 +405,7 @@ int __init arch_probe_nr_irqs(void)
}
#ifdef CONFIG_X86_IO_APIC
-static void init_legacy_irqs(void)
+static void __init init_legacy_irqs(void)
{
int i, node = cpu_to_node(0);
struct apic_chip_data *data;
@@ -424,7 +424,7 @@ static void init_legacy_irqs(void)
}
}
#else
-static void init_legacy_irqs(void) { }
+static inline void init_legacy_irqs(void) { }
#endif
int __init arch_early_irq_init(void)
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index f5af0cc7eb0d..9257bd9dc664 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -856,11 +856,13 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
dentry = kernfs_mount(fs_type, flags, rdt_root,
RDTGROUP_SUPER_MAGIC, NULL);
if (IS_ERR(dentry))
- goto out_cdp;
+ goto out_destroy;
static_branch_enable(&rdt_enable_key);
goto out;
+out_destroy:
+ kernfs_remove(kn_info);
out_cdp:
cdp_disable();
out:
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index e9f4d762aa5b..21b185793c80 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -251,7 +251,7 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
#endif
}
-void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret)
+static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret)
{
struct ucode_cpu_info *uci;
struct cpio_data cp;
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index e53d3c909840..9cb98ee103db 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -290,6 +290,17 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa)
return (struct cpio_data){ NULL, 0, "" };
if (initrd_start)
start = initrd_start;
+ } else {
+ /*
+ * The picture with physical addresses is a bit different: we
+ * need to get the *physical* address to which the ramdisk was
+ * relocated, i.e., relocated_ramdisk (not initrd_start) and
+ * since we're running from physical addresses, we need to access
+ * relocated_ramdisk through its *physical* address too.
+ */
+ u64 *rr = (u64 *)__pa_nodebug(&relocated_ramdisk);
+ if (*rr)
+ start = *rr;
}
return find_cpio_data(path, (void *)start, size, NULL);
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index f522415bf9e5..59edbe9d4ccb 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -42,7 +42,7 @@
static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
/* Current microcode patch used in early patching on the APs. */
-struct microcode_intel *intel_ucode_patch;
+static struct microcode_intel *intel_ucode_patch;
static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
unsigned int s2, unsigned int p2)
@@ -166,7 +166,7 @@ static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size)
static void save_microcode_patch(void *data, unsigned int size)
{
struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
- struct ucode_patch *iter, *tmp, *p;
+ struct ucode_patch *iter, *tmp, *p = NULL;
bool prev_found = false;
unsigned int sig, pf;
@@ -202,6 +202,18 @@ static void save_microcode_patch(void *data, unsigned int size)
else
list_add_tail(&p->plist, &microcode_cache);
}
+
+ /*
+ * Save for early loading. On 32-bit, that needs to be a physical
+ * address as the APs are running from physical addresses, before
+ * paging has been enabled.
+ */
+ if (p) {
+ if (IS_ENABLED(CONFIG_X86_32))
+ intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data);
+ else
+ intel_ucode_patch = p->data;
+ }
}
static int microcode_sanity_check(void *mc, int print_err)
@@ -607,6 +619,14 @@ int __init save_microcode_in_initrd_intel(void)
struct ucode_cpu_info uci;
struct cpio_data cp;
+ /*
+ * initrd is going away, clear patch ptr. We will scan the microcode one
+ * last time before jettisoning and save a patch, if found. Then we will
+ * update that pointer too, with a stable patch address to use when
+ * resuming the cores.
+ */
+ intel_ucode_patch = NULL;
+
if (!load_builtin_intel_microcode(&cp))
cp = find_microcode_in_initrd(ucode_path, false);
@@ -619,9 +639,6 @@ int __init save_microcode_in_initrd_intel(void)
show_saved_mc();
- /* initrd is going away, clear patch ptr. */
- intel_ucode_patch = NULL;
-
return 0;
}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 04cb8d34ccb8..70e717fccdd6 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -161,6 +161,15 @@ static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
}
#endif
+static unsigned long hv_get_tsc_khz(void)
+{
+ unsigned long freq;
+
+ rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq);
+
+ return freq / 1000;
+}
+
static void __init ms_hyperv_init_platform(void)
{
int hv_host_info_eax;
@@ -193,8 +202,15 @@ static void __init ms_hyperv_init_platform(void)
hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF);
}
+ if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
+ x86_platform.calibrate_tsc = hv_get_tsc_khz;
+ x86_platform.calibrate_cpu = hv_get_tsc_khz;
+ }
+
#ifdef CONFIG_X86_LOCAL_APIC
- if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) {
+ if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
/*
* Get the APIC frequency.
*/
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 8e598a1ad986..6b91e2eb8d3f 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -125,7 +125,7 @@ void __init init_espfix_bsp(void)
p4d_t *p4d;
/* Install the espfix pud into the kernel page directory */
- pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+ pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
p4d_populate(&init_mm, p4d, espfix_pud_page);
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 43b7002f44fb..46c3c73e7f43 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -33,17 +33,120 @@
/*
* Manage page tables very early on.
*/
-extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pgd_t early_top_pgt[PTRS_PER_PGD];
extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
-static unsigned int __initdata next_early_pgt = 2;
+static unsigned int __initdata next_early_pgt;
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
+#define __head __section(.head.text)
+
+static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
+{
+ return ptr - (void *)_text + (void *)physaddr;
+}
+
+void __head __startup_64(unsigned long physaddr)
+{
+ unsigned long load_delta, *p;
+ pgdval_t *pgd;
+ p4dval_t *p4d;
+ pudval_t *pud;
+ pmdval_t *pmd, pmd_entry;
+ int i;
+
+ /* Is the address too large? */
+ if (physaddr >> MAX_PHYSMEM_BITS)
+ for (;;);
+
+ /*
+ * Compute the delta between the address I am compiled to run at
+ * and the address I am actually running at.
+ */
+ load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
+
+ /* Is the address not 2M aligned? */
+ if (load_delta & ~PMD_PAGE_MASK)
+ for (;;);
+
+ /* Fixup the physical addresses in the page table */
+
+ pgd = fixup_pointer(&early_top_pgt, physaddr);
+ pgd[pgd_index(__START_KERNEL_map)] += load_delta;
+
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
+ p4d[511] += load_delta;
+ }
+
+ pud = fixup_pointer(&level3_kernel_pgt, physaddr);
+ pud[510] += load_delta;
+ pud[511] += load_delta;
+
+ pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
+ pmd[506] += load_delta;
+
+ /*
+ * Set up the identity mapping for the switchover. These
+ * entries should *NOT* have the global bit set! This also
+ * creates a bunch of nonsense entries but that is fine --
+ * it avoids problems around wraparound.
+ */
+
+ pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+ pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+
+ i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+ pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE;
+ pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE;
+
+ i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
+ p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
+ p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+ } else {
+ i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+ pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
+ pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+ }
+
+ i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
+ pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE;
+ pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE;
+
+ pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
+ pmd_entry += physaddr;
+
+ for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
+ int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD;
+ pmd[idx] = pmd_entry + i * PMD_SIZE;
+ }
+
+ /*
+ * Fixup the kernel text+data virtual addresses. Note that
+ * we might write invalid pmds, when the kernel is relocated
+ * cleanup_highmap() fixes this up along with the mappings
+ * beyond _end.
+ */
+
+ pmd = fixup_pointer(level2_kernel_pgt, physaddr);
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (pmd[i] & _PAGE_PRESENT)
+ pmd[i] += load_delta;
+ }
+
+ /* Fixup phys_base */
+ p = fixup_pointer(&phys_base, physaddr);
+ *p += load_delta;
+}
+
/* Wipe all early page tables except for the kernel symbol map */
static void __init reset_early_page_tables(void)
{
- memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
+ memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
next_early_pgt = 0;
- write_cr3(__pa_nodebug(early_level4_pgt));
+ write_cr3(__pa_nodebug(early_top_pgt));
}
/* Create a new PMD entry */
@@ -51,15 +154,16 @@ int __init early_make_pgtable(unsigned long address)
{
unsigned long physaddr = address - __PAGE_OFFSET;
pgdval_t pgd, *pgd_p;
+ p4dval_t p4d, *p4d_p;
pudval_t pud, *pud_p;
pmdval_t pmd, *pmd_p;
/* Invalid address or early pgt is done ? */
- if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
+ if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
return -1;
again:
- pgd_p = &early_level4_pgt[pgd_index(address)].pgd;
+ pgd_p = &early_top_pgt[pgd_index(address)].pgd;
pgd = *pgd_p;
/*
@@ -67,8 +171,25 @@ again:
* critical -- __PAGE_OFFSET would point us back into the dynamic
* range and we might end up looping forever...
*/
- if (pgd)
- pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ p4d_p = pgd_p;
+ else if (pgd)
+ p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++];
+ memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
+ *pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ p4d_p += p4d_index(address);
+ p4d = *p4d_p;
+
+ if (p4d)
+ pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
else {
if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
reset_early_page_tables();
@@ -77,7 +198,7 @@ again:
pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
- *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
}
pud_p += pud_index(address);
pud = *pud_p;
@@ -156,7 +277,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
clear_bss();
- clear_page(init_level4_pgt);
+ clear_page(init_top_pgt);
kasan_early_init();
@@ -171,8 +292,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
*/
load_ucode_bsp();
- /* set init_level4_pgt kernel high mapping*/
- init_level4_pgt[511] = early_level4_pgt[511];
+ /* set init_top_pgt kernel high mapping */
+ init_top_pgt[511] = early_top_pgt[511];
x86_64_start_reservations(real_mode_data);
}
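To make the relocation arithmetic in __startup_64() above concrete, a worked example with hypothetical numbers:

/*
 * A kernel linked so that _text sits at __START_KERNEL_map + 0x1000000,
 * but actually loaded at physical 0x7f000000, computes
 *
 *	load_delta = physaddr - (_text - __START_KERNEL_map)
 *		   = 0x7f000000 - 0x1000000 = 0x7e000000
 *
 * and adds that delta to every page-table entry (and to phys_base) that
 * was filled in with link-time physical addresses.  fixup_pointer()
 * performs the matching rebase for data accesses:
 *
 *	fixup_pointer(p, physaddr) == p - _text + physaddr
 *
 * i.e. "where does this object live right now, before the final mappings
 * are in place".
 */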
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ac9d327d2e42..6225550883df 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -37,10 +37,11 @@
*
*/
+#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
-L4_START_KERNEL = pgd_index(__START_KERNEL_map)
+PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
+PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
L3_START_KERNEL = pud_index(__START_KERNEL_map)
.text
@@ -72,101 +73,12 @@ startup_64:
/* Sanitize CPU configuration */
call verify_cpu
- /*
- * Compute the delta between the address I am compiled to run at and the
- * address I am actually running at.
- */
- leaq _text(%rip), %rbp
- subq $_text - __START_KERNEL_map, %rbp
-
- /* Is the address not 2M aligned? */
- testl $~PMD_PAGE_MASK, %ebp
- jnz bad_address
-
- /*
- * Is the address too large?
- */
- leaq _text(%rip), %rax
- shrq $MAX_PHYSMEM_BITS, %rax
- jnz bad_address
-
- /*
- * Fixup the physical addresses in the page table
- */
- addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
-
- addq %rbp, level3_kernel_pgt + (510*8)(%rip)
- addq %rbp, level3_kernel_pgt + (511*8)(%rip)
-
- addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
-
- /*
- * Set up the identity mapping for the switchover. These
- * entries should *NOT* have the global bit set! This also
- * creates a bunch of nonsense entries but that is fine --
- * it avoids problems around wraparound.
- */
leaq _text(%rip), %rdi
- leaq early_level4_pgt(%rip), %rbx
-
- movq %rdi, %rax
- shrq $PGDIR_SHIFT, %rax
-
- leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx
- movq %rdx, 0(%rbx,%rax,8)
- movq %rdx, 8(%rbx,%rax,8)
-
- addq $PAGE_SIZE, %rdx
- movq %rdi, %rax
- shrq $PUD_SHIFT, %rax
- andl $(PTRS_PER_PUD-1), %eax
- movq %rdx, PAGE_SIZE(%rbx,%rax,8)
- incl %eax
- andl $(PTRS_PER_PUD-1), %eax
- movq %rdx, PAGE_SIZE(%rbx,%rax,8)
-
- addq $PAGE_SIZE * 2, %rbx
- movq %rdi, %rax
- shrq $PMD_SHIFT, %rdi
- addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
- leaq (_end - 1)(%rip), %rcx
- shrq $PMD_SHIFT, %rcx
- subq %rdi, %rcx
- incl %ecx
-
-1:
- andq $(PTRS_PER_PMD - 1), %rdi
- movq %rax, (%rbx,%rdi,8)
- incq %rdi
- addq $PMD_SIZE, %rax
- decl %ecx
- jnz 1b
-
- test %rbp, %rbp
- jz .Lskip_fixup
+ pushq %rsi
+ call __startup_64
+ popq %rsi
- /*
- * Fixup the kernel text+data virtual addresses. Note that
- * we might write invalid pmds, when the kernel is relocated
- * cleanup_highmap() fixes this up along with the mappings
- * beyond _end.
- */
- leaq level2_kernel_pgt(%rip), %rdi
- leaq PAGE_SIZE(%rdi), %r8
- /* See if it is a valid page table entry */
-1: testb $_PAGE_PRESENT, 0(%rdi)
- jz 2f
- addq %rbp, 0(%rdi)
- /* Go to the next page */
-2: addq $8, %rdi
- cmp %r8, %rdi
- jne 1b
-
- /* Fixup phys_base */
- addq %rbp, phys_base(%rip)
-
-.Lskip_fixup:
- movq $(early_level4_pgt - __START_KERNEL_map), %rax
+ movq $(early_top_pgt - __START_KERNEL_map), %rax
jmp 1f
ENTRY(secondary_startup_64)
/*
@@ -186,14 +98,17 @@ ENTRY(secondary_startup_64)
/* Sanitize CPU configuration */
call verify_cpu
- movq $(init_level4_pgt - __START_KERNEL_map), %rax
+ movq $(init_top_pgt - __START_KERNEL_map), %rax
1:
- /* Enable PAE mode and PGE */
+ /* Enable PAE mode, PGE and LA57 */
movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+#ifdef CONFIG_X86_5LEVEL
+ orl $X86_CR4_LA57, %ecx
+#endif
movq %rcx, %cr4
- /* Setup early boot stage 4 level pagetables. */
+ /* Setup early boot stage 4-/5-level pagetables. */
addq phys_base(%rip), %rax
movq %rax, %cr3
@@ -417,9 +332,13 @@ GLOBAL(name)
.endr
__INITDATA
-NEXT_PAGE(early_level4_pgt)
+NEXT_PAGE(early_top_pgt)
.fill 511,8,0
+#ifdef CONFIG_X86_5LEVEL
+ .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+#else
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+#endif
NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -427,14 +346,14 @@ NEXT_PAGE(early_dynamic_pgts)
.data
#ifndef CONFIG_XEN
-NEXT_PAGE(init_level4_pgt)
+NEXT_PAGE(init_top_pgt)
.fill 512,8,0
#else
-NEXT_PAGE(init_level4_pgt)
+NEXT_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
+ .org init_top_pgt + PGD_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_START_KERNEL*8, 0
+ .org init_top_pgt + PGD_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
@@ -448,6 +367,12 @@ NEXT_PAGE(level2_ident_pgt)
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#endif
+#ifdef CONFIG_X86_5LEVEL
+NEXT_PAGE(level4_kernel_pgt)
+ .fill 511,8,0
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+#endif
+
NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 901c640d152f..69ea0bc1cfa3 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -28,6 +28,7 @@
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
+#include <linux/frame.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@@ -94,6 +95,7 @@ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
}
asm (
+ "optprobe_template_func:\n"
".global optprobe_template_entry\n"
"optprobe_template_entry:\n"
#ifdef CONFIG_X86_64
@@ -131,7 +133,12 @@ asm (
" popf\n"
#endif
".global optprobe_template_end\n"
- "optprobe_template_end:\n");
+ "optprobe_template_end:\n"
+ ".type optprobe_template_func, @function\n"
+ ".size optprobe_template_func, .-optprobe_template_func\n");
+
+void optprobe_template_func(void);
+STACK_FRAME_NON_STANDARD(optprobe_template_func);
#define TMPL_MOVE_IDX \
((long)&optprobe_template_val - (long)&optprobe_template_entry)
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index d4a15831ac58..a870910c8565 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -22,24 +22,25 @@
#include <asm/syscalls.h>
/* context.lock is held for us, so we don't need any locking. */
-static void flush_ldt(void *current_mm)
+static void flush_ldt(void *__mm)
{
+ struct mm_struct *mm = __mm;
mm_context_t *pc;
- if (current->active_mm != current_mm)
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
return;
- pc = &current->active_mm->context;
- set_ldt(pc->ldt->entries, pc->ldt->size);
+ pc = &mm->context;
+ set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
}
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
-static struct ldt_struct *alloc_ldt_struct(unsigned int size)
+static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
{
struct ldt_struct *new_ldt;
unsigned int alloc_size;
- if (size > LDT_ENTRIES)
+ if (num_entries > LDT_ENTRIES)
return NULL;
new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
@@ -47,7 +48,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
return NULL;
BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
- alloc_size = size * LDT_ENTRY_SIZE;
+ alloc_size = num_entries * LDT_ENTRY_SIZE;
/*
* Xen is very picky: it requires a page-aligned LDT that has no
@@ -65,14 +66,14 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
return NULL;
}
- new_ldt->size = size;
+ new_ldt->nr_entries = num_entries;
return new_ldt;
}
/* After calling this, the LDT is immutable. */
static void finalize_ldt_struct(struct ldt_struct *ldt)
{
- paravirt_alloc_ldt(ldt->entries, ldt->size);
+ paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
}
/* context.lock is held */
@@ -91,8 +92,8 @@ static void free_ldt_struct(struct ldt_struct *ldt)
if (likely(!ldt))
return;
- paravirt_free_ldt(ldt->entries, ldt->size);
- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+ paravirt_free_ldt(ldt->entries, ldt->nr_entries);
+ if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE)
vfree_atomic(ldt->entries);
else
free_page((unsigned long)ldt->entries);
@@ -122,14 +123,14 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
goto out_unlock;
}
- new_ldt = alloc_ldt_struct(old_mm->context.ldt->size);
+ new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
if (!new_ldt) {
retval = -ENOMEM;
goto out_unlock;
}
memcpy(new_ldt->entries, old_mm->context.ldt->entries,
- new_ldt->size * LDT_ENTRY_SIZE);
+ new_ldt->nr_entries * LDT_ENTRY_SIZE);
finalize_ldt_struct(new_ldt);
mm->context.ldt = new_ldt;
@@ -152,9 +153,9 @@ void destroy_context_ldt(struct mm_struct *mm)
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
- int retval;
- unsigned long size;
struct mm_struct *mm = current->mm;
+ unsigned long entries_size;
+ int retval;
mutex_lock(&mm->context.lock);
@@ -166,18 +167,18 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
- size = mm->context.ldt->size * LDT_ENTRY_SIZE;
- if (size > bytecount)
- size = bytecount;
+ entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE;
+ if (entries_size > bytecount)
+ entries_size = bytecount;
- if (copy_to_user(ptr, mm->context.ldt->entries, size)) {
+ if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) {
retval = -EFAULT;
goto out_unlock;
}
- if (size != bytecount) {
+ if (entries_size != bytecount) {
/* Zero-fill the rest and pretend we read bytecount bytes. */
- if (clear_user(ptr + size, bytecount - size)) {
+ if (clear_user(ptr + entries_size, bytecount - entries_size)) {
retval = -EFAULT;
goto out_unlock;
}
@@ -208,7 +209,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
{
struct mm_struct *mm = current->mm;
struct ldt_struct *new_ldt, *old_ldt;
- unsigned int oldsize, newsize;
+ unsigned int old_nr_entries, new_nr_entries;
struct user_desc ldt_info;
struct desc_struct ldt;
int error;
@@ -247,17 +248,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
mutex_lock(&mm->context.lock);
- old_ldt = mm->context.ldt;
- oldsize = old_ldt ? old_ldt->size : 0;
- newsize = max(ldt_info.entry_number + 1, oldsize);
+ old_ldt = mm->context.ldt;
+ old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
+ new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries);
error = -ENOMEM;
- new_ldt = alloc_ldt_struct(newsize);
+ new_ldt = alloc_ldt_struct(new_nr_entries);
if (!new_ldt)
goto out_unlock;
if (old_ldt)
- memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE);
+ memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE);
+
new_ldt->entries[ldt_info.entry_number] = ldt;
finalize_ldt_struct(new_ldt);
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 6f5ca4ebe6e5..cb0a30473c23 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -347,7 +347,7 @@ void machine_kexec(struct kimage *image)
void arch_crash_save_vmcoreinfo(void)
{
VMCOREINFO_NUMBER(phys_base);
- VMCOREINFO_SYMBOL(init_level4_pgt);
+ VMCOREINFO_SYMBOL(init_top_pgt);
#ifdef CONFIG_NUMA
VMCOREINFO_SYMBOL(node_data);
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 6d9582ec0324..d27f8d84c4ff 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -78,7 +78,7 @@ static void __init test_nmi_ipi(struct cpumask *mask)
/* Don't wait longer than a second */
timeout = USEC_PER_SEC;
- while (!cpumask_empty(mask) && timeout--)
+ while (!cpumask_empty(mask) && --timeout)
udelay(1);
/* What happens if we timeout, do we still unregister?? */
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 3586996fc50d..bc0a849589bb 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.read_cr2 = native_read_cr2,
.write_cr2 = native_write_cr2,
- .read_cr3 = native_read_cr3,
+ .read_cr3 = __native_read_cr3,
.write_cr3 = native_write_cr3,
.flush_tlb_user = native_flush_tlb,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0bb88428cbf2..3ca198080ea9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -545,17 +545,6 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
}
/*
- * Return saved PC of a blocked thread.
- * What is this good for? it will be always the scheduler or ret_from_fork.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- struct inactive_task_frame *frame =
- (struct inactive_task_frame *) READ_ONCE(tsk->thread.sp);
- return READ_ONCE_NOCHECK(frame->ret_addr);
-}
-
-/*
* Called from fs/proc with a reference on @p to find the function
* which called into schedule(). This needs to be done carefully
* because the task might wake up and we might look at a stack
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ffeae818aa7a..c6d6dc5f8bb2 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
- cr3 = read_cr3();
+ cr3 = __read_cr3();
cr4 = __read_cr4();
printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
cr0, cr2, cr3, cr4);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b6840bf3940b..c3169be4c596 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
- cr3 = read_cr3();
+ cr3 = __read_cr3();
cr4 = __read_cr4();
printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
@@ -142,7 +142,7 @@ void release_thread(struct task_struct *dead_task)
pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
dead_task->comm,
dead_task->mm->context.ldt->entries,
- dead_task->mm->context.ldt->size);
+ dead_task->mm->context.ldt->nr_entries);
BUG();
}
#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 2544700a2a87..67393fc88353 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -9,6 +9,7 @@
#include <linux/sched.h>
#include <linux/tboot.h>
#include <linux/delay.h>
+#include <linux/frame.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -123,6 +124,7 @@ void __noreturn machine_real_restart(unsigned int type)
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(machine_real_restart);
#endif
+STACK_FRAME_NON_STANDARD(machine_real_restart);
/*
* Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f81823695014..65622f07e633 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -503,7 +503,7 @@ static int __init reserve_crashkernel_low(void)
return 0;
}
- low_base = memblock_find_in_range(low_size, 1ULL << 32, low_size, CRASH_ALIGN);
+ low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN);
if (!low_base) {
pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n",
(unsigned long)(low_size >> 20));
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f04479a8f74f..b474c8de7fba 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -863,7 +863,7 @@ static void announce_cpu(int cpu, int apicid)
if (cpu == 1)
printk(KERN_INFO "x86: Booting SMP configuration:\n");
- if (system_state == SYSTEM_BOOTING) {
+ if (system_state < SYSTEM_RUNNING) {
if (node != current_node) {
if (current_node > (-1))
pr_cont("\n");
@@ -1589,7 +1589,6 @@ void native_cpu_die(unsigned int cpu)
void play_dead_common(void)
{
idle_task_exit();
- reset_lazy_tlbstate();
/* Ack it */
(void)cpu_report_death();
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index f07f83b3611b..5f25cfbd952e 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -34,7 +34,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
mutex_lock(&child->mm->context.lock);
if (unlikely(!child->mm->context.ldt ||
- seg >= child->mm->context.ldt->size))
+ seg >= child->mm->context.ldt->nr_entries))
addr = -1L; /* bogus selector, access would fault */
else {
desc = &child->mm->context.ldt->entries[seg];
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 207b8f2582c7..213ddf3e937d 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -144,7 +144,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (end - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
@@ -187,7 +187,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 4b1724059909..a4eb27918ceb 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -514,7 +514,7 @@ int tboot_force_iommu(void)
if (!tboot_enabled())
return 0;
- if (!intel_iommu_tboot_noforce)
+ if (intel_iommu_tboot_noforce)
return 1;
if (no_iommu || swiotlb || dmar_disabled)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 3995d3a777d4..bf54309b85da 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -182,7 +182,7 @@ int is_valid_bugaddr(unsigned long addr)
return ud == INSN_UD0 || ud == INSN_UD2;
}
-static int fixup_bug(struct pt_regs *regs, int trapnr)
+int fixup_bug(struct pt_regs *regs, int trapnr)
{
if (trapnr != X86_TRAP_UD)
return 0;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 714dfba6a1e7..5270fc0c2df6 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -51,115 +51,34 @@ static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
struct clocksource *art_related_clocksource;
-/*
- * Use a ring-buffer like data structure, where a writer advances the head by
- * writing a new data entry and a reader advances the tail when it observes a
- * new entry.
- *
- * Writers are made to wait on readers until there's space to write a new
- * entry.
- *
- * This means that we can always use an {offset, mul} pair to compute a ns
- * value that is 'roughly' in the right direction, even if we're writing a new
- * {offset, mul} pair during the clock read.
- *
- * The down-side is that we can no longer guarantee strict monotonicity anymore
- * (assuming the TSC was that to begin with), because while we compute the
- * intersection point of the two clock slopes and make sure the time is
- * continuous at the point of switching; we can no longer guarantee a reader is
- * strictly before or after the switch point.
- *
- * It does mean a reader no longer needs to disable IRQs in order to avoid
- * CPU-Freq updates messing with his times, and similarly an NMI reader will
- * no longer run the risk of hitting half-written state.
- */
-
struct cyc2ns {
- struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */
- struct cyc2ns_data *head; /* 48 + 8 = 56 */
- struct cyc2ns_data *tail; /* 56 + 8 = 64 */
-}; /* exactly fits one cacheline */
-
-static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-
-struct cyc2ns_data *cyc2ns_read_begin(void)
-{
- struct cyc2ns_data *head;
-
- preempt_disable();
-
- head = this_cpu_read(cyc2ns.head);
- /*
- * Ensure we observe the entry when we observe the pointer to it.
- * matches the wmb from cyc2ns_write_end().
- */
- smp_read_barrier_depends();
- head->__count++;
- barrier();
+ struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */
+ seqcount_t seq; /* 32 + 4 = 36 */
- return head;
-}
+}; /* fits one cacheline */
-void cyc2ns_read_end(struct cyc2ns_data *head)
-{
- barrier();
- /*
- * If we're the outer most nested read; update the tail pointer
- * when we're done. This notifies possible pending writers
- * that we've observed the head pointer and that the other
- * entry is now free.
- */
- if (!--head->__count) {
- /*
- * x86-TSO does not reorder writes with older reads;
- * therefore once this write becomes visible to another
- * cpu, we must be finished reading the cyc2ns_data.
- *
- * matches with cyc2ns_write_begin().
- */
- this_cpu_write(cyc2ns.tail, head);
- }
- preempt_enable();
-}
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-/*
- * Begin writing a new @data entry for @cpu.
- *
- * Assumes some sort of write side lock; currently 'provided' by the assumption
- * that cpufreq will call its notifiers sequentially.
- */
-static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
+void cyc2ns_read_begin(struct cyc2ns_data *data)
{
- struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
- struct cyc2ns_data *data = c2n->data;
+ int seq, idx;
- if (data == c2n->head)
- data++;
+ preempt_disable_notrace();
- /* XXX send an IPI to @cpu in order to guarantee a read? */
+ do {
+ seq = this_cpu_read(cyc2ns.seq.sequence);
+ idx = seq & 1;
- /*
- * When we observe the tail write from cyc2ns_read_end(),
- * the cpu must be done with that entry and its safe
- * to start writing to it.
- */
- while (c2n->tail == data)
- cpu_relax();
+ data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
+ data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
+ data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
- return data;
+ } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
}
-static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
+void cyc2ns_read_end(void)
{
- struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
-
- /*
- * Ensure the @data writes are visible before we publish the
- * entry. Matches the data-depencency in cyc2ns_read_begin().
- */
- smp_wmb();
-
- ACCESS_ONCE(c2n->head) = data;
+ preempt_enable_notrace();
}
/*
@@ -191,7 +110,6 @@ static void cyc2ns_data_init(struct cyc2ns_data *data)
data->cyc2ns_mul = 0;
data->cyc2ns_shift = 0;
data->cyc2ns_offset = 0;
- data->__count = 0;
}
static void cyc2ns_init(int cpu)
@@ -201,51 +119,29 @@ static void cyc2ns_init(int cpu)
cyc2ns_data_init(&c2n->data[0]);
cyc2ns_data_init(&c2n->data[1]);
- c2n->head = c2n->data;
- c2n->tail = c2n->data;
+ seqcount_init(&c2n->seq);
}
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
- struct cyc2ns_data *data, *tail;
+ struct cyc2ns_data data;
unsigned long long ns;
- /*
- * See cyc2ns_read_*() for details; replicated in order to avoid
- * an extra few instructions that came with the abstraction.
- * Notable, it allows us to only do the __count and tail update
- * dance when its actually needed.
- */
-
- preempt_disable_notrace();
- data = this_cpu_read(cyc2ns.head);
- tail = this_cpu_read(cyc2ns.tail);
-
- if (likely(data == tail)) {
- ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
- } else {
- data->__count++;
-
- barrier();
-
- ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+ cyc2ns_read_begin(&data);
- barrier();
+ ns = data.cyc2ns_offset;
+ ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
- if (!--data->__count)
- this_cpu_write(cyc2ns.tail, data);
- }
- preempt_enable_notrace();
+ cyc2ns_read_end();
return ns;
}
-static void set_cyc2ns_scale(unsigned long khz, int cpu)
+static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
{
- unsigned long long tsc_now, ns_now;
- struct cyc2ns_data *data;
+ unsigned long long ns_now;
+ struct cyc2ns_data data;
+ struct cyc2ns *c2n;
unsigned long flags;
local_irq_save(flags);
@@ -254,9 +150,6 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
if (!khz)
goto done;
- data = cyc2ns_write_begin(cpu);
-
- tsc_now = rdtsc();
ns_now = cycles_2_ns(tsc_now);
/*
@@ -264,7 +157,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
- clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz,
+ clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
NSEC_PER_MSEC, 0);
/*
@@ -273,20 +166,26 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* conversion algorithm shifting a 32-bit value (now specifies a 64-bit
* value) - refer perf_event_mmap_page documentation in perf_event.h.
*/
- if (data->cyc2ns_shift == 32) {
- data->cyc2ns_shift = 31;
- data->cyc2ns_mul >>= 1;
+ if (data.cyc2ns_shift == 32) {
+ data.cyc2ns_shift = 31;
+ data.cyc2ns_mul >>= 1;
}
- data->cyc2ns_offset = ns_now -
- mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
+ data.cyc2ns_offset = ns_now -
+ mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);
+
+ c2n = per_cpu_ptr(&cyc2ns, cpu);
- cyc2ns_write_end(cpu, data);
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[0] = data;
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[1] = data;
done:
- sched_clock_idle_wakeup_event(0);
+ sched_clock_idle_wakeup_event();
local_irq_restore(flags);
}
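
The mult/shift pair written into the latch encodes nanoseconds-per-cycle as mult / 2^shift, with the offset chosen so the clock stays continuous across rescaling. A hedged userspace model of that arithmetic follows; the mult/shift selection is a simplified stand-in for clocks_calc_mult_shift(), and the 128-bit-safe mul_u64_u32_shr() is approximated with a plain 64-bit multiply.

    #include <stdint.h>
    #include <stdio.h>

    #define NSEC_PER_MSEC 1000000ULL

    /* Simplified stand-in for clocks_calc_mult_shift(): pick the largest
     * shift whose multiplier still fits in 32 bits, so that
     * ns_per_cycle ~= mult / 2^shift = NSEC_PER_MSEC / tsc_khz. */
    static void calc_mult_shift(uint32_t *mult, uint32_t *shift, uint64_t khz)
    {
        uint32_t sft;
        uint64_t m = 0;

        for (sft = 32; sft > 0; sft--) {
            m = (NSEC_PER_MSEC << sft) / khz;
            if (m <= 0xffffffffULL)
                break;
        }

        /* Mirror the patch: keep shift below 32 for the perf mmap ABI. */
        if (sft == 32) {
            sft = 31;
            m >>= 1;
        }

        *mult = (uint32_t)m;
        *shift = sft;
    }

    /* mul_u64_u32_shr() stand-in; ignores the 128-bit intermediate the
     * kernel helper uses, so keep cyc small enough for this sketch. */
    static uint64_t cycles_to_ns(uint64_t cyc, uint32_t mult, uint32_t shift,
                                 uint64_t offset)
    {
        return offset + ((cyc * mult) >> shift);
    }

    int main(void)
    {
        uint32_t mult, shift;
        uint64_t khz = 2400000;      /* 2.4 GHz TSC */

        calc_mult_shift(&mult, &shift, khz);
        printf("mult=%u shift=%u\n", mult, shift);
        printf("10^9 cycles -> %llu ns (expect ~416666666)\n",
               (unsigned long long)cycles_to_ns(1000000000ULL, mult, shift, 0));
        return 0;
    }
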
+
/*
* Scheduler clock - returns current time in nanosec units.
*/
@@ -374,6 +273,8 @@ static int __init tsc_setup(char *str)
tsc_clocksource_reliable = 1;
if (!strncmp(str, "noirqtime", 9))
no_sched_irq_time = 1;
+ if (!strcmp(str, "unstable"))
+ mark_tsc_unstable("boot parameter");
return 1;
}
@@ -986,7 +887,6 @@ void tsc_restore_sched_clock_state(void)
}
#ifdef CONFIG_CPU_FREQ
-
/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
* changes.
*
@@ -1027,7 +927,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
mark_tsc_unstable("cpufreq changes");
- set_cyc2ns_scale(tsc_khz, freq->cpu);
+ set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
}
return 0;
@@ -1127,6 +1027,15 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
pr_info("Marking TSC unstable due to clocksource watchdog\n");
}
+static void tsc_cs_tick_stable(struct clocksource *cs)
+{
+ if (tsc_unstable)
+ return;
+
+ if (using_native_sched_clock())
+ sched_clock_tick_stable();
+}
+
/*
* .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
*/
@@ -1140,6 +1049,7 @@ static struct clocksource clocksource_tsc = {
.archdata = { .vclock_mode = VCLOCK_TSC },
.resume = tsc_resume,
.mark_unstable = tsc_cs_mark_unstable,
+ .tick_stable = tsc_cs_tick_stable,
};
void mark_tsc_unstable(char *reason)
@@ -1255,6 +1165,7 @@ static void tsc_refine_calibration_work(struct work_struct *work)
static int hpet;
u64 tsc_stop, ref_stop, delta;
unsigned long freq;
+ int cpu;
/* Don't bother refining TSC on unstable systems */
if (check_tsc_unstable())
@@ -1305,6 +1216,10 @@ static void tsc_refine_calibration_work(struct work_struct *work)
/* Inform the TSC deadline clockevent devices about the recalibration */
lapic_update_tsc_freq();
+ /* Update the sched_clock() rate to match the clocksource one */
+ for_each_possible_cpu(cpu)
+ set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);
+
out:
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
@@ -1350,7 +1265,7 @@ device_initcall(init_tsc_clocksource);
void __init tsc_init(void)
{
- u64 lpj;
+ u64 lpj, cyc;
int cpu;
if (!boot_cpu_has(X86_FEATURE_TSC)) {
@@ -1390,9 +1305,10 @@ void __init tsc_init(void)
* speed as the bootup CPU. (cpufreq notifiers will fix this
* up if their speed diverges)
*/
+ cyc = rdtsc();
for_each_possible_cpu(cpu) {
cyc2ns_init(cpu);
- set_cyc2ns_scale(tsc_khz, cpu);
+ set_cyc2ns_scale(tsc_khz, cpu, cyc);
}
if (tsc_disabled > 0)
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0816ab2e8adc..80890dee66ce 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2742,6 +2742,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
}
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
return X86EMUL_CONTINUE;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ba9891ac5c56..33460fcdeef9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -36,6 +36,7 @@
#include <linux/slab.h>
#include <linux/amd-iommu.h>
#include <linux/hashtable.h>
+#include <linux/frame.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
@@ -4906,6 +4907,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
mark_all_clean(svm->vmcb);
}
+STACK_FRAME_NON_STANDARD(svm_vcpu_run);
static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ca5d2b93385c..6dcc4873e435 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -33,6 +33,7 @@
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/hrtimer.h>
+#include <linux/frame.h>
#include "kvm_cache_regs.h"
#include "x86.h"
@@ -48,6 +49,7 @@
#include <asm/kexec.h>
#include <asm/apic.h>
#include <asm/irq_remapping.h>
+#include <asm/mmu_context.h>
#include "trace.h"
#include "pmu.h"
@@ -596,6 +598,7 @@ struct vcpu_vmx {
int gs_ldt_reload_needed;
int fs_reload_needed;
u64 msr_host_bndcfgs;
+ unsigned long vmcs_host_cr3; /* May not match real cr3 */
unsigned long vmcs_host_cr4; /* May not match real cr4 */
} host_state;
struct {
@@ -5012,12 +5015,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
u32 low32, high32;
unsigned long tmpl;
struct desc_ptr dt;
- unsigned long cr0, cr4;
+ unsigned long cr0, cr3, cr4;
cr0 = read_cr0();
WARN_ON(cr0 & X86_CR0_TS);
vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
- vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
+
+ /*
+ * Save the most likely value for this task's CR3 in the VMCS.
+ * We can't use __get_current_cr3_fast() because we're not atomic.
+ */
+ cr3 = __read_cr3();
+ vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
+ vmx->host_state.vmcs_host_cr3 = cr3;
/* Save the most likely value for this task's CR4 in the VMCS. */
cr4 = cr4_read_shadow();
@@ -8652,6 +8662,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
);
}
}
+STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
static bool vmx_has_high_real_mode_segbase(void)
{
@@ -8820,7 +8831,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long debugctlmsr, cr4;
+ unsigned long debugctlmsr, cr3, cr4;
/* Don't enter VMX if guest state is invalid, let the exit handler
start emulation until we arrive back to a valid state */
@@ -8842,6 +8853,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+ cr3 = __get_current_cr3_fast();
+ if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+ vmx->host_state.vmcs_host_cr3 = cr3;
+ }
+
cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
vmcs_writel(HOST_CR4, cr4);
@@ -9028,6 +9045,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
}
+STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
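
Caching vmcs_host_cr3 lets the entry path skip the VMCS write whenever the host CR3 is unchanged, mirroring the existing CR4 treatment. A small illustrative model of that write-avoidance pattern; the field write below stands in for vmcs_writel() and is not the real VMX interface.

    #include <stdio.h>

    /* Stand-in for the costly VMCS field write (vmcs_writel() above). */
    static unsigned long vmcs_host_cr3_field;
    static unsigned int  field_writes;

    static void write_host_cr3_field(unsigned long val)
    {
        vmcs_host_cr3_field = val;
        field_writes++;
    }

    struct host_state { unsigned long vmcs_host_cr3; };

    /* Refresh the VMCS copy only when the current CR3 differs from what
     * was last written, mirroring the unlikely(cr3 != vmcs_host_cr3) check. */
    static void sync_host_cr3(struct host_state *hs, unsigned long current_cr3)
    {
        if (current_cr3 != hs->vmcs_host_cr3) {
            write_host_cr3_field(current_cr3);
            hs->vmcs_host_cr3 = current_cr3;
        }
    }

    int main(void)
    {
        struct host_state hs = { .vmcs_host_cr3 = 0 };

        sync_host_cr3(&hs, 0x1000);   /* first entry: write the field */
        sync_host_cr3(&hs, 0x1000);   /* same mm: skip the write */
        sync_host_cr3(&hs, 0x2000);   /* different mm: write again */
        printf("field writes: %u (cached value %#lx)\n",
               field_writes, vmcs_host_cr3_field);
        return 0;
    }
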
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 87d3cb901935..0e846f0cb83b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5313,6 +5313,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
ctxt->eflags = kvm_get_rflags(vcpu);
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
+
ctxt->eip = kvm_rip_read(vcpu);
ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
(ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
@@ -5528,36 +5530,25 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
return dr6;
}
-static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r)
+static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
{
struct kvm_run *kvm_run = vcpu->run;
- /*
- * rflags is the old, "raw" value of the flags. The new value has
- * not been saved yet.
- *
- * This is correct even for TF set by the guest, because "the
- * processor will not generate this exception after the instruction
- * that sets the TF flag".
- */
- if (unlikely(rflags & X86_EFLAGS_TF)) {
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
- kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 |
- DR6_RTM;
- kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
- kvm_run->debug.arch.exception = DB_VECTOR;
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- *r = EMULATE_USER_EXIT;
- } else {
- /*
- * "Certain debug exceptions may clear bit 0-3. The
- * remaining contents of the DR6 register are never
- * cleared by the processor".
- */
- vcpu->arch.dr6 &= ~15;
- vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
- kvm_queue_exception(vcpu, DB_VECTOR);
- }
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+ kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ *r = EMULATE_USER_EXIT;
+ } else {
+ /*
+ * "Certain debug exceptions may clear bit 0-3. The
+ * remaining contents of the DR6 register are never
+ * cleared by the processor".
+ */
+ vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
+ kvm_queue_exception(vcpu, DB_VECTOR);
}
}
@@ -5567,7 +5558,17 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
int r = EMULATE_DONE;
kvm_x86_ops->skip_emulated_instruction(vcpu);
- kvm_vcpu_check_singlestep(vcpu, rflags, &r);
+
+ /*
+ * rflags is the old, "raw" value of the flags. The new value has
+ * not been saved yet.
+ *
+ * This is correct even for TF set by the guest, because "the
+ * processor will not generate this exception after the instruction
+ * that sets the TF flag".
+ */
+ if (unlikely(rflags & X86_EFLAGS_TF))
+ kvm_vcpu_do_singlestep(vcpu, &r);
return r == EMULATE_DONE;
}
EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
@@ -5726,8 +5727,9 @@ restart:
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
kvm_rip_write(vcpu, ctxt->eip);
- if (r == EMULATE_DONE)
- kvm_vcpu_check_singlestep(vcpu, rflags, &r);
+ if (r == EMULATE_DONE &&
+ (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
+ kvm_vcpu_do_singlestep(vcpu, &r);
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP)
__kvm_set_rflags(vcpu, ctxt->eflags);
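
The refactoring samples RFLAGS.TF into ctxt->tf before emulation, so instructions that clear TF (such as SYSCALL above) still produce exactly one single-step event, delivered either to userspace or as an injected #DB. A hedged standalone model of that decision; the constants and layout below are simplified stand-ins for the KVM definitions.

    #include <stdbool.h>
    #include <stdio.h>

    #define X86_EFLAGS_TF            (1u << 8)
    #define KVM_GUESTDBG_SINGLESTEP  (1u << 2)   /* illustrative value */

    enum ss_action { SS_NONE, SS_EXIT_TO_USERSPACE, SS_INJECT_DB };

    /* rflags_at_start corresponds to ctxt->tf above: TF is sampled when
     * emulation begins, so clearing TF during the instruction does not
     * suppress the trap for that instruction. */
    static enum ss_action singlestep_action(unsigned rflags_at_start,
                                            unsigned guest_debug)
    {
        bool tf  = rflags_at_start & X86_EFLAGS_TF;
        bool dbg = guest_debug & KVM_GUESTDBG_SINGLESTEP;

        if (!tf && !dbg)
            return SS_NONE;
        /* Userspace single-stepping wins; otherwise inject #DB with DR6.BS. */
        return dbg ? SS_EXIT_TO_USERSPACE : SS_INJECT_DB;
    }

    int main(void)
    {
        printf("%d\n", singlestep_action(X86_EFLAGS_TF, 0));
        printf("%d\n", singlestep_action(X86_EFLAGS_TF, KVM_GUESTDBG_SINGLESTEP));
        printf("%d\n", singlestep_action(0, 0));
        return 0;
    }
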
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index c5959576c315..020f75cc8cf6 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -37,7 +37,7 @@ ENTRY(copy_user_generic_unrolled)
movl %edx,%ecx
andl $63,%edx
shrl $6,%ecx
- jz 17f
+ jz .L_copy_short_string
1: movq (%rsi),%r8
2: movq 1*8(%rsi),%r9
3: movq 2*8(%rsi),%r10
@@ -58,7 +58,8 @@ ENTRY(copy_user_generic_unrolled)
leaq 64(%rdi),%rdi
decl %ecx
jnz 1b
-17: movl %edx,%ecx
+.L_copy_short_string:
+ movl %edx,%ecx
andl $7,%edx
shrl $3,%ecx
jz 20f
@@ -174,6 +175,8 @@ EXPORT_SYMBOL(copy_user_generic_string)
*/
ENTRY(copy_user_enhanced_fast_string)
ASM_STAC
+ cmpl $64,%edx
+ jb .L_copy_short_string /* less then 64 bytes, avoid the costly 'rep' */
movl %edx,%ecx
1: rep
movsb
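
The added check sends copies shorter than 64 bytes to the unrolled path because the startup cost of 'rep movsb' dominates at that size. A rough userspace illustration of the same threshold idea; the 64-byte cutoff is taken from the patch, and the two branches merely stand in for the assembly paths.

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    /* Threshold from the patch: below this, 'rep movsb' startup cost
     * outweighs its throughput, so take the short/unrolled path instead. */
    #define REP_MOVSB_MIN 64

    static void copy_model(void *dst, const void *src, size_t len)
    {
        if (len < REP_MOVSB_MIN) {
            /* stands in for the .L_copy_short_string path */
            unsigned char *d = dst;
            const unsigned char *s = src;

            while (len--)
                *d++ = *s++;
        } else {
            /* stands in for the 'rep movsb' fast path */
            memcpy(dst, src, len);
        }
    }

    int main(void)
    {
        char src[128], dst[128];

        memset(src, 'x', sizeof(src));
        copy_model(dst, src, 16);            /* short path */
        copy_model(dst, src, sizeof(src));   /* 'rep' path */
        printf("%c %c\n", dst[0], dst[127]);
        return 0;
    }
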
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index c81556409bbb..10ffa7e8519f 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -13,14 +13,14 @@
.macro op_safe_regs op
ENTRY(\op\()_safe_regs)
pushq %rbx
- pushq %rbp
+ pushq %r12
movq %rdi, %r10 /* Save pointer */
xorl %r11d, %r11d /* Return value */
movl (%rdi), %eax
movl 4(%rdi), %ecx
movl 8(%rdi), %edx
movl 12(%rdi), %ebx
- movl 20(%rdi), %ebp
+ movl 20(%rdi), %r12d
movl 24(%rdi), %esi
movl 28(%rdi), %edi
1: \op
@@ -29,10 +29,10 @@ ENTRY(\op\()_safe_regs)
movl %ecx, 4(%r10)
movl %edx, 8(%r10)
movl %ebx, 12(%r10)
- movl %ebp, 20(%r10)
+ movl %r12d, 20(%r10)
movl %esi, 24(%r10)
movl %edi, 28(%r10)
- popq %rbp
+ popq %r12
popq %rbx
ret
3:
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 767be7c76034..12e377184ee4 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -1009,7 +1009,7 @@ GrpTable: Grp15
1: fxstor | RDGSBASE Ry (F3),(11B)
2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B)
3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B)
-4: XSAVE
+4: XSAVE | ptwrite Ey (F3),(11B)
5: XRSTOR | lfence (11B)
6: XSAVEOPT | clwb (66) | mfence (11B)
7: clflush | clflushopt (66) | sfence (11B)
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index 5e044d506b7a..a179254a5122 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -27,7 +27,7 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg)
#ifdef CONFIG_MODIFY_LDT_SYSCALL
seg >>= 3;
mutex_lock(&current->mm->context.lock);
- if (current->mm->context.ldt && seg < current->mm->context.ldt->size)
+ if (current->mm->context.ldt && seg < current->mm->context.ldt->nr_entries)
ret = current->mm->context.ldt->entries[seg];
mutex_unlock(&current->mm->context.lock);
#endif
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 96d2b847e09e..0fbdcb64f9f8 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -2,7 +2,7 @@
KCOV_INSTRUMENT_tlb.o := n
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
- pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o
+ pat.o pgtable.o physaddr.o setup_nx.o tlb.o
# Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index bce6990b1d81..0470826d2bdc 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -431,7 +431,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
bool checkwx)
{
#ifdef CONFIG_X86_64
- pgd_t *start = (pgd_t *) &init_level4_pgt;
+ pgd_t *start = (pgd_t *) &init_top_pgt;
#else
pgd_t *start = swapper_pg_dir;
#endif
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 35ea061010a1..0ea8afcb929c 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -162,6 +162,9 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
if (fixup_exception(regs, trapnr))
return;
+ if (fixup_bug(regs, trapnr))
+ return;
+
fail:
early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n",
(unsigned)trapnr, (unsigned long)regs->cs, regs->ip,
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8ad91a01cbc8..2a1fa10c6a98 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -346,7 +346,7 @@ static noinline int vmalloc_fault(unsigned long address)
* Do _not_ use "current" here. We might be inside
* an interrupt in the middle of a task switch..
*/
- pgd_paddr = read_cr3();
+ pgd_paddr = read_cr3_pa();
pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
if (!pmd_k)
return -1;
@@ -388,7 +388,7 @@ static bool low_pfn(unsigned long pfn)
static void dump_pagetable(unsigned long address)
{
- pgd_t *base = __va(read_cr3());
+ pgd_t *base = __va(read_cr3_pa());
pgd_t *pgd = &base[pgd_index(address)];
p4d_t *p4d;
pud_t *pud;
@@ -451,7 +451,7 @@ static noinline int vmalloc_fault(unsigned long address)
* happen within a race in page table update. In the later
* case just flush:
*/
- pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address);
+ pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
pgd_ref = pgd_offset_k(address);
if (pgd_none(*pgd_ref))
return -1;
@@ -555,7 +555,7 @@ static int bad_address(void *p)
static void dump_pagetable(unsigned long address)
{
- pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+ pgd_t *base = __va(read_cr3_pa());
pgd_t *pgd = base + pgd_index(address);
p4d_t *p4d;
pud_t *pud;
@@ -700,7 +700,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
pgd_t *pgd;
pte_t *pte;
- pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+ pgd = __va(read_cr3_pa());
pgd += pgd_index(address);
pte = lookup_address_in_pgd(pgd, address, &level);
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
deleted file mode 100644
index 456dfdfd2249..000000000000
--- a/arch/x86/mm/gup.c
+++ /dev/null
@@ -1,496 +0,0 @@
-/*
- * Lockless get_user_pages_fast for x86
- *
- * Copyright (C) 2008 Nick Piggin
- * Copyright (C) 2008 Novell Inc.
- */
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmstat.h>
-#include <linux/highmem.h>
-#include <linux/swap.h>
-#include <linux/memremap.h>
-
-#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
-
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
-#ifndef CONFIG_X86_PAE
- return READ_ONCE(*ptep);
-#else
- /*
- * With get_user_pages_fast, we walk down the pagetables without taking
- * any locks. For this we would like to load the pointers atomically,
- * but that is not possible (without expensive cmpxchg8b) on PAE. What
- * we do have is the guarantee that a pte will only either go from not
- * present to present, or present to not present or both -- it will not
- * switch to a completely different present page without a TLB flush in
- * between; something that we are blocking by holding interrupts off.
- *
- * Setting ptes from not present to present goes:
- * ptep->pte_high = h;
- * smp_wmb();
- * ptep->pte_low = l;
- *
- * And present to not present goes:
- * ptep->pte_low = 0;
- * smp_wmb();
- * ptep->pte_high = 0;
- *
- * We must ensure here that the load of pte_low sees l iff pte_high
- * sees h. We load pte_high *after* loading pte_low, which ensures we
- * don't see an older value of pte_high. *Then* we recheck pte_low,
- * which ensures that we haven't picked up a changed pte high. We might
- * have got rubbish values from pte_low and pte_high, but we are
- * guaranteed that pte_low will not have the present bit set *unless*
- * it is 'l'. And get_user_pages_fast only operates on present ptes, so
- * we're safe.
- *
- * gup_get_pte should not be used or copied outside gup.c without being
- * very careful -- it does not atomically load the pte or anything that
- * is likely to be useful for you.
- */
- pte_t pte;
-
-retry:
- pte.pte_low = ptep->pte_low;
- smp_rmb();
- pte.pte_high = ptep->pte_high;
- smp_rmb();
- if (unlikely(pte.pte_low != ptep->pte_low))
- goto retry;
-
- return pte;
-#endif
-}
-
-static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
-{
- while ((*nr) - nr_start) {
- struct page *page = pages[--(*nr)];
-
- ClearPageReferenced(page);
- put_page(page);
- }
-}
-
-/*
- * 'pteval' can come from a pte, pmd, pud or p4d. We only check
- * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
- * same value on all 4 types.
- */
-static inline int pte_allows_gup(unsigned long pteval, int write)
-{
- unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
-
- if (write)
- need_pte_bits |= _PAGE_RW;
-
- if ((pteval & need_pte_bits) != need_pte_bits)
- return 0;
-
- /* Check memory protection keys permissions. */
- if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write))
- return 0;
-
- return 1;
-}
-
-/*
- * The performance critical leaf functions are made noinline otherwise gcc
- * inlines everything into a single function which results in too much
- * register pressure.
- */
-static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- struct dev_pagemap *pgmap = NULL;
- int nr_start = *nr, ret = 0;
- pte_t *ptep, *ptem;
-
- /*
- * Keep the original mapped PTE value (ptem) around since we
- * might increment ptep off the end of the page when finishing
- * our loop iteration.
- */
- ptem = ptep = pte_offset_map(&pmd, addr);
- do {
- pte_t pte = gup_get_pte(ptep);
- struct page *page;
-
- /* Similar to the PMD case, NUMA hinting must take slow path */
- if (pte_protnone(pte))
- break;
-
- if (!pte_allows_gup(pte_val(pte), write))
- break;
-
- if (pte_devmap(pte)) {
- pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
- if (unlikely(!pgmap)) {
- undo_dev_pagemap(nr, nr_start, pages);
- break;
- }
- } else if (pte_special(pte))
- break;
-
- VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
- page = pte_page(pte);
- get_page(page);
- put_dev_pagemap(pgmap);
- SetPageReferenced(page);
- pages[*nr] = page;
- (*nr)++;
-
- } while (ptep++, addr += PAGE_SIZE, addr != end);
- if (addr == end)
- ret = 1;
- pte_unmap(ptem);
-
- return ret;
-}
-
-static inline void get_head_page_multiple(struct page *page, int nr)
-{
- VM_BUG_ON_PAGE(page != compound_head(page), page);
- VM_BUG_ON_PAGE(page_count(page) == 0, page);
- page_ref_add(page, nr);
- SetPageReferenced(page);
-}
-
-static int __gup_device_huge(unsigned long pfn, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
-{
- int nr_start = *nr;
- struct dev_pagemap *pgmap = NULL;
-
- do {
- struct page *page = pfn_to_page(pfn);
-
- pgmap = get_dev_pagemap(pfn, pgmap);
- if (unlikely(!pgmap)) {
- undo_dev_pagemap(nr, nr_start, pages);
- return 0;
- }
- SetPageReferenced(page);
- pages[*nr] = page;
- get_page(page);
- put_dev_pagemap(pgmap);
- (*nr)++;
- pfn++;
- } while (addr += PAGE_SIZE, addr != end);
- return 1;
-}
-
-static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
-{
- unsigned long fault_pfn;
-
- fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- return __gup_device_huge(fault_pfn, addr, end, pages, nr);
-}
-
-static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
-{
- unsigned long fault_pfn;
-
- fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- return __gup_device_huge(fault_pfn, addr, end, pages, nr);
-}
-
-static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- struct page *head, *page;
- int refs;
-
- if (!pte_allows_gup(pmd_val(pmd), write))
- return 0;
-
- VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
- if (pmd_devmap(pmd))
- return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
-
- /* hugepages are never "special" */
- VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
-
- refs = 0;
- head = pmd_page(pmd);
- page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- do {
- VM_BUG_ON_PAGE(compound_head(page) != head, page);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
- get_head_page_multiple(head, refs);
-
- return 1;
-}
-
-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
-{
- unsigned long next;
- pmd_t *pmdp;
-
- pmdp = pmd_offset(&pud, addr);
- do {
- pmd_t pmd = *pmdp;
-
- next = pmd_addr_end(addr, end);
- if (pmd_none(pmd))
- return 0;
- if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
- /*
- * NUMA hinting faults need to be handled in the GUP
- * slowpath for accounting purposes and so that they
- * can be serialised against THP migration.
- */
- if (pmd_protnone(pmd))
- return 0;
- if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
- return 0;
- } else {
- if (!gup_pte_range(pmd, addr, next, write, pages, nr))
- return 0;
- }
- } while (pmdp++, addr = next, addr != end);
-
- return 1;
-}
-
-static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- struct page *head, *page;
- int refs;
-
- if (!pte_allows_gup(pud_val(pud), write))
- return 0;
-
- VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
- if (pud_devmap(pud))
- return __gup_device_huge_pud(pud, addr, end, pages, nr);
-
- /* hugepages are never "special" */
- VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
-
- refs = 0;
- head = pud_page(pud);
- page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- do {
- VM_BUG_ON_PAGE(compound_head(page) != head, page);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
- get_head_page_multiple(head, refs);
-
- return 1;
-}
-
-static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
-{
- unsigned long next;
- pud_t *pudp;
-
- pudp = pud_offset(&p4d, addr);
- do {
- pud_t pud = *pudp;
-
- next = pud_addr_end(addr, end);
- if (pud_none(pud))
- return 0;
- if (unlikely(pud_large(pud))) {
- if (!gup_huge_pud(pud, addr, next, write, pages, nr))
- return 0;
- } else {
- if (!gup_pmd_range(pud, addr, next, write, pages, nr))
- return 0;
- }
- } while (pudp++, addr = next, addr != end);
-
- return 1;
-}
-
-static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
-{
- unsigned long next;
- p4d_t *p4dp;
-
- p4dp = p4d_offset(&pgd, addr);
- do {
- p4d_t p4d = *p4dp;
-
- next = p4d_addr_end(addr, end);
- if (p4d_none(p4d))
- return 0;
- BUILD_BUG_ON(p4d_large(p4d));
- if (!gup_pud_range(p4d, addr, next, write, pages, nr))
- return 0;
- } while (p4dp++, addr = next, addr != end);
-
- return 1;
-}
-
-/*
- * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
- * back to the regular GUP.
- */
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
-{
- struct mm_struct *mm = current->mm;
- unsigned long addr, len, end;
- unsigned long next;
- unsigned long flags;
- pgd_t *pgdp;
- int nr = 0;
-
- start &= PAGE_MASK;
- addr = start;
- len = (unsigned long) nr_pages << PAGE_SHIFT;
- end = start + len;
- if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
- (void __user *)start, len)))
- return 0;
-
- /*
- * XXX: batch / limit 'nr', to avoid large irq off latency
- * needs some instrumenting to determine the common sizes used by
- * important workloads (eg. DB2), and whether limiting the batch size
- * will decrease performance.
- *
- * It seems like we're in the clear for the moment. Direct-IO is
- * the main guy that batches up lots of get_user_pages, and even
- * they are limited to 64-at-a-time which is not so many.
- */
- /*
- * This doesn't prevent pagetable teardown, but does prevent
- * the pagetables and pages from being freed on x86.
- *
- * So long as we atomically load page table pointers versus teardown
- * (which we do on x86, with the above PAE exception), we can follow the
- * address down to the the page and take a ref on it.
- */
- local_irq_save(flags);
- pgdp = pgd_offset(mm, addr);
- do {
- pgd_t pgd = *pgdp;
-
- next = pgd_addr_end(addr, end);
- if (pgd_none(pgd))
- break;
- if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
- break;
- } while (pgdp++, addr = next, addr != end);
- local_irq_restore(flags);
-
- return nr;
-}
-
-/**
- * get_user_pages_fast() - pin user pages in memory
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long.
- *
- * Attempt to pin user pages in memory without taking mm->mmap_sem.
- * If not successful, it will fall back to taking the lock and
- * calling get_user_pages().
- *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno.
- */
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
-{
- struct mm_struct *mm = current->mm;
- unsigned long addr, len, end;
- unsigned long next;
- pgd_t *pgdp;
- int nr = 0;
-
- start &= PAGE_MASK;
- addr = start;
- len = (unsigned long) nr_pages << PAGE_SHIFT;
-
- end = start + len;
- if (end < start)
- goto slow_irqon;
-
-#ifdef CONFIG_X86_64
- if (end >> __VIRTUAL_MASK_SHIFT)
- goto slow_irqon;
-#endif
-
- /*
- * XXX: batch / limit 'nr', to avoid large irq off latency
- * needs some instrumenting to determine the common sizes used by
- * important workloads (eg. DB2), and whether limiting the batch size
- * will decrease performance.
- *
- * It seems like we're in the clear for the moment. Direct-IO is
- * the main guy that batches up lots of get_user_pages, and even
- * they are limited to 64-at-a-time which is not so many.
- */
- /*
- * This doesn't prevent pagetable teardown, but does prevent
- * the pagetables and pages from being freed on x86.
- *
- * So long as we atomically load page table pointers versus teardown
- * (which we do on x86, with the above PAE exception), we can follow the
- * address down to the the page and take a ref on it.
- */
- local_irq_disable();
- pgdp = pgd_offset(mm, addr);
- do {
- pgd_t pgd = *pgdp;
-
- next = pgd_addr_end(addr, end);
- if (pgd_none(pgd))
- goto slow;
- if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
- goto slow;
- } while (pgdp++, addr = next, addr != end);
- local_irq_enable();
-
- VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
- return nr;
-
- {
- int ret;
-
-slow:
- local_irq_enable();
-slow_irqon:
- /* Try to get the remaining pages with get_user_pages */
- start += nr << PAGE_SHIFT;
- pages += nr;
-
- ret = get_user_pages_unlocked(start,
- (end - start) >> PAGE_SHIFT,
- pages, write ? FOLL_WRITE : 0);
-
- /* Have to be a bit careful with return values */
- if (nr > 0) {
- if (ret < 0)
- ret = nr;
- else
- ret += nr;
- }
-
- return ret;
- }
-}
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 302f43fd9c28..adad702b39cd 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -148,7 +148,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
if (mm->get_unmapped_area == arch_get_unmapped_area)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index cbc87ea98751..673541eb3b3f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -161,16 +161,16 @@ static int page_size_mask;
static void __init probe_page_size_mask(void)
{
-#if !defined(CONFIG_KMEMCHECK)
/*
* For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
* use small pages.
* This will simplify cpa(), which otherwise needs to support splitting
* large pages into small in interrupt context, etc.
*/
- if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled())
+ if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled() && !IS_ENABLED(CONFIG_KMEMCHECK))
page_size_mask |= 1 << PG_LEVEL_2M;
-#endif
+ else
+ direct_gbpages = 0;
/* Enable PSE if available */
if (boot_cpu_has(X86_FEATURE_PSE))
@@ -811,10 +811,8 @@ void __init zone_sizes_init(void)
}
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
-#ifdef CONFIG_SMP
- .active_mm = &init_mm,
+ .loaded_mm = &init_mm,
.state = 0,
-#endif
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 95651dc58e09..dae6a5e5ad4a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -92,6 +92,44 @@ __setup("noexec32=", nonx32_setup);
* When memory was added make sure all the processes MM have
* suitable PGD entries in the local PGD level page.
*/
+#ifdef CONFIG_X86_5LEVEL
+void sync_global_pgds(unsigned long start, unsigned long end)
+{
+ unsigned long addr;
+
+ for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
+ const pgd_t *pgd_ref = pgd_offset_k(addr);
+ struct page *page;
+
+ /* Check for overflow */
+ if (addr < start)
+ break;
+
+ if (pgd_none(*pgd_ref))
+ continue;
+
+ spin_lock(&pgd_lock);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
+ spinlock_t *pgt_lock;
+
+ pgd = (pgd_t *)page_address(page) + pgd_index(addr);
+ /* the pgt_lock only for Xen */
+ pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+ spin_lock(pgt_lock);
+
+ if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+
+ spin_unlock(pgt_lock);
+ }
+ spin_unlock(&pgd_lock);
+ }
+}
+#else
void sync_global_pgds(unsigned long start, unsigned long end)
{
unsigned long addr;
@@ -135,6 +173,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
spin_unlock(&pgd_lock);
}
}
+#endif
/*
* NOTE: This function is marked __ref because it calls __init function
@@ -585,6 +624,57 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
return paddr_last;
}
+static unsigned long __meminit
+phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
+ unsigned long page_size_mask)
+{
+ unsigned long paddr_next, paddr_last = paddr_end;
+ unsigned long vaddr = (unsigned long)__va(paddr);
+ int i = p4d_index(vaddr);
+
+ if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
+
+ for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
+ p4d_t *p4d;
+ pud_t *pud;
+
+ vaddr = (unsigned long)__va(paddr);
+ p4d = p4d_page + p4d_index(vaddr);
+ paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
+
+ if (paddr >= paddr_end) {
+ if (!after_bootmem &&
+ !e820__mapped_any(paddr & P4D_MASK, paddr_next,
+ E820_TYPE_RAM) &&
+ !e820__mapped_any(paddr & P4D_MASK, paddr_next,
+ E820_TYPE_RESERVED_KERN))
+ set_p4d(p4d, __p4d(0));
+ continue;
+ }
+
+ if (!p4d_none(*p4d)) {
+ pud = pud_offset(p4d, 0);
+ paddr_last = phys_pud_init(pud, paddr,
+ paddr_end,
+ page_size_mask);
+ __flush_tlb_all();
+ continue;
+ }
+
+ pud = alloc_low_page();
+ paddr_last = phys_pud_init(pud, paddr, paddr_end,
+ page_size_mask);
+
+ spin_lock(&init_mm.page_table_lock);
+ p4d_populate(&init_mm, p4d, pud);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ __flush_tlb_all();
+
+ return paddr_last;
+}
+
/*
* Create page table mapping for the physical memory for specific physical
* addresses. The virtual and physical addresses have to be aligned on PMD level
@@ -606,26 +696,26 @@ kernel_physical_mapping_init(unsigned long paddr_start,
for (; vaddr < vaddr_end; vaddr = vaddr_next) {
pgd_t *pgd = pgd_offset_k(vaddr);
p4d_t *p4d;
- pud_t *pud;
vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
- BUILD_BUG_ON(pgd_none(*pgd));
- p4d = p4d_offset(pgd, vaddr);
- if (p4d_val(*p4d)) {
- pud = (pud_t *)p4d_page_vaddr(*p4d);
- paddr_last = phys_pud_init(pud, __pa(vaddr),
+ if (pgd_val(*pgd)) {
+ p4d = (p4d_t *)pgd_page_vaddr(*pgd);
+ paddr_last = phys_p4d_init(p4d, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
- pud = alloc_low_page();
- paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
+ p4d = alloc_low_page();
+ paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
spin_lock(&init_mm.page_table_lock);
- p4d_populate(&init_mm, p4d, pud);
+ if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ pgd_populate(&init_mm, pgd, p4d);
+ else
+ p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
spin_unlock(&init_mm.page_table_lock);
pgd_changed = true;
}
@@ -990,7 +1080,13 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
pud_base = pud_offset(p4d, 0);
remove_pud_table(pud_base, addr, next, direct);
- free_pud_table(pud_base, p4d);
+ /*
+ * For 4-level page tables we do not want to free PUDs, but in the
+ * 5-level case we should free them. This code will have to change
+ * to adapt for boot-time switching between 4 and 5 level page tables.
+ */
+ if (CONFIG_PGTABLE_LEVELS == 5)
+ free_pud_table(pud_base, p4d);
}
if (direct)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index bbc558b88a88..4c1b5fd0c7ad 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -424,7 +424,7 @@ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
{
/* Don't assume we're using swapper_pg_dir at this point */
- pgd_t *base = __va(read_cr3());
+ pgd_t *base = __va(read_cr3_pa());
pgd_t *pgd = &base[pgd_index(addr)];
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 0c7d8129bed6..88215ac16b24 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -12,7 +12,7 @@
#include <asm/tlbflush.h>
#include <asm/sections.h>
-extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pgd_t early_top_pgt[PTRS_PER_PGD];
extern struct range pfn_mapped[E820_MAX_ENTRIES];
static int __init map_range(struct range *range)
@@ -109,8 +109,8 @@ void __init kasan_early_init(void)
for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
kasan_zero_p4d[i] = __p4d(p4d_val);
- kasan_map_early_shadow(early_level4_pgt);
- kasan_map_early_shadow(init_level4_pgt);
+ kasan_map_early_shadow(early_top_pgt);
+ kasan_map_early_shadow(init_top_pgt);
}
void __init kasan_init(void)
@@ -121,8 +121,8 @@ void __init kasan_init(void)
register_die_notifier(&kasan_die_notifier);
#endif
- memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt));
- load_cr3(early_level4_pgt);
+ memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
+ load_cr3(early_top_pgt);
__flush_tlb_all();
clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
@@ -148,7 +148,7 @@ void __init kasan_init(void)
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
(void *)KASAN_SHADOW_END);
- load_cr3(init_level4_pgt);
+ load_cr3(init_top_pgt);
__flush_tlb_all();
/*
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index aed206475aa7..af599167fe3c 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -6,12 +6,12 @@
*
* Entropy is generated using the KASLR early boot functions now shared in
* the lib directory (originally written by Kees Cook). Randomization is
- * done on PGD & PUD page table levels to increase possible addresses. The
- * physical memory mapping code was adapted to support PUD level virtual
- * addresses. This implementation on the best configuration provides 30,000
- * possible virtual addresses in average for each memory region. An additional
- * low memory page is used to ensure each CPU can start with a PGD aligned
- * virtual address (for realmode).
+ * done on PGD & P4D/PUD page table levels to increase possible addresses.
+ * The physical memory mapping code was adapted to support P4D/PUD level
+ * virtual addresses. This implementation on the best configuration provides
+ * 30,000 possible virtual addresses on average for each memory region.
+ * An additional low memory page is used to ensure each CPU can start with
+ * a PGD aligned virtual address (for realmode).
*
* The order of each memory region is not changed. The feature looks at
* the available space for the regions based on different configuration
@@ -70,7 +70,7 @@ static __initdata struct kaslr_memory_region {
unsigned long *base;
unsigned long size_tb;
} kaslr_regions[] = {
- { &page_offset_base, 64/* Maximum */ },
+ { &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ },
{ &vmalloc_base, VMALLOC_SIZE_TB },
{ &vmemmap_base, 1 },
};
@@ -142,7 +142,10 @@ void __init kernel_randomize_memory(void)
*/
entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
prandom_bytes_state(&rand_state, &rand, sizeof(rand));
- entropy = (rand % (entropy + 1)) & PUD_MASK;
+ if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ entropy = (rand % (entropy + 1)) & P4D_MASK;
+ else
+ entropy = (rand % (entropy + 1)) & PUD_MASK;
vaddr += entropy;
*kaslr_regions[i].base = vaddr;
@@ -151,27 +154,21 @@ void __init kernel_randomize_memory(void)
* randomization alignment.
*/
vaddr += get_padding(&kaslr_regions[i]);
- vaddr = round_up(vaddr + 1, PUD_SIZE);
+ if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ vaddr = round_up(vaddr + 1, P4D_SIZE);
+ else
+ vaddr = round_up(vaddr + 1, PUD_SIZE);
remain_entropy -= entropy;
}
}
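
With 5-level paging the randomized bases are aligned to P4D-sized units instead of PUD-sized ones, so each region still starts on a boundary that a single upper-level page-table entry can map. A small illustrative calculation of that alignment step; the shift values are the usual x86-64 ones, and the helper is not kernel code.

    #include <stdint.h>
    #include <stdio.h>

    #define PUD_SHIFT 30    /* 1 GiB steps with 4-level paging */
    #define P4D_SHIFT 39    /* 512 GiB steps with 5-level paging */
    #define PUD_MASK  (~((UINT64_C(1) << PUD_SHIFT) - 1))
    #define P4D_MASK  (~((UINT64_C(1) << P4D_SHIFT) - 1))

    /* Keep the randomized offset aligned to the unit one entry of the
     * relevant page-table level maps, as the patch does with
     * P4D_MASK vs PUD_MASK. */
    static uint64_t align_entropy(uint64_t rand, uint64_t room, int five_level)
    {
        uint64_t e = rand % (room + 1);

        return e & (five_level ? P4D_MASK : PUD_MASK);
    }

    int main(void)
    {
        uint64_t rand = UINT64_C(0x123456789abc), room = UINT64_C(1) << 44;

        printf("4-level entropy: %#llx\n",
               (unsigned long long)align_entropy(rand, room, 0));
        printf("5-level entropy: %#llx\n",
               (unsigned long long)align_entropy(rand, room, 1));
        return 0;
    }
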
-/*
- * Create PGD aligned trampoline table to allow real mode initialization
- * of additional CPUs. Consume only 1 low memory page.
- */
-void __meminit init_trampoline(void)
+static void __meminit init_trampoline_pud(void)
{
unsigned long paddr, paddr_next;
pgd_t *pgd;
pud_t *pud_page, *pud_page_tramp;
int i;
- if (!kaslr_memory_enabled()) {
- init_trampoline_default();
- return;
- }
-
pud_page_tramp = alloc_low_page();
paddr = 0;
@@ -192,3 +189,49 @@ void __meminit init_trampoline(void)
set_pgd(&trampoline_pgd_entry,
__pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
}
+
+static void __meminit init_trampoline_p4d(void)
+{
+ unsigned long paddr, paddr_next;
+ pgd_t *pgd;
+ p4d_t *p4d_page, *p4d_page_tramp;
+ int i;
+
+ p4d_page_tramp = alloc_low_page();
+
+ paddr = 0;
+ pgd = pgd_offset_k((unsigned long)__va(paddr));
+ p4d_page = (p4d_t *) pgd_page_vaddr(*pgd);
+
+ for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) {
+ p4d_t *p4d, *p4d_tramp;
+ unsigned long vaddr = (unsigned long)__va(paddr);
+
+ p4d_tramp = p4d_page_tramp + p4d_index(paddr);
+ p4d = p4d_page + p4d_index(vaddr);
+ paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
+
+ *p4d_tramp = *p4d;
+ }
+
+ set_pgd(&trampoline_pgd_entry,
+ __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)));
+}
+
+/*
+ * Create PGD aligned trampoline table to allow real mode initialization
+ * of additional CPUs. Consume only 1 low memory page.
+ */
+void __meminit init_trampoline(void)
+{
+
+ if (!kaslr_memory_enabled()) {
+ init_trampoline_default();
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ init_trampoline_p4d();
+ else
+ init_trampoline_pud();
+}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 19ad095b41df..797295e792b2 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -74,9 +74,6 @@ static int mmap_is_legacy(void)
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
- if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
- return 1;
-
return sysctl_legacy_va_layout;
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6e7bedf69af7..014d07a80053 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -15,7 +15,7 @@
#include <linux/debugfs.h>
/*
- * Smarter SMP flushing macros.
+ * TLB flushing, formerly SMP-only
* c/o Linus Torvalds.
*
* These mean you can really definitely utterly forget about
@@ -28,39 +28,28 @@
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
-#ifdef CONFIG_SMP
-
-struct flush_tlb_info {
- struct mm_struct *flush_mm;
- unsigned long flush_start;
- unsigned long flush_end;
-};
-
-/*
- * We cannot call mmdrop() because we are in interrupt context,
- * instead update mm->cpu_vm_mask.
- */
void leave_mm(int cpu)
{
- struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);
+ struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+
+ /*
+ * It's plausible that we're in lazy TLB mode while our mm is init_mm.
+ * If so, our callers still expect us to flush the TLB, but there
+ * aren't any user TLB entries in init_mm to worry about.
+ *
+ * This needs to happen before any other sanity checks due to
+ * intel_idle's shenanigans.
+ */
+ if (loaded_mm == &init_mm)
+ return;
+
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
BUG();
- if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
- cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
- load_cr3(swapper_pg_dir);
- /*
- * This gets called in the idle path where RCU
- * functions differently. Tracing normally
- * uses RCU, so we have to call the tracepoint
- * specially here.
- */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
- }
+
+ switch_mm(NULL, &init_mm, NULL);
}
EXPORT_SYMBOL_GPL(leave_mm);
-#endif /* CONFIG_SMP */
-
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
@@ -75,216 +64,167 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
unsigned cpu = smp_processor_id();
+ struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
- if (likely(prev != next)) {
- if (IS_ENABLED(CONFIG_VMAP_STACK)) {
- /*
- * If our current stack is in vmalloc space and isn't
- * mapped in the new pgd, we'll double-fault. Forcibly
- * map it.
- */
- unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
-
- pgd_t *pgd = next->pgd + stack_pgd_index;
-
- if (unlikely(pgd_none(*pgd)))
- set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
- }
+ /*
+ * NB: The scheduler will call us with prev == next when
+ * switching from lazy TLB mode to normal mode if active_mm
+ * isn't changing. When this happens, there is no guarantee
+ * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
+ *
+ * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
+ */
-#ifdef CONFIG_SMP
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- this_cpu_write(cpu_tlbstate.active_mm, next);
-#endif
+ this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- cpumask_set_cpu(cpu, mm_cpumask(next));
+ if (real_prev == next) {
+ /*
+ * There's nothing to do: we always keep the per-mm control
+ * regs in sync with cpu_tlbstate.loaded_mm. Just
+ * sanity-check mm_cpumask.
+ */
+ if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+ return;
+ }
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
- * Re-load page tables.
- *
- * This logic has an ordering constraint:
- *
- * CPU 0: Write to a PTE for 'next'
- * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
- * CPU 1: set bit 1 in next's mm_cpumask
- * CPU 1: load from the PTE that CPU 0 writes (implicit)
- *
- * We need to prevent an outcome in which CPU 1 observes
- * the new PTE value and CPU 0 observes bit 1 clear in
- * mm_cpumask. (If that occurs, then the IPI will never
- * be sent, and CPU 0's TLB will contain a stale entry.)
- *
- * The bad outcome can occur if either CPU's load is
- * reordered before that CPU's store, so both CPUs must
- * execute full barriers to prevent this from happening.
- *
- * Thus, switch_mm needs a full barrier between the
- * store to mm_cpumask and any operation that could load
- * from next->pgd. TLB fills are special and can happen
- * due to instruction fetches or for no reason at all,
- * and neither LOCK nor MFENCE orders them.
- * Fortunately, load_cr3() is serializing and gives the
- * ordering guarantee we need.
- *
+ * If our current stack is in vmalloc space and isn't
+ * mapped in the new pgd, we'll double-fault. Forcibly
+ * map it.
*/
- load_cr3(next->pgd);
+ unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ pgd_t *pgd = next->pgd + stack_pgd_index;
- /* Stop flush ipis for the previous mm */
- cpumask_clear_cpu(cpu, mm_cpumask(prev));
+ if (unlikely(pgd_none(*pgd)))
+ set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
+ }
- /* Load per-mm CR4 state */
- load_mm_cr4(next);
+ this_cpu_write(cpu_tlbstate.loaded_mm, next);
-#ifdef CONFIG_MODIFY_LDT_SYSCALL
- /*
- * Load the LDT, if the LDT is different.
- *
- * It's possible that prev->context.ldt doesn't match
- * the LDT register. This can happen if leave_mm(prev)
- * was called and then modify_ldt changed
- * prev->context.ldt but suppressed an IPI to this CPU.
- * In this case, prev->context.ldt != NULL, because we
- * never set context.ldt to NULL while the mm still
- * exists. That means that next->context.ldt !=
- * prev->context.ldt, because mms never share an LDT.
- */
- if (unlikely(prev->context.ldt != next->context.ldt))
- load_mm_ldt(next);
-#endif
+ WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+
+ /*
+ * Re-load page tables.
+ *
+ * This logic has an ordering constraint:
+ *
+ * CPU 0: Write to a PTE for 'next'
+ * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
+ * CPU 1: set bit 1 in next's mm_cpumask
+ * CPU 1: load from the PTE that CPU 0 writes (implicit)
+ *
+ * We need to prevent an outcome in which CPU 1 observes
+ * the new PTE value and CPU 0 observes bit 1 clear in
+ * mm_cpumask. (If that occurs, then the IPI will never
+ * be sent, and CPU 0's TLB will contain a stale entry.)
+ *
+ * The bad outcome can occur if either CPU's load is
+ * reordered before that CPU's store, so both CPUs must
+ * execute full barriers to prevent this from happening.
+ *
+ * Thus, switch_mm needs a full barrier between the
+ * store to mm_cpumask and any operation that could load
+ * from next->pgd. TLB fills are special and can happen
+ * due to instruction fetches or for no reason at all,
+ * and neither LOCK nor MFENCE orders them.
+ * Fortunately, load_cr3() is serializing and gives the
+ * ordering guarantee we need.
+ */
+ load_cr3(next->pgd);
+
+ /*
+ * This gets called via leave_mm() in the idle path where RCU
+ * functions differently. Tracing normally uses RCU, so we have to
+ * call the tracepoint specially here.
+ */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+
+ /* Stop flush ipis for the previous mm */
+ WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
+ real_prev != &init_mm);
+ cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+
+ /* Load per-mm CR4 and LDTR state */
+ load_mm_cr4(next);
+ switch_ldt(real_prev, next);
+}
+
+static void flush_tlb_func_common(const struct flush_tlb_info *f,
+ bool local, enum tlb_flush_reason reason)
+{
+ /* This code cannot presently handle being reentered. */
+ VM_WARN_ON(!irqs_disabled());
+
+ if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
+ leave_mm(smp_processor_id());
+ return;
}
-#ifdef CONFIG_SMP
- else {
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
-
- if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
- /*
- * On established mms, the mm_cpumask is only changed
- * from irq context, from ptep_clear_flush() while in
- * lazy tlb mode, and here. Irqs are blocked during
- * schedule, protecting us from simultaneous changes.
- */
- cpumask_set_cpu(cpu, mm_cpumask(next));
- /*
- * We were in lazy tlb mode and leave_mm disabled
- * tlb flush IPI delivery. We must reload CR3
- * to make sure to use no freed page tables.
- *
- * As above, load_cr3() is serializing and orders TLB
- * fills with respect to the mm_cpumask write.
- */
- load_cr3(next->pgd);
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
- load_mm_cr4(next);
- load_mm_ldt(next);
+ if (f->end == TLB_FLUSH_ALL) {
+ local_flush_tlb();
+ if (local)
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ trace_tlb_flush(reason, TLB_FLUSH_ALL);
+ } else {
+ unsigned long addr;
+ unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+ addr = f->start;
+ while (addr < f->end) {
+ __flush_tlb_single(addr);
+ addr += PAGE_SIZE;
}
+ if (local)
+ count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
+ trace_tlb_flush(reason, nr_pages);
}
-#endif
}
-#ifdef CONFIG_SMP
+static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
+{
+ const struct flush_tlb_info *f = info;
-/*
- * The flush IPI assumes that a thread switch happens in this order:
- * [cpu0: the cpu that switches]
- * 1) switch_mm() either 1a) or 1b)
- * 1a) thread switch to a different mm
- * 1a1) set cpu_tlbstate to TLBSTATE_OK
- * Now the tlb flush NMI handler flush_tlb_func won't call leave_mm
- * if cpu0 was in lazy tlb mode.
- * 1a2) update cpu active_mm
- * Now cpu0 accepts tlb flushes for the new mm.
- * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
- * Now the other cpus will send tlb flush ipis.
- * 1a4) change cr3.
- * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
- * Stop ipi delivery for the old mm. This is not synchronized with
- * the other cpus, but flush_tlb_func ignore flush ipis for the wrong
- * mm, and in the worst case we perform a superfluous tlb flush.
- * 1b) thread switch without mm change
- * cpu active_mm is correct, cpu0 already handles flush ipis.
- * 1b1) set cpu_tlbstate to TLBSTATE_OK
- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
- * Atomically set the bit [other cpus will start sending flush ipis],
- * and test the bit.
- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
- * 2) switch %%esp, ie current
- *
- * The interrupt must handle 2 special cases:
- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
- * runs in kernel space, the cpu could load tlb entries for user space
- * pages.
- *
- * The good news is that cpu_tlbstate is local to each cpu, no
- * write/read ordering problems.
- */
+ flush_tlb_func_common(f, true, reason);
+}
-/*
- * TLB flush funcation:
- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
- * 2) Leave the mm if we are in the lazy tlb mode.
- */
-static void flush_tlb_func(void *info)
+static void flush_tlb_func_remote(void *info)
{
- struct flush_tlb_info *f = info;
+ const struct flush_tlb_info *f = info;
inc_irq_stat(irq_tlb_count);
- if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
+ if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
return;
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
- if (f->flush_end == TLB_FLUSH_ALL) {
- local_flush_tlb();
- trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
- } else {
- unsigned long addr;
- unsigned long nr_pages =
- (f->flush_end - f->flush_start) / PAGE_SIZE;
- addr = f->flush_start;
- while (addr < f->flush_end) {
- __flush_tlb_single(addr);
- addr += PAGE_SIZE;
- }
- trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
- }
- } else
- leave_mm(smp_processor_id());
-
+ flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm, unsigned long start,
- unsigned long end)
+ const struct flush_tlb_info *info)
{
- struct flush_tlb_info info;
-
- info.flush_mm = mm;
- info.flush_start = start;
- info.flush_end = end;
-
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
- if (end == TLB_FLUSH_ALL)
+ if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
else
trace_tlb_flush(TLB_REMOTE_SEND_IPI,
- (end - start) >> PAGE_SHIFT);
+ (info->end - info->start) >> PAGE_SHIFT);
if (is_uv_system()) {
unsigned int cpu;
cpu = smp_processor_id();
- cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
+ cpumask = uv_flush_tlb_others(cpumask, info);
if (cpumask)
- smp_call_function_many(cpumask, flush_tlb_func,
- &info, 1);
+ smp_call_function_many(cpumask, flush_tlb_func_remote,
+ (void *)info, 1);
return;
}
- smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
+ smp_call_function_many(cpumask, flush_tlb_func_remote,
+ (void *)info, 1);
}
/*
@@ -302,85 +242,41 @@ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag)
{
- unsigned long addr;
- /* do a global flush by default */
- unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
-
- preempt_disable();
+ int cpu;
- if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
- base_pages_to_flush = (end - start) >> PAGE_SHIFT;
- if (base_pages_to_flush > tlb_single_page_flush_ceiling)
- base_pages_to_flush = TLB_FLUSH_ALL;
+ struct flush_tlb_info info = {
+ .mm = mm,
+ };
- if (current->active_mm != mm) {
- /* Synchronize with switch_mm. */
- smp_mb();
+ cpu = get_cpu();
- goto out;
- }
-
- if (!current->mm) {
- leave_mm(smp_processor_id());
+ /* Synchronize with switch_mm. */
+ smp_mb();
- /* Synchronize with switch_mm. */
- smp_mb();
-
- goto out;
- }
-
- /*
- * Both branches below are implicit full barriers (MOV to CR or
- * INVLPG) that synchronize with switch_mm.
- */
- if (base_pages_to_flush == TLB_FLUSH_ALL) {
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- local_flush_tlb();
+ /* Should we flush just the requested range? */
+ if ((end != TLB_FLUSH_ALL) &&
+ !(vmflag & VM_HUGETLB) &&
+ ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
+ info.start = start;
+ info.end = end;
} else {
- /* flush range by one by one 'invlpg' */
- for (addr = start; addr < end; addr += PAGE_SIZE) {
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
- __flush_tlb_single(addr);
- }
- }
- trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
-out:
- if (base_pages_to_flush == TLB_FLUSH_ALL) {
- start = 0UL;
- end = TLB_FLUSH_ALL;
+ info.start = 0UL;
+ info.end = TLB_FLUSH_ALL;
}
- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, start, end);
- preempt_enable();
-}
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
-{
- struct mm_struct *mm = vma->vm_mm;
-
- preempt_disable();
-
- if (current->active_mm == mm) {
- if (current->mm) {
- /*
- * Implicit full barrier (INVLPG) that synchronizes
- * with switch_mm.
- */
- __flush_tlb_one(start);
- } else {
- leave_mm(smp_processor_id());
-
- /* Synchronize with switch_mm. */
- smp_mb();
- }
+ if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
+ VM_WARN_ON(irqs_disabled());
+ local_irq_disable();
+ flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
+ local_irq_enable();
}
- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE);
-
- preempt_enable();
+ if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
+ flush_tlb_others(mm_cpumask(mm), &info);
+ put_cpu();
}
+
static void do_flush_tlb_all(void *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
@@ -401,7 +297,7 @@ static void do_kernel_range_flush(void *info)
unsigned long addr;
/* flush range by one by one 'invlpg' */
- for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
+ for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
__flush_tlb_single(addr);
}
@@ -410,16 +306,40 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
/* Balance as user space task's flush, a bit conservative */
if (end == TLB_FLUSH_ALL ||
- (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
+ (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
on_each_cpu(do_flush_tlb_all, NULL, 1);
} else {
struct flush_tlb_info info;
- info.flush_start = start;
- info.flush_end = end;
+ info.start = start;
+ info.end = end;
on_each_cpu(do_kernel_range_flush, &info, 1);
}
}
+void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
+{
+ struct flush_tlb_info info = {
+ .mm = NULL,
+ .start = 0UL,
+ .end = TLB_FLUSH_ALL,
+ };
+
+ int cpu = get_cpu();
+
+ if (cpumask_test_cpu(cpu, &batch->cpumask)) {
+ VM_WARN_ON(irqs_disabled());
+ local_irq_disable();
+ flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
+ local_irq_enable();
+ }
+
+ if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
+ flush_tlb_others(&batch->cpumask, &info);
+ cpumask_clear(&batch->cpumask);
+
+ put_cpu();
+}
+
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
@@ -465,5 +385,3 @@ static int __init create_tlb_single_page_flush_ceiling(void)
return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);
-
-#endif /* CONFIG_SMP */
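The refactoring above folds the separate mm/start/end arguments into a single struct flush_tlb_info that both the local and the remote flush paths consume. A minimal userspace sketch of the same pattern, with hypothetical values (not the kernel API):

#include <stdio.h>

#define TLB_FLUSH_ALL (~0UL)

struct flush_tlb_info {
	void *mm;            /* owning address space, NULL for a global flush */
	unsigned long start; /* first address to flush */
	unsigned long end;   /* one past the last address, or TLB_FLUSH_ALL */
};

/* One consumer works for both the local and the remote path. */
static void flush_one_cpu(const struct flush_tlb_info *info)
{
	if (info->end == TLB_FLUSH_ALL)
		printf("full flush\n");
	else
		printf("ranged flush: %lu page(s)\n",
		       (info->end - info->start) >> 12);
}

int main(void)
{
	struct flush_tlb_info info = { .mm = NULL, .start = 0x1000, .end = 0x5000 };

	flush_one_cpu(&info);

	info.start = 0;
	info.end = TLB_FLUSH_ALL;
	flush_one_cpu(&info);
	return 0;
}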
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
index 90568c33ddb0..fefb4b619598 100644
--- a/arch/x86/net/Makefile
+++ b/arch/x86/net/Makefile
@@ -1,4 +1,6 @@
#
# Arch-specific network modules
#
+OBJECT_FILES_NON_STANDARD_bpf_jit.o += y
+
obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
index f1d83b34c329..2f56e1ed61c3 100644
--- a/arch/x86/platform/efi/Makefile
+++ b/arch/x86/platform/efi/Makefile
@@ -1,4 +1,5 @@
OBJECT_FILES_NON_STANDARD_efi_thunk_$(BITS).o := y
+OBJECT_FILES_NON_STANDARD_efi_stub_$(BITS).o := y
obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o
obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 43b96f5f78ba..f084d8718ac4 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -1014,7 +1014,6 @@ static void __init __efi_enter_virtual_mode(void)
* necessary relocation fixups for the new virtual addresses.
*/
efi_runtime_update_mappings();
- efi_dump_pagetable();
/* clean DUMMY object */
efi_delete_dummy_variable();
@@ -1029,6 +1028,8 @@ void __init efi_enter_virtual_mode(void)
kexec_enter_virtual_mode();
else
__efi_enter_virtual_mode();
+
+ efi_dump_pagetable();
}
/*
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 3481268da3d0..52f7faa1538f 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -44,7 +44,14 @@ int __init efi_alloc_page_tables(void)
}
void efi_sync_low_kernel_mappings(void) {}
-void __init efi_dump_pagetable(void) {}
+
+void __init efi_dump_pagetable(void)
+{
+#ifdef CONFIG_EFI_PGT_DUMP
+ ptdump_walk_pgd_level(NULL, swapper_pg_dir);
+#endif
+}
+
int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
{
return 0;
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index eb8dff15a7f6..9bf72f5bfedb 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -80,7 +80,7 @@ pgd_t * __init efi_call_phys_prolog(void)
int n_pgds, i, j;
if (!efi_enabled(EFI_OLD_MEMMAP)) {
- save_pgd = (pgd_t *)read_cr3();
+ save_pgd = (pgd_t *)__read_cr3();
write_cr3((unsigned long)efi_scratch.efi_pgt);
goto out;
}
@@ -589,7 +589,10 @@ void __init efi_runtime_update_mappings(void)
void __init efi_dump_pagetable(void)
{
#ifdef CONFIG_EFI_PGT_DUMP
- ptdump_walk_pgd_level(NULL, efi_pgd);
+ if (efi_enabled(EFI_OLD_MEMMAP))
+ ptdump_walk_pgd_level(NULL, swapper_pg_dir);
+ else
+ ptdump_walk_pgd_level(NULL, efi_pgd);
#endif
}
@@ -646,7 +649,7 @@ efi_status_t efi_thunk_set_virtual_address_map(
efi_sync_low_kernel_mappings();
local_irq_save(flags);
- efi_scratch.prev_cr3 = read_cr3();
+ efi_scratch.prev_cr3 = __read_cr3();
write_cr3((unsigned long)efi_scratch.efi_pgt);
__flush_tlb_all();
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index e0cf95a83f3f..8a99a2e96537 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -15,12 +15,66 @@
#include <asm/e820/api.h>
#include <asm/efi.h>
#include <asm/uv/uv.h>
+#include <asm/cpu_device_id.h>
#define EFI_MIN_RESERVE 5120
#define EFI_DUMMY_GUID \
EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 0x9f, 0x92, 0xa9)
+#define QUARK_CSH_SIGNATURE 0x5f435348 /* _CSH */
+#define QUARK_SECURITY_HEADER_SIZE 0x400
+
+/*
+ * Header prepended to the standard EFI capsule on Quark systems that are based
+ * on Intel firmware BSP.
+ * @csh_signature: Unique identifier to sanity check signed module
+ * presence ("_CSH").
+ * @version: Current version of CSH used. Should be one for Quark A0.
+ * @modulesize: Size of the entire module including the module header
+ * and payload.
+ * @security_version_number_index: Index of SVN to use for validation of signed
+ * module.
+ * @security_version_number: Used to prevent rollback of modules.
+ * @rsvd_module_id: Currently unused for Clanton (Quark).
+ * @rsvd_module_vendor: Vendor Identifier. For Intel products value is
+ * 0x00008086.
+ * @rsvd_date: BCD representation of build date as yyyymmdd, where
+ * yyyy=4 digit year, mm=1-12, dd=1-31.
+ * @headersize: Total length of the header including any
+ * padding optionally added by the signing tool.
+ * @hash_algo: What Hash is used in the module signing.
+ * @cryp_algo: What Crypto is used in the module signing.
+ * @keysize: Total length of the key data including any
+ * padding optionally added by the signing tool.
+ * @signaturesize: Total length of the signature including any
+ * padding optionally added by the signing tool.
+ * @rsvd_next_header: 32-bit pointer to the next Secure Boot Module in the
+ * chain, if there is a next header.
+ * @rsvd: Reserved, padding structure to required size.
+ *
+ * See also QuartSecurityHeader_t in
+ * Quark_EDKII_v1.2.1.1/QuarkPlatformPkg/Include/QuarkBootRom.h
+ * from https://downloadcenter.intel.com/download/23197/Intel-Quark-SoC-X1000-Board-Support-Package-BSP
+ */
+struct quark_security_header {
+ u32 csh_signature;
+ u32 version;
+ u32 modulesize;
+ u32 security_version_number_index;
+ u32 security_version_number;
+ u32 rsvd_module_id;
+ u32 rsvd_module_vendor;
+ u32 rsvd_date;
+ u32 headersize;
+ u32 hash_algo;
+ u32 cryp_algo;
+ u32 keysize;
+ u32 signaturesize;
+ u32 rsvd_next_header;
+ u32 rsvd[2];
+};
+
static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 };
static bool efi_no_storage_paranoia;
@@ -504,3 +558,86 @@ bool efi_poweroff_required(void)
{
return acpi_gbl_reduced_hardware || acpi_no_s5;
}
+
+#ifdef CONFIG_EFI_CAPSULE_QUIRK_QUARK_CSH
+
+static int qrk_capsule_setup_info(struct capsule_info *cap_info, void **pkbuff,
+ size_t hdr_bytes)
+{
+ struct quark_security_header *csh = *pkbuff;
+
+ /* Only process data block that is larger than the security header */
+ if (hdr_bytes < sizeof(struct quark_security_header))
+ return 0;
+
+ if (csh->csh_signature != QUARK_CSH_SIGNATURE ||
+ csh->headersize != QUARK_SECURITY_HEADER_SIZE)
+ return 1;
+
+ /* Only process data block if EFI header is included */
+ if (hdr_bytes < QUARK_SECURITY_HEADER_SIZE +
+ sizeof(efi_capsule_header_t))
+ return 0;
+
+ pr_debug("Quark security header detected\n");
+
+ if (csh->rsvd_next_header != 0) {
+ pr_err("multiple Quark security headers not supported\n");
+ return -EINVAL;
+ }
+
+ *pkbuff += csh->headersize;
+ cap_info->total_size = csh->headersize;
+
+ /*
+ * Update the first page pointer to skip over the CSH header.
+ */
+ cap_info->pages[0] += csh->headersize;
+
+ return 1;
+}
+
+#define ICPU(family, model, quirk_handler) \
+ { X86_VENDOR_INTEL, family, model, X86_FEATURE_ANY, \
+ (unsigned long)&quirk_handler }
+
+static const struct x86_cpu_id efi_capsule_quirk_ids[] = {
+ ICPU(5, 9, qrk_capsule_setup_info), /* Intel Quark X1000 */
+ { }
+};
+
+int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff,
+ size_t hdr_bytes)
+{
+ int (*quirk_handler)(struct capsule_info *, void **, size_t);
+ const struct x86_cpu_id *id;
+ int ret;
+
+ if (hdr_bytes < sizeof(efi_capsule_header_t))
+ return 0;
+
+ cap_info->total_size = 0;
+
+ id = x86_match_cpu(efi_capsule_quirk_ids);
+ if (id) {
+ /*
+ * The quirk handler is supposed to return
+ * - a value > 0 if the setup should continue, after advancing
+ * kbuff as needed
+ * - 0 if not enough hdr_bytes are available yet
+ * - a negative error code otherwise
+ */
+ quirk_handler = (typeof(quirk_handler))id->driver_data;
+ ret = quirk_handler(cap_info, &kbuff, hdr_bytes);
+ if (ret <= 0)
+ return ret;
+ }
+
+ memcpy(&cap_info->header, kbuff, sizeof(cap_info->header));
+
+ cap_info->total_size += cap_info->header.imagesize;
+
+ return __efi_capsule_setup_info(cap_info);
+}
+
+#endif
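The comment inside efi_capsule_setup_info() above spells out the quirk-handler contract: a positive return continues setup after the handler has advanced the buffer pointer, zero means not enough header bytes have arrived yet, and a negative value aborts with an error. A self-contained sketch of that dispatch pattern (structure and names are illustrative only):

#include <stdio.h>
#include <stddef.h>

/* Hypothetical stand-in for the capsule bookkeeping structure. */
struct capsule { size_t total_size; };

static int skip_vendor_header(struct capsule *cap, void **buf, size_t len)
{
	if (len < 16)
		return 0;                        /* not enough bytes yet, try again later */
	if (((unsigned char *)*buf)[0] != 0x5f)
		return 1;                        /* no vendor header, continue as-is */

	*buf = (unsigned char *)*buf + 16;       /* advance past the vendor header */
	cap->total_size = 16;
	return 1;
}

static int setup(struct capsule *cap, void *buf, size_t len)
{
	int ret = skip_vendor_header(cap, &buf, len);

	if (ret <= 0)
		return ret;                      /* 0: need more data, <0: error */
	printf("parsing capsule header at %p\n", buf);
	return 0;
}

int main(void)
{
	unsigned char data[32] = { 0x5f };
	struct capsule cap = { 0 };

	return setup(&cap, data, sizeof(data));
}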
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
index c5350fd27d70..0668aaff8bfe 100644
--- a/arch/x86/platform/olpc/olpc-xo1-pm.c
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state)
asmlinkage __visible int xo1_do_sleep(u8 sleep_state)
{
- void *pgd_addr = __va(read_cr3());
+ void *pgd_addr = __va(read_cr3_pa());
/* Program wakeup mask (using dword access to CS5536_PM1_EN) */
outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 42e65fee5673..2983faab5b18 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -456,12 +456,13 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
*/
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
- struct cyc2ns_data *data = cyc2ns_read_begin();
+ struct cyc2ns_data data;
unsigned long long ns;
- ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+ cyc2ns_read_begin(&data);
+ ns = mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
+ cyc2ns_read_end();
- cyc2ns_read_end(data);
return ns;
}
@@ -470,12 +471,13 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
*/
static inline unsigned long long ns_2_cycles(unsigned long long ns)
{
- struct cyc2ns_data *data = cyc2ns_read_begin();
+ struct cyc2ns_data data;
unsigned long long cyc;
- cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul;
+ cyc2ns_read_begin(&data);
+ cyc = (ns << data.cyc2ns_shift) / data.cyc2ns_mul;
+ cyc2ns_read_end();
- cyc2ns_read_end(data);
return cyc;
}
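cycles_2_ns() above now keeps the cyc2ns calibration data on the stack and converts cycles to nanoseconds with a multiply-and-shift. A standalone sketch of that arithmetic with made-up calibration values; the helper below mirrors the generic fallback of mul_u64_u32_shr() and is only valid for shifts up to 32:

#include <stdio.h>
#include <stdint.h>

/* 64x32 -> 64 multiply-then-shift without 128-bit types (shift <= 32). */
static uint64_t mul_u64_u32_shr(uint64_t a, uint32_t mul, unsigned int shift)
{
	uint32_t ah = (uint32_t)(a >> 32);
	uint32_t al = (uint32_t)a;
	uint64_t ret = ((uint64_t)al * mul) >> shift;

	if (ah)
		ret += (uint64_t)ah * mul << (32 - shift);
	return ret;
}

int main(void)
{
	/* Hypothetical calibration for a 3 GHz TSC: mul / 2^32 ~= 1/3 ns per cycle. */
	uint32_t cyc2ns_mul = 1431655765;
	unsigned int cyc2ns_shift = 32;

	/* 3 * 10^9 cycles should come out as roughly 10^9 ns (one second). */
	printf("%llu ns\n", (unsigned long long)
	       mul_u64_u32_shr(3000000000ULL, cyc2ns_mul, cyc2ns_shift));
	return 0;
}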
@@ -1121,11 +1123,9 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
* done. The returned pointer is valid till preemption is re-enabled.
*/
const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
- unsigned int cpu)
+ const struct flush_tlb_info *info)
{
+ unsigned int cpu = smp_processor_id();
int locals = 0, remotes = 0, hubs = 0;
struct bau_desc *bau_desc;
struct cpumask *flush_mask;
@@ -1179,8 +1179,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
record_send_statistics(stat, locals, hubs, remotes, bau_desc);
- if (!end || (end - start) <= PAGE_SIZE)
- address = start;
+ if (!info->end || (info->end - info->start) <= PAGE_SIZE)
+ address = info->start;
else
address = TLB_FLUSH_ALL;
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index a6a198c33623..05041871ac90 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -1,3 +1,5 @@
+OBJECT_FILES_NON_STANDARD_hibernate_asm_$(BITS).o := y
+
# __restore_processor_state() restores %gs after S3 resume and so should not
# itself be stack-protected
nostackp := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 6b05a9219ea2..78459a6d455a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context *ctxt)
*/
ctxt->cr0 = read_cr0();
ctxt->cr2 = read_cr2();
- ctxt->cr3 = read_cr3();
+ ctxt->cr3 = __read_cr3();
ctxt->cr4 = __read_cr4();
#ifdef CONFIG_X86_64
ctxt->cr8 = read_cr8();
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index a6e21fee22ea..e3e62c8a8e70 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -150,7 +150,8 @@ static int relocate_restore_code(void)
memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE);
/* Make the page containing the relocated code executable */
- pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code);
+ pgd = (pgd_t *)__va(read_cr3_pa()) +
+ pgd_index(relocated_restore_code);
p4d = p4d_offset(pgd, relocated_restore_code);
if (p4d_large(*p4d)) {
set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index a163a90af4aa..cd4be19c36dc 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -102,7 +102,7 @@ static void __init setup_real_mode(void)
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
trampoline_pgd[0] = trampoline_pgd_entry.pgd;
- trampoline_pgd[511] = init_level4_pgt[511].pgd;
+ trampoline_pgd[511] = init_top_pgt[511].pgd;
#endif
}
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index fffb0a16f9e3..bced7a369a11 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,3 +1,6 @@
+OBJECT_FILES_NON_STANDARD_xen-asm_$(BITS).o := y
+OBJECT_FILES_NON_STANDARD_xen-pvh.o := y
+
ifdef CONFIG_FUNCTION_TRACER
# Do not profile debug and lowlevel utilities
CFLAGS_REMOVE_spinlock.o = -pg
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
index 30bb2e80cfe7..a18703be9ead 100644
--- a/arch/x86/xen/efi.c
+++ b/arch/x86/xen/efi.c
@@ -54,38 +54,6 @@ static efi_system_table_t efi_systab_xen __initdata = {
.tables = EFI_INVALID_TABLE_ADDR /* Initialized later. */
};
-static const struct efi efi_xen __initconst = {
- .systab = NULL, /* Initialized later. */
- .runtime_version = 0, /* Initialized later. */
- .mps = EFI_INVALID_TABLE_ADDR,
- .acpi = EFI_INVALID_TABLE_ADDR,
- .acpi20 = EFI_INVALID_TABLE_ADDR,
- .smbios = EFI_INVALID_TABLE_ADDR,
- .smbios3 = EFI_INVALID_TABLE_ADDR,
- .sal_systab = EFI_INVALID_TABLE_ADDR,
- .boot_info = EFI_INVALID_TABLE_ADDR,
- .hcdp = EFI_INVALID_TABLE_ADDR,
- .uga = EFI_INVALID_TABLE_ADDR,
- .uv_systab = EFI_INVALID_TABLE_ADDR,
- .fw_vendor = EFI_INVALID_TABLE_ADDR,
- .runtime = EFI_INVALID_TABLE_ADDR,
- .config_table = EFI_INVALID_TABLE_ADDR,
- .get_time = xen_efi_get_time,
- .set_time = xen_efi_set_time,
- .get_wakeup_time = xen_efi_get_wakeup_time,
- .set_wakeup_time = xen_efi_set_wakeup_time,
- .get_variable = xen_efi_get_variable,
- .get_next_variable = xen_efi_get_next_variable,
- .set_variable = xen_efi_set_variable,
- .query_variable_info = xen_efi_query_variable_info,
- .update_capsule = xen_efi_update_capsule,
- .query_capsule_caps = xen_efi_query_capsule_caps,
- .get_next_high_mono_count = xen_efi_get_next_high_mono_count,
- .reset_system = xen_efi_reset_system,
- .set_virtual_address_map = NULL, /* Not used under Xen. */
- .flags = 0 /* Initialized later. */
-};
-
static efi_system_table_t __init *xen_efi_probe(void)
{
struct xen_platform_op op = {
@@ -102,7 +70,18 @@ static efi_system_table_t __init *xen_efi_probe(void)
/* Here we know that Xen runs on EFI platform. */
- efi = efi_xen;
+ efi.get_time = xen_efi_get_time;
+ efi.set_time = xen_efi_set_time;
+ efi.get_wakeup_time = xen_efi_get_wakeup_time;
+ efi.set_wakeup_time = xen_efi_set_wakeup_time;
+ efi.get_variable = xen_efi_get_variable;
+ efi.get_next_variable = xen_efi_get_next_variable;
+ efi.set_variable = xen_efi_set_variable;
+ efi.query_variable_info = xen_efi_query_variable_info;
+ efi.update_capsule = xen_efi_update_capsule;
+ efi.query_capsule_caps = xen_efi_query_capsule_caps;
+ efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count;
+ efi.reset_system = xen_efi_reset_system;
efi_systab_xen.tables = info->cfg.addr;
efi_systab_xen.nr_tables = info->cfg.nent;
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 1f386d7fdf70..1d7a7213a310 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -975,37 +975,32 @@ static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
spin_unlock(&mm->page_table_lock);
}
-
-#ifdef CONFIG_SMP
-/* Another cpu may still have their %cr3 pointing at the pagetable, so
- we need to repoint it somewhere else before we can unpin it. */
-static void drop_other_mm_ref(void *info)
+static void drop_mm_ref_this_cpu(void *info)
{
struct mm_struct *mm = info;
- struct mm_struct *active_mm;
-
- active_mm = this_cpu_read(cpu_tlbstate.active_mm);
- if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
leave_mm(smp_processor_id());
- /* If this cpu still has a stale cr3 reference, then make sure
- it has been flushed. */
+ /*
+ * If this cpu still has a stale cr3 reference, then make sure
+ * it has been flushed.
+ */
if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
- load_cr3(swapper_pg_dir);
+ xen_mc_flush();
}
+#ifdef CONFIG_SMP
+/*
+ * Another cpu may still have their %cr3 pointing at the pagetable, so
+ * we need to repoint it somewhere else before we can unpin it.
+ */
static void xen_drop_mm_ref(struct mm_struct *mm)
{
cpumask_var_t mask;
unsigned cpu;
- if (current->active_mm == mm) {
- if (current->mm == mm)
- load_cr3(swapper_pg_dir);
- else
- leave_mm(smp_processor_id());
- }
+ drop_mm_ref_this_cpu(mm);
/* Get the "official" set of cpus referring to our pagetable. */
if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
@@ -1013,31 +1008,31 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
&& per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
continue;
- smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
+ smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
}
return;
}
cpumask_copy(mask, mm_cpumask(mm));
- /* It's possible that a vcpu may have a stale reference to our
- cr3, because its in lazy mode, and it hasn't yet flushed
- its set of pending hypercalls yet. In this case, we can
- look at its actual current cr3 value, and force it to flush
- if needed. */
+ /*
+ * It's possible that a vcpu may have a stale reference to our
+ * cr3, because it's in lazy mode, and it hasn't yet flushed
+ * its set of pending hypercalls. In this case, we can
+ * look at its actual current cr3 value, and force it to flush
+ * if needed.
+ */
for_each_online_cpu(cpu) {
if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
cpumask_set_cpu(cpu, mask);
}
- if (!cpumask_empty(mask))
- smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
+ smp_call_function_many(mask, drop_mm_ref_this_cpu, mm, 1);
free_cpumask_var(mask);
}
#else
static void xen_drop_mm_ref(struct mm_struct *mm)
{
- if (current->active_mm == mm)
- load_cr3(swapper_pg_dir);
+ drop_mm_ref_this_cpu(mm);
}
#endif
@@ -1366,8 +1361,7 @@ static void xen_flush_tlb_single(unsigned long addr)
}
static void xen_flush_tlb_others(const struct cpumask *cpus,
- struct mm_struct *mm, unsigned long start,
- unsigned long end)
+ const struct flush_tlb_info *info)
{
struct {
struct mmuext_op op;
@@ -1379,7 +1373,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
} *args;
struct multicall_space mcs;
- trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
+ trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end);
if (cpumask_empty(cpus))
return; /* nothing to do */
@@ -1393,9 +1387,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
- if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
+ if (info->end != TLB_FLUSH_ALL &&
+ (info->end - info->start) <= PAGE_SIZE) {
args->op.cmd = MMUEXT_INVLPG_MULTI;
- args->op.arg1.linear_addr = start;
+ args->op.arg1.linear_addr = info->start;
}
MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
@@ -1470,8 +1465,8 @@ static void xen_write_cr3(unsigned long cr3)
* At the start of the day - when Xen launches a guest, it has already
* built pagetables for the guest. We diligently look over them
* in xen_setup_kernel_pagetable and graft them in as appropriate in the
- * init_level4_pgt and its friends. Then when we are happy we load
- * the new init_level4_pgt - and continue on.
+ * init_top_pgt and its friends. Then when we are happy we load
+ * the new init_top_pgt - and continue on.
*
* The generic code starts (start_kernel) and 'init_mem_mapping' sets
* up the rest of the pagetables. When it has completed it loads the cr3.
@@ -1914,12 +1909,12 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
pt_end = pt_base + xen_start_info->nr_pt_frames;
/* Zap identity mapping */
- init_level4_pgt[0] = __pgd(0);
+ init_top_pgt[0] = __pgd(0);
/* Pre-constructed entries are in pfn, so convert to mfn */
/* L4[272] -> level3_ident_pgt */
/* L4[511] -> level3_kernel_pgt */
- convert_pfn_mfn(init_level4_pgt);
+ convert_pfn_mfn(init_top_pgt);
/* L3_i[0] -> level2_ident_pgt */
convert_pfn_mfn(level3_ident_pgt);
@@ -1950,10 +1945,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* Copy the initial P->M table mappings if necessary. */
i = pgd_index(xen_start_info->mfn_list);
if (i && i < pgd_index(__START_KERNEL_map))
- init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
+ init_top_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
/* Make pagetable pieces RO */
- set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
+ set_page_prot(init_top_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
@@ -1964,7 +1959,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* Pin down new L4 */
pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
- PFN_DOWN(__pa_symbol(init_level4_pgt)));
+ PFN_DOWN(__pa_symbol(init_top_pgt)));
/* Unpin Xen-provided one */
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
@@ -1974,7 +1969,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
* attach it to, so make sure we just set kernel pgd.
*/
xen_mc_batch();
- __xen_write_cr3(true, __pa(init_level4_pgt));
+ __xen_write_cr3(true, __pa(init_top_pgt));
xen_mc_issue(PARAVIRT_LAZY_CPU);
/* We can't that easily rip out L3 and L2, as the Xen pagetables are
@@ -2022,7 +2017,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
pmd_t pmd;
pte_t pte;
- pa = read_cr3();
+ pa = read_cr3_pa();
pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
sizeof(pgd)));
if (!pgd_present(pgd))
@@ -2102,7 +2097,7 @@ void __init xen_relocate_p2m(void)
pt_phys = pmd_phys + PFN_PHYS(n_pmd);
p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
- pgd = __va(read_cr3());
+ pgd = __va(read_cr3_pa());
new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
idx_p4d = 0;
save_pud = n_pud;
@@ -2209,7 +2204,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
{
unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
- BUG_ON(read_cr3() != __pa(initial_page_table));
+ BUG_ON(read_cr3_pa() != __pa(initial_page_table));
BUG_ON(cr3 != __pa(swapper_pg_dir));
/*
diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S
index 5e246716d58f..e1a5fbeae08d 100644
--- a/arch/x86/xen/xen-pvh.S
+++ b/arch/x86/xen/xen-pvh.S
@@ -87,7 +87,7 @@ ENTRY(pvh_start_xen)
wrmsr
/* Enable pre-constructed page tables. */
- mov $_pa(init_level4_pgt), %eax
+ mov $_pa(init_top_pgt), %eax
mov %eax, %cr3
mov $(X86_CR0_PG | X86_CR0_PE), %eax
mov %eax, %cr0
diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h
index 003eeee3fbc6..30ee8c608853 100644
--- a/arch/xtensa/include/asm/processor.h
+++ b/arch/xtensa/include/asm/processor.h
@@ -213,8 +213,6 @@ struct mm_struct;
#define release_segments(mm) do { } while(0)
#define forget_segments() do { } while (0)
-#define thread_saved_pc(tsk) (task_pt_regs(tsk)->pc)
-
extern unsigned long get_wchan(struct task_struct *p);
#define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc)
diff --git a/arch/xtensa/kernel/syscall.c b/arch/xtensa/kernel/syscall.c
index 06937928cb72..74afbf02d07e 100644
--- a/arch/xtensa/kernel/syscall.c
+++ b/arch/xtensa/kernel/syscall.c
@@ -88,7 +88,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
/* At this point: (!vmm || addr < vmm->vm_end). */
if (TASK_SIZE - len < addr)
return -ENOMEM;
- if (!vmm || addr + len <= vmm->vm_start)
+ if (!vmm || addr + len <= vm_start_gap(vmm))
return addr;
addr = vmm->vm_end;
if (flags & MAP_SHARED)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 12bbc6b8657d..60a6835265fc 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -3483,11 +3483,17 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
}
}
}
- /* Update weight both if it must be raised and if it must be lowered */
+ /*
+ * To improve latency (for this or other queues), immediately
+ * update weight both if it must be raised and if it must be
+ * lowered. Since entity may be on some active tree here, and
+ * might have a pending change of its ioprio class, invoke
+ * next function with the last parameter unset (see the
+ * comments on the function).
+ */
if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
- __bfq_entity_update_weight_prio(
- bfq_entity_service_tree(entity),
- entity);
+ __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity),
+ entity, false);
}
/*
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 5c3bf9861492..8fd83b885774 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -892,7 +892,8 @@ void bfq_put_idle_entity(struct bfq_service_tree *st,
struct bfq_entity *entity);
struct bfq_service_tree *
__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
- struct bfq_entity *entity);
+ struct bfq_entity *entity,
+ bool update_class_too);
void bfq_bfqq_served(struct bfq_queue *bfqq, int served);
void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
unsigned long time_ms);
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 8726ede19eef..5ec05cd42b80 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -694,10 +694,28 @@ struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity)
return sched_data->service_tree + idx;
}
-
+/*
+ * Update weight and priority of entity. If update_class_too is true,
+ * then update the ioprio_class of entity too.
+ *
+ * The reason why the update of ioprio_class is controlled through the
+ * last parameter is as follows. Changing the ioprio class of an
+ * entity implies changing the destination service trees for that
+ * entity. If such a change occurred when the entity is already on one
+ * of the service trees for its previous class, then the state of the
+ * entity would become more complex: none of the new possible service
+ * trees for the entity, according to bfq_entity_service_tree(), would
+ * match any of the possible service trees on which the entity
+ * is. Complex operations involving these trees, such as entity
+ * activations and deactivations, should take into account this
+ * additional complexity. To avoid this issue, this function is
+ * invoked with update_class_too unset at the points in the code where
+ * entity may happen to be on some tree.
+ */
struct bfq_service_tree *
__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
- struct bfq_entity *entity)
+ struct bfq_entity *entity,
+ bool update_class_too)
{
struct bfq_service_tree *new_st = old_st;
@@ -739,9 +757,15 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
bfq_weight_to_ioprio(entity->orig_weight);
}
- if (bfqq)
+ if (bfqq && update_class_too)
bfqq->ioprio_class = bfqq->new_ioprio_class;
- entity->prio_changed = 0;
+
+ /*
+ * Reset prio_changed only if the ioprio_class change
+ * is not pending any longer.
+ */
+ if (!bfqq || bfqq->ioprio_class == bfqq->new_ioprio_class)
+ entity->prio_changed = 0;
/*
* NOTE: here we may be changing the weight too early,
@@ -867,7 +891,12 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
- st = __bfq_entity_update_weight_prio(st, entity);
+ /*
+ * When this function is invoked, entity is not in any service
+ * tree, so it is safe to invoke the next function with the last
+ * parameter set (see the comments on the function).
+ */
+ st = __bfq_entity_update_weight_prio(st, entity, true);
bfq_calc_finish(entity, entity->budget);
/*
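The comment block above __bfq_entity_update_weight_prio() explains why a pending ioprio-class change must not be applied while the entity may still sit on a service tree chosen by its old class. A toy sketch of that deferral, with purely hypothetical names:

#include <stdio.h>

struct entity {
	int ioprio_class;      /* class currently used to pick the service tree */
	int new_ioprio_class;  /* pending class change */
	int on_tree;           /* is the entity currently enqueued? */
};

/* Apply a pending class change only when it cannot desynchronize the tree
 * the entity is queued on (mirrors the role of the update_class_too flag). */
static void update_prio(struct entity *e, int update_class_too)
{
	if (update_class_too && !e->on_tree)
		e->ioprio_class = e->new_ioprio_class;
}

int main(void)
{
	struct entity e = { .ioprio_class = 0, .new_ioprio_class = 2, .on_tree = 1 };

	update_prio(&e, 0);                 /* queued: keep the change pending */
	printf("class while queued: %d\n", e.ioprio_class);

	e.on_tree = 0;
	update_prio(&e, 1);                 /* dequeued: safe to apply */
	printf("class after dequeue: %d\n", e.ioprio_class);
	return 0;
}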
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index b8a3a65f7364..83e92beb3c9f 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -102,7 +102,7 @@ EXPORT_SYMBOL(bio_integrity_alloc);
* Description: Used to free the integrity portion of a bio. Usually
* called from bio_free().
*/
-void bio_integrity_free(struct bio *bio)
+static void bio_integrity_free(struct bio *bio)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
struct bio_set *bs = bio->bi_pool;
@@ -120,8 +120,8 @@ void bio_integrity_free(struct bio *bio)
}
bio->bi_integrity = NULL;
+ bio->bi_opf &= ~REQ_INTEGRITY;
}
-EXPORT_SYMBOL(bio_integrity_free);
/**
* bio_integrity_add_page - Attach integrity metadata
@@ -160,44 +160,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
EXPORT_SYMBOL(bio_integrity_add_page);
/**
- * bio_integrity_enabled - Check whether integrity can be passed
- * @bio: bio to check
- *
- * Description: Determines whether bio_integrity_prep() can be called
- * on this bio or not. bio data direction and target device must be
- * set prior to calling. The functions honors the write_generate and
- * read_verify flags in sysfs.
- */
-bool bio_integrity_enabled(struct bio *bio)
-{
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
-
- if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE)
- return false;
-
- if (!bio_sectors(bio))
- return false;
-
- /* Already protected? */
- if (bio_integrity(bio))
- return false;
-
- if (bi == NULL)
- return false;
-
- if (bio_data_dir(bio) == READ && bi->profile->verify_fn != NULL &&
- (bi->flags & BLK_INTEGRITY_VERIFY))
- return true;
-
- if (bio_data_dir(bio) == WRITE && bi->profile->generate_fn != NULL &&
- (bi->flags & BLK_INTEGRITY_GENERATE))
- return true;
-
- return false;
-}
-EXPORT_SYMBOL(bio_integrity_enabled);
-
-/**
* bio_integrity_intervals - Return number of integrity intervals for a bio
* @bi: blk_integrity profile for device
* @sectors: Size of the bio in 512-byte sectors
@@ -222,10 +184,11 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
/**
* bio_integrity_process - Process integrity metadata for a bio
* @bio: bio to generate/verify integrity metadata for
+ * @proc_iter: iterator to process
* @proc_fn: Pointer to the relevant processing function
*/
static blk_status_t bio_integrity_process(struct bio *bio,
- integrity_processing_fn *proc_fn)
+ struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn)
{
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
struct blk_integrity_iter iter;
@@ -238,10 +201,10 @@ static blk_status_t bio_integrity_process(struct bio *bio,
iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
iter.interval = 1 << bi->interval_exp;
- iter.seed = bip_get_seed(bip);
+ iter.seed = proc_iter->bi_sector;
iter.prot_buf = prot_buf;
- bio_for_each_segment(bv, bio, bviter) {
+ __bio_for_each_segment(bv, bio, bviter, *proc_iter) {
void *kaddr = kmap_atomic(bv.bv_page);
iter.data_buf = kaddr + bv.bv_offset;
@@ -262,14 +225,15 @@ static blk_status_t bio_integrity_process(struct bio *bio,
* bio_integrity_prep - Prepare bio for integrity I/O
* @bio: bio to prepare
*
- * Description: Allocates a buffer for integrity metadata, maps the
- * pages and attaches them to a bio. The bio must have data
- * direction, target device and start sector set priot to calling. In
- * the WRITE case, integrity metadata will be generated using the
- * block device's integrity function. In the READ case, the buffer
+ * Description: Checks if the bio already has an integrity payload attached.
+ * If it does, the payload has been generated by another kernel subsystem,
+ * and we just pass it through. Otherwise allocates integrity payload.
+ * The bio must have data direction, target device and start sector set prior
+ * to calling. In the WRITE case, integrity metadata will be generated using
+ * the block device's integrity function. In the READ case, the buffer
* will be prepared for DMA and a suitable end_io handler set up.
*/
-int bio_integrity_prep(struct bio *bio)
+bool bio_integrity_prep(struct bio *bio)
{
struct bio_integrity_payload *bip;
struct blk_integrity *bi;
@@ -279,20 +243,41 @@ int bio_integrity_prep(struct bio *bio)
unsigned int len, nr_pages;
unsigned int bytes, offset, i;
unsigned int intervals;
+ blk_status_t status;
bi = bdev_get_integrity(bio->bi_bdev);
q = bdev_get_queue(bio->bi_bdev);
- BUG_ON(bi == NULL);
- BUG_ON(bio_integrity(bio));
+ if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE)
+ return true;
+ if (!bio_sectors(bio))
+ return true;
+
+ /* Already protected? */
+ if (bio_integrity(bio))
+ return true;
+
+ if (bi == NULL)
+ return true;
+
+ if (bio_data_dir(bio) == READ) {
+ if (!bi->profile->verify_fn ||
+ !(bi->flags & BLK_INTEGRITY_VERIFY))
+ return true;
+ } else {
+ if (!bi->profile->generate_fn ||
+ !(bi->flags & BLK_INTEGRITY_GENERATE))
+ return true;
+ }
intervals = bio_integrity_intervals(bi, bio_sectors(bio));
/* Allocate kernel buffer for protection data */
len = intervals * bi->tuple_size;
buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
+ status = BLK_STS_RESOURCE;
if (unlikely(buf == NULL)) {
printk(KERN_ERR "could not allocate integrity buffer\n");
- return -ENOMEM;
+ goto err_end_io;
}
end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -304,7 +289,8 @@ int bio_integrity_prep(struct bio *bio)
if (IS_ERR(bip)) {
printk(KERN_ERR "could not allocate data integrity bioset\n");
kfree(buf);
- return PTR_ERR(bip);
+ status = BLK_STS_RESOURCE;
+ goto err_end_io;
}
bip->bip_flags |= BIP_BLOCK_INTEGRITY;
@@ -330,7 +316,7 @@ int bio_integrity_prep(struct bio *bio)
bytes, offset);
if (ret == 0)
- return 0;
+ return false;
if (ret < bytes)
break;
@@ -340,17 +326,18 @@ int bio_integrity_prep(struct bio *bio)
offset = 0;
}
- /* Install custom I/O completion handler if read verify is enabled */
- if (bio_data_dir(bio) == READ) {
- bip->bip_end_io = bio->bi_end_io;
- bio->bi_end_io = bio_integrity_endio;
+ /* Auto-generate integrity metadata if this is a write */
+ if (bio_data_dir(bio) == WRITE) {
+ bio_integrity_process(bio, &bio->bi_iter,
+ bi->profile->generate_fn);
}
+ return true;
- /* Auto-generate integrity metadata if this is a write */
- if (bio_data_dir(bio) == WRITE)
- bio_integrity_process(bio, bi->profile->generate_fn);
+err_end_io:
+ bio->bi_status = status;
+ bio_endio(bio);
+ return false;
- return 0;
}
EXPORT_SYMBOL(bio_integrity_prep);
@@ -368,16 +355,26 @@ static void bio_integrity_verify_fn(struct work_struct *work)
container_of(work, struct bio_integrity_payload, bip_work);
struct bio *bio = bip->bip_bio;
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ struct bvec_iter iter = bio->bi_iter;
- bio->bi_status = bio_integrity_process(bio, bi->profile->verify_fn);
+ /*
+ * At the moment verify is called, the bio's iterator has been
+ * advanced during split and completion, so we need to rewind the
+ * iterator to its original position.
+ */
+ if (bio_rewind_iter(bio, &iter, iter.bi_done)) {
+ bio->bi_status = bio_integrity_process(bio, &iter,
+ bi->profile->verify_fn);
+ } else {
+ bio->bi_status = BLK_STS_IOERR;
+ }
- /* Restore original bio completion handler */
- bio->bi_end_io = bip->bip_end_io;
+ bio_integrity_free(bio);
bio_endio(bio);
}
/**
- * bio_integrity_endio - Integrity I/O completion function
+ * __bio_integrity_endio - Integrity I/O completion function
* @bio: Protected bio
* @error: Pointer to errno
*
@@ -388,27 +385,19 @@ static void bio_integrity_verify_fn(struct work_struct *work)
* in process context. This function postpones completion
* accordingly.
*/
-void bio_integrity_endio(struct bio *bio)
+bool __bio_integrity_endio(struct bio *bio)
{
- struct bio_integrity_payload *bip = bio_integrity(bio);
-
- BUG_ON(bip->bip_bio != bio);
+ if (bio_op(bio) == REQ_OP_READ && !bio->bi_status) {
+ struct bio_integrity_payload *bip = bio_integrity(bio);
- /* In case of an I/O error there is no point in verifying the
- * integrity metadata. Restore original bio end_io handler
- * and run it.
- */
- if (bio->bi_status) {
- bio->bi_end_io = bip->bip_end_io;
- bio_endio(bio);
-
- return;
+ INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
+ queue_work(kintegrityd_wq, &bip->bip_work);
+ return false;
}
- INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
- queue_work(kintegrityd_wq, &bip->bip_work);
+ bio_integrity_free(bio);
+ return true;
}
-EXPORT_SYMBOL(bio_integrity_endio);
/**
* bio_integrity_advance - Advance integrity vector
@@ -425,6 +414,7 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9);
+ bip->bip_iter.bi_sector += bytes_done >> 9;
bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes);
}
EXPORT_SYMBOL(bio_integrity_advance);
@@ -432,22 +422,15 @@ EXPORT_SYMBOL(bio_integrity_advance);
/**
* bio_integrity_trim - Trim integrity vector
* @bio: bio whose integrity vector to update
- * @offset: offset to first data sector
- * @sectors: number of data sectors
*
* Description: Used to trim the integrity vector in a cloned bio.
- * The ivec will be advanced corresponding to 'offset' data sectors
- * and the length will be truncated corresponding to 'len' data
- * sectors.
*/
-void bio_integrity_trim(struct bio *bio, unsigned int offset,
- unsigned int sectors)
+void bio_integrity_trim(struct bio *bio)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
- bio_integrity_advance(bio, offset << 9);
- bip->bip_iter.bi_size = bio_integrity_bytes(bi, sectors);
+ bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio));
}
EXPORT_SYMBOL(bio_integrity_trim);
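With the change above, bio_integrity_prep() returns a bool and completes the bio itself on allocation failure, so callers only need to stop submitting when it returns false. A minimal sketch of this callee-completes-on-error convention (stand-in types, not the block-layer API):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct bio { int status; };

static void bio_endio_stub(struct bio *bio)
{
	printf("bio completed with status %d\n", bio->status);
}

/* Returns true if the caller may keep processing the bio; on failure the
 * bio is completed here, so the caller simply stops. */
static bool prep(struct bio *bio)
{
	void *buf = malloc(64);

	if (!buf) {
		bio->status = -1;        /* stand-in for BLK_STS_RESOURCE */
		bio_endio_stub(bio);
		return false;
	}
	free(buf);
	return true;
}

static void submit(struct bio *bio)
{
	if (!prep(bio))
		return;                  /* error already signalled to the owner */
	printf("dispatching bio\n");
}

int main(void)
{
	struct bio b = { 0 };

	submit(&b);
	return 0;
}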
diff --git a/block/bio.c b/block/bio.c
index 9cf98b29588a..9cabf5d0be20 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -240,20 +240,18 @@ fallback:
return bvl;
}
-static void __bio_free(struct bio *bio)
+void bio_uninit(struct bio *bio)
{
bio_disassociate_task(bio);
-
- if (bio_integrity(bio))
- bio_integrity_free(bio);
}
+EXPORT_SYMBOL(bio_uninit);
static void bio_free(struct bio *bio)
{
struct bio_set *bs = bio->bi_pool;
void *p;
- __bio_free(bio);
+ bio_uninit(bio);
if (bs) {
bvec_free(bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));
@@ -271,6 +269,11 @@ static void bio_free(struct bio *bio)
}
}
+/*
+ * Users of this function have their own bio allocation. Subsequently,
+ * they must remember to pair any call to bio_init() with bio_uninit()
+ * when IO has completed, or when the bio is released.
+ */
void bio_init(struct bio *bio, struct bio_vec *table,
unsigned short max_vecs)
{
@@ -297,7 +300,7 @@ void bio_reset(struct bio *bio)
{
unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
- __bio_free(bio);
+ bio_uninit(bio);
memset(bio, 0, BIO_RESET_BYTES);
bio->bi_flags = flags;
@@ -1807,6 +1810,8 @@ void bio_endio(struct bio *bio)
again:
if (!bio_remaining_done(bio))
return;
+ if (!bio_integrity_endio(bio))
+ return;
/*
* Need to have a real endio function for chained bios, otherwise
@@ -1862,7 +1867,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
split->bi_iter.bi_size = sectors << 9;
if (bio_integrity(split))
- bio_integrity_trim(split, 0, sectors);
+ bio_integrity_trim(split);
bio_advance(bio, split->bi_iter.bi_size);
@@ -1894,6 +1899,10 @@ void bio_trim(struct bio *bio, int offset, int size)
bio_advance(bio, offset << 9);
bio->bi_iter.bi_size = size;
+
+ if (bio_integrity(bio))
+ bio_integrity_trim(bio);
+
}
EXPORT_SYMBOL_GPL(bio_trim);
diff --git a/block/blk-core.c b/block/blk-core.c
index af393d5a9680..970b9c9638c5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1787,11 +1787,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
blk_queue_split(q, &bio);
- if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
- bio->bi_status = BLK_STS_IOERR;
- bio_endio(bio);
+ if (!bio_integrity_prep(bio))
return BLK_QC_T_NONE;
- }
if (op_is_flush(bio->bi_opf)) {
spin_lock_irq(q->queue_lock);
diff --git a/block/blk-lib.c b/block/blk-lib.c
index e8caecd71688..3fe0aec90597 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -261,6 +261,19 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
return 0;
}
+/*
+ * Convert a number of 512B sectors to a number of pages.
+ * The result is limited to a number of pages that can fit into a BIO.
+ * Also make sure that the result is always at least 1 (page) for the cases
+ * where nr_sects is lower than the number of sectors in a page.
+ */
+static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects)
+{
+ sector_t bytes = (nr_sects << 9) + PAGE_SIZE - 1;
+
+ return min(bytes >> PAGE_SHIFT, (sector_t)BIO_MAX_PAGES);
+}
+
/**
* __blkdev_issue_zeroout - generate a number of zero-filled write bios
* @bdev: blockdev to issue
@@ -307,18 +320,18 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
ret = 0;
while (nr_sects != 0) {
- bio = next_bio(bio, min(nr_sects, (sector_t)BIO_MAX_PAGES),
- gfp_mask);
+ bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
+ gfp_mask);
bio->bi_iter.bi_sector = sector;
bio->bi_bdev = bdev;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
while (nr_sects != 0) {
- sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
- bi_size = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
+ sz = min((sector_t) PAGE_SIZE, nr_sects << 9);
+ bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0);
nr_sects -= bi_size >> 9;
sector += bi_size >> 9;
- if (bi_size < (sz << 9))
+ if (bi_size < sz)
break;
}
cond_resched();
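__blkdev_sectors_to_bio_pages() above rounds a 512-byte sector count up to whole pages and clamps the result to what fits in a single bio. A standalone sketch of the same arithmetic, assuming the usual 4 KiB page size and a BIO_MAX_PAGES of 256:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT    12
#define PAGE_SIZE     (1UL << PAGE_SHIFT)
#define BIO_MAX_PAGES 256

static unsigned int sectors_to_bio_pages(uint64_t nr_sects)
{
	/* Round the byte count up so even a single sector yields one page. */
	uint64_t bytes = (nr_sects << 9) + PAGE_SIZE - 1;
	uint64_t pages = bytes >> PAGE_SHIFT;

	return pages < BIO_MAX_PAGES ? (unsigned int)pages : BIO_MAX_PAGES;
}

int main(void)
{
	printf("%u\n", sectors_to_bio_pages(1));        /* 1 page  */
	printf("%u\n", sectors_to_bio_pages(8));        /* 1 page  */
	printf("%u\n", sectors_to_bio_pages(9));        /* 2 pages */
	printf("%u\n", sectors_to_bio_pages(1 << 20));  /* clamped to 256 */
	return 0;
}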
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 7f0dc48ffb40..4ab69435708c 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -515,10 +515,12 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
}
/*
- * Default to 256, since we don't split into sync/async like the
- * old code did. Additionally, this is a per-hw queue depth.
+ * Default to double of smaller one between hw queue_depth and 128,
+ * since we don't split into sync/async like the old code did.
+ * Additionally, this is a per-hw queue depth.
*/
- q->nr_requests = 2 * BLKDEV_MAX_RQ;
+ q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
+ BLKDEV_MAX_RQ);
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_sched_alloc_tags(q, hctx, i);
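The new default above sizes the per-hw-queue scheduler depth at twice the smaller of the hardware queue depth and BLKDEV_MAX_RQ, rather than a flat 2 * BLKDEV_MAX_RQ. A small sketch of that calculation with hypothetical queue depths:

#include <stdio.h>

#define BLKDEV_MAX_RQ 128

static unsigned int sched_depth(unsigned int hw_queue_depth)
{
	unsigned int d = hw_queue_depth < BLKDEV_MAX_RQ ? hw_queue_depth
							 : BLKDEV_MAX_RQ;
	return 2 * d;
}

int main(void)
{
	printf("%u\n", sched_depth(32));    /* shallow hardware queue -> 64  */
	printf("%u\n", sched_depth(1024));  /* deep queue, capped     -> 256 */
	return 0;
}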
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 05dfa3f270ae..77617fb12661 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -941,14 +941,14 @@ static bool reorder_tags_to_front(struct list_head *list)
return first != NULL;
}
-static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags,
+static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
void *key)
{
struct blk_mq_hw_ctx *hctx;
hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
- list_del(&wait->task_list);
+ list_del(&wait->entry);
clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
blk_mq_run_hw_queue(hctx, true);
return 1;
@@ -1550,10 +1550,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_queue_split(q, &bio);
- if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
- bio_io_error(bio);
+ if (!bio_integrity_prep(bio))
return BLK_QC_T_NONE;
- }
if (!is_flush_fua && !blk_queue_nomerges(q) &&
blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 17676f4d7fd1..6a9a0f03a67b 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -503,7 +503,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
}
static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
- wait_queue_t *wait, unsigned long rw)
+ wait_queue_entry_t *wait, unsigned long rw)
{
/*
* inc it here even if disabled, since we'll dec it at completion.
@@ -520,7 +520,7 @@ static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
* in line to be woken up, wait for our turn.
*/
if (waitqueue_active(&rqw->wait) &&
- rqw->wait.task_list.next != &wait->task_list)
+ rqw->wait.head.next != &wait->entry)
return false;
return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
diff --git a/block/blk.h b/block/blk.h
index 01ebb8185f6b..3a3d715bd725 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -81,10 +81,21 @@ static inline void blk_queue_enter_live(struct request_queue *q)
#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
+bool __bio_integrity_endio(struct bio *);
+static inline bool bio_integrity_endio(struct bio *bio)
+{
+ if (bio_integrity(bio))
+ return __bio_integrity_endio(bio);
+ return true;
+}
#else
static inline void blk_flush_integrity(void)
{
}
+static inline bool bio_integrity_endio(struct bio *bio)
+{
+ return true;
+}
#endif
void blk_timeout_work(struct work_struct *work);
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index a9f6fd3fab8e..f58cab82105b 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -99,7 +99,7 @@ struct kyber_hctx_data {
struct list_head rqs[KYBER_NUM_DOMAINS];
unsigned int cur_domain;
unsigned int batching;
- wait_queue_t domain_wait[KYBER_NUM_DOMAINS];
+ wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
atomic_t wait_index[KYBER_NUM_DOMAINS];
};
@@ -385,7 +385,7 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
INIT_LIST_HEAD(&khd->rqs[i]);
- INIT_LIST_HEAD(&khd->domain_wait[i].task_list);
+ INIT_LIST_HEAD(&khd->domain_wait[i].entry);
atomic_set(&khd->wait_index[i], 0);
}
@@ -503,12 +503,12 @@ static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
}
}
-static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags,
+static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
void *key)
{
struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);
- list_del_init(&wait->task_list);
+ list_del_init(&wait->entry);
blk_mq_run_hw_queue(hctx, true);
return 1;
}
@@ -519,7 +519,7 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
{
unsigned int sched_domain = khd->cur_domain;
struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
- wait_queue_t *wait = &khd->domain_wait[sched_domain];
+ wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
struct sbq_wait_state *ws;
int nr;
@@ -532,7 +532,7 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
* run when one becomes available. Note that this is serialized on
* khd->lock, but we still need to be careful about the waker.
*/
- if (list_empty_careful(&wait->task_list)) {
+ if (list_empty_careful(&wait->entry)) {
init_waitqueue_func_entry(wait, kyber_domain_wake);
wait->private = hctx;
ws = sbq_wait_ptr(domain_tokens,
@@ -730,9 +730,9 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \
{ \
struct blk_mq_hw_ctx *hctx = data; \
struct kyber_hctx_data *khd = hctx->sched_data; \
- wait_queue_t *wait = &khd->domain_wait[domain]; \
+ wait_queue_entry_t *wait = &khd->domain_wait[domain]; \
\
- seq_printf(m, "%d\n", !list_empty_careful(&wait->task_list)); \
+ seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \
return 0; \
}
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 3416dadf7b15..a98db384048f 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -28,9 +28,6 @@
typedef __be16 (csum_fn) (void *, unsigned int);
-static const __be16 APP_ESCAPE = (__force __be16) 0xffff;
-static const __be32 REF_ESCAPE = (__force __be32) 0xffffffff;
-
static __be16 t10_pi_crc_fn(void *data, unsigned int len)
{
return cpu_to_be16(crc_t10dif(data, len));
@@ -82,7 +79,7 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
switch (type) {
case 1:
case 2:
- if (pi->app_tag == APP_ESCAPE)
+ if (pi->app_tag == T10_PI_APP_ESCAPE)
goto next;
if (be32_to_cpu(pi->ref_tag) !=
@@ -95,8 +92,8 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
}
break;
case 3:
- if (pi->app_tag == APP_ESCAPE &&
- pi->ref_tag == REF_ESCAPE)
+ if (pi->app_tag == T10_PI_APP_ESCAPE &&
+ pi->ref_tag == T10_PI_REF_ESCAPE)
goto next;
break;
}
diff --git a/drivers/acpi/acpica/tbutils.c b/drivers/acpi/acpica/tbutils.c
index 7abe66505739..0d2e98920069 100644
--- a/drivers/acpi/acpica/tbutils.c
+++ b/drivers/acpi/acpica/tbutils.c
@@ -416,9 +416,18 @@ acpi_tb_get_table(struct acpi_table_desc *table_desc,
}
}
- table_desc->validation_count++;
- if (table_desc->validation_count == 0) {
- table_desc->validation_count--;
+ if (table_desc->validation_count < ACPI_MAX_TABLE_VALIDATIONS) {
+ table_desc->validation_count++;
+
+ /*
+ * Detect validation_count overflows to ensure that the warning
+ * message will only be printed once.
+ */
+ if (table_desc->validation_count >= ACPI_MAX_TABLE_VALIDATIONS) {
+ ACPI_WARNING((AE_INFO,
+ "Table %p, Validation count overflows\n",
+ table_desc));
+ }
}
*out_table = table_desc->pointer;
@@ -445,13 +454,20 @@ void acpi_tb_put_table(struct acpi_table_desc *table_desc)
ACPI_FUNCTION_TRACE(acpi_tb_put_table);
- if (table_desc->validation_count == 0) {
- ACPI_WARNING((AE_INFO,
- "Table %p, Validation count is zero before decrement\n",
- table_desc));
- return_VOID;
+ if (table_desc->validation_count < ACPI_MAX_TABLE_VALIDATIONS) {
+ table_desc->validation_count--;
+
+ /*
+ * Detect validation_count underflows to ensure that the warning
+ * message will only be printed once.
+ */
+ if (table_desc->validation_count >= ACPI_MAX_TABLE_VALIDATIONS) {
+ ACPI_WARNING((AE_INFO,
+ "Table %p, Validation count underflows\n",
+ table_desc));
+ return_VOID;
+ }
}
- table_desc->validation_count--;
if (table_desc->validation_count == 0) {
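The reworked get/put paths above saturate validation_count at ACPI_MAX_TABLE_VALIDATIONS, so an overflowed count is frozen rather than wrapped, and each overflow or underflow is warned about only once. A minimal userspace sketch of such a saturating reference count (the limit value is made up):

#include <stdio.h>

#define MAX_VALIDATIONS 0xFFFF

struct table { unsigned int validation_count; };

static void table_get(struct table *t)
{
	if (t->validation_count < MAX_VALIDATIONS) {
		t->validation_count++;
		if (t->validation_count >= MAX_VALIDATIONS)
			fprintf(stderr, "validation count saturated\n");
	}
	/* Once saturated, the count is frozen: no further increments. */
}

static void table_put(struct table *t)
{
	if (t->validation_count < MAX_VALIDATIONS) {
		t->validation_count--;
		if (t->validation_count >= MAX_VALIDATIONS)
			fprintf(stderr, "validation count underflow\n");
	}
	/* A saturated count is never decremented either. */
}

int main(void)
{
	struct table t = { 0 };

	table_get(&t);
	table_put(&t);
	table_put(&t);   /* underflow: wraps to a huge value and warns once */
	printf("%u\n", t.validation_count);
	return 0;
}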
diff --git a/drivers/acpi/acpica/utresrc.c b/drivers/acpi/acpica/utresrc.c
index e0587c85bafd..ff096d9755b9 100644
--- a/drivers/acpi/acpica/utresrc.c
+++ b/drivers/acpi/acpica/utresrc.c
@@ -474,15 +474,6 @@ acpi_ut_walk_aml_resources(struct acpi_walk_state *walk_state,
return_ACPI_STATUS(AE_AML_NO_RESOURCE_END_TAG);
}
- /*
- * The end_tag opcode must be followed by a zero byte.
- * Although this byte is technically defined to be a checksum,
- * in practice, all ASL compilers set this byte to zero.
- */
- if (*(aml + 1) != 0) {
- return_ACPI_STATUS(AE_AML_NO_RESOURCE_END_TAG);
- }
-
/* Return the pointer to the end_tag if requested */
if (!user_function) {
diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index 919be0aa2578..240544253ccd 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -523,7 +523,7 @@ static int acpi_pci_root_add(struct acpi_device *device,
struct acpi_pci_root *root;
acpi_handle handle = device->handle;
int no_aspm = 0;
- bool hotadd = system_state != SYSTEM_BOOTING;
+ bool hotadd = system_state == SYSTEM_RUNNING;
root = kzalloc(sizeof(struct acpi_pci_root), GFP_KERNEL);
if (!root)
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 3a10d7573477..d53162997f32 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1428,6 +1428,37 @@ static void acpi_init_coherency(struct acpi_device *adev)
adev->flags.coherent_dma = cca;
}
+static int acpi_check_spi_i2c_slave(struct acpi_resource *ares, void *data)
+{
+ bool *is_spi_i2c_slave_p = data;
+
+ if (ares->type != ACPI_RESOURCE_TYPE_SERIAL_BUS)
+ return 1;
+
+ /*
+ * devices that are connected to UART still need to be enumerated to
+ * platform bus
+ */
+ if (ares->data.common_serial_bus.type != ACPI_RESOURCE_SERIAL_TYPE_UART)
+ *is_spi_i2c_slave_p = true;
+
+ /* no need to do more checking */
+ return -1;
+}
+
+static bool acpi_is_spi_i2c_slave(struct acpi_device *device)
+{
+ struct list_head resource_list;
+ bool is_spi_i2c_slave = false;
+
+ INIT_LIST_HEAD(&resource_list);
+ acpi_dev_get_resources(device, &resource_list, acpi_check_spi_i2c_slave,
+ &is_spi_i2c_slave);
+ acpi_dev_free_resource_list(&resource_list);
+
+ return is_spi_i2c_slave;
+}
+
void acpi_init_device_object(struct acpi_device *device, acpi_handle handle,
int type, unsigned long long sta)
{
@@ -1443,6 +1474,7 @@ void acpi_init_device_object(struct acpi_device *device, acpi_handle handle,
acpi_bus_get_flags(device);
device->flags.match_driver = false;
device->flags.initialized = true;
+ device->flags.spi_i2c_slave = acpi_is_spi_i2c_slave(device);
acpi_device_clear_enumerated(device);
device_initialize(&device->dev);
dev_set_uevent_suppress(&device->dev, true);
@@ -1727,38 +1759,13 @@ static acpi_status acpi_bus_check_add(acpi_handle handle, u32 lvl_not_used,
return AE_OK;
}
-static int acpi_check_spi_i2c_slave(struct acpi_resource *ares, void *data)
-{
- bool *is_spi_i2c_slave_p = data;
-
- if (ares->type != ACPI_RESOURCE_TYPE_SERIAL_BUS)
- return 1;
-
- /*
- * devices that are connected to UART still need to be enumerated to
- * platform bus
- */
- if (ares->data.common_serial_bus.type != ACPI_RESOURCE_SERIAL_TYPE_UART)
- *is_spi_i2c_slave_p = true;
-
- /* no need to do more checking */
- return -1;
-}
-
static void acpi_default_enumeration(struct acpi_device *device)
{
- struct list_head resource_list;
- bool is_spi_i2c_slave = false;
-
/*
* Do not enumerate SPI/I2C slaves as they will be enumerated by their
* respective parents.
*/
- INIT_LIST_HEAD(&resource_list);
- acpi_dev_get_resources(device, &resource_list, acpi_check_spi_i2c_slave,
- &is_spi_i2c_slave);
- acpi_dev_free_resource_list(&resource_list);
- if (!is_spi_i2c_slave) {
+ if (!device->flags.spi_i2c_slave) {
acpi_create_platform_device(device, NULL);
acpi_device_set_enumerated(device);
} else {
@@ -1854,7 +1861,7 @@ static void acpi_bus_attach(struct acpi_device *device)
return;
device->flags.match_driver = true;
- if (ret > 0) {
+ if (ret > 0 && !device->flags.spi_i2c_slave) {
acpi_device_set_enumerated(device);
goto ok;
}
@@ -1863,10 +1870,10 @@ static void acpi_bus_attach(struct acpi_device *device)
if (ret < 0)
return;
- if (device->pnp.type.platform_id)
- acpi_default_enumeration(device);
- else
+ if (!device->pnp.type.platform_id && !device->flags.spi_i2c_slave)
acpi_device_set_enumerated(device);
+ else
+ acpi_default_enumeration(device);
ok:
list_for_each_entry(child, &device->children, node)
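acpi_is_spi_i2c_slave() above walks the device's resource list with a callback that reports its verdict through a pointer argument and returns a negative value to stop the walk at the first serial-bus resource. A self-contained sketch of that walker/callback pattern (all types and names hypothetical):

#include <stdio.h>
#include <stdbool.h>

enum res_type { RES_IRQ, RES_SERIAL_BUS };
enum serial_type { SERIAL_I2C, SERIAL_SPI, SERIAL_UART };

struct resource { enum res_type type; enum serial_type serial; };

/* Returns a negative value to stop the walk early, non-negative to continue. */
typedef int (*res_cb)(const struct resource *res, void *data);

static void walk_resources(const struct resource *res, int n, res_cb cb, void *data)
{
	for (int i = 0; i < n; i++)
		if (cb(&res[i], data) < 0)
			break;
}

static int check_serial_slave(const struct resource *res, void *data)
{
	bool *is_slave = data;

	if (res->type != RES_SERIAL_BUS)
		return 1;               /* not interesting, keep walking */
	if (res->serial != SERIAL_UART)
		*is_slave = true;       /* UART-connected devices are not slaves */
	return -1;                      /* first serial-bus resource decides */
}

int main(void)
{
	struct resource res[] = { { RES_IRQ, 0 }, { RES_SERIAL_BUS, SERIAL_I2C } };
	bool is_slave = false;

	walk_resources(res, 2, check_serial_slave, &is_slave);
	printf("spi/i2c slave: %s\n", is_slave ? "yes" : "no");
	return 0;
}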
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 5548f9686016..0440d95c9b5b 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -377,7 +377,7 @@ static int __ref get_nid_for_pfn(unsigned long pfn)
if (!pfn_valid_within(pfn))
return -1;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
- if (system_state == SYSTEM_BOOTING)
+ if (system_state < SYSTEM_RUNNING)
return early_pfn_to_nid(pfn);
#endif
page = pfn_to_page(pfn);
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 02a611993bb4..678af946be30 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1944,6 +1944,13 @@ static void cciss_get_serial_no(ctlr_info_t *h, int logvol,
return;
}
+static void cciss_initialize_rq(struct request *rq)
+{
+ struct scsi_request *sreq = blk_mq_rq_to_pdu(rq);
+
+ scsi_req_init(sreq);
+}
+
/*
* cciss_add_disk sets up the block device queue for a logical drive
*/
@@ -1956,6 +1963,7 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
disk->queue->cmd_size = sizeof(struct scsi_request);
disk->queue->request_fn = do_cciss_request;
+ disk->queue->initialize_rq_fn = cciss_initialize_rq;
disk->queue->queue_lock = &h->lock;
queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, disk->queue);
if (blk_init_allocated_queue(disk->queue) < 0)
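
The cciss hunk wires up an initialize_rq_fn hook so the block core can initialize the driver's per-request scsi_request area when a request is set up. A compact user-space sketch of that callback shape, with invented names (queue, get_request, rq_to_pdu) rather than the real block-layer interfaces:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct request {
	int tag;
	unsigned char pdu[32];	/* driver-private area follows the request */
};

struct queue {
	void (*initialize_rq_fn)(struct request *rq);
};

static void *rq_to_pdu(struct request *rq)
{
	return rq->pdu;
}

/* driver hook: clear the private command block */
static void driver_initialize_rq(struct request *rq)
{
	memset(rq_to_pdu(rq), 0, sizeof(rq->pdu));
}

static struct request *get_request(struct queue *q, int tag)
{
	struct request *rq = calloc(1, sizeof(*rq));

	if (!rq)
		return NULL;
	rq->tag = tag;
	if (q->initialize_rq_fn)
		q->initialize_rq_fn(rq);	/* core calls the driver hook */
	return rq;
}

int main(void)
{
	struct queue q = { .initialize_rq_fn = driver_initialize_rq };
	struct request *rq = get_request(&q, 7);

	printf("rq tag %d, pdu initialized\n", rq ? rq->tag : -1);
	free(rq);
	return 0;
}
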
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 87717a1a5c89..4a3cfc7940de 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -174,7 +174,6 @@ static void mtip_init_cmd_header(struct request *rq)
{
struct driver_data *dd = rq->q->queuedata;
struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
- u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64;
/* Point the command headers at the command tables. */
cmd->command_header = dd->port->command_list +
@@ -182,7 +181,7 @@ static void mtip_init_cmd_header(struct request *rq)
cmd->command_header_dma = dd->port->command_list_dma +
(sizeof(struct mtip_cmd_hdr) * rq->tag);
- if (host_cap_64)
+ if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags))
cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16);
cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
@@ -386,6 +385,7 @@ static void mtip_init_port(struct mtip_port *port)
port->mmio + PORT_LST_ADDR_HI);
writel((port->rxfis_dma >> 16) >> 16,
port->mmio + PORT_FIS_ADDR_HI);
+ set_bit(MTIP_PF_HOST_CAP_64, &port->flags);
}
writel(port->command_list_dma & 0xFFFFFFFF,
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index e8286af50e16..e20e55dab443 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -140,6 +140,7 @@ enum {
(1 << MTIP_PF_SE_ACTIVE_BIT) |
(1 << MTIP_PF_DM_ACTIVE_BIT) |
(1 << MTIP_PF_TO_ACTIVE_BIT)),
+ MTIP_PF_HOST_CAP_64 = 10, /* cache HOST_CAP_64 */
MTIP_PF_SVC_THD_ACTIVE_BIT = 4,
MTIP_PF_ISSUE_CMDS_BIT = 5,
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 71f4422eba81..85c24cace973 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -844,9 +844,6 @@ static int __init null_init(void)
queue_mode = NULL_Q_MQ;
}
- if (queue_mode == NULL_Q_MQ && shared_tags)
- null_init_tag_set(&tag_set);
-
if (queue_mode == NULL_Q_MQ && use_per_node_hctx) {
if (submit_queues < nr_online_nodes) {
pr_warn("null_blk: submit_queues param is set to %u.",
@@ -858,11 +855,19 @@ static int __init null_init(void)
else if (!submit_queues)
submit_queues = 1;
+ if (queue_mode == NULL_Q_MQ && shared_tags) {
+ ret = null_init_tag_set(&tag_set);
+ if (ret)
+ return ret;
+ }
+
mutex_init(&lock);
null_major = register_blkdev(0, "nullb");
- if (null_major < 0)
- return null_major;
+ if (null_major < 0) {
+ ret = null_major;
+ goto err_tagset;
+ }
if (use_lightnvm) {
ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64),
@@ -891,6 +896,9 @@ err_dev:
kmem_cache_destroy(ppa_cache);
err_ppa:
unregister_blkdev(null_major, "nullb");
+err_tagset:
+ if (queue_mode == NULL_Q_MQ && shared_tags)
+ blk_mq_free_tag_set(&tag_set);
return ret;
}
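
The null_blk changes move tag-set setup after the early parameter checks and add an err_tagset unwind label so a later failure frees what was already allocated. A minimal stand-alone sketch of that goto-based unwind style, with dummy helpers in place of the real block-layer calls:

#include <stdio.h>
#include <stdlib.h>

static int init_tag_set(void **ts)  { *ts = malloc(16); return *ts ? 0 : -1; }
static void free_tag_set(void *ts)  { free(ts); }
static int register_blockdev(void)  { return 42; /* pretend major number */ }
static int setup_cache(void **c)    { *c = malloc(64); return *c ? 0 : -1; }

int driver_init(void)
{
	void *tag_set = NULL, *cache = NULL;
	int major, ret;

	ret = init_tag_set(&tag_set);
	if (ret)
		return ret;

	major = register_blockdev();
	if (major < 0) {
		ret = major;
		goto err_tagset;
	}

	ret = setup_cache(&cache);
	if (ret)
		goto err_blkdev;

	printf("init ok, major %d\n", major);
	return 0;

err_blkdev:
	/* unregister_blockdev(major) would go here */
err_tagset:
	free_tag_set(tag_set);
	return ret;
}

int main(void)
{
	return driver_init() ? 1 : 0;
}
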
diff --git a/drivers/bluetooth/btmrvl_main.c b/drivers/bluetooth/btmrvl_main.c
index c38cb5b91291..fe850f0567cb 100644
--- a/drivers/bluetooth/btmrvl_main.c
+++ b/drivers/bluetooth/btmrvl_main.c
@@ -602,7 +602,7 @@ static int btmrvl_service_main_thread(void *data)
struct btmrvl_thread *thread = data;
struct btmrvl_private *priv = thread->priv;
struct btmrvl_adapter *adapter = priv->adapter;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
struct sk_buff *skb;
ulong flags;
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index d165af8abe36..a5c6cfe71a8e 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -821,7 +821,7 @@ static ssize_t ipmi_read(struct file *file,
loff_t *ppos)
{
int rv = 0;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
if (count <= 0)
return 0;
diff --git a/drivers/char/random.c b/drivers/char/random.c
index e870f329db88..01a260f67437 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -803,13 +803,13 @@ static int crng_fast_load(const char *cp, size_t len)
p[crng_init_cnt % CHACHA20_KEY_SIZE] ^= *cp;
cp++; crng_init_cnt++; len--;
}
+ spin_unlock_irqrestore(&primary_crng.lock, flags);
if (crng_init_cnt >= CRNG_INIT_CNT_THRESH) {
invalidate_batched_entropy();
crng_init = 1;
wake_up_interruptible(&crng_init_wait);
pr_notice("random: fast init done\n");
}
- spin_unlock_irqrestore(&primary_crng.lock, flags);
return 1;
}
@@ -841,6 +841,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
}
memzero_explicit(&buf, sizeof(buf));
crng->init_time = jiffies;
+ spin_unlock_irqrestore(&primary_crng.lock, flags);
if (crng == &primary_crng && crng_init < 2) {
invalidate_batched_entropy();
crng_init = 2;
@@ -848,7 +849,6 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
wake_up_interruptible(&crng_init_wait);
pr_notice("random: crng init done\n");
}
- spin_unlock_irqrestore(&primary_crng.lock, flags);
}
static inline void crng_wait_ready(void)
@@ -2041,8 +2041,8 @@ static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u64);
u64 get_random_u64(void)
{
u64 ret;
- bool use_lock = crng_init < 2;
- unsigned long flags;
+ bool use_lock = READ_ONCE(crng_init) < 2;
+ unsigned long flags = 0;
struct batched_entropy *batch;
#if BITS_PER_LONG == 64
@@ -2073,8 +2073,8 @@ static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u32);
u32 get_random_u32(void)
{
u32 ret;
- bool use_lock = crng_init < 2;
- unsigned long flags;
+ bool use_lock = READ_ONCE(crng_init) < 2;
+ unsigned long flags = 0;
struct batched_entropy *batch;
if (arch_get_random_int(&ret))
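
The random.c hunks drop primary_crng.lock before calling wake_up_interruptible() and read crng_init with READ_ONCE() outside the lock. A user-space analogue of the first point, using pthreads: publish the state change under the lock, release it, then signal, so woken waiters do not immediately contend on the lock the signaler still holds. This is a sketch only, not the kernel's locking.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  init_done = PTHREAD_COND_INITIALIZER;
static int crng_ready;		/* 0 = not ready, 1 = ready */

static void *waiter(void *arg)
{
	pthread_mutex_lock(&lock);
	while (!crng_ready)
		pthread_cond_wait(&init_done, &lock);
	pthread_mutex_unlock(&lock);
	printf("waiter: generator ready\n");
	return NULL;
}

static void finish_init(void)
{
	pthread_mutex_lock(&lock);
	crng_ready = 1;				/* state change under the lock */
	pthread_mutex_unlock(&lock);		/* release first ... */
	pthread_cond_broadcast(&init_done);	/* ... then wake the waiters */
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	finish_init();
	pthread_join(t, NULL);
	return 0;
}
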
diff --git a/drivers/clk/meson/Kconfig b/drivers/clk/meson/Kconfig
index 19480bcc7046..2f29ee1a4d00 100644
--- a/drivers/clk/meson/Kconfig
+++ b/drivers/clk/meson/Kconfig
@@ -14,6 +14,7 @@ config COMMON_CLK_MESON8B
config COMMON_CLK_GXBB
bool
depends on COMMON_CLK_AMLOGIC
+ select RESET_CONTROLLER
help
Support for the clock controller on AmLogic S905 devices, aka gxbb.
Say Y if you want peripherals and CPU frequency scaling to work.
diff --git a/drivers/clk/sunxi-ng/Kconfig b/drivers/clk/sunxi-ng/Kconfig
index b0d551a8efe4..eb89c7801f00 100644
--- a/drivers/clk/sunxi-ng/Kconfig
+++ b/drivers/clk/sunxi-ng/Kconfig
@@ -156,6 +156,7 @@ config SUN8I_R_CCU
bool "Support for Allwinner SoCs' PRCM CCUs"
select SUNXI_CCU_DIV
select SUNXI_CCU_GATE
+ select SUNXI_CCU_MP
default MACH_SUN8I || (ARCH_SUNXI && ARM64)
endif
diff --git a/drivers/clk/sunxi-ng/ccu-sun50i-a64.h b/drivers/clk/sunxi-ng/ccu-sun50i-a64.h
index 9b3cd24b78d2..061b6fbb4f95 100644
--- a/drivers/clk/sunxi-ng/ccu-sun50i-a64.h
+++ b/drivers/clk/sunxi-ng/ccu-sun50i-a64.h
@@ -31,7 +31,9 @@
#define CLK_PLL_VIDEO0_2X 8
#define CLK_PLL_VE 9
#define CLK_PLL_DDR0 10
-#define CLK_PLL_PERIPH0 11
+
+/* PLL_PERIPH0 exported for PRCM */
+
#define CLK_PLL_PERIPH0_2X 12
#define CLK_PLL_PERIPH1 13
#define CLK_PLL_PERIPH1_2X 14
diff --git a/drivers/clk/sunxi-ng/ccu-sun5i.c b/drivers/clk/sunxi-ng/ccu-sun5i.c
index 5c476f966a72..5372bf8be5e6 100644
--- a/drivers/clk/sunxi-ng/ccu-sun5i.c
+++ b/drivers/clk/sunxi-ng/ccu-sun5i.c
@@ -243,7 +243,7 @@ static SUNXI_CCU_GATE(ahb_ss_clk, "ahb-ss", "ahb",
static SUNXI_CCU_GATE(ahb_dma_clk, "ahb-dma", "ahb",
0x060, BIT(6), 0);
static SUNXI_CCU_GATE(ahb_bist_clk, "ahb-bist", "ahb",
- 0x060, BIT(6), 0);
+ 0x060, BIT(7), 0);
static SUNXI_CCU_GATE(ahb_mmc0_clk, "ahb-mmc0", "ahb",
0x060, BIT(8), 0);
static SUNXI_CCU_GATE(ahb_mmc1_clk, "ahb-mmc1", "ahb",
diff --git a/drivers/clk/sunxi-ng/ccu-sun6i-a31.c b/drivers/clk/sunxi-ng/ccu-sun6i-a31.c
index 89e68d29bf45..df97e25aec76 100644
--- a/drivers/clk/sunxi-ng/ccu-sun6i-a31.c
+++ b/drivers/clk/sunxi-ng/ccu-sun6i-a31.c
@@ -556,7 +556,7 @@ static SUNXI_CCU_M_WITH_MUX_GATE(lcd0_ch1_clk, "lcd0-ch1", lcd_ch1_parents,
0x12c, 0, 4, 24, 3, BIT(31),
CLK_SET_RATE_PARENT);
static SUNXI_CCU_M_WITH_MUX_GATE(lcd1_ch1_clk, "lcd1-ch1", lcd_ch1_parents,
- 0x12c, 0, 4, 24, 3, BIT(31),
+ 0x130, 0, 4, 24, 3, BIT(31),
CLK_SET_RATE_PARENT);
static const char * const csi_sclk_parents[] = { "pll-video0", "pll-video1",
diff --git a/drivers/clk/sunxi-ng/ccu-sun8i-h3.h b/drivers/clk/sunxi-ng/ccu-sun8i-h3.h
index 85973d1e8165..1b4baea37d81 100644
--- a/drivers/clk/sunxi-ng/ccu-sun8i-h3.h
+++ b/drivers/clk/sunxi-ng/ccu-sun8i-h3.h
@@ -29,7 +29,9 @@
#define CLK_PLL_VIDEO 6
#define CLK_PLL_VE 7
#define CLK_PLL_DDR 8
-#define CLK_PLL_PERIPH0 9
+
+/* PLL_PERIPH0 exported for PRCM */
+
#define CLK_PLL_PERIPH0_2X 10
#define CLK_PLL_GPU 11
#define CLK_PLL_PERIPH1 12
diff --git a/drivers/clk/sunxi-ng/ccu-sun8i-v3s.c b/drivers/clk/sunxi-ng/ccu-sun8i-v3s.c
index e58706b40ae9..6297add857b5 100644
--- a/drivers/clk/sunxi-ng/ccu-sun8i-v3s.c
+++ b/drivers/clk/sunxi-ng/ccu-sun8i-v3s.c
@@ -537,7 +537,7 @@ static struct ccu_reset_map sun8i_v3s_ccu_resets[] = {
[RST_BUS_EMAC] = { 0x2c0, BIT(17) },
[RST_BUS_HSTIMER] = { 0x2c0, BIT(19) },
[RST_BUS_SPI0] = { 0x2c0, BIT(20) },
- [RST_BUS_OTG] = { 0x2c0, BIT(23) },
+ [RST_BUS_OTG] = { 0x2c0, BIT(24) },
[RST_BUS_EHCI0] = { 0x2c0, BIT(26) },
[RST_BUS_OHCI0] = { 0x2c0, BIT(29) },
diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index 4bed671e490e..8b5c30062d99 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -1209,9 +1209,9 @@ arch_timer_mem_frame_get_cntfrq(struct arch_timer_mem_frame *frame)
return 0;
}
- rate = readl_relaxed(frame + CNTFRQ);
+ rate = readl_relaxed(base + CNTFRQ);
- iounmap(frame);
+ iounmap(base);
return rate;
}
diff --git a/drivers/clocksource/cadence_ttc_timer.c b/drivers/clocksource/cadence_ttc_timer.c
index 44e5e951583b..8e64b8460f11 100644
--- a/drivers/clocksource/cadence_ttc_timer.c
+++ b/drivers/clocksource/cadence_ttc_timer.c
@@ -18,6 +18,7 @@
#include <linux/clk.h>
#include <linux/interrupt.h>
#include <linux/clockchips.h>
+#include <linux/clocksource.h>
#include <linux/of_address.h>
#include <linux/of_irq.h>
#include <linux/slab.h>
diff --git a/drivers/clocksource/timer-sun5i.c b/drivers/clocksource/timer-sun5i.c
index 2e9c830ae1cd..c4656c4d44a6 100644
--- a/drivers/clocksource/timer-sun5i.c
+++ b/drivers/clocksource/timer-sun5i.c
@@ -12,6 +12,7 @@
#include <linux/clk.h>
#include <linux/clockchips.h>
+#include <linux/clocksource.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 992f7c20760f..88220ff3e1c2 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -185,8 +185,8 @@ static ssize_t store_down_threshold(struct gov_attr_set *attr_set,
int ret;
ret = sscanf(buf, "%u", &input);
- /* cannot be lower than 11 otherwise freq will not fall */
- if (ret != 1 || input < 11 || input > 100 ||
+ /* cannot be lower than 1 otherwise freq will not fall */
+ if (ret != 1 || input < 1 || input > 100 ||
input >= dbs_data->up_threshold)
return -EINVAL;
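
The conservative-governor change lowers the minimum accepted down_threshold from 11 to 1 while keeping it strictly below up_threshold. A small stand-alone version of that store-side validation, with fixed globals standing in for the governor's tunables:

#include <stdio.h>

static unsigned int up_threshold = 80;
static unsigned int down_threshold = 20;

static int store_down_threshold(const char *buf)
{
	unsigned int input;
	int ret = sscanf(buf, "%u", &input);

	/* cannot be lower than 1, otherwise the frequency never falls */
	if (ret != 1 || input < 1 || input > 100 || input >= up_threshold)
		return -1;	/* -EINVAL in the kernel */

	down_threshold = input;
	return 0;
}

int main(void)
{
	printf("\"5\"  -> %d\n", store_down_threshold("5"));   /* accepted */
	printf("\"0\"  -> %d\n", store_down_threshold("0"));   /* rejected */
	printf("\"90\" -> %d\n", store_down_threshold("90"));  /* >= up_threshold */
	printf("down_threshold = %u\n", down_threshold);
	return 0;
}
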
diff --git a/drivers/cpufreq/pasemi-cpufreq.c b/drivers/cpufreq/pasemi-cpufreq.c
index 35dd4d7ffee0..b257fc7d5204 100644
--- a/drivers/cpufreq/pasemi-cpufreq.c
+++ b/drivers/cpufreq/pasemi-cpufreq.c
@@ -226,7 +226,7 @@ static int pas_cpufreq_cpu_exit(struct cpufreq_policy *policy)
* We don't support CPU hotplug. Don't unmap after the system
* has already made it to a running state.
*/
- if (system_state != SYSTEM_BOOTING)
+ if (system_state >= SYSTEM_RUNNING)
return 0;
if (sdcasr_mapbase)
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 2706be7ed334..60bb64f4329d 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -220,6 +220,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
entered_state = target_state->enter(dev, drv, index);
start_critical_timings();
+ sched_clock_idle_wakeup_event();
time_end = ns_to_ktime(local_clock());
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
diff --git a/drivers/cpuidle/dt_idle_states.c b/drivers/cpuidle/dt_idle_states.c
index ffca4fc0061d..ae8eb0359889 100644
--- a/drivers/cpuidle/dt_idle_states.c
+++ b/drivers/cpuidle/dt_idle_states.c
@@ -180,8 +180,10 @@ int dt_init_idle_driver(struct cpuidle_driver *drv,
if (!state_node)
break;
- if (!of_device_is_available(state_node))
+ if (!of_device_is_available(state_node)) {
+ of_node_put(state_node);
continue;
+ }
if (!idle_state_valid(state_node, i, cpumask)) {
pr_warn("%s idle state not valid, bailing out\n",
diff --git a/drivers/devfreq/event/exynos-nocp.c b/drivers/devfreq/event/exynos-nocp.c
index 5c3e7b11e8a6..f6e7956fc91a 100644
--- a/drivers/devfreq/event/exynos-nocp.c
+++ b/drivers/devfreq/event/exynos-nocp.c
@@ -267,7 +267,11 @@ static int exynos_nocp_probe(struct platform_device *pdev)
}
platform_set_drvdata(pdev, nocp);
- clk_prepare_enable(nocp->clk);
+ ret = clk_prepare_enable(nocp->clk);
+ if (ret) {
+ dev_err(&pdev->dev, "failed to prepare ppmu clock\n");
+ return ret;
+ }
pr_info("exynos-nocp: new NoC Probe device registered: %s\n",
dev_name(dev));
diff --git a/drivers/devfreq/event/exynos-ppmu.c b/drivers/devfreq/event/exynos-ppmu.c
index 9b7350935b73..d96e3dc71cf8 100644
--- a/drivers/devfreq/event/exynos-ppmu.c
+++ b/drivers/devfreq/event/exynos-ppmu.c
@@ -44,7 +44,7 @@ struct exynos_ppmu {
{ "ppmu-event2-"#name, PPMU_PMNCNT2 }, \
{ "ppmu-event3-"#name, PPMU_PMNCNT3 }
-struct __exynos_ppmu_events {
+static struct __exynos_ppmu_events {
char *name;
int id;
} ppmu_events[] = {
@@ -648,7 +648,11 @@ static int exynos_ppmu_probe(struct platform_device *pdev)
dev_name(&pdev->dev), desc[i].name);
}
- clk_prepare_enable(info->ppmu.clk);
+ ret = clk_prepare_enable(info->ppmu.clk);
+ if (ret) {
+ dev_err(&pdev->dev, "failed to prepare ppmu clock\n");
+ return ret;
+ }
return 0;
}
diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c
index 7717b094fabb..db75d4b614f7 100644
--- a/drivers/edac/altera_edac.c
+++ b/drivers/edac/altera_edac.c
@@ -214,24 +214,16 @@ static void altr_sdr_mc_create_debugfs_nodes(struct mem_ctl_info *mci)
static unsigned long get_total_mem(void)
{
struct device_node *np = NULL;
- const unsigned int *reg, *reg_end;
- int len, sw, aw;
- unsigned long start, size, total_mem = 0;
+ struct resource res;
+ int ret;
+ unsigned long total_mem = 0;
for_each_node_by_type(np, "memory") {
- aw = of_n_addr_cells(np);
- sw = of_n_size_cells(np);
- reg = (const unsigned int *)of_get_property(np, "reg", &len);
- reg_end = reg + (len / sizeof(u32));
-
- total_mem = 0;
- do {
- start = of_read_number(reg, aw);
- reg += aw;
- size = of_read_number(reg, sw);
- reg += sw;
- total_mem += size;
- } while (reg < reg_end);
+ ret = of_address_to_resource(np, 0, &res);
+ if (ret)
+ continue;
+
+ total_mem += resource_size(&res);
}
edac_dbg(0, "total_mem 0x%lx\n", total_mem);
return total_mem;
@@ -1839,7 +1831,7 @@ static int a10_eccmgr_irqdomain_map(struct irq_domain *d, unsigned int irq,
return 0;
}
-static struct irq_domain_ops a10_eccmgr_ic_ops = {
+static const struct irq_domain_ops a10_eccmgr_ic_ops = {
.map = a10_eccmgr_irqdomain_map,
.xlate = irq_domain_xlate_twocell,
};
diff --git a/drivers/edac/i5000_edac.c b/drivers/edac/i5000_edac.c
index f683919981b0..8f5a56e25bd2 100644
--- a/drivers/edac/i5000_edac.c
+++ b/drivers/edac/i5000_edac.c
@@ -227,7 +227,7 @@
#define NREC_RDWR(x) (((x)>>11) & 1)
#define NREC_RANK(x) (((x)>>8) & 0x7)
#define NRECMEMB 0xC0
-#define NREC_CAS(x) (((x)>>16) & 0xFFFFFF)
+#define NREC_CAS(x) (((x)>>16) & 0xFFF)
#define NREC_RAS(x) ((x) & 0x7FFF)
#define NRECFGLOG 0xC4
#define NREEECFBDA 0xC8
@@ -371,7 +371,7 @@ struct i5000_error_info {
/* These registers are input ONLY if there was a
* Non-Recoverable Error */
u16 nrecmema; /* Non-Recoverable Mem log A */
- u16 nrecmemb; /* Non-Recoverable Mem log B */
+ u32 nrecmemb; /* Non-Recoverable Mem log B */
};
@@ -407,7 +407,7 @@ static void i5000_get_error_info(struct mem_ctl_info *mci,
NERR_FAT_FBD, &info->nerr_fat_fbd);
pci_read_config_word(pvt->branchmap_werrors,
NRECMEMA, &info->nrecmema);
- pci_read_config_word(pvt->branchmap_werrors,
+ pci_read_config_dword(pvt->branchmap_werrors,
NRECMEMB, &info->nrecmemb);
/* Clear the error bits, by writing them back */
diff --git a/drivers/edac/i5400_edac.c b/drivers/edac/i5400_edac.c
index 37a9ba71da44..cd889edc8516 100644
--- a/drivers/edac/i5400_edac.c
+++ b/drivers/edac/i5400_edac.c
@@ -368,7 +368,7 @@ struct i5400_error_info {
/* These registers are input ONLY if there was a Non-Rec Error */
u16 nrecmema; /* Non-Recoverable Mem log A */
- u16 nrecmemb; /* Non-Recoverable Mem log B */
+ u32 nrecmemb; /* Non-Recoverable Mem log B */
};
@@ -458,7 +458,7 @@ static void i5400_get_error_info(struct mem_ctl_info *mci,
NERR_FAT_FBD, &info->nerr_fat_fbd);
pci_read_config_word(pvt->branchmap_werrors,
NRECMEMA, &info->nrecmema);
- pci_read_config_word(pvt->branchmap_werrors,
+ pci_read_config_dword(pvt->branchmap_werrors,
NRECMEMB, &info->nrecmemb);
/* Clear the error bits, by writing them back */
diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c
index 2733fb5938a4..4260579e6901 100644
--- a/drivers/edac/ie31200_edac.c
+++ b/drivers/edac/ie31200_edac.c
@@ -18,10 +18,12 @@
* 0c04: Xeon E3-1200 v3/4th Gen Core Processor DRAM Controller
* 0c08: Xeon E3-1200 v3 Processor DRAM Controller
* 1918: Xeon E3-1200 v5 Skylake Host Bridge/DRAM Registers
+ * 5918: Xeon E3-1200 v6/7th Gen Core Processor Host Bridge/DRAM Registers
*
* Based on Intel specification:
* http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/xeon-e3-1200v3-vol-2-datasheet.pdf
* http://www.intel.com/content/www/us/en/processors/xeon/xeon-e3-1200-family-vol-2-datasheet.html
+ * http://www.intel.com/content/www/us/en/processors/core/7th-gen-core-family-mobile-h-processor-lines-datasheet-vol-2.html
*
* According to the above datasheet (p.16):
* "
@@ -57,6 +59,7 @@
#define PCI_DEVICE_ID_INTEL_IE31200_HB_6 0x0c04
#define PCI_DEVICE_ID_INTEL_IE31200_HB_7 0x0c08
#define PCI_DEVICE_ID_INTEL_IE31200_HB_8 0x1918
+#define PCI_DEVICE_ID_INTEL_IE31200_HB_9 0x5918
#define IE31200_DIMMS 4
#define IE31200_RANKS 8
@@ -376,7 +379,12 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx)
void __iomem *window;
struct ie31200_priv *priv;
u32 addr_decode, mad_offset;
- bool skl = (pdev->device == PCI_DEVICE_ID_INTEL_IE31200_HB_8);
+
+ /*
+ * Kaby Lake seems to work like Skylake. Please re-visit this logic
+ * when adding new CPU support.
+ */
+ bool skl = (pdev->device >= PCI_DEVICE_ID_INTEL_IE31200_HB_8);
edac_dbg(0, "MC:\n");
@@ -560,6 +568,9 @@ static const struct pci_device_id ie31200_pci_tbl[] = {
PCI_VEND_DEV(INTEL, IE31200_HB_8), PCI_ANY_ID, PCI_ANY_ID, 0, 0,
IE31200},
{
+ PCI_VEND_DEV(INTEL, IE31200_HB_9), PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+ IE31200},
+ {
0,
} /* 0 terminated list. */
};
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index ba35b7ea3686..9a2658a256a9 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -161,7 +161,7 @@ static const char * const smca_ls_mce_desc[] = {
"Sys Read data error thread 0",
"Sys read data error thread 1",
"DC tag error type 2",
- "DC data error type 1 (poison comsumption)",
+ "DC data error type 1 (poison consumption)",
"DC data error type 2",
"DC data error type 3",
"DC tag error type 4",
diff --git a/drivers/edac/mv64x60_edac.c b/drivers/edac/mv64x60_edac.c
index 14b7e7b71eaa..d3650df94fe8 100644
--- a/drivers/edac/mv64x60_edac.c
+++ b/drivers/edac/mv64x60_edac.c
@@ -32,21 +32,21 @@ static void mv64x60_pci_check(struct edac_pci_ctl_info *pci)
struct mv64x60_pci_pdata *pdata = pci->pvt_info;
u32 cause;
- cause = in_le32(pdata->pci_vbase + MV64X60_PCI_ERROR_CAUSE);
+ cause = readl(pdata->pci_vbase + MV64X60_PCI_ERROR_CAUSE);
if (!cause)
return;
printk(KERN_ERR "Error in PCI %d Interface\n", pdata->pci_hose);
printk(KERN_ERR "Cause register: 0x%08x\n", cause);
printk(KERN_ERR "Address Low: 0x%08x\n",
- in_le32(pdata->pci_vbase + MV64X60_PCI_ERROR_ADDR_LO));
+ readl(pdata->pci_vbase + MV64X60_PCI_ERROR_ADDR_LO));
printk(KERN_ERR "Address High: 0x%08x\n",
- in_le32(pdata->pci_vbase + MV64X60_PCI_ERROR_ADDR_HI));
+ readl(pdata->pci_vbase + MV64X60_PCI_ERROR_ADDR_HI));
printk(KERN_ERR "Attribute: 0x%08x\n",
- in_le32(pdata->pci_vbase + MV64X60_PCI_ERROR_ATTR));
+ readl(pdata->pci_vbase + MV64X60_PCI_ERROR_ATTR));
printk(KERN_ERR "Command: 0x%08x\n",
- in_le32(pdata->pci_vbase + MV64X60_PCI_ERROR_CMD));
- out_le32(pdata->pci_vbase + MV64X60_PCI_ERROR_CAUSE, ~cause);
+ readl(pdata->pci_vbase + MV64X60_PCI_ERROR_CMD));
+ writel(~cause, pdata->pci_vbase + MV64X60_PCI_ERROR_CAUSE);
if (cause & MV64X60_PCI_PE_MASK)
edac_pci_handle_pe(pci, pci->ctl_name);
@@ -61,7 +61,7 @@ static irqreturn_t mv64x60_pci_isr(int irq, void *dev_id)
struct mv64x60_pci_pdata *pdata = pci->pvt_info;
u32 val;
- val = in_le32(pdata->pci_vbase + MV64X60_PCI_ERROR_CAUSE);
+ val = readl(pdata->pci_vbase + MV64X60_PCI_ERROR_CAUSE);
if (!val)
return IRQ_NONE;
@@ -93,7 +93,7 @@ static int __init mv64x60_pci_fixup(struct platform_device *pdev)
if (!pci_serr)
return -ENOMEM;
- out_le32(pci_serr, in_le32(pci_serr) & ~0x1);
+ writel(readl(pci_serr) & ~0x1, pci_serr);
iounmap(pci_serr);
return 0;
@@ -116,7 +116,7 @@ static int mv64x60_pci_err_probe(struct platform_device *pdev)
pdata = pci->pvt_info;
pdata->pci_hose = pdev->id;
- pdata->name = "mpc85xx_pci_err";
+ pdata->name = "mv64x60_pci_err";
platform_set_drvdata(pdev, pci);
pci->dev = &pdev->dev;
pci->dev_name = dev_name(&pdev->dev);
@@ -161,10 +161,10 @@ static int mv64x60_pci_err_probe(struct platform_device *pdev)
goto err;
}
- out_le32(pdata->pci_vbase + MV64X60_PCI_ERROR_CAUSE, 0);
- out_le32(pdata->pci_vbase + MV64X60_PCI_ERROR_MASK, 0);
- out_le32(pdata->pci_vbase + MV64X60_PCI_ERROR_MASK,
- MV64X60_PCIx_ERR_MASK_VAL);
+ writel(0, pdata->pci_vbase + MV64X60_PCI_ERROR_CAUSE);
+ writel(0, pdata->pci_vbase + MV64X60_PCI_ERROR_MASK);
+ writel(MV64X60_PCIx_ERR_MASK_VAL,
+ pdata->pci_vbase + MV64X60_PCI_ERROR_MASK);
if (edac_pci_add_device(pci, pdata->edac_idx) > 0) {
edac_dbg(3, "failed edac_pci_add_device()\n");
@@ -233,23 +233,23 @@ static void mv64x60_sram_check(struct edac_device_ctl_info *edac_dev)
struct mv64x60_sram_pdata *pdata = edac_dev->pvt_info;
u32 cause;
- cause = in_le32(pdata->sram_vbase + MV64X60_SRAM_ERR_CAUSE);
+ cause = readl(pdata->sram_vbase + MV64X60_SRAM_ERR_CAUSE);
if (!cause)
return;
printk(KERN_ERR "Error in internal SRAM\n");
printk(KERN_ERR "Cause register: 0x%08x\n", cause);
printk(KERN_ERR "Address Low: 0x%08x\n",
- in_le32(pdata->sram_vbase + MV64X60_SRAM_ERR_ADDR_LO));
+ readl(pdata->sram_vbase + MV64X60_SRAM_ERR_ADDR_LO));
printk(KERN_ERR "Address High: 0x%08x\n",
- in_le32(pdata->sram_vbase + MV64X60_SRAM_ERR_ADDR_HI));
+ readl(pdata->sram_vbase + MV64X60_SRAM_ERR_ADDR_HI));
printk(KERN_ERR "Data Low: 0x%08x\n",
- in_le32(pdata->sram_vbase + MV64X60_SRAM_ERR_DATA_LO));
+ readl(pdata->sram_vbase + MV64X60_SRAM_ERR_DATA_LO));
printk(KERN_ERR "Data High: 0x%08x\n",
- in_le32(pdata->sram_vbase + MV64X60_SRAM_ERR_DATA_HI));
+ readl(pdata->sram_vbase + MV64X60_SRAM_ERR_DATA_HI));
printk(KERN_ERR "Parity: 0x%08x\n",
- in_le32(pdata->sram_vbase + MV64X60_SRAM_ERR_PARITY));
- out_le32(pdata->sram_vbase + MV64X60_SRAM_ERR_CAUSE, 0);
+ readl(pdata->sram_vbase + MV64X60_SRAM_ERR_PARITY));
+ writel(0, pdata->sram_vbase + MV64X60_SRAM_ERR_CAUSE);
edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
}
@@ -260,7 +260,7 @@ static irqreturn_t mv64x60_sram_isr(int irq, void *dev_id)
struct mv64x60_sram_pdata *pdata = edac_dev->pvt_info;
u32 cause;
- cause = in_le32(pdata->sram_vbase + MV64X60_SRAM_ERR_CAUSE);
+ cause = readl(pdata->sram_vbase + MV64X60_SRAM_ERR_CAUSE);
if (!cause)
return IRQ_NONE;
@@ -322,7 +322,7 @@ static int mv64x60_sram_err_probe(struct platform_device *pdev)
}
/* setup SRAM err registers */
- out_le32(pdata->sram_vbase + MV64X60_SRAM_ERR_CAUSE, 0);
+ writel(0, pdata->sram_vbase + MV64X60_SRAM_ERR_CAUSE);
edac_dev->mod_name = EDAC_MOD_STR;
edac_dev->ctl_name = pdata->name;
@@ -398,7 +398,7 @@ static void mv64x60_cpu_check(struct edac_device_ctl_info *edac_dev)
struct mv64x60_cpu_pdata *pdata = edac_dev->pvt_info;
u32 cause;
- cause = in_le32(pdata->cpu_vbase[1] + MV64x60_CPU_ERR_CAUSE) &
+ cause = readl(pdata->cpu_vbase[1] + MV64x60_CPU_ERR_CAUSE) &
MV64x60_CPU_CAUSE_MASK;
if (!cause)
return;
@@ -406,16 +406,16 @@ static void mv64x60_cpu_check(struct edac_device_ctl_info *edac_dev)
printk(KERN_ERR "Error on CPU interface\n");
printk(KERN_ERR "Cause register: 0x%08x\n", cause);
printk(KERN_ERR "Address Low: 0x%08x\n",
- in_le32(pdata->cpu_vbase[0] + MV64x60_CPU_ERR_ADDR_LO));
+ readl(pdata->cpu_vbase[0] + MV64x60_CPU_ERR_ADDR_LO));
printk(KERN_ERR "Address High: 0x%08x\n",
- in_le32(pdata->cpu_vbase[0] + MV64x60_CPU_ERR_ADDR_HI));
+ readl(pdata->cpu_vbase[0] + MV64x60_CPU_ERR_ADDR_HI));
printk(KERN_ERR "Data Low: 0x%08x\n",
- in_le32(pdata->cpu_vbase[1] + MV64x60_CPU_ERR_DATA_LO));
+ readl(pdata->cpu_vbase[1] + MV64x60_CPU_ERR_DATA_LO));
printk(KERN_ERR "Data High: 0x%08x\n",
- in_le32(pdata->cpu_vbase[1] + MV64x60_CPU_ERR_DATA_HI));
+ readl(pdata->cpu_vbase[1] + MV64x60_CPU_ERR_DATA_HI));
printk(KERN_ERR "Parity: 0x%08x\n",
- in_le32(pdata->cpu_vbase[1] + MV64x60_CPU_ERR_PARITY));
- out_le32(pdata->cpu_vbase[1] + MV64x60_CPU_ERR_CAUSE, 0);
+ readl(pdata->cpu_vbase[1] + MV64x60_CPU_ERR_PARITY));
+ writel(0, pdata->cpu_vbase[1] + MV64x60_CPU_ERR_CAUSE);
edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
}
@@ -426,7 +426,7 @@ static irqreturn_t mv64x60_cpu_isr(int irq, void *dev_id)
struct mv64x60_cpu_pdata *pdata = edac_dev->pvt_info;
u32 cause;
- cause = in_le32(pdata->cpu_vbase[1] + MV64x60_CPU_ERR_CAUSE) &
+ cause = readl(pdata->cpu_vbase[1] + MV64x60_CPU_ERR_CAUSE) &
MV64x60_CPU_CAUSE_MASK;
if (!cause)
return IRQ_NONE;
@@ -515,9 +515,9 @@ static int mv64x60_cpu_err_probe(struct platform_device *pdev)
}
/* setup CPU err registers */
- out_le32(pdata->cpu_vbase[1] + MV64x60_CPU_ERR_CAUSE, 0);
- out_le32(pdata->cpu_vbase[1] + MV64x60_CPU_ERR_MASK, 0);
- out_le32(pdata->cpu_vbase[1] + MV64x60_CPU_ERR_MASK, 0x000000ff);
+ writel(0, pdata->cpu_vbase[1] + MV64x60_CPU_ERR_CAUSE);
+ writel(0, pdata->cpu_vbase[1] + MV64x60_CPU_ERR_MASK);
+ writel(0x000000ff, pdata->cpu_vbase[1] + MV64x60_CPU_ERR_MASK);
edac_dev->mod_name = EDAC_MOD_STR;
edac_dev->ctl_name = pdata->name;
@@ -596,13 +596,13 @@ static void mv64x60_mc_check(struct mem_ctl_info *mci)
u32 comp_ecc;
u32 syndrome;
- reg = in_le32(pdata->mc_vbase + MV64X60_SDRAM_ERR_ADDR);
+ reg = readl(pdata->mc_vbase + MV64X60_SDRAM_ERR_ADDR);
if (!reg)
return;
err_addr = reg & ~0x3;
- sdram_ecc = in_le32(pdata->mc_vbase + MV64X60_SDRAM_ERR_ECC_RCVD);
- comp_ecc = in_le32(pdata->mc_vbase + MV64X60_SDRAM_ERR_ECC_CALC);
+ sdram_ecc = readl(pdata->mc_vbase + MV64X60_SDRAM_ERR_ECC_RCVD);
+ comp_ecc = readl(pdata->mc_vbase + MV64X60_SDRAM_ERR_ECC_CALC);
syndrome = sdram_ecc ^ comp_ecc;
/* first bit clear in ECC Err Reg, 1 bit error, correctable by HW */
@@ -620,7 +620,7 @@ static void mv64x60_mc_check(struct mem_ctl_info *mci)
mci->ctl_name, "");
/* clear the error */
- out_le32(pdata->mc_vbase + MV64X60_SDRAM_ERR_ADDR, 0);
+ writel(0, pdata->mc_vbase + MV64X60_SDRAM_ERR_ADDR);
}
static irqreturn_t mv64x60_mc_isr(int irq, void *dev_id)
@@ -629,7 +629,7 @@ static irqreturn_t mv64x60_mc_isr(int irq, void *dev_id)
struct mv64x60_mc_pdata *pdata = mci->pvt_info;
u32 reg;
- reg = in_le32(pdata->mc_vbase + MV64X60_SDRAM_ERR_ADDR);
+ reg = readl(pdata->mc_vbase + MV64X60_SDRAM_ERR_ADDR);
if (!reg)
return IRQ_NONE;
@@ -664,7 +664,7 @@ static void mv64x60_init_csrows(struct mem_ctl_info *mci,
get_total_mem(pdata);
- ctl = in_le32(pdata->mc_vbase + MV64X60_SDRAM_CONFIG);
+ ctl = readl(pdata->mc_vbase + MV64X60_SDRAM_CONFIG);
csrow = mci->csrows[0];
dimm = csrow->channels[0]->dimm;
@@ -753,7 +753,7 @@ static int mv64x60_mc_err_probe(struct platform_device *pdev)
goto err;
}
- ctl = in_le32(pdata->mc_vbase + MV64X60_SDRAM_CONFIG);
+ ctl = readl(pdata->mc_vbase + MV64X60_SDRAM_CONFIG);
if (!(ctl & MV64X60_SDRAM_ECC)) {
/* Non-ECC RAM? */
printk(KERN_WARNING "%s: No ECC DIMMs discovered\n", __func__);
@@ -779,10 +779,10 @@ static int mv64x60_mc_err_probe(struct platform_device *pdev)
mv64x60_init_csrows(mci, pdata);
/* setup MC registers */
- out_le32(pdata->mc_vbase + MV64X60_SDRAM_ERR_ADDR, 0);
- ctl = in_le32(pdata->mc_vbase + MV64X60_SDRAM_ERR_ECC_CNTL);
+ writel(0, pdata->mc_vbase + MV64X60_SDRAM_ERR_ADDR);
+ ctl = readl(pdata->mc_vbase + MV64X60_SDRAM_ERR_ECC_CNTL);
ctl = (ctl & 0xff00ffff) | 0x10000;
- out_le32(pdata->mc_vbase + MV64X60_SDRAM_ERR_ECC_CNTL, ctl);
+ writel(ctl, pdata->mc_vbase + MV64X60_SDRAM_ERR_ECC_CNTL);
res = edac_mc_add_mc(mci);
if (res) {
@@ -853,10 +853,10 @@ static struct platform_driver * const drivers[] = {
static int __init mv64x60_edac_init(void)
{
- int ret = 0;
printk(KERN_INFO "Marvell MV64x60 EDAC driver " MV64x60_REVISION "\n");
printk(KERN_INFO "\t(C) 2006-2007 MontaVista Software\n");
+
/* make sure error reporting method is sane */
switch (edac_op_state) {
case EDAC_OPSTATE_POLL:
diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c
index 1cad5a9af8d0..8e599490f6de 100644
--- a/drivers/edac/pnd2_edac.c
+++ b/drivers/edac/pnd2_edac.c
@@ -131,7 +131,7 @@ static struct mem_ctl_info *pnd2_mci;
#ifdef CONFIG_X86_INTEL_SBI_APL
#include "linux/platform_data/sbi_apl.h"
-int sbi_send(int port, int off, int op, u32 *data)
+static int sbi_send(int port, int off, int op, u32 *data)
{
struct sbi_apl_message sbi_arg;
int ret, read = 0;
@@ -160,7 +160,7 @@ int sbi_send(int port, int off, int op, u32 *data)
return ret;
}
#else
-int sbi_send(int port, int off, int op, u32 *data)
+static int sbi_send(int port, int off, int op, u32 *data)
{
return -EUNATCH;
}
@@ -168,14 +168,15 @@ int sbi_send(int port, int off, int op, u32 *data)
static int apl_rd_reg(int port, int off, int op, void *data, size_t sz, char *name)
{
- int ret = 0;
+ int ret = 0;
edac_dbg(2, "Read %s port=%x off=%x op=%x\n", name, port, off, op);
switch (sz) {
case 8:
ret = sbi_send(port, off + 4, op, (u32 *)(data + 4));
+ /* fall through */
case 4:
- ret = sbi_send(port, off, op, (u32 *)data);
+ ret |= sbi_send(port, off, op, (u32 *)data);
pnd2_printk(KERN_DEBUG, "%s=%x%08x ret=%d\n", name,
sz == 8 ? *((u32 *)(data + 4)) : 0, *((u32 *)data), ret);
break;
@@ -423,16 +424,21 @@ static void dnv_mk_region(char *name, struct region *rp, void *asym)
static int apl_get_registers(void)
{
+ int ret = -ENODEV;
int i;
if (RD_REG(&asym_2way, b_cr_asym_2way_mem_region_mchbar))
return -ENODEV;
+ /*
+ * RD_REGP() will fail for unpopulated or non-existent
+ * DIMM slots. Return success if we find at least one DIMM.
+ */
for (i = 0; i < APL_NUM_CHANNELS; i++)
- if (RD_REGP(&drp0[i], d_cr_drp0, apl_dports[i]))
- return -ENODEV;
+ if (!RD_REGP(&drp0[i], d_cr_drp0, apl_dports[i]))
+ ret = 0;
- return 0;
+ return ret;
}
static int dnv_get_registers(void)
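
The apl_rd_reg() hunk marks the switch fall-through explicitly and ORs both sbi_send() results so a failure reading either 32-bit half is reported. A stand-alone rendering of that control flow; read_word() is an invented stand-in, not a kernel call:

#include <stdio.h>

static int read_word(int off, unsigned int *out)
{
	*out = 0x11110000u + (unsigned int)off;	/* pretend hardware read */
	return 0;				/* nonzero would mean failure */
}

static int rd_reg(int off, unsigned int *data, size_t sz)
{
	int ret = 0;

	switch (sz) {
	case 8:
		ret = read_word(off + 4, &data[1]);
		/* fall through: still need the low 32 bits */
	case 4:
		ret |= read_word(off, &data[0]);
		break;
	default:
		return -1;
	}
	return ret;
}

int main(void)
{
	unsigned int reg[2] = { 0, 0 };

	if (rd_reg(0x40, reg, 8) == 0)
		printf("read %x%08x\n", reg[1], reg[0]);
	return 0;
}
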
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index ea21cb651b3c..80d860cb0746 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -35,7 +35,7 @@ static LIST_HEAD(sbridge_edac_list);
/*
* Alter this version for the module when modifications are made
*/
-#define SBRIDGE_REVISION " Ver: 1.1.1 "
+#define SBRIDGE_REVISION " Ver: 1.1.2 "
#define EDAC_MOD_STR "sbridge_edac"
/*
@@ -279,7 +279,7 @@ static const u32 correrrthrsld[] = {
* sbridge structs
*/
-#define NUM_CHANNELS 8 /* 2MC per socket, four chan per MC */
+#define NUM_CHANNELS 4 /* Max channels per MC */
#define MAX_DIMMS 3 /* Max DIMMS per channel */
#define KNL_MAX_CHAS 38 /* KNL max num. of Cache Home Agents */
#define KNL_MAX_CHANNELS 6 /* KNL max num. of PCI channels */
@@ -294,6 +294,12 @@ enum type {
KNIGHTS_LANDING,
};
+enum domain {
+ IMC0 = 0,
+ IMC1,
+ SOCK,
+};
+
struct sbridge_pvt;
struct sbridge_info {
enum type type;
@@ -324,11 +330,14 @@ struct sbridge_channel {
struct pci_id_descr {
int dev_id;
int optional;
+ enum domain dom;
};
struct pci_id_table {
const struct pci_id_descr *descr;
- int n_devs;
+ int n_devs_per_imc;
+ int n_devs_per_sock;
+ int n_imcs_per_sock;
enum type type;
};
@@ -337,7 +346,9 @@ struct sbridge_dev {
u8 bus, mc;
u8 node_id, source_id;
struct pci_dev **pdev;
+ enum domain dom;
int n_devs;
+ int i_devs;
struct mem_ctl_info *mci;
};
@@ -352,11 +363,12 @@ struct knl_pvt {
};
struct sbridge_pvt {
- struct pci_dev *pci_ta, *pci_ddrio, *pci_ras;
+ /* Devices per socket */
+ struct pci_dev *pci_ddrio;
struct pci_dev *pci_sad0, *pci_sad1;
- struct pci_dev *pci_ha0, *pci_ha1;
struct pci_dev *pci_br0, *pci_br1;
- struct pci_dev *pci_ha1_ta;
+ /* Devices per memory controller */
+ struct pci_dev *pci_ha, *pci_ta, *pci_ras;
struct pci_dev *pci_tad[NUM_CHANNELS];
struct sbridge_dev *sbridge_dev;
@@ -373,39 +385,42 @@ struct sbridge_pvt {
struct knl_pvt knl;
};
-#define PCI_DESCR(device_id, opt) \
+#define PCI_DESCR(device_id, opt, domain) \
.dev_id = (device_id), \
- .optional = opt
+ .optional = opt, \
+ .dom = domain
static const struct pci_id_descr pci_dev_descr_sbridge[] = {
/* Processor Home Agent */
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_HA0, 0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_HA0, 0, IMC0) },
/* Memory controller */
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TA, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_RAS, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TAD0, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TAD1, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TAD2, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TAD3, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_DDRIO, 1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TA, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_RAS, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TAD0, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TAD1, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TAD2, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TAD3, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_DDRIO, 1, SOCK) },
/* System Address Decoder */
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_SAD0, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_SAD1, 0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_SAD0, 0, SOCK) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_SAD1, 0, SOCK) },
/* Broadcast Registers */
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_BR, 0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_SBRIDGE_BR, 0, SOCK) },
};
-#define PCI_ID_TABLE_ENTRY(A, T) { \
+#define PCI_ID_TABLE_ENTRY(A, N, M, T) { \
.descr = A, \
- .n_devs = ARRAY_SIZE(A), \
+ .n_devs_per_imc = N, \
+ .n_devs_per_sock = ARRAY_SIZE(A), \
+ .n_imcs_per_sock = M, \
.type = T \
}
static const struct pci_id_table pci_dev_descr_sbridge_table[] = {
- PCI_ID_TABLE_ENTRY(pci_dev_descr_sbridge, SANDY_BRIDGE),
+ PCI_ID_TABLE_ENTRY(pci_dev_descr_sbridge, ARRAY_SIZE(pci_dev_descr_sbridge), 1, SANDY_BRIDGE),
{0,} /* 0 terminated list. */
};
@@ -439,40 +454,39 @@ static const struct pci_id_table pci_dev_descr_sbridge_table[] = {
static const struct pci_id_descr pci_dev_descr_ibridge[] = {
/* Processor Home Agent */
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0, 0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0, 0, IMC0) },
/* Memory controller */
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_RAS, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD0, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD1, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD2, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD3, 0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_RAS, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD0, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD1, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD2, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD3, 0, IMC0) },
+
+ /* Optional, mode 2HA */
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1, 1, IMC1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TA, 1, IMC1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_RAS, 1, IMC1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0, 1, IMC1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD1, 1, IMC1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD2, 1, IMC1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD3, 1, IMC1) },
+
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_1HA_DDRIO0, 1, SOCK) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_2HA_DDRIO0, 1, SOCK) },
/* System Address Decoder */
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_SAD, 0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_SAD, 0, SOCK) },
/* Broadcast Registers */
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_BR0, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_BR1, 0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_BR0, 1, SOCK) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_BR1, 0, SOCK) },
- /* Optional, mode 2HA */
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1, 1) },
-#if 0
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TA, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_RAS, 1) },
-#endif
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD1, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD2, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD3, 1) },
-
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_1HA_DDRIO0, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_2HA_DDRIO0, 1) },
};
static const struct pci_id_table pci_dev_descr_ibridge_table[] = {
- PCI_ID_TABLE_ENTRY(pci_dev_descr_ibridge, IVY_BRIDGE),
+ PCI_ID_TABLE_ENTRY(pci_dev_descr_ibridge, 12, 2, IVY_BRIDGE),
{0,} /* 0 terminated list. */
};
@@ -498,9 +512,9 @@ static const struct pci_id_table pci_dev_descr_ibridge_table[] = {
#define PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0 0x2fa0
#define PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1 0x2f60
#define PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TA 0x2fa8
-#define PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_THERMAL 0x2f71
+#define PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TM 0x2f71
#define PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TA 0x2f68
-#define PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_THERMAL 0x2f79
+#define PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TM 0x2f79
#define PCI_DEVICE_ID_INTEL_HASWELL_IMC_CBO_SAD0 0x2ffc
#define PCI_DEVICE_ID_INTEL_HASWELL_IMC_CBO_SAD1 0x2ffd
#define PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD0 0x2faa
@@ -517,35 +531,33 @@ static const struct pci_id_table pci_dev_descr_ibridge_table[] = {
#define PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO3 0x2fbb
static const struct pci_id_descr pci_dev_descr_haswell[] = {
/* first item must be the HA */
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0, 0) },
-
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_CBO_SAD0, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_CBO_SAD1, 0) },
-
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1, 1) },
-
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TA, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_THERMAL, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD0, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD1, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD2, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD3, 1) },
-
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO0, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO1, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO2, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO3, 1) },
-
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TA, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_THERMAL, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD0, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD1, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD2, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD3, 1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1, 1, IMC1) },
+
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TA, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TM, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD0, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD1, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD2, 1, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD3, 1, IMC0) },
+
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TA, 1, IMC1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TM, 1, IMC1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD0, 1, IMC1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD1, 1, IMC1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD2, 1, IMC1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD3, 1, IMC1) },
+
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_CBO_SAD0, 0, SOCK) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_CBO_SAD1, 0, SOCK) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO0, 1, SOCK) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO1, 1, SOCK) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO2, 1, SOCK) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO3, 1, SOCK) },
};
static const struct pci_id_table pci_dev_descr_haswell_table[] = {
- PCI_ID_TABLE_ENTRY(pci_dev_descr_haswell, HASWELL),
+ PCI_ID_TABLE_ENTRY(pci_dev_descr_haswell, 13, 2, HASWELL),
{0,} /* 0 terminated list. */
};
@@ -559,7 +571,7 @@ static const struct pci_id_table pci_dev_descr_haswell_table[] = {
/* Memory controller, TAD tables, error injection - 2-8-0, 2-9-0 (2 of these) */
#define PCI_DEVICE_ID_INTEL_KNL_IMC_MC 0x7840
/* DRAM channel stuff; bank addrs, dimmmtr, etc.. 2-8-2 - 2-9-4 (6 of these) */
-#define PCI_DEVICE_ID_INTEL_KNL_IMC_CHANNEL 0x7843
+#define PCI_DEVICE_ID_INTEL_KNL_IMC_CHAN 0x7843
/* kdrwdbu TAD limits/offsets, MCMTR - 2-10-1, 2-11-1 (2 of these) */
#define PCI_DEVICE_ID_INTEL_KNL_IMC_TA 0x7844
/* CHA broadcast registers, dram rules - 1-29-0 (1 of these) */
@@ -579,17 +591,17 @@ static const struct pci_id_table pci_dev_descr_haswell_table[] = {
*/
static const struct pci_id_descr pci_dev_descr_knl[] = {
- [0] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_SAD0, 0) },
- [1] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_SAD1, 0) },
- [2 ... 3] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_MC, 0)},
- [4 ... 41] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_CHA, 0) },
- [42 ... 47] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_CHANNEL, 0) },
- [48] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_TA, 0) },
- [49] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_TOLHM, 0) },
+ [0 ... 1] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_MC, 0, IMC0)},
+ [2 ... 7] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_CHAN, 0, IMC0) },
+ [8] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_TA, 0, IMC0) },
+ [9] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_TOLHM, 0, IMC0) },
+ [10] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_SAD0, 0, SOCK) },
+ [11] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_SAD1, 0, SOCK) },
+ [12 ... 49] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_CHA, 0, SOCK) },
};
static const struct pci_id_table pci_dev_descr_knl_table[] = {
- PCI_ID_TABLE_ENTRY(pci_dev_descr_knl, KNIGHTS_LANDING),
+ PCI_ID_TABLE_ENTRY(pci_dev_descr_knl, ARRAY_SIZE(pci_dev_descr_knl), 1, KNIGHTS_LANDING),
{0,}
};
@@ -615,9 +627,9 @@ static const struct pci_id_table pci_dev_descr_knl_table[] = {
#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0 0x6fa0
#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1 0x6f60
#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TA 0x6fa8
-#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_THERMAL 0x6f71
+#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TM 0x6f71
#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TA 0x6f68
-#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_THERMAL 0x6f79
+#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TM 0x6f79
#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_CBO_SAD0 0x6ffc
#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_CBO_SAD1 0x6ffd
#define PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD0 0x6faa
@@ -632,32 +644,30 @@ static const struct pci_id_table pci_dev_descr_knl_table[] = {
static const struct pci_id_descr pci_dev_descr_broadwell[] = {
/* first item must be the HA */
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0, 0) },
-
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_CBO_SAD0, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_CBO_SAD1, 0) },
-
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1, 1) },
-
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TA, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_THERMAL, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD0, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD1, 0) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD2, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD3, 1) },
-
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_DDRIO0, 1) },
-
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TA, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_THERMAL, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TAD0, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TAD1, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TAD2, 1) },
- { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TAD3, 1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1, 1, IMC1) },
+
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TA, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TM, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD0, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD1, 0, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD2, 1, IMC0) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD3, 1, IMC0) },
+
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TA, 1, IMC1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TM, 1, IMC1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TAD0, 1, IMC1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TAD1, 1, IMC1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TAD2, 1, IMC1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TAD3, 1, IMC1) },
+
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_CBO_SAD0, 0, SOCK) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_CBO_SAD1, 0, SOCK) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_BROADWELL_IMC_DDRIO0, 1, SOCK) },
};
static const struct pci_id_table pci_dev_descr_broadwell_table[] = {
- PCI_ID_TABLE_ENTRY(pci_dev_descr_broadwell, BROADWELL),
+ PCI_ID_TABLE_ENTRY(pci_dev_descr_broadwell, 10, 2, BROADWELL),
{0,} /* 0 terminated list. */
};
@@ -709,7 +719,8 @@ static inline int numcol(u32 mtr)
return 1 << cols;
}
-static struct sbridge_dev *get_sbridge_dev(u8 bus, int multi_bus)
+static struct sbridge_dev *get_sbridge_dev(u8 bus, enum domain dom, int multi_bus,
+ struct sbridge_dev *prev)
{
struct sbridge_dev *sbridge_dev;
@@ -722,16 +733,19 @@ static struct sbridge_dev *get_sbridge_dev(u8 bus, int multi_bus)
struct sbridge_dev, list);
}
- list_for_each_entry(sbridge_dev, &sbridge_edac_list, list) {
- if (sbridge_dev->bus == bus)
+ sbridge_dev = list_entry(prev ? prev->list.next
+ : sbridge_edac_list.next, struct sbridge_dev, list);
+
+ list_for_each_entry_from(sbridge_dev, &sbridge_edac_list, list) {
+ if (sbridge_dev->bus == bus && (dom == SOCK || dom == sbridge_dev->dom))
return sbridge_dev;
}
return NULL;
}
-static struct sbridge_dev *alloc_sbridge_dev(u8 bus,
- const struct pci_id_table *table)
+static struct sbridge_dev *alloc_sbridge_dev(u8 bus, enum domain dom,
+ const struct pci_id_table *table)
{
struct sbridge_dev *sbridge_dev;
@@ -739,15 +753,17 @@ static struct sbridge_dev *alloc_sbridge_dev(u8 bus,
if (!sbridge_dev)
return NULL;
- sbridge_dev->pdev = kzalloc(sizeof(*sbridge_dev->pdev) * table->n_devs,
- GFP_KERNEL);
+ sbridge_dev->pdev = kcalloc(table->n_devs_per_imc,
+ sizeof(*sbridge_dev->pdev),
+ GFP_KERNEL);
if (!sbridge_dev->pdev) {
kfree(sbridge_dev);
return NULL;
}
sbridge_dev->bus = bus;
- sbridge_dev->n_devs = table->n_devs;
+ sbridge_dev->dom = dom;
+ sbridge_dev->n_devs = table->n_devs_per_imc;
list_add_tail(&sbridge_dev->list, &sbridge_edac_list);
return sbridge_dev;
@@ -1044,79 +1060,6 @@ static int haswell_chan_hash(int idx, u64 addr)
return idx;
}
-/****************************************************************************
- Memory check routines
- ****************************************************************************/
-static struct pci_dev *get_pdev_same_bus(u8 bus, u32 id)
-{
- struct pci_dev *pdev = NULL;
-
- do {
- pdev = pci_get_device(PCI_VENDOR_ID_INTEL, id, pdev);
- if (pdev && pdev->bus->number == bus)
- break;
- } while (pdev);
-
- return pdev;
-}
-
-/**
- * check_if_ecc_is_active() - Checks if ECC is active
- * @bus: Device bus
- * @type: Memory controller type
- * returns: 0 in case ECC is active, -ENODEV if it can't be determined or
- * disabled
- */
-static int check_if_ecc_is_active(const u8 bus, enum type type)
-{
- struct pci_dev *pdev = NULL;
- u32 mcmtr, id;
-
- switch (type) {
- case IVY_BRIDGE:
- id = PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA;
- break;
- case HASWELL:
- id = PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TA;
- break;
- case SANDY_BRIDGE:
- id = PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TA;
- break;
- case BROADWELL:
- id = PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TA;
- break;
- case KNIGHTS_LANDING:
- /*
- * KNL doesn't group things by bus the same way
- * SB/IB/Haswell does.
- */
- id = PCI_DEVICE_ID_INTEL_KNL_IMC_TA;
- break;
- default:
- return -ENODEV;
- }
-
- if (type != KNIGHTS_LANDING)
- pdev = get_pdev_same_bus(bus, id);
- else
- pdev = pci_get_device(PCI_VENDOR_ID_INTEL, id, 0);
-
- if (!pdev) {
- sbridge_printk(KERN_ERR, "Couldn't find PCI device "
- "%04x:%04x! on bus %02d\n",
- PCI_VENDOR_ID_INTEL, id, bus);
- return -ENODEV;
- }
-
- pci_read_config_dword(pdev,
- type == KNIGHTS_LANDING ? KNL_MCMTR : MCMTR, &mcmtr);
- if (!IS_ECC_ENABLED(mcmtr)) {
- sbridge_printk(KERN_ERR, "ECC is disabled. Aborting\n");
- return -ENODEV;
- }
- return 0;
-}
-
/* Low bits of TAD limit, and some metadata. */
static const u32 knl_tad_dram_limit_lo[] = {
0x400, 0x500, 0x600, 0x700,
@@ -1587,25 +1530,13 @@ static int knl_get_dimm_capacity(struct sbridge_pvt *pvt, u64 *mc_sizes)
return 0;
}
-static int get_dimm_config(struct mem_ctl_info *mci)
+static void get_source_id(struct mem_ctl_info *mci)
{
struct sbridge_pvt *pvt = mci->pvt_info;
- struct dimm_info *dimm;
- unsigned i, j, banks, ranks, rows, cols, npages;
- u64 size;
u32 reg;
- enum edac_type mode;
- enum mem_type mtype;
- int channels = pvt->info.type == KNIGHTS_LANDING ?
- KNL_MAX_CHANNELS : NUM_CHANNELS;
- u64 knl_mc_sizes[KNL_MAX_CHANNELS];
- if (pvt->info.type == HASWELL || pvt->info.type == BROADWELL) {
- pci_read_config_dword(pvt->pci_ha0, HASWELL_HASYSDEFEATURE2, &reg);
- pvt->is_chan_hash = GET_BITFIELD(reg, 21, 21);
- }
if (pvt->info.type == HASWELL || pvt->info.type == BROADWELL ||
- pvt->info.type == KNIGHTS_LANDING)
+ pvt->info.type == KNIGHTS_LANDING)
pci_read_config_dword(pvt->pci_sad1, SAD_TARGET, &reg);
else
pci_read_config_dword(pvt->pci_br0, SAD_TARGET, &reg);
@@ -1614,50 +1545,19 @@ static int get_dimm_config(struct mem_ctl_info *mci)
pvt->sbridge_dev->source_id = SOURCE_ID_KNL(reg);
else
pvt->sbridge_dev->source_id = SOURCE_ID(reg);
+}
- pvt->sbridge_dev->node_id = pvt->info.get_node_id(pvt);
- edac_dbg(0, "mc#%d: Node ID: %d, source ID: %d\n",
- pvt->sbridge_dev->mc,
- pvt->sbridge_dev->node_id,
- pvt->sbridge_dev->source_id);
-
- /* KNL doesn't support mirroring or lockstep,
- * and is always closed page
- */
- if (pvt->info.type == KNIGHTS_LANDING) {
- mode = EDAC_S4ECD4ED;
- pvt->is_mirrored = false;
-
- if (knl_get_dimm_capacity(pvt, knl_mc_sizes) != 0)
- return -1;
- } else {
- pci_read_config_dword(pvt->pci_ras, RASENABLES, &reg);
- if (IS_MIRROR_ENABLED(reg)) {
- edac_dbg(0, "Memory mirror is enabled\n");
- pvt->is_mirrored = true;
- } else {
- edac_dbg(0, "Memory mirror is disabled\n");
- pvt->is_mirrored = false;
- }
-
- pci_read_config_dword(pvt->pci_ta, MCMTR, &pvt->info.mcmtr);
- if (IS_LOCKSTEP_ENABLED(pvt->info.mcmtr)) {
- edac_dbg(0, "Lockstep is enabled\n");
- mode = EDAC_S8ECD8ED;
- pvt->is_lockstep = true;
- } else {
- edac_dbg(0, "Lockstep is disabled\n");
- mode = EDAC_S4ECD4ED;
- pvt->is_lockstep = false;
- }
- if (IS_CLOSE_PG(pvt->info.mcmtr)) {
- edac_dbg(0, "address map is on closed page mode\n");
- pvt->is_close_pg = true;
- } else {
- edac_dbg(0, "address map is on open page mode\n");
- pvt->is_close_pg = false;
- }
- }
+static int __populate_dimms(struct mem_ctl_info *mci,
+ u64 knl_mc_sizes[KNL_MAX_CHANNELS],
+ enum edac_type mode)
+{
+ struct sbridge_pvt *pvt = mci->pvt_info;
+ int channels = pvt->info.type == KNIGHTS_LANDING ? KNL_MAX_CHANNELS
+ : NUM_CHANNELS;
+ unsigned int i, j, banks, ranks, rows, cols, npages;
+ struct dimm_info *dimm;
+ enum mem_type mtype;
+ u64 size;
mtype = pvt->info.get_memory_type(pvt);
if (mtype == MEM_RDDR3 || mtype == MEM_RDDR4)
@@ -1688,8 +1588,7 @@ static int get_dimm_config(struct mem_ctl_info *mci)
}
for (j = 0; j < max_dimms_per_channel; j++) {
- dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
- i, j, 0);
+ dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, j, 0);
if (pvt->info.type == KNIGHTS_LANDING) {
pci_read_config_dword(pvt->knl.pci_channel[i],
knl_mtr_reg, &mtr);
@@ -1699,6 +1598,12 @@ static int get_dimm_config(struct mem_ctl_info *mci)
}
edac_dbg(4, "Channel #%d MTR%d = %x\n", i, j, mtr);
if (IS_DIMM_PRESENT(mtr)) {
+ if (!IS_ECC_ENABLED(pvt->info.mcmtr)) {
+ sbridge_printk(KERN_ERR, "CPU SrcID #%d, Ha #%d, Channel #%d has DIMMs, but ECC is disabled\n",
+ pvt->sbridge_dev->source_id,
+ pvt->sbridge_dev->dom, i);
+ return -ENODEV;
+ }
pvt->channel[i].dimms++;
ranks = numrank(pvt->info.type, mtr);
@@ -1717,7 +1622,7 @@ static int get_dimm_config(struct mem_ctl_info *mci)
npages = MiB_TO_PAGES(size);
edac_dbg(0, "mc#%d: ha %d channel %d, dimm %d, %lld Mb (%d pages) bank: %d, rank: %d, row: %#x, col: %#x\n",
- pvt->sbridge_dev->mc, i/4, i%4, j,
+ pvt->sbridge_dev->mc, pvt->sbridge_dev->dom, i, j,
size, npages,
banks, ranks, rows, cols);
@@ -1727,8 +1632,8 @@ static int get_dimm_config(struct mem_ctl_info *mci)
dimm->mtype = mtype;
dimm->edac_mode = mode;
snprintf(dimm->label, sizeof(dimm->label),
- "CPU_SrcID#%u_Ha#%u_Chan#%u_DIMM#%u",
- pvt->sbridge_dev->source_id, i/4, i%4, j);
+ "CPU_SrcID#%u_Ha#%u_Chan#%u_DIMM#%u",
+ pvt->sbridge_dev->source_id, pvt->sbridge_dev->dom, i, j);
}
}
}
@@ -1736,6 +1641,65 @@ static int get_dimm_config(struct mem_ctl_info *mci)
return 0;
}
+static int get_dimm_config(struct mem_ctl_info *mci)
+{
+ struct sbridge_pvt *pvt = mci->pvt_info;
+ u64 knl_mc_sizes[KNL_MAX_CHANNELS];
+ enum edac_type mode;
+ u32 reg;
+
+ if (pvt->info.type == HASWELL || pvt->info.type == BROADWELL) {
+ pci_read_config_dword(pvt->pci_ha, HASWELL_HASYSDEFEATURE2, &reg);
+ pvt->is_chan_hash = GET_BITFIELD(reg, 21, 21);
+ }
+ pvt->sbridge_dev->node_id = pvt->info.get_node_id(pvt);
+ edac_dbg(0, "mc#%d: Node ID: %d, source ID: %d\n",
+ pvt->sbridge_dev->mc,
+ pvt->sbridge_dev->node_id,
+ pvt->sbridge_dev->source_id);
+
+ /* KNL doesn't support mirroring or lockstep,
+ * and is always closed page
+ */
+ if (pvt->info.type == KNIGHTS_LANDING) {
+ mode = EDAC_S4ECD4ED;
+ pvt->is_mirrored = false;
+
+ if (knl_get_dimm_capacity(pvt, knl_mc_sizes) != 0)
+ return -1;
+ pci_read_config_dword(pvt->pci_ta, KNL_MCMTR, &pvt->info.mcmtr);
+ } else {
+ pci_read_config_dword(pvt->pci_ras, RASENABLES, &reg);
+ if (IS_MIRROR_ENABLED(reg)) {
+ edac_dbg(0, "Memory mirror is enabled\n");
+ pvt->is_mirrored = true;
+ } else {
+ edac_dbg(0, "Memory mirror is disabled\n");
+ pvt->is_mirrored = false;
+ }
+
+ pci_read_config_dword(pvt->pci_ta, MCMTR, &pvt->info.mcmtr);
+ if (IS_LOCKSTEP_ENABLED(pvt->info.mcmtr)) {
+ edac_dbg(0, "Lockstep is enabled\n");
+ mode = EDAC_S8ECD8ED;
+ pvt->is_lockstep = true;
+ } else {
+ edac_dbg(0, "Lockstep is disabled\n");
+ mode = EDAC_S4ECD4ED;
+ pvt->is_lockstep = false;
+ }
+ if (IS_CLOSE_PG(pvt->info.mcmtr)) {
+ edac_dbg(0, "address map is on closed page mode\n");
+ pvt->is_close_pg = true;
+ } else {
+ edac_dbg(0, "address map is on open page mode\n");
+ pvt->is_close_pg = false;
+ }
+ }
+
+ return __populate_dimms(mci, knl_mc_sizes, mode);
+}
+
static void get_memory_layout(const struct mem_ctl_info *mci)
{
struct sbridge_pvt *pvt = mci->pvt_info;
@@ -1816,8 +1780,7 @@ static void get_memory_layout(const struct mem_ctl_info *mci)
*/
prv = 0;
for (n_tads = 0; n_tads < MAX_TAD; n_tads++) {
- pci_read_config_dword(pvt->pci_ha0, tad_dram_rule[n_tads],
- &reg);
+ pci_read_config_dword(pvt->pci_ha, tad_dram_rule[n_tads], &reg);
limit = TAD_LIMIT(reg);
if (limit <= prv)
break;
@@ -1899,12 +1862,12 @@ static void get_memory_layout(const struct mem_ctl_info *mci)
}
}
-static struct mem_ctl_info *get_mci_for_node_id(u8 node_id)
+static struct mem_ctl_info *get_mci_for_node_id(u8 node_id, u8 ha)
{
struct sbridge_dev *sbridge_dev;
list_for_each_entry(sbridge_dev, &sbridge_edac_list, list) {
- if (sbridge_dev->node_id == node_id)
+ if (sbridge_dev->node_id == node_id && sbridge_dev->dom == ha)
return sbridge_dev->mci;
}
return NULL;
@@ -1925,7 +1888,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
int interleave_mode, shiftup = 0;
unsigned sad_interleave[pvt->info.max_interleave];
u32 reg, dram_rule;
- u8 ch_way, sck_way, pkg, sad_ha = 0, ch_add = 0;
+ u8 ch_way, sck_way, pkg, sad_ha = 0;
u32 tad_offset;
u32 rir_way;
u32 mb, gb;
@@ -2038,13 +2001,10 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
pkg = sad_pkg(pvt->info.interleave_pkg, reg, idx);
*socket = sad_pkg_socket(pkg);
sad_ha = sad_pkg_ha(pkg);
- if (sad_ha)
- ch_add = 4;
if (a7mode) {
/* MCChanShiftUpEnable */
- pci_read_config_dword(pvt->pci_ha0,
- HASWELL_HASYSDEFEATURE2, &reg);
+ pci_read_config_dword(pvt->pci_ha, HASWELL_HASYSDEFEATURE2, &reg);
shiftup = GET_BITFIELD(reg, 22, 22);
}
@@ -2056,8 +2016,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
pkg = sad_pkg(pvt->info.interleave_pkg, reg, idx);
*socket = sad_pkg_socket(pkg);
sad_ha = sad_pkg_ha(pkg);
- if (sad_ha)
- ch_add = 4;
edac_dbg(0, "SAD interleave package: %d = CPU socket %d, HA %d\n",
idx, *socket, sad_ha);
}
@@ -2068,7 +2026,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
* Move to the proper node structure, in order to access the
* right PCI registers
*/
- new_mci = get_mci_for_node_id(*socket);
+ new_mci = get_mci_for_node_id(*socket, sad_ha);
if (!new_mci) {
sprintf(msg, "Struct for socket #%u wasn't initialized",
*socket);
@@ -2081,14 +2039,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
* Step 2) Get memory channel
*/
prv = 0;
- if (pvt->info.type == SANDY_BRIDGE)
- pci_ha = pvt->pci_ha0;
- else {
- if (sad_ha)
- pci_ha = pvt->pci_ha1;
- else
- pci_ha = pvt->pci_ha0;
- }
+ pci_ha = pvt->pci_ha;
for (n_tads = 0; n_tads < MAX_TAD; n_tads++) {
pci_read_config_dword(pci_ha, tad_dram_rule[n_tads], &reg);
limit = TAD_LIMIT(reg);
@@ -2139,9 +2090,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
}
*channel_mask = 1 << base_ch;
- pci_read_config_dword(pvt->pci_tad[ch_add + base_ch],
- tad_ch_nilv_offset[n_tads],
- &tad_offset);
+ pci_read_config_dword(pvt->pci_tad[base_ch], tad_ch_nilv_offset[n_tads], &tad_offset);
if (pvt->is_mirrored) {
*channel_mask |= 1 << ((base_ch + 2) % 4);
@@ -2192,9 +2141,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
* Step 3) Decode rank
*/
for (n_rir = 0; n_rir < MAX_RIR_RANGES; n_rir++) {
- pci_read_config_dword(pvt->pci_tad[ch_add + base_ch],
- rir_way_limit[n_rir],
- &reg);
+ pci_read_config_dword(pvt->pci_tad[base_ch], rir_way_limit[n_rir], &reg);
if (!IS_RIR_VALID(reg))
continue;
@@ -2222,9 +2169,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
idx = (ch_addr >> 13); /* FIXME: Datasheet says to shift by 15 */
idx %= 1 << rir_way;
- pci_read_config_dword(pvt->pci_tad[ch_add + base_ch],
- rir_offset[n_rir][idx],
- &reg);
+ pci_read_config_dword(pvt->pci_tad[base_ch], rir_offset[n_rir][idx], &reg);
*rank = RIR_RNK_TGT(pvt->info.type, reg);
edac_dbg(0, "RIR#%d: channel address 0x%08Lx < 0x%08Lx, RIR interleave %d, index %d\n",
@@ -2277,10 +2222,11 @@ static int sbridge_get_onedevice(struct pci_dev **prev,
const unsigned devno,
const int multi_bus)
{
- struct sbridge_dev *sbridge_dev;
+ struct sbridge_dev *sbridge_dev = NULL;
const struct pci_id_descr *dev_descr = &table->descr[devno];
struct pci_dev *pdev = NULL;
u8 bus = 0;
+ int i = 0;
sbridge_printk(KERN_DEBUG,
"Seeking for: PCI ID %04x:%04x\n",
@@ -2311,9 +2257,14 @@ static int sbridge_get_onedevice(struct pci_dev **prev,
}
bus = pdev->bus->number;
- sbridge_dev = get_sbridge_dev(bus, multi_bus);
+next_imc:
+ sbridge_dev = get_sbridge_dev(bus, dev_descr->dom, multi_bus, sbridge_dev);
if (!sbridge_dev) {
- sbridge_dev = alloc_sbridge_dev(bus, table);
+
+ if (dev_descr->dom == SOCK)
+ goto out_imc;
+
+ sbridge_dev = alloc_sbridge_dev(bus, dev_descr->dom, table);
if (!sbridge_dev) {
pci_dev_put(pdev);
return -ENOMEM;
@@ -2321,7 +2272,7 @@ static int sbridge_get_onedevice(struct pci_dev **prev,
(*num_mc)++;
}
- if (sbridge_dev->pdev[devno]) {
+ if (sbridge_dev->pdev[sbridge_dev->i_devs]) {
sbridge_printk(KERN_ERR,
"Duplicated device for %04x:%04x\n",
PCI_VENDOR_ID_INTEL, dev_descr->dev_id);
@@ -2329,8 +2280,16 @@ static int sbridge_get_onedevice(struct pci_dev **prev,
return -ENODEV;
}
- sbridge_dev->pdev[devno] = pdev;
+ sbridge_dev->pdev[sbridge_dev->i_devs++] = pdev;
+
+ /* pdev belongs to more than one IMC, do extra gets */
+ if (++i > 1)
+ pci_dev_get(pdev);
+ if (dev_descr->dom == SOCK && i < table->n_imcs_per_sock)
+ goto next_imc;
+
+out_imc:
/* Be sure that the device is enabled */
if (unlikely(pci_enable_device(pdev) < 0)) {
sbridge_printk(KERN_ERR,
@@ -2374,7 +2333,7 @@ static int sbridge_get_all_devices(u8 *num_mc,
if (table->type == KNIGHTS_LANDING)
allow_dups = multi_bus = 1;
while (table && table->descr) {
- for (i = 0; i < table->n_devs; i++) {
+ for (i = 0; i < table->n_devs_per_sock; i++) {
if (!allow_dups || i == 0 ||
table->descr[i].dev_id !=
table->descr[i-1].dev_id) {
@@ -2385,7 +2344,7 @@ static int sbridge_get_all_devices(u8 *num_mc,
table, i, multi_bus);
if (rc < 0) {
if (i == 0) {
- i = table->n_devs;
+ i = table->n_devs_per_sock;
break;
}
sbridge_put_all_devices();
@@ -2399,6 +2358,13 @@ static int sbridge_get_all_devices(u8 *num_mc,
return 0;
}
+/*
+ * Device IDs for {SBRIDGE,IBRIDGE,HASWELL,BROADWELL}_IMC_HA0_TAD0 are in
+ * the format: XXXa. So we can convert from a device to the corresponding
+ * channel like this
+ */
+#define TAD_DEV_TO_CHAN(dev) (((dev) & 0xf) - 0xa)
+
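
As an aside, the conversion performed by the new TAD_DEV_TO_CHAN() macro can be exercised in a small stand-alone sketch; the device IDs below are hypothetical stand-ins chosen only so that they end in 0xa..0xd, not values taken from the patch.

/* Illustrative sketch, not part of the patch: the TAD device IDs end in
 * 0xa..0xd, so masking the low nibble and subtracting 0xa yields the
 * channel number 0..3.
 */
#include <assert.h>
#include <stdio.h>

#define TAD_DEV_TO_CHAN(dev) (((dev) & 0xf) - 0xa)

int main(void)
{
	/* Hypothetical TAD0..TAD3 device IDs sharing an xxxa..xxxd pattern. */
	const unsigned int tad_ids[] = { 0x0eaa, 0x0eab, 0x0eac, 0x0ead };
	int i;

	for (i = 0; i < 4; i++) {
		int chan = TAD_DEV_TO_CHAN(tad_ids[i]);

		assert(chan == i);
		printf("device %#06x -> channel %d\n", tad_ids[i], chan);
	}
	return 0;
}
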
static int sbridge_mci_bind_devs(struct mem_ctl_info *mci,
struct sbridge_dev *sbridge_dev)
{
@@ -2423,7 +2389,7 @@ static int sbridge_mci_bind_devs(struct mem_ctl_info *mci,
pvt->pci_br0 = pdev;
break;
case PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_HA0:
- pvt->pci_ha0 = pdev;
+ pvt->pci_ha = pdev;
break;
case PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TA:
pvt->pci_ta = pdev;
@@ -2436,7 +2402,7 @@ static int sbridge_mci_bind_devs(struct mem_ctl_info *mci,
case PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TAD2:
case PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TAD3:
{
- int id = pdev->device - PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TAD0;
+ int id = TAD_DEV_TO_CHAN(pdev->device);
pvt->pci_tad[id] = pdev;
saw_chan_mask |= 1 << id;
}
@@ -2455,7 +2421,7 @@ static int sbridge_mci_bind_devs(struct mem_ctl_info *mci,
}
 /* Check if everything was registered */
- if (!pvt->pci_sad0 || !pvt->pci_sad1 || !pvt->pci_ha0 ||
+ if (!pvt->pci_sad0 || !pvt->pci_sad1 || !pvt->pci_ha ||
!pvt->pci_ras || !pvt->pci_ta)
goto enodev;
@@ -2488,19 +2454,26 @@ static int ibridge_mci_bind_devs(struct mem_ctl_info *mci,
switch (pdev->device) {
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0:
- pvt->pci_ha0 = pdev;
+ case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1:
+ pvt->pci_ha = pdev;
break;
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA:
+ case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TA:
pvt->pci_ta = pdev;
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_RAS:
+ case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_RAS:
pvt->pci_ras = pdev;
break;
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD0:
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD1:
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD2:
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD3:
+ case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0:
+ case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD1:
+ case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD2:
+ case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD3:
{
- int id = pdev->device - PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD0;
+ int id = TAD_DEV_TO_CHAN(pdev->device);
pvt->pci_tad[id] = pdev;
saw_chan_mask |= 1 << id;
}
@@ -2520,19 +2493,6 @@ static int ibridge_mci_bind_devs(struct mem_ctl_info *mci,
case PCI_DEVICE_ID_INTEL_IBRIDGE_BR1:
pvt->pci_br1 = pdev;
break;
- case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1:
- pvt->pci_ha1 = pdev;
- break;
- case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0:
- case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD1:
- case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD2:
- case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD3:
- {
- int id = pdev->device - PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0 + 4;
- pvt->pci_tad[id] = pdev;
- saw_chan_mask |= 1 << id;
- }
- break;
default:
goto error;
}
@@ -2544,13 +2504,12 @@ static int ibridge_mci_bind_devs(struct mem_ctl_info *mci,
}
 /* Check if everything was registered */
- if (!pvt->pci_sad0 || !pvt->pci_ha0 || !pvt->pci_br0 ||
+ if (!pvt->pci_sad0 || !pvt->pci_ha || !pvt->pci_br0 ||
!pvt->pci_br1 || !pvt->pci_ras || !pvt->pci_ta)
goto enodev;
- if (saw_chan_mask != 0x0f && /* -EN */
- saw_chan_mask != 0x33 && /* -EP */
- saw_chan_mask != 0xff) /* -EX */
+ if (saw_chan_mask != 0x0f && /* -EN/-EX */
+ saw_chan_mask != 0x03) /* -EP */
goto enodev;
return 0;
@@ -2593,32 +2552,27 @@ static int haswell_mci_bind_devs(struct mem_ctl_info *mci,
pvt->pci_sad1 = pdev;
break;
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0:
- pvt->pci_ha0 = pdev;
+ case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1:
+ pvt->pci_ha = pdev;
break;
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TA:
+ case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TA:
pvt->pci_ta = pdev;
break;
- case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_THERMAL:
+ case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TM:
+ case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TM:
pvt->pci_ras = pdev;
break;
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD0:
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD1:
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD2:
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD3:
- {
- int id = pdev->device - PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD0;
-
- pvt->pci_tad[id] = pdev;
- saw_chan_mask |= 1 << id;
- }
- break;
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD0:
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD1:
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD2:
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD3:
{
- int id = pdev->device - PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD0 + 4;
-
+ int id = TAD_DEV_TO_CHAN(pdev->device);
pvt->pci_tad[id] = pdev;
saw_chan_mask |= 1 << id;
}
@@ -2630,12 +2584,6 @@ static int haswell_mci_bind_devs(struct mem_ctl_info *mci,
if (!pvt->pci_ddrio)
pvt->pci_ddrio = pdev;
break;
- case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1:
- pvt->pci_ha1 = pdev;
- break;
- case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TA:
- pvt->pci_ha1_ta = pdev;
- break;
default:
break;
}
@@ -2647,13 +2595,12 @@ static int haswell_mci_bind_devs(struct mem_ctl_info *mci,
}
 /* Check if everything was registered */
- if (!pvt->pci_sad0 || !pvt->pci_ha0 || !pvt->pci_sad1 ||
+ if (!pvt->pci_sad0 || !pvt->pci_ha || !pvt->pci_sad1 ||
!pvt->pci_ras || !pvt->pci_ta || !pvt->info.pci_vtd)
goto enodev;
- if (saw_chan_mask != 0x0f && /* -EN */
- saw_chan_mask != 0x33 && /* -EP */
- saw_chan_mask != 0xff) /* -EX */
+ if (saw_chan_mask != 0x0f && /* -EN/-EX */
+ saw_chan_mask != 0x03) /* -EP */
goto enodev;
return 0;
@@ -2690,30 +2637,27 @@ static int broadwell_mci_bind_devs(struct mem_ctl_info *mci,
pvt->pci_sad1 = pdev;
break;
case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0:
- pvt->pci_ha0 = pdev;
+ case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1:
+ pvt->pci_ha = pdev;
break;
case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TA:
+ case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TA:
pvt->pci_ta = pdev;
break;
- case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_THERMAL:
+ case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TM:
+ case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TM:
pvt->pci_ras = pdev;
break;
case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD0:
case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD1:
case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD2:
case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD3:
- {
- int id = pdev->device - PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TAD0;
- pvt->pci_tad[id] = pdev;
- saw_chan_mask |= 1 << id;
- }
- break;
case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TAD0:
case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TAD1:
case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TAD2:
case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TAD3:
{
- int id = pdev->device - PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TAD0 + 4;
+ int id = TAD_DEV_TO_CHAN(pdev->device);
pvt->pci_tad[id] = pdev;
saw_chan_mask |= 1 << id;
}
@@ -2721,12 +2665,6 @@ static int broadwell_mci_bind_devs(struct mem_ctl_info *mci,
case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_DDRIO0:
pvt->pci_ddrio = pdev;
break;
- case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1:
- pvt->pci_ha1 = pdev;
- break;
- case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA1_TA:
- pvt->pci_ha1_ta = pdev;
- break;
default:
break;
}
@@ -2738,13 +2676,12 @@ static int broadwell_mci_bind_devs(struct mem_ctl_info *mci,
}
 /* Check if everything was registered */
- if (!pvt->pci_sad0 || !pvt->pci_ha0 || !pvt->pci_sad1 ||
+ if (!pvt->pci_sad0 || !pvt->pci_ha || !pvt->pci_sad1 ||
!pvt->pci_ras || !pvt->pci_ta || !pvt->info.pci_vtd)
goto enodev;
- if (saw_chan_mask != 0x0f && /* -EN */
- saw_chan_mask != 0x33 && /* -EP */
- saw_chan_mask != 0xff) /* -EX */
+ if (saw_chan_mask != 0x0f && /* -EN/-EX */
+ saw_chan_mask != 0x03) /* -EP */
goto enodev;
return 0;
@@ -2812,7 +2749,7 @@ static int knl_mci_bind_devs(struct mem_ctl_info *mci,
pvt->knl.pci_cha[devidx] = pdev;
break;
- case PCI_DEVICE_ID_INTEL_KNL_IMC_CHANNEL:
+ case PCI_DEVICE_ID_INTEL_KNL_IMC_CHAN:
devidx = -1;
/*
@@ -3006,7 +2943,7 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
if (rc < 0)
goto err_parsing;
- new_mci = get_mci_for_node_id(socket);
+ new_mci = get_mci_for_node_id(socket, ha);
if (!new_mci) {
strcpy(msg, "Error: socket got corrupted!");
goto err_parsing;
@@ -3053,7 +2990,7 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
/* Call the helper to output message */
edac_mc_handle_error(tp_event, mci, core_err_cnt,
m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0,
- 4*ha+channel, dimm, -1,
+ channel, dimm, -1,
optype, msg);
return;
err_parsing:
@@ -3078,7 +3015,7 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val,
if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
return NOTIFY_DONE;
- mci = get_mci_for_node_id(mce->socketid);
+ mci = get_mci_for_node_id(mce->socketid, IMC0);
if (!mci)
return NOTIFY_DONE;
pvt = mci->pvt_info;
@@ -3159,11 +3096,6 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
struct pci_dev *pdev = sbridge_dev->pdev[0];
int rc;
- /* Check the number of active and not disabled channels */
- rc = check_if_ecc_is_active(sbridge_dev->bus, type);
- if (unlikely(rc < 0))
- return rc;
-
/* allocate a new MC control structure */
layers[0].type = EDAC_MC_LAYER_CHANNEL;
layers[0].size = type == KNIGHTS_LANDING ?
@@ -3192,7 +3124,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
MEM_FLAG_DDR4 : MEM_FLAG_DDR3;
mci->edac_ctl_cap = EDAC_FLAG_NONE;
mci->edac_cap = EDAC_FLAG_NONE;
- mci->mod_name = "sbridge_edac.c";
+ mci->mod_name = "sb_edac.c";
mci->mod_ver = SBRIDGE_REVISION;
mci->dev_name = pci_name(pdev);
mci->ctl_page_to_phys = NULL;
@@ -3215,12 +3147,14 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.max_interleave = ARRAY_SIZE(ibridge_interleave_list);
pvt->info.interleave_pkg = ibridge_interleave_pkg;
pvt->info.get_width = ibridge_get_width;
- mci->ctl_name = kasprintf(GFP_KERNEL, "Ivy Bridge Socket#%d", mci->mc_idx);
/* Store pci devices at mci for faster access */
rc = ibridge_mci_bind_devs(mci, sbridge_dev);
if (unlikely(rc < 0))
goto fail0;
+ get_source_id(mci);
+ mci->ctl_name = kasprintf(GFP_KERNEL, "Ivy Bridge SrcID#%d_Ha#%d",
+ pvt->sbridge_dev->source_id, pvt->sbridge_dev->dom);
break;
case SANDY_BRIDGE:
pvt->info.rankcfgr = SB_RANK_CFG_A;
@@ -3238,12 +3172,14 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.max_interleave = ARRAY_SIZE(sbridge_interleave_list);
pvt->info.interleave_pkg = sbridge_interleave_pkg;
pvt->info.get_width = sbridge_get_width;
- mci->ctl_name = kasprintf(GFP_KERNEL, "Sandy Bridge Socket#%d", mci->mc_idx);
/* Store pci devices at mci for faster access */
rc = sbridge_mci_bind_devs(mci, sbridge_dev);
if (unlikely(rc < 0))
goto fail0;
+ get_source_id(mci);
+ mci->ctl_name = kasprintf(GFP_KERNEL, "Sandy Bridge SrcID#%d_Ha#%d",
+ pvt->sbridge_dev->source_id, pvt->sbridge_dev->dom);
break;
case HASWELL:
/* rankcfgr isn't used */
@@ -3261,12 +3197,14 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.max_interleave = ARRAY_SIZE(ibridge_interleave_list);
pvt->info.interleave_pkg = ibridge_interleave_pkg;
pvt->info.get_width = ibridge_get_width;
- mci->ctl_name = kasprintf(GFP_KERNEL, "Haswell Socket#%d", mci->mc_idx);
/* Store pci devices at mci for faster access */
rc = haswell_mci_bind_devs(mci, sbridge_dev);
if (unlikely(rc < 0))
goto fail0;
+ get_source_id(mci);
+ mci->ctl_name = kasprintf(GFP_KERNEL, "Haswell SrcID#%d_Ha#%d",
+ pvt->sbridge_dev->source_id, pvt->sbridge_dev->dom);
break;
case BROADWELL:
/* rankcfgr isn't used */
@@ -3284,12 +3222,14 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.max_interleave = ARRAY_SIZE(ibridge_interleave_list);
pvt->info.interleave_pkg = ibridge_interleave_pkg;
pvt->info.get_width = broadwell_get_width;
- mci->ctl_name = kasprintf(GFP_KERNEL, "Broadwell Socket#%d", mci->mc_idx);
/* Store pci devices at mci for faster access */
rc = broadwell_mci_bind_devs(mci, sbridge_dev);
if (unlikely(rc < 0))
goto fail0;
+ get_source_id(mci);
+ mci->ctl_name = kasprintf(GFP_KERNEL, "Broadwell SrcID#%d_Ha#%d",
+ pvt->sbridge_dev->source_id, pvt->sbridge_dev->dom);
break;
case KNIGHTS_LANDING:
/* pvt->info.rankcfgr == ??? */
@@ -3307,17 +3247,22 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.max_interleave = ARRAY_SIZE(knl_interleave_list);
pvt->info.interleave_pkg = ibridge_interleave_pkg;
pvt->info.get_width = knl_get_width;
- mci->ctl_name = kasprintf(GFP_KERNEL,
- "Knights Landing Socket#%d", mci->mc_idx);
rc = knl_mci_bind_devs(mci, sbridge_dev);
if (unlikely(rc < 0))
goto fail0;
+ get_source_id(mci);
+ mci->ctl_name = kasprintf(GFP_KERNEL, "Knights Landing SrcID#%d_Ha#%d",
+ pvt->sbridge_dev->source_id, pvt->sbridge_dev->dom);
break;
}
/* Get dimm basic config and the memory layout */
- get_dimm_config(mci);
+ rc = get_dimm_config(mci);
+ if (rc < 0) {
+ edac_dbg(0, "MC: failed to get_dimm_config()\n");
+ goto fail;
+ }
get_memory_layout(mci);
/* record ptr to the generic device */
@@ -3327,13 +3272,14 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
if (unlikely(edac_mc_add_mc(mci))) {
edac_dbg(0, "MC: failed edac_mc_add_mc()\n");
rc = -EINVAL;
- goto fail0;
+ goto fail;
}
return 0;
-fail0:
+fail:
kfree(mci->ctl_name);
+fail0:
edac_mc_free(mci);
sbridge_dev->mci = NULL;
return rc;
diff --git a/drivers/edac/thunderx_edac.c b/drivers/edac/thunderx_edac.c
index 86d585cb6d32..2d352b40ae1c 100644
--- a/drivers/edac/thunderx_edac.c
+++ b/drivers/edac/thunderx_edac.c
@@ -2080,7 +2080,7 @@ static int thunderx_l2c_probe(struct pci_dev *pdev,
if (IS_ENABLED(CONFIG_EDAC_DEBUG)) {
l2c->debugfs = edac_debugfs_create_dir(pdev->dev.kobj.name);
- thunderx_create_debugfs_nodes(l2c->debugfs, l2c_devattr,
+ ret = thunderx_create_debugfs_nodes(l2c->debugfs, l2c_devattr,
l2c, dfs_entries);
if (ret != dfs_entries) {
diff --git a/drivers/firmware/dmi-id.c b/drivers/firmware/dmi-id.c
index dc269cb288c2..951b6c79f166 100644
--- a/drivers/firmware/dmi-id.c
+++ b/drivers/firmware/dmi-id.c
@@ -47,7 +47,7 @@ DEFINE_DMI_ATTR_WITH_SHOW(product_name, 0444, DMI_PRODUCT_NAME);
DEFINE_DMI_ATTR_WITH_SHOW(product_version, 0444, DMI_PRODUCT_VERSION);
DEFINE_DMI_ATTR_WITH_SHOW(product_serial, 0400, DMI_PRODUCT_SERIAL);
DEFINE_DMI_ATTR_WITH_SHOW(product_uuid, 0400, DMI_PRODUCT_UUID);
-DEFINE_DMI_ATTR_WITH_SHOW(product_family, 0400, DMI_PRODUCT_FAMILY);
+DEFINE_DMI_ATTR_WITH_SHOW(product_family, 0444, DMI_PRODUCT_FAMILY);
DEFINE_DMI_ATTR_WITH_SHOW(board_vendor, 0444, DMI_BOARD_VENDOR);
DEFINE_DMI_ATTR_WITH_SHOW(board_name, 0444, DMI_BOARD_NAME);
DEFINE_DMI_ATTR_WITH_SHOW(board_version, 0444, DMI_BOARD_VERSION);
@@ -192,7 +192,7 @@ static void __init dmi_id_init_attr_table(void)
ADD_DMI_ATTR(product_version, DMI_PRODUCT_VERSION);
ADD_DMI_ATTR(product_serial, DMI_PRODUCT_SERIAL);
ADD_DMI_ATTR(product_uuid, DMI_PRODUCT_UUID);
- ADD_DMI_ATTR(product_family, DMI_PRODUCT_FAMILY);
+ ADD_DMI_ATTR(product_family, DMI_PRODUCT_FAMILY);
ADD_DMI_ATTR(board_vendor, DMI_BOARD_VENDOR);
ADD_DMI_ATTR(board_name, DMI_BOARD_NAME);
ADD_DMI_ATTR(board_version, DMI_BOARD_VERSION);
diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index 93f7acdaac7a..783041964439 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -144,7 +144,7 @@ static int __init dmi_walk_early(void (*decode)(const struct dmi_header *,
buf = dmi_early_remap(dmi_base, orig_dmi_len);
if (buf == NULL)
- return -1;
+ return -ENOMEM;
dmi_decode_table(buf, decode, NULL);
@@ -178,7 +178,7 @@ static void __init dmi_save_ident(const struct dmi_header *dm, int slot,
const char *d = (const char *) dm;
const char *p;
- if (dmi_ident[slot])
+ if (dmi_ident[slot] || dm->length <= string)
return;
p = dmi_string(dm, d[string]);
@@ -191,13 +191,14 @@ static void __init dmi_save_ident(const struct dmi_header *dm, int slot,
static void __init dmi_save_uuid(const struct dmi_header *dm, int slot,
int index)
{
- const u8 *d = (u8 *) dm + index;
+ const u8 *d;
char *s;
int is_ff = 1, is_00 = 1, i;
- if (dmi_ident[slot])
+ if (dmi_ident[slot] || dm->length <= index + 16)
return;
+ d = (u8 *) dm + index;
for (i = 0; i < 16 && (is_ff || is_00); i++) {
if (d[i] != 0x00)
is_00 = 0;
@@ -228,16 +229,17 @@ static void __init dmi_save_uuid(const struct dmi_header *dm, int slot,
static void __init dmi_save_type(const struct dmi_header *dm, int slot,
int index)
{
- const u8 *d = (u8 *) dm + index;
+ const u8 *d;
char *s;
- if (dmi_ident[slot])
+ if (dmi_ident[slot] || dm->length <= index)
return;
s = dmi_alloc(4);
if (!s)
return;
+ d = (u8 *) dm + index;
sprintf(s, "%u", *d & 0x7F);
dmi_ident[slot] = s;
}
@@ -278,9 +280,13 @@ static void __init dmi_save_devices(const struct dmi_header *dm)
static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm)
{
- int i, count = *(u8 *)(dm + 1);
+ int i, count;
struct dmi_device *dev;
+ if (dm->length < 0x05)
+ return;
+
+ count = *(u8 *)(dm + 1);
for (i = 1; i <= count; i++) {
const char *devname = dmi_string(dm, i);
@@ -353,6 +359,9 @@ static void __init dmi_save_extended_devices(const struct dmi_header *dm)
const char *name;
const u8 *d = (u8 *)dm;
+ if (dm->length < 0x0B)
+ return;
+
/* Skip disabled device */
if ((d[0x5] & 0x80) == 0)
return;
@@ -387,7 +396,7 @@ static void __init save_mem_devices(const struct dmi_header *dm, void *v)
const char *d = (const char *)dm;
static int nr;
- if (dm->type != DMI_ENTRY_MEM_DEVICE)
+ if (dm->type != DMI_ENTRY_MEM_DEVICE || dm->length < 0x12)
return;
if (nr >= dmi_memdev_nr) {
pr_warn(FW_BUG "Too many DIMM entries in SMBIOS table\n");
@@ -650,6 +659,21 @@ void __init dmi_scan_machine(void)
goto error;
/*
+ * Same logic as above, look for a 64-bit entry point
+ * first, and if not found, fall back to 32-bit entry point.
+ */
+ memcpy_fromio(buf, p, 16);
+ for (q = p + 16; q < p + 0x10000; q += 16) {
+ memcpy_fromio(buf + 16, q, 16);
+ if (!dmi_smbios3_present(buf)) {
+ dmi_available = 1;
+ dmi_early_unmap(p, 0x10000);
+ goto out;
+ }
+ memcpy(buf, buf + 16, 16);
+ }
+
+ /*
* Iterate over all possible DMI header addresses q.
* Maintain the 32 bytes around q in buf. On the
* first iteration, substitute zero for the
@@ -659,7 +683,7 @@ void __init dmi_scan_machine(void)
memset(buf, 0, 16);
for (q = p; q < p + 0x10000; q += 16) {
memcpy_fromio(buf + 16, q, 16);
- if (!dmi_smbios3_present(buf) || !dmi_present(buf)) {
+ if (!dmi_present(buf)) {
dmi_available = 1;
dmi_early_unmap(p, 0x10000);
goto out;
@@ -993,7 +1017,8 @@ EXPORT_SYMBOL(dmi_get_date);
* @decode: Callback function
* @private_data: Private data to be passed to the callback function
*
- * Returns -1 when the DMI table can't be reached, 0 on success.
+ * Returns 0 on success, -ENXIO if DMI is not selected or not present,
+ * or a different negative error code if DMI walking fails.
*/
int dmi_walk(void (*decode)(const struct dmi_header *, void *),
void *private_data)
@@ -1001,11 +1026,11 @@ int dmi_walk(void (*decode)(const struct dmi_header *, void *),
u8 *buf;
if (!dmi_available)
- return -1;
+ return -ENXIO;
buf = dmi_remap(dmi_base, dmi_len);
if (buf == NULL)
- return -1;
+ return -ENOMEM;
dmi_decode_table(buf, decode, private_data);
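
The dmi_scan.c hunks above all add the same defensive pattern: validate the structure's declared length before reading a field at a fixed offset. A minimal user-space sketch of that pattern follows; the header layout and offsets here are simplified assumptions for illustration, not the real SMBIOS definitions.

/* Illustrative sketch only: bounds-check against the declared length
 * before dereferencing a field, as the dmi_save_*() changes above do.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct fake_dmi_header {
	uint8_t type;
	uint8_t length;		/* declared length of the whole structure */
	uint16_t handle;
	uint8_t data[32];	/* formatted area, truncated for the demo */
};

/* Return the byte at @index, or -1 if the structure is too short. */
static int dmi_field_u8(const struct fake_dmi_header *dm, size_t index)
{
	if (dm->length <= index)	/* same guard as dmi_save_type() */
		return -1;
	return ((const uint8_t *)dm)[index];
}

int main(void)
{
	struct fake_dmi_header dm;

	memset(&dm, 0, sizeof(dm));
	dm.type = 3;		/* hypothetical chassis entry */
	dm.length = 5;		/* claims only offsets 0..4 are valid */
	dm.data[1] = 0x17;	/* absolute offset 5 - just past the end */

	printf("offset 4: %d\n", dmi_field_u8(&dm, 4));	/* in bounds */
	printf("offset 5: %d\n", dmi_field_u8(&dm, 5));	/* rejected: -1 */
	return 0;
}
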
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index 2e78b0b96d74..394db40ed374 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -112,6 +112,15 @@ config EFI_CAPSULE_LOADER
Most users should say N.
+config EFI_CAPSULE_QUIRK_QUARK_CSH
+ boolean "Add support for Quark capsules with non-standard headers"
+ depends on X86 && !64BIT
+ select EFI_CAPSULE_LOADER
+ default y
+ help
+ Add support for processing Quark X1000 EFI capsules, whose header
+ layout deviates from the layout mandated by the UEFI specification.
+
config EFI_TEST
tristate "EFI Runtime Service Tests Support"
depends on EFI
diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c
index 974c5a31a005..1cc41c3d6315 100644
--- a/drivers/firmware/efi/arm-runtime.c
+++ b/drivers/firmware/efi/arm-runtime.c
@@ -11,6 +11,7 @@
*
*/
+#include <linux/dmi.h>
#include <linux/efi.h>
#include <linux/io.h>
#include <linux/memblock.h>
@@ -166,3 +167,18 @@ void efi_virtmap_unload(void)
efi_set_pgd(current->active_mm);
preempt_enable();
}
+
+
+static int __init arm_dmi_init(void)
+{
+ /*
+ * On arm64/ARM, DMI depends on UEFI, and dmi_scan_machine() needs to
+ * be called early because dmi_id_init(), which is an arch_initcall
+ * itself, depends on dmi_scan_machine() having been called already.
+ */
+ dmi_scan_machine();
+ if (dmi_available)
+ dmi_set_dump_stack_arch_desc();
+ return 0;
+}
+core_initcall(arm_dmi_init);
diff --git a/drivers/firmware/efi/capsule-loader.c b/drivers/firmware/efi/capsule-loader.c
index 9ae6c116c474..ec8ac5c4dd84 100644
--- a/drivers/firmware/efi/capsule-loader.c
+++ b/drivers/firmware/efi/capsule-loader.c
@@ -20,15 +20,9 @@
#define NO_FURTHER_WRITE_ACTION -1
-struct capsule_info {
- bool header_obtained;
- int reset_type;
- long index;
- size_t count;
- size_t total_size;
- struct page **pages;
- size_t page_bytes_remain;
-};
+#ifndef phys_to_page
+#define phys_to_page(x) pfn_to_page((x) >> PAGE_SHIFT)
+#endif
/**
* efi_free_all_buff_pages - free all previous allocated buffer pages
@@ -41,65 +35,70 @@ struct capsule_info {
static void efi_free_all_buff_pages(struct capsule_info *cap_info)
{
while (cap_info->index > 0)
- __free_page(cap_info->pages[--cap_info->index]);
+ __free_page(phys_to_page(cap_info->pages[--cap_info->index]));
cap_info->index = NO_FURTHER_WRITE_ACTION;
}
-/**
- * efi_capsule_setup_info - obtain the efi capsule header in the binary and
- * setup capsule_info structure
- * @cap_info: pointer to current instance of capsule_info structure
- * @kbuff: a mapped first page buffer pointer
- * @hdr_bytes: the total received number of bytes for efi header
- **/
-static ssize_t efi_capsule_setup_info(struct capsule_info *cap_info,
- void *kbuff, size_t hdr_bytes)
+int __efi_capsule_setup_info(struct capsule_info *cap_info)
{
- efi_capsule_header_t *cap_hdr;
size_t pages_needed;
int ret;
void *temp_page;
- /* Only process data block that is larger than efi header size */
- if (hdr_bytes < sizeof(efi_capsule_header_t))
- return 0;
-
- /* Reset back to the correct offset of header */
- cap_hdr = kbuff - cap_info->count;
- pages_needed = ALIGN(cap_hdr->imagesize, PAGE_SIZE) >> PAGE_SHIFT;
+ pages_needed = ALIGN(cap_info->total_size, PAGE_SIZE) / PAGE_SIZE;
if (pages_needed == 0) {
- pr_err("%s: pages count invalid\n", __func__);
+ pr_err("invalid capsule size");
return -EINVAL;
}
/* Check if the capsule binary supported */
- ret = efi_capsule_supported(cap_hdr->guid, cap_hdr->flags,
- cap_hdr->imagesize,
+ ret = efi_capsule_supported(cap_info->header.guid,
+ cap_info->header.flags,
+ cap_info->header.imagesize,
&cap_info->reset_type);
if (ret) {
- pr_err("%s: efi_capsule_supported() failed\n",
- __func__);
+ pr_err("capsule not supported\n");
return ret;
}
- cap_info->total_size = cap_hdr->imagesize;
temp_page = krealloc(cap_info->pages,
pages_needed * sizeof(void *),
GFP_KERNEL | __GFP_ZERO);
- if (!temp_page) {
- pr_debug("%s: krealloc() failed\n", __func__);
+ if (!temp_page)
return -ENOMEM;
- }
cap_info->pages = temp_page;
- cap_info->header_obtained = true;
return 0;
}
/**
+ * efi_capsule_setup_info - obtain the efi capsule header in the binary and
+ * setup capsule_info structure
+ * @cap_info: pointer to current instance of capsule_info structure
+ * @kbuff: a mapped first page buffer pointer
+ * @hdr_bytes: the total received number of bytes for efi header
+ *
+ * Platforms with non-standard capsule update mechanisms can override
+ * this __weak function so they can perform any required capsule
+ * image munging. See quark_quirk_function() for an example.
+ **/
+int __weak efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff,
+ size_t hdr_bytes)
+{
+ /* Only process data block that is larger than efi header size */
+ if (hdr_bytes < sizeof(efi_capsule_header_t))
+ return 0;
+
+ memcpy(&cap_info->header, kbuff, sizeof(cap_info->header));
+ cap_info->total_size = cap_info->header.imagesize;
+
+ return __efi_capsule_setup_info(cap_info);
+}
+
+/**
* efi_capsule_submit_update - invoke the efi_capsule_update API once binary
* upload done
* @cap_info: pointer to current instance of capsule_info structure
@@ -107,26 +106,17 @@ static ssize_t efi_capsule_setup_info(struct capsule_info *cap_info,
static ssize_t efi_capsule_submit_update(struct capsule_info *cap_info)
{
int ret;
- void *cap_hdr_temp;
-
- cap_hdr_temp = vmap(cap_info->pages, cap_info->index,
- VM_MAP, PAGE_KERNEL);
- if (!cap_hdr_temp) {
- pr_debug("%s: vmap() failed\n", __func__);
- return -EFAULT;
- }
- ret = efi_capsule_update(cap_hdr_temp, cap_info->pages);
- vunmap(cap_hdr_temp);
+ ret = efi_capsule_update(&cap_info->header, cap_info->pages);
if (ret) {
- pr_err("%s: efi_capsule_update() failed\n", __func__);
+ pr_err("capsule update failed\n");
return ret;
}
/* Indicate capsule binary uploading is done */
cap_info->index = NO_FURTHER_WRITE_ACTION;
- pr_info("%s: Successfully upload capsule file with reboot type '%s'\n",
- __func__, !cap_info->reset_type ? "RESET_COLD" :
+ pr_info("Successfully upload capsule file with reboot type '%s'\n",
+ !cap_info->reset_type ? "RESET_COLD" :
cap_info->reset_type == 1 ? "RESET_WARM" :
"RESET_SHUTDOWN");
return 0;
@@ -171,37 +161,30 @@ static ssize_t efi_capsule_write(struct file *file, const char __user *buff,
if (!cap_info->page_bytes_remain) {
page = alloc_page(GFP_KERNEL);
if (!page) {
- pr_debug("%s: alloc_page() failed\n", __func__);
ret = -ENOMEM;
goto failed;
}
- cap_info->pages[cap_info->index++] = page;
+ cap_info->pages[cap_info->index++] = page_to_phys(page);
cap_info->page_bytes_remain = PAGE_SIZE;
+ } else {
+ page = phys_to_page(cap_info->pages[cap_info->index - 1]);
}
- page = cap_info->pages[cap_info->index - 1];
-
kbuff = kmap(page);
- if (!kbuff) {
- pr_debug("%s: kmap() failed\n", __func__);
- ret = -EFAULT;
- goto failed;
- }
kbuff += PAGE_SIZE - cap_info->page_bytes_remain;
/* Copy capsule binary data from user space to kernel space buffer */
write_byte = min_t(size_t, count, cap_info->page_bytes_remain);
if (copy_from_user(kbuff, buff, write_byte)) {
- pr_debug("%s: copy_from_user() failed\n", __func__);
ret = -EFAULT;
goto fail_unmap;
}
cap_info->page_bytes_remain -= write_byte;
/* Setup capsule binary info structure */
- if (!cap_info->header_obtained) {
- ret = efi_capsule_setup_info(cap_info, kbuff,
+ if (cap_info->header.headersize == 0) {
+ ret = efi_capsule_setup_info(cap_info, kbuff - cap_info->count,
cap_info->count + write_byte);
if (ret)
goto fail_unmap;
@@ -211,11 +194,10 @@ static ssize_t efi_capsule_write(struct file *file, const char __user *buff,
kunmap(page);
/* Submit the full binary to efi_capsule_update() API */
- if (cap_info->header_obtained &&
+ if (cap_info->header.headersize > 0 &&
cap_info->count >= cap_info->total_size) {
if (cap_info->count > cap_info->total_size) {
- pr_err("%s: upload size exceeded header defined size\n",
- __func__);
+ pr_err("capsule upload size exceeded header defined size\n");
ret = -EINVAL;
goto failed;
}
@@ -249,7 +231,7 @@ static int efi_capsule_flush(struct file *file, fl_owner_t id)
struct capsule_info *cap_info = file->private_data;
if (cap_info->index > 0) {
- pr_err("%s: capsule upload not complete\n", __func__);
+ pr_err("capsule upload not complete\n");
efi_free_all_buff_pages(cap_info);
ret = -ECANCELED;
}
@@ -328,8 +310,7 @@ static int __init efi_capsule_loader_init(void)
ret = misc_register(&efi_capsule_misc);
if (ret)
- pr_err("%s: Failed to register misc char file note\n",
- __func__);
+ pr_err("Unable to register capsule loader device\n");
return ret;
}
diff --git a/drivers/firmware/efi/capsule.c b/drivers/firmware/efi/capsule.c
index 6eedff45e6d7..901b9306bf94 100644
--- a/drivers/firmware/efi/capsule.c
+++ b/drivers/firmware/efi/capsule.c
@@ -214,7 +214,7 @@ efi_capsule_update_locked(efi_capsule_header_t *capsule,
*
* Return 0 on success, a converted EFI status code on failure.
*/
-int efi_capsule_update(efi_capsule_header_t *capsule, struct page **pages)
+int efi_capsule_update(efi_capsule_header_t *capsule, phys_addr_t *pages)
{
u32 imagesize = capsule->imagesize;
efi_guid_t guid = capsule->guid;
@@ -247,16 +247,13 @@ int efi_capsule_update(efi_capsule_header_t *capsule, struct page **pages)
efi_capsule_block_desc_t *sglist;
sglist = kmap(sg_pages[i]);
- if (!sglist) {
- rv = -ENOMEM;
- goto out;
- }
for (j = 0; j < SGLIST_PER_PAGE && count > 0; j++) {
- u64 sz = min_t(u64, imagesize, PAGE_SIZE);
+ u64 sz = min_t(u64, imagesize,
+ PAGE_SIZE - (u64)*pages % PAGE_SIZE);
sglist[j].length = sz;
- sglist[j].data = page_to_phys(*pages++);
+ sglist[j].data = *pages++;
imagesize -= sz;
count--;
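
The capsule.c hunk above clamps each scatter-gather entry to the bytes left in the current page, since a phys_addr_t block no longer has to start page-aligned. A rough stand-alone sketch of that length calculation follows; PAGE_SIZE, the start address, and the contiguous advance in the demo loop are assumptions made for the example, not behaviour guaranteed by the patch.

/* Illustrative sketch only: keep each block inside one page even when the
 * data does not start on a page boundary, mirroring the min_t() above.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL

static uint64_t block_len(uint64_t remaining, uint64_t phys)
{
	uint64_t room = PAGE_SIZE - phys % PAGE_SIZE;	/* bytes left in this page */

	return remaining < room ? remaining : room;	/* min_t(u64, ...) equivalent */
}

int main(void)
{
	uint64_t imagesize = 10000;		/* hypothetical capsule size */
	uint64_t phys = 0x1000ULL + 0x80;	/* hypothetical, not page aligned */

	while (imagesize) {
		uint64_t sz = block_len(imagesize, phys);

		printf("block at %#llx, length %llu\n",
		       (unsigned long long)phys, (unsigned long long)sz);
		imagesize -= sz;
		phys += sz;	/* contiguous only for this demo; the real code
				 * takes the next address from the pages array */
	}
	return 0;
}
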
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index b372aad3b449..045d6d311bde 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -528,7 +528,8 @@ int __init efi_config_parse_tables(void *config_tables, int count, int sz,
}
}
- efi_memattr_init();
+ if (efi_enabled(EFI_MEMMAP))
+ efi_memattr_init();
/* Parse the EFI Properties table if it exists */
if (efi.properties_table != EFI_INVALID_TABLE_ADDR) {
diff --git a/drivers/firmware/efi/test/efi_test.c b/drivers/firmware/efi/test/efi_test.c
index 8cd578f62059..08129b7b80ab 100644
--- a/drivers/firmware/efi/test/efi_test.c
+++ b/drivers/firmware/efi/test/efi_test.c
@@ -71,18 +71,13 @@ copy_ucs2_from_user_len(efi_char16_t **dst, efi_char16_t __user *src,
if (!access_ok(VERIFY_READ, src, 1))
return -EFAULT;
- buf = kmalloc(len, GFP_KERNEL);
- if (!buf) {
+ buf = memdup_user(src, len);
+ if (IS_ERR(buf)) {
*dst = NULL;
- return -ENOMEM;
+ return PTR_ERR(buf);
}
*dst = buf;
- if (copy_from_user(*dst, src, len)) {
- kfree(buf);
- return -EFAULT;
- }
-
return 0;
}
diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c
index 5104b6398139..c83ea68be792 100644
--- a/drivers/gpio/gpio-mvebu.c
+++ b/drivers/gpio/gpio-mvebu.c
@@ -721,7 +721,7 @@ static int mvebu_pwm_probe(struct platform_device *pdev,
u32 set;
if (!of_device_is_compatible(mvchip->chip.of_node,
- "marvell,armada-370-xp-gpio"))
+ "marvell,armada-370-gpio"))
return 0;
if (IS_ERR(mvchip->clk))
@@ -852,7 +852,7 @@ static const struct of_device_id mvebu_gpio_of_match[] = {
.data = (void *) MVEBU_GPIO_SOC_VARIANT_ARMADAXP,
},
{
- .compatible = "marvell,armada-370-xp-gpio",
+ .compatible = "marvell,armada-370-gpio",
.data = (void *) MVEBU_GPIO_SOC_VARIANT_ORION,
},
{
@@ -1128,7 +1128,7 @@ static int mvebu_gpio_probe(struct platform_device *pdev)
mvchip);
}
- /* Armada 370/XP has simple PWM support for GPIO lines */
+ /* Some MVEBU SoCs have simple PWM support for GPIO lines */
if (IS_ENABLED(CONFIG_PWM))
return mvebu_pwm_probe(pdev, mvchip, id);
diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c
index 2185232da823..8fa5fcd00e9a 100644
--- a/drivers/gpio/gpiolib-acpi.c
+++ b/drivers/gpio/gpiolib-acpi.c
@@ -201,7 +201,7 @@ static acpi_status acpi_gpiochip_request_interrupt(struct acpi_resource *ares,
handler = acpi_gpio_irq_handler_evt;
}
if (!handler)
- return AE_BAD_PARAMETER;
+ return AE_OK;
pin = acpi_gpiochip_pin_to_gpio_offset(chip->gpiodev, pin);
if (pin < 0)
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 5db44139cef8..a42a1eea5714 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -708,7 +708,8 @@ static irqreturn_t lineevent_irq_thread(int irq, void *p)
ge.timestamp = ktime_get_real_ns();
- if (le->eflags & GPIOEVENT_REQUEST_BOTH_EDGES) {
+ if (le->eflags & GPIOEVENT_REQUEST_RISING_EDGE
+ && le->eflags & GPIOEVENT_REQUEST_FALLING_EDGE) {
int level = gpiod_get_value_cansleep(le->desc);
if (level)
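
The gpiolib.c hunk above replaces a test against the two-bit GPIOEVENT_REQUEST_BOTH_EDGES mask with an explicit check of each edge flag, because ANDing with a multi-bit mask is true as soon as either bit is set. A small sketch of the difference; the flag values mirror the uapi definitions but are treated as assumptions here.

/* Illustrative sketch only: "flags & BOTH" is the wrong test when BOTH
 * is a mask of two bits.
 */
#include <stdbool.h>
#include <stdio.h>

#define REQ_RISING	(1 << 0)
#define REQ_FALLING	(1 << 1)
#define REQ_BOTH	(REQ_RISING | REQ_FALLING)

int main(void)
{
	unsigned int eflags = REQ_RISING;	/* caller asked for rising only */

	/* Old test: true because one bit of the mask matches. */
	bool old = eflags & REQ_BOTH;

	/* Fixed test: true only when both edges were requested. */
	bool fixed = (eflags & REQ_RISING) && (eflags & REQ_FALLING);

	printf("old test says both edges: %d\n", old);		/* 1 - wrong */
	printf("new test says both edges: %d\n", fixed);	/* 0 - right */
	return 0;
}
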
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
index 1cf78f4dd339..1e8e1123ddf4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
@@ -693,6 +693,10 @@ int amdgpu_atombios_get_clock_info(struct amdgpu_device *adev)
DRM_INFO("Changing default dispclk from %dMhz to 600Mhz\n",
adev->clock.default_dispclk / 100);
adev->clock.default_dispclk = 60000;
+ } else if (adev->clock.default_dispclk <= 60000) {
+ DRM_INFO("Changing default dispclk from %dMhz to 625Mhz\n",
+ adev->clock.default_dispclk / 100);
+ adev->clock.default_dispclk = 62500;
}
adev->clock.dp_extclk =
le16_to_cpu(firmware_info->info_21.usUniphyDPModeExtClkFreq);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index f2d705e6a75a..ab6b0d0febab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -449,6 +449,7 @@ static const struct pci_device_id pciidlist[] = {
{0x1002, 0x6986, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12},
{0x1002, 0x6987, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12},
{0x1002, 0x6995, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12},
+ {0x1002, 0x6997, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12},
{0x1002, 0x699F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12},
/* Vega 10 */
{0x1002, 0x6860, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VEGA10|AMD_EXP_HW_SUPPORT},
diff --git a/drivers/gpu/drm/amd/amdgpu/atombios_crtc.c b/drivers/gpu/drm/amd/amdgpu/atombios_crtc.c
index 8c9bc75a9c2d..8a0818b23ea4 100644
--- a/drivers/gpu/drm/amd/amdgpu/atombios_crtc.c
+++ b/drivers/gpu/drm/amd/amdgpu/atombios_crtc.c
@@ -165,7 +165,7 @@ void amdgpu_atombios_crtc_powergate(struct drm_crtc *crtc, int state)
struct drm_device *dev = crtc->dev;
struct amdgpu_device *adev = dev->dev_private;
int index = GetIndexIntoMasterTable(COMMAND, EnableDispPowerGating);
- ENABLE_DISP_POWER_GATING_PARAMETERS_V2_1 args;
+ ENABLE_DISP_POWER_GATING_PS_ALLOCATION args;
memset(&args, 0, sizeof(args));
@@ -178,7 +178,7 @@ void amdgpu_atombios_crtc_powergate(struct drm_crtc *crtc, int state)
void amdgpu_atombios_crtc_powergate_init(struct amdgpu_device *adev)
{
int index = GetIndexIntoMasterTable(COMMAND, EnableDispPowerGating);
- ENABLE_DISP_POWER_GATING_PARAMETERS_V2_1 args;
+ ENABLE_DISP_POWER_GATING_PS_ALLOCATION args;
memset(&args, 0, sizeof(args));
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
index 0cdeb6a2e4a0..5dffa27afa45 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
@@ -1207,8 +1207,11 @@ static void dce_v10_0_program_watermarks(struct amdgpu_device *adev,
u32 tmp, wm_mask, lb_vblank_lead_lines = 0;
if (amdgpu_crtc->base.enabled && num_heads && mode) {
- active_time = 1000000UL * (u32)mode->crtc_hdisplay / (u32)mode->clock;
- line_time = min((u32) (1000000UL * (u32)mode->crtc_htotal / (u32)mode->clock), (u32)65535);
+ active_time = (u32) div_u64((u64)mode->crtc_hdisplay * 1000000,
+ (u32)mode->clock);
+ line_time = (u32) div_u64((u64)mode->crtc_htotal * 1000000,
+ (u32)mode->clock);
+ line_time = min(line_time, (u32)65535);
/* watermark for high clocks */
if (adev->pm.dpm_enabled) {
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
index 773654a19749..47bbc87f96d2 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
@@ -1176,8 +1176,11 @@ static void dce_v11_0_program_watermarks(struct amdgpu_device *adev,
u32 tmp, wm_mask, lb_vblank_lead_lines = 0;
if (amdgpu_crtc->base.enabled && num_heads && mode) {
- active_time = 1000000UL * (u32)mode->crtc_hdisplay / (u32)mode->clock;
- line_time = min((u32) (1000000UL * (u32)mode->crtc_htotal / (u32)mode->clock), (u32)65535);
+ active_time = (u32) div_u64((u64)mode->crtc_hdisplay * 1000000,
+ (u32)mode->clock);
+ line_time = (u32) div_u64((u64)mode->crtc_htotal * 1000000,
+ (u32)mode->clock);
+ line_time = min(line_time, (u32)65535);
/* watermark for high clocks */
if (adev->pm.dpm_enabled) {
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
index 1f3552967ba3..d8c9a959493e 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
@@ -983,8 +983,11 @@ static void dce_v6_0_program_watermarks(struct amdgpu_device *adev,
fixed20_12 a, b, c;
if (amdgpu_crtc->base.enabled && num_heads && mode) {
- active_time = 1000000UL * (u32)mode->crtc_hdisplay / (u32)mode->clock;
- line_time = min((u32) (1000000UL * (u32)mode->crtc_htotal / (u32)mode->clock), (u32)65535);
+ active_time = (u32) div_u64((u64)mode->crtc_hdisplay * 1000000,
+ (u32)mode->clock);
+ line_time = (u32) div_u64((u64)mode->crtc_htotal * 1000000,
+ (u32)mode->clock);
+ line_time = min(line_time, (u32)65535);
priority_a_cnt = 0;
priority_b_cnt = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
index 3c558c170e5e..db30c6ba563a 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
@@ -1091,8 +1091,11 @@ static void dce_v8_0_program_watermarks(struct amdgpu_device *adev,
u32 tmp, wm_mask, lb_vblank_lead_lines = 0;
if (amdgpu_crtc->base.enabled && num_heads && mode) {
- active_time = 1000000UL * (u32)mode->crtc_hdisplay / (u32)mode->clock;
- line_time = min((u32) (1000000UL * (u32)mode->crtc_htotal / (u32)mode->clock), (u32)65535);
+ active_time = (u32) div_u64((u64)mode->crtc_hdisplay * 1000000,
+ (u32)mode->clock);
+ line_time = (u32) div_u64((u64)mode->crtc_htotal * 1000000,
+ (u32)mode->clock);
+ line_time = min(line_time, (u32)65535);
/* watermark for high clocks */
if (adev->pm.dpm_enabled) {
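
The four dce_v*_0.c hunks above widen the active_time/line_time calculation to a 64-bit multiply before dividing. The sketch below shows why: on a 32-bit build the intermediate product 1000000 * crtc_htotal can exceed UINT32_MAX and wrap. The timing values are hypothetical, not taken from the patch.

/* Illustrative sketch only: 32-bit overflow in the old watermark math
 * versus the widened 64-bit form (div_u64() in the kernel).
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t crtc_htotal = 4400;	/* hypothetical 4K timing */
	uint32_t clock = 594000;	/* pixel clock in kHz */

	/* Old form as it behaves where unsigned long is 32 bits:
	 * 1000000 * 4400 = 4.4e9 wraps to ~105 million before the divide.
	 */
	uint32_t prod = 1000000u * crtc_htotal;
	uint32_t old = prod / clock;

	/* New form: widen to 64 bits first, then divide and narrow. */
	uint32_t fixed = (uint32_t)(((uint64_t)crtc_htotal * 1000000) / clock);

	printf("32-bit line_time: %u (wrapped)\n", old);
	printf("64-bit line_time: %u\n", fixed);
	return 0;
}
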
diff --git a/drivers/gpu/drm/bridge/synopsys/Kconfig b/drivers/gpu/drm/bridge/synopsys/Kconfig
index 40d2827a6d19..53e78d092d18 100644
--- a/drivers/gpu/drm/bridge/synopsys/Kconfig
+++ b/drivers/gpu/drm/bridge/synopsys/Kconfig
@@ -1,6 +1,7 @@
config DRM_DW_HDMI
tristate
select DRM_KMS_HELPER
+ select REGMAP_MMIO
config DRM_DW_HDMI_AHB_AUDIO
tristate "Synopsys Designware AHB Audio interface"
diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index 9f847615ac74..48ca2457df8c 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -1229,21 +1229,6 @@ int drm_mode_getconnector(struct drm_device *dev, void *data,
if (!connector)
return -ENOENT;
- drm_modeset_lock(&dev->mode_config.connection_mutex, NULL);
- encoder = drm_connector_get_encoder(connector);
- if (encoder)
- out_resp->encoder_id = encoder->base.id;
- else
- out_resp->encoder_id = 0;
-
- ret = drm_mode_object_get_properties(&connector->base, file_priv->atomic,
- (uint32_t __user *)(unsigned long)(out_resp->props_ptr),
- (uint64_t __user *)(unsigned long)(out_resp->prop_values_ptr),
- &out_resp->count_props);
- drm_modeset_unlock(&dev->mode_config.connection_mutex);
- if (ret)
- goto out_unref;
-
for (i = 0; i < DRM_CONNECTOR_MAX_ENCODER; i++)
if (connector->encoder_ids[i] != 0)
encoders_count++;
@@ -1256,7 +1241,7 @@ int drm_mode_getconnector(struct drm_device *dev, void *data,
if (put_user(connector->encoder_ids[i],
encoder_ptr + copied)) {
ret = -EFAULT;
- goto out_unref;
+ goto out;
}
copied++;
}
@@ -1300,15 +1285,32 @@ int drm_mode_getconnector(struct drm_device *dev, void *data,
if (copy_to_user(mode_ptr + copied,
&u_mode, sizeof(u_mode))) {
ret = -EFAULT;
+ mutex_unlock(&dev->mode_config.mutex);
+
goto out;
}
copied++;
}
}
out_resp->count_modes = mode_count;
-out:
mutex_unlock(&dev->mode_config.mutex);
-out_unref:
+
+ drm_modeset_lock(&dev->mode_config.connection_mutex, NULL);
+ encoder = drm_connector_get_encoder(connector);
+ if (encoder)
+ out_resp->encoder_id = encoder->base.id;
+ else
+ out_resp->encoder_id = 0;
+
+ /* Only grab properties after probing, to make sure EDID and other
+ * properties reflect the latest status. */
+ ret = drm_mode_object_get_properties(&connector->base, file_priv->atomic,
+ (uint32_t __user *)(unsigned long)(out_resp->props_ptr),
+ (uint64_t __user *)(unsigned long)(out_resp->prop_values_ptr),
+ &out_resp->count_props);
+ drm_modeset_unlock(&dev->mode_config.connection_mutex);
+
+out:
drm_connector_put(connector);
return ret;
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.h b/drivers/gpu/drm/etnaviv/etnaviv_gem.h
index c4a091e87426..e437fba1209d 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.h
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.h
@@ -106,9 +106,10 @@ struct etnaviv_gem_submit {
struct etnaviv_gpu *gpu;
struct ww_acquire_ctx ticket;
struct dma_fence *fence;
+ u32 flags;
unsigned int nr_bos;
struct etnaviv_gem_submit_bo bos[0];
- u32 flags;
+ /* No new members here, the previous one is variable-length! */
};
int etnaviv_gem_wait_bo(struct etnaviv_gpu *gpu, struct drm_gem_object *obj,
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
index de80ee1b71df..1013765274da 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
@@ -172,7 +172,7 @@ static int submit_fence_sync(const struct etnaviv_gem_submit *submit)
for (i = 0; i < submit->nr_bos; i++) {
struct etnaviv_gem_object *etnaviv_obj = submit->bos[i].obj;
bool write = submit->bos[i].flags & ETNA_SUBMIT_BO_WRITE;
- bool explicit = !(submit->flags & ETNA_SUBMIT_NO_IMPLICIT);
+ bool explicit = !!(submit->flags & ETNA_SUBMIT_NO_IMPLICIT);
ret = etnaviv_gpu_fence_sync_obj(etnaviv_obj, context, write,
explicit);
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index d689e511744e..4bd1467c17b1 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -292,6 +292,8 @@ static int per_file_stats(int id, void *ptr, void *data)
struct file_stats *stats = data;
struct i915_vma *vma;
+ lockdep_assert_held(&obj->base.dev->struct_mutex);
+
stats->count++;
stats->total += obj->base.size;
if (!obj->bind_count)
@@ -476,6 +478,8 @@ static int i915_gem_object_info(struct seq_file *m, void *data)
struct drm_i915_gem_request *request;
struct task_struct *task;
+ mutex_lock(&dev->struct_mutex);
+
memset(&stats, 0, sizeof(stats));
stats.file_priv = file->driver_priv;
spin_lock(&file->table_lock);
@@ -487,7 +491,6 @@ static int i915_gem_object_info(struct seq_file *m, void *data)
* still alive (e.g. get_pid(current) => fork() => exit()).
* Therefore, we need to protect this ->comm access using RCU.
*/
- mutex_lock(&dev->struct_mutex);
request = list_first_entry_or_null(&file_priv->mm.request_list,
struct drm_i915_gem_request,
client_link);
@@ -497,6 +500,7 @@ static int i915_gem_object_info(struct seq_file *m, void *data)
PIDTYPE_PID);
print_file_stats(m, task ? task->comm : "<unknown>", stats);
rcu_read_unlock();
+
mutex_unlock(&dev->struct_mutex);
}
mutex_unlock(&dev->filelist_mutex);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 462031cbd77f..615f0a855222 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2285,8 +2285,8 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
struct page *page;
unsigned long last_pfn = 0; /* suppress gcc warning */
unsigned int max_segment;
+ gfp_t noreclaim;
int ret;
- gfp_t gfp;
/* Assert that the object is not currently in any GPU domain. As it
* wasn't in the GTT, there shouldn't be any way it could have been in
@@ -2315,22 +2315,31 @@ rebuild_st:
* Fail silently without starting the shrinker
*/
mapping = obj->base.filp->f_mapping;
- gfp = mapping_gfp_constraint(mapping, ~(__GFP_IO | __GFP_RECLAIM));
- gfp |= __GFP_NORETRY | __GFP_NOWARN;
+ noreclaim = mapping_gfp_constraint(mapping,
+ ~(__GFP_IO | __GFP_RECLAIM));
+ noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
+
sg = st->sgl;
st->nents = 0;
for (i = 0; i < page_count; i++) {
- page = shmem_read_mapping_page_gfp(mapping, i, gfp);
- if (unlikely(IS_ERR(page))) {
- i915_gem_shrink(dev_priv,
- page_count,
- I915_SHRINK_BOUND |
- I915_SHRINK_UNBOUND |
- I915_SHRINK_PURGEABLE);
+ const unsigned int shrink[] = {
+ I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
+ 0,
+ }, *s = shrink;
+ gfp_t gfp = noreclaim;
+
+ do {
page = shmem_read_mapping_page_gfp(mapping, i, gfp);
- }
- if (unlikely(IS_ERR(page))) {
- gfp_t reclaim;
+ if (likely(!IS_ERR(page)))
+ break;
+
+ if (!*s) {
+ ret = PTR_ERR(page);
+ goto err_sg;
+ }
+
+ i915_gem_shrink(dev_priv, 2 * page_count, *s++);
+ cond_resched();
/* We've tried hard to allocate the memory by reaping
* our own buffer, now let the real VM do its job and
@@ -2340,15 +2349,26 @@ rebuild_st:
* defer the oom here by reporting the ENOMEM back
* to userspace.
*/
- reclaim = mapping_gfp_mask(mapping);
- reclaim |= __GFP_NORETRY; /* reclaim, but no oom */
-
- page = shmem_read_mapping_page_gfp(mapping, i, reclaim);
- if (IS_ERR(page)) {
- ret = PTR_ERR(page);
- goto err_sg;
+ if (!*s) {
+ /* reclaim and warn, but no oom */
+ gfp = mapping_gfp_mask(mapping);
+
+ /* Our bo are always dirty and so we require
+ * kswapd to reclaim our pages (direct reclaim
+ * does not effectively begin pageout of our
+ * buffers on its own). However, direct reclaim
+ * only waits for kswapd when under allocation
+ * congestion. So as a result __GFP_RECLAIM is
+ * unreliable and fails to actually reclaim our
+ * dirty pages -- unless you try over and over
+ * again with !__GFP_NORETRY. However, we still
+ * want to fail this allocation rather than
+ * trigger the out-of-memory killer and for
+ * this we want the future __GFP_MAYFAIL.
+ */
}
- }
+ } while (1);
+
if (!i ||
sg->length >= max_segment ||
page_to_pfn(page) != last_pfn + 1) {
@@ -4222,6 +4242,7 @@ i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
mapping = obj->base.filp->f_mapping;
mapping_set_gfp_mask(mapping, mask);
+ GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
i915_gem_object_init(obj, &i915_gem_object_ops);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index a3e59c8ef27b..9ad13eeed904 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -546,11 +546,12 @@ repeat:
}
static int
-i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
+i915_gem_execbuffer_relocate_entry(struct i915_vma *vma,
struct eb_vmas *eb,
struct drm_i915_gem_relocation_entry *reloc,
struct reloc_cache *cache)
{
+ struct drm_i915_gem_object *obj = vma->obj;
struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
struct drm_gem_object *target_obj;
struct drm_i915_gem_object *target_i915_obj;
@@ -628,6 +629,16 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
return -EINVAL;
}
+ /*
+ * If we write into the object, we need to force the synchronisation
+ * barrier, either with an asynchronous clflush or if we executed the
+ * patching using the GPU (though that should be serialised by the
+ * timeline). To be completely sure, and since we are required to
+ * do relocations we are already stalling, disable the user's opt
+ * of our synchronisation.
+ */
+ vma->exec_entry->flags &= ~EXEC_OBJECT_ASYNC;
+
ret = relocate_entry(obj, reloc, cache, target_offset);
if (ret)
return ret;
@@ -678,7 +689,7 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
do {
u64 offset = r->presumed_offset;
- ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r, &cache);
+ ret = i915_gem_execbuffer_relocate_entry(vma, eb, r, &cache);
if (ret)
goto out;
@@ -726,7 +737,7 @@ i915_gem_execbuffer_relocate_vma_slow(struct i915_vma *vma,
reloc_cache_init(&cache, eb->i915);
for (i = 0; i < entry->relocation_count; i++) {
- ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i], &cache);
+ ret = i915_gem_execbuffer_relocate_entry(vma, eb, &relocs[i], &cache);
if (ret)
break;
}
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 5ddbc9499775..a74d0ac737cb 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -623,7 +623,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
* GPU processing the request, we never over-estimate the
* position of the head.
*/
- req->head = req->ring->tail;
+ req->head = req->ring->emit;
/* Check that we didn't interrupt ourselves with a new request */
GEM_BUG_ON(req->timeline->seqno != req->fence.seqno);
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 129c58bb4805..a4a920c4c454 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -123,7 +123,7 @@ struct drm_i915_gem_request {
* It is used by the driver to then queue the request for execution.
*/
struct i915_sw_fence submit;
- wait_queue_t submitq;
+ wait_queue_entry_t submitq;
wait_queue_head_t execute;
/* A list of everyone we wait upon, and everyone who waits upon us.
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 1642fff9cf13..ab5140ba108d 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -480,9 +480,7 @@ static void guc_wq_item_append(struct i915_guc_client *client,
GEM_BUG_ON(freespace < wqi_size);
/* The GuC firmware wants the tail index in QWords, not bytes */
- tail = rq->tail;
- assert_ring_tail_valid(rq->ring, rq->tail);
- tail >>= 3;
+ tail = intel_ring_set_tail(rq->ring, rq->tail) >> 3;
GEM_BUG_ON(tail > WQ_RING_TAIL_MAX);
/* For now workqueue item is 4 DWs; workqueue buffer is 2 pages. So we
diff --git a/drivers/gpu/drm/i915/i915_pvinfo.h b/drivers/gpu/drm/i915/i915_pvinfo.h
index c0cb2974caac..2cfe96d3e5d1 100644
--- a/drivers/gpu/drm/i915/i915_pvinfo.h
+++ b/drivers/gpu/drm/i915/i915_pvinfo.h
@@ -36,10 +36,6 @@
#define VGT_VERSION_MAJOR 1
#define VGT_VERSION_MINOR 0
-#define INTEL_VGT_IF_VERSION_ENCODE(major, minor) ((major) << 16 | (minor))
-#define INTEL_VGT_IF_VERSION \
- INTEL_VGT_IF_VERSION_ENCODE(VGT_VERSION_MAJOR, VGT_VERSION_MINOR)
-
/*
* notifications from guest to vgpu device model
*/
@@ -55,8 +51,8 @@ enum vgt_g2v_type {
struct vgt_if {
u64 magic; /* VGT_MAGIC */
- uint16_t version_major;
- uint16_t version_minor;
+ u16 version_major;
+ u16 version_minor;
u32 vgt_id; /* ID of vGT instance */
u32 rsv1[12]; /* pad to offset 0x40 */
/*
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
index a277f8eb7beb..380de4360b8a 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -152,7 +152,7 @@ static void __i915_sw_fence_wake_up_all(struct i915_sw_fence *fence,
struct list_head *continuation)
{
wait_queue_head_t *x = &fence->wait;
- wait_queue_t *pos, *next;
+ wait_queue_entry_t *pos, *next;
unsigned long flags;
debug_fence_deactivate(fence);
@@ -160,31 +160,30 @@ static void __i915_sw_fence_wake_up_all(struct i915_sw_fence *fence,
/*
* To prevent unbounded recursion as we traverse the graph of
- * i915_sw_fences, we move the task_list from this, the next ready
- * fence, to the tail of the original fence's task_list
+ * i915_sw_fences, we move the entry list from this, the next ready
+ * fence, to the tail of the original fence's entry list
* (and so added to the list to be woken).
*/
spin_lock_irqsave_nested(&x->lock, flags, 1 + !!continuation);
if (continuation) {
- list_for_each_entry_safe(pos, next, &x->task_list, task_list) {
+ list_for_each_entry_safe(pos, next, &x->head, entry) {
if (pos->func == autoremove_wake_function)
pos->func(pos, TASK_NORMAL, 0, continuation);
else
- list_move_tail(&pos->task_list, continuation);
+ list_move_tail(&pos->entry, continuation);
}
} else {
LIST_HEAD(extra);
do {
- list_for_each_entry_safe(pos, next,
- &x->task_list, task_list)
+ list_for_each_entry_safe(pos, next, &x->head, entry)
pos->func(pos, TASK_NORMAL, 0, &extra);
if (list_empty(&extra))
break;
- list_splice_tail_init(&extra, &x->task_list);
+ list_splice_tail_init(&extra, &x->head);
} while (1);
}
spin_unlock_irqrestore(&x->lock, flags);
@@ -254,9 +253,9 @@ void i915_sw_fence_commit(struct i915_sw_fence *fence)
__i915_sw_fence_commit(fence);
}
-static int i915_sw_fence_wake(wait_queue_t *wq, unsigned mode, int flags, void *key)
+static int i915_sw_fence_wake(wait_queue_entry_t *wq, unsigned mode, int flags, void *key)
{
- list_del(&wq->task_list);
+ list_del(&wq->entry);
__i915_sw_fence_complete(wq->private, key);
i915_sw_fence_put(wq->private);
if (wq->flags & I915_SW_FENCE_FLAG_ALLOC)
@@ -267,7 +266,7 @@ static int i915_sw_fence_wake(wait_queue_t *wq, unsigned mode, int flags, void *
static bool __i915_sw_fence_check_if_after(struct i915_sw_fence *fence,
const struct i915_sw_fence * const signaler)
{
- wait_queue_t *wq;
+ wait_queue_entry_t *wq;
if (__test_and_set_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags))
return false;
@@ -275,7 +274,7 @@ static bool __i915_sw_fence_check_if_after(struct i915_sw_fence *fence,
if (fence == signaler)
return true;
- list_for_each_entry(wq, &fence->wait.task_list, task_list) {
+ list_for_each_entry(wq, &fence->wait.head, entry) {
if (wq->func != i915_sw_fence_wake)
continue;
@@ -288,12 +287,12 @@ static bool __i915_sw_fence_check_if_after(struct i915_sw_fence *fence,
static void __i915_sw_fence_clear_checked_bit(struct i915_sw_fence *fence)
{
- wait_queue_t *wq;
+ wait_queue_entry_t *wq;
if (!__test_and_clear_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags))
return;
- list_for_each_entry(wq, &fence->wait.task_list, task_list) {
+ list_for_each_entry(wq, &fence->wait.head, entry) {
if (wq->func != i915_sw_fence_wake)
continue;
@@ -320,7 +319,7 @@ static bool i915_sw_fence_check_if_after(struct i915_sw_fence *fence,
static int __i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
struct i915_sw_fence *signaler,
- wait_queue_t *wq, gfp_t gfp)
+ wait_queue_entry_t *wq, gfp_t gfp)
{
unsigned long flags;
int pending;
@@ -350,7 +349,7 @@ static int __i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
pending |= I915_SW_FENCE_FLAG_ALLOC;
}
- INIT_LIST_HEAD(&wq->task_list);
+ INIT_LIST_HEAD(&wq->entry);
wq->flags = pending;
wq->func = i915_sw_fence_wake;
wq->private = i915_sw_fence_get(fence);
@@ -359,7 +358,7 @@ static int __i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
spin_lock_irqsave(&signaler->wait.lock, flags);
if (likely(!i915_sw_fence_done(signaler))) {
- __add_wait_queue_tail(&signaler->wait, wq);
+ __add_wait_queue_entry_tail(&signaler->wait, wq);
pending = 1;
} else {
i915_sw_fence_wake(wq, 0, 0, NULL);
@@ -372,7 +371,7 @@ static int __i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
struct i915_sw_fence *signaler,
- wait_queue_t *wq)
+ wait_queue_entry_t *wq)
{
return __i915_sw_fence_await_sw_fence(fence, signaler, wq, 0);
}
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
index d31cefbbcc04..fd3c3bf6c8b7 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.h
+++ b/drivers/gpu/drm/i915/i915_sw_fence.h
@@ -66,7 +66,7 @@ void i915_sw_fence_commit(struct i915_sw_fence *fence);
int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
struct i915_sw_fence *after,
- wait_queue_t *wq);
+ wait_queue_entry_t *wq);
int i915_sw_fence_await_sw_fence_gfp(struct i915_sw_fence *fence,
struct i915_sw_fence *after,
gfp_t gfp);
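For readers following the wait_queue_t rename in these hunks: a minimal sketch, assuming only the generic <linux/wait.h> API, of how a custom wake callback looks under the new names (wait_queue_entry_t, ->entry on the entry, ->head on the head). my_wake/my_wait_on are illustrative names, not part of this patch.

#include <linux/wait.h>
#include <linux/sched.h>

static int my_wake(wait_queue_entry_t *wq, unsigned int mode, int flags, void *key)
{
	/* unlink from the head's ->head list; this used to be &wq->task_list */
	list_del(&wq->entry);
	return 0;
}

static void my_wait_on(wait_queue_head_t *head, wait_queue_entry_t *wq)
{
	/* caller is assumed to hold head->lock */
	INIT_LIST_HEAD(&wq->entry);
	wq->flags = 0;
	wq->func = my_wake;
	wq->private = current;
	__add_wait_queue_entry_tail(head, wq);	/* was __add_wait_queue_tail() */
}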
diff --git a/drivers/gpu/drm/i915/i915_vgpu.c b/drivers/gpu/drm/i915/i915_vgpu.c
index 4ab8a973b61f..2e739018fb4c 100644
--- a/drivers/gpu/drm/i915/i915_vgpu.c
+++ b/drivers/gpu/drm/i915/i915_vgpu.c
@@ -60,8 +60,8 @@
*/
void i915_check_vgpu(struct drm_i915_private *dev_priv)
{
- uint64_t magic;
- uint32_t version;
+ u64 magic;
+ u16 version_major;
BUILD_BUG_ON(sizeof(struct vgt_if) != VGT_PVINFO_SIZE);
@@ -69,10 +69,8 @@ void i915_check_vgpu(struct drm_i915_private *dev_priv)
if (magic != VGT_MAGIC)
return;
- version = INTEL_VGT_IF_VERSION_ENCODE(
- __raw_i915_read16(dev_priv, vgtif_reg(version_major)),
- __raw_i915_read16(dev_priv, vgtif_reg(version_minor)));
- if (version != INTEL_VGT_IF_VERSION) {
+ version_major = __raw_i915_read16(dev_priv, vgtif_reg(version_major));
+ if (version_major < VGT_VERSION_MAJOR) {
DRM_INFO("VGT interface version mismatch!\n");
return;
}
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 1aba47024656..f066e2d785f5 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -650,6 +650,11 @@ int i915_vma_unbind(struct i915_vma *vma)
break;
}
+ if (!ret) {
+ ret = i915_gem_active_retire(&vma->last_fence,
+ &vma->vm->i915->drm.struct_mutex);
+ }
+
__i915_vma_unpin(vma);
if (ret)
return ret;
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 569717a12723..9106ea32b048 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -120,7 +120,8 @@ static void intel_crtc_init_scalers(struct intel_crtc *crtc,
static void skylake_pfit_enable(struct intel_crtc *crtc);
static void ironlake_pfit_disable(struct intel_crtc *crtc, bool force);
static void ironlake_pfit_enable(struct intel_crtc *crtc);
-static void intel_modeset_setup_hw_state(struct drm_device *dev);
+static void intel_modeset_setup_hw_state(struct drm_device *dev,
+ struct drm_modeset_acquire_ctx *ctx);
static void intel_pre_disable_primary_noatomic(struct drm_crtc *crtc);
struct intel_limit {
@@ -3449,7 +3450,7 @@ __intel_display_resume(struct drm_device *dev,
struct drm_crtc *crtc;
int i, ret;
- intel_modeset_setup_hw_state(dev);
+ intel_modeset_setup_hw_state(dev, ctx);
i915_redisable_vga(to_i915(dev));
if (!state)
@@ -4598,7 +4599,7 @@ static void cpt_verify_modeset(struct drm_device *dev, int pipe)
static int
skl_update_scaler(struct intel_crtc_state *crtc_state, bool force_detach,
- unsigned scaler_user, int *scaler_id, unsigned int rotation,
+ unsigned int scaler_user, int *scaler_id,
int src_w, int src_h, int dst_w, int dst_h)
{
struct intel_crtc_scaler_state *scaler_state =
@@ -4607,9 +4608,12 @@ skl_update_scaler(struct intel_crtc_state *crtc_state, bool force_detach,
to_intel_crtc(crtc_state->base.crtc);
int need_scaling;
- need_scaling = drm_rotation_90_or_270(rotation) ?
- (src_h != dst_w || src_w != dst_h):
- (src_w != dst_w || src_h != dst_h);
+ /*
+ * Src coordinates are already rotated by 270 degrees for
+ * the 90/270 degree plane rotation cases (to match the
+ * GTT mapping), hence no need to account for rotation here.
+ */
+ need_scaling = src_w != dst_w || src_h != dst_h;
/*
* if plane is being disabled or scaler is no more required or force detach
@@ -4671,7 +4675,7 @@ int skl_update_scaler_crtc(struct intel_crtc_state *state)
const struct drm_display_mode *adjusted_mode = &state->base.adjusted_mode;
return skl_update_scaler(state, !state->base.active, SKL_CRTC_INDEX,
- &state->scaler_state.scaler_id, DRM_ROTATE_0,
+ &state->scaler_state.scaler_id,
state->pipe_src_w, state->pipe_src_h,
adjusted_mode->crtc_hdisplay, adjusted_mode->crtc_vdisplay);
}
@@ -4700,7 +4704,6 @@ static int skl_update_scaler_plane(struct intel_crtc_state *crtc_state,
ret = skl_update_scaler(crtc_state, force_detach,
drm_plane_index(&intel_plane->base),
&plane_state->scaler_id,
- plane_state->base.rotation,
drm_rect_width(&plane_state->base.src) >> 16,
drm_rect_height(&plane_state->base.src) >> 16,
drm_rect_width(&plane_state->base.dst),
@@ -5823,7 +5826,8 @@ static void i9xx_crtc_disable(struct intel_crtc_state *old_crtc_state,
intel_update_watermarks(intel_crtc);
}
-static void intel_crtc_disable_noatomic(struct drm_crtc *crtc)
+static void intel_crtc_disable_noatomic(struct drm_crtc *crtc,
+ struct drm_modeset_acquire_ctx *ctx)
{
struct intel_encoder *encoder;
struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
@@ -5853,7 +5857,7 @@ static void intel_crtc_disable_noatomic(struct drm_crtc *crtc)
return;
}
- state->acquire_ctx = crtc->dev->mode_config.acquire_ctx;
+ state->acquire_ctx = ctx;
/* Everything's already locked, -EDEADLK can't happen. */
crtc_state = intel_atomic_get_crtc_state(state, intel_crtc);
@@ -15028,7 +15032,7 @@ int intel_modeset_init(struct drm_device *dev)
intel_setup_outputs(dev_priv);
drm_modeset_lock_all(dev);
- intel_modeset_setup_hw_state(dev);
+ intel_modeset_setup_hw_state(dev, dev->mode_config.acquire_ctx);
drm_modeset_unlock_all(dev);
for_each_intel_crtc(dev, crtc) {
@@ -15065,13 +15069,13 @@ int intel_modeset_init(struct drm_device *dev)
return 0;
}
-static void intel_enable_pipe_a(struct drm_device *dev)
+static void intel_enable_pipe_a(struct drm_device *dev,
+ struct drm_modeset_acquire_ctx *ctx)
{
struct intel_connector *connector;
struct drm_connector_list_iter conn_iter;
struct drm_connector *crt = NULL;
struct intel_load_detect_pipe load_detect_temp;
- struct drm_modeset_acquire_ctx *ctx = dev->mode_config.acquire_ctx;
int ret;
/* We can't just switch on the pipe A, we need to set things up with a
@@ -15143,7 +15147,8 @@ static bool has_pch_trancoder(struct drm_i915_private *dev_priv,
(HAS_PCH_LPT_H(dev_priv) && pch_transcoder == TRANSCODER_A);
}
-static void intel_sanitize_crtc(struct intel_crtc *crtc)
+static void intel_sanitize_crtc(struct intel_crtc *crtc,
+ struct drm_modeset_acquire_ctx *ctx)
{
struct drm_device *dev = crtc->base.dev;
struct drm_i915_private *dev_priv = to_i915(dev);
@@ -15189,7 +15194,7 @@ static void intel_sanitize_crtc(struct intel_crtc *crtc)
plane = crtc->plane;
crtc->base.primary->state->visible = true;
crtc->plane = !plane;
- intel_crtc_disable_noatomic(&crtc->base);
+ intel_crtc_disable_noatomic(&crtc->base, ctx);
crtc->plane = plane;
}
@@ -15199,13 +15204,13 @@ static void intel_sanitize_crtc(struct intel_crtc *crtc)
* resume. Force-enable the pipe to fix this, the update_dpms
* call below we restore the pipe to the right state, but leave
* the required bits on. */
- intel_enable_pipe_a(dev);
+ intel_enable_pipe_a(dev, ctx);
}
/* Adjust the state of the output pipe according to whether we
* have active connectors/encoders. */
if (crtc->active && !intel_crtc_has_encoders(crtc))
- intel_crtc_disable_noatomic(&crtc->base);
+ intel_crtc_disable_noatomic(&crtc->base, ctx);
if (crtc->active || HAS_GMCH_DISPLAY(dev_priv)) {
/*
@@ -15503,7 +15508,8 @@ get_encoder_power_domains(struct drm_i915_private *dev_priv)
* and sanitizes it to the current state
*/
static void
-intel_modeset_setup_hw_state(struct drm_device *dev)
+intel_modeset_setup_hw_state(struct drm_device *dev,
+ struct drm_modeset_acquire_ctx *ctx)
{
struct drm_i915_private *dev_priv = to_i915(dev);
enum pipe pipe;
@@ -15523,7 +15529,7 @@ intel_modeset_setup_hw_state(struct drm_device *dev)
for_each_pipe(dev_priv, pipe) {
crtc = intel_get_crtc_for_pipe(dev_priv, pipe);
- intel_sanitize_crtc(crtc);
+ intel_sanitize_crtc(crtc, ctx);
intel_dump_pipe_config(crtc, crtc->config,
"[setup_hw_state]");
}
diff --git a/drivers/gpu/drm/i915/intel_dp_aux_backlight.c b/drivers/gpu/drm/i915/intel_dp_aux_backlight.c
index 6532e226db29..40ba3134545e 100644
--- a/drivers/gpu/drm/i915/intel_dp_aux_backlight.c
+++ b/drivers/gpu/drm/i915/intel_dp_aux_backlight.c
@@ -119,8 +119,6 @@ static int intel_dp_aux_setup_backlight(struct intel_connector *connector,
struct intel_dp *intel_dp = enc_to_intel_dp(&connector->encoder->base);
struct intel_panel *panel = &connector->panel;
- intel_dp_aux_enable_backlight(connector);
-
if (intel_dp->edp_dpcd[2] & DP_EDP_BACKLIGHT_BRIGHTNESS_BYTE_COUNT)
panel->backlight.max = 0xFFFF;
else
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index dac4e003c1f3..62f44d3e7c43 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -326,8 +326,7 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq)
rq->ctx->ppgtt ?: rq->i915->mm.aliasing_ppgtt;
u32 *reg_state = ce->lrc_reg_state;
- assert_ring_tail_valid(rq->ring, rq->tail);
- reg_state[CTX_RING_TAIL+1] = rq->tail;
+ reg_state[CTX_RING_TAIL+1] = intel_ring_set_tail(rq->ring, rq->tail);
/* True 32b PPGTT with dynamic page allocation: update PDP
* registers and point the unallocated PDPs to scratch page.
@@ -2036,8 +2035,7 @@ void intel_lr_context_resume(struct drm_i915_private *dev_priv)
ce->state->obj->mm.dirty = true;
i915_gem_object_unpin_map(ce->state->obj);
- ce->ring->head = ce->ring->tail = 0;
- intel_ring_update_space(ce->ring);
+ intel_ring_reset(ce->ring, 0);
}
}
}
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 2ca481b5aa69..078fd1bfa5ea 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -3373,20 +3373,26 @@ skl_plane_downscale_amount(const struct intel_crtc_state *cstate,
/* n.b., src is 16.16 fixed point, dst is whole integer */
if (plane->id == PLANE_CURSOR) {
+ /*
+ * Cursors only support 0/180 degree rotation,
+ * hence no need to account for rotation here.
+ */
src_w = pstate->base.src_w;
src_h = pstate->base.src_h;
dst_w = pstate->base.crtc_w;
dst_h = pstate->base.crtc_h;
} else {
+ /*
+ * Src coordinates are already rotated by 270 degrees for
+ * the 90/270 degree plane rotation cases (to match the
+ * GTT mapping), hence no need to account for rotation here.
+ */
src_w = drm_rect_width(&pstate->base.src);
src_h = drm_rect_height(&pstate->base.src);
dst_w = drm_rect_width(&pstate->base.dst);
dst_h = drm_rect_height(&pstate->base.dst);
}
- if (drm_rotation_90_or_270(pstate->base.rotation))
- swap(dst_w, dst_h);
-
downscale_h = max(src_h / dst_h, (uint32_t)DRM_PLANE_HELPER_NO_SCALING);
downscale_w = max(src_w / dst_w, (uint32_t)DRM_PLANE_HELPER_NO_SCALING);
@@ -3417,12 +3423,14 @@ skl_plane_relative_data_rate(const struct intel_crtc_state *cstate,
if (y && format != DRM_FORMAT_NV12)
return 0;
+ /*
+ * Src coordinates are already rotated by 270 degrees for
+ * the 90/270 degree plane rotation cases (to match the
+ * GTT mapping), hence no need to account for rotation here.
+ */
width = drm_rect_width(&intel_pstate->base.src) >> 16;
height = drm_rect_height(&intel_pstate->base.src) >> 16;
- if (drm_rotation_90_or_270(pstate->rotation))
- swap(width, height);
-
/* for planar format */
if (format == DRM_FORMAT_NV12) {
if (y) /* y-plane data rate */
@@ -3505,12 +3513,14 @@ skl_ddb_min_alloc(const struct drm_plane_state *pstate,
fb->modifier != I915_FORMAT_MOD_Yf_TILED)
return 8;
+ /*
+ * Src coordinates are already rotated by 270 degrees for
+ * the 90/270 degree plane rotation cases (to match the
+ * GTT mapping), hence no need to account for rotation here.
+ */
src_w = drm_rect_width(&intel_pstate->base.src) >> 16;
src_h = drm_rect_height(&intel_pstate->base.src) >> 16;
- if (drm_rotation_90_or_270(pstate->rotation))
- swap(src_w, src_h);
-
/* Halve UV plane width and height for NV12 */
if (fb->format->format == DRM_FORMAT_NV12 && !y) {
src_w /= 2;
@@ -3794,13 +3804,15 @@ static int skl_compute_plane_wm(const struct drm_i915_private *dev_priv,
width = intel_pstate->base.crtc_w;
height = intel_pstate->base.crtc_h;
} else {
+ /*
+ * Src coordinates are already rotated by 270 degrees for
+ * the 90/270 degree plane rotation cases (to match the
+ * GTT mapping), hence no need to account for rotation here.
+ */
width = drm_rect_width(&intel_pstate->base.src) >> 16;
height = drm_rect_height(&intel_pstate->base.src) >> 16;
}
- if (drm_rotation_90_or_270(pstate->rotation))
- swap(width, height);
-
cpp = fb->format->cpp[0];
plane_pixel_rate = skl_adjusted_plane_pixel_rate(cstate, intel_pstate);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 66a2b8b83972..513a0f4b469b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -49,7 +49,7 @@ static int __intel_ring_space(int head, int tail, int size)
void intel_ring_update_space(struct intel_ring *ring)
{
- ring->space = __intel_ring_space(ring->head, ring->tail, ring->size);
+ ring->space = __intel_ring_space(ring->head, ring->emit, ring->size);
}
static int
@@ -774,8 +774,8 @@ static void i9xx_submit_request(struct drm_i915_gem_request *request)
i915_gem_request_submit(request);
- assert_ring_tail_valid(request->ring, request->tail);
- I915_WRITE_TAIL(request->engine, request->tail);
+ I915_WRITE_TAIL(request->engine,
+ intel_ring_set_tail(request->ring, request->tail));
}
static void i9xx_emit_breadcrumb(struct drm_i915_gem_request *req, u32 *cs)
@@ -1316,11 +1316,23 @@ err:
return PTR_ERR(addr);
}
+void intel_ring_reset(struct intel_ring *ring, u32 tail)
+{
+ GEM_BUG_ON(!list_empty(&ring->request_list));
+ ring->tail = tail;
+ ring->head = tail;
+ ring->emit = tail;
+ intel_ring_update_space(ring);
+}
+
void intel_ring_unpin(struct intel_ring *ring)
{
GEM_BUG_ON(!ring->vma);
GEM_BUG_ON(!ring->vaddr);
+ /* Discard any unused bytes beyond that submitted to hw. */
+ intel_ring_reset(ring, ring->tail);
+
if (i915_vma_is_map_and_fenceable(ring->vma))
i915_vma_unpin_iomap(ring->vma);
else
@@ -1562,8 +1574,9 @@ void intel_legacy_submission_resume(struct drm_i915_private *dev_priv)
struct intel_engine_cs *engine;
enum intel_engine_id id;
+ /* Restart from the beginning of the rings for convenience */
for_each_engine(engine, dev_priv, id)
- engine->buffer->head = engine->buffer->tail;
+ intel_ring_reset(engine->buffer, 0);
}
static int ring_request_alloc(struct drm_i915_gem_request *request)
@@ -1616,7 +1629,7 @@ static int wait_for_space(struct drm_i915_gem_request *req, int bytes)
unsigned space;
/* Would completion of this request free enough space? */
- space = __intel_ring_space(target->postfix, ring->tail,
+ space = __intel_ring_space(target->postfix, ring->emit,
ring->size);
if (space >= bytes)
break;
@@ -1641,8 +1654,8 @@ static int wait_for_space(struct drm_i915_gem_request *req, int bytes)
u32 *intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
{
struct intel_ring *ring = req->ring;
- int remain_actual = ring->size - ring->tail;
- int remain_usable = ring->effective_size - ring->tail;
+ int remain_actual = ring->size - ring->emit;
+ int remain_usable = ring->effective_size - ring->emit;
int bytes = num_dwords * sizeof(u32);
int total_bytes, wait_bytes;
bool need_wrap = false;
@@ -1678,17 +1691,17 @@ u32 *intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
if (unlikely(need_wrap)) {
GEM_BUG_ON(remain_actual > ring->space);
- GEM_BUG_ON(ring->tail + remain_actual > ring->size);
+ GEM_BUG_ON(ring->emit + remain_actual > ring->size);
/* Fill the tail with MI_NOOP */
- memset(ring->vaddr + ring->tail, 0, remain_actual);
- ring->tail = 0;
+ memset(ring->vaddr + ring->emit, 0, remain_actual);
+ ring->emit = 0;
ring->space -= remain_actual;
}
- GEM_BUG_ON(ring->tail > ring->size - bytes);
- cs = ring->vaddr + ring->tail;
- ring->tail += bytes;
+ GEM_BUG_ON(ring->emit > ring->size - bytes);
+ cs = ring->vaddr + ring->emit;
+ ring->emit += bytes;
ring->space -= bytes;
GEM_BUG_ON(ring->space < 0);
@@ -1699,7 +1712,7 @@ u32 *intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
int intel_ring_cacheline_align(struct drm_i915_gem_request *req)
{
int num_dwords =
- (req->ring->tail & (CACHELINE_BYTES - 1)) / sizeof(uint32_t);
+ (req->ring->emit & (CACHELINE_BYTES - 1)) / sizeof(uint32_t);
u32 *cs;
if (num_dwords == 0)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index a82a0807f64d..f7144fe09613 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -145,6 +145,7 @@ struct intel_ring {
u32 head;
u32 tail;
+ u32 emit;
int space;
int size;
@@ -488,6 +489,8 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
struct intel_ring *
intel_engine_create_ring(struct intel_engine_cs *engine, int size);
int intel_ring_pin(struct intel_ring *ring, unsigned int offset_bias);
+void intel_ring_reset(struct intel_ring *ring, u32 tail);
+void intel_ring_update_space(struct intel_ring *ring);
void intel_ring_unpin(struct intel_ring *ring);
void intel_ring_free(struct intel_ring *ring);
@@ -511,7 +514,7 @@ intel_ring_advance(struct drm_i915_gem_request *req, u32 *cs)
* reserved for the command packet (i.e. the value passed to
* intel_ring_begin()).
*/
- GEM_BUG_ON((req->ring->vaddr + req->ring->tail) != cs);
+ GEM_BUG_ON((req->ring->vaddr + req->ring->emit) != cs);
}
static inline u32
@@ -540,7 +543,19 @@ assert_ring_tail_valid(const struct intel_ring *ring, unsigned int tail)
GEM_BUG_ON(tail >= ring->size);
}
-void intel_ring_update_space(struct intel_ring *ring);
+static inline unsigned int
+intel_ring_set_tail(struct intel_ring *ring, unsigned int tail)
+{
+ /* Whilst writes to the tail are strictly ordered, there is no
+ * serialisation between readers and the writers. The tail may be
+ * read by i915_gem_request_retire() just as it is being updated
+ * by execlists, as although the breadcrumb is complete, the context
+ * switch hasn't been seen.
+ */
+ assert_ring_tail_valid(ring, tail);
+ ring->tail = tail;
+ return tail;
+}
void intel_engine_init_global_seqno(struct intel_engine_cs *engine, u32 seqno);
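The emit/tail split added above separates the CPU-side write pointer from the hardware-visible tail. A rough sketch of the intended flow, assuming the struct intel_ring fields shown in this patch; example_emit/example_submit are illustrative only, not the driver's actual request path.

static u32 *example_emit(struct intel_ring *ring, int num_dwords)
{
	u32 *cs = ring->vaddr + ring->emit;	/* CPU-side write pointer */

	ring->emit += num_dwords * sizeof(u32);
	ring->space -= num_dwords * sizeof(u32);
	return cs;
}

static void example_submit(struct intel_ring *ring)
{
	/* the hardware-visible tail only advances at submission time */
	intel_ring_set_tail(ring, ring->emit);
}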
diff --git a/drivers/gpu/drm/mgag200/mgag200_mode.c b/drivers/gpu/drm/mgag200/mgag200_mode.c
index adb411a078e8..f4b53588e071 100644
--- a/drivers/gpu/drm/mgag200/mgag200_mode.c
+++ b/drivers/gpu/drm/mgag200/mgag200_mode.c
@@ -1173,7 +1173,10 @@ static int mga_crtc_mode_set(struct drm_crtc *crtc,
if (IS_G200_SE(mdev)) {
- if (mdev->unique_rev_id >= 0x02) {
+ if (mdev->unique_rev_id >= 0x04) {
+ WREG8(MGAREG_CRTCEXT_INDEX, 0x06);
+ WREG8(MGAREG_CRTCEXT_DATA, 0);
+ } else if (mdev->unique_rev_id >= 0x02) {
u8 hi_pri_lvl;
u32 bpp;
u32 mb;
@@ -1639,6 +1642,10 @@ static int mga_vga_mode_valid(struct drm_connector *connector,
if (mga_vga_calculate_mode_bandwidth(mode, bpp)
> (30100 * 1024))
return MODE_BANDWIDTH;
+ } else {
+ if (mga_vga_calculate_mode_bandwidth(mode, bpp)
+ > (55000 * 1024))
+ return MODE_BANDWIDTH;
}
} else if (mdev->type == G200_WB) {
if (mode->hdisplay > 1280)
diff --git a/drivers/gpu/drm/mxsfb/mxsfb_crtc.c b/drivers/gpu/drm/mxsfb/mxsfb_crtc.c
index 1144e0c9e894..0abe77675b76 100644
--- a/drivers/gpu/drm/mxsfb/mxsfb_crtc.c
+++ b/drivers/gpu/drm/mxsfb/mxsfb_crtc.c
@@ -35,6 +35,13 @@
#include "mxsfb_drv.h"
#include "mxsfb_regs.h"
+#define MXS_SET_ADDR 0x4
+#define MXS_CLR_ADDR 0x8
+#define MODULE_CLKGATE BIT(30)
+#define MODULE_SFTRST BIT(31)
+/* 1 second delay should be plenty of time for block reset */
+#define RESET_TIMEOUT 1000000
+
static u32 set_hsync_pulse_width(struct mxsfb_drm_private *mxsfb, u32 val)
{
return (val & mxsfb->devdata->hs_wdth_mask) <<
@@ -159,6 +166,36 @@ static void mxsfb_disable_controller(struct mxsfb_drm_private *mxsfb)
clk_disable_unprepare(mxsfb->clk_disp_axi);
}
+/*
+ * Clear the bit and poll it cleared. This is usually called with
+ * a reset address and mask being either SFTRST(bit 31) or CLKGATE
+ * (bit 30).
+ */
+static int clear_poll_bit(void __iomem *addr, u32 mask)
+{
+ u32 reg;
+
+ writel(mask, addr + MXS_CLR_ADDR);
+ return readl_poll_timeout(addr, reg, !(reg & mask), 0, RESET_TIMEOUT);
+}
+
+static int mxsfb_reset_block(void __iomem *reset_addr)
+{
+ int ret;
+
+ ret = clear_poll_bit(reset_addr, MODULE_SFTRST);
+ if (ret)
+ return ret;
+
+ writel(MODULE_CLKGATE, reset_addr + MXS_CLR_ADDR);
+
+ ret = clear_poll_bit(reset_addr, MODULE_SFTRST);
+ if (ret)
+ return ret;
+
+ return clear_poll_bit(reset_addr, MODULE_CLKGATE);
+}
+
static void mxsfb_crtc_mode_set_nofb(struct mxsfb_drm_private *mxsfb)
{
struct drm_display_mode *m = &mxsfb->pipe.crtc.state->adjusted_mode;
@@ -173,6 +210,11 @@ static void mxsfb_crtc_mode_set_nofb(struct mxsfb_drm_private *mxsfb)
*/
mxsfb_enable_axi_clk(mxsfb);
+ /* Mandatory eLCDIF reset as per the Reference Manual */
+ err = mxsfb_reset_block(mxsfb->base);
+ if (err)
+ return;
+
/* Clear the FIFOs */
writel(CTRL1_FIFO_CLEAR, mxsfb->base + LCDC_CTRL1 + REG_SET);
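The reset helper above relies on the MXS set/clear register convention: each register has shadow addresses at +0x4 and +0x8 whose writes set or clear the given bits. A small sketch of that convention, assuming the MXS_SET_ADDR/MXS_CLR_ADDR offsets defined in this patch (helper names are illustrative):

#include <linux/io.h>

static void mxs_set_bits(void __iomem *reg, u32 mask)
{
	writel(mask, reg + MXS_SET_ADDR);	/* reg |= mask */
}

static void mxs_clr_bits(void __iomem *reg, u32 mask)
{
	writel(mask, reg + MXS_CLR_ADDR);	/* reg &= ~mask */
}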
diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
index 008c145b7f29..ca44233ceacc 100644
--- a/drivers/gpu/drm/radeon/cik.c
+++ b/drivers/gpu/drm/radeon/cik.c
@@ -9267,8 +9267,11 @@ static void dce8_program_watermarks(struct radeon_device *rdev,
u32 tmp, wm_mask;
if (radeon_crtc->base.enabled && num_heads && mode) {
- active_time = 1000000UL * (u32)mode->crtc_hdisplay / (u32)mode->clock;
- line_time = min((u32) (1000000UL * (u32)mode->crtc_htotal / (u32)mode->clock), (u32)65535);
+ active_time = (u32) div_u64((u64)mode->crtc_hdisplay * 1000000,
+ (u32)mode->clock);
+ line_time = (u32) div_u64((u64)mode->crtc_htotal * 1000000,
+ (u32)mode->clock);
+ line_time = min(line_time, (u32)65535);
/* watermark for high clocks */
if ((rdev->pm.pm_method == PM_METHOD_DPM) &&
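The same div_u64() conversion recurs in evergreen.c and si.c below; the point is that the 32-bit product 1000000UL * crtc_htotal can wrap on 32-bit builds. A sketch of the overflow-safe form, with example values only:

#include <linux/math64.h>

static u32 example_line_time(u32 crtc_htotal, u32 clock_khz)
{
	/*
	 * 1000000UL * crtc_htotal may exceed 32 bits (an htotal around
	 * 4400 already gives ~4.4e9), so multiply in 64 bits and let
	 * div_u64() do the division.
	 */
	return (u32)div_u64((u64)crtc_htotal * 1000000, clock_khz);
}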
diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c
index 0bf103536404..534637203e70 100644
--- a/drivers/gpu/drm/radeon/evergreen.c
+++ b/drivers/gpu/drm/radeon/evergreen.c
@@ -2266,8 +2266,11 @@ static void evergreen_program_watermarks(struct radeon_device *rdev,
fixed20_12 a, b, c;
if (radeon_crtc->base.enabled && num_heads && mode) {
- active_time = 1000000UL * (u32)mode->crtc_hdisplay / (u32)mode->clock;
- line_time = min((u32) (1000000UL * (u32)mode->crtc_htotal / (u32)mode->clock), (u32)65535);
+ active_time = (u32) div_u64((u64)mode->crtc_hdisplay * 1000000,
+ (u32)mode->clock);
+ line_time = (u32) div_u64((u64)mode->crtc_htotal * 1000000,
+ (u32)mode->clock);
+ line_time = min(line_time, (u32)65535);
priority_a_cnt = 0;
priority_b_cnt = 0;
dram_channels = evergreen_get_number_of_dram_channels(rdev);
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index c1c8e2208a21..e562a78510ff 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -375,7 +375,7 @@ struct radeon_fence {
unsigned ring;
bool is_vm_update;
- wait_queue_t fence_wake;
+ wait_queue_entry_t fence_wake;
};
int radeon_fence_driver_start_ring(struct radeon_device *rdev, int ring);
diff --git a/drivers/gpu/drm/radeon/radeon_combios.c b/drivers/gpu/drm/radeon/radeon_combios.c
index 432480ff9d22..3178ba0c537c 100644
--- a/drivers/gpu/drm/radeon/radeon_combios.c
+++ b/drivers/gpu/drm/radeon/radeon_combios.c
@@ -3393,6 +3393,13 @@ void radeon_combios_asic_init(struct drm_device *dev)
rdev->pdev->subsystem_vendor == 0x103c &&
rdev->pdev->subsystem_device == 0x280a)
return;
+ /* quirk for rs4xx Toshiba Satellite L20-183 laptop to make it resume
+ * - it hangs on resume inside the dynclk 1 table.
+ */
+ if (rdev->family == CHIP_RS400 &&
+ rdev->pdev->subsystem_vendor == 0x1179 &&
+ rdev->pdev->subsystem_device == 0xff31)
+ return;
/* DYN CLK 1 */
table = combios_get_table_offset(dev, COMBIOS_DYN_CLK_1_TABLE);
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index 6ecf42783d4b..0a6444d72000 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -136,6 +136,10 @@ static struct radeon_px_quirk radeon_px_quirk_list[] = {
* https://bugzilla.kernel.org/show_bug.cgi?id=51381
*/
{ PCI_VENDOR_ID_ATI, 0x6840, 0x1043, 0x2122, RADEON_PX_QUIRK_DISABLE_PX },
+ /* Asus K53TK laptop with AMD A6-3420M APU and Radeon 7670m GPU
+ * https://bugs.freedesktop.org/show_bug.cgi?id=101491
+ */
+ { PCI_VENDOR_ID_ATI, 0x6741, 0x1043, 0x2122, RADEON_PX_QUIRK_DISABLE_PX },
/* macbook pro 8.2 */
{ PCI_VENDOR_ID_ATI, 0x6741, PCI_VENDOR_ID_APPLE, 0x00e2, RADEON_PX_QUIRK_LONG_WAKEUP },
{ 0, 0, 0, 0, 0 },
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index ef09f0a63754..e86f2bd38410 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -158,7 +158,7 @@ int radeon_fence_emit(struct radeon_device *rdev,
* for the fence locking itself, so unlocked variants are used for
* fence_signal, and remove_wait_queue.
*/
-static int radeon_fence_check_signaled(wait_queue_t *wait, unsigned mode, int flags, void *key)
+static int radeon_fence_check_signaled(wait_queue_entry_t *wait, unsigned mode, int flags, void *key)
{
struct radeon_fence *fence;
u64 seq;
diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c b/drivers/gpu/drm/radeon/radeon_uvd.c
index 7431eb4a11b7..d34d1cf33895 100644
--- a/drivers/gpu/drm/radeon/radeon_uvd.c
+++ b/drivers/gpu/drm/radeon/radeon_uvd.c
@@ -621,7 +621,7 @@ static int radeon_uvd_cs_reloc(struct radeon_cs_parser *p,
}
/* TODO: is this still necessary on NI+ ? */
- if ((cmd == 0 || cmd == 1 || cmd == 0x3) &&
+ if ((cmd == 0 || cmd == 0x3) &&
(start >> 28) != (p->rdev->uvd.gpu_addr >> 28)) {
DRM_ERROR("msg/fb buffer %LX-%LX out of 256MB segment!\n",
start, end);
diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c
index 76d1888528e6..5303f25d5280 100644
--- a/drivers/gpu/drm/radeon/si.c
+++ b/drivers/gpu/drm/radeon/si.c
@@ -2284,8 +2284,11 @@ static void dce6_program_watermarks(struct radeon_device *rdev,
fixed20_12 a, b, c;
if (radeon_crtc->base.enabled && num_heads && mode) {
- active_time = 1000000UL * (u32)mode->crtc_hdisplay / (u32)mode->clock;
- line_time = min((u32) (1000000UL * (u32)mode->crtc_htotal / (u32)mode->clock), (u32)65535);
+ active_time = (u32) div_u64((u64)mode->crtc_hdisplay * 1000000,
+ (u32)mode->clock);
+ line_time = (u32) div_u64((u64)mode->crtc_htotal * 1000000,
+ (u32)mode->clock);
+ line_time = min(line_time, (u32)65535);
priority_a_cnt = 0;
priority_b_cnt = 0;
diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index 9a1e34e48f64..81f86a67c10d 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -451,18 +451,6 @@ fail:
#ifdef CONFIG_DRM_TEGRA_STAGING
-static struct tegra_drm_context *
-tegra_drm_file_get_context(struct tegra_drm_file *file, u32 id)
-{
- struct tegra_drm_context *context;
-
- mutex_lock(&file->lock);
- context = idr_find(&file->contexts, id);
- mutex_unlock(&file->lock);
-
- return context;
-}
-
static int tegra_gem_create(struct drm_device *drm, void *data,
struct drm_file *file)
{
@@ -551,7 +539,7 @@ static int tegra_client_open(struct tegra_drm_file *fpriv,
if (err < 0)
return err;
- err = idr_alloc(&fpriv->contexts, context, 0, 0, GFP_KERNEL);
+ err = idr_alloc(&fpriv->contexts, context, 1, 0, GFP_KERNEL);
if (err < 0) {
client->ops->close_channel(context);
return err;
@@ -606,7 +594,7 @@ static int tegra_close_channel(struct drm_device *drm, void *data,
mutex_lock(&fpriv->lock);
- context = tegra_drm_file_get_context(fpriv, args->context);
+ context = idr_find(&fpriv->contexts, args->context);
if (!context) {
err = -EINVAL;
goto unlock;
@@ -631,7 +619,7 @@ static int tegra_get_syncpt(struct drm_device *drm, void *data,
mutex_lock(&fpriv->lock);
- context = tegra_drm_file_get_context(fpriv, args->context);
+ context = idr_find(&fpriv->contexts, args->context);
if (!context) {
err = -ENODEV;
goto unlock;
@@ -660,7 +648,7 @@ static int tegra_submit(struct drm_device *drm, void *data,
mutex_lock(&fpriv->lock);
- context = tegra_drm_file_get_context(fpriv, args->context);
+ context = idr_find(&fpriv->contexts, args->context);
if (!context) {
err = -ENODEV;
goto unlock;
@@ -685,7 +673,7 @@ static int tegra_get_syncpt_base(struct drm_device *drm, void *data,
mutex_lock(&fpriv->lock);
- context = tegra_drm_file_get_context(fpriv, args->context);
+ context = idr_find(&fpriv->contexts, args->context);
if (!context) {
err = -ENODEV;
goto unlock;
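A compact sketch of the idr pattern the hunks above settle on: allocate IDs starting at 1 (presumably so that 0 can be treated as an invalid/absent context) and look contexts up with idr_find() directly under fpriv->lock instead of a separate helper. The helper names and locking placement here are illustrative, not the driver's exact call sites.

static int example_register(struct tegra_drm_file *fpriv,
			    struct tegra_drm_context *context)
{
	int err;

	mutex_lock(&fpriv->lock);
	/* start at 1 so an id of 0 is never handed back to userspace */
	err = idr_alloc(&fpriv->contexts, context, 1, 0, GFP_KERNEL);
	mutex_unlock(&fpriv->lock);

	return err < 0 ? err : 0;
}

static struct tegra_drm_context *
example_lookup(struct tegra_drm_file *fpriv, u32 id)
{
	/* fpriv->lock is assumed to be held by the caller */
	return idr_find(&fpriv->contexts, id);
}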
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c
index 13db8a2851ed..1f013d45c9e9 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c
@@ -321,6 +321,7 @@ void vmw_cmdbuf_res_man_destroy(struct vmw_cmdbuf_res_manager *man)
list_for_each_entry_safe(entry, next, &man->list, head)
vmw_cmdbuf_res_free(man, entry);
+ drm_ht_remove(&man->resources);
kfree(man);
}
diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c
index f05ebb14fa63..ac65f52850a6 100644
--- a/drivers/gpu/host1x/dev.c
+++ b/drivers/gpu/host1x/dev.c
@@ -172,7 +172,7 @@ static int host1x_probe(struct platform_device *pdev)
host->rst = devm_reset_control_get(&pdev->dev, "host1x");
if (IS_ERR(host->rst)) {
- err = PTR_ERR(host->clk);
+ err = PTR_ERR(host->rst);
dev_err(&pdev->dev, "failed to get reset: %d\n", err);
return err;
}
diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c
index 92f1452dad57..76875f6299b8 100644
--- a/drivers/gpu/vga/vgaarb.c
+++ b/drivers/gpu/vga/vgaarb.c
@@ -417,7 +417,7 @@ int vga_get(struct pci_dev *pdev, unsigned int rsrc, int interruptible)
{
struct vga_device *vgadev, *conflict;
unsigned long flags;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
int rc = 0;
vga_check_first_use();
diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index 04cee65531d7..6e040692f1d8 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -826,11 +826,35 @@ static int hid_scan_report(struct hid_device *hid)
* hid-rmi should take care of them,
* not hid-generic
*/
- if (IS_ENABLED(CONFIG_HID_RMI))
- hid->group = HID_GROUP_RMI;
+ hid->group = HID_GROUP_RMI;
break;
}
+ /* fall back to generic driver in case specific driver doesn't exist */
+ switch (hid->group) {
+ case HID_GROUP_MULTITOUCH_WIN_8:
+ /* fall-through */
+ case HID_GROUP_MULTITOUCH:
+ if (!IS_ENABLED(CONFIG_HID_MULTITOUCH))
+ hid->group = HID_GROUP_GENERIC;
+ break;
+ case HID_GROUP_SENSOR_HUB:
+ if (!IS_ENABLED(CONFIG_HID_SENSOR_HUB))
+ hid->group = HID_GROUP_GENERIC;
+ break;
+ case HID_GROUP_RMI:
+ if (!IS_ENABLED(CONFIG_HID_RMI))
+ hid->group = HID_GROUP_GENERIC;
+ break;
+ case HID_GROUP_WACOM:
+ if (!IS_ENABLED(CONFIG_HID_WACOM))
+ hid->group = HID_GROUP_GENERIC;
+ break;
+ case HID_GROUP_LOGITECH_DJ_DEVICE:
+ if (!IS_ENABLED(CONFIG_HID_LOGITECH_DJ))
+ hid->group = HID_GROUP_GENERIC;
+ break;
+ }
vfree(parser);
return 0;
}
@@ -1763,15 +1787,23 @@ EXPORT_SYMBOL_GPL(hid_disconnect);
* used as a driver. See hid_scan_report().
*/
static const struct hid_device_id hid_have_special_driver[] = {
+#if IS_ENABLED(CONFIG_HID_A4TECH)
{ HID_USB_DEVICE(USB_VENDOR_ID_A4TECH, USB_DEVICE_ID_A4TECH_WCP32PU) },
{ HID_USB_DEVICE(USB_VENDOR_ID_A4TECH, USB_DEVICE_ID_A4TECH_X5_005D) },
{ HID_USB_DEVICE(USB_VENDOR_ID_A4TECH, USB_DEVICE_ID_A4TECH_RP_649) },
+#endif
+#if IS_ENABLED(CONFIG_HID_ACCUTOUCH)
+ { HID_USB_DEVICE(USB_VENDOR_ID_ELO, USB_DEVICE_ID_ELO_ACCUTOUCH_2216) },
+#endif
+#if IS_ENABLED(CONFIG_HID_ACRUX)
{ HID_USB_DEVICE(USB_VENDOR_ID_ACRUX, 0x0802) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ACRUX, 0xf705) },
+#endif
+#if IS_ENABLED(CONFIG_HID_ALPS)
{ HID_DEVICE(HID_BUS_ANY, HID_GROUP_ANY, USB_VENDOR_ID_ALPS_JP, HID_DEVICE_ID_ALPS_U1_DUAL) },
+#endif
+#if IS_ENABLED(CONFIG_HID_APPLE)
{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MIGHTYMOUSE) },
- { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICMOUSE) },
- { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICTRACKPAD) },
{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_FOUNTAIN_ANSI) },
{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_FOUNTAIN_ISO) },
{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_GEYSER_ANSI) },
@@ -1792,11 +1824,6 @@ static const struct hid_device_id hid_have_special_driver[] = {
{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_GEYSER4_HF_ANSI) },
{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_GEYSER4_HF_ISO) },
{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_GEYSER4_HF_JIS) },
- { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_IRCONTROL) },
- { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_IRCONTROL2) },
- { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_IRCONTROL3) },
- { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_IRCONTROL4) },
- { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_IRCONTROL5) },
{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_ALU_WIRELESS_ANSI) },
{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_ALU_WIRELESS_ISO) },
{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_ALU_WIRELESS_JIS) },
@@ -1851,62 +1878,100 @@ static const struct hid_device_id hid_have_special_driver[] = {
{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_ANSI) },
{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_FOUNTAIN_TP_ONLY) },
{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_GEYSER1_TP_ONLY) },
+#endif
+#if IS_ENABLED(CONFIG_HID_APPLEIR)
+ { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_IRCONTROL) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_IRCONTROL2) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_IRCONTROL3) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_IRCONTROL4) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_IRCONTROL5) },
+#endif
+#if IS_ENABLED(CONFIG_HID_ASUS)
{ HID_I2C_DEVICE(USB_VENDOR_ID_ASUSTEK, USB_DEVICE_ID_ASUSTEK_I2C_KEYBOARD) },
{ HID_I2C_DEVICE(USB_VENDOR_ID_ASUSTEK, USB_DEVICE_ID_ASUSTEK_I2C_TOUCHPAD) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ASUSTEK, USB_DEVICE_ID_ASUSTEK_ROG_KEYBOARD1) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ASUSTEK, USB_DEVICE_ID_ASUSTEK_ROG_KEYBOARD2) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ASUSTEK, USB_DEVICE_ID_ASUSTEK_T100_KEYBOARD) },
+#endif
+#if IS_ENABLED(CONFIG_HID_AUREAL)
{ HID_USB_DEVICE(USB_VENDOR_ID_AUREAL, USB_DEVICE_ID_AUREAL_W01RN) },
+#endif
+#if IS_ENABLED(CONFIG_HID_BELKIN)
{ HID_USB_DEVICE(USB_VENDOR_ID_BELKIN, USB_DEVICE_ID_FLIP_KVM) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_LABTEC, USB_DEVICE_ID_LABTEC_WIRELESS_KEYBOARD) },
+#endif
+#if IS_ENABLED(CONFIG_HID_BETOP_FF)
{ HID_USB_DEVICE(USB_VENDOR_ID_BETOP_2185BFM, 0x2208) },
{ HID_USB_DEVICE(USB_VENDOR_ID_BETOP_2185PC, 0x5506) },
{ HID_USB_DEVICE(USB_VENDOR_ID_BETOP_2185V2PC, 0x1850) },
{ HID_USB_DEVICE(USB_VENDOR_ID_BETOP_2185V2BFM, 0x5500) },
- { HID_USB_DEVICE(USB_VENDOR_ID_BTC, USB_DEVICE_ID_BTC_EMPREX_REMOTE) },
- { HID_USB_DEVICE(USB_VENDOR_ID_BTC, USB_DEVICE_ID_BTC_EMPREX_REMOTE_2) },
+#endif
+#if IS_ENABLED(CONFIG_HID_CHERRY)
{ HID_USB_DEVICE(USB_VENDOR_ID_CHERRY, USB_DEVICE_ID_CHERRY_CYMOTION) },
{ HID_USB_DEVICE(USB_VENDOR_ID_CHERRY, USB_DEVICE_ID_CHERRY_CYMOTION_SOLAR) },
+#endif
+#if IS_ENABLED(CONFIG_HID_CHICONY)
{ HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_TACTICAL_PAD) },
- { HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_WIRELESS) },
{ HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_WIRELESS2) },
{ HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_AK1D) },
{ HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_ACER_SWITCH12) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_JESS, USB_DEVICE_ID_JESS_ZEN_AIO_KBD) },
+#endif
+#if IS_ENABLED(CONFIG_HID_CMEDIA)
+ { HID_USB_DEVICE(USB_VENDOR_ID_CMEDIA, USB_DEVICE_ID_CM6533) },
+#endif
+#if IS_ENABLED(CONFIG_HID_CORSAIR)
{ HID_USB_DEVICE(USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_K90) },
{ HID_USB_DEVICE(USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_SCIMITAR_PRO_RGB) },
- { HID_USB_DEVICE(USB_VENDOR_ID_CREATIVELABS, USB_DEVICE_ID_PRODIKEYS_PCMIDI) },
+#endif
+#if IS_ENABLED(CONFIG_HID_CP2112)
{ HID_USB_DEVICE(USB_VENDOR_ID_CYGNAL, USB_DEVICE_ID_CYGNAL_CP2112) },
+#endif
+#if IS_ENABLED(CONFIG_HID_CYPRESS)
{ HID_USB_DEVICE(USB_VENDOR_ID_CYPRESS, USB_DEVICE_ID_CYPRESS_BARCODE_1) },
{ HID_USB_DEVICE(USB_VENDOR_ID_CYPRESS, USB_DEVICE_ID_CYPRESS_BARCODE_2) },
{ HID_USB_DEVICE(USB_VENDOR_ID_CYPRESS, USB_DEVICE_ID_CYPRESS_BARCODE_3) },
{ HID_USB_DEVICE(USB_VENDOR_ID_CYPRESS, USB_DEVICE_ID_CYPRESS_BARCODE_4) },
{ HID_USB_DEVICE(USB_VENDOR_ID_CYPRESS, USB_DEVICE_ID_CYPRESS_MOUSE) },
- { HID_USB_DEVICE(USB_VENDOR_ID_DELCOM, USB_DEVICE_ID_DELCOM_VISUAL_IND) },
+#endif
+#if IS_ENABLED(CONFIG_HID_DRAGONRISE)
{ HID_USB_DEVICE(USB_VENDOR_ID_DRAGONRISE, 0x0006) },
{ HID_USB_DEVICE(USB_VENDOR_ID_DRAGONRISE, 0x0011) },
-#if IS_ENABLED(CONFIG_HID_MAYFLASH)
- { HID_USB_DEVICE(USB_VENDOR_ID_DRAGONRISE, USB_DEVICE_ID_DRAGONRISE_PS3) },
- { HID_USB_DEVICE(USB_VENDOR_ID_DRAGONRISE, USB_DEVICE_ID_DRAGONRISE_DOLPHINBAR) },
- { HID_USB_DEVICE(USB_VENDOR_ID_DRAGONRISE, USB_DEVICE_ID_DRAGONRISE_GAMECUBE1) },
- { HID_USB_DEVICE(USB_VENDOR_ID_DRAGONRISE, USB_DEVICE_ID_DRAGONRISE_GAMECUBE2) },
#endif
- { HID_USB_DEVICE(USB_VENDOR_ID_DREAM_CHEEKY, USB_DEVICE_ID_DREAM_CHEEKY_WN) },
- { HID_USB_DEVICE(USB_VENDOR_ID_DREAM_CHEEKY, USB_DEVICE_ID_DREAM_CHEEKY_FA) },
+#if IS_ENABLED(CONFIG_HID_ELECOM)
{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_BM084) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_DEFT_WIRED) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_DEFT_WIRELESS) },
+#endif
+#if IS_ENABLED(CONFIG_HID_ELO)
{ HID_USB_DEVICE(USB_VENDOR_ID_ELO, 0x0009) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELO, 0x0030) },
- { HID_USB_DEVICE(USB_VENDOR_ID_ELO, USB_DEVICE_ID_ELO_ACCUTOUCH_2216) },
+#endif
+#if IS_ENABLED(CONFIG_HID_EMS_FF)
{ HID_USB_DEVICE(USB_VENDOR_ID_EMS, USB_DEVICE_ID_EMS_TRIO_LINKER_PLUS_II) },
+#endif
+#if IS_ENABLED(CONFIG_HID_EZKEY)
{ HID_USB_DEVICE(USB_VENDOR_ID_EZKEY, USB_DEVICE_ID_BTC_8193) },
- { HID_USB_DEVICE(USB_VENDOR_ID_GAMERON, USB_DEVICE_ID_GAMERON_DUAL_PSX_ADAPTOR) },
- { HID_USB_DEVICE(USB_VENDOR_ID_GAMERON, USB_DEVICE_ID_GAMERON_DUAL_PCS_ADAPTOR) },
+#endif
+#if IS_ENABLED(CONFIG_HID_GEMBIRD)
{ HID_USB_DEVICE(USB_VENDOR_ID_GEMBIRD, USB_DEVICE_ID_GEMBIRD_JPD_DUALFORCE2) },
- { HID_USB_DEVICE(USB_VENDOR_ID_GREENASIA, 0x0003) },
+#endif
+#if IS_ENABLED(CONFIG_HID_GFRM)
+ { HID_BLUETOOTH_DEVICE(0x58, 0x2000) },
+ { HID_BLUETOOTH_DEVICE(0x471, 0x2210) },
+#endif
+#if IS_ENABLED(CONFIG_HID_GREENASIA)
{ HID_USB_DEVICE(USB_VENDOR_ID_GREENASIA, 0x0012) },
+#endif
+#if IS_ENABLED(CONFIG_HID_GT683R)
+ { HID_USB_DEVICE(USB_VENDOR_ID_MSI, USB_DEVICE_ID_MSI_GT683R_LED_PANEL) },
+#endif
+#if IS_ENABLED(CONFIG_HID_GYRATION)
{ HID_USB_DEVICE(USB_VENDOR_ID_GYRATION, USB_DEVICE_ID_GYRATION_REMOTE) },
{ HID_USB_DEVICE(USB_VENDOR_ID_GYRATION, USB_DEVICE_ID_GYRATION_REMOTE_2) },
{ HID_USB_DEVICE(USB_VENDOR_ID_GYRATION, USB_DEVICE_ID_GYRATION_REMOTE_3) },
+#endif
+#if IS_ENABLED(CONFIG_HID_HOLTEK)
{ HID_USB_DEVICE(USB_VENDOR_ID_HOLTEK, USB_DEVICE_ID_HOLTEK_ON_LINE_GRIP) },
{ HID_USB_DEVICE(USB_VENDOR_ID_HOLTEK_ALT, USB_DEVICE_ID_HOLTEK_ALT_KEYBOARD) },
{ HID_USB_DEVICE(USB_VENDOR_ID_HOLTEK_ALT, USB_DEVICE_ID_HOLTEK_ALT_MOUSE_A04A) },
@@ -1915,12 +1980,17 @@ static const struct hid_device_id hid_have_special_driver[] = {
{ HID_USB_DEVICE(USB_VENDOR_ID_HOLTEK_ALT, USB_DEVICE_ID_HOLTEK_ALT_MOUSE_A072) },
{ HID_USB_DEVICE(USB_VENDOR_ID_HOLTEK_ALT, USB_DEVICE_ID_HOLTEK_ALT_MOUSE_A081) },
{ HID_USB_DEVICE(USB_VENDOR_ID_HOLTEK_ALT, USB_DEVICE_ID_HOLTEK_ALT_MOUSE_A0C2) },
- { HID_USB_DEVICE(USB_VENDOR_ID_HUION, USB_DEVICE_ID_HUION_TABLET) },
- { HID_USB_DEVICE(USB_VENDOR_ID_JESS, USB_DEVICE_ID_JESS_ZEN_AIO_KBD) },
- { HID_USB_DEVICE(USB_VENDOR_ID_JESS2, USB_DEVICE_ID_JESS2_COLOR_RUMBLE_PAD) },
+#endif
+#if IS_ENABLED(CONFIG_HID_ICADE)
{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_ION, USB_DEVICE_ID_ICADE) },
+#endif
+#if IS_ENABLED(CONFIG_HID_KENSINGTON)
{ HID_USB_DEVICE(USB_VENDOR_ID_KENSINGTON, USB_DEVICE_ID_KS_SLIMBLADE) },
+#endif
+#if IS_ENABLED(CONFIG_HID_KEYTOUCH)
{ HID_USB_DEVICE(USB_VENDOR_ID_KEYTOUCH, USB_DEVICE_ID_KEYTOUCH_IEC) },
+#endif
+#if IS_ENABLED(CONFIG_HID_KYE)
{ HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_GENIUS_GILA_GAMING_MOUSE) },
{ HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_GENIUS_MANTICORE) },
{ HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_GENIUS_GX_IMPERATOR) },
@@ -1930,21 +2000,29 @@ static const struct hid_device_id hid_have_special_driver[] = {
{ HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_MOUSEPEN_I608X_V2) },
{ HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_EASYPEN_M610X) },
{ HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_PENSKETCH_M912) },
- { HID_USB_DEVICE(USB_VENDOR_ID_LABTEC, USB_DEVICE_ID_LABTEC_WIRELESS_KEYBOARD) },
+#endif
+#if IS_ENABLED(CONFIG_HID_LCPOWER)
{ HID_USB_DEVICE(USB_VENDOR_ID_LCPOWER, USB_DEVICE_ID_LCPOWER_LC1000 ) },
+#endif
+#if IS_ENABLED(CONFIG_HID_LED)
+ { HID_USB_DEVICE(USB_VENDOR_ID_DELCOM, USB_DEVICE_ID_DELCOM_VISUAL_IND) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_DREAM_CHEEKY, USB_DEVICE_ID_DREAM_CHEEKY_WN) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_DREAM_CHEEKY, USB_DEVICE_ID_DREAM_CHEEKY_FA) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_MICROCHIP, USB_DEVICE_ID_LUXAFOR) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_RISO_KAGAKU, USB_DEVICE_ID_RI_KA_WEBMAIL) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_THINGM, USB_DEVICE_ID_BLINK1) },
+#endif
#if IS_ENABLED(CONFIG_HID_LENOVO)
{ HID_USB_DEVICE(USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_TPKBD) },
{ HID_USB_DEVICE(USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_CUSBKBD) },
{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_CBTKBD) },
{ HID_USB_DEVICE(USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_TPPRODOCK) },
#endif
- { HID_USB_DEVICE(USB_VENDOR_ID_LG, USB_DEVICE_ID_LG_MELFAS_MT) },
+#if IS_ENABLED(CONFIG_HID_LOGITECH)
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_MX3000_RECEIVER) },
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_S510_RECEIVER) },
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_S510_RECEIVER_2) },
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RECEIVER) },
- { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_HARMONY_PS3) },
- { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_T651) },
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_DINOVO_DESKTOP) },
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_DINOVO_EDGE) },
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_DINOVO_MINI) },
@@ -1957,7 +2035,6 @@ static const struct hid_device_id hid_have_special_driver[] = {
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RUMBLEPAD) },
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RUMBLEPAD2_2) },
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G29_WHEEL) },
- { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G920_WHEEL) },
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_WINGMAN_F3D) },
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_WINGMAN_FFG ) },
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_FORCE3D_PRO) },
@@ -1969,17 +2046,30 @@ static const struct hid_device_id hid_have_special_driver[] = {
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_DFGT_WHEEL) },
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G25_WHEEL) },
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G27_WHEEL) },
-#if IS_ENABLED(CONFIG_HID_LOGITECH_DJ)
- { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_UNIFYING_RECEIVER) },
- { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_UNIFYING_RECEIVER_2) },
-#endif
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_WII_WHEEL) },
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RUMBLEPAD2) },
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_SPACETRAVELLER) },
{ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_SPACENAVIGATOR) },
- { HID_USB_DEVICE(USB_VENDOR_ID_MICROCHIP, USB_DEVICE_ID_PICOLCD) },
- { HID_USB_DEVICE(USB_VENDOR_ID_MICROCHIP, USB_DEVICE_ID_PICOLCD_BOOTLOADER) },
- { HID_USB_DEVICE(USB_VENDOR_ID_MICROCHIP, USB_DEVICE_ID_LUXAFOR) },
+#endif
+#if IS_ENABLED(CONFIG_HID_LOGITECH_HIDPP)
+ { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_T651) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G920_WHEEL) },
+#endif
+#if IS_ENABLED(CONFIG_HID_LOGITECH_DJ)
+ { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_UNIFYING_RECEIVER) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_UNIFYING_RECEIVER_2) },
+#endif
+#if IS_ENABLED(CONFIG_HID_MAGICMOUSE)
+ { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICMOUSE) },
+ { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICTRACKPAD) },
+#endif
+#if IS_ENABLED(CONFIG_HID_MAYFLASH)
+ { HID_USB_DEVICE(USB_VENDOR_ID_DRAGONRISE, USB_DEVICE_ID_DRAGONRISE_PS3) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_DRAGONRISE, USB_DEVICE_ID_DRAGONRISE_DOLPHINBAR) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_DRAGONRISE, USB_DEVICE_ID_DRAGONRISE_GAMECUBE1) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_DRAGONRISE, USB_DEVICE_ID_DRAGONRISE_GAMECUBE2) },
+#endif
+#if IS_ENABLED(CONFIG_HID_MICROSOFT)
{ HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_COMFORT_MOUSE_4500) },
{ HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_COMFORT_KEYBOARD) },
{ HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_SIDEWINDER_GV) },
@@ -1995,9 +2085,22 @@ static const struct hid_device_id hid_have_special_driver[] = {
{ HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_DIGITAL_MEDIA_600) },
{ HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_DIGITAL_MEDIA_3KV1) },
{ HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_POWER_COVER) },
+ { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_PRESENTER_8K_BT) },
+#endif
+#if IS_ENABLED(CONFIG_HID_MONTEREY)
{ HID_USB_DEVICE(USB_VENDOR_ID_MONTEREY, USB_DEVICE_ID_GENIUS_KB29E) },
- { HID_USB_DEVICE(USB_VENDOR_ID_MSI, USB_DEVICE_ID_MSI_GT683R_LED_PANEL) },
+#endif
+#if IS_ENABLED(CONFIG_HID_MULTITOUCH)
+ { HID_USB_DEVICE(USB_VENDOR_ID_LG, USB_DEVICE_ID_LG_MELFAS_MT) },
+#endif
+#if IS_ENABLED(CONFIG_HID_WIIMOTE)
+ { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_NINTENDO, USB_DEVICE_ID_NINTENDO_WIIMOTE) },
+ { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_NINTENDO, USB_DEVICE_ID_NINTENDO_WIIMOTE2) },
+#endif
+#if IS_ENABLED(CONFIG_HID_NTI)
{ HID_USB_DEVICE(USB_VENDOR_ID_NTI, USB_DEVICE_ID_USB_SUN) },
+#endif
+#if IS_ENABLED(CONFIG_HID_NTRIG)
{ HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN) },
{ HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_1) },
{ HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_2) },
@@ -2017,13 +2120,41 @@ static const struct hid_device_id hid_have_special_driver[] = {
{ HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_16) },
{ HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_17) },
{ HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_18) },
+#endif
+#if IS_ENABLED(CONFIG_HID_ORTEK)
{ HID_USB_DEVICE(USB_VENDOR_ID_ORTEK, USB_DEVICE_ID_ORTEK_PKB1700) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ORTEK, USB_DEVICE_ID_ORTEK_WKB2000) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_SKYCABLE, USB_DEVICE_ID_SKYCABLE_WIRELESS_PRESENTER) },
+#endif
+#if IS_ENABLED(CONFIG_HID_PANTHERLORD)
+ { HID_USB_DEVICE(USB_VENDOR_ID_GAMERON, USB_DEVICE_ID_GAMERON_DUAL_PSX_ADAPTOR) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_GAMERON, USB_DEVICE_ID_GAMERON_DUAL_PCS_ADAPTOR) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_GREENASIA, 0x0003) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_JESS2, USB_DEVICE_ID_JESS2_COLOR_RUMBLE_PAD) },
+#endif
+#if IS_ENABLED(CONFIG_HID_PENMOUNT)
{ HID_USB_DEVICE(USB_VENDOR_ID_PENMOUNT, USB_DEVICE_ID_PENMOUNT_6000) },
+#endif
+#if IS_ENABLED(CONFIG_HID_PETALYNX)
{ HID_USB_DEVICE(USB_VENDOR_ID_PETALYNX, USB_DEVICE_ID_PETALYNX_MAXTER_REMOTE) },
+#endif
+#if IS_ENABLED(CONFIG_HID_PICOLCD)
+ { HID_USB_DEVICE(USB_VENDOR_ID_MICROCHIP, USB_DEVICE_ID_PICOLCD) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_MICROCHIP, USB_DEVICE_ID_PICOLCD_BOOTLOADER) },
+#endif
+#if IS_ENABLED(CONFIG_HID_PLANTRONICS)
{ HID_USB_DEVICE(USB_VENDOR_ID_PLANTRONICS, HID_ANY_ID) },
+#endif
+#if IS_ENABLED(CONFIG_HID_PRIMAX)
{ HID_USB_DEVICE(USB_VENDOR_ID_PRIMAX, USB_DEVICE_ID_PRIMAX_KEYBOARD) },
- { HID_USB_DEVICE(USB_VENDOR_ID_RISO_KAGAKU, USB_DEVICE_ID_RI_KA_WEBMAIL) },
+#endif
+#if IS_ENABLED(CONFIG_HID_PRODIKEYS)
+ { HID_USB_DEVICE(USB_VENDOR_ID_CREATIVELABS, USB_DEVICE_ID_PRODIKEYS_PCMIDI) },
+#endif
+#if IS_ENABLED(CONFIG_HID_RMI)
+ { HID_USB_DEVICE(USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_X1_COVER) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_RAZER, USB_DEVICE_ID_RAZER_BLADE_14) },
+#endif
#if IS_ENABLED(CONFIG_HID_ROCCAT)
{ HID_USB_DEVICE(USB_VENDOR_ID_ROCCAT, USB_DEVICE_ID_ROCCAT_ARVO) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ROCCAT, USB_DEVICE_ID_ROCCAT_ISKU) },
@@ -2051,9 +2182,21 @@ static const struct hid_device_id hid_have_special_driver[] = {
{ HID_USB_DEVICE(USB_VENDOR_ID_MADCATZ, USB_DEVICE_ID_MADCATZ_RAT5) },
{ HID_USB_DEVICE(USB_VENDOR_ID_MADCATZ, USB_DEVICE_ID_MADCATZ_RAT9) },
#endif
+#if IS_ENABLED(CONFIG_HID_SAMSUNG)
{ HID_USB_DEVICE(USB_VENDOR_ID_SAMSUNG, USB_DEVICE_ID_SAMSUNG_IR_REMOTE) },
{ HID_USB_DEVICE(USB_VENDOR_ID_SAMSUNG, USB_DEVICE_ID_SAMSUNG_WIRELESS_KBD_MOUSE) },
- { HID_USB_DEVICE(USB_VENDOR_ID_SKYCABLE, USB_DEVICE_ID_SKYCABLE_WIRELESS_PRESENTER) },
+#endif
+#if IS_ENABLED(CONFIG_HID_SMARTJOYPLUS)
+ { HID_USB_DEVICE(USB_VENDOR_ID_PLAYDOTCOM, USB_DEVICE_ID_PLAYDOTCOM_EMS_USBII) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP, USB_DEVICE_ID_SMARTJOY_PLUS) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP, USB_DEVICE_ID_SUPER_JOY_BOX_3) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP, USB_DEVICE_ID_DUAL_USB_JOYPAD) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP_LTD, USB_DEVICE_ID_SUPER_JOY_BOX_3_PRO) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP_LTD, USB_DEVICE_ID_SUPER_DUAL_BOX_PRO) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP_LTD, USB_DEVICE_ID_SUPER_JOY_BOX_5_PRO) },
+#endif
+#if IS_ENABLED(CONFIG_HID_SONY)
+ { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_HARMONY_PS3) },
{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_SMK, USB_DEVICE_ID_SMK_PS3_BDREMOTE) },
{ HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_BUZZ_CONTROLLER) },
{ HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_WIRELESS_BUZZ_CONTROLLER) },
@@ -2072,9 +2215,17 @@ static const struct hid_device_id hid_have_special_driver[] = {
{ HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_VAIO_VGX_MOUSE) },
{ HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_VAIO_VGP_MOUSE) },
{ HID_USB_DEVICE(USB_VENDOR_ID_SINO_LITE, USB_DEVICE_ID_SINO_LITE_CONTROLLER) },
+#endif
+#if IS_ENABLED(CONFIG_HID_SPEEDLINK)
+ { HID_USB_DEVICE(USB_VENDOR_ID_X_TENSIONS, USB_DEVICE_ID_SPEEDLINK_VAD_CEZANNE) },
+#endif
+#if IS_ENABLED(CONFIG_HID_STEELSERIES)
{ HID_USB_DEVICE(USB_VENDOR_ID_STEELSERIES, USB_DEVICE_ID_STEELSERIES_SRWS1) },
+#endif
+#if IS_ENABLED(CONFIG_HID_SUNPLUS)
{ HID_USB_DEVICE(USB_VENDOR_ID_SUNPLUS, USB_DEVICE_ID_SUNPLUS_WDESKTOP) },
- { HID_USB_DEVICE(USB_VENDOR_ID_THINGM, USB_DEVICE_ID_BLINK1) },
+#endif
+#if IS_ENABLED(CONFIG_HID_THRUSTMASTER)
{ HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb300) },
{ HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb304) },
{ HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb323) },
@@ -2083,12 +2234,25 @@ static const struct hid_device_id hid_have_special_driver[] = {
{ HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb653) },
{ HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb654) },
{ HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb65a) },
+#endif
+#if IS_ENABLED(CONFIG_HID_TIVO)
{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_TIVO, USB_DEVICE_ID_TIVO_SLIDE_BT) },
{ HID_USB_DEVICE(USB_VENDOR_ID_TIVO, USB_DEVICE_ID_TIVO_SLIDE) },
{ HID_USB_DEVICE(USB_VENDOR_ID_TIVO, USB_DEVICE_ID_TIVO_SLIDE_PRO) },
+#endif
+#if IS_ENABLED(CONFIG_HID_TOPSEED)
+ { HID_USB_DEVICE(USB_VENDOR_ID_BTC, USB_DEVICE_ID_BTC_EMPREX_REMOTE) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_BTC, USB_DEVICE_ID_BTC_EMPREX_REMOTE_2) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_WIRELESS) },
{ HID_USB_DEVICE(USB_VENDOR_ID_TOPSEED, USB_DEVICE_ID_TOPSEED_CYBERLINK) },
{ HID_USB_DEVICE(USB_VENDOR_ID_TOPSEED2, USB_DEVICE_ID_TOPSEED2_RF_COMBO) },
+#endif
+#if IS_ENABLED(CONFIG_HID_TWINHAN)
{ HID_USB_DEVICE(USB_VENDOR_ID_TWINHAN, USB_DEVICE_ID_TWINHAN_IR_REMOTE) },
+#endif
+#if IS_ENABLED(CONFIG_HID_UCLOGIC)
+ { HID_USB_DEVICE(USB_VENDOR_ID_HUION, USB_DEVICE_ID_HUION_TABLET) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_HUION_TABLET) },
{ HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_PF1209) },
{ HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_WP4030U) },
{ HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_WP5540U) },
@@ -2096,20 +2260,17 @@ static const struct hid_device_id hid_have_special_driver[] = {
{ HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_WP1062) },
{ HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_WIRELESS_TABLET_TWHL850) },
{ HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_TWHA60) },
- { HID_USB_DEVICE(USB_VENDOR_ID_THQ, USB_DEVICE_ID_THQ_PS3_UDRAW) },
{ HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_YIYNOVA_TABLET) },
{ HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UGEE_TABLET_81) },
{ HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UGEE_TABLET_45) },
{ HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_DRAWIMAGE_G3) },
- { HID_USB_DEVICE(USB_VENDOR_ID_UGTIZER, USB_DEVICE_ID_UGTIZER_TABLET_GP0610) },
{ HID_USB_DEVICE(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_TABLET_EX07S) },
- { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP, USB_DEVICE_ID_SMARTJOY_PLUS) },
- { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP, USB_DEVICE_ID_SUPER_JOY_BOX_3) },
- { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP, USB_DEVICE_ID_DUAL_USB_JOYPAD) },
- { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP_LTD, USB_DEVICE_ID_SUPER_JOY_BOX_3_PRO) },
- { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP_LTD, USB_DEVICE_ID_SUPER_DUAL_BOX_PRO) },
- { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP_LTD, USB_DEVICE_ID_SUPER_JOY_BOX_5_PRO) },
- { HID_USB_DEVICE(USB_VENDOR_ID_PLAYDOTCOM, USB_DEVICE_ID_PLAYDOTCOM_EMS_USBII) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_UGTIZER, USB_DEVICE_ID_UGTIZER_TABLET_GP0610) },
+#endif
+#if IS_ENABLED(CONFIG_HID_UDRAW_PS3)
+ { HID_USB_DEVICE(USB_VENDOR_ID_THQ, USB_DEVICE_ID_THQ_PS3_UDRAW) },
+#endif
+#if IS_ENABLED(CONFIG_HID_WALTOP)
{ HID_USB_DEVICE(USB_VENDOR_ID_WALTOP, USB_DEVICE_ID_WALTOP_SLIM_TABLET_5_8_INCH) },
{ HID_USB_DEVICE(USB_VENDOR_ID_WALTOP, USB_DEVICE_ID_WALTOP_SLIM_TABLET_12_1_INCH) },
{ HID_USB_DEVICE(USB_VENDOR_ID_WALTOP, USB_DEVICE_ID_WALTOP_Q_PAD) },
@@ -2117,19 +2278,18 @@ static const struct hid_device_id hid_have_special_driver[] = {
{ HID_USB_DEVICE(USB_VENDOR_ID_WALTOP, USB_DEVICE_ID_WALTOP_MEDIA_TABLET_10_6_INCH) },
{ HID_USB_DEVICE(USB_VENDOR_ID_WALTOP, USB_DEVICE_ID_WALTOP_MEDIA_TABLET_14_1_INCH) },
{ HID_USB_DEVICE(USB_VENDOR_ID_WALTOP, USB_DEVICE_ID_WALTOP_SIRIUS_BATTERY_FREE_TABLET) },
- { HID_USB_DEVICE(USB_VENDOR_ID_X_TENSIONS, USB_DEVICE_ID_SPEEDLINK_VAD_CEZANNE) },
+#endif
+#if IS_ENABLED(CONFIG_HID_XINMO)
{ HID_USB_DEVICE(USB_VENDOR_ID_XIN_MO, USB_DEVICE_ID_XIN_MO_DUAL_ARCADE) },
{ HID_USB_DEVICE(USB_VENDOR_ID_XIN_MO, USB_DEVICE_ID_THT_2P_ARCADE) },
+#endif
+#if IS_ENABLED(CONFIG_HID_ZEROPLUS)
{ HID_USB_DEVICE(USB_VENDOR_ID_ZEROPLUS, 0x0005) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ZEROPLUS, 0x0030) },
+#endif
+#if IS_ENABLED(CONFIG_HID_ZYDACRON)
{ HID_USB_DEVICE(USB_VENDOR_ID_ZYDACRON, USB_DEVICE_ID_ZYDACRON_REMOTE_CONTROL) },
-
- { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_PRESENTER_8K_BT) },
- { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_NINTENDO, USB_DEVICE_ID_NINTENDO_WIIMOTE) },
- { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_NINTENDO, USB_DEVICE_ID_NINTENDO_WIIMOTE2) },
- { HID_USB_DEVICE(USB_VENDOR_ID_RAZER, USB_DEVICE_ID_RAZER_BLADE_14) },
- { HID_USB_DEVICE(USB_VENDOR_ID_CMEDIA, USB_DEVICE_ID_CM6533) },
- { HID_USB_DEVICE(USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_X1_COVER) },
+#endif
{ }
};
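The reshuffled table above gates each block of entries with IS_ENABLED(), which is true whether the corresponding special driver is built in (=y) or modular (=m), so a device only stays in hid_have_special_driver[] when a driver that can actually claim it is available. A minimal sketch of the pattern, using a hypothetical CONFIG_HID_FOO symbol and made-up IDs:

#include <linux/kconfig.h>	/* IS_ENABLED() */
#include <linux/hid.h>

/* Entry is compiled in only when CONFIG_HID_FOO is y or m (hypothetical symbol). */
static const struct hid_device_id example_have_special_driver[] = {
#if IS_ENABLED(CONFIG_HID_FOO)
	{ HID_USB_DEVICE(0x1234, 0x5678) },	/* made-up vendor/product IDs */
#endif
	{ }					/* terminating entry */
};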
diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h
index 8ca1e8ce0af2..4f9a3938189a 100644
--- a/drivers/hid/hid-ids.h
+++ b/drivers/hid/hid-ids.h
@@ -319,6 +319,9 @@
#define USB_VENDOR_ID_DELCOM 0x0fc5
#define USB_DEVICE_ID_DELCOM_VISUAL_IND 0xb080
+#define USB_VENDOR_ID_DELL 0x413c
+#define USB_DEVICE_ID_DELL_PIXART_USB_OPTICAL_MOUSE 0x301a
+
#define USB_VENDOR_ID_DELORME 0x1163
#define USB_DEVICE_ID_DELORME_EARTHMATE 0x0100
#define USB_DEVICE_ID_DELORME_EM_LT20 0x0200
diff --git a/drivers/hid/hid-magicmouse.c b/drivers/hid/hid-magicmouse.c
index 1d6c997b3001..20b40ad26325 100644
--- a/drivers/hid/hid-magicmouse.c
+++ b/drivers/hid/hid-magicmouse.c
@@ -349,7 +349,6 @@ static int magicmouse_raw_event(struct hid_device *hdev,
if (input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE) {
magicmouse_emit_buttons(msc, clicks & 3);
- input_mt_report_pointer_emulation(input, true);
input_report_rel(input, REL_X, x);
input_report_rel(input, REL_Y, y);
} else { /* USB_DEVICE_ID_APPLE_MAGICTRACKPAD */
@@ -389,16 +388,16 @@ static int magicmouse_setup_input(struct input_dev *input, struct hid_device *hd
__clear_bit(BTN_RIGHT, input->keybit);
__clear_bit(BTN_MIDDLE, input->keybit);
__set_bit(BTN_MOUSE, input->keybit);
+ __set_bit(BTN_TOOL_FINGER, input->keybit);
+ __set_bit(BTN_TOOL_DOUBLETAP, input->keybit);
+ __set_bit(BTN_TOOL_TRIPLETAP, input->keybit);
+ __set_bit(BTN_TOOL_QUADTAP, input->keybit);
+ __set_bit(BTN_TOOL_QUINTTAP, input->keybit);
+ __set_bit(BTN_TOUCH, input->keybit);
+ __set_bit(INPUT_PROP_POINTER, input->propbit);
__set_bit(INPUT_PROP_BUTTONPAD, input->propbit);
}
- __set_bit(BTN_TOOL_FINGER, input->keybit);
- __set_bit(BTN_TOOL_DOUBLETAP, input->keybit);
- __set_bit(BTN_TOOL_TRIPLETAP, input->keybit);
- __set_bit(BTN_TOOL_QUADTAP, input->keybit);
- __set_bit(BTN_TOOL_QUINTTAP, input->keybit);
- __set_bit(BTN_TOUCH, input->keybit);
- __set_bit(INPUT_PROP_POINTER, input->propbit);
__set_bit(EV_ABS, input->evbit);
diff --git a/drivers/hid/usbhid/hid-quirks.c b/drivers/hid/usbhid/hid-quirks.c
index 6316498b7812..a88e7c7bea0a 100644
--- a/drivers/hid/usbhid/hid-quirks.c
+++ b/drivers/hid/usbhid/hid-quirks.c
@@ -85,6 +85,7 @@ static const struct hid_blacklist {
{ USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_K65RGB_RAPIDFIRE, HID_QUIRK_NO_INIT_REPORTS | HID_QUIRK_ALWAYS_POLL },
{ USB_VENDOR_ID_CORSAIR, USB_DEVICE_ID_CORSAIR_SCIMITAR_PRO_RGB, HID_QUIRK_NO_INIT_REPORTS | HID_QUIRK_ALWAYS_POLL },
{ USB_VENDOR_ID_CREATIVELABS, USB_DEVICE_ID_CREATIVE_SB_OMNI_SURROUND_51, HID_QUIRK_NOGET },
+ { USB_VENDOR_ID_DELL, USB_DEVICE_ID_DELL_PIXART_USB_OPTICAL_MOUSE, HID_QUIRK_ALWAYS_POLL },
{ USB_VENDOR_ID_DMI, USB_DEVICE_ID_DMI_ENC, HID_QUIRK_NOGET },
{ USB_VENDOR_ID_DRAGONRISE, USB_DEVICE_ID_DRAGONRISE_WIIU, HID_QUIRK_MULTI_INPUT },
{ USB_VENDOR_ID_DRAGONRISE, USB_DEVICE_ID_DRAGONRISE_PS3, HID_QUIRK_MULTI_INPUT },
diff --git a/drivers/hsi/clients/ssi_protocol.c b/drivers/hsi/clients/ssi_protocol.c
index 26b05106f0d3..93d28c0ec8bf 100644
--- a/drivers/hsi/clients/ssi_protocol.c
+++ b/drivers/hsi/clients/ssi_protocol.c
@@ -1066,7 +1066,7 @@ static void ssip_pn_setup(struct net_device *dev)
dev->addr_len = 1;
dev->tx_queue_len = SSIP_TXQUEUE_LEN;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->header_ops = &phonet_header_ops;
}
diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c
index 95ed17183e73..54a47b40546f 100644
--- a/drivers/i2c/busses/i2c-imx.c
+++ b/drivers/i2c/busses/i2c-imx.c
@@ -734,9 +734,9 @@ static int i2c_imx_dma_read(struct imx_i2c_struct *i2c_imx,
* the first read operation, otherwise the first read cost
* one extra clock cycle.
*/
- temp = readb(i2c_imx->base + IMX_I2C_I2CR);
+ temp = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2CR);
temp |= I2CR_MTX;
- writeb(temp, i2c_imx->base + IMX_I2C_I2CR);
+ imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2CR);
}
msgs->buf[msgs->len-1] = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2DR);
@@ -857,9 +857,9 @@ static int i2c_imx_read(struct imx_i2c_struct *i2c_imx, struct i2c_msg *msgs, bo
* the first read operation, otherwise the first read cost
* one extra clock cycle.
*/
- temp = readb(i2c_imx->base + IMX_I2C_I2CR);
+ temp = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2CR);
temp |= I2CR_MTX;
- writeb(temp, i2c_imx->base + IMX_I2C_I2CR);
+ imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2CR);
}
} else if (i == (msgs->len - 2)) {
dev_dbg(&i2c_imx->adapter.dev,
diff --git a/drivers/i2c/busses/i2c-ismt.c b/drivers/i2c/busses/i2c-ismt.c
index f573448d2132..e98e44e584a4 100644
--- a/drivers/i2c/busses/i2c-ismt.c
+++ b/drivers/i2c/busses/i2c-ismt.c
@@ -584,7 +584,7 @@ static int ismt_access(struct i2c_adapter *adap, u16 addr,
/* unmap the data buffer */
if (dma_size != 0)
- dma_unmap_single(&adap->dev, dma_addr, dma_size, dma_direction);
+ dma_unmap_single(dev, dma_addr, dma_size, dma_direction);
if (unlikely(!time_left)) {
dev_err(dev, "completion wait timed out\n");
diff --git a/drivers/i2c/busses/i2c-rcar.c b/drivers/i2c/busses/i2c-rcar.c
index 214bf2835d1f..8be3e6cb8fe6 100644
--- a/drivers/i2c/busses/i2c-rcar.c
+++ b/drivers/i2c/busses/i2c-rcar.c
@@ -319,7 +319,7 @@ static void rcar_i2c_dma_unmap(struct rcar_i2c_priv *priv)
rcar_i2c_write(priv, ICFBSCR, TCYC06);
dma_unmap_single(chan->device->dev, sg_dma_address(&priv->sg),
- priv->msg->len, priv->dma_direction);
+ sg_dma_len(&priv->sg), priv->dma_direction);
priv->dma_direction = DMA_NONE;
}
diff --git a/drivers/iio/adc/meson_saradc.c b/drivers/iio/adc/meson_saradc.c
index dd4190b50df6..6066bbfc42fe 100644
--- a/drivers/iio/adc/meson_saradc.c
+++ b/drivers/iio/adc/meson_saradc.c
@@ -468,13 +468,13 @@ static void meson_sar_adc_unlock(struct iio_dev *indio_dev)
static void meson_sar_adc_clear_fifo(struct iio_dev *indio_dev)
{
struct meson_sar_adc_priv *priv = iio_priv(indio_dev);
- int count;
+ unsigned int count, tmp;
for (count = 0; count < MESON_SAR_ADC_MAX_FIFO_SIZE; count++) {
if (!meson_sar_adc_get_fifo_count(indio_dev))
break;
- regmap_read(priv->regmap, MESON_SAR_ADC_FIFO_RD, 0);
+ regmap_read(priv->regmap, MESON_SAR_ADC_FIFO_RD, &tmp);
}
}
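For context on the hunk above: regmap_read() takes an unsigned int * as its third argument, so passing a literal 0 hands it a NULL pointer; even when the value read from the FIFO is discarded, a scratch variable is still needed. A minimal sketch of the corrected calling pattern (register name taken from the hunk; the error check is added here only for illustration):

	unsigned int tmp;
	int ret;

	/* Drain one FIFO entry; the value itself is discarded. */
	ret = regmap_read(priv->regmap, MESON_SAR_ADC_FIFO_RD, &tmp);
	if (ret)
		return;		/* or propagate the error, as appropriate */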
diff --git a/drivers/iio/adc/mxs-lradc-adc.c b/drivers/iio/adc/mxs-lradc-adc.c
index b0c7d8ee5cb8..6888167ca1e6 100644
--- a/drivers/iio/adc/mxs-lradc-adc.c
+++ b/drivers/iio/adc/mxs-lradc-adc.c
@@ -718,9 +718,12 @@ static int mxs_lradc_adc_probe(struct platform_device *pdev)
adc->dev = dev;
iores = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!iores)
+ return -EINVAL;
+
adc->base = devm_ioremap(dev, iores->start, resource_size(iores));
- if (IS_ERR(adc->base))
- return PTR_ERR(adc->base);
+ if (!adc->base)
+ return -ENOMEM;
init_completion(&adc->completion);
spin_lock_init(&adc->lock);
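The error-handling change above follows from the return conventions involved: platform_get_resource() and devm_ioremap() both return NULL on failure rather than an ERR_PTR(), so the previous IS_ERR()/PTR_ERR() checks could never trigger. A condensed sketch of the NULL-based pattern the hunk switches to (variable names shortened for brevity):

	struct resource *res;
	void __iomem *base;

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	if (!res)		/* missing MEM resource */
		return -EINVAL;

	base = devm_ioremap(&pdev->dev, res->start, resource_size(res));
	if (!base)		/* devm_ioremap() returns NULL, not ERR_PTR */
		return -ENOMEM;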
diff --git a/drivers/iio/buffer/industrialio-buffer-dma.c b/drivers/iio/buffer/industrialio-buffer-dma.c
index dd99d273bae9..ff03324dee13 100644
--- a/drivers/iio/buffer/industrialio-buffer-dma.c
+++ b/drivers/iio/buffer/industrialio-buffer-dma.c
@@ -14,6 +14,7 @@
#include <linux/sched.h>
#include <linux/poll.h>
#include <linux/iio/buffer.h>
+#include <linux/iio/buffer_impl.h>
#include <linux/iio/buffer-dma.h>
#include <linux/dma-mapping.h>
#include <linux/sizes.h>
diff --git a/drivers/iio/buffer/industrialio-buffer-dmaengine.c b/drivers/iio/buffer/industrialio-buffer-dmaengine.c
index 9fabed47053d..2b5a320f42c5 100644
--- a/drivers/iio/buffer/industrialio-buffer-dmaengine.c
+++ b/drivers/iio/buffer/industrialio-buffer-dmaengine.c
@@ -14,6 +14,7 @@
#include <linux/iio/iio.h>
#include <linux/iio/buffer.h>
+#include <linux/iio/buffer_impl.h>
#include <linux/iio/buffer-dma.h>
#include <linux/iio/buffer-dmaengine.h>
diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c b/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c
index 96dabbd2f004..88a7c5d4e4d2 100644
--- a/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c
+++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c
@@ -41,6 +41,7 @@ static const int accel_scale[] = {598, 1196, 2392, 4785};
static const struct inv_mpu6050_reg_map reg_set_6500 = {
.sample_rate_div = INV_MPU6050_REG_SAMPLE_RATE_DIV,
.lpf = INV_MPU6050_REG_CONFIG,
+ .accel_lpf = INV_MPU6500_REG_ACCEL_CONFIG_2,
.user_ctrl = INV_MPU6050_REG_USER_CTRL,
.fifo_en = INV_MPU6050_REG_FIFO_EN,
.gyro_config = INV_MPU6050_REG_GYRO_CONFIG,
@@ -211,6 +212,37 @@ int inv_mpu6050_set_power_itg(struct inv_mpu6050_state *st, bool power_on)
EXPORT_SYMBOL_GPL(inv_mpu6050_set_power_itg);
/**
+ * inv_mpu6050_set_lpf_regs() - set low pass filter registers, chip dependent
+ *
+ * MPU60xx/MPU9150 use a single register for both accelerometer and gyroscope;
+ * MPU6500 and later have a dedicated register for the accelerometer.
+ */
+static int inv_mpu6050_set_lpf_regs(struct inv_mpu6050_state *st,
+ enum inv_mpu6050_filter_e val)
+{
+ int result;
+
+ result = regmap_write(st->map, st->reg->lpf, val);
+ if (result)
+ return result;
+
+ switch (st->chip_type) {
+ case INV_MPU6050:
+ case INV_MPU6000:
+ case INV_MPU9150:
+ /* old chips, nothing to do */
+ result = 0;
+ break;
+ default:
+ /* set accel lpf */
+ result = regmap_write(st->map, st->reg->accel_lpf, val);
+ break;
+ }
+
+ return result;
+}
+
+/**
* inv_mpu6050_init_config() - Initialize hardware, disable FIFO.
*
* Initial configuration:
@@ -233,8 +265,7 @@ static int inv_mpu6050_init_config(struct iio_dev *indio_dev)
if (result)
return result;
- d = INV_MPU6050_FILTER_20HZ;
- result = regmap_write(st->map, st->reg->lpf, d);
+ result = inv_mpu6050_set_lpf_regs(st, INV_MPU6050_FILTER_20HZ);
if (result)
return result;
@@ -537,6 +568,8 @@ error_write_raw:
* would be alising. This function basically search for the
* correct low pass parameters based on the fifo rate, e.g,
* sampling frequency.
+ *
+ * The LPF is set automatically when the sampling rate is set, to avoid aliasing.
*/
static int inv_mpu6050_set_lpf(struct inv_mpu6050_state *st, int rate)
{
@@ -552,7 +585,7 @@ static int inv_mpu6050_set_lpf(struct inv_mpu6050_state *st, int rate)
while ((h < hz[i]) && (i < ARRAY_SIZE(d) - 1))
i++;
data = d[i];
- result = regmap_write(st->map, st->reg->lpf, data);
+ result = inv_mpu6050_set_lpf_regs(st, data);
if (result)
return result;
st->chip_config.lpf = data;
diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h b/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h
index ef13de7a2c20..953a0c09d568 100644
--- a/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h
+++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h
@@ -28,6 +28,7 @@
* struct inv_mpu6050_reg_map - Notable registers.
* @sample_rate_div: Divider applied to gyro output rate.
* @lpf: Configures internal low pass filter.
+ * @accel_lpf: Configures accelerometer low pass filter.
* @user_ctrl: Enables/resets the FIFO.
* @fifo_en: Determines which data will appear in FIFO.
* @gyro_config: gyro config register.
@@ -47,6 +48,7 @@
struct inv_mpu6050_reg_map {
u8 sample_rate_div;
u8 lpf;
+ u8 accel_lpf;
u8 user_ctrl;
u8 fifo_en;
u8 gyro_config;
@@ -188,6 +190,7 @@ struct inv_mpu6050_state {
#define INV_MPU6050_FIFO_THRESHOLD 500
/* mpu6500 registers */
+#define INV_MPU6500_REG_ACCEL_CONFIG_2 0x1D
#define INV_MPU6500_REG_ACCEL_OFFSET 0x77
/* delay time in milliseconds */
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 02971e239a18..ece6926fa2e6 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -449,12 +449,7 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
return ret;
rt = (struct rt6_info *)dst;
- if (ipv6_addr_any(&fl6.saddr)) {
- ret = ipv6_dev_get_saddr(addr->net, ip6_dst_idev(dst)->dev,
- &fl6.daddr, 0, &fl6.saddr);
- if (ret)
- goto put;
-
+ if (ipv6_addr_any(&src_in->sin6_addr)) {
src_in->sin6_family = AF_INET6;
src_in->sin6_addr = fl6.saddr;
}
@@ -471,9 +466,6 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
*pdst = dst;
return 0;
-put:
- dst_release(dst);
- return ret;
}
#else
static int addr6_resolve(struct sockaddr_in6 *src_in,
diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index ebf7be8d4139..08772836fded 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -56,6 +56,10 @@
#define BNXT_RE_MAX_SRQC_COUNT (64 * 1024)
#define BNXT_RE_MAX_CQ_COUNT (64 * 1024)
+#define BNXT_RE_UD_QP_HW_STALL 0x400000
+
+#define BNXT_RE_RQ_WQE_THRESHOLD 32
+
struct bnxt_re_work {
struct work_struct work;
unsigned long event;
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 7ba9e699d7ab..c7bd68311d0c 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -61,6 +61,48 @@
#include "ib_verbs.h"
#include <rdma/bnxt_re-abi.h>
+static int __from_ib_access_flags(int iflags)
+{
+ int qflags = 0;
+
+ if (iflags & IB_ACCESS_LOCAL_WRITE)
+ qflags |= BNXT_QPLIB_ACCESS_LOCAL_WRITE;
+ if (iflags & IB_ACCESS_REMOTE_READ)
+ qflags |= BNXT_QPLIB_ACCESS_REMOTE_READ;
+ if (iflags & IB_ACCESS_REMOTE_WRITE)
+ qflags |= BNXT_QPLIB_ACCESS_REMOTE_WRITE;
+ if (iflags & IB_ACCESS_REMOTE_ATOMIC)
+ qflags |= BNXT_QPLIB_ACCESS_REMOTE_ATOMIC;
+ if (iflags & IB_ACCESS_MW_BIND)
+ qflags |= BNXT_QPLIB_ACCESS_MW_BIND;
+ if (iflags & IB_ZERO_BASED)
+ qflags |= BNXT_QPLIB_ACCESS_ZERO_BASED;
+ if (iflags & IB_ACCESS_ON_DEMAND)
+ qflags |= BNXT_QPLIB_ACCESS_ON_DEMAND;
+ return qflags;
+};
+
+static enum ib_access_flags __to_ib_access_flags(int qflags)
+{
+ enum ib_access_flags iflags = 0;
+
+ if (qflags & BNXT_QPLIB_ACCESS_LOCAL_WRITE)
+ iflags |= IB_ACCESS_LOCAL_WRITE;
+ if (qflags & BNXT_QPLIB_ACCESS_REMOTE_WRITE)
+ iflags |= IB_ACCESS_REMOTE_WRITE;
+ if (qflags & BNXT_QPLIB_ACCESS_REMOTE_READ)
+ iflags |= IB_ACCESS_REMOTE_READ;
+ if (qflags & BNXT_QPLIB_ACCESS_REMOTE_ATOMIC)
+ iflags |= IB_ACCESS_REMOTE_ATOMIC;
+ if (qflags & BNXT_QPLIB_ACCESS_MW_BIND)
+ iflags |= IB_ACCESS_MW_BIND;
+ if (qflags & BNXT_QPLIB_ACCESS_ZERO_BASED)
+ iflags |= IB_ZERO_BASED;
+ if (qflags & BNXT_QPLIB_ACCESS_ON_DEMAND)
+ iflags |= IB_ACCESS_ON_DEMAND;
+ return iflags;
+};
+
static int bnxt_re_build_sgl(struct ib_sge *ib_sg_list,
struct bnxt_qplib_sge *sg_list, int num)
{
@@ -149,8 +191,8 @@ int bnxt_re_query_device(struct ib_device *ibdev,
ib_attr->max_total_mcast_qp_attach = 0;
ib_attr->max_ah = dev_attr->max_ah;
- ib_attr->max_fmr = dev_attr->max_fmr;
- ib_attr->max_map_per_fmr = 1; /* ? */
+ ib_attr->max_fmr = 0;
+ ib_attr->max_map_per_fmr = 0;
ib_attr->max_srq = dev_attr->max_srq;
ib_attr->max_srq_wr = dev_attr->max_srq_wqes;
@@ -410,6 +452,158 @@ enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev,
return IB_LINK_LAYER_ETHERNET;
}
+#define BNXT_RE_FENCE_PBL_SIZE DIV_ROUND_UP(BNXT_RE_FENCE_BYTES, PAGE_SIZE)
+
+static void bnxt_re_create_fence_wqe(struct bnxt_re_pd *pd)
+{
+ struct bnxt_re_fence_data *fence = &pd->fence;
+ struct ib_mr *ib_mr = &fence->mr->ib_mr;
+ struct bnxt_qplib_swqe *wqe = &fence->bind_wqe;
+
+ memset(wqe, 0, sizeof(*wqe));
+ wqe->type = BNXT_QPLIB_SWQE_TYPE_BIND_MW;
+ wqe->wr_id = BNXT_QPLIB_FENCE_WRID;
+ wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP;
+ wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE;
+ wqe->bind.zero_based = false;
+ wqe->bind.parent_l_key = ib_mr->lkey;
+ wqe->bind.va = (u64)(unsigned long)fence->va;
+ wqe->bind.length = fence->size;
+ wqe->bind.access_cntl = __from_ib_access_flags(IB_ACCESS_REMOTE_READ);
+ wqe->bind.mw_type = SQ_BIND_MW_TYPE_TYPE1;
+
+ /* Save the initial rkey in fence structure for now;
+ * wqe->bind.r_key will be set at (re)bind time.
+ */
+ fence->bind_rkey = ib_inc_rkey(fence->mw->rkey);
+}
+
+static int bnxt_re_bind_fence_mw(struct bnxt_qplib_qp *qplib_qp)
+{
+ struct bnxt_re_qp *qp = container_of(qplib_qp, struct bnxt_re_qp,
+ qplib_qp);
+ struct ib_pd *ib_pd = qp->ib_qp.pd;
+ struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
+ struct bnxt_re_fence_data *fence = &pd->fence;
+ struct bnxt_qplib_swqe *fence_wqe = &fence->bind_wqe;
+ struct bnxt_qplib_swqe wqe;
+ int rc;
+
+ memcpy(&wqe, fence_wqe, sizeof(wqe));
+ wqe.bind.r_key = fence->bind_rkey;
+ fence->bind_rkey = ib_inc_rkey(fence->bind_rkey);
+
+ dev_dbg(rdev_to_dev(qp->rdev),
+ "Posting bind fence-WQE: rkey: %#x QP: %d PD: %p\n",
+ wqe.bind.r_key, qp->qplib_qp.id, pd);
+ rc = bnxt_qplib_post_send(&qp->qplib_qp, &wqe);
+ if (rc) {
+ dev_err(rdev_to_dev(qp->rdev), "Failed to bind fence-WQE\n");
+ return rc;
+ }
+ bnxt_qplib_post_send_db(&qp->qplib_qp);
+
+ return rc;
+}
+
+static void bnxt_re_destroy_fence_mr(struct bnxt_re_pd *pd)
+{
+ struct bnxt_re_fence_data *fence = &pd->fence;
+ struct bnxt_re_dev *rdev = pd->rdev;
+ struct device *dev = &rdev->en_dev->pdev->dev;
+ struct bnxt_re_mr *mr = fence->mr;
+
+ if (fence->mw) {
+ bnxt_re_dealloc_mw(fence->mw);
+ fence->mw = NULL;
+ }
+ if (mr) {
+ if (mr->ib_mr.rkey)
+ bnxt_qplib_dereg_mrw(&rdev->qplib_res, &mr->qplib_mr,
+ true);
+ if (mr->ib_mr.lkey)
+ bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr);
+ kfree(mr);
+ fence->mr = NULL;
+ }
+ if (fence->dma_addr) {
+ dma_unmap_single(dev, fence->dma_addr, BNXT_RE_FENCE_BYTES,
+ DMA_BIDIRECTIONAL);
+ fence->dma_addr = 0;
+ }
+}
+
+static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd)
+{
+ int mr_access_flags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_MW_BIND;
+ struct bnxt_re_fence_data *fence = &pd->fence;
+ struct bnxt_re_dev *rdev = pd->rdev;
+ struct device *dev = &rdev->en_dev->pdev->dev;
+ struct bnxt_re_mr *mr = NULL;
+ dma_addr_t dma_addr = 0;
+ struct ib_mw *mw;
+ u64 pbl_tbl;
+ int rc;
+
+ dma_addr = dma_map_single(dev, fence->va, BNXT_RE_FENCE_BYTES,
+ DMA_BIDIRECTIONAL);
+ rc = dma_mapping_error(dev, dma_addr);
+ if (rc) {
+ dev_err(rdev_to_dev(rdev), "Failed to dma-map fence-MR-mem\n");
+ rc = -EIO;
+ fence->dma_addr = 0;
+ goto fail;
+ }
+ fence->dma_addr = dma_addr;
+
+ /* Allocate a MR */
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr) {
+ rc = -ENOMEM;
+ goto fail;
+ }
+ fence->mr = mr;
+ mr->rdev = rdev;
+ mr->qplib_mr.pd = &pd->qplib_pd;
+ mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR;
+ mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags);
+ rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr);
+ if (rc) {
+ dev_err(rdev_to_dev(rdev), "Failed to alloc fence-HW-MR\n");
+ goto fail;
+ }
+
+ /* Register MR */
+ mr->ib_mr.lkey = mr->qplib_mr.lkey;
+ mr->qplib_mr.va = (u64)(unsigned long)fence->va;
+ mr->qplib_mr.total_size = BNXT_RE_FENCE_BYTES;
+ pbl_tbl = dma_addr;
+ rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, &pbl_tbl,
+ BNXT_RE_FENCE_PBL_SIZE, false);
+ if (rc) {
+ dev_err(rdev_to_dev(rdev), "Failed to register fence-MR\n");
+ goto fail;
+ }
+ mr->ib_mr.rkey = mr->qplib_mr.rkey;
+
+ /* Create a fence MW only for kernel consumers */
+ mw = bnxt_re_alloc_mw(&pd->ib_pd, IB_MW_TYPE_1, NULL);
+ if (!mw) {
+ dev_err(rdev_to_dev(rdev),
+ "Failed to create fence-MW for PD: %p\n", pd);
+ rc = -EINVAL;
+ goto fail;
+ }
+ fence->mw = mw;
+
+ bnxt_re_create_fence_wqe(pd);
+ return 0;
+
+fail:
+ bnxt_re_destroy_fence_mr(pd);
+ return rc;
+}
+
/* Protection Domains */
int bnxt_re_dealloc_pd(struct ib_pd *ib_pd)
{
@@ -417,6 +611,7 @@ int bnxt_re_dealloc_pd(struct ib_pd *ib_pd)
struct bnxt_re_dev *rdev = pd->rdev;
int rc;
+ bnxt_re_destroy_fence_mr(pd);
if (ib_pd->uobject && pd->dpi.dbr) {
struct ib_ucontext *ib_uctx = ib_pd->uobject->context;
struct bnxt_re_ucontext *ucntx;
@@ -498,6 +693,10 @@ struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev,
}
}
+ if (!udata)
+ if (bnxt_re_create_fence_mr(pd))
+ dev_warn(rdev_to_dev(rdev),
+ "Failed to create Fence-MR\n");
return &pd->ib_pd;
dbfail:
(void)bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl,
@@ -849,12 +1048,16 @@ static struct bnxt_re_qp *bnxt_re_create_shadow_qp
/* Shadow QP SQ depth should be same as QP1 RQ depth */
qp->qplib_qp.sq.max_wqe = qp1_qp->rq.max_wqe;
qp->qplib_qp.sq.max_sge = 2;
+ /* Q full delta can be 1 since it is an internal QP */
+ qp->qplib_qp.sq.q_full_delta = 1;
qp->qplib_qp.scq = qp1_qp->scq;
qp->qplib_qp.rcq = qp1_qp->rcq;
qp->qplib_qp.rq.max_wqe = qp1_qp->rq.max_wqe;
qp->qplib_qp.rq.max_sge = qp1_qp->rq.max_sge;
+ /* Q full delta can be 1 since it is an internal QP */
+ qp->qplib_qp.rq.q_full_delta = 1;
qp->qplib_qp.mtu = qp1_qp->mtu;
@@ -917,10 +1120,6 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
qp->qplib_qp.sig_type = ((qp_init_attr->sq_sig_type ==
IB_SIGNAL_ALL_WR) ? true : false);
- entries = roundup_pow_of_two(qp_init_attr->cap.max_send_wr + 1);
- qp->qplib_qp.sq.max_wqe = min_t(u32, entries,
- dev_attr->max_qp_wqes + 1);
-
qp->qplib_qp.sq.max_sge = qp_init_attr->cap.max_send_sge;
if (qp->qplib_qp.sq.max_sge > dev_attr->max_qp_sges)
qp->qplib_qp.sq.max_sge = dev_attr->max_qp_sges;
@@ -959,6 +1158,9 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
qp->qplib_qp.rq.max_wqe = min_t(u32, entries,
dev_attr->max_qp_wqes + 1);
+ qp->qplib_qp.rq.q_full_delta = qp->qplib_qp.rq.max_wqe -
+ qp_init_attr->cap.max_recv_wr;
+
qp->qplib_qp.rq.max_sge = qp_init_attr->cap.max_recv_sge;
if (qp->qplib_qp.rq.max_sge > dev_attr->max_qp_sges)
qp->qplib_qp.rq.max_sge = dev_attr->max_qp_sges;
@@ -967,6 +1169,12 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
qp->qplib_qp.mtu = ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu));
if (qp_init_attr->qp_type == IB_QPT_GSI) {
+ /* Allocate 1 more than what's provided */
+ entries = roundup_pow_of_two(qp_init_attr->cap.max_send_wr + 1);
+ qp->qplib_qp.sq.max_wqe = min_t(u32, entries,
+ dev_attr->max_qp_wqes + 1);
+ qp->qplib_qp.sq.q_full_delta = qp->qplib_qp.sq.max_wqe -
+ qp_init_attr->cap.max_send_wr;
qp->qplib_qp.rq.max_sge = dev_attr->max_qp_sges;
if (qp->qplib_qp.rq.max_sge > dev_attr->max_qp_sges)
qp->qplib_qp.rq.max_sge = dev_attr->max_qp_sges;
@@ -1006,6 +1214,22 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
}
} else {
+ /* Allocate 128 + 1 more than what's provided */
+ entries = roundup_pow_of_two(qp_init_attr->cap.max_send_wr +
+ BNXT_QPLIB_RESERVED_QP_WRS + 1);
+ qp->qplib_qp.sq.max_wqe = min_t(u32, entries,
+ dev_attr->max_qp_wqes +
+ BNXT_QPLIB_RESERVED_QP_WRS + 1);
+ qp->qplib_qp.sq.q_full_delta = BNXT_QPLIB_RESERVED_QP_WRS + 1;
+
+ /*
+ * Reserving one slot for Phantom WQE. Application can
+ * post one extra entry in this case. But allowing this to avoid
+ * unexpected Queue full condition
+ */
+
+ qp->qplib_qp.sq.q_full_delta -= 1;
+
qp->qplib_qp.max_rd_atomic = dev_attr->max_qp_rd_atom;
qp->qplib_qp.max_dest_rd_atomic = dev_attr->max_qp_init_rd_atom;
if (udata) {
@@ -1025,6 +1249,7 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
qp->ib_qp.qp_num = qp->qplib_qp.id;
spin_lock_init(&qp->sq_lock);
+ spin_lock_init(&qp->rq_lock);
if (udata) {
struct bnxt_re_qp_resp resp;
@@ -1129,48 +1354,6 @@ static enum ib_mtu __to_ib_mtu(u32 mtu)
}
}
-static int __from_ib_access_flags(int iflags)
-{
- int qflags = 0;
-
- if (iflags & IB_ACCESS_LOCAL_WRITE)
- qflags |= BNXT_QPLIB_ACCESS_LOCAL_WRITE;
- if (iflags & IB_ACCESS_REMOTE_READ)
- qflags |= BNXT_QPLIB_ACCESS_REMOTE_READ;
- if (iflags & IB_ACCESS_REMOTE_WRITE)
- qflags |= BNXT_QPLIB_ACCESS_REMOTE_WRITE;
- if (iflags & IB_ACCESS_REMOTE_ATOMIC)
- qflags |= BNXT_QPLIB_ACCESS_REMOTE_ATOMIC;
- if (iflags & IB_ACCESS_MW_BIND)
- qflags |= BNXT_QPLIB_ACCESS_MW_BIND;
- if (iflags & IB_ZERO_BASED)
- qflags |= BNXT_QPLIB_ACCESS_ZERO_BASED;
- if (iflags & IB_ACCESS_ON_DEMAND)
- qflags |= BNXT_QPLIB_ACCESS_ON_DEMAND;
- return qflags;
-};
-
-static enum ib_access_flags __to_ib_access_flags(int qflags)
-{
- enum ib_access_flags iflags = 0;
-
- if (qflags & BNXT_QPLIB_ACCESS_LOCAL_WRITE)
- iflags |= IB_ACCESS_LOCAL_WRITE;
- if (qflags & BNXT_QPLIB_ACCESS_REMOTE_WRITE)
- iflags |= IB_ACCESS_REMOTE_WRITE;
- if (qflags & BNXT_QPLIB_ACCESS_REMOTE_READ)
- iflags |= IB_ACCESS_REMOTE_READ;
- if (qflags & BNXT_QPLIB_ACCESS_REMOTE_ATOMIC)
- iflags |= IB_ACCESS_REMOTE_ATOMIC;
- if (qflags & BNXT_QPLIB_ACCESS_MW_BIND)
- iflags |= IB_ACCESS_MW_BIND;
- if (qflags & BNXT_QPLIB_ACCESS_ZERO_BASED)
- iflags |= IB_ZERO_BASED;
- if (qflags & BNXT_QPLIB_ACCESS_ON_DEMAND)
- iflags |= IB_ACCESS_ON_DEMAND;
- return iflags;
-};
-
static int bnxt_re_modify_shadow_qp(struct bnxt_re_dev *rdev,
struct bnxt_re_qp *qp1_qp,
int qp_attr_mask)
@@ -1378,11 +1561,21 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr,
entries = roundup_pow_of_two(qp_attr->cap.max_send_wr);
qp->qplib_qp.sq.max_wqe = min_t(u32, entries,
dev_attr->max_qp_wqes + 1);
+ qp->qplib_qp.sq.q_full_delta = qp->qplib_qp.sq.max_wqe -
+ qp_attr->cap.max_send_wr;
+ /*
+ * Reserve one slot for the phantom WQE. Some applications can
+ * post one extra entry in this case; allow it to avoid an
+ * unexpected queue-full condition
+ */
+ qp->qplib_qp.sq.q_full_delta -= 1;
qp->qplib_qp.sq.max_sge = qp_attr->cap.max_send_sge;
if (qp->qplib_qp.rq.max_wqe) {
entries = roundup_pow_of_two(qp_attr->cap.max_recv_wr);
qp->qplib_qp.rq.max_wqe =
min_t(u32, entries, dev_attr->max_qp_wqes + 1);
+ qp->qplib_qp.rq.q_full_delta = qp->qplib_qp.rq.max_wqe -
+ qp_attr->cap.max_recv_wr;
qp->qplib_qp.rq.max_sge = qp_attr->cap.max_recv_sge;
} else {
/* SRQ was used prior, just ignore the RQ caps */
@@ -1883,6 +2076,22 @@ static int bnxt_re_copy_wr_payload(struct bnxt_re_dev *rdev,
return payload_sz;
}
+static void bnxt_ud_qp_hw_stall_workaround(struct bnxt_re_qp *qp)
+{
+ if ((qp->ib_qp.qp_type == IB_QPT_UD ||
+ qp->ib_qp.qp_type == IB_QPT_GSI ||
+ qp->ib_qp.qp_type == IB_QPT_RAW_ETHERTYPE) &&
+ qp->qplib_qp.wqe_cnt == BNXT_RE_UD_QP_HW_STALL) {
+ int qp_attr_mask;
+ struct ib_qp_attr qp_attr;
+
+ qp_attr_mask = IB_QP_STATE;
+ qp_attr.qp_state = IB_QPS_RTS;
+ bnxt_re_modify_qp(&qp->ib_qp, &qp_attr, qp_attr_mask, NULL);
+ qp->qplib_qp.wqe_cnt = 0;
+ }
+}
+
static int bnxt_re_post_send_shadow_qp(struct bnxt_re_dev *rdev,
struct bnxt_re_qp *qp,
struct ib_send_wr *wr)
@@ -1928,6 +2137,7 @@ bad:
wr = wr->next;
}
bnxt_qplib_post_send_db(&qp->qplib_qp);
+ bnxt_ud_qp_hw_stall_workaround(qp);
spin_unlock_irqrestore(&qp->sq_lock, flags);
return rc;
}
@@ -2024,6 +2234,7 @@ bad:
wr = wr->next;
}
bnxt_qplib_post_send_db(&qp->qplib_qp);
+ bnxt_ud_qp_hw_stall_workaround(qp);
spin_unlock_irqrestore(&qp->sq_lock, flags);
return rc;
@@ -2071,7 +2282,10 @@ int bnxt_re_post_recv(struct ib_qp *ib_qp, struct ib_recv_wr *wr,
struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp);
struct bnxt_qplib_swqe wqe;
int rc = 0, payload_sz = 0;
+ unsigned long flags;
+ u32 count = 0;
+ spin_lock_irqsave(&qp->rq_lock, flags);
while (wr) {
/* House keeping */
memset(&wqe, 0, sizeof(wqe));
@@ -2100,9 +2314,21 @@ int bnxt_re_post_recv(struct ib_qp *ib_qp, struct ib_recv_wr *wr,
*bad_wr = wr;
break;
}
+
+ /* Ring the DB once the number of posted RQEs reaches the threshold */
+ if (++count >= BNXT_RE_RQ_WQE_THRESHOLD) {
+ bnxt_qplib_post_recv_db(&qp->qplib_qp);
+ count = 0;
+ }
+
wr = wr->next;
}
- bnxt_qplib_post_recv_db(&qp->qplib_qp);
+
+ if (count)
+ bnxt_qplib_post_recv_db(&qp->qplib_qp);
+
+ spin_unlock_irqrestore(&qp->rq_lock, flags);
+
return rc;
}
@@ -2643,12 +2869,36 @@ static void bnxt_re_process_res_ud_wc(struct ib_wc *wc,
wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
}
+static int send_phantom_wqe(struct bnxt_re_qp *qp)
+{
+ struct bnxt_qplib_qp *lib_qp = &qp->qplib_qp;
+ unsigned long flags;
+ int rc = 0;
+
+ spin_lock_irqsave(&qp->sq_lock, flags);
+
+ rc = bnxt_re_bind_fence_mw(lib_qp);
+ if (!rc) {
+ lib_qp->sq.phantom_wqe_cnt++;
+ dev_dbg(&lib_qp->sq.hwq.pdev->dev,
+ "qp %#x sq->prod %#x sw_prod %#x phantom_wqe_cnt %d\n",
+ lib_qp->id, lib_qp->sq.hwq.prod,
+ HWQ_CMP(lib_qp->sq.hwq.prod, &lib_qp->sq.hwq),
+ lib_qp->sq.phantom_wqe_cnt);
+ }
+
+ spin_unlock_irqrestore(&qp->sq_lock, flags);
+ return rc;
+}
+
int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
{
struct bnxt_re_cq *cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq);
struct bnxt_re_qp *qp;
struct bnxt_qplib_cqe *cqe;
int i, ncqe, budget;
+ struct bnxt_qplib_q *sq;
+ struct bnxt_qplib_qp *lib_qp;
u32 tbl_idx;
struct bnxt_re_sqp_entries *sqp_entry = NULL;
unsigned long flags;
@@ -2661,7 +2911,21 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
}
cqe = &cq->cql[0];
while (budget) {
- ncqe = bnxt_qplib_poll_cq(&cq->qplib_cq, cqe, budget);
+ lib_qp = NULL;
+ ncqe = bnxt_qplib_poll_cq(&cq->qplib_cq, cqe, budget, &lib_qp);
+ if (lib_qp) {
+ sq = &lib_qp->sq;
+ if (sq->send_phantom) {
+ qp = container_of(lib_qp,
+ struct bnxt_re_qp, qplib_qp);
+ if (send_phantom_wqe(qp) == -ENOMEM)
+ dev_err(rdev_to_dev(cq->rdev),
+ "Phantom failed! Scheduled to send again\n");
+ else
+ sq->send_phantom = false;
+ }
+ }
+
if (!ncqe)
break;
@@ -2822,6 +3086,12 @@ int bnxt_re_dereg_mr(struct ib_mr *ib_mr)
struct bnxt_re_dev *rdev = mr->rdev;
int rc;
+ rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr);
+ if (rc) {
+ dev_err(rdev_to_dev(rdev), "Dereg MR failed: %#x\n", rc);
+ return rc;
+ }
+
if (mr->npages && mr->pages) {
rc = bnxt_qplib_free_fast_reg_page_list(&rdev->qplib_res,
&mr->qplib_frpl);
@@ -2829,8 +3099,6 @@ int bnxt_re_dereg_mr(struct ib_mr *ib_mr)
mr->npages = 0;
mr->pages = NULL;
}
- rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr);
-
if (!IS_ERR_OR_NULL(mr->ib_umem))
ib_umem_release(mr->ib_umem);
@@ -2914,97 +3182,52 @@ fail:
return ERR_PTR(rc);
}
-/* Fast Memory Regions */
-struct ib_fmr *bnxt_re_alloc_fmr(struct ib_pd *ib_pd, int mr_access_flags,
- struct ib_fmr_attr *fmr_attr)
+struct ib_mw *bnxt_re_alloc_mw(struct ib_pd *ib_pd, enum ib_mw_type type,
+ struct ib_udata *udata)
{
struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
struct bnxt_re_dev *rdev = pd->rdev;
- struct bnxt_re_fmr *fmr;
+ struct bnxt_re_mw *mw;
int rc;
- if (fmr_attr->max_pages > MAX_PBL_LVL_2_PGS ||
- fmr_attr->max_maps > rdev->dev_attr.max_map_per_fmr) {
- dev_err(rdev_to_dev(rdev), "Allocate FMR exceeded Max limit");
+ mw = kzalloc(sizeof(*mw), GFP_KERNEL);
+ if (!mw)
return ERR_PTR(-ENOMEM);
- }
- fmr = kzalloc(sizeof(*fmr), GFP_KERNEL);
- if (!fmr)
- return ERR_PTR(-ENOMEM);
-
- fmr->rdev = rdev;
- fmr->qplib_fmr.pd = &pd->qplib_pd;
- fmr->qplib_fmr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR;
+ mw->rdev = rdev;
+ mw->qplib_mw.pd = &pd->qplib_pd;
- rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &fmr->qplib_fmr);
- if (rc)
+ mw->qplib_mw.type = (type == IB_MW_TYPE_1 ?
+ CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE1 :
+ CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B);
+ rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mw->qplib_mw);
+ if (rc) {
+ dev_err(rdev_to_dev(rdev), "Allocate MW failed!");
goto fail;
+ }
+ mw->ib_mw.rkey = mw->qplib_mw.rkey;
- fmr->qplib_fmr.flags = __from_ib_access_flags(mr_access_flags);
- fmr->ib_fmr.lkey = fmr->qplib_fmr.lkey;
- fmr->ib_fmr.rkey = fmr->ib_fmr.lkey;
+ atomic_inc(&rdev->mw_count);
+ return &mw->ib_mw;
- atomic_inc(&rdev->mr_count);
- return &fmr->ib_fmr;
fail:
- kfree(fmr);
+ kfree(mw);
return ERR_PTR(rc);
}
-int bnxt_re_map_phys_fmr(struct ib_fmr *ib_fmr, u64 *page_list, int list_len,
- u64 iova)
+int bnxt_re_dealloc_mw(struct ib_mw *ib_mw)
{
- struct bnxt_re_fmr *fmr = container_of(ib_fmr, struct bnxt_re_fmr,
- ib_fmr);
- struct bnxt_re_dev *rdev = fmr->rdev;
+ struct bnxt_re_mw *mw = container_of(ib_mw, struct bnxt_re_mw, ib_mw);
+ struct bnxt_re_dev *rdev = mw->rdev;
int rc;
- fmr->qplib_fmr.va = iova;
- fmr->qplib_fmr.total_size = list_len * PAGE_SIZE;
-
- rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &fmr->qplib_fmr, page_list,
- list_len, true);
- if (rc)
- dev_err(rdev_to_dev(rdev), "Failed to map FMR for lkey = 0x%x!",
- fmr->ib_fmr.lkey);
- return rc;
-}
-
-int bnxt_re_unmap_fmr(struct list_head *fmr_list)
-{
- struct bnxt_re_dev *rdev;
- struct bnxt_re_fmr *fmr;
- struct ib_fmr *ib_fmr;
- int rc = 0;
-
- /* Validate each FMRs inside the fmr_list */
- list_for_each_entry(ib_fmr, fmr_list, list) {
- fmr = container_of(ib_fmr, struct bnxt_re_fmr, ib_fmr);
- rdev = fmr->rdev;
-
- if (rdev) {
- rc = bnxt_qplib_dereg_mrw(&rdev->qplib_res,
- &fmr->qplib_fmr, true);
- if (rc)
- break;
- }
+ rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &mw->qplib_mw);
+ if (rc) {
+ dev_err(rdev_to_dev(rdev), "Free MW failed: %#x\n", rc);
+ return rc;
}
- return rc;
-}
-
-int bnxt_re_dealloc_fmr(struct ib_fmr *ib_fmr)
-{
- struct bnxt_re_fmr *fmr = container_of(ib_fmr, struct bnxt_re_fmr,
- ib_fmr);
- struct bnxt_re_dev *rdev = fmr->rdev;
- int rc;
- rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &fmr->qplib_fmr);
- if (rc)
- dev_err(rdev_to_dev(rdev), "Failed to free FMR");
-
- kfree(fmr);
- atomic_dec(&rdev->mr_count);
+ kfree(mw);
+ atomic_dec(&rdev->mw_count);
return rc;
}
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
index 5c3d71765454..6c160f6a5398 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
@@ -44,11 +44,23 @@ struct bnxt_re_gid_ctx {
u32 refcnt;
};
+#define BNXT_RE_FENCE_BYTES 64
+struct bnxt_re_fence_data {
+ u32 size;
+ u8 va[BNXT_RE_FENCE_BYTES];
+ dma_addr_t dma_addr;
+ struct bnxt_re_mr *mr;
+ struct ib_mw *mw;
+ struct bnxt_qplib_swqe bind_wqe;
+ u32 bind_rkey;
+};
+
struct bnxt_re_pd {
struct bnxt_re_dev *rdev;
struct ib_pd ib_pd;
struct bnxt_qplib_pd qplib_pd;
struct bnxt_qplib_dpi dpi;
+ struct bnxt_re_fence_data fence;
};
struct bnxt_re_ah {
@@ -62,6 +74,7 @@ struct bnxt_re_qp {
struct bnxt_re_dev *rdev;
struct ib_qp ib_qp;
spinlock_t sq_lock; /* protect sq */
+ spinlock_t rq_lock; /* protect rq */
struct bnxt_qplib_qp qplib_qp;
struct ib_umem *sumem;
struct ib_umem *rumem;
@@ -181,12 +194,9 @@ int bnxt_re_map_mr_sg(struct ib_mr *ib_mr, struct scatterlist *sg, int sg_nents,
struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type mr_type,
u32 max_num_sg);
int bnxt_re_dereg_mr(struct ib_mr *mr);
-struct ib_fmr *bnxt_re_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
- struct ib_fmr_attr *fmr_attr);
-int bnxt_re_map_phys_fmr(struct ib_fmr *fmr, u64 *page_list, int list_len,
- u64 iova);
-int bnxt_re_unmap_fmr(struct list_head *fmr_list);
-int bnxt_re_dealloc_fmr(struct ib_fmr *fmr);
+struct ib_mw *bnxt_re_alloc_mw(struct ib_pd *ib_pd, enum ib_mw_type type,
+ struct ib_udata *udata);
+int bnxt_re_dealloc_mw(struct ib_mw *mw);
struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int mr_access_flags,
struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 5d355401179b..1fce5e73216b 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -507,10 +507,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
ibdev->dereg_mr = bnxt_re_dereg_mr;
ibdev->alloc_mr = bnxt_re_alloc_mr;
ibdev->map_mr_sg = bnxt_re_map_mr_sg;
- ibdev->alloc_fmr = bnxt_re_alloc_fmr;
- ibdev->map_phys_fmr = bnxt_re_map_phys_fmr;
- ibdev->unmap_fmr = bnxt_re_unmap_fmr;
- ibdev->dealloc_fmr = bnxt_re_dealloc_fmr;
ibdev->reg_user_mr = bnxt_re_reg_user_mr;
ibdev->alloc_ucontext = bnxt_re_alloc_ucontext;
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
index 43d08b5e9085..f05500bcdcf1 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
@@ -284,7 +284,7 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
struct cmdq_create_qp1 req;
- struct creq_create_qp1_resp *resp;
+ struct creq_create_qp1_resp resp;
struct bnxt_qplib_pbl *pbl;
struct bnxt_qplib_q *sq = &qp->sq;
struct bnxt_qplib_q *rq = &qp->rq;
@@ -394,31 +394,12 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
req.pd_id = cpu_to_le32(qp->pd->id);
- resp = (struct creq_create_qp1_resp *)
- bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
- NULL, 0);
- if (!resp) {
- dev_err(&res->pdev->dev, "QPLIB: FP: CREATE_QP1 send failed");
- rc = -EINVAL;
- goto fail;
- }
- if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) {
- /* Cmd timed out */
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: CREATE_QP1 timed out");
- rc = -ETIMEDOUT;
- goto fail;
- }
- if (resp->status ||
- le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: CREATE_QP1 failed ");
- dev_err(&rcfw->pdev->dev,
- "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
- resp->status, le16_to_cpu(req.cookie),
- le16_to_cpu(resp->cookie));
- rc = -EINVAL;
+ rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+ (void *)&resp, NULL, 0);
+ if (rc)
goto fail;
- }
- qp->id = le32_to_cpu(resp->xid);
+
+ qp->id = le32_to_cpu(resp.xid);
qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET;
sq->flush_in_progress = false;
rq->flush_in_progress = false;
@@ -442,7 +423,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
struct sq_send *hw_sq_send_hdr, **hw_sq_send_ptr;
struct cmdq_create_qp req;
- struct creq_create_qp_resp *resp;
+ struct creq_create_qp_resp resp;
struct bnxt_qplib_pbl *pbl;
struct sq_psn_search **psn_search_ptr;
unsigned long int psn_search, poff = 0;
@@ -627,31 +608,12 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
}
req.pd_id = cpu_to_le32(qp->pd->id);
- resp = (struct creq_create_qp_resp *)
- bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
- NULL, 0);
- if (!resp) {
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: CREATE_QP send failed");
- rc = -EINVAL;
- goto fail;
- }
- if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) {
- /* Cmd timed out */
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: CREATE_QP timed out");
- rc = -ETIMEDOUT;
- goto fail;
- }
- if (resp->status ||
- le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: CREATE_QP failed ");
- dev_err(&rcfw->pdev->dev,
- "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
- resp->status, le16_to_cpu(req.cookie),
- le16_to_cpu(resp->cookie));
- rc = -EINVAL;
+ rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+ (void *)&resp, NULL, 0);
+ if (rc)
goto fail;
- }
- qp->id = le32_to_cpu(resp->xid);
+
+ qp->id = le32_to_cpu(resp.xid);
qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET;
sq->flush_in_progress = false;
rq->flush_in_progress = false;
@@ -769,10 +731,11 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
struct cmdq_modify_qp req;
- struct creq_modify_qp_resp *resp;
+ struct creq_modify_qp_resp resp;
u16 cmd_flags = 0, pkey;
u32 temp32[4];
u32 bmask;
+ int rc;
RCFW_CMD_PREP(req, MODIFY_QP, cmd_flags);
@@ -862,27 +825,10 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
req.vlan_pcp_vlan_dei_vlan_id = cpu_to_le16(qp->vlan_id);
- resp = (struct creq_modify_qp_resp *)
- bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
- NULL, 0);
- if (!resp) {
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: MODIFY_QP send failed");
- return -EINVAL;
- }
- if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) {
- /* Cmd timed out */
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: MODIFY_QP timed out");
- return -ETIMEDOUT;
- }
- if (resp->status ||
- le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: MODIFY_QP failed ");
- dev_err(&rcfw->pdev->dev,
- "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
- resp->status, le16_to_cpu(req.cookie),
- le16_to_cpu(resp->cookie));
- return -EINVAL;
- }
+ rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+ (void *)&resp, NULL, 0);
+ if (rc)
+ return rc;
qp->cur_qp_state = qp->state;
return 0;
}
@@ -891,37 +837,26 @@ int bnxt_qplib_query_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
struct cmdq_query_qp req;
- struct creq_query_qp_resp *resp;
+ struct creq_query_qp_resp resp;
+ struct bnxt_qplib_rcfw_sbuf *sbuf;
struct creq_query_qp_resp_sb *sb;
u16 cmd_flags = 0;
u32 temp32[4];
- int i;
+ int i, rc = 0;
RCFW_CMD_PREP(req, QUERY_QP, cmd_flags);
+ sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb));
+ if (!sbuf)
+ return -ENOMEM;
+ sb = sbuf->sb;
+
req.qp_cid = cpu_to_le32(qp->id);
req.resp_size = sizeof(*sb) / BNXT_QPLIB_CMDQE_UNITS;
- resp = (struct creq_query_qp_resp *)
- bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
- (void **)&sb, 0);
- if (!resp) {
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: QUERY_QP send failed");
- return -EINVAL;
- }
- if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) {
- /* Cmd timed out */
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: QUERY_QP timed out");
- return -ETIMEDOUT;
- }
- if (resp->status ||
- le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: QUERY_QP failed ");
- dev_err(&rcfw->pdev->dev,
- "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
- resp->status, le16_to_cpu(req.cookie),
- le16_to_cpu(resp->cookie));
- return -EINVAL;
- }
+ rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp,
+ (void *)sbuf, 0);
+ if (rc)
+ goto bail;
/* Extract the context from the side buffer */
qp->state = sb->en_sqd_async_notify_state &
CREQ_QUERY_QP_RESP_SB_STATE_MASK;
@@ -976,7 +911,9 @@ int bnxt_qplib_query_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
qp->dest_qpn = le32_to_cpu(sb->dest_qp_id);
memcpy(qp->smac, sb->src_mac, 6);
qp->vlan_id = le16_to_cpu(sb->vlan_pcp_vlan_dei_vlan_id);
- return 0;
+bail:
+ bnxt_qplib_rcfw_free_sbuf(rcfw, sbuf);
+ return rc;
}
static void __clean_cq(struct bnxt_qplib_cq *cq, u64 qp)
@@ -1021,34 +958,18 @@ int bnxt_qplib_destroy_qp(struct bnxt_qplib_res *res,
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
struct cmdq_destroy_qp req;
- struct creq_destroy_qp_resp *resp;
+ struct creq_destroy_qp_resp resp;
unsigned long flags;
u16 cmd_flags = 0;
+ int rc;
RCFW_CMD_PREP(req, DESTROY_QP, cmd_flags);
req.qp_cid = cpu_to_le32(qp->id);
- resp = (struct creq_destroy_qp_resp *)
- bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
- NULL, 0);
- if (!resp) {
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: DESTROY_QP send failed");
- return -EINVAL;
- }
- if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) {
- /* Cmd timed out */
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: DESTROY_QP timed out");
- return -ETIMEDOUT;
- }
- if (resp->status ||
- le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: DESTROY_QP failed ");
- dev_err(&rcfw->pdev->dev,
- "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
- resp->status, le16_to_cpu(req.cookie),
- le16_to_cpu(resp->cookie));
- return -EINVAL;
- }
+ rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+ (void *)&resp, NULL, 0);
+ if (rc)
+ return rc;
/* Must walk the associated CQs to nullified the QP ptr */
spin_lock_irqsave(&qp->scq->hwq.lock, flags);
@@ -1162,8 +1083,12 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
rc = -EINVAL;
goto done;
}
- if (HWQ_CMP((sq->hwq.prod + 1), &sq->hwq) ==
- HWQ_CMP(sq->hwq.cons, &sq->hwq)) {
+
+ if (bnxt_qplib_queue_full(sq)) {
+ dev_err(&sq->hwq.pdev->dev,
+ "QPLIB: prod = %#x cons = %#x qdepth = %#x delta = %#x",
+ sq->hwq.prod, sq->hwq.cons, sq->hwq.max_elements,
+ sq->q_full_delta);
rc = -ENOMEM;
goto done;
}
@@ -1373,6 +1298,9 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
}
sq->hwq.prod++;
+
+ qp->wqe_cnt++;
+
done:
return rc;
}
@@ -1411,8 +1339,7 @@ int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp,
rc = -EINVAL;
goto done;
}
- if (HWQ_CMP((rq->hwq.prod + 1), &rq->hwq) ==
- HWQ_CMP(rq->hwq.cons, &rq->hwq)) {
+ if (bnxt_qplib_queue_full(rq)) {
dev_err(&rq->hwq.pdev->dev,
"QPLIB: FP: QP (0x%x) RQ is full!", qp->id);
rc = -EINVAL;
@@ -1483,7 +1410,7 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
struct cmdq_create_cq req;
- struct creq_create_cq_resp *resp;
+ struct creq_create_cq_resp resp;
struct bnxt_qplib_pbl *pbl;
u16 cmd_flags = 0;
int rc;
@@ -1525,30 +1452,12 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
(cq->cnq_hw_ring_id & CMDQ_CREATE_CQ_CNQ_ID_MASK) <<
CMDQ_CREATE_CQ_CNQ_ID_SFT);
- resp = (struct creq_create_cq_resp *)
- bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
- NULL, 0);
- if (!resp) {
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: CREATE_CQ send failed");
- return -EINVAL;
- }
- if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) {
- /* Cmd timed out */
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: CREATE_CQ timed out");
- rc = -ETIMEDOUT;
- goto fail;
- }
- if (resp->status ||
- le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: CREATE_CQ failed ");
- dev_err(&rcfw->pdev->dev,
- "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
- resp->status, le16_to_cpu(req.cookie),
- le16_to_cpu(resp->cookie));
- rc = -EINVAL;
+ rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+ (void *)&resp, NULL, 0);
+ if (rc)
goto fail;
- }
- cq->id = le32_to_cpu(resp->xid);
+
+ cq->id = le32_to_cpu(resp.xid);
cq->dbr_base = res->dpi_tbl.dbr_bar_reg_iomem;
cq->period = BNXT_QPLIB_QUEUE_START_PERIOD;
init_waitqueue_head(&cq->waitq);
@@ -1566,33 +1475,17 @@ int bnxt_qplib_destroy_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
struct cmdq_destroy_cq req;
- struct creq_destroy_cq_resp *resp;
+ struct creq_destroy_cq_resp resp;
u16 cmd_flags = 0;
+ int rc;
RCFW_CMD_PREP(req, DESTROY_CQ, cmd_flags);
req.cq_cid = cpu_to_le32(cq->id);
- resp = (struct creq_destroy_cq_resp *)
- bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
- NULL, 0);
- if (!resp) {
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: DESTROY_CQ send failed");
- return -EINVAL;
- }
- if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) {
- /* Cmd timed out */
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: DESTROY_CQ timed out");
- return -ETIMEDOUT;
- }
- if (resp->status ||
- le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
- dev_err(&rcfw->pdev->dev, "QPLIB: FP: DESTROY_CQ failed ");
- dev_err(&rcfw->pdev->dev,
- "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
- resp->status, le16_to_cpu(req.cookie),
- le16_to_cpu(resp->cookie));
- return -EINVAL;
- }
+ rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+ (void *)&resp, NULL, 0);
+ if (rc)
+ return rc;
bnxt_qplib_free_hwq(res->pdev, &cq->hwq);
return 0;
}
@@ -1664,14 +1557,113 @@ static int __flush_rq(struct bnxt_qplib_q *rq, struct bnxt_qplib_qp *qp,
return rc;
}
+/* Note: SQE is valid from sw_sq_cons up to cqe_sq_cons (exclusive)
+ * CQEs are tracked from sw_cq_cons to max_element but are valid only if VALID=1
+ */
+static int do_wa9060(struct bnxt_qplib_qp *qp, struct bnxt_qplib_cq *cq,
+ u32 cq_cons, u32 sw_sq_cons, u32 cqe_sq_cons)
+{
+ struct bnxt_qplib_q *sq = &qp->sq;
+ struct bnxt_qplib_swq *swq;
+ u32 peek_sw_cq_cons, peek_raw_cq_cons, peek_sq_cons_idx;
+ struct cq_base *peek_hwcqe, **peek_hw_cqe_ptr;
+ struct cq_req *peek_req_hwcqe;
+ struct bnxt_qplib_qp *peek_qp;
+ struct bnxt_qplib_q *peek_sq;
+ int i, rc = 0;
+
+ /* Normal mode */
+ /* Check for the psn_search marking before completing */
+ swq = &sq->swq[sw_sq_cons];
+ if (swq->psn_search &&
+ le32_to_cpu(swq->psn_search->flags_next_psn) & 0x80000000) {
+ /* Unmark */
+ swq->psn_search->flags_next_psn = cpu_to_le32
+ (le32_to_cpu(swq->psn_search->flags_next_psn)
+ & ~0x80000000);
+ dev_dbg(&cq->hwq.pdev->dev,
+ "FP: Process Req cq_cons=0x%x qp=0x%x sq cons sw=0x%x cqe=0x%x marked!\n",
+ cq_cons, qp->id, sw_sq_cons, cqe_sq_cons);
+ sq->condition = true;
+ sq->send_phantom = true;
+
+ /* TODO: Only ARM if the previous SQE is ARMALL */
+ bnxt_qplib_arm_cq(cq, DBR_DBR_TYPE_CQ_ARMALL);
+
+ rc = -EAGAIN;
+ goto out;
+ }
+ if (sq->condition) {
+ /* Peek at the completions */
+ peek_raw_cq_cons = cq->hwq.cons;
+ peek_sw_cq_cons = cq_cons;
+ i = cq->hwq.max_elements;
+ while (i--) {
+ peek_sw_cq_cons = HWQ_CMP((peek_sw_cq_cons), &cq->hwq);
+ peek_hw_cqe_ptr = (struct cq_base **)cq->hwq.pbl_ptr;
+ peek_hwcqe = &peek_hw_cqe_ptr[CQE_PG(peek_sw_cq_cons)]
+ [CQE_IDX(peek_sw_cq_cons)];
+ /* If the next hwcqe is VALID */
+ if (CQE_CMP_VALID(peek_hwcqe, peek_raw_cq_cons,
+ cq->hwq.max_elements)) {
+ /* If the next hwcqe is a REQ */
+ if ((peek_hwcqe->cqe_type_toggle &
+ CQ_BASE_CQE_TYPE_MASK) ==
+ CQ_BASE_CQE_TYPE_REQ) {
+ peek_req_hwcqe = (struct cq_req *)
+ peek_hwcqe;
+ peek_qp = (struct bnxt_qplib_qp *)
+ ((unsigned long)
+ le64_to_cpu
+ (peek_req_hwcqe->qp_handle));
+ peek_sq = &peek_qp->sq;
+ peek_sq_cons_idx = HWQ_CMP(le16_to_cpu(
+ peek_req_hwcqe->sq_cons_idx) - 1
+ , &sq->hwq);
+ /* If the hwcqe's sq's wr_id matches */
+ if (peek_sq == sq &&
+ sq->swq[peek_sq_cons_idx].wr_id ==
+ BNXT_QPLIB_FENCE_WRID) {
+ /*
+ * Unbreak only if the phantom
+ * comes back
+ */
+ dev_dbg(&cq->hwq.pdev->dev,
+ "FP:Got Phantom CQE");
+ sq->condition = false;
+ sq->single = true;
+ rc = 0;
+ goto out;
+ }
+ }
+ /* Valid but not the phantom, so keep looping */
+ } else {
+ /* Not valid yet, just exit and wait */
+ rc = -EINVAL;
+ goto out;
+ }
+ peek_sw_cq_cons++;
+ peek_raw_cq_cons++;
+ }
+ dev_err(&cq->hwq.pdev->dev,
+ "Should not have come here! cq_cons=0x%x qp=0x%x sq cons sw=0x%x hw=0x%x",
+ cq_cons, qp->id, sw_sq_cons, cqe_sq_cons);
+ rc = -EINVAL;
+ }
+out:
+ return rc;
+}
+
static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq,
struct cq_req *hwcqe,
- struct bnxt_qplib_cqe **pcqe, int *budget)
+ struct bnxt_qplib_cqe **pcqe, int *budget,
+ u32 cq_cons, struct bnxt_qplib_qp **lib_qp)
{
struct bnxt_qplib_qp *qp;
struct bnxt_qplib_q *sq;
struct bnxt_qplib_cqe *cqe;
- u32 sw_cons, cqe_cons;
+ u32 sw_sq_cons, cqe_sq_cons;
+ struct bnxt_qplib_swq *swq;
int rc = 0;
qp = (struct bnxt_qplib_qp *)((unsigned long)
@@ -1683,13 +1675,13 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq,
}
sq = &qp->sq;
- cqe_cons = HWQ_CMP(le16_to_cpu(hwcqe->sq_cons_idx), &sq->hwq);
- if (cqe_cons > sq->hwq.max_elements) {
+ cqe_sq_cons = HWQ_CMP(le16_to_cpu(hwcqe->sq_cons_idx), &sq->hwq);
+ if (cqe_sq_cons > sq->hwq.max_elements) {
dev_err(&cq->hwq.pdev->dev,
"QPLIB: FP: CQ Process req reported ");
dev_err(&cq->hwq.pdev->dev,
"QPLIB: sq_cons_idx 0x%x which exceeded max 0x%x",
- cqe_cons, sq->hwq.max_elements);
+ cqe_sq_cons, sq->hwq.max_elements);
return -EINVAL;
}
/* If we were in the middle of flushing the SQ, continue */
@@ -1698,53 +1690,74 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq,
/* Require to walk the sq's swq to fabricate CQEs for all previously
* signaled SWQEs due to CQE aggregation from the current sq cons
- * to the cqe_cons
+ * to the cqe_sq_cons
*/
cqe = *pcqe;
while (*budget) {
- sw_cons = HWQ_CMP(sq->hwq.cons, &sq->hwq);
- if (sw_cons == cqe_cons)
+ sw_sq_cons = HWQ_CMP(sq->hwq.cons, &sq->hwq);
+ if (sw_sq_cons == cqe_sq_cons)
+ /* Done */
break;
+
+ swq = &sq->swq[sw_sq_cons];
memset(cqe, 0, sizeof(*cqe));
cqe->opcode = CQ_BASE_CQE_TYPE_REQ;
cqe->qp_handle = (u64)(unsigned long)qp;
cqe->src_qp = qp->id;
- cqe->wr_id = sq->swq[sw_cons].wr_id;
- cqe->type = sq->swq[sw_cons].type;
+ cqe->wr_id = swq->wr_id;
+ if (cqe->wr_id == BNXT_QPLIB_FENCE_WRID)
+ goto skip;
+ cqe->type = swq->type;
/* For the last CQE, check for status. For errors, regardless
* of the request being signaled or not, it must complete with
* the hwcqe error status
*/
- if (HWQ_CMP((sw_cons + 1), &sq->hwq) == cqe_cons &&
+ if (HWQ_CMP((sw_sq_cons + 1), &sq->hwq) == cqe_sq_cons &&
hwcqe->status != CQ_REQ_STATUS_OK) {
cqe->status = hwcqe->status;
dev_err(&cq->hwq.pdev->dev,
"QPLIB: FP: CQ Processed Req ");
dev_err(&cq->hwq.pdev->dev,
"QPLIB: wr_id[%d] = 0x%llx with status 0x%x",
- sw_cons, cqe->wr_id, cqe->status);
+ sw_sq_cons, cqe->wr_id, cqe->status);
cqe++;
(*budget)--;
sq->flush_in_progress = true;
/* Must block new posting of SQ and RQ */
qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR;
+ sq->condition = false;
+ sq->single = false;
} else {
- if (sq->swq[sw_cons].flags &
- SQ_SEND_FLAGS_SIGNAL_COMP) {
+ if (swq->flags & SQ_SEND_FLAGS_SIGNAL_COMP) {
+ /* Before we complete, do WA 9060 */
+ if (do_wa9060(qp, cq, cq_cons, sw_sq_cons,
+ cqe_sq_cons)) {
+ *lib_qp = qp;
+ goto out;
+ }
cqe->status = CQ_REQ_STATUS_OK;
cqe++;
(*budget)--;
}
}
+skip:
sq->hwq.cons++;
+ if (sq->single)
+ break;
}
+out:
*pcqe = cqe;
- if (!*budget && HWQ_CMP(sq->hwq.cons, &sq->hwq) != cqe_cons) {
+ if (HWQ_CMP(sq->hwq.cons, &sq->hwq) != cqe_sq_cons) {
/* Out of budget */
rc = -EAGAIN;
goto done;
}
+ /*
+ * Return to normal completion mode only after all of the
+ * WCs for this CQE have been completed
+ */
+ sq->single = false;
if (!sq->flush_in_progress)
goto done;
flush:
@@ -2074,7 +2087,7 @@ static int bnxt_qplib_cq_process_cutoff(struct bnxt_qplib_cq *cq,
}
int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe,
- int num_cqes)
+ int num_cqes, struct bnxt_qplib_qp **lib_qp)
{
struct cq_base *hw_cqe, **hw_cqe_ptr;
unsigned long flags;
@@ -2099,7 +2112,8 @@ int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe,
case CQ_BASE_CQE_TYPE_REQ:
rc = bnxt_qplib_cq_process_req(cq,
(struct cq_req *)hw_cqe,
- &cqe, &budget);
+ &cqe, &budget,
+ sw_cons, lib_qp);
break;
case CQ_BASE_CQE_TYPE_RES_RC:
rc = bnxt_qplib_cq_process_res_rc(cq,
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
index f0150f8da1e3..36b7b7db0e3f 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
@@ -88,6 +88,7 @@ struct bnxt_qplib_swq {
struct bnxt_qplib_swqe {
/* General */
+#define BNXT_QPLIB_FENCE_WRID 0x46454E43 /* "FENC" */
u64 wr_id;
u8 reqs_type;
u8 type;
@@ -216,9 +217,16 @@ struct bnxt_qplib_q {
struct scatterlist *sglist;
u32 nmap;
u32 max_wqe;
+ u16 q_full_delta;
u16 max_sge;
u32 psn;
bool flush_in_progress;
+ bool condition;
+ bool single;
+ bool send_phantom;
+ u32 phantom_wqe_cnt;
+ u32 phantom_cqe_cnt;
+ u32 next_cq_cons;
};
struct bnxt_qplib_qp {
@@ -242,6 +250,7 @@ struct bnxt_qplib_qp {
u8 timeout;
u8 retry_cnt;
u8 rnr_retry;
+ u64 wqe_cnt;
u32 min_rnr_timer;
u32 max_rd_atomic;
u32 max_dest_rd_atomic;
@@ -301,6 +310,13 @@ struct bnxt_qplib_qp {
(!!((hdr)->cqe_type_toggle & CQ_BASE_TOGGLE) == \
!((raw_cons) & (cp_bit)))
+static inline bool bnxt_qplib_queue_full(struct bnxt_qplib_q *qplib_q)
+{
+ return HWQ_CMP((qplib_q->hwq.prod + qplib_q->q_full_delta),
+ &qplib_q->hwq) == HWQ_CMP(qplib_q->hwq.cons,
+ &qplib_q->hwq);
+}
+
struct bnxt_qplib_cqe {
u8 status;
u8 type;
@@ -432,7 +448,7 @@ int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp,
int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq);
int bnxt_qplib_destroy_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq);
int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe,
- int num);
+ int num, struct bnxt_qplib_qp **qp);
void bnxt_qplib_req_notify_cq(struct bnxt_qplib_cq *cq, u32 arm_type);
void bnxt_qplib_free_nq(struct bnxt_qplib_nq *nq);
int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq);
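
Editor's note: bnxt_qplib_queue_full() above, and the HWQ_FREE_SLOTS() macro added in qplib_res.h below, both rely on the queues being power-of-two sized, so an index wraps with a simple mask and occupancy is (prod - cons) & (size - 1). The q_full_delta reserve means the queue reports full while some slots are still physically free. A standalone sketch of that arithmetic; the size and delta are made up for the example.

#include <stdint.h>
#include <stdio.h>

#define Q_SIZE        16u           /* max_elements: must be a power of two */
#define Q_MASK        (Q_SIZE - 1)
#define Q_FULL_DELTA  2u            /* slots held back, like q_full_delta   */

static uint32_t hwq_cmp(uint32_t idx) { return idx & Q_MASK; }

static uint32_t free_slots(uint32_t prod, uint32_t cons)
{
    /* HWQ_FREE_SLOTS(): capacity minus current occupancy */
    return Q_SIZE - ((hwq_cmp(prod) - hwq_cmp(cons)) & Q_MASK);
}

static int queue_full(uint32_t prod, uint32_t cons)
{
    /* bnxt_qplib_queue_full(): full once prod + delta wraps onto cons */
    return hwq_cmp(prod + Q_FULL_DELTA) == hwq_cmp(cons);
}

int main(void)
{
    uint32_t prod = 14, cons = 0;

    /* prints "free=2 full=1": the delta reserves the last two slots */
    printf("free=%u full=%d\n", free_slots(prod, cons), queue_full(prod, cons));
    return 0;
}

This reserve presumably pairs with the BNXT_QPLIB_RESERVED_QP_WRS adjustment in qplib_sp.c further down: the ULP is told about fewer usable WQEs so the producer never treads on slots the hardware still needs.
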
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
index 23fb7260662b..16e42754dbec 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
@@ -39,72 +39,55 @@
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/prefetch.h>
+#include <linux/delay.h>
+
#include "roce_hsi.h"
#include "qplib_res.h"
#include "qplib_rcfw.h"
static void bnxt_qplib_service_creq(unsigned long data);
/* Hardware communication channel */
-int bnxt_qplib_rcfw_wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie)
+static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie)
{
u16 cbit;
int rc;
- cookie &= RCFW_MAX_COOKIE_VALUE;
cbit = cookie % RCFW_MAX_OUTSTANDING_CMD;
- if (!test_bit(cbit, rcfw->cmdq_bitmap))
- dev_warn(&rcfw->pdev->dev,
- "QPLIB: CMD bit %d for cookie 0x%x is not set?",
- cbit, cookie);
-
rc = wait_event_timeout(rcfw->waitq,
!test_bit(cbit, rcfw->cmdq_bitmap),
msecs_to_jiffies(RCFW_CMD_WAIT_TIME_MS));
- if (!rc) {
- dev_warn(&rcfw->pdev->dev,
- "QPLIB: Bono Error: timeout %d msec, msg {0x%x}\n",
- RCFW_CMD_WAIT_TIME_MS, cookie);
- }
-
- return rc;
+ return rc ? 0 : -ETIMEDOUT;
};
-int bnxt_qplib_rcfw_block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie)
+static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie)
{
- u32 count = -1;
+ u32 count = RCFW_BLOCKED_CMD_WAIT_COUNT;
u16 cbit;
- cookie &= RCFW_MAX_COOKIE_VALUE;
cbit = cookie % RCFW_MAX_OUTSTANDING_CMD;
if (!test_bit(cbit, rcfw->cmdq_bitmap))
goto done;
do {
+ mdelay(1); /* 1 msec */
bnxt_qplib_service_creq((unsigned long)rcfw);
} while (test_bit(cbit, rcfw->cmdq_bitmap) && --count);
done:
- return count;
+ return count ? 0 : -ETIMEDOUT;
};
-void *bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw,
- struct cmdq_base *req, void **crsbe,
- u8 is_block)
+static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req,
+ struct creq_base *resp, void *sb, u8 is_block)
{
- struct bnxt_qplib_crsq *crsq = &rcfw->crsq;
struct bnxt_qplib_cmdqe *cmdqe, **cmdq_ptr;
struct bnxt_qplib_hwq *cmdq = &rcfw->cmdq;
- struct bnxt_qplib_hwq *crsb = &rcfw->crsb;
- struct bnxt_qplib_crsqe *crsqe = NULL;
- struct bnxt_qplib_crsbe **crsb_ptr;
+ struct bnxt_qplib_crsq *crsqe;
u32 sw_prod, cmdq_prod;
- u8 retry_cnt = 0xFF;
- dma_addr_t dma_addr;
unsigned long flags;
u32 size, opcode;
u16 cookie, cbit;
int pg, idx;
u8 *preq;
-retry:
opcode = req->opcode;
if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags) &&
(opcode != CMDQ_BASE_OPCODE_QUERY_FUNC &&
@@ -112,63 +95,50 @@ retry:
dev_err(&rcfw->pdev->dev,
"QPLIB: RCFW not initialized, reject opcode 0x%x",
opcode);
- return NULL;
+ return -EINVAL;
}
if (test_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags) &&
opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) {
dev_err(&rcfw->pdev->dev, "QPLIB: RCFW already initialized!");
- return NULL;
+ return -EINVAL;
}
/* Cmdq are in 16-byte units, each request can consume 1 or more
* cmdqe
*/
spin_lock_irqsave(&cmdq->lock, flags);
- if (req->cmd_size > cmdq->max_elements -
- ((HWQ_CMP(cmdq->prod, cmdq) - HWQ_CMP(cmdq->cons, cmdq)) &
- (cmdq->max_elements - 1))) {
+ if (req->cmd_size >= HWQ_FREE_SLOTS(cmdq)) {
dev_err(&rcfw->pdev->dev, "QPLIB: RCFW: CMDQ is full!");
spin_unlock_irqrestore(&cmdq->lock, flags);
-
- if (!retry_cnt--)
- return NULL;
- goto retry;
+ return -EAGAIN;
}
- retry_cnt = 0xFF;
- cookie = atomic_inc_return(&rcfw->seq_num) & RCFW_MAX_COOKIE_VALUE;
+ cookie = rcfw->seq_num & RCFW_MAX_COOKIE_VALUE;
cbit = cookie % RCFW_MAX_OUTSTANDING_CMD;
if (is_block)
cookie |= RCFW_CMD_IS_BLOCKING;
+
+ set_bit(cbit, rcfw->cmdq_bitmap);
req->cookie = cpu_to_le16(cookie);
- if (test_and_set_bit(cbit, rcfw->cmdq_bitmap)) {
- dev_err(&rcfw->pdev->dev,
- "QPLIB: RCFW MAX outstanding cmd reached!");
- atomic_dec(&rcfw->seq_num);
+ crsqe = &rcfw->crsqe_tbl[cbit];
+ if (crsqe->resp) {
spin_unlock_irqrestore(&cmdq->lock, flags);
-
- if (!retry_cnt--)
- return NULL;
- goto retry;
+ return -EBUSY;
}
- /* Reserve a resp buffer slot if requested */
- if (req->resp_size && crsbe) {
- spin_lock(&crsb->lock);
- sw_prod = HWQ_CMP(crsb->prod, crsb);
- crsb_ptr = (struct bnxt_qplib_crsbe **)crsb->pbl_ptr;
- *crsbe = (void *)&crsb_ptr[get_crsb_pg(sw_prod)]
- [get_crsb_idx(sw_prod)];
- bnxt_qplib_crsb_dma_next(crsb->pbl_dma_ptr, sw_prod, &dma_addr);
- req->resp_addr = cpu_to_le64(dma_addr);
- crsb->prod++;
- spin_unlock(&crsb->lock);
-
- req->resp_size = (sizeof(struct bnxt_qplib_crsbe) +
- BNXT_QPLIB_CMDQE_UNITS - 1) /
- BNXT_QPLIB_CMDQE_UNITS;
+ memset(resp, 0, sizeof(*resp));
+ crsqe->resp = (struct creq_qp_event *)resp;
+ crsqe->resp->cookie = req->cookie;
+ crsqe->req_size = req->cmd_size;
+ if (req->resp_size && sb) {
+ struct bnxt_qplib_rcfw_sbuf *sbuf = sb;
+
+ req->resp_addr = cpu_to_le64(sbuf->dma_addr);
+ req->resp_size = (sbuf->size + BNXT_QPLIB_CMDQE_UNITS - 1) /
+ BNXT_QPLIB_CMDQE_UNITS;
}
+
cmdq_ptr = (struct bnxt_qplib_cmdqe **)cmdq->pbl_ptr;
preq = (u8 *)req;
size = req->cmd_size * BNXT_QPLIB_CMDQE_UNITS;
@@ -190,23 +160,24 @@ retry:
preq += min_t(u32, size, sizeof(*cmdqe));
size -= min_t(u32, size, sizeof(*cmdqe));
cmdq->prod++;
+ rcfw->seq_num++;
} while (size > 0);
+ rcfw->seq_num++;
+
cmdq_prod = cmdq->prod;
if (rcfw->flags & FIRMWARE_FIRST_FLAG) {
- /* The very first doorbell write is required to set this flag
- * which prompts the FW to reset its internal pointers
+ /* The very first doorbell write
+ * is required to set this flag
+ * which prompts the FW to reset
+ * its internal pointers
*/
cmdq_prod |= FIRMWARE_FIRST_FLAG;
rcfw->flags &= ~FIRMWARE_FIRST_FLAG;
}
- sw_prod = HWQ_CMP(crsq->prod, crsq);
- crsqe = &crsq->crsq[sw_prod];
- memset(crsqe, 0, sizeof(*crsqe));
- crsq->prod++;
- crsqe->req_size = req->cmd_size;
/* ring CMDQ DB */
+ wmb();
writel(cmdq_prod, rcfw->cmdq_bar_reg_iomem +
rcfw->cmdq_bar_reg_prod_off);
writel(RCFW_CMDQ_TRIG_VAL, rcfw->cmdq_bar_reg_iomem +
@@ -214,9 +185,56 @@ retry:
done:
spin_unlock_irqrestore(&cmdq->lock, flags);
/* Return the CREQ response pointer */
- return crsqe ? &crsqe->qp_event : NULL;
+ return 0;
}
+int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw,
+ struct cmdq_base *req,
+ struct creq_base *resp,
+ void *sb, u8 is_block)
+{
+ struct creq_qp_event *evnt = (struct creq_qp_event *)resp;
+ u16 cookie;
+ u8 opcode, retry_cnt = 0xFF;
+ int rc = 0;
+
+ do {
+ opcode = req->opcode;
+ rc = __send_message(rcfw, req, resp, sb, is_block);
+ cookie = le16_to_cpu(req->cookie) & RCFW_MAX_COOKIE_VALUE;
+ if (!rc)
+ break;
+
+ if (!retry_cnt || (rc != -EAGAIN && rc != -EBUSY)) {
+ /* send failed */
+ dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x send failed",
+ cookie, opcode);
+ return rc;
+ }
+ is_block ? mdelay(1) : usleep_range(500, 1000);
+
+ } while (retry_cnt--);
+
+ if (is_block)
+ rc = __block_for_resp(rcfw, cookie);
+ else
+ rc = __wait_for_resp(rcfw, cookie);
+ if (rc) {
+ /* timed out */
+ dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x timedout (%d)msec",
+ cookie, opcode, RCFW_CMD_WAIT_TIME_MS);
+ return rc;
+ }
+
+ if (evnt->status) {
+ /* failed with status */
+ dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x status %#x",
+ cookie, opcode, evnt->status);
+ rc = -EFAULT;
+ }
+
+ return rc;
+}
/* Completions */
static int bnxt_qplib_process_func_event(struct bnxt_qplib_rcfw *rcfw,
struct creq_func_event *func_event)
@@ -260,12 +278,12 @@ static int bnxt_qplib_process_func_event(struct bnxt_qplib_rcfw *rcfw,
static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw,
struct creq_qp_event *qp_event)
{
- struct bnxt_qplib_crsq *crsq = &rcfw->crsq;
struct bnxt_qplib_hwq *cmdq = &rcfw->cmdq;
- struct bnxt_qplib_crsqe *crsqe;
- u16 cbit, cookie, blocked = 0;
+ struct bnxt_qplib_crsq *crsqe;
unsigned long flags;
- u32 sw_cons;
+ u16 cbit, blocked = 0;
+ u16 cookie;
+ __le16 mcookie;
switch (qp_event->event) {
case CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION:
@@ -275,24 +293,31 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw,
default:
/* Command Response */
spin_lock_irqsave(&cmdq->lock, flags);
- sw_cons = HWQ_CMP(crsq->cons, crsq);
- crsqe = &crsq->crsq[sw_cons];
- crsq->cons++;
- memcpy(&crsqe->qp_event, qp_event, sizeof(crsqe->qp_event));
-
- cookie = le16_to_cpu(crsqe->qp_event.cookie);
+ cookie = le16_to_cpu(qp_event->cookie);
+ mcookie = qp_event->cookie;
blocked = cookie & RCFW_CMD_IS_BLOCKING;
cookie &= RCFW_MAX_COOKIE_VALUE;
cbit = cookie % RCFW_MAX_OUTSTANDING_CMD;
+ crsqe = &rcfw->crsqe_tbl[cbit];
+ if (crsqe->resp &&
+ crsqe->resp->cookie == mcookie) {
+ memcpy(crsqe->resp, qp_event, sizeof(*qp_event));
+ crsqe->resp = NULL;
+ } else {
+ dev_err(&rcfw->pdev->dev,
+ "QPLIB: CMD %s resp->cookie = %#x, evnt->cookie = %#x",
+ crsqe->resp ? "mismatch" : "collision",
+ crsqe->resp ? crsqe->resp->cookie : 0, mcookie);
+ }
if (!test_and_clear_bit(cbit, rcfw->cmdq_bitmap))
dev_warn(&rcfw->pdev->dev,
"QPLIB: CMD bit %d was not requested", cbit);
-
cmdq->cons += crsqe->req_size;
- spin_unlock_irqrestore(&cmdq->lock, flags);
+ crsqe->req_size = 0;
+
if (!blocked)
wake_up(&rcfw->waitq);
- break;
+ spin_unlock_irqrestore(&cmdq->lock, flags);
}
return 0;
}
@@ -305,12 +330,12 @@ static void bnxt_qplib_service_creq(unsigned long data)
struct creq_base *creqe, **creq_ptr;
u32 sw_cons, raw_cons;
unsigned long flags;
- u32 type;
+ u32 type, budget = CREQ_ENTRY_POLL_BUDGET;
- /* Service the CREQ until empty */
+ /* Service the CREQ until budget is over */
spin_lock_irqsave(&creq->lock, flags);
raw_cons = creq->cons;
- while (1) {
+ while (budget > 0) {
sw_cons = HWQ_CMP(raw_cons, creq);
creq_ptr = (struct creq_base **)creq->pbl_ptr;
creqe = &creq_ptr[get_creq_pg(sw_cons)][get_creq_idx(sw_cons)];
@@ -320,15 +345,9 @@ static void bnxt_qplib_service_creq(unsigned long data)
type = creqe->type & CREQ_BASE_TYPE_MASK;
switch (type) {
case CREQ_BASE_TYPE_QP_EVENT:
- if (!bnxt_qplib_process_qp_event
- (rcfw, (struct creq_qp_event *)creqe))
- rcfw->creq_qp_event_processed++;
- else {
- dev_warn(&rcfw->pdev->dev, "QPLIB: crsqe with");
- dev_warn(&rcfw->pdev->dev,
- "QPLIB: type = 0x%x not handled",
- type);
- }
+ bnxt_qplib_process_qp_event
+ (rcfw, (struct creq_qp_event *)creqe);
+ rcfw->creq_qp_event_processed++;
break;
case CREQ_BASE_TYPE_FUNC_EVENT:
if (!bnxt_qplib_process_func_event
@@ -346,7 +365,9 @@ static void bnxt_qplib_service_creq(unsigned long data)
break;
}
raw_cons++;
+ budget--;
}
+
if (creq->cons != raw_cons) {
creq->cons = raw_cons;
CREQ_DB_REARM(rcfw->creq_bar_reg_iomem, raw_cons,
@@ -375,23 +396,16 @@ static irqreturn_t bnxt_qplib_creq_irq(int irq, void *dev_instance)
/* RCFW */
int bnxt_qplib_deinit_rcfw(struct bnxt_qplib_rcfw *rcfw)
{
- struct creq_deinitialize_fw_resp *resp;
struct cmdq_deinitialize_fw req;
+ struct creq_deinitialize_fw_resp resp;
u16 cmd_flags = 0;
+ int rc;
RCFW_CMD_PREP(req, DEINITIALIZE_FW, cmd_flags);
- resp = (struct creq_deinitialize_fw_resp *)
- bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
- NULL, 0);
- if (!resp)
- return -EINVAL;
-
- if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie)))
- return -ETIMEDOUT;
-
- if (resp->status ||
- le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie))
- return -EFAULT;
+ rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp,
+ NULL, 0);
+ if (rc)
+ return rc;
clear_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags);
return 0;
@@ -417,9 +431,10 @@ static int __get_pbl_pg_idx(struct bnxt_qplib_pbl *pbl)
int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw,
struct bnxt_qplib_ctx *ctx, int is_virtfn)
{
- struct creq_initialize_fw_resp *resp;
struct cmdq_initialize_fw req;
+ struct creq_initialize_fw_resp resp;
u16 cmd_flags = 0, level;
+ int rc;
RCFW_CMD_PREP(req, INITIALIZE_FW, cmd_flags);
@@ -482,37 +497,19 @@ int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw,
skip_ctx_setup:
req.stat_ctx_id = cpu_to_le32(ctx->stats.fw_id);
- resp = (struct creq_initialize_fw_resp *)
- bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
- NULL, 0);
- if (!resp) {
- dev_err(&rcfw->pdev->dev,
- "QPLIB: RCFW: INITIALIZE_FW send failed");
- return -EINVAL;
- }
- if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) {
- /* Cmd timed out */
- dev_err(&rcfw->pdev->dev,
- "QPLIB: RCFW: INITIALIZE_FW timed out");
- return -ETIMEDOUT;
- }
- if (resp->status ||
- le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
- dev_err(&rcfw->pdev->dev,
- "QPLIB: RCFW: INITIALIZE_FW failed");
- return -EINVAL;
- }
+ rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp,
+ NULL, 0);
+ if (rc)
+ return rc;
set_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags);
return 0;
}
void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw)
{
- bnxt_qplib_free_hwq(rcfw->pdev, &rcfw->crsb);
- kfree(rcfw->crsq.crsq);
+ kfree(rcfw->crsqe_tbl);
bnxt_qplib_free_hwq(rcfw->pdev, &rcfw->cmdq);
bnxt_qplib_free_hwq(rcfw->pdev, &rcfw->creq);
-
rcfw->pdev = NULL;
}
@@ -539,21 +536,11 @@ int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
goto fail;
}
- rcfw->crsq.max_elements = rcfw->cmdq.max_elements;
- rcfw->crsq.crsq = kcalloc(rcfw->crsq.max_elements,
- sizeof(*rcfw->crsq.crsq), GFP_KERNEL);
- if (!rcfw->crsq.crsq)
+ rcfw->crsqe_tbl = kcalloc(rcfw->cmdq.max_elements,
+ sizeof(*rcfw->crsqe_tbl), GFP_KERNEL);
+ if (!rcfw->crsqe_tbl)
goto fail;
- rcfw->crsb.max_elements = BNXT_QPLIB_CRSBE_MAX_CNT;
- if (bnxt_qplib_alloc_init_hwq(rcfw->pdev, &rcfw->crsb, NULL, 0,
- &rcfw->crsb.max_elements,
- BNXT_QPLIB_CRSBE_UNITS, 0, PAGE_SIZE,
- HWQ_TYPE_CTX)) {
- dev_err(&rcfw->pdev->dev,
- "QPLIB: HW channel CRSB allocation failed");
- goto fail;
- }
return 0;
fail:
@@ -606,7 +593,7 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
int rc;
/* General */
- atomic_set(&rcfw->seq_num, 0);
+ rcfw->seq_num = 0;
rcfw->flags = FIRMWARE_FIRST_FLAG;
bmap_size = BITS_TO_LONGS(RCFW_MAX_OUTSTANDING_CMD *
sizeof(unsigned long));
@@ -636,10 +623,6 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
rcfw->cmdq_bar_reg_trig_off = RCFW_COMM_TRIG_OFFSET;
- /* CRSQ */
- rcfw->crsq.prod = 0;
- rcfw->crsq.cons = 0;
-
/* CREQ */
rcfw->creq_bar_reg = RCFW_COMM_CONS_PCI_BAR_REGION;
res_base = pci_resource_start(pdev, rcfw->creq_bar_reg);
@@ -692,3 +675,34 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
__iowrite32_copy(rcfw->cmdq_bar_reg_iomem, &init, sizeof(init) / 4);
return 0;
}
+
+struct bnxt_qplib_rcfw_sbuf *bnxt_qplib_rcfw_alloc_sbuf(
+ struct bnxt_qplib_rcfw *rcfw,
+ u32 size)
+{
+ struct bnxt_qplib_rcfw_sbuf *sbuf;
+
+ sbuf = kzalloc(sizeof(*sbuf), GFP_ATOMIC);
+ if (!sbuf)
+ return NULL;
+
+ sbuf->size = size;
+ sbuf->sb = dma_zalloc_coherent(&rcfw->pdev->dev, sbuf->size,
+ &sbuf->dma_addr, GFP_ATOMIC);
+ if (!sbuf->sb)
+ goto bail;
+
+ return sbuf;
+bail:
+ kfree(sbuf);
+ return NULL;
+}
+
+void bnxt_qplib_rcfw_free_sbuf(struct bnxt_qplib_rcfw *rcfw,
+ struct bnxt_qplib_rcfw_sbuf *sbuf)
+{
+ if (sbuf->sb)
+ dma_free_coherent(&rcfw->pdev->dev, sbuf->size,
+ sbuf->sb, sbuf->dma_addr);
+ kfree(sbuf);
+}
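
Editor's note: bnxt_qplib_rcfw_send_message() now wraps __send_message() in a bounded retry loop: -EAGAIN (CMDQ full) and -EBUSY (cookie slot still owned by an earlier command) are retried after a short delay, any other error returns immediately, and only a successful post proceeds to the blocking poll or wait_event_timeout. The sketch below reproduces just that retry policy as a standalone program; the stubbed submit function and the empty backoff are placeholders, not the driver's real code path.

#include <errno.h>
#include <stdio.h>

/* Stand-in for __send_message(): fails twice with -EAGAIN, then succeeds. */
static int fake_send(void)
{
    static int attempts;

    return (attempts++ < 2) ? -EAGAIN : 0;
}

/* Stand-in for mdelay()/usleep_range(); a real caller would sleep here. */
static void backoff(void) { }

static int send_with_retry(void)
{
    unsigned int retry_cnt = 0xFF;      /* same bound as the driver */
    int rc;

    do {
        rc = fake_send();
        if (!rc)
            break;                      /* posted, go wait for the response */
        if (!retry_cnt || (rc != -EAGAIN && rc != -EBUSY))
            return rc;                  /* hard failure, give up            */
        backoff();                      /* transient condition, retry       */
    } while (retry_cnt--);

    /* ...here the driver calls __block_for_resp() or __wait_for_resp()... */
    return rc;
}

int main(void)
{
    printf("rc=%d\n", send_with_retry());    /* rc=0 on the third attempt */
    return 0;
}

On the completion side, bnxt_qplib_process_qp_event() copies the CREQ event into the crsqe_tbl slot keyed by cookie % RCFW_MAX_OUTSTANDING_CMD and clears the cmdq_bitmap bit, which is what releases the waiter.
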
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
index d3567d75bf58..09ce121770cd 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
@@ -73,6 +73,7 @@
#define RCFW_MAX_OUTSTANDING_CMD BNXT_QPLIB_CMDQE_MAX_CNT
#define RCFW_MAX_COOKIE_VALUE 0x7FFF
#define RCFW_CMD_IS_BLOCKING 0x8000
+#define RCFW_BLOCKED_CMD_WAIT_COUNT 0x4E20
/* Cmdq contains a fixed number of 16-byte slots */
struct bnxt_qplib_cmdqe {
@@ -94,32 +95,6 @@ struct bnxt_qplib_crsbe {
u8 data[1024];
};
-/* CRSQ SB */
-#define BNXT_QPLIB_CRSBE_MAX_CNT 4
-#define BNXT_QPLIB_CRSBE_UNITS sizeof(struct bnxt_qplib_crsbe)
-#define BNXT_QPLIB_CRSBE_CNT_PER_PG (PAGE_SIZE / BNXT_QPLIB_CRSBE_UNITS)
-
-#define MAX_CRSB_IDX (BNXT_QPLIB_CRSBE_MAX_CNT - 1)
-#define MAX_CRSB_IDX_PER_PG (BNXT_QPLIB_CRSBE_CNT_PER_PG - 1)
-
-static inline u32 get_crsb_pg(u32 val)
-{
- return (val & ~MAX_CRSB_IDX_PER_PG) / BNXT_QPLIB_CRSBE_CNT_PER_PG;
-}
-
-static inline u32 get_crsb_idx(u32 val)
-{
- return val & MAX_CRSB_IDX_PER_PG;
-}
-
-static inline void bnxt_qplib_crsb_dma_next(dma_addr_t *pg_map_arr,
- u32 prod, dma_addr_t *dma_addr)
-{
- *dma_addr = pg_map_arr[(prod) / BNXT_QPLIB_CRSBE_CNT_PER_PG];
- *dma_addr += ((prod) % BNXT_QPLIB_CRSBE_CNT_PER_PG) *
- BNXT_QPLIB_CRSBE_UNITS;
-}
-
/* CREQ */
/* Allocate 1 per QP for async error notification for now */
#define BNXT_QPLIB_CREQE_MAX_CNT (64 * 1024)
@@ -158,17 +133,19 @@ static inline u32 get_creq_idx(u32 val)
#define CREQ_DB(db, raw_cons, cp_bit) \
writel(CREQ_DB_CP_FLAGS | ((raw_cons) & ((cp_bit) - 1)), db)
+#define CREQ_ENTRY_POLL_BUDGET 0x100
+
/* HWQ */
-struct bnxt_qplib_crsqe {
- struct creq_qp_event qp_event;
+
+struct bnxt_qplib_crsq {
+ struct creq_qp_event *resp;
u32 req_size;
};
-struct bnxt_qplib_crsq {
- struct bnxt_qplib_crsqe *crsq;
- u32 prod;
- u32 cons;
- u32 max_elements;
+struct bnxt_qplib_rcfw_sbuf {
+ void *sb;
+ dma_addr_t dma_addr;
+ u32 size;
};
/* RCFW Communication Channels */
@@ -185,7 +162,7 @@ struct bnxt_qplib_rcfw {
wait_queue_head_t waitq;
int (*aeq_handler)(struct bnxt_qplib_rcfw *,
struct creq_func_event *);
- atomic_t seq_num;
+ u32 seq_num;
/* Bar region info */
void __iomem *cmdq_bar_reg_iomem;
@@ -203,8 +180,7 @@ struct bnxt_qplib_rcfw {
/* Actual Cmd and Resp Queues */
struct bnxt_qplib_hwq cmdq;
- struct bnxt_qplib_crsq crsq;
- struct bnxt_qplib_hwq crsb;
+ struct bnxt_qplib_crsq *crsqe_tbl;
};
void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw);
@@ -219,11 +195,14 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
(struct bnxt_qplib_rcfw *,
struct creq_func_event *));
-int bnxt_qplib_rcfw_block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie);
-int bnxt_qplib_rcfw_wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie);
-void *bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw,
- struct cmdq_base *req, void **crsbe,
- u8 is_block);
+struct bnxt_qplib_rcfw_sbuf *bnxt_qplib_rcfw_alloc_sbuf(
+ struct bnxt_qplib_rcfw *rcfw,
+ u32 size);
+void bnxt_qplib_rcfw_free_sbuf(struct bnxt_qplib_rcfw *rcfw,
+ struct bnxt_qplib_rcfw_sbuf *sbuf);
+int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw,
+ struct cmdq_base *req, struct creq_base *resp,
+ void *sbuf, u8 is_block);
int bnxt_qplib_deinit_rcfw(struct bnxt_qplib_rcfw *rcfw);
int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw,
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h
index 6277d802ca4b..2e4855509719 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h
@@ -48,6 +48,10 @@ extern const struct bnxt_qplib_gid bnxt_qplib_gid_zero;
#define HWQ_CMP(idx, hwq) ((idx) & ((hwq)->max_elements - 1))
+#define HWQ_FREE_SLOTS(hwq) (hwq->max_elements - \
+ ((HWQ_CMP(hwq->prod, hwq)\
+ - HWQ_CMP(hwq->cons, hwq))\
+ & (hwq->max_elements - 1)))
enum bnxt_qplib_hwq_type {
HWQ_TYPE_CTX,
HWQ_TYPE_QUEUE,
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
index 7b31eccedf11..fde18cf0e406 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
@@ -55,37 +55,30 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
struct bnxt_qplib_dev_attr *attr)
{
struct cmdq_query_func req;
- struct creq_query_func_resp *resp;
+ struct creq_query_func_resp resp;
+ struct bnxt_qplib_rcfw_sbuf *sbuf;
struct creq_query_func_resp_sb *sb;
u16 cmd_flags = 0;
u32 temp;
u8 *tqm_alloc;
- int i;
+ int i, rc = 0;
RCFW_CMD_PREP(req, QUERY_FUNC, cmd_flags);
- req.resp_size = sizeof(*sb) / BNXT_QPLIB_CMDQE_UNITS;
- resp = (struct creq_query_func_resp *)
- bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void **)&sb,
- 0);
- if (!resp) {
- dev_err(&rcfw->pdev->dev, "QPLIB: SP: QUERY_FUNC send failed");
- return -EINVAL;
- }
- if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) {
- /* Cmd timed out */
- dev_err(&rcfw->pdev->dev, "QPLIB: SP: QUERY_FUNC timed out");
- return -ETIMEDOUT;
- }
- if (resp->status ||
- le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
- dev_err(&rcfw->pdev->dev, "QPLIB: SP: QUERY_FUNC failed ");
+ sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb));
+ if (!sbuf) {
dev_err(&rcfw->pdev->dev,
- "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
- resp->status, le16_to_cpu(req.cookie),
- le16_to_cpu(resp->cookie));
- return -EINVAL;
+ "QPLIB: SP: QUERY_FUNC alloc side buffer failed");
+ return -ENOMEM;
}
+
+ sb = sbuf->sb;
+ req.resp_size = sizeof(*sb) / BNXT_QPLIB_CMDQE_UNITS;
+ rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp,
+ (void *)sbuf, 0);
+ if (rc)
+ goto bail;
+
/* Extract the context from the side buffer */
attr->max_qp = le32_to_cpu(sb->max_qp);
attr->max_qp_rd_atom =
@@ -95,6 +88,11 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
sb->max_qp_init_rd_atom > BNXT_QPLIB_MAX_OUT_RD_ATOM ?
BNXT_QPLIB_MAX_OUT_RD_ATOM : sb->max_qp_init_rd_atom;
attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr);
+ /*
+ * 128 WQEs need to be reserved for the HW (8916). Prevent
+ * reporting the max number
+ */
+ attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS;
attr->max_qp_sges = sb->max_sge;
attr->max_cq = le32_to_cpu(sb->max_cq);
attr->max_cq_wqes = le32_to_cpu(sb->max_cqe);
@@ -130,7 +128,10 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
attr->tqm_alloc_reqs[i * 4 + 2] = *(++tqm_alloc);
attr->tqm_alloc_reqs[i * 4 + 3] = *(++tqm_alloc);
}
- return 0;
+
+bail:
+ bnxt_qplib_rcfw_free_sbuf(rcfw, sbuf);
+ return rc;
}
/* SGID */
@@ -178,8 +179,9 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
/* Remove GID from the SGID table */
if (update) {
struct cmdq_delete_gid req;
- struct creq_delete_gid_resp *resp;
+ struct creq_delete_gid_resp resp;
u16 cmd_flags = 0;
+ int rc;
RCFW_CMD_PREP(req, DELETE_GID, cmd_flags);
if (sgid_tbl->hw_id[index] == 0xFFFF) {
@@ -188,31 +190,10 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
return -EINVAL;
}
req.gid_index = cpu_to_le16(sgid_tbl->hw_id[index]);
- resp = (struct creq_delete_gid_resp *)
- bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, NULL,
- 0);
- if (!resp) {
- dev_err(&res->pdev->dev,
- "QPLIB: SP: DELETE_GID send failed");
- return -EINVAL;
- }
- if (!bnxt_qplib_rcfw_wait_for_resp(rcfw,
- le16_to_cpu(req.cookie))) {
- /* Cmd timed out */
- dev_err(&res->pdev->dev,
- "QPLIB: SP: DELETE_GID timed out");
- return -ETIMEDOUT;
- }
- if (resp->status ||
- le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
- dev_err(&res->pdev->dev,
- "QPLIB: SP: DELETE_GID failed ");
- dev_err(&res->pdev->dev,
- "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
- resp->status, le16_to_cpu(req.cookie),
- le16_to_cpu(resp->cookie));
- return -EINVAL;
- }
+ rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+ (void *)&resp, NULL, 0);
+ if (rc)
+ return rc;
}
memcpy(&sgid_tbl->tbl[index], &bnxt_qplib_gid_zero,
sizeof(bnxt_qplib_gid_zero));
@@ -234,7 +215,7 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
struct bnxt_qplib_res,
sgid_tbl);
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
- int i, free_idx, rc = 0;
+ int i, free_idx;
if (!sgid_tbl) {
dev_err(&res->pdev->dev, "QPLIB: SGID table not allocated");
@@ -266,10 +247,11 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
}
if (update) {
struct cmdq_add_gid req;
- struct creq_add_gid_resp *resp;
+ struct creq_add_gid_resp resp;
u16 cmd_flags = 0;
u32 temp32[4];
u16 temp16[3];
+ int rc;
RCFW_CMD_PREP(req, ADD_GID, cmd_flags);
@@ -290,31 +272,11 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
req.src_mac[1] = cpu_to_be16(temp16[1]);
req.src_mac[2] = cpu_to_be16(temp16[2]);
- resp = (struct creq_add_gid_resp *)
- bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
- NULL, 0);
- if (!resp) {
- dev_err(&res->pdev->dev,
- "QPLIB: SP: ADD_GID send failed");
- return -EINVAL;
- }
- if (!bnxt_qplib_rcfw_wait_for_resp(rcfw,
- le16_to_cpu(req.cookie))) {
- /* Cmd timed out */
- dev_err(&res->pdev->dev,
- "QPIB: SP: ADD_GID timed out");
- return -ETIMEDOUT;
- }
- if (resp->status ||
- le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
- dev_err(&res->pdev->dev, "QPLIB: SP: ADD_GID failed ");
- dev_err(&res->pdev->dev,
- "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
- resp->status, le16_to_cpu(req.cookie),
- le16_to_cpu(resp->cookie));
- return -EINVAL;
- }
- sgid_tbl->hw_id[free_idx] = le32_to_cpu(resp->xid);
+ rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+ (void *)&resp, NULL, 0);
+ if (rc)
+ return rc;
+ sgid_tbl->hw_id[free_idx] = le32_to_cpu(resp.xid);
}
/* Add GID to the sgid_tbl */
memcpy(&sgid_tbl->tbl[free_idx], gid, sizeof(*gid));
@@ -325,7 +287,7 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
*index = free_idx;
/* unlock */
- return rc;
+ return 0;
}
/* pkeys */
@@ -422,10 +384,11 @@ int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah)
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
struct cmdq_create_ah req;
- struct creq_create_ah_resp *resp;
+ struct creq_create_ah_resp resp;
u16 cmd_flags = 0;
u32 temp32[4];
u16 temp16[3];
+ int rc;
RCFW_CMD_PREP(req, CREATE_AH, cmd_flags);
@@ -450,28 +413,12 @@ int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah)
req.dest_mac[1] = cpu_to_le16(temp16[1]);
req.dest_mac[2] = cpu_to_le16(temp16[2]);
- resp = (struct creq_create_ah_resp *)
- bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
- NULL, 1);
- if (!resp) {
- dev_err(&rcfw->pdev->dev, "QPLIB: SP: CREATE_AH send failed");
- return -EINVAL;
- }
- if (!bnxt_qplib_rcfw_block_for_resp(rcfw, le16_to_cpu(req.cookie))) {
- /* Cmd timed out */
- dev_err(&rcfw->pdev->dev, "QPLIB: SP: CREATE_AH timed out");
- return -ETIMEDOUT;
- }
- if (resp->status ||
- le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
- dev_err(&rcfw->pdev->dev, "QPLIB: SP: CREATE_AH failed ");
- dev_err(&rcfw->pdev->dev,
- "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
- resp->status, le16_to_cpu(req.cookie),
- le16_to_cpu(resp->cookie));
- return -EINVAL;
- }
- ah->id = le32_to_cpu(resp->xid);
+ rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp,
+ NULL, 1);
+ if (rc)
+ return rc;
+
+ ah->id = le32_to_cpu(resp.xid);
return 0;
}
@@ -479,35 +426,19 @@ int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah)
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
struct cmdq_destroy_ah req;
- struct creq_destroy_ah_resp *resp;
+ struct creq_destroy_ah_resp resp;
u16 cmd_flags = 0;
+ int rc;
/* Clean up the AH table in the device */
RCFW_CMD_PREP(req, DESTROY_AH, cmd_flags);
req.ah_cid = cpu_to_le32(ah->id);
- resp = (struct creq_destroy_ah_resp *)
- bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
- NULL, 1);
- if (!resp) {
- dev_err(&rcfw->pdev->dev, "QPLIB: SP: DESTROY_AH send failed");
- return -EINVAL;
- }
- if (!bnxt_qplib_rcfw_block_for_resp(rcfw, le16_to_cpu(req.cookie))) {
- /* Cmd timed out */
- dev_err(&rcfw->pdev->dev, "QPLIB: SP: DESTROY_AH timed out");
- return -ETIMEDOUT;
- }
- if (resp->status ||
- le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
- dev_err(&rcfw->pdev->dev, "QPLIB: SP: DESTROY_AH failed ");
- dev_err(&rcfw->pdev->dev,
- "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
- resp->status, le16_to_cpu(req.cookie),
- le16_to_cpu(resp->cookie));
- return -EINVAL;
- }
+ rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp,
+ NULL, 1);
+ if (rc)
+ return rc;
return 0;
}
@@ -516,8 +447,9 @@ int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw)
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
struct cmdq_deallocate_key req;
- struct creq_deallocate_key_resp *resp;
+ struct creq_deallocate_key_resp resp;
u16 cmd_flags = 0;
+ int rc;
if (mrw->lkey == 0xFFFFFFFF) {
dev_info(&res->pdev->dev,
@@ -536,27 +468,11 @@ int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw)
else
req.key = cpu_to_le32(mrw->lkey);
- resp = (struct creq_deallocate_key_resp *)
- bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
- NULL, 0);
- if (!resp) {
- dev_err(&res->pdev->dev, "QPLIB: SP: FREE_MR send failed");
- return -EINVAL;
- }
- if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) {
- /* Cmd timed out */
- dev_err(&res->pdev->dev, "QPLIB: SP: FREE_MR timed out");
- return -ETIMEDOUT;
- }
- if (resp->status ||
- le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
- dev_err(&res->pdev->dev, "QPLIB: SP: FREE_MR failed ");
- dev_err(&res->pdev->dev,
- "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
- resp->status, le16_to_cpu(req.cookie),
- le16_to_cpu(resp->cookie));
- return -EINVAL;
- }
+ rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp,
+ NULL, 0);
+ if (rc)
+ return rc;
+
/* Free the qplib's MRW memory */
if (mrw->hwq.max_elements)
bnxt_qplib_free_hwq(res->pdev, &mrw->hwq);
@@ -568,9 +484,10 @@ int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw)
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
struct cmdq_allocate_mrw req;
- struct creq_allocate_mrw_resp *resp;
+ struct creq_allocate_mrw_resp resp;
u16 cmd_flags = 0;
unsigned long tmp;
+ int rc;
RCFW_CMD_PREP(req, ALLOCATE_MRW, cmd_flags);
@@ -584,33 +501,17 @@ int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw)
tmp = (unsigned long)mrw;
req.mrw_handle = cpu_to_le64(tmp);
- resp = (struct creq_allocate_mrw_resp *)
- bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
- NULL, 0);
- if (!resp) {
- dev_err(&rcfw->pdev->dev, "QPLIB: SP: ALLOC_MRW send failed");
- return -EINVAL;
- }
- if (!bnxt_qplib_rcfw_wait_for_resp(rcfw, le16_to_cpu(req.cookie))) {
- /* Cmd timed out */
- dev_err(&rcfw->pdev->dev, "QPLIB: SP: ALLOC_MRW timed out");
- return -ETIMEDOUT;
- }
- if (resp->status ||
- le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
- dev_err(&rcfw->pdev->dev, "QPLIB: SP: ALLOC_MRW failed ");
- dev_err(&rcfw->pdev->dev,
- "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
- resp->status, le16_to_cpu(req.cookie),
- le16_to_cpu(resp->cookie));
- return -EINVAL;
- }
+ rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+ (void *)&resp, NULL, 0);
+ if (rc)
+ return rc;
+
if ((mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE1) ||
(mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A) ||
(mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B))
- mrw->rkey = le32_to_cpu(resp->xid);
+ mrw->rkey = le32_to_cpu(resp.xid);
else
- mrw->lkey = le32_to_cpu(resp->xid);
+ mrw->lkey = le32_to_cpu(resp.xid);
return 0;
}
@@ -619,40 +520,17 @@ int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw,
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
struct cmdq_deregister_mr req;
- struct creq_deregister_mr_resp *resp;
+ struct creq_deregister_mr_resp resp;
u16 cmd_flags = 0;
int rc;
RCFW_CMD_PREP(req, DEREGISTER_MR, cmd_flags);
req.lkey = cpu_to_le32(mrw->lkey);
- resp = (struct creq_deregister_mr_resp *)
- bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
- NULL, block);
- if (!resp) {
- dev_err(&rcfw->pdev->dev, "QPLIB: SP: DEREG_MR send failed");
- return -EINVAL;
- }
- if (block)
- rc = bnxt_qplib_rcfw_block_for_resp(rcfw,
- le16_to_cpu(req.cookie));
- else
- rc = bnxt_qplib_rcfw_wait_for_resp(rcfw,
- le16_to_cpu(req.cookie));
- if (!rc) {
- /* Cmd timed out */
- dev_err(&res->pdev->dev, "QPLIB: SP: DEREG_MR timed out");
- return -ETIMEDOUT;
- }
- if (resp->status ||
- le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
- dev_err(&rcfw->pdev->dev, "QPLIB: SP: DEREG_MR failed ");
- dev_err(&rcfw->pdev->dev,
- "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
- resp->status, le16_to_cpu(req.cookie),
- le16_to_cpu(resp->cookie));
- return -EINVAL;
- }
+ rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+ (void *)&resp, NULL, block);
+ if (rc)
+ return rc;
/* Free the qplib's MR memory */
if (mrw->hwq.max_elements) {
@@ -669,7 +547,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
struct cmdq_register_mr req;
- struct creq_register_mr_resp *resp;
+ struct creq_register_mr_resp resp;
u16 cmd_flags = 0, level;
int pg_ptrs, pages, i, rc;
dma_addr_t **pbl_ptr;
@@ -730,36 +608,11 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
req.key = cpu_to_le32(mr->lkey);
req.mr_size = cpu_to_le64(mr->total_size);
- resp = (struct creq_register_mr_resp *)
- bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
- NULL, block);
- if (!resp) {
- dev_err(&res->pdev->dev, "SP: REG_MR send failed");
- rc = -EINVAL;
- goto fail;
- }
- if (block)
- rc = bnxt_qplib_rcfw_block_for_resp(rcfw,
- le16_to_cpu(req.cookie));
- else
- rc = bnxt_qplib_rcfw_wait_for_resp(rcfw,
- le16_to_cpu(req.cookie));
- if (!rc) {
- /* Cmd timed out */
- dev_err(&res->pdev->dev, "SP: REG_MR timed out");
- rc = -ETIMEDOUT;
- goto fail;
- }
- if (resp->status ||
- le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
- dev_err(&res->pdev->dev, "QPLIB: SP: REG_MR failed ");
- dev_err(&res->pdev->dev,
- "QPLIB: SP: with status 0x%x cmdq 0x%x resp 0x%x",
- resp->status, le16_to_cpu(req.cookie),
- le16_to_cpu(resp->cookie));
- rc = -EINVAL;
+ rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+ (void *)&resp, NULL, block);
+ if (rc)
goto fail;
- }
+
return 0;
fail:
@@ -804,35 +657,15 @@ int bnxt_qplib_map_tc2cos(struct bnxt_qplib_res *res, u16 *cids)
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
struct cmdq_map_tc_to_cos req;
- struct creq_map_tc_to_cos_resp *resp;
+ struct creq_map_tc_to_cos_resp resp;
u16 cmd_flags = 0;
- int tleft;
+ int rc = 0;
RCFW_CMD_PREP(req, MAP_TC_TO_COS, cmd_flags);
req.cos0 = cpu_to_le16(cids[0]);
req.cos1 = cpu_to_le16(cids[1]);
- resp = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, NULL, 0);
- if (!resp) {
- dev_err(&res->pdev->dev, "QPLIB: SP: MAP_TC2COS send failed");
- return -EINVAL;
- }
-
- tleft = bnxt_qplib_rcfw_block_for_resp(rcfw, le16_to_cpu(req.cookie));
- if (!tleft) {
- dev_err(&res->pdev->dev, "QPLIB: SP: MAP_TC2COS timed out");
- return -ETIMEDOUT;
- }
-
- if (resp->status ||
- le16_to_cpu(resp->cookie) != le16_to_cpu(req.cookie)) {
- dev_err(&res->pdev->dev, "QPLIB: SP: MAP_TC2COS failed ");
- dev_err(&res->pdev->dev,
- "QPLIB: with status 0x%x cmdq 0x%x resp 0x%x",
- resp->status, le16_to_cpu(req.cookie),
- le16_to_cpu(resp->cookie));
- return -EINVAL;
- }
-
+ rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+ (void *)&resp, NULL, 0);
return 0;
}
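
Editor's note: commands whose responses do not fit in the 16-byte CREQ event (QUERY_FUNC here) now go through an explicitly allocated side buffer: allocate it, advertise its DMA address and size in the request, send, parse the returned structure, and free it on every exit path. Below is a minimal host-side sketch of that lifecycle, with malloc() standing in for dma_zalloc_coherent() and a made-up response layout, purely to show the ordering.

#include <stdlib.h>
#include <stdio.h>

struct sbuf {                          /* cf. struct bnxt_qplib_rcfw_sbuf */
    void   *sb;
    size_t  size;
};

struct query_sb { unsigned int max_qp; };  /* hypothetical response layout */

static struct sbuf *sbuf_alloc(size_t size)
{
    struct sbuf *s = calloc(1, sizeof(*s));

    if (!s)
        return NULL;
    s->size = size;
    s->sb = calloc(1, size);           /* driver: dma_zalloc_coherent()   */
    if (!s->sb) {
        free(s);
        return NULL;
    }
    return s;
}

static void sbuf_free(struct sbuf *s)
{
    free(s->sb);                       /* driver: dma_free_coherent()     */
    free(s);
}

/* Stand-in for the firmware filling the side buffer via DMA. */
static int fake_query(struct query_sb *sb)
{
    sb->max_qp = 64;
    return 0;
}

int main(void)
{
    struct sbuf *sbuf = sbuf_alloc(sizeof(struct query_sb));
    int rc;

    if (!sbuf)
        return -1;
    rc = fake_query(sbuf->sb);         /* driver: rcfw_send_message(..., sbuf) */
    if (!rc)
        printf("max_qp=%u\n", ((struct query_sb *)sbuf->sb)->max_qp);
    sbuf_free(sbuf);                   /* released on success and failure alike */
    return rc;
}
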
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h
index 1442a617e968..a543f959098b 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h
@@ -40,6 +40,8 @@
#ifndef __BNXT_QPLIB_SP_H__
#define __BNXT_QPLIB_SP_H__
+#define BNXT_QPLIB_RESERVED_QP_WRS 128
+
struct bnxt_qplib_dev_attr {
char fw_ver[32];
u16 max_sgid;
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index f96a96dbcf1f..ae0b79aeea2e 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -767,7 +767,7 @@ void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev,
kfree(entry);
}
- list_for_each_safe(pos, nxt, &uctx->qpids) {
+ list_for_each_safe(pos, nxt, &uctx->cqids) {
entry = list_entry(pos, struct c4iw_qid_list, entry);
list_del_init(&entry->entry);
kfree(entry);
@@ -880,13 +880,15 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev)
rdev->free_workq = create_singlethread_workqueue("iw_cxgb4_free");
if (!rdev->free_workq) {
err = -ENOMEM;
- goto err_free_status_page;
+ goto err_free_status_page_and_wr_log;
}
rdev->status_page->db_off = 0;
return 0;
-err_free_status_page:
+err_free_status_page_and_wr_log:
+ if (c4iw_wr_log && rdev->wr_log)
+ kfree(rdev->wr_log);
free_page((unsigned long)rdev->status_page);
destroy_ocqp_pool:
c4iw_ocqp_pool_destroy(rdev);
@@ -903,9 +905,11 @@ static void c4iw_rdev_close(struct c4iw_rdev *rdev)
{
destroy_workqueue(rdev->free_workq);
kfree(rdev->wr_log);
+ c4iw_release_dev_ucontext(rdev, &rdev->uctx);
free_page((unsigned long)rdev->status_page);
c4iw_pblpool_destroy(rdev);
c4iw_rqtpool_destroy(rdev);
+ c4iw_ocqp_pool_destroy(rdev);
c4iw_destroy_resource(&rdev->resource);
}
diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c
index a3f18a22f5ed..e0f47cc2effc 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_main.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_main.c
@@ -1939,7 +1939,7 @@ static int i40iw_virtchnl_receive(struct i40e_info *ldev,
bool i40iw_vf_clear_to_send(struct i40iw_sc_dev *dev)
{
struct i40iw_device *iwdev;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
iwdev = dev->back_dev;
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 0c79983c8b1a..9ecc089d4529 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -3692,8 +3692,10 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status;
dev->ib_dev.get_port_immutable = mlx5_port_immutable;
dev->ib_dev.get_dev_fw_str = get_dev_fw_str;
- dev->ib_dev.alloc_rdma_netdev = mlx5_ib_alloc_rdma_netdev;
- dev->ib_dev.free_rdma_netdev = mlx5_ib_free_rdma_netdev;
+ if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads)) {
+ dev->ib_dev.alloc_rdma_netdev = mlx5_ib_alloc_rdma_netdev;
+ dev->ib_dev.free_rdma_netdev = mlx5_ib_free_rdma_netdev;
+ }
if (mlx5_core_is_pf(mdev)) {
dev->ib_dev.get_vf_config = mlx5_ib_get_vf_config;
dev->ib_dev.set_vf_link_state = mlx5_ib_set_vf_link_state;
diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h
index aa08c76a4245..d961f79b317c 100644
--- a/drivers/infiniband/hw/qedr/qedr.h
+++ b/drivers/infiniband/hw/qedr/qedr.h
@@ -58,7 +58,10 @@
#define QEDR_MSG_QP " QP"
#define QEDR_MSG_GSI " GSI"
-#define QEDR_CQ_MAGIC_NUMBER (0x11223344)
+#define QEDR_CQ_MAGIC_NUMBER (0x11223344)
+
+#define FW_PAGE_SIZE (RDMA_RING_PAGE_SIZE)
+#define FW_PAGE_SHIFT (12)
struct qedr_dev;
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index 17685cfea6a2..d6723c365c7f 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -653,14 +653,15 @@ static int qedr_prepare_pbl_tbl(struct qedr_dev *dev,
static void qedr_populate_pbls(struct qedr_dev *dev, struct ib_umem *umem,
struct qedr_pbl *pbl,
- struct qedr_pbl_info *pbl_info)
+ struct qedr_pbl_info *pbl_info, u32 pg_shift)
{
int shift, pg_cnt, pages, pbe_cnt, total_num_pbes = 0;
+ u32 fw_pg_cnt, fw_pg_per_umem_pg;
struct qedr_pbl *pbl_tbl;
struct scatterlist *sg;
struct regpair *pbe;
+ u64 pg_addr;
int entry;
- u32 addr;
if (!pbl_info->num_pbes)
return;
@@ -683,29 +684,35 @@ static void qedr_populate_pbls(struct qedr_dev *dev, struct ib_umem *umem,
shift = umem->page_shift;
+ fw_pg_per_umem_pg = BIT(umem->page_shift - pg_shift);
+
for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
pages = sg_dma_len(sg) >> shift;
+ pg_addr = sg_dma_address(sg);
for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) {
- /* store the page address in pbe */
- pbe->lo = cpu_to_le32(sg_dma_address(sg) +
- (pg_cnt << shift));
- addr = upper_32_bits(sg_dma_address(sg) +
- (pg_cnt << shift));
- pbe->hi = cpu_to_le32(addr);
- pbe_cnt++;
- total_num_pbes++;
- pbe++;
-
- if (total_num_pbes == pbl_info->num_pbes)
- return;
-
- /* If the given pbl is full storing the pbes,
- * move to next pbl.
- */
- if (pbe_cnt == (pbl_info->pbl_size / sizeof(u64))) {
- pbl_tbl++;
- pbe = (struct regpair *)pbl_tbl->va;
- pbe_cnt = 0;
+ for (fw_pg_cnt = 0; fw_pg_cnt < fw_pg_per_umem_pg;) {
+ pbe->lo = cpu_to_le32(pg_addr);
+ pbe->hi = cpu_to_le32(upper_32_bits(pg_addr));
+
+ pg_addr += BIT(pg_shift);
+ pbe_cnt++;
+ total_num_pbes++;
+ pbe++;
+
+ if (total_num_pbes == pbl_info->num_pbes)
+ return;
+
+ /* If the given pbl is full storing the pbes,
+ * move to next pbl.
+ */
+ if (pbe_cnt ==
+ (pbl_info->pbl_size / sizeof(u64))) {
+ pbl_tbl++;
+ pbe = (struct regpair *)pbl_tbl->va;
+ pbe_cnt = 0;
+ }
+
+ fw_pg_cnt++;
}
}
}
@@ -754,7 +761,7 @@ static inline int qedr_init_user_queue(struct ib_ucontext *ib_ctx,
u64 buf_addr, size_t buf_len,
int access, int dmasync)
{
- int page_cnt;
+ u32 fw_pages;
int rc;
q->buf_addr = buf_addr;
@@ -766,8 +773,10 @@ static inline int qedr_init_user_queue(struct ib_ucontext *ib_ctx,
return PTR_ERR(q->umem);
}
- page_cnt = ib_umem_page_count(q->umem);
- rc = qedr_prepare_pbl_tbl(dev, &q->pbl_info, page_cnt, 0);
+ fw_pages = ib_umem_page_count(q->umem) <<
+ (q->umem->page_shift - FW_PAGE_SHIFT);
+
+ rc = qedr_prepare_pbl_tbl(dev, &q->pbl_info, fw_pages, 0);
if (rc)
goto err0;
@@ -777,7 +786,8 @@ static inline int qedr_init_user_queue(struct ib_ucontext *ib_ctx,
goto err0;
}
- qedr_populate_pbls(dev, q->umem, q->pbl_tbl, &q->pbl_info);
+ qedr_populate_pbls(dev, q->umem, q->pbl_tbl, &q->pbl_info,
+ FW_PAGE_SHIFT);
return 0;
@@ -2226,7 +2236,7 @@ struct ib_mr *qedr_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
goto err1;
qedr_populate_pbls(dev, mr->umem, mr->info.pbl_table,
- &mr->info.pbl_info);
+ &mr->info.pbl_info, mr->umem->page_shift);
rc = dev->ops->rdma_alloc_tid(dev->rdma_ctx, &mr->hw_mr.itid);
if (rc) {
@@ -3209,6 +3219,10 @@ static int process_req(struct qedr_dev *dev, struct qedr_qp *qp,
case IB_WC_REG_MR:
qp->wqe_wr_id[qp->sq.cons].mr->info.completed++;
break;
+ case IB_WC_RDMA_READ:
+ case IB_WC_SEND:
+ wc->byte_len = qp->wqe_wr_id[qp->sq.cons].bytes_len;
+ break;
default:
break;
}
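
Editor's note: qedr_populate_pbls() now emits one PBE per firmware-sized page rather than one per umem page. With pg_shift = FW_PAGE_SHIFT (12, i.e. 4 KiB) and, say, 64 KiB umem pages, each scatterlist page expands into BIT(16 - 12) = 16 consecutive PBEs, and the same shift scales fw_pages in qedr_init_user_queue(). A standalone sketch of that expansion with illustrative sizes:

#include <stdint.h>
#include <stdio.h>

#define UMEM_PAGE_SHIFT  16u    /* e.g. 64 KiB umem pages (illustrative)   */
#define FW_PAGE_SHIFT    12u    /* 4 KiB firmware pages, as in qedr.h      */

int main(void)
{
    uint64_t pg_addr = 0x100000;                   /* one sg entry's DMA addr */
    uint32_t fw_pg_per_umem_pg = 1u << (UMEM_PAGE_SHIFT - FW_PAGE_SHIFT);
    uint32_t fw_pg_cnt;

    printf("%u fw pages per umem page\n", fw_pg_per_umem_pg);
    for (fw_pg_cnt = 0; fw_pg_cnt < fw_pg_per_umem_pg; fw_pg_cnt++) {
        /* lo/hi halves of each PBE, as the driver writes them */
        printf("pbe %2u: lo=0x%08x hi=0x%08x\n", fw_pg_cnt,
               (uint32_t)pg_addr, (uint32_t)(pg_addr >> 32));
        pg_addr += 1u << FW_PAGE_SHIFT;            /* advance one fw page    */
    }
    return 0;
}
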
diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h
index ecdba2fce083..1ac5b8551a4d 100644
--- a/drivers/infiniband/sw/rxe/rxe.h
+++ b/drivers/infiniband/sw/rxe/rxe.h
@@ -68,6 +68,7 @@
static inline u32 rxe_crc32(struct rxe_dev *rxe,
u32 crc, void *next, size_t len)
{
+ u32 retval;
int err;
SHASH_DESC_ON_STACK(shash, rxe->tfm);
@@ -81,7 +82,9 @@ static inline u32 rxe_crc32(struct rxe_dev *rxe,
return crc32_le(crc, next, len);
}
- return *(u32 *)shash_desc_ctx(shash);
+ retval = *(u32 *)shash_desc_ctx(shash);
+ barrier_data(shash_desc_ctx(shash));
+ return retval;
}
int rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu);
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 83d709e74dfb..073e66783f1d 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -740,13 +740,8 @@ static int init_send_wqe(struct rxe_qp *qp, struct ib_send_wr *ibwr,
sge = ibwr->sg_list;
for (i = 0; i < num_sge; i++, sge++) {
- if (qp->is_user && copy_from_user(p, (__user void *)
- (uintptr_t)sge->addr, sge->length))
- return -EFAULT;
-
- else if (!qp->is_user)
- memcpy(p, (void *)(uintptr_t)sge->addr,
- sge->length);
+ memcpy(p, (void *)(uintptr_t)sge->addr,
+ sge->length);
p += sge->length;
}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 0060b2f9f659..efe7402f4885 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -863,7 +863,6 @@ dev_stop:
set_bit(IPOIB_STOP_REAPER, &priv->flags);
cancel_delayed_work(&priv->ah_reap_task);
set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
- napi_enable(&priv->napi);
ipoib_ib_dev_stop(dev);
return -1;
}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index a115c0b7a310..1015a63de6ae 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1596,6 +1596,8 @@ static void ipoib_dev_uninit_default(struct net_device *dev)
ipoib_transport_dev_cleanup(dev);
+ netif_napi_del(&priv->napi);
+
ipoib_cm_dev_cleanup(dev);
kfree(priv->rx_ring);
@@ -1649,6 +1651,7 @@ out_rx_ring_cleanup:
kfree(priv->rx_ring);
out:
+ netif_napi_del(&priv->napi);
return -ENOMEM;
}
@@ -2237,6 +2240,7 @@ event_failed:
device_init_failed:
free_netdev(priv->dev);
+ kfree(priv);
alloc_mem_failed:
return ERR_PTR(result);
@@ -2277,7 +2281,7 @@ static void ipoib_add_one(struct ib_device *device)
static void ipoib_remove_one(struct ib_device *device, void *client_data)
{
- struct ipoib_dev_priv *priv, *tmp;
+ struct ipoib_dev_priv *priv, *tmp, *cpriv, *tcpriv;
struct list_head *dev_list = client_data;
if (!dev_list)
@@ -2300,7 +2304,14 @@ static void ipoib_remove_one(struct ib_device *device, void *client_data)
flush_workqueue(priv->wq);
unregister_netdev(priv->dev);
- free_netdev(priv->dev);
+ if (device->free_rdma_netdev)
+ device->free_rdma_netdev(priv->dev);
+ else
+ free_netdev(priv->dev);
+
+ list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list)
+ kfree(cpriv);
+
kfree(priv);
}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
index 36dc4fcaa3cd..081b33deff1b 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -133,13 +133,13 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
snprintf(intf_name, sizeof intf_name, "%s.%04x",
ppriv->dev->name, pkey);
+ if (!rtnl_trylock())
+ return restart_syscall();
+
priv = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name);
if (!priv)
return -ENOMEM;
- if (!rtnl_trylock())
- return restart_syscall();
-
down_write(&ppriv->vlan_rwsem);
/*
@@ -167,8 +167,10 @@ out:
rtnl_unlock();
- if (result)
+ if (result) {
free_netdev(priv->dev);
+ kfree(priv);
+ }
return result;
}
@@ -209,6 +211,7 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey)
if (dev) {
free_netdev(dev);
+ kfree(priv);
return 0;
}
diff --git a/drivers/input/misc/soc_button_array.c b/drivers/input/misc/soc_button_array.c
index e37d37273182..f600f3a7a3c6 100644
--- a/drivers/input/misc/soc_button_array.c
+++ b/drivers/input/misc/soc_button_array.c
@@ -248,7 +248,8 @@ static struct soc_button_info *soc_button_get_button_info(struct device *dev)
if (!btns_desc) {
dev_err(dev, "ACPI Button Descriptors not found\n");
- return ERR_PTR(-ENODEV);
+ button_info = ERR_PTR(-ENODEV);
+ goto out;
}
/* The first package describes the collection */
@@ -264,24 +265,31 @@ static struct soc_button_info *soc_button_get_button_info(struct device *dev)
}
if (collection_uid == -1) {
dev_err(dev, "Invalid Button Collection Descriptor\n");
- return ERR_PTR(-ENODEV);
+ button_info = ERR_PTR(-ENODEV);
+ goto out;
}
/* There are package.count - 1 buttons + 1 terminating empty entry */
button_info = devm_kcalloc(dev, btns_desc->package.count,
sizeof(*button_info), GFP_KERNEL);
- if (!button_info)
- return ERR_PTR(-ENOMEM);
+ if (!button_info) {
+ button_info = ERR_PTR(-ENOMEM);
+ goto out;
+ }
/* Parse the button descriptors */
for (i = 1, btn = 0; i < btns_desc->package.count; i++, btn++) {
if (soc_button_parse_btn_desc(dev,
&btns_desc->package.elements[i],
collection_uid,
- &button_info[btn]))
- return ERR_PTR(-ENODEV);
+ &button_info[btn])) {
+ button_info = ERR_PTR(-ENODEV);
+ goto out;
+ }
}
+out:
+ kfree(buf.pointer);
return button_info;
}
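
Editor's note: the soc_button change converts the early returns into jumps to a single out: label so that buf.pointer, the ACPI buffer the function filled earlier (its allocation is outside this hunk), is freed on every path instead of only the success path. The pattern, reduced to a standalone sketch with a plain malloc'd buffer standing in for the ACPI object:

#include <stdlib.h>
#include <stdio.h>

/* Every failure path funnels through "out" so the temporary buffer is
 * always released; with early returns, as before the change, it leaked. */
static int parse_descriptors(int have_collection)
{
    char *buf = malloc(64);            /* stand-in for buf.pointer        */
    int ret = 0;

    if (!buf)
        return -1;                     /* nothing allocated to free yet   */

    if (!have_collection) {
        ret = -2;                      /* was "return ERR_PTR(-ENODEV)"   */
        goto out;
    }
    /* ...parse the remaining descriptors into the result here...         */
out:
    free(buf);
    return ret;
}

int main(void)
{
    printf("%d %d\n", parse_descriptors(1), parse_descriptors(0));
    return 0;
}
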
diff --git a/drivers/input/rmi4/rmi_f54.c b/drivers/input/rmi4/rmi_f54.c
index dea63e2db3e6..f5206e2c767e 100644
--- a/drivers/input/rmi4/rmi_f54.c
+++ b/drivers/input/rmi4/rmi_f54.c
@@ -31,9 +31,6 @@
#define F54_GET_REPORT 1
#define F54_FORCE_CAL 2
-/* Fixed sizes of reports */
-#define F54_QUERY_LEN 27
-
/* F54 capabilities */
#define F54_CAP_BASELINE (1 << 2)
#define F54_CAP_IMAGE8 (1 << 3)
@@ -95,7 +92,6 @@ struct rmi_f54_reports {
struct f54_data {
struct rmi_function *fn;
- u8 qry[F54_QUERY_LEN];
u8 num_rx_electrodes;
u8 num_tx_electrodes;
u8 capabilities;
@@ -632,22 +628,23 @@ static int rmi_f54_detect(struct rmi_function *fn)
{
int error;
struct f54_data *f54;
+ u8 buf[6];
f54 = dev_get_drvdata(&fn->dev);
error = rmi_read_block(fn->rmi_dev, fn->fd.query_base_addr,
- &f54->qry, sizeof(f54->qry));
+ buf, sizeof(buf));
if (error) {
dev_err(&fn->dev, "%s: Failed to query F54 properties\n",
__func__);
return error;
}
- f54->num_rx_electrodes = f54->qry[0];
- f54->num_tx_electrodes = f54->qry[1];
- f54->capabilities = f54->qry[2];
- f54->clock_rate = f54->qry[3] | (f54->qry[4] << 8);
- f54->family = f54->qry[5];
+ f54->num_rx_electrodes = buf[0];
+ f54->num_tx_electrodes = buf[1];
+ f54->capabilities = buf[2];
+ f54->clock_rate = buf[3] | (buf[4] << 8);
+ f54->family = buf[5];
rmi_dbg(RMI_DEBUG_FN, &fn->dev, "F54 num_rx_electrodes: %d\n",
f54->num_rx_electrodes);
diff --git a/drivers/input/serio/i8042-x86ia64io.h b/drivers/input/serio/i8042-x86ia64io.h
index 09720d950686..f932a83b4990 100644
--- a/drivers/input/serio/i8042-x86ia64io.h
+++ b/drivers/input/serio/i8042-x86ia64io.h
@@ -723,6 +723,13 @@ static const struct dmi_system_id __initconst i8042_dmi_notimeout_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "LIFEBOOK U574"),
},
},
+ {
+ /* Fujitsu UH554 laptop */
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "LIFEBOOK UH544"),
+ },
+ },
{ }
};
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 63cacf5d6cf2..0f1219fa8561 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -3879,11 +3879,9 @@ static void irte_ga_prepare(void *entry,
u8 vector, u32 dest_apicid, int devid)
{
struct irte_ga *irte = (struct irte_ga *) entry;
- struct iommu_dev_data *dev_data = search_dev_data(devid);
irte->lo.val = 0;
irte->hi.val = 0;
- irte->lo.fields_remap.guest_mode = dev_data ? dev_data->use_vapic : 0;
irte->lo.fields_remap.int_type = delivery_mode;
irte->lo.fields_remap.dm = dest_mode;
irte->hi.fields.vector = vector;
@@ -3939,10 +3937,10 @@ static void irte_ga_set_affinity(void *entry, u16 devid, u16 index,
struct irte_ga *irte = (struct irte_ga *) entry;
struct iommu_dev_data *dev_data = search_dev_data(devid);
- if (!dev_data || !dev_data->use_vapic) {
+ if (!dev_data || !dev_data->use_vapic ||
+ !irte->lo.fields_remap.guest_mode) {
irte->hi.fields.vector = vector;
irte->lo.fields_remap.destination = dest_apicid;
- irte->lo.fields_remap.guest_mode = 0;
modify_irte_ga(devid, index, irte, NULL);
}
}
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index fc2765ccdb57..8500deda9175 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -4315,7 +4315,7 @@ int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
struct acpi_dmar_atsr *atsr;
struct dmar_atsr_unit *atsru;
- if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
+ if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
return 0;
atsr = container_of(hdr, struct acpi_dmar_atsr, header);
@@ -4565,7 +4565,7 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
struct acpi_dmar_atsr *atsr;
struct acpi_dmar_reserved_memory *rmrr;
- if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
+ if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
return 0;
list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index 19779b88a479..8cb60829a7a1 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -103,7 +103,7 @@ static bool of_iommu_driver_present(struct device_node *np)
* it never will be. We don't want to defer indefinitely, nor attempt
* to dereference __iommu_of_table after it's been freed.
*/
- if (system_state > SYSTEM_BOOTING)
+ if (system_state >= SYSTEM_RUNNING)
return false;
return of_match_node(&__iommu_of_table, np);
diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c
index eb7fbe159963..929f8558bf1c 100644
--- a/drivers/irqchip/irq-mips-gic.c
+++ b/drivers/irqchip/irq-mips-gic.c
@@ -140,7 +140,7 @@ static inline void gic_map_to_vpe(unsigned int intr, unsigned int vpe)
}
#ifdef CONFIG_CLKSRC_MIPS_GIC
-u64 gic_read_count(void)
+u64 notrace gic_read_count(void)
{
unsigned int hi, hi2, lo;
@@ -167,7 +167,7 @@ unsigned int gic_get_count_width(void)
return bits;
}
-void gic_write_compare(u64 cnt)
+void notrace gic_write_compare(u64 cnt)
{
if (mips_cm_is64) {
gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE), cnt);
@@ -179,7 +179,7 @@ void gic_write_compare(u64 cnt)
}
}
-void gic_write_cpu_compare(u64 cnt, int cpu)
+void notrace gic_write_cpu_compare(u64 cnt, int cpu)
{
unsigned long flags;
diff --git a/drivers/leds/leds-bcm6328.c b/drivers/leds/leds-bcm6328.c
index 1548259297c1..2cfd9389ee96 100644
--- a/drivers/leds/leds-bcm6328.c
+++ b/drivers/leds/leds-bcm6328.c
@@ -242,7 +242,7 @@ static int bcm6328_hwled(struct device *dev, struct device_node *nc, u32 reg,
spin_lock_irqsave(lock, flags);
val = bcm6328_led_read(addr);
- val |= (BIT(reg) << (((sel % 4) * 4) + 16));
+ val |= (BIT(reg % 4) << (((sel % 4) * 4) + 16));
bcm6328_led_write(addr, val);
spin_unlock_irqrestore(lock, flags);
}
@@ -269,7 +269,7 @@ static int bcm6328_hwled(struct device *dev, struct device_node *nc, u32 reg,
spin_lock_irqsave(lock, flags);
val = bcm6328_led_read(addr);
- val |= (BIT(reg) << ((sel % 4) * 4));
+ val |= (BIT(reg % 4) << ((sel % 4) * 4));
bcm6328_led_write(addr, val);
spin_unlock_irqrestore(lock, flags);
}
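
Editor's note: per my reading of the hunks above, each bcm6328 select register packs four 4-bit fields, so the bit chosen inside a field must itself be reduced modulo 4; for an LED/signal number above 3, BIT(reg) lands in a neighbouring field. A small standalone calculation of the old and new bit positions, with purely illustrative values:

#include <stdio.h>

#define BIT(n) (1u << (n))

int main(void)
{
    unsigned int reg = 5, sel = 5;     /* an LED/signal pair above 3      */
    unsigned int shift = ((sel % 4) * 4) + 16;

    /* old: the bit escapes the intended 4-bit field; new: stays inside it */
    printf("old bit %2u, new bit %2u\n",
           reg + shift,                /* BIT(reg)     << shift -> bit 25  */
           (reg % 4) + shift);         /* BIT(reg % 4) << shift -> bit 21  */
    return 0;
}
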
diff --git a/drivers/leds/trigger/ledtrig-heartbeat.c b/drivers/leds/trigger/ledtrig-heartbeat.c
index afa3b4099214..e95ea65380c8 100644
--- a/drivers/leds/trigger/ledtrig-heartbeat.c
+++ b/drivers/leds/trigger/ledtrig-heartbeat.c
@@ -20,7 +20,6 @@
#include <linux/sched/loadavg.h>
#include <linux/leds.h>
#include <linux/reboot.h>
-#include <linux/suspend.h>
#include "../leds.h"
static int panic_heartbeats;
@@ -163,30 +162,6 @@ static struct led_trigger heartbeat_led_trigger = {
.deactivate = heartbeat_trig_deactivate,
};
-static int heartbeat_pm_notifier(struct notifier_block *nb,
- unsigned long pm_event, void *unused)
-{
- int rc;
-
- switch (pm_event) {
- case PM_SUSPEND_PREPARE:
- case PM_HIBERNATION_PREPARE:
- case PM_RESTORE_PREPARE:
- led_trigger_unregister(&heartbeat_led_trigger);
- break;
- case PM_POST_SUSPEND:
- case PM_POST_HIBERNATION:
- case PM_POST_RESTORE:
- rc = led_trigger_register(&heartbeat_led_trigger);
- if (rc)
- pr_err("could not re-register heartbeat trigger\n");
- break;
- default:
- break;
- }
- return NOTIFY_DONE;
-}
-
static int heartbeat_reboot_notifier(struct notifier_block *nb,
unsigned long code, void *unused)
{
@@ -201,10 +176,6 @@ static int heartbeat_panic_notifier(struct notifier_block *nb,
return NOTIFY_DONE;
}
-static struct notifier_block heartbeat_pm_nb = {
- .notifier_call = heartbeat_pm_notifier,
-};
-
static struct notifier_block heartbeat_reboot_nb = {
.notifier_call = heartbeat_reboot_notifier,
};
@@ -221,14 +192,12 @@ static int __init heartbeat_trig_init(void)
atomic_notifier_chain_register(&panic_notifier_list,
&heartbeat_panic_nb);
register_reboot_notifier(&heartbeat_reboot_nb);
- register_pm_notifier(&heartbeat_pm_nb);
}
return rc;
}
static void __exit heartbeat_trig_exit(void)
{
- unregister_pm_notifier(&heartbeat_pm_nb);
unregister_reboot_notifier(&heartbeat_reboot_nb);
atomic_notifier_chain_unregister(&panic_notifier_list,
&heartbeat_panic_nb);
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 11fe0c5b2a9c..81501644fb15 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -1670,13 +1670,10 @@ void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
queue_work(wq, &line_ws->ws);
}
-void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
- unsigned long *lun_bitmap)
+static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list,
+ int nr_ppas, int pos)
{
- struct nvm_tgt_dev *dev = pblk->dev;
- struct nvm_geo *geo = &dev->geo;
- struct pblk_lun *rlun;
- int pos = pblk_ppa_to_pos(geo, ppa_list[0]);
+ struct pblk_lun *rlun = &pblk->luns[pos];
int ret;
/*
@@ -1690,14 +1687,8 @@ void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
WARN_ON(ppa_list[0].g.lun != ppa_list[i].g.lun ||
ppa_list[0].g.ch != ppa_list[i].g.ch);
#endif
- /* If the LUN has been locked for this same request, do no attempt to
- * lock it again
- */
- if (test_and_set_bit(pos, lun_bitmap))
- return;
- rlun = &pblk->luns[pos];
- ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(5000));
+ ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000));
if (ret) {
switch (ret) {
case -ETIME:
@@ -1710,6 +1701,50 @@ void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
}
}
+void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ int pos = pblk_ppa_to_pos(geo, ppa_list[0]);
+
+ __pblk_down_page(pblk, ppa_list, nr_ppas, pos);
+}
+
+void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+ unsigned long *lun_bitmap)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ int pos = pblk_ppa_to_pos(geo, ppa_list[0]);
+
+ /* If the LUN has been locked for this same request, do not attempt to
+ * lock it again
+ */
+ if (test_and_set_bit(pos, lun_bitmap))
+ return;
+
+ __pblk_down_page(pblk, ppa_list, nr_ppas, pos);
+}
+
+void pblk_up_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_lun *rlun;
+ int pos = pblk_ppa_to_pos(geo, ppa_list[0]);
+
+#ifdef CONFIG_NVM_DEBUG
+ int i;
+
+ for (i = 1; i < nr_ppas; i++)
+ WARN_ON(ppa_list[0].g.lun != ppa_list[i].g.lun ||
+ ppa_list[0].g.ch != ppa_list[i].g.ch);
+#endif
+
+ rlun = &pblk->luns[pos];
+ up(&rlun->wr_sem);
+}
+
void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
unsigned long *lun_bitmap)
{
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index 0e48d3e4e143..cb556e06673e 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -340,9 +340,14 @@ static void pblk_end_io_recov(struct nvm_rq *rqd)
struct pblk *pblk = pad_rq->pblk;
struct nvm_tgt_dev *dev = pblk->dev;
- kref_put(&pad_rq->ref, pblk_recov_complete);
+ pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);
+
+ bio_put(rqd->bio);
nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
pblk_free_rqd(pblk, rqd, WRITE);
+
+ atomic_dec(&pblk->inflight_io);
+ kref_put(&pad_rq->ref, pblk_recov_complete);
}
static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line,
@@ -385,7 +390,7 @@ next_pad_rq:
rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
if (rq_ppas < pblk->min_write_pgs) {
pr_err("pblk: corrupted pad line %d\n", line->id);
- goto free_rq;
+ goto fail_free_pad;
}
rq_len = rq_ppas * geo->sec_size;
@@ -393,7 +398,7 @@ next_pad_rq:
meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list);
if (!meta_list) {
ret = -ENOMEM;
- goto free_data;
+ goto fail_free_pad;
}
ppa_list = (void *)(meta_list) + pblk_dma_meta_size;
@@ -404,9 +409,9 @@ next_pad_rq:
ret = PTR_ERR(rqd);
goto fail_free_meta;
}
- memset(rqd, 0, pblk_w_rq_size);
- bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
+ bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
+ PBLK_VMALLOC_META, GFP_KERNEL);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
goto fail_free_rqd;
@@ -453,15 +458,15 @@ next_pad_rq:
}
kref_get(&pad_rq->ref);
+ pblk_down_page(pblk, rqd->ppa_list, rqd->nr_ppas);
ret = pblk_submit_io(pblk, rqd);
if (ret) {
pr_err("pblk: I/O submission failed: %d\n", ret);
- goto free_data;
+ pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);
+ goto fail_free_bio;
}
- atomic_dec(&pblk->inflight_io);
-
left_line_ppas -= rq_ppas;
left_ppas -= rq_ppas;
if (left_ppas && left_line_ppas)
@@ -475,17 +480,23 @@ next_pad_rq:
ret = -ETIME;
}
+ if (!pblk_line_is_full(line))
+ pr_err("pblk: corrupted padded line: %d\n", line->id);
+
+ vfree(data);
free_rq:
kfree(pad_rq);
-free_data:
- vfree(data);
return ret;
+fail_free_bio:
+ bio_put(bio);
fail_free_rqd:
pblk_free_rqd(pblk, rqd, WRITE);
fail_free_meta:
nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list);
+fail_free_pad:
kfree(pad_rq);
+ vfree(data);
return ret;
}
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index d62a8f4faaf4..3ad9e56d2473 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -39,9 +39,7 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid);
- if (rqd->meta_list)
- nvm_dev_dma_free(dev->parent, rqd->meta_list,
- rqd->dma_meta_list);
+ nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
bio_put(rqd->bio);
pblk_free_rqd(pblk, rqd, WRITE);
@@ -178,15 +176,12 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd)
{
struct pblk *pblk = rqd->private;
struct nvm_tgt_dev *dev = pblk->dev;
- struct nvm_geo *geo = &dev->geo;
struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd);
struct pblk_line *line = m_ctx->private;
struct pblk_emeta *emeta = line->emeta;
- int pos = pblk_ppa_to_pos(geo, rqd->ppa_list[0]);
- struct pblk_lun *rlun = &pblk->luns[pos];
int sync;
- up(&rlun->wr_sem);
+ pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);
if (rqd->error) {
pblk_log_write_err(pblk, rqd);
@@ -203,6 +198,7 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd)
pblk->close_wq);
bio_put(rqd->bio);
+ nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
pblk_free_rqd(pblk, rqd, READ);
atomic_dec(&pblk->inflight_io);
@@ -226,9 +222,6 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
if (!rqd->meta_list)
return -ENOMEM;
- if (unlikely(nr_secs == 1))
- return 0;
-
rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
@@ -367,7 +360,6 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
struct pblk_line_meta *lm = &pblk->lm;
struct pblk_emeta *emeta = meta_line->emeta;
struct pblk_g_ctx *m_ctx;
- struct pblk_lun *rlun;
struct bio *bio;
struct nvm_rq *rqd;
void *data;
@@ -411,13 +403,6 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
rqd->ppa_list[i] = addr_to_gen_ppa(pblk, paddr, id);
}
- rlun = &pblk->luns[pblk_ppa_to_pos(geo, rqd->ppa_list[0])];
- ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(5000));
- if (ret) {
- pr_err("pblk: lun semaphore timed out (%d)\n", ret);
- goto fail_free_bio;
- }
-
emeta->mem += rq_len;
if (emeta->mem >= lm->emeta_len[0]) {
spin_lock(&l_mg->close_lock);
@@ -427,6 +412,8 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
spin_unlock(&l_mg->close_lock);
}
+ pblk_down_page(pblk, rqd->ppa_list, rqd->nr_ppas);
+
ret = pblk_submit_io(pblk, rqd);
if (ret) {
pr_err("pblk: emeta I/O submission failed: %d\n", ret);
@@ -436,10 +423,13 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
return NVM_IO_OK;
fail_rollback:
+ pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);
spin_lock(&l_mg->close_lock);
pblk_dealloc_page(pblk, meta_line, rq_ppas);
list_add(&meta_line->list, &meta_line->list);
spin_unlock(&l_mg->close_lock);
+
+ nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
fail_free_bio:
if (likely(l_mg->emeta_alloc_type == PBLK_VMALLOC_META))
bio_put(bio);
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 15931381348c..0c5692cc2f60 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -739,8 +739,10 @@ u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
unsigned long secs_to_flush);
+void pblk_up_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas);
void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
unsigned long *lun_bitmap);
+void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas);
void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
unsigned long *lun_bitmap);
void pblk_end_bio_sync(struct bio *bio);
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 9b80417cd547..73da1f5626cb 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -207,7 +207,7 @@ void bkey_put(struct cache_set *c, struct bkey *k);
struct btree_op {
/* for waiting on btree reserve in btree_split() */
- wait_queue_t wait;
+ wait_queue_entry_t wait;
/* Btree level at which we start taking write locks */
short lock;
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 339af38459fc..1b224aa9cf15 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -1105,10 +1105,13 @@ static void schedule_autocommit(struct dm_integrity_c *ic)
static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
{
struct bio *bio;
- spin_lock_irq(&ic->endio_wait.lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&ic->endio_wait.lock, flags);
bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
bio_list_add(&ic->flush_bio_list, bio);
- spin_unlock_irq(&ic->endio_wait.lock);
+ spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
+
queue_work(ic->commit_wq, &ic->commit_work);
}
@@ -3040,6 +3043,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->error = "The device is too small";
goto bad;
}
+ if (ti->len > ic->provided_data_sectors) {
+ r = -EINVAL;
+ ti->error = "Not enough provided sectors for requested mapping size";
+ goto bad;
+ }
if (!buffer_sectors)
buffer_sectors = 1;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 81248a8a8b57..25039607f3cb 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -318,8 +318,8 @@ static void do_region(int op, int op_flags, unsigned region,
else if (op == REQ_OP_WRITE_SAME)
special_cmd_max_sectors = q->limits.max_write_same_sectors;
if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES ||
- op == REQ_OP_WRITE_SAME) &&
- special_cmd_max_sectors == 0) {
+ op == REQ_OP_WRITE_SAME) && special_cmd_max_sectors == 0) {
+ atomic_inc(&io->count);
dec_count(io, region, BLK_STS_NOTSUPP);
return;
}
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 7d893228c40f..b4b75dad816a 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1927,7 +1927,7 @@ struct dm_raid_superblock {
/********************************************************************
* BELOW FOLLOW V1.9.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!!
*
- * FEATURE_FLAG_SUPPORTS_V190 in the features member indicates that those exist
+ * FEATURE_FLAG_SUPPORTS_V190 in the compat_features member indicates that those exist
*/
__le32 flags; /* Flags defining array states for reshaping */
@@ -2092,6 +2092,11 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->layout = cpu_to_le32(mddev->layout);
sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
+ /********************************************************************
+ * BELOW FOLLOW V1.9.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!!
+ *
+ * FEATURE_FLAG_SUPPORTS_V190 in the compat_features member indicates that those exist
+ */
sb->new_level = cpu_to_le32(mddev->new_level);
sb->new_layout = cpu_to_le32(mddev->new_layout);
sb->new_stripe_sectors = cpu_to_le32(mddev->new_chunk_sectors);
@@ -2438,8 +2443,14 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
if (!test_and_clear_bit(FirstUse, &rdev->flags)) {
- /* Retrieve device size stored in superblock to be prepared for shrink */
- rdev->sectors = le64_to_cpu(sb->sectors);
+ /*
+ * Retrieve rdev size stored in superblock to be prepared for shrink.
+ * Check that extended superblock members are present, otherwise the size
+ * will not be set!
+ */
+ if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190)
+ rdev->sectors = le64_to_cpu(sb->sectors);
+
rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
if (rdev->recovery_offset == MaxSector)
set_bit(In_sync, &rdev->flags);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 3ab584b686e0..a4fbd911d566 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -145,6 +145,7 @@ static void dispatch_bios(void *context, struct bio_list *bio_list)
struct dm_raid1_bio_record {
struct mirror *m;
+ /* if details->bi_bdev == NULL, details were not saved */
struct dm_bio_details details;
region_t write_region;
};
@@ -1198,6 +1199,8 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
struct dm_raid1_bio_record *bio_record =
dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record));
+ bio_record->details.bi_bdev = NULL;
+
if (rw == WRITE) {
/* Save region for mirror_end_io() handler */
bio_record->write_region = dm_rh_bio_to_region(ms->rh, bio);
@@ -1257,12 +1260,22 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
}
if (*error == BLK_STS_NOTSUPP)
- return DM_ENDIO_DONE;
+ goto out;
if (bio->bi_opf & REQ_RAHEAD)
- return DM_ENDIO_DONE;
+ goto out;
if (unlikely(*error)) {
+ if (!bio_record->details.bi_bdev) {
+ /*
+ * There wasn't enough memory to record necessary
+ * information for a retry or there was no other
+ * mirror in-sync.
+ */
+ DMERR_LIMIT("Mirror read failed.");
+ return DM_ENDIO_DONE;
+ }
+
m = bio_record->m;
DMERR("Mirror read failed from %s. Trying alternative device.",
@@ -1278,6 +1291,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
bd = &bio_record->details;
dm_bio_restore(bd, bio);
+ bio_record->details.bi_bdev = NULL;
bio->bi_status = 0;
queue_bio(ms, bio, rw);
@@ -1286,6 +1300,9 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
DMERR("All replicated volumes dead, failing I/O");
}
+out:
+ bio_record->details.bi_bdev = NULL;
+
return DM_ENDIO_DONE;
}
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 3490b300cbff..9dec2f8cc739 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1091,6 +1091,19 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
return;
}
+ /*
+ * Increment the unmapped blocks. This prevents a race between the
+ * passdown io and reallocation of freed blocks.
+ */
+ r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
+ if (r) {
+ metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
+ bio_io_error(m->bio);
+ cell_defer_no_holder(tc, m->cell);
+ mempool_free(m, pool->mapping_pool);
+ return;
+ }
+
discard_parent = bio_alloc(GFP_NOIO, 1);
if (!discard_parent) {
DMWARN("%s: unable to allocate top level discard bio for passdown. Skipping passdown.",
@@ -1111,19 +1124,6 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
end_discard(&op, r);
}
}
-
- /*
- * Increment the unmapped blocks. This prevents a race between the
- * passdown io and reallocation of freed blocks.
- */
- r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
- if (r) {
- metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
- bio_io_error(m->bio);
- cell_defer_no_holder(tc, m->cell);
- mempool_free(m, pool->mapping_pool);
- return;
- }
}
static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 402946035308..13e714ea7a42 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1153,7 +1153,7 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio,
clone->bi_iter.bi_size = to_bytes(len);
if (unlikely(bio_integrity(bio) != NULL))
- bio_integrity_trim(clone, 0, len);
+ bio_integrity_trim(clone);
return 0;
}
diff --git a/drivers/media/cec/Kconfig b/drivers/media/cec/Kconfig
index 4e25a950ae6f..43428cec3a01 100644
--- a/drivers/media/cec/Kconfig
+++ b/drivers/media/cec/Kconfig
@@ -1,5 +1,6 @@
config MEDIA_CEC_RC
bool "HDMI CEC RC integration"
depends on CEC_CORE && RC_CORE
+ depends on CEC_CORE=m || RC_CORE=y
---help---
Pass on CEC remote control messages to the RC framework.
diff --git a/drivers/media/cec/cec-api.c b/drivers/media/cec/cec-api.c
index 0860fb458757..999926f731c8 100644
--- a/drivers/media/cec/cec-api.c
+++ b/drivers/media/cec/cec-api.c
@@ -271,16 +271,10 @@ static long cec_receive(struct cec_adapter *adap, struct cec_fh *fh,
bool block, struct cec_msg __user *parg)
{
struct cec_msg msg = {};
- long err = 0;
+ long err;
if (copy_from_user(&msg, parg, sizeof(msg)))
return -EFAULT;
- mutex_lock(&adap->lock);
- if (!adap->is_configured && fh->mode_follower < CEC_MODE_MONITOR)
- err = -ENONET;
- mutex_unlock(&adap->lock);
- if (err)
- return err;
err = cec_receive_msg(fh, &msg, block);
if (err)
diff --git a/drivers/media/i2c/tc358743.c b/drivers/media/i2c/tc358743.c
index acef4eca269f..3251cba89e8f 100644
--- a/drivers/media/i2c/tc358743.c
+++ b/drivers/media/i2c/tc358743.c
@@ -223,7 +223,7 @@ static void i2c_wr8(struct v4l2_subdev *sd, u16 reg, u8 val)
static void i2c_wr8_and_or(struct v4l2_subdev *sd, u16 reg,
u8 mask, u8 val)
{
- i2c_wrreg(sd, reg, (i2c_rdreg(sd, reg, 2) & mask) | val, 2);
+ i2c_wrreg(sd, reg, (i2c_rdreg(sd, reg, 1) & mask) | val, 1);
}
static u16 i2c_rd16(struct v4l2_subdev *sd, u16 reg)
diff --git a/drivers/media/rc/sir_ir.c b/drivers/media/rc/sir_ir.c
index e12ec50bf0bf..90a5f8fd5eea 100644
--- a/drivers/media/rc/sir_ir.c
+++ b/drivers/media/rc/sir_ir.c
@@ -183,9 +183,15 @@ static irqreturn_t sir_interrupt(int irq, void *dev_id)
static unsigned long delt;
unsigned long deltintr;
unsigned long flags;
+ int counter = 0;
int iir, lsr;
while ((iir = inb(io + UART_IIR) & UART_IIR_ID)) {
+ if (++counter > 256) {
+ dev_err(&sir_ir_dev->dev, "Trapped in interrupt");
+ break;
+ }
+
switch (iir & UART_IIR_ID) { /* FIXME: this needs to be thinned out */
case UART_IIR_MSI:
(void)inb(io + UART_MSR);
diff --git a/drivers/media/usb/rainshadow-cec/rainshadow-cec.c b/drivers/media/usb/rainshadow-cec/rainshadow-cec.c
index 71bd68548c9c..4126552c9055 100644
--- a/drivers/media/usb/rainshadow-cec/rainshadow-cec.c
+++ b/drivers/media/usb/rainshadow-cec/rainshadow-cec.c
@@ -336,6 +336,7 @@ static int rain_connect(struct serio *serio, struct serio_driver *drv)
serio_set_drvdata(serio, rain);
INIT_WORK(&rain->work, rain_irq_work_handler);
mutex_init(&rain->write_lock);
+ spin_lock_init(&rain->buf_lock);
err = serio_open(serio, drv);
if (err)
diff --git a/drivers/media/v4l2-core/videobuf2-core.c b/drivers/media/v4l2-core/videobuf2-core.c
index 94afbbf92807..c0175ea7e7ad 100644
--- a/drivers/media/v4l2-core/videobuf2-core.c
+++ b/drivers/media/v4l2-core/videobuf2-core.c
@@ -868,7 +868,7 @@ EXPORT_SYMBOL_GPL(vb2_core_create_bufs);
void *vb2_plane_vaddr(struct vb2_buffer *vb, unsigned int plane_no)
{
- if (plane_no > vb->num_planes || !vb->planes[plane_no].mem_priv)
+ if (plane_no >= vb->num_planes || !vb->planes[plane_no].mem_priv)
return NULL;
return call_ptr_memop(vb, vaddr, vb->planes[plane_no].mem_priv);
diff --git a/drivers/mfd/arizona-core.c b/drivers/mfd/arizona-core.c
index 75488e65cd96..8d46e3ad9529 100644
--- a/drivers/mfd/arizona-core.c
+++ b/drivers/mfd/arizona-core.c
@@ -245,8 +245,7 @@ static int arizona_poll_reg(struct arizona *arizona,
int ret;
ret = regmap_read_poll_timeout(arizona->regmap,
- ARIZONA_INTERRUPT_RAW_STATUS_5, val,
- ((val & mask) == target),
+ reg, val, ((val & mask) == target),
ARIZONA_REG_POLL_DELAY_US,
timeout_ms * 1000);
if (ret)
diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 4472ce11f98d..8c32040b9c09 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -45,7 +45,7 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master)
mutex_init(&ctx->mapping_lock);
ctx->mapping = NULL;
- if (cxl_is_psl8(afu)) {
+ if (cxl_is_power8()) {
spin_lock_init(&ctx->sste_lock);
/*
@@ -189,7 +189,7 @@ int cxl_context_iomap(struct cxl_context *ctx, struct vm_area_struct *vma)
if (start + len > ctx->afu->adapter->ps_size)
return -EINVAL;
- if (cxl_is_psl9(ctx->afu)) {
+ if (cxl_is_power9()) {
/*
* Make sure there is a valid problem state
* area space for this AFU.
@@ -324,7 +324,7 @@ static void reclaim_ctx(struct rcu_head *rcu)
{
struct cxl_context *ctx = container_of(rcu, struct cxl_context, rcu);
- if (cxl_is_psl8(ctx->afu))
+ if (cxl_is_power8())
free_page((u64)ctx->sstp);
if (ctx->ff_page)
__free_page(ctx->ff_page);
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index c8568ea7c518..a03f8e7535e5 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -357,6 +357,7 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0};
#define CXL_PSL9_DSISR_An_PF_RGP 0x0000000000000090ULL /* PTE not found (Radix Guest (parent)) 0b10010000 */
#define CXL_PSL9_DSISR_An_PF_HRH 0x0000000000000094ULL /* PTE not found (HPT/Radix Host) 0b10010100 */
#define CXL_PSL9_DSISR_An_PF_STEG 0x000000000000009CULL /* PTE not found (STEG VA) 0b10011100 */
+#define CXL_PSL9_DSISR_An_URTCH 0x00000000000000B4ULL /* Unsupported Radix Tree Configuration 0b10110100 */
/****** CXL_PSL_TFC_An ******************************************************/
#define CXL_PSL_TFC_An_A (1ull << (63-28)) /* Acknowledge non-translation fault */
@@ -844,24 +845,15 @@ static inline bool cxl_is_power8(void)
static inline bool cxl_is_power9(void)
{
- /* intermediate solution */
- if (!cxl_is_power8() &&
- (cpu_has_feature(CPU_FTRS_POWER9) ||
- cpu_has_feature(CPU_FTR_POWER9_DD1)))
+ if (pvr_version_is(PVR_POWER9))
return true;
return false;
}
-static inline bool cxl_is_psl8(struct cxl_afu *afu)
+static inline bool cxl_is_power9_dd1(void)
{
- if (afu->adapter->caia_major == 1)
- return true;
- return false;
-}
-
-static inline bool cxl_is_psl9(struct cxl_afu *afu)
-{
- if (afu->adapter->caia_major == 2)
+ if ((pvr_version_is(PVR_POWER9)) &&
+ cpu_has_feature(CPU_FTR_POWER9_DD1))
return true;
return false;
}
diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c
index 5344448f514e..c79e39bad7a4 100644
--- a/drivers/misc/cxl/fault.c
+++ b/drivers/misc/cxl/fault.c
@@ -187,7 +187,7 @@ static struct mm_struct *get_mem_context(struct cxl_context *ctx)
static bool cxl_is_segment_miss(struct cxl_context *ctx, u64 dsisr)
{
- if ((cxl_is_psl8(ctx->afu)) && (dsisr & CXL_PSL_DSISR_An_DS))
+ if ((cxl_is_power8() && (dsisr & CXL_PSL_DSISR_An_DS)))
return true;
return false;
@@ -195,16 +195,23 @@ static bool cxl_is_segment_miss(struct cxl_context *ctx, u64 dsisr)
static bool cxl_is_page_fault(struct cxl_context *ctx, u64 dsisr)
{
- if ((cxl_is_psl8(ctx->afu)) && (dsisr & CXL_PSL_DSISR_An_DM))
- return true;
+ u64 crs; /* Translation Checkout Response Status */
- if ((cxl_is_psl9(ctx->afu)) &&
- ((dsisr & CXL_PSL9_DSISR_An_CO_MASK) &
- (CXL_PSL9_DSISR_An_PF_SLR | CXL_PSL9_DSISR_An_PF_RGC |
- CXL_PSL9_DSISR_An_PF_RGP | CXL_PSL9_DSISR_An_PF_HRH |
- CXL_PSL9_DSISR_An_PF_STEG)))
+ if ((cxl_is_power8()) && (dsisr & CXL_PSL_DSISR_An_DM))
return true;
+ if (cxl_is_power9()) {
+ crs = (dsisr & CXL_PSL9_DSISR_An_CO_MASK);
+ if ((crs == CXL_PSL9_DSISR_An_PF_SLR) ||
+ (crs == CXL_PSL9_DSISR_An_PF_RGC) ||
+ (crs == CXL_PSL9_DSISR_An_PF_RGP) ||
+ (crs == CXL_PSL9_DSISR_An_PF_HRH) ||
+ (crs == CXL_PSL9_DSISR_An_PF_STEG) ||
+ (crs == CXL_PSL9_DSISR_An_URTCH)) {
+ return true;
+ }
+ }
+
return false;
}
diff --git a/drivers/misc/cxl/main.c b/drivers/misc/cxl/main.c
index 1703655072b1..c1ba0d42cbc8 100644
--- a/drivers/misc/cxl/main.c
+++ b/drivers/misc/cxl/main.c
@@ -329,8 +329,15 @@ static int __init init_cxl(void)
cxl_debugfs_init();
- if ((rc = register_cxl_calls(&cxl_calls)))
- goto err;
+ /*
+ * we don't register the callback on P9. slb callback is only
+ * used for the PSL8 MMU and CX4.
+ */
+ if (cxl_is_power8()) {
+ rc = register_cxl_calls(&cxl_calls);
+ if (rc)
+ goto err;
+ }
if (cpu_has_feature(CPU_FTR_HVMODE)) {
cxl_ops = &cxl_native_ops;
@@ -347,7 +354,8 @@ static int __init init_cxl(void)
return 0;
err1:
- unregister_cxl_calls(&cxl_calls);
+ if (cxl_is_power8())
+ unregister_cxl_calls(&cxl_calls);
err:
cxl_debugfs_exit();
cxl_file_exit();
@@ -366,7 +374,8 @@ static void exit_cxl(void)
cxl_debugfs_exit();
cxl_file_exit();
- unregister_cxl_calls(&cxl_calls);
+ if (cxl_is_power8())
+ unregister_cxl_calls(&cxl_calls);
idr_destroy(&cxl_adapter_idr);
}
diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 8d6ea9712dbd..2b2f8894149d 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -105,11 +105,16 @@ static int native_afu_reset(struct cxl_afu *afu)
CXL_AFU_Cntl_An_RS_MASK | CXL_AFU_Cntl_An_ES_MASK,
false);
- /* Re-enable any masked interrupts */
- serr = cxl_p1n_read(afu, CXL_PSL_SERR_An);
- serr &= ~CXL_PSL_SERR_An_IRQ_MASKS;
- cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
-
+ /*
+ * Re-enable any masked interrupts when the AFU is not
+ * activated to avoid side effects after attaching a process
+ * in dedicated mode.
+ */
+ if (afu->current_mode == 0) {
+ serr = cxl_p1n_read(afu, CXL_PSL_SERR_An);
+ serr &= ~CXL_PSL_SERR_An_IRQ_MASKS;
+ cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
+ }
return rc;
}
@@ -139,9 +144,9 @@ int cxl_psl_purge(struct cxl_afu *afu)
pr_devel("PSL purge request\n");
- if (cxl_is_psl8(afu))
+ if (cxl_is_power8())
trans_fault = CXL_PSL_DSISR_TRANS;
- if (cxl_is_psl9(afu))
+ if (cxl_is_power9())
trans_fault = CXL_PSL9_DSISR_An_TF;
if (!cxl_ops->link_ok(afu->adapter, afu)) {
@@ -603,7 +608,7 @@ static u64 calculate_sr(struct cxl_context *ctx)
if (!test_tsk_thread_flag(current, TIF_32BIT))
sr |= CXL_PSL_SR_An_SF;
}
- if (cxl_is_psl9(ctx->afu)) {
+ if (cxl_is_power9()) {
if (radix_enabled())
sr |= CXL_PSL_SR_An_XLAT_ror;
else
@@ -1117,10 +1122,10 @@ static irqreturn_t native_handle_psl_slice_error(struct cxl_context *ctx,
static bool cxl_is_translation_fault(struct cxl_afu *afu, u64 dsisr)
{
- if ((cxl_is_psl8(afu)) && (dsisr & CXL_PSL_DSISR_TRANS))
+ if ((cxl_is_power8()) && (dsisr & CXL_PSL_DSISR_TRANS))
return true;
- if ((cxl_is_psl9(afu)) && (dsisr & CXL_PSL9_DSISR_An_TF))
+ if ((cxl_is_power9()) && (dsisr & CXL_PSL9_DSISR_An_TF))
return true;
return false;
@@ -1194,10 +1199,10 @@ static void native_irq_wait(struct cxl_context *ctx)
if (ph != ctx->pe)
return;
dsisr = cxl_p2n_read(ctx->afu, CXL_PSL_DSISR_An);
- if (cxl_is_psl8(ctx->afu) &&
+ if (cxl_is_power8() &&
((dsisr & CXL_PSL_DSISR_PENDING) == 0))
return;
- if (cxl_is_psl9(ctx->afu) &&
+ if (cxl_is_power9() &&
((dsisr & CXL_PSL9_DSISR_PENDING) == 0))
return;
/*
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 6dc1ee5b92c9..1eb9859809bf 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -436,7 +436,7 @@ static int init_implementation_adapter_regs_psl9(struct cxl *adapter, struct pci
/* nMMU_ID Defaults to: b’000001001’*/
xsl_dsnctl |= ((u64)0x09 << (63-28));
- if (cxl_is_power9() && !cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+ if (!(cxl_is_power9_dd1())) {
/*
* Used to identify CAPI packets which should be sorted into
* the Non-Blocking queues by the PHB. This field should match
@@ -491,7 +491,7 @@ static int init_implementation_adapter_regs_psl9(struct cxl *adapter, struct pci
cxl_p1_write(adapter, CXL_PSL9_APCDEDTYPE, 0x40000003FFFF0000ULL);
/* Disable vc dd1 fix */
- if ((cxl_is_power9() && cpu_has_feature(CPU_FTR_POWER9_DD1)))
+ if (cxl_is_power9_dd1())
cxl_p1_write(adapter, CXL_PSL9_GP_CT, 0x0400000000000001ULL);
return 0;
@@ -1439,8 +1439,7 @@ int cxl_pci_reset(struct cxl *adapter)
* The adapter is about to be reset, so ignore errors.
* Not supported on P9 DD1
*/
- if ((cxl_is_power8()) ||
- ((cxl_is_power9() && !cpu_has_feature(CPU_FTR_POWER9_DD1))))
+ if ((cxl_is_power8()) || (!(cxl_is_power9_dd1())))
cxl_data_cache_flush(adapter);
/* pcie_warm_reset requests a fundamental pci reset which includes a
@@ -1750,7 +1749,6 @@ static const struct cxl_service_layer_ops psl9_ops = {
.debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_psl9,
.debugfs_add_afu_regs = cxl_debugfs_add_afu_regs_psl9,
.psl_irq_dump_registers = cxl_native_irq_dump_regs_psl9,
- .err_irq_dump_registers = cxl_native_err_irq_dump_regs,
.debugfs_stop_trace = cxl_stop_trace_psl9,
.write_timebase_ctrl = write_timebase_ctrl_psl9,
.timebase_read = timebase_read_psl9,
@@ -1889,8 +1887,7 @@ static void cxl_pci_remove_adapter(struct cxl *adapter)
* Flush adapter datacache as its about to be removed.
* Not supported on P9 DD1.
*/
- if ((cxl_is_power8()) ||
- ((cxl_is_power9() && !cpu_has_feature(CPU_FTR_POWER9_DD1))))
+ if ((cxl_is_power8()) || (!(cxl_is_power9_dd1())))
cxl_data_cache_flush(adapter);
cxl_deconfigure_adapter(adapter);
diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c
index 1842ed341af1..de962c2d5e00 100644
--- a/drivers/mmc/host/meson-gx-mmc.c
+++ b/drivers/mmc/host/meson-gx-mmc.c
@@ -210,6 +210,15 @@ static void meson_mmc_get_transfer_mode(struct mmc_host *mmc,
int i;
bool use_desc_chain_mode = true;
+ /*
+ * Broken SDIO with AP6255-based WiFi on Khadas VIM Pro has been
+ * reported. For some strange reason this occurs in descriptor
+ * chain mode only. So let's fall back to bounce buffer mode
+ * for command SD_IO_RW_EXTENDED.
+ */
+ if (mrq->cmd->opcode == SD_IO_RW_EXTENDED)
+ return;
+
for_each_sg(data->sg, sg, data->sg_len, i)
/* check for 8 byte alignment */
if (sg->offset & 7) {
diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c
index 9577beb278e7..18957fea82ff 100644
--- a/drivers/mmc/host/sdhci-pci-core.c
+++ b/drivers/mmc/host/sdhci-pci-core.c
@@ -404,7 +404,7 @@ struct intel_host {
bool d3_retune;
};
-const guid_t intel_dsm_guid =
+static const guid_t intel_dsm_guid =
GUID_INIT(0xF6C13EA5, 0x65CD, 0x461F,
0xAB, 0x7A, 0x29, 0xF7, 0xE8, 0xD5, 0xBD, 0x61);
diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c
index 62ee439d5882..53a1cb551def 100644
--- a/drivers/net/arcnet/arcnet.c
+++ b/drivers/net/arcnet/arcnet.c
@@ -756,6 +756,7 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id)
struct net_device *dev = dev_id;
struct arcnet_local *lp;
int recbuf, status, diagstatus, didsomething, boguscount;
+ unsigned long flags;
int retval = IRQ_NONE;
arc_printk(D_DURING, dev, "\n");
@@ -765,7 +766,7 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id)
lp = netdev_priv(dev);
BUG_ON(!lp);
- spin_lock(&lp->lock);
+ spin_lock_irqsave(&lp->lock, flags);
/* RESET flag was enabled - if device is not running, we must
* clear it right away (but nothing else).
@@ -774,7 +775,7 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id)
if (lp->hw.status(dev) & RESETflag)
lp->hw.command(dev, CFLAGScmd | RESETclear);
lp->hw.intmask(dev, 0);
- spin_unlock(&lp->lock);
+ spin_unlock_irqrestore(&lp->lock, flags);
return retval;
}
@@ -998,7 +999,7 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id)
udelay(1);
lp->hw.intmask(dev, lp->intmask);
- spin_unlock(&lp->lock);
+ spin_unlock_irqrestore(&lp->lock, flags);
return retval;
}
EXPORT_SYMBOL(arcnet_interrupt);
diff --git a/drivers/net/arcnet/capmode.c b/drivers/net/arcnet/capmode.c
index 2056878fb087..4fa2e46b48d3 100644
--- a/drivers/net/arcnet/capmode.c
+++ b/drivers/net/arcnet/capmode.c
@@ -212,7 +212,7 @@ static int ack_tx(struct net_device *dev, int acked)
ackpkt->soft.cap.proto = 0; /* using protocol 0 for acknowledge */
ackpkt->soft.cap.mes.ack = acked;
- arc_printk(D_PROTO, dev, "Ackknowledge for cap packet %x.\n",
+ arc_printk(D_PROTO, dev, "Acknowledge for cap packet %x.\n",
*((int *)&ackpkt->soft.cap.cookie[0]));
ackskb->protocol = cpu_to_be16(ETH_P_ARCNET);
diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c
index 239de38fbd6a..47f80b83dcf4 100644
--- a/drivers/net/arcnet/com20020-pci.c
+++ b/drivers/net/arcnet/com20020-pci.c
@@ -135,6 +135,7 @@ static int com20020pci_probe(struct pci_dev *pdev,
for (i = 0; i < ci->devcount; i++) {
struct com20020_pci_channel_map *cm = &ci->chan_map_tbl[i];
struct com20020_dev *card;
+ int dev_id_mask = 0xf;
dev = alloc_arcdev(device);
if (!dev) {
@@ -166,6 +167,7 @@ static int com20020pci_probe(struct pci_dev *pdev,
arcnet_outb(0x00, ioaddr, COM20020_REG_W_COMMAND);
arcnet_inb(ioaddr, COM20020_REG_R_DIAGSTAT);
+ SET_NETDEV_DEV(dev, &pdev->dev);
dev->base_addr = ioaddr;
dev->dev_addr[0] = node;
dev->irq = pdev->irq;
@@ -179,8 +181,8 @@ static int com20020pci_probe(struct pci_dev *pdev,
/* Get the dev_id from the PLX rotary coder */
if (!strncmp(ci->name, "EAE PLX-PCI MA1", 15))
- dev->dev_id = 0xc;
- dev->dev_id ^= inb(priv->misc + ci->rotary) >> 4;
+ dev_id_mask = 0x3;
+ dev->dev_id = (inb(priv->misc + ci->rotary) >> 4) & dev_id_mask;
snprintf(dev->name, sizeof(dev->name), "arc%d-%d", dev->dev_id, i);
diff --git a/drivers/net/arcnet/com20020.c b/drivers/net/arcnet/com20020.c
index 13d9ad4b3f5c..78043a9c5981 100644
--- a/drivers/net/arcnet/com20020.c
+++ b/drivers/net/arcnet/com20020.c
@@ -246,8 +246,6 @@ int com20020_found(struct net_device *dev, int shared)
return -ENODEV;
}
- dev->base_addr = ioaddr;
-
arc_printk(D_NORMAL, dev, "%s: station %02Xh found at %03lXh, IRQ %d.\n",
lp->card_name, dev->dev_addr[0], dev->base_addr, dev->irq);
diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index b44a6aeb346d..e5386ab706ec 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -90,10 +90,13 @@ enum ad_link_speed_type {
AD_LINK_SPEED_100MBPS,
AD_LINK_SPEED_1000MBPS,
AD_LINK_SPEED_2500MBPS,
+ AD_LINK_SPEED_5000MBPS,
AD_LINK_SPEED_10000MBPS,
+ AD_LINK_SPEED_14000MBPS,
AD_LINK_SPEED_20000MBPS,
AD_LINK_SPEED_25000MBPS,
AD_LINK_SPEED_40000MBPS,
+ AD_LINK_SPEED_50000MBPS,
AD_LINK_SPEED_56000MBPS,
AD_LINK_SPEED_100000MBPS,
};
@@ -259,10 +262,13 @@ static inline int __check_agg_selection_timer(struct port *port)
* %AD_LINK_SPEED_100MBPS,
* %AD_LINK_SPEED_1000MBPS,
* %AD_LINK_SPEED_2500MBPS,
+ * %AD_LINK_SPEED_5000MBPS,
* %AD_LINK_SPEED_10000MBPS
+ * %AD_LINK_SPEED_14000MBPS,
* %AD_LINK_SPEED_20000MBPS
* %AD_LINK_SPEED_25000MBPS
* %AD_LINK_SPEED_40000MBPS
+ * %AD_LINK_SPEED_50000MBPS
* %AD_LINK_SPEED_56000MBPS
* %AD_LINK_SPEED_100000MBPS
*/
@@ -296,10 +302,18 @@ static u16 __get_link_speed(struct port *port)
speed = AD_LINK_SPEED_2500MBPS;
break;
+ case SPEED_5000:
+ speed = AD_LINK_SPEED_5000MBPS;
+ break;
+
case SPEED_10000:
speed = AD_LINK_SPEED_10000MBPS;
break;
+ case SPEED_14000:
+ speed = AD_LINK_SPEED_14000MBPS;
+ break;
+
case SPEED_20000:
speed = AD_LINK_SPEED_20000MBPS;
break;
@@ -312,6 +326,10 @@ static u16 __get_link_speed(struct port *port)
speed = AD_LINK_SPEED_40000MBPS;
break;
+ case SPEED_50000:
+ speed = AD_LINK_SPEED_50000MBPS;
+ break;
+
case SPEED_56000:
speed = AD_LINK_SPEED_56000MBPS;
break;
@@ -707,9 +725,15 @@ static u32 __get_agg_bandwidth(struct aggregator *aggregator)
case AD_LINK_SPEED_2500MBPS:
bandwidth = nports * 2500;
break;
+ case AD_LINK_SPEED_5000MBPS:
+ bandwidth = nports * 5000;
+ break;
case AD_LINK_SPEED_10000MBPS:
bandwidth = nports * 10000;
break;
+ case AD_LINK_SPEED_14000MBPS:
+ bandwidth = nports * 14000;
+ break;
case AD_LINK_SPEED_20000MBPS:
bandwidth = nports * 20000;
break;
@@ -719,6 +743,9 @@ static u32 __get_agg_bandwidth(struct aggregator *aggregator)
case AD_LINK_SPEED_40000MBPS:
bandwidth = nports * 40000;
break;
+ case AD_LINK_SPEED_50000MBPS:
+ bandwidth = nports * 50000;
+ break;
case AD_LINK_SPEED_56000MBPS:
bandwidth = nports * 56000;
break;
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 2359478b977f..8ab6bdbe1682 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4192,7 +4192,6 @@ static void bond_destructor(struct net_device *bond_dev)
struct bonding *bond = netdev_priv(bond_dev);
if (bond->wq)
destroy_workqueue(bond->wq);
- free_netdev(bond_dev);
}
void bond_setup(struct net_device *bond_dev)
@@ -4212,7 +4211,8 @@ void bond_setup(struct net_device *bond_dev)
bond_dev->netdev_ops = &bond_netdev_ops;
bond_dev->ethtool_ops = &bond_ethtool_ops;
- bond_dev->destructor = bond_destructor;
+ bond_dev->needs_free_netdev = true;
+ bond_dev->priv_destructor = bond_destructor;
SET_NETDEV_DEVTYPE(bond_dev, &bond_type);
@@ -4736,7 +4736,7 @@ int bond_create(struct net *net, const char *name)
rtnl_unlock();
if (res < 0)
- bond_destructor(bond_dev);
+ free_netdev(bond_dev);
return res;
}
diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c
index ddabce759456..71a7c3b44fdd 100644
--- a/drivers/net/caif/caif_hsi.c
+++ b/drivers/net/caif/caif_hsi.c
@@ -1121,7 +1121,7 @@ static void cfhsi_setup(struct net_device *dev)
dev->flags = IFF_POINTOPOINT | IFF_NOARP;
dev->mtu = CFHSI_MAX_CAIF_FRAME_SZ;
dev->priv_flags |= IFF_NO_QUEUE;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->netdev_ops = &cfhsi_netdevops;
for (i = 0; i < CFHSI_PRIO_LAST; ++i)
skb_queue_head_init(&cfhsi->qhead[i]);
diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c
index c2dea4916e5d..76e1d3545105 100644
--- a/drivers/net/caif/caif_serial.c
+++ b/drivers/net/caif/caif_serial.c
@@ -428,7 +428,7 @@ static void caifdev_setup(struct net_device *dev)
dev->flags = IFF_POINTOPOINT | IFF_NOARP;
dev->mtu = CAIF_MAX_MTU;
dev->priv_flags |= IFF_NO_QUEUE;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
skb_queue_head_init(&serdev->head);
serdev->common.link_select = CAIF_LINK_LOW_LATENCY;
serdev->common.use_frag = true;
diff --git a/drivers/net/caif/caif_spi.c b/drivers/net/caif/caif_spi.c
index 3a529fbe539f..fc21afe852b9 100644
--- a/drivers/net/caif/caif_spi.c
+++ b/drivers/net/caif/caif_spi.c
@@ -712,7 +712,7 @@ static void cfspi_setup(struct net_device *dev)
dev->flags = IFF_NOARP | IFF_POINTOPOINT;
dev->priv_flags |= IFF_NO_QUEUE;
dev->mtu = SPI_MAX_PAYLOAD_SIZE;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
skb_queue_head_init(&cfspi->qhead);
skb_queue_head_init(&cfspi->chead);
cfspi->cfdev.link_select = CAIF_LINK_HIGH_BANDW;
diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c
index 6122768c8644..1794ea0420b7 100644
--- a/drivers/net/caif/caif_virtio.c
+++ b/drivers/net/caif/caif_virtio.c
@@ -617,7 +617,7 @@ static void cfv_netdev_setup(struct net_device *netdev)
netdev->tx_queue_len = 100;
netdev->flags = IFF_POINTOPOINT | IFF_NOARP;
netdev->mtu = CFV_DEF_MTU_SIZE;
- netdev->destructor = free_netdev;
+ netdev->needs_free_netdev = true;
}
/* Create debugfs counters for the device */
diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index 611d16a7061d..ae4ed03dc642 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -391,6 +391,9 @@ void can_change_state(struct net_device *dev, struct can_frame *cf,
can_update_state_error_stats(dev, new_state);
priv->state = new_state;
+ if (!cf)
+ return;
+
if (unlikely(new_state == CAN_STATE_BUS_OFF)) {
cf->can_id |= CAN_ERR_BUSOFF;
return;
diff --git a/drivers/net/can/peak_canfd/peak_canfd.c b/drivers/net/can/peak_canfd/peak_canfd.c
index 0d57be5ea97b..85268be0c913 100644
--- a/drivers/net/can/peak_canfd/peak_canfd.c
+++ b/drivers/net/can/peak_canfd/peak_canfd.c
@@ -489,7 +489,7 @@ int peak_canfd_handle_msgs_list(struct peak_canfd_priv *priv,
struct pucan_rx_msg *msg_list, int msg_count)
{
void *msg_ptr = msg_list;
- int i, msg_size;
+ int i, msg_size = 0;
for (i = 0; i < msg_count; i++) {
msg_size = peak_canfd_handle_msg(priv, msg_ptr);
diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c
index eb7173713bbc..6a6e896e52fa 100644
--- a/drivers/net/can/slcan.c
+++ b/drivers/net/can/slcan.c
@@ -417,7 +417,7 @@ static int slc_open(struct net_device *dev)
static void slc_free_netdev(struct net_device *dev)
{
int i = dev->base_addr;
- free_netdev(dev);
+
slcan_devs[i] = NULL;
}
@@ -436,7 +436,8 @@ static const struct net_device_ops slc_netdev_ops = {
static void slc_setup(struct net_device *dev)
{
dev->netdev_ops = &slc_netdev_ops;
- dev->destructor = slc_free_netdev;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = slc_free_netdev;
dev->hard_header_len = 0;
dev->addr_len = 0;
@@ -761,8 +762,6 @@ static void __exit slcan_exit(void)
if (sl->tty) {
printk(KERN_ERR "%s: tty discipline still running\n",
dev->name);
- /* Intentionally leak the control block. */
- dev->destructor = NULL;
}
unregister_netdev(dev);
diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c
index eecee7f8dfb7..afcc1312dbaf 100644
--- a/drivers/net/can/usb/gs_usb.c
+++ b/drivers/net/can/usb/gs_usb.c
@@ -265,6 +265,8 @@ static int gs_cmd_reset(struct gs_usb *gsusb, struct gs_can *gsdev)
sizeof(*dm),
1000);
+ kfree(dm);
+
return rc;
}
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
index 57913dbbae0a..1ca76e03e965 100644
--- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c
+++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
@@ -908,8 +908,6 @@ static int peak_usb_probe(struct usb_interface *intf,
const struct peak_usb_adapter *peak_usb_adapter = NULL;
int i, err = -ENOMEM;
- usb_dev = interface_to_usbdev(intf);
-
/* get corresponding PCAN-USB adapter */
for (i = 0; i < ARRAY_SIZE(peak_usb_adapters_list); i++)
if (peak_usb_adapters_list[i]->device_id == usb_id_product) {
@@ -920,7 +918,7 @@ static int peak_usb_probe(struct usb_interface *intf,
if (!peak_usb_adapter) {
/* should never come except device_id bad usage in this file */
pr_err("%s: didn't find device id. 0x%x in devices list\n",
- PCAN_USB_DRIVER_NAME, usb_dev->descriptor.idProduct);
+ PCAN_USB_DRIVER_NAME, usb_id_product);
return -ENODEV;
}
diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c
index facca33d53e9..a8cb33264ff1 100644
--- a/drivers/net/can/vcan.c
+++ b/drivers/net/can/vcan.c
@@ -152,7 +152,7 @@ static const struct net_device_ops vcan_netdev_ops = {
static void vcan_setup(struct net_device *dev)
{
dev->type = ARPHRD_CAN;
- dev->mtu = CAN_MTU;
+ dev->mtu = CANFD_MTU;
dev->hard_header_len = 0;
dev->addr_len = 0;
dev->tx_queue_len = 0;
@@ -163,7 +163,7 @@ static void vcan_setup(struct net_device *dev)
dev->flags |= IFF_ECHO;
dev->netdev_ops = &vcan_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
}
static struct rtnl_link_ops vcan_link_ops __read_mostly = {
diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c
index 7fbb24795681..cfe889e8f172 100644
--- a/drivers/net/can/vxcan.c
+++ b/drivers/net/can/vxcan.c
@@ -150,13 +150,13 @@ static const struct net_device_ops vxcan_netdev_ops = {
static void vxcan_setup(struct net_device *dev)
{
dev->type = ARPHRD_CAN;
- dev->mtu = CAN_MTU;
+ dev->mtu = CANFD_MTU;
dev->hard_header_len = 0;
dev->addr_len = 0;
dev->tx_queue_len = 0;
dev->flags = (IFF_NOARP|IFF_ECHO);
dev->netdev_ops = &vxcan_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
}
/* forward declaration for rtnl_create_link() */
diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c
index 149244aac20a..9905b52fe293 100644
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -328,7 +328,6 @@ static void dummy_free_netdev(struct net_device *dev)
struct dummy_priv *priv = netdev_priv(dev);
kfree(priv->vfinfo);
- free_netdev(dev);
}
static void dummy_setup(struct net_device *dev)
@@ -338,7 +337,8 @@ static void dummy_setup(struct net_device *dev)
/* Initialize the device structure. */
dev->netdev_ops = &dummy_netdev_ops;
dev->ethtool_ops = &dummy_ethtool_ops;
- dev->destructor = dummy_free_netdev;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = dummy_free_netdev;
/* Fill in device structure with ethernet-generic values. */
dev->flags |= IFF_NOARP;
diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c
index 08d11cede9c9..f5b237e0bd60 100644
--- a/drivers/net/ethernet/amazon/ena/ena_com.c
+++ b/drivers/net/ethernet/amazon/ena/ena_com.c
@@ -61,6 +61,8 @@
#define ENA_MMIO_READ_TIMEOUT 0xFFFFFFFF
+#define ENA_REGS_ADMIN_INTR_MASK 1
+
/*****************************************************************************/
/*****************************************************************************/
/*****************************************************************************/
@@ -232,11 +234,9 @@ static struct ena_comp_ctx *__ena_com_submit_admin_cmd(struct ena_com_admin_queu
tail_masked = admin_queue->sq.tail & queue_size_mask;
/* In case of queue FULL */
- cnt = admin_queue->sq.tail - admin_queue->sq.head;
+ cnt = atomic_read(&admin_queue->outstanding_cmds);
if (cnt >= admin_queue->q_depth) {
- pr_debug("admin queue is FULL (tail %d head %d depth: %d)\n",
- admin_queue->sq.tail, admin_queue->sq.head,
- admin_queue->q_depth);
+ pr_debug("admin queue is full.\n");
admin_queue->stats.out_of_space++;
return ERR_PTR(-ENOSPC);
}
@@ -508,15 +508,20 @@ static int ena_com_comp_status_to_errno(u8 comp_status)
static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_ctx,
struct ena_com_admin_queue *admin_queue)
{
- unsigned long flags;
- u32 start_time;
+ unsigned long flags, timeout;
int ret;
- start_time = ((u32)jiffies_to_usecs(jiffies));
+ timeout = jiffies + ADMIN_CMD_TIMEOUT_US;
+
+ while (1) {
+ spin_lock_irqsave(&admin_queue->q_lock, flags);
+ ena_com_handle_admin_completion(admin_queue);
+ spin_unlock_irqrestore(&admin_queue->q_lock, flags);
+
+ if (comp_ctx->status != ENA_CMD_SUBMITTED)
+ break;
- while (comp_ctx->status == ENA_CMD_SUBMITTED) {
- if ((((u32)jiffies_to_usecs(jiffies)) - start_time) >
- ADMIN_CMD_TIMEOUT_US) {
+ if (time_is_before_jiffies(timeout)) {
pr_err("Wait for completion (polling) timeout\n");
/* ENA didn't have any completion */
spin_lock_irqsave(&admin_queue->q_lock, flags);
@@ -528,10 +533,6 @@ static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_c
goto err;
}
- spin_lock_irqsave(&admin_queue->q_lock, flags);
- ena_com_handle_admin_completion(admin_queue);
- spin_unlock_irqrestore(&admin_queue->q_lock, flags);
-
msleep(100);
}
@@ -1455,6 +1456,12 @@ void ena_com_admin_destroy(struct ena_com_dev *ena_dev)
void ena_com_set_admin_polling_mode(struct ena_com_dev *ena_dev, bool polling)
{
+ u32 mask_value = 0;
+
+ if (polling)
+ mask_value = ENA_REGS_ADMIN_INTR_MASK;
+
+ writel(mask_value, ena_dev->reg_bar + ENA_REGS_INTR_MASK_OFF);
ena_dev->admin_queue.polling = polling;
}
diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c
index 67b2338f8fb3..3ee55e2fd694 100644
--- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c
+++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c
@@ -80,7 +80,6 @@ static const struct ena_stats ena_stats_tx_strings[] = {
ENA_STAT_TX_ENTRY(tx_poll),
ENA_STAT_TX_ENTRY(doorbells),
ENA_STAT_TX_ENTRY(prepare_ctx_err),
- ENA_STAT_TX_ENTRY(missing_tx_comp),
ENA_STAT_TX_ENTRY(bad_req_id),
};
@@ -94,6 +93,7 @@ static const struct ena_stats ena_stats_rx_strings[] = {
ENA_STAT_RX_ENTRY(dma_mapping_err),
ENA_STAT_RX_ENTRY(bad_desc_num),
ENA_STAT_RX_ENTRY(rx_copybreak_pkt),
+ ENA_STAT_RX_ENTRY(empty_rx_ring),
};
static const struct ena_stats ena_stats_ena_com_strings[] = {
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 7c1214d78855..4f16ed38bcf3 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -190,6 +190,7 @@ static void ena_init_io_rings(struct ena_adapter *adapter)
rxr->sgl_size = adapter->max_rx_sgl_size;
rxr->smoothed_interval =
ena_com_get_nonadaptive_moderation_interval_rx(ena_dev);
+ rxr->empty_rx_queue = 0;
}
}
@@ -1078,6 +1079,26 @@ inline void ena_adjust_intr_moderation(struct ena_ring *rx_ring,
rx_ring->per_napi_bytes = 0;
}
+static inline void ena_unmask_interrupt(struct ena_ring *tx_ring,
+ struct ena_ring *rx_ring)
+{
+ struct ena_eth_io_intr_reg intr_reg;
+
+ /* Update intr register: rx intr delay,
+ * tx intr delay and interrupt unmask
+ */
+ ena_com_update_intr_reg(&intr_reg,
+ rx_ring->smoothed_interval,
+ tx_ring->smoothed_interval,
+ true);
+
+ /* It is a shared MSI-X.
+ * Tx and Rx CQ have a pointer to it.
+ * So we use one of them to reach the intr reg
+ */
+ ena_com_unmask_intr(rx_ring->ena_com_io_cq, &intr_reg);
+}
+
static inline void ena_update_ring_numa_node(struct ena_ring *tx_ring,
struct ena_ring *rx_ring)
{
@@ -1108,7 +1129,6 @@ static int ena_io_poll(struct napi_struct *napi, int budget)
{
struct ena_napi *ena_napi = container_of(napi, struct ena_napi, napi);
struct ena_ring *tx_ring, *rx_ring;
- struct ena_eth_io_intr_reg intr_reg;
u32 tx_work_done;
u32 rx_work_done;
@@ -1149,22 +1169,9 @@ static int ena_io_poll(struct napi_struct *napi, int budget)
if (ena_com_get_adaptive_moderation_enabled(rx_ring->ena_dev))
ena_adjust_intr_moderation(rx_ring, tx_ring);
- /* Update intr register: rx intr delay,
- * tx intr delay and interrupt unmask
- */
- ena_com_update_intr_reg(&intr_reg,
- rx_ring->smoothed_interval,
- tx_ring->smoothed_interval,
- true);
-
- /* It is a shared MSI-X.
- * Tx and Rx CQ have pointer to it.
- * So we use one of them to reach the intr reg
- */
- ena_com_unmask_intr(rx_ring->ena_com_io_cq, &intr_reg);
+ ena_unmask_interrupt(tx_ring, rx_ring);
}
-
ena_update_ring_numa_node(tx_ring, rx_ring);
ret = rx_work_done;
@@ -1485,6 +1492,11 @@ static int ena_up_complete(struct ena_adapter *adapter)
ena_napi_enable_all(adapter);
+ /* Enable completion queues interrupt */
+ for (i = 0; i < adapter->num_queues; i++)
+ ena_unmask_interrupt(&adapter->tx_ring[i],
+ &adapter->rx_ring[i]);
+
/* schedule napi in case we had pending packets
* from the last time we disable napi
*/
@@ -1532,6 +1544,7 @@ static int ena_create_io_tx_queue(struct ena_adapter *adapter, int qid)
"Failed to get TX queue handlers. TX queue num %d rc: %d\n",
qid, rc);
ena_com_destroy_io_queue(ena_dev, ena_qid);
+ return rc;
}
ena_com_update_numa_node(tx_ring->ena_com_io_cq, ctx.numa_node);
@@ -1596,6 +1609,7 @@ static int ena_create_io_rx_queue(struct ena_adapter *adapter, int qid)
"Failed to get RX queue handlers. RX queue num %d rc: %d\n",
qid, rc);
ena_com_destroy_io_queue(ena_dev, ena_qid);
+ return rc;
}
ena_com_update_numa_node(rx_ring->ena_com_io_cq, ctx.numa_node);
@@ -1981,6 +1995,7 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev)
tx_info->tx_descs = nb_hw_desc;
tx_info->last_jiffies = jiffies;
+ tx_info->print_once = 0;
tx_ring->next_to_use = ENA_TX_RING_IDX_NEXT(next_to_use,
tx_ring->ring_size);
@@ -2550,13 +2565,44 @@ err:
"Reset attempt failed. Can not reset the device\n");
}
-static void check_for_missing_tx_completions(struct ena_adapter *adapter)
+static int check_missing_comp_in_queue(struct ena_adapter *adapter,
+ struct ena_ring *tx_ring)
{
struct ena_tx_buffer *tx_buf;
unsigned long last_jiffies;
+ u32 missed_tx = 0;
+ int i;
+
+ for (i = 0; i < tx_ring->ring_size; i++) {
+ tx_buf = &tx_ring->tx_buffer_info[i];
+ last_jiffies = tx_buf->last_jiffies;
+ if (unlikely(last_jiffies &&
+ time_is_before_jiffies(last_jiffies + TX_TIMEOUT))) {
+ if (!tx_buf->print_once)
+ netif_notice(adapter, tx_err, adapter->netdev,
+ "Found a Tx that wasn't completed on time, qid %d, index %d.\n",
+ tx_ring->qid, i);
+
+ tx_buf->print_once = 1;
+ missed_tx++;
+
+ if (unlikely(missed_tx > MAX_NUM_OF_TIMEOUTED_PACKETS)) {
+ netif_err(adapter, tx_err, adapter->netdev,
+ "The number of lost tx completions is above the threshold (%d > %d). Reset the device\n",
+ missed_tx, MAX_NUM_OF_TIMEOUTED_PACKETS);
+ set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags);
+ return -EIO;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static void check_for_missing_tx_completions(struct ena_adapter *adapter)
+{
struct ena_ring *tx_ring;
- int i, j, budget;
- u32 missed_tx;
+ int i, budget, rc;
/* Make sure the driver doesn't turn the device in other process */
smp_rmb();
@@ -2572,31 +2618,9 @@ static void check_for_missing_tx_completions(struct ena_adapter *adapter)
for (i = adapter->last_monitored_tx_qid; i < adapter->num_queues; i++) {
tx_ring = &adapter->tx_ring[i];
- for (j = 0; j < tx_ring->ring_size; j++) {
- tx_buf = &tx_ring->tx_buffer_info[j];
- last_jiffies = tx_buf->last_jiffies;
- if (unlikely(last_jiffies && time_is_before_jiffies(last_jiffies + TX_TIMEOUT))) {
- netif_notice(adapter, tx_err, adapter->netdev,
- "Found a Tx that wasn't completed on time, qid %d, index %d.\n",
- tx_ring->qid, j);
-
- u64_stats_update_begin(&tx_ring->syncp);
- missed_tx = tx_ring->tx_stats.missing_tx_comp++;
- u64_stats_update_end(&tx_ring->syncp);
-
- /* Clear last jiffies so the lost buffer won't
- * be counted twice.
- */
- tx_buf->last_jiffies = 0;
-
- if (unlikely(missed_tx > MAX_NUM_OF_TIMEOUTED_PACKETS)) {
- netif_err(adapter, tx_err, adapter->netdev,
- "The number of lost tx completion is above the threshold (%d > %d). Reset the device\n",
- missed_tx, MAX_NUM_OF_TIMEOUTED_PACKETS);
- set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags);
- }
- }
- }
+ rc = check_missing_comp_in_queue(adapter, tx_ring);
+ if (unlikely(rc))
+ return;
budget--;
if (!budget)
@@ -2606,6 +2630,58 @@ static void check_for_missing_tx_completions(struct ena_adapter *adapter)
adapter->last_monitored_tx_qid = i % adapter->num_queues;
}
+/* trigger napi schedule after 2 consecutive detections */
+#define EMPTY_RX_REFILL 2
+/* For the rare case where the device runs out of Rx descriptors and the
+ * napi handler failed to refill new Rx descriptors (due to a lack of memory
+ * for example).
+ * This case will lead to a deadlock:
+ * The device won't send interrupts since all the new Rx packets will be dropped
+ * The napi handler won't allocate new Rx descriptors, so the device won't
+ * be able to send new packets.
+ *
+ * This scenario can happen when the kernel's vm.min_free_kbytes is too small.
+ * It is recommended to have at least 512MB, with a minimum of 128MB for
+ * constrained environments.
+ *
+ * When such a situation is detected - Reschedule napi
+ */
+static void check_for_empty_rx_ring(struct ena_adapter *adapter)
+{
+ struct ena_ring *rx_ring;
+ int i, refill_required;
+
+ if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags))
+ return;
+
+ if (test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))
+ return;
+
+ for (i = 0; i < adapter->num_queues; i++) {
+ rx_ring = &adapter->rx_ring[i];
+
+ refill_required =
+ ena_com_sq_empty_space(rx_ring->ena_com_io_sq);
+ if (unlikely(refill_required == (rx_ring->ring_size - 1))) {
+ rx_ring->empty_rx_queue++;
+
+ if (rx_ring->empty_rx_queue >= EMPTY_RX_REFILL) {
+ u64_stats_update_begin(&rx_ring->syncp);
+ rx_ring->rx_stats.empty_rx_ring++;
+ u64_stats_update_end(&rx_ring->syncp);
+
+ netif_err(adapter, drv, adapter->netdev,
+ "trigger refill for ring %d\n", i);
+
+ napi_schedule(rx_ring->napi);
+ rx_ring->empty_rx_queue = 0;
+ }
+ } else {
+ rx_ring->empty_rx_queue = 0;
+ }
+ }
+}
+
/* Check for keep alive expiration */
static void check_for_missing_keep_alive(struct ena_adapter *adapter)
{
@@ -2660,6 +2736,8 @@ static void ena_timer_service(unsigned long data)
check_for_missing_tx_completions(adapter);
+ check_for_empty_rx_ring(adapter);
+
if (debug_area)
ena_dump_stats_to_buf(adapter, debug_area);
@@ -2840,6 +2918,11 @@ static void ena_release_bars(struct ena_com_dev *ena_dev, struct pci_dev *pdev)
{
int release_bars;
+ if (ena_dev->mem_bar)
+ devm_iounmap(&pdev->dev, ena_dev->mem_bar);
+
+ devm_iounmap(&pdev->dev, ena_dev->reg_bar);
+
release_bars = pci_select_bars(pdev, IORESOURCE_MEM) & ENA_BAR_MASK;
pci_release_selected_regions(pdev, release_bars);
}
@@ -2927,8 +3010,9 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
goto err_free_ena_dev;
}
- ena_dev->reg_bar = ioremap(pci_resource_start(pdev, ENA_REG_BAR),
- pci_resource_len(pdev, ENA_REG_BAR));
+ ena_dev->reg_bar = devm_ioremap(&pdev->dev,
+ pci_resource_start(pdev, ENA_REG_BAR),
+ pci_resource_len(pdev, ENA_REG_BAR));
if (!ena_dev->reg_bar) {
dev_err(&pdev->dev, "failed to remap regs bar\n");
rc = -EFAULT;
@@ -2948,8 +3032,9 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
ena_set_push_mode(pdev, ena_dev, &get_feat_ctx);
if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV) {
- ena_dev->mem_bar = ioremap_wc(pci_resource_start(pdev, ENA_MEM_BAR),
- pci_resource_len(pdev, ENA_MEM_BAR));
+ ena_dev->mem_bar = devm_ioremap_wc(&pdev->dev,
+ pci_resource_start(pdev, ENA_MEM_BAR),
+ pci_resource_len(pdev, ENA_MEM_BAR));
if (!ena_dev->mem_bar) {
rc = -EFAULT;
goto err_device_destroy;
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h
index 0e22bce6239d..a4d3d5e21068 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.h
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h
@@ -45,7 +45,7 @@
#define DRV_MODULE_VER_MAJOR 1
#define DRV_MODULE_VER_MINOR 1
-#define DRV_MODULE_VER_SUBMINOR 2
+#define DRV_MODULE_VER_SUBMINOR 7
#define DRV_MODULE_NAME "ena"
#ifndef DRV_MODULE_VERSION
@@ -146,7 +146,18 @@ struct ena_tx_buffer {
u32 tx_descs;
/* num of buffers used by this skb */
u32 num_of_bufs;
- /* Save the last jiffies to detect missing tx packets */
+
+ /* Used when detecting missing tx packets to limit the number of prints */
+ u32 print_once;
+ /* Save the last jiffies to detect missing tx packets
+ *
+ * Set to a non-zero value in ena_start_xmit and set to zero in
+ * napi and the timer service routine.
+ *
+ * While this value is not protected by a lock,
+ * a given packet is not expected to be handled by ena_start_xmit
+ * and by napi/timer_service at the same time.
+ */
unsigned long last_jiffies;
struct ena_com_buf bufs[ENA_PKT_MAX_BUFS];
} ____cacheline_aligned;
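A hedged sketch of the lock-free check the comment above enables; tx_completion_missing and the timeout argument are illustrative, not the driver's actual symbols:

/* Illustrative only: flag a TX packet whose completion is overdue. */
static bool tx_completion_missing(const struct ena_tx_buffer *tx_buf,
                                  unsigned long timeout)
{
        unsigned long last = READ_ONCE(tx_buf->last_jiffies);

        /* zero means the packet was already completed (or never sent) */
        if (!last)
                return false;

        return time_is_before_jiffies(last + timeout);
}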
@@ -170,7 +181,6 @@ struct ena_stats_tx {
u64 napi_comp;
u64 tx_poll;
u64 doorbells;
- u64 missing_tx_comp;
u64 bad_req_id;
};
@@ -184,6 +194,7 @@ struct ena_stats_rx {
u64 dma_mapping_err;
u64 bad_desc_num;
u64 rx_copybreak_pkt;
+ u64 empty_rx_ring;
};
struct ena_ring {
@@ -231,6 +242,7 @@ struct ena_ring {
struct ena_stats_tx tx_stats;
struct ena_stats_rx rx_stats;
};
+ int empty_rx_queue;
} ____cacheline_aligned;
struct ena_stats_dev {
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h
index b8e3d88f0879..a66aee51ab5b 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h
@@ -193,9 +193,6 @@ int hw_atl_utils_hw_get_regs(struct aq_hw_s *self,
struct aq_hw_caps_s *aq_hw_caps,
u32 *regs_buff);
-int hw_atl_utils_hw_get_settings(struct aq_hw_s *self,
- struct ethtool_cmd *cmd);
-
int hw_atl_utils_hw_set_power(struct aq_hw_s *self,
unsigned int power_state);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 5f49334dcad5..f619c4cac51f 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -3883,15 +3883,26 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
/* when transmitting in a vf, start bd must hold the ethertype
* for fw to enforce it
*/
+ u16 vlan_tci = 0;
#ifndef BNX2X_STOP_ON_ERROR
- if (IS_VF(bp))
+ if (IS_VF(bp)) {
#endif
- tx_start_bd->vlan_or_ethertype =
- cpu_to_le16(ntohs(eth->h_proto));
+ /* Still need to consider inband vlan for the enforced ethertype case */
+ if (__vlan_get_tag(skb, &vlan_tci)) {
+ tx_start_bd->vlan_or_ethertype =
+ cpu_to_le16(ntohs(eth->h_proto));
+ } else {
+ tx_start_bd->bd_flags.as_bitfield |=
+ (X_ETH_INBAND_VLAN <<
+ ETH_TX_BD_FLAGS_VLAN_MODE_SHIFT);
+ tx_start_bd->vlan_or_ethertype =
+ cpu_to_le16(vlan_tci);
+ }
#ifndef BNX2X_STOP_ON_ERROR
- else
+ } else {
/* used by FW for packet accounting */
tx_start_bd->vlan_or_ethertype = cpu_to_le16(pkt_prod);
+ }
#endif
}
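For reference, __vlan_get_tag() from linux/if_vlan.h returns 0 and fills the TCI only when the frame itself carries an 802.1Q header, which is why the branch above treats a non-zero return as "no inband tag". A small sketch of that contract, with placeholder helpers:

        u16 vlan_tci;

        if (__vlan_get_tag(skb, &vlan_tci) == 0)
                use_inband_vlan(vlan_tci);              /* placeholder: tag present */
        else
                use_ethertype(eth_hdr(skb)->h_proto);   /* placeholder: untagged */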
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index a851f95c307a..349a46593abf 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -12729,7 +12729,7 @@ static int bnx2x_set_mc_list(struct bnx2x *bp)
} else {
/* If no mc addresses are required, flush the configuration */
rc = bnx2x_config_mcast(bp, &rparam, BNX2X_MCAST_CMD_DEL);
- if (rc)
+ if (rc < 0)
BNX2X_ERR("Failed to clear multicast configuration %d\n",
rc);
}
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
index bdfd53b46bc5..9ca994d0bab6 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
@@ -901,6 +901,8 @@ static void bnx2x_vf_flr(struct bnx2x *bp, struct bnx2x_virtf *vf)
/* release VF resources */
bnx2x_vf_free_resc(bp, vf);
+ vf->malicious = false;
+
/* re-open the mailbox */
bnx2x_vf_enable_mbx(bp, vf->abs_vfid);
return;
@@ -1822,9 +1824,11 @@ get_vf:
vf->abs_vfid, qidx);
bnx2x_vf_handle_rss_update_eqe(bp, vf);
case EVENT_RING_OPCODE_VF_FLR:
- case EVENT_RING_OPCODE_MALICIOUS_VF:
/* Do nothing for now */
return 0;
+ case EVENT_RING_OPCODE_MALICIOUS_VF:
+ vf->malicious = true;
+ return 0;
}
return 0;
@@ -1905,6 +1909,13 @@ void bnx2x_iov_adjust_stats_req(struct bnx2x *bp)
continue;
}
+ if (vf->malicious) {
+ DP_AND((BNX2X_MSG_IOV | BNX2X_MSG_STATS),
+ "vf %d malicious so no stats for it\n",
+ vf->abs_vfid);
+ continue;
+ }
+
DP_AND((BNX2X_MSG_IOV | BNX2X_MSG_STATS),
"add addresses for vf %d\n", vf->abs_vfid);
for_each_vfq(vf, j) {
@@ -3042,7 +3053,7 @@ void bnx2x_vf_pci_dealloc(struct bnx2x *bp)
{
BNX2X_PCI_FREE(bp->vf2pf_mbox, bp->vf2pf_mbox_mapping,
sizeof(struct bnx2x_vf_mbx_msg));
- BNX2X_PCI_FREE(bp->vf2pf_mbox, bp->pf2vf_bulletin_mapping,
+ BNX2X_PCI_FREE(bp->pf2vf_bulletin, bp->pf2vf_bulletin_mapping,
sizeof(union pf_vf_bulletin));
}
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h
index 888d0b6632e8..53466f6cebab 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h
@@ -141,6 +141,7 @@ struct bnx2x_virtf {
#define VF_RESET 3 /* VF FLR'd, pending cleanup */
bool flr_clnup_stage; /* true during flr cleanup */
+ bool malicious; /* true if FW indicated so, until FLR */
/* dma */
dma_addr_t fw_stat_map;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 03f55daecb20..74e8e215524d 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -1301,10 +1301,11 @@ static inline struct sk_buff *bnxt_tpa_end(struct bnxt *bp,
cp_cons = NEXT_CMP(cp_cons);
}
- if (unlikely(agg_bufs > MAX_SKB_FRAGS)) {
+ if (unlikely(agg_bufs > MAX_SKB_FRAGS || TPA_END_ERRORS(tpa_end1))) {
bnxt_abort_tpa(bp, bnapi, cp_cons, agg_bufs);
- netdev_warn(bp->dev, "TPA frags %d exceeded MAX_SKB_FRAGS %d\n",
- agg_bufs, (int)MAX_SKB_FRAGS);
+ if (agg_bufs > MAX_SKB_FRAGS)
+ netdev_warn(bp->dev, "TPA frags %d exceeded MAX_SKB_FRAGS %d\n",
+ agg_bufs, (int)MAX_SKB_FRAGS);
return NULL;
}
@@ -1562,6 +1563,45 @@ next_rx_no_prod:
return rc;
}
+/* In netpoll mode, if we are using a combined completion ring, we need to
+ * discard the rx packets and recycle the buffers.
+ */
+static int bnxt_force_rx_discard(struct bnxt *bp, struct bnxt_napi *bnapi,
+ u32 *raw_cons, u8 *event)
+{
+ struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
+ u32 tmp_raw_cons = *raw_cons;
+ struct rx_cmp_ext *rxcmp1;
+ struct rx_cmp *rxcmp;
+ u16 cp_cons;
+ u8 cmp_type;
+
+ cp_cons = RING_CMP(tmp_raw_cons);
+ rxcmp = (struct rx_cmp *)
+ &cpr->cp_desc_ring[CP_RING(cp_cons)][CP_IDX(cp_cons)];
+
+ tmp_raw_cons = NEXT_RAW_CMP(tmp_raw_cons);
+ cp_cons = RING_CMP(tmp_raw_cons);
+ rxcmp1 = (struct rx_cmp_ext *)
+ &cpr->cp_desc_ring[CP_RING(cp_cons)][CP_IDX(cp_cons)];
+
+ if (!RX_CMP_VALID(rxcmp1, tmp_raw_cons))
+ return -EBUSY;
+
+ cmp_type = RX_CMP_TYPE(rxcmp);
+ if (cmp_type == CMP_TYPE_RX_L2_CMP) {
+ rxcmp1->rx_cmp_cfa_code_errors_v2 |=
+ cpu_to_le32(RX_CMPL_ERRORS_CRC_ERROR);
+ } else if (cmp_type == CMP_TYPE_RX_L2_TPA_END_CMP) {
+ struct rx_tpa_end_cmp_ext *tpa_end1;
+
+ tpa_end1 = (struct rx_tpa_end_cmp_ext *)rxcmp1;
+ tpa_end1->rx_tpa_end_cmp_errors_v2 |=
+ cpu_to_le32(RX_TPA_END_CMP_ERRORS);
+ }
+ return bnxt_rx_pkt(bp, bnapi, raw_cons, event);
+}
+
#define BNXT_GET_EVENT_PORT(data) \
((data) & \
ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_PORT_ID_MASK)
@@ -1744,7 +1784,11 @@ static int bnxt_poll_work(struct bnxt *bp, struct bnxt_napi *bnapi, int budget)
if (unlikely(tx_pkts > bp->tx_wake_thresh))
rx_pkts = budget;
} else if ((TX_CMP_TYPE(txcmp) & 0x30) == 0x10) {
- rc = bnxt_rx_pkt(bp, bnapi, &raw_cons, &event);
+ if (likely(budget))
+ rc = bnxt_rx_pkt(bp, bnapi, &raw_cons, &event);
+ else
+ rc = bnxt_force_rx_discard(bp, bnapi, &raw_cons,
+ &event);
if (likely(rc >= 0))
rx_pkts += rc;
else if (rc == -EBUSY) /* partial completion */
@@ -6663,12 +6707,11 @@ static void bnxt_poll_controller(struct net_device *dev)
struct bnxt *bp = netdev_priv(dev);
int i;
- for (i = 0; i < bp->cp_nr_rings; i++) {
- struct bnxt_irq *irq = &bp->irq_tbl[i];
+ /* Only process tx rings/combined rings in netpoll mode. */
+ for (i = 0; i < bp->tx_nr_rings; i++) {
+ struct bnxt_tx_ring_info *txr = &bp->tx_ring[i];
- disable_irq(irq->vector);
- irq->handler(irq->vector, bp->bnapi[i]);
- enable_irq(irq->vector);
+ napi_schedule(&txr->bnapi->napi);
}
}
#endif
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 3ef42dbc6327..d46a85041083 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -374,12 +374,16 @@ struct rx_tpa_end_cmp_ext {
__le32 rx_tpa_end_cmp_errors_v2;
#define RX_TPA_END_CMP_V2 (0x1 << 0)
- #define RX_TPA_END_CMP_ERRORS (0x7fff << 1)
+ #define RX_TPA_END_CMP_ERRORS (0x3 << 1)
#define RX_TPA_END_CMPL_ERRORS_SHIFT 1
u32 rx_tpa_end_cmp_start_opaque;
};
+#define TPA_END_ERRORS(rx_tpa_end_ext) \
+ ((rx_tpa_end_ext)->rx_tpa_end_cmp_errors_v2 & \
+ cpu_to_le32(RX_TPA_END_CMP_ERRORS))
+
#define DB_IDX_MASK 0xffffff
#define DB_IDX_VALID (0x1 << 26)
#define DB_IRQ_DIS (0x1 << 27)
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_main.h b/drivers/net/ethernet/cavium/liquidio/octeon_main.h
index bed9ef17bc26..7ccffbb0019e 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_main.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_main.h
@@ -144,7 +144,7 @@ static inline int
sleep_cond(wait_queue_head_t *wait_queue, int *condition)
{
int errno = 0;
- wait_queue_t we;
+ wait_queue_entry_t we;
init_waitqueue_entry(&we, current);
add_wait_queue(wait_queue, &we);
@@ -171,7 +171,7 @@ sleep_timeout_cond(wait_queue_head_t *wait_queue,
int *condition,
int timeout)
{
- wait_queue_t we;
+ wait_queue_entry_t we;
init_waitqueue_entry(&we, current);
add_wait_queue(wait_queue, &we);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 77ed2f628f9c..53309f659951 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -2171,9 +2171,10 @@ static int cxgb_up(struct adapter *adap)
{
int err;
+ mutex_lock(&uld_mutex);
err = setup_sge_queues(adap);
if (err)
- goto out;
+ goto rel_lock;
err = setup_rss(adap);
if (err)
goto freeq;
@@ -2197,7 +2198,6 @@ static int cxgb_up(struct adapter *adap)
goto irq_err;
}
- mutex_lock(&uld_mutex);
enable_rx(adap);
t4_sge_start(adap);
t4_intr_enable(adap);
@@ -2210,13 +2210,15 @@ static int cxgb_up(struct adapter *adap)
#endif
/* Initialize hash mac addr list*/
INIT_LIST_HEAD(&adap->mac_hlist);
- out:
return err;
+
irq_err:
dev_err(adap->pdev_dev, "request_irq failed, err %d\n", err);
freeq:
t4_free_sge_resources(adap);
- goto out;
+ rel_lock:
+ mutex_unlock(&uld_mutex);
+ return err;
}
static void cxgb_down(struct adapter *adapter)
@@ -4525,7 +4527,7 @@ static void dummy_setup(struct net_device *dev)
/* Initialize the device structure. */
dev->netdev_ops = &cxgb4_mgmt_netdev_ops;
dev->ethtool_ops = &cxgb4_mgmt_ethtool_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
}
static int config_mgmt_dev(struct pci_dev *pdev)
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index 9a520e4f0df9..290ad0563320 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -2647,7 +2647,7 @@ static int dpaa_eth_probe(struct platform_device *pdev)
priv->buf_layout[TX].priv_data_size = DPAA_TX_PRIV_DATA_SIZE; /* Tx */
/* device used for DMA mapping */
- arch_setup_dma_ops(dev, 0, 0, NULL, false);
+ set_dma_ops(dev, get_dma_ops(&pdev->dev));
err = dma_coerce_mask_and_coherent(dev, DMA_BIT_MASK(40));
if (err) {
dev_err(dev, "dma_coerce_mask_and_coherent() failed\n");
diff --git a/drivers/net/ethernet/freescale/fman/Kconfig b/drivers/net/ethernet/freescale/fman/Kconfig
index dc0850b3b517..8870a9a798ca 100644
--- a/drivers/net/ethernet/freescale/fman/Kconfig
+++ b/drivers/net/ethernet/freescale/fman/Kconfig
@@ -2,6 +2,7 @@ config FSL_FMAN
tristate "FMan support"
depends on FSL_SOC || ARCH_LAYERSCAPE || COMPILE_TEST
select GENERIC_ALLOCATOR
+ depends on HAS_DMA
select PHYLIB
default n
help
diff --git a/drivers/net/ethernet/freescale/fman/mac.c b/drivers/net/ethernet/freescale/fman/mac.c
index 0b31f8502ada..6e67d22fd0d5 100644
--- a/drivers/net/ethernet/freescale/fman/mac.c
+++ b/drivers/net/ethernet/freescale/fman/mac.c
@@ -623,6 +623,8 @@ static struct platform_device *dpaa_eth_add_device(int fman_id,
goto no_mem;
}
+ set_dma_ops(&pdev->dev, get_dma_ops(priv->dev));
+
ret = platform_device_add_data(pdev, &data, sizeof(data));
if (ret)
goto err;
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c
index 6b15a507999c..7a8addda726e 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c
@@ -29,7 +29,7 @@ enum _dsm_rst_type {
HNS_ROCE_RESET_FUNC = 0x7,
};
-const guid_t hns_dsaf_acpi_dsm_guid =
+static const guid_t hns_dsaf_acpi_dsm_guid =
GUID_INIT(0x1A85AA1A, 0xE293, 0x415E,
0x8E, 0x28, 0x8D, 0x69, 0x0A, 0x0F, 0x82, 0x0A);
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
index b8fab149690f..e95795b3c841 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
@@ -288,9 +288,15 @@ static int hns_nic_config_phy_loopback(struct phy_device *phy_dev, u8 en)
/* Force 1000M Link, Default is 0x0200 */
phy_write(phy_dev, 7, 0x20C);
- phy_write(phy_dev, HNS_PHY_PAGE_REG, 0);
- /* Enable PHY loop-back */
+ /* Powerup Fiber */
+ phy_write(phy_dev, HNS_PHY_PAGE_REG, 1);
+ val = phy_read(phy_dev, COPPER_CONTROL_REG);
+ val &= ~PHY_POWER_DOWN;
+ phy_write(phy_dev, COPPER_CONTROL_REG, val);
+
+ /* Enable Phy Loopback */
+ phy_write(phy_dev, HNS_PHY_PAGE_REG, 0);
val = phy_read(phy_dev, COPPER_CONTROL_REG);
val |= PHY_LOOP_BACK;
val &= ~PHY_POWER_DOWN;
@@ -299,6 +305,12 @@ static int hns_nic_config_phy_loopback(struct phy_device *phy_dev, u8 en)
phy_write(phy_dev, HNS_PHY_PAGE_REG, 0xFA);
phy_write(phy_dev, 1, 0x400);
phy_write(phy_dev, 7, 0x200);
+
+ phy_write(phy_dev, HNS_PHY_PAGE_REG, 1);
+ val = phy_read(phy_dev, COPPER_CONTROL_REG);
+ val |= PHY_POWER_DOWN;
+ phy_write(phy_dev, COPPER_CONTROL_REG, val);
+
phy_write(phy_dev, HNS_PHY_PAGE_REG, 0);
phy_write(phy_dev, 9, 0xF00);
diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c
index 508923f39ccf..259e69a52ec5 100644
--- a/drivers/net/ethernet/ibm/emac/core.c
+++ b/drivers/net/ethernet/ibm/emac/core.c
@@ -343,6 +343,7 @@ static int emac_reset(struct emac_instance *dev)
{
struct emac_regs __iomem *p = dev->emacp;
int n = 20;
+ bool __maybe_unused try_internal_clock = false;
DBG(dev, "reset" NL);
@@ -355,6 +356,7 @@ static int emac_reset(struct emac_instance *dev)
}
#ifdef CONFIG_PPC_DCR_NATIVE
+do_retry:
/*
* PPC460EX/GT Embedded Processor Advanced User's Manual
* section 28.10.1 Mode Register 0 (EMACx_MR0) states:
@@ -362,10 +364,19 @@ static int emac_reset(struct emac_instance *dev)
* of the EMAC. If none is present, select the internal clock
* (SDR0_ETH_CFG[EMACx_PHY_CLK] = 1).
* After a soft reset, select the external clock.
+ *
+ * The AR8035-A PHY on the Meraki MR24 does not provide a TX Clk if the
+ * ethernet cable is not attached. This causes the reset to time out
+ * and the PHY detection code in emac_init_phy() is unable to
+ * communicate and detect the AR8035-A PHY. As a result, the emac
+ * driver bails out early and the user has no ethernet.
+ * In order to stay compatible with existing configurations, the
+ * driver will temporarily switch to the internal clock, after
+ * the first reset fails.
*/
if (emac_has_feature(dev, EMAC_FTR_460EX_PHY_CLK_FIX)) {
- if (dev->phy_address == 0xffffffff &&
- dev->phy_map == 0xffffffff) {
+ if (try_internal_clock || (dev->phy_address == 0xffffffff &&
+ dev->phy_map == 0xffffffff)) {
/* No PHY: select internal loop clock before reset */
dcri_clrset(SDR0, SDR0_ETH_CFG,
0, SDR0_ETH_CFG_ECS << dev->cell_index);
@@ -383,8 +394,15 @@ static int emac_reset(struct emac_instance *dev)
#ifdef CONFIG_PPC_DCR_NATIVE
if (emac_has_feature(dev, EMAC_FTR_460EX_PHY_CLK_FIX)) {
- if (dev->phy_address == 0xffffffff &&
- dev->phy_map == 0xffffffff) {
+ if (!n && !try_internal_clock) {
+ /* first attempt has timed out. */
+ n = 20;
+ try_internal_clock = true;
+ goto do_retry;
+ }
+
+ if (try_internal_clock || (dev->phy_address == 0xffffffff &&
+ dev->phy_map == 0xffffffff)) {
/* No PHY: restore external clock source after reset */
dcri_clrset(SDR0, SDR0_ETH_CFG,
SDR0_ETH_CFG_ECS << dev->cell_index, 0);
@@ -2460,20 +2478,24 @@ static int emac_mii_bus_reset(struct mii_bus *bus)
return emac_reset(dev);
}
+static int emac_mdio_phy_start_aneg(struct mii_phy *phy,
+ struct phy_device *phy_dev)
+{
+ phy_dev->autoneg = phy->autoneg;
+ phy_dev->speed = phy->speed;
+ phy_dev->duplex = phy->duplex;
+ phy_dev->advertising = phy->advertising;
+ return phy_start_aneg(phy_dev);
+}
+
static int emac_mdio_setup_aneg(struct mii_phy *phy, u32 advertise)
{
struct net_device *ndev = phy->dev;
struct emac_instance *dev = netdev_priv(ndev);
- dev->phy.autoneg = AUTONEG_ENABLE;
- dev->phy.speed = SPEED_1000;
- dev->phy.duplex = DUPLEX_FULL;
- dev->phy.advertising = advertise;
phy->autoneg = AUTONEG_ENABLE;
- phy->speed = dev->phy.speed;
- phy->duplex = dev->phy.duplex;
phy->advertising = advertise;
- return phy_start_aneg(dev->phy_dev);
+ return emac_mdio_phy_start_aneg(phy, dev->phy_dev);
}
static int emac_mdio_setup_forced(struct mii_phy *phy, int speed, int fd)
@@ -2481,13 +2503,10 @@ static int emac_mdio_setup_forced(struct mii_phy *phy, int speed, int fd)
struct net_device *ndev = phy->dev;
struct emac_instance *dev = netdev_priv(ndev);
- dev->phy.autoneg = AUTONEG_DISABLE;
- dev->phy.speed = speed;
- dev->phy.duplex = fd;
phy->autoneg = AUTONEG_DISABLE;
phy->speed = speed;
phy->duplex = fd;
- return phy_start_aneg(dev->phy_dev);
+ return emac_mdio_phy_start_aneg(phy, dev->phy_dev);
}
static int emac_mdio_poll_link(struct mii_phy *phy)
@@ -2509,16 +2528,17 @@ static int emac_mdio_read_link(struct mii_phy *phy)
{
struct net_device *ndev = phy->dev;
struct emac_instance *dev = netdev_priv(ndev);
+ struct phy_device *phy_dev = dev->phy_dev;
int res;
- res = phy_read_status(dev->phy_dev);
+ res = phy_read_status(phy_dev);
if (res)
return res;
- dev->phy.speed = phy->speed;
- dev->phy.duplex = phy->duplex;
- dev->phy.pause = phy->pause;
- dev->phy.asym_pause = phy->asym_pause;
+ phy->speed = phy_dev->speed;
+ phy->duplex = phy_dev->duplex;
+ phy->pause = phy_dev->pause;
+ phy->asym_pause = phy_dev->asym_pause;
return 0;
}
@@ -2528,13 +2548,6 @@ static int emac_mdio_init_phy(struct mii_phy *phy)
struct emac_instance *dev = netdev_priv(ndev);
phy_start(dev->phy_dev);
- dev->phy.autoneg = phy->autoneg;
- dev->phy.speed = phy->speed;
- dev->phy.duplex = phy->duplex;
- dev->phy.advertising = phy->advertising;
- dev->phy.pause = phy->pause;
- dev->phy.asym_pause = phy->asym_pause;
-
return phy_init_hw(dev->phy_dev);
}
diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index a93757c255f7..c0fbeb387db4 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1468,6 +1468,11 @@ static void ibmvnic_netpoll_controller(struct net_device *dev)
}
#endif
+static int ibmvnic_change_mtu(struct net_device *netdev, int new_mtu)
+{
+ return -EOPNOTSUPP;
+}
+
static const struct net_device_ops ibmvnic_netdev_ops = {
.ndo_open = ibmvnic_open,
.ndo_stop = ibmvnic_close,
@@ -1479,6 +1484,7 @@ static const struct net_device_ops ibmvnic_netdev_ops = {
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = ibmvnic_netpoll_controller,
#endif
+ .ndo_change_mtu = ibmvnic_change_mtu,
};
/* ethtool functions */
diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index cdde3cc28fb5..44d9610f7a15 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -399,6 +399,7 @@ struct i40e_pf {
#define I40E_FLAG_RX_CSUM_ENABLED BIT_ULL(1)
#define I40E_FLAG_MSI_ENABLED BIT_ULL(2)
#define I40E_FLAG_MSIX_ENABLED BIT_ULL(3)
+#define I40E_FLAG_HW_ATR_EVICT_ENABLED BIT_ULL(4)
#define I40E_FLAG_RSS_ENABLED BIT_ULL(6)
#define I40E_FLAG_VMDQ_ENABLED BIT_ULL(7)
#define I40E_FLAG_IWARP_ENABLED BIT_ULL(10)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 7a8eb486b9ea..894c8e57ba00 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -224,7 +224,7 @@ static const struct i40e_priv_flags i40e_gstrings_priv_flags[] = {
I40E_PRIV_FLAG("LinkPolling", I40E_FLAG_LINK_POLLING_ENABLED, 0),
I40E_PRIV_FLAG("flow-director-atr", I40E_FLAG_FD_ATR_ENABLED, 0),
I40E_PRIV_FLAG("veb-stats", I40E_FLAG_VEB_STATS_ENABLED, 0),
- I40E_PRIV_FLAG("hw-atr-eviction", I40E_FLAG_HW_ATR_EVICT_CAPABLE, 0),
+ I40E_PRIV_FLAG("hw-atr-eviction", I40E_FLAG_HW_ATR_EVICT_ENABLED, 0),
I40E_PRIV_FLAG("legacy-rx", I40E_FLAG_LEGACY_RX, 0),
};
@@ -4092,7 +4092,7 @@ flags_complete:
/* Only allow ATR evict on hardware that is capable of handling it */
if (pf->flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE)
- pf->flags &= ~I40E_FLAG_HW_ATR_EVICT_CAPABLE;
+ pf->flags &= ~I40E_FLAG_HW_ATR_EVICT_ENABLED;
if (changed_flags & I40E_FLAG_TRUE_PROMISC_SUPPORT) {
u16 sw_flags = 0, valid_flags = 0;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 150caf6ca2b4..a7a4b28b4144 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -8821,11 +8821,12 @@ static int i40e_sw_init(struct i40e_pf *pf)
(pf->hw.aq.api_min_ver > 4))) {
/* Supported in FW API version higher than 1.4 */
pf->flags |= I40E_FLAG_GENEVE_OFFLOAD_CAPABLE;
- pf->flags = I40E_FLAG_HW_ATR_EVICT_CAPABLE;
- } else {
- pf->flags = I40E_FLAG_HW_ATR_EVICT_CAPABLE;
}
+ /* Enable HW ATR eviction if possible */
+ if (pf->flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE)
+ pf->flags |= I40E_FLAG_HW_ATR_EVICT_ENABLED;
+
pf->eeprom_version = 0xDEAD;
pf->lan_veb = I40E_NO_VEB;
pf->lan_vsi = I40E_NO_VSI;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index cd894f4023b1..77115c25d96f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2341,7 +2341,7 @@ static void i40e_atr(struct i40e_ring *tx_ring, struct sk_buff *skb,
/* Due to lack of space, no more new filters can be programmed */
if (th->syn && (pf->flags & I40E_FLAG_FD_ATR_AUTO_DISABLED))
return;
- if (pf->flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE) {
+ if (pf->flags & I40E_FLAG_HW_ATR_EVICT_ENABLED) {
/* HW ATR eviction will take care of removing filters on FIN
* and RST packets.
*/
@@ -2403,7 +2403,7 @@ static void i40e_atr(struct i40e_ring *tx_ring, struct sk_buff *skb,
I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) &
I40E_TXD_FLTR_QW1_CNTINDEX_MASK;
- if (pf->flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE)
+ if (pf->flags & I40E_FLAG_HW_ATR_EVICT_ENABLED)
dtype_cmd |= I40E_TXD_FLTR_QW1_ATR_MASK;
fdir_desc->qindex_flex_ptype_vsi = cpu_to_le32(flex_ptype);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 95c23fbaa211..0fb38ca78900 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -3017,10 +3017,12 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id,
VLAN_VID_MASK));
}
+ spin_unlock_bh(&vsi->mac_filter_hash_lock);
if (vlan_id || qos)
ret = i40e_vsi_add_pvid(vsi, vlanprio);
else
i40e_vsi_remove_pvid(vsi);
+ spin_lock_bh(&vsi->mac_filter_hash_lock);
if (vlan_id) {
dev_info(&pf->pdev->dev, "Setting VLAN %d, QOS 0x%x on VF %d\n",
diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index 9b875d776b29..33c901622ed5 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -3719,7 +3719,7 @@ static void mvpp2_bm_bufs_get_addrs(struct device *dev, struct mvpp2 *priv,
dma_addr_t *dma_addr,
phys_addr_t *phys_addr)
{
- int cpu = smp_processor_id();
+ int cpu = get_cpu();
*dma_addr = mvpp2_percpu_read(priv, cpu,
MVPP2_BM_PHY_ALLOC_REG(bm_pool->id));
@@ -3740,6 +3740,8 @@ static void mvpp2_bm_bufs_get_addrs(struct device *dev, struct mvpp2 *priv,
if (sizeof(phys_addr_t) == 8)
*phys_addr |= (u64)phys_addr_highbits << 32;
}
+
+ put_cpu();
}
/* Free all buffers from the pool */
@@ -3920,18 +3922,12 @@ static inline u32 mvpp2_bm_cookie_pool_set(u32 cookie, int pool)
return bm;
}
-/* Get pool number from a BM cookie */
-static inline int mvpp2_bm_cookie_pool_get(unsigned long cookie)
-{
- return (cookie >> MVPP2_BM_COOKIE_POOL_OFFS) & 0xFF;
-}
-
/* Release buffer to BM */
static inline void mvpp2_bm_pool_put(struct mvpp2_port *port, int pool,
dma_addr_t buf_dma_addr,
phys_addr_t buf_phys_addr)
{
- int cpu = smp_processor_id();
+ int cpu = get_cpu();
if (port->priv->hw_version == MVPP22) {
u32 val = 0;
@@ -3958,15 +3954,15 @@ static inline void mvpp2_bm_pool_put(struct mvpp2_port *port, int pool,
MVPP2_BM_VIRT_RLS_REG, buf_phys_addr);
mvpp2_percpu_write(port->priv, cpu,
MVPP2_BM_PHY_RLS_REG(pool), buf_dma_addr);
+
+ put_cpu();
}
/* Refill BM pool */
-static void mvpp2_pool_refill(struct mvpp2_port *port, u32 bm,
+static void mvpp2_pool_refill(struct mvpp2_port *port, int pool,
dma_addr_t dma_addr,
phys_addr_t phys_addr)
{
- int pool = mvpp2_bm_cookie_pool_get(bm);
-
mvpp2_bm_pool_put(port, pool, dma_addr, phys_addr);
}
@@ -4186,8 +4182,6 @@ static void mvpp22_port_mii_set(struct mvpp2_port *port)
{
u32 val;
- return;
-
/* Only GOP port 0 has an XLG MAC */
if (port->gop_id == 0) {
val = readl(port->base + MVPP22_XLG_CTRL3_REG);
@@ -4515,21 +4509,6 @@ static void mvpp2_rxq_offset_set(struct mvpp2_port *port,
mvpp2_write(port->priv, MVPP2_RXQ_CONFIG_REG(prxq), val);
}
-/* Obtain BM cookie information from descriptor */
-static u32 mvpp2_bm_cookie_build(struct mvpp2_port *port,
- struct mvpp2_rx_desc *rx_desc)
-{
- int cpu = smp_processor_id();
- int pool;
-
- pool = (mvpp2_rxdesc_status_get(port, rx_desc) &
- MVPP2_RXD_BM_POOL_ID_MASK) >>
- MVPP2_RXD_BM_POOL_ID_OFFS;
-
- return ((pool & 0xFF) << MVPP2_BM_COOKIE_POOL_OFFS) |
- ((cpu & 0xFF) << MVPP2_BM_COOKIE_CPU_OFFS);
-}
-
/* Tx descriptors helper methods */
/* Get pointer to next Tx descriptor to be processed (send) by HW */
@@ -4757,7 +4736,7 @@ static void mvpp2_txp_max_tx_size_set(struct mvpp2_port *port)
static void mvpp2_rx_pkts_coal_set(struct mvpp2_port *port,
struct mvpp2_rx_queue *rxq)
{
- int cpu = smp_processor_id();
+ int cpu = get_cpu();
if (rxq->pkts_coal > MVPP2_OCCUPIED_THRESH_MASK)
rxq->pkts_coal = MVPP2_OCCUPIED_THRESH_MASK;
@@ -4765,6 +4744,8 @@ static void mvpp2_rx_pkts_coal_set(struct mvpp2_port *port,
mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_NUM_REG, rxq->id);
mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_THRESH_REG,
rxq->pkts_coal);
+
+ put_cpu();
}
static u32 mvpp2_usec_to_cycles(u32 usec, unsigned long clk_hz)
@@ -4945,7 +4926,7 @@ static int mvpp2_rxq_init(struct mvpp2_port *port,
mvpp2_write(port->priv, MVPP2_RXQ_STATUS_REG(rxq->id), 0);
/* Set Rx descriptors queue starting address - indirect access */
- cpu = smp_processor_id();
+ cpu = get_cpu();
mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_NUM_REG, rxq->id);
if (port->priv->hw_version == MVPP21)
rxq_dma = rxq->descs_dma;
@@ -4954,6 +4935,7 @@ static int mvpp2_rxq_init(struct mvpp2_port *port,
mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_DESC_ADDR_REG, rxq_dma);
mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_DESC_SIZE_REG, rxq->size);
mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_INDEX_REG, 0);
+ put_cpu();
/* Set Offset */
mvpp2_rxq_offset_set(port, rxq->id, NET_SKB_PAD);
@@ -4980,9 +4962,13 @@ static void mvpp2_rxq_drop_pkts(struct mvpp2_port *port,
for (i = 0; i < rx_received; i++) {
struct mvpp2_rx_desc *rx_desc = mvpp2_rxq_next_desc_get(rxq);
- u32 bm = mvpp2_bm_cookie_build(port, rx_desc);
+ u32 status = mvpp2_rxdesc_status_get(port, rx_desc);
+ int pool;
+
+ pool = (status & MVPP2_RXD_BM_POOL_ID_MASK) >>
+ MVPP2_RXD_BM_POOL_ID_OFFS;
- mvpp2_pool_refill(port, bm,
+ mvpp2_pool_refill(port, pool,
mvpp2_rxdesc_dma_addr_get(port, rx_desc),
mvpp2_rxdesc_cookie_get(port, rx_desc));
}
@@ -5012,10 +4998,11 @@ static void mvpp2_rxq_deinit(struct mvpp2_port *port,
* free descriptor number
*/
mvpp2_write(port->priv, MVPP2_RXQ_STATUS_REG(rxq->id), 0);
- cpu = smp_processor_id();
+ cpu = get_cpu();
mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_NUM_REG, rxq->id);
mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_DESC_ADDR_REG, 0);
mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_DESC_SIZE_REG, 0);
+ put_cpu();
}
/* Create and initialize a Tx queue */
@@ -5038,7 +5025,7 @@ static int mvpp2_txq_init(struct mvpp2_port *port,
txq->last_desc = txq->size - 1;
/* Set Tx descriptors queue starting address - indirect access */
- cpu = smp_processor_id();
+ cpu = get_cpu();
mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_NUM_REG, txq->id);
mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_DESC_ADDR_REG,
txq->descs_dma);
@@ -5063,6 +5050,7 @@ static int mvpp2_txq_init(struct mvpp2_port *port,
mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_PREF_BUF_REG,
MVPP2_PREF_BUF_PTR(desc) | MVPP2_PREF_BUF_SIZE_16 |
MVPP2_PREF_BUF_THRESH(desc_per_txq / 2));
+ put_cpu();
/* WRR / EJP configuration - indirect access */
tx_port_num = mvpp2_egress_port(port);
@@ -5133,10 +5121,11 @@ static void mvpp2_txq_deinit(struct mvpp2_port *port,
mvpp2_write(port->priv, MVPP2_TXQ_SCHED_TOKEN_CNTR_REG(txq->id), 0);
/* Set Tx descriptors queue starting address and size */
- cpu = smp_processor_id();
+ cpu = get_cpu();
mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_NUM_REG, txq->id);
mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_DESC_ADDR_REG, 0);
mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_DESC_SIZE_REG, 0);
+ put_cpu();
}
/* Cleanup Tx ports */
@@ -5146,7 +5135,7 @@ static void mvpp2_txq_clean(struct mvpp2_port *port, struct mvpp2_tx_queue *txq)
int delay, pending, cpu;
u32 val;
- cpu = smp_processor_id();
+ cpu = get_cpu();
mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_NUM_REG, txq->id);
val = mvpp2_percpu_read(port->priv, cpu, MVPP2_TXQ_PREF_BUF_REG);
val |= MVPP2_TXQ_DRAIN_EN_MASK;
@@ -5173,6 +5162,7 @@ static void mvpp2_txq_clean(struct mvpp2_port *port, struct mvpp2_tx_queue *txq)
val &= ~MVPP2_TXQ_DRAIN_EN_MASK;
mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_PREF_BUF_REG, val);
+ put_cpu();
for_each_present_cpu(cpu) {
txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
@@ -5420,7 +5410,7 @@ static void mvpp2_rx_csum(struct mvpp2_port *port, u32 status,
/* Reuse skb if possible, or allocate a new skb and add it to BM pool */
static int mvpp2_rx_refill(struct mvpp2_port *port,
- struct mvpp2_bm_pool *bm_pool, u32 bm)
+ struct mvpp2_bm_pool *bm_pool, int pool)
{
dma_addr_t dma_addr;
phys_addr_t phys_addr;
@@ -5432,7 +5422,7 @@ static int mvpp2_rx_refill(struct mvpp2_port *port,
if (!buf)
return -ENOMEM;
- mvpp2_pool_refill(port, bm, dma_addr, phys_addr);
+ mvpp2_pool_refill(port, pool, dma_addr, phys_addr);
return 0;
}
@@ -5490,7 +5480,7 @@ static int mvpp2_rx(struct mvpp2_port *port, int rx_todo,
unsigned int frag_size;
dma_addr_t dma_addr;
phys_addr_t phys_addr;
- u32 bm, rx_status;
+ u32 rx_status;
int pool, rx_bytes, err;
void *data;
@@ -5502,8 +5492,8 @@ static int mvpp2_rx(struct mvpp2_port *port, int rx_todo,
phys_addr = mvpp2_rxdesc_cookie_get(port, rx_desc);
data = (void *)phys_to_virt(phys_addr);
- bm = mvpp2_bm_cookie_build(port, rx_desc);
- pool = mvpp2_bm_cookie_pool_get(bm);
+ pool = (rx_status & MVPP2_RXD_BM_POOL_ID_MASK) >>
+ MVPP2_RXD_BM_POOL_ID_OFFS;
bm_pool = &port->priv->bm_pools[pool];
/* In case of an error, release the requested buffer pointer
@@ -5516,7 +5506,7 @@ err_drop_frame:
dev->stats.rx_errors++;
mvpp2_rx_error(port, rx_desc);
/* Return the buffer to the pool */
- mvpp2_pool_refill(port, bm, dma_addr, phys_addr);
+ mvpp2_pool_refill(port, pool, dma_addr, phys_addr);
continue;
}
@@ -5531,7 +5521,7 @@ err_drop_frame:
goto err_drop_frame;
}
- err = mvpp2_rx_refill(port, bm_pool, bm);
+ err = mvpp2_rx_refill(port, bm_pool, pool);
if (err) {
netdev_err(port->dev, "failed to refill BM pools\n");
goto err_drop_frame;
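The recurring smp_processor_id() -> get_cpu() change in this file disables preemption for the duration of the per-CPU indirect register accesses. The general pattern, sketched with a placeholder register name (mvpp2_percpu_write() is the driver's own helper):

        int cpu = get_cpu();    /* returns this CPU and disables preemption */

        mvpp2_percpu_write(priv, cpu, SOME_INDIRECT_REG, val);
        /* ... further accesses that must stay on the same CPU ... */

        put_cpu();              /* re-enables preemption */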
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 2fd044b23875..944fc1742464 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -458,13 +458,15 @@ struct mlx5e_mpw_info {
struct mlx5e_rx_am_stats {
int ppms; /* packets per msec */
+ int bpms; /* bytes per msec */
int epms; /* events per msec */
};
struct mlx5e_rx_am_sample {
- ktime_t time;
- unsigned int pkt_ctr;
- u16 event_ctr;
+ ktime_t time;
+ u32 pkt_ctr;
+ u32 byte_ctr;
+ u16 event_ctr;
};
struct mlx5e_rx_am { /* Adaptive Moderation */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 8209affa75c3..16486dff1493 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1242,11 +1242,11 @@ static int mlx5e_get_ts_info(struct net_device *dev,
SOF_TIMESTAMPING_RX_HARDWARE |
SOF_TIMESTAMPING_RAW_HARDWARE;
- info->tx_types = (BIT(1) << HWTSTAMP_TX_OFF) |
- (BIT(1) << HWTSTAMP_TX_ON);
+ info->tx_types = BIT(HWTSTAMP_TX_OFF) |
+ BIT(HWTSTAMP_TX_ON);
- info->rx_filters = (BIT(1) << HWTSTAMP_FILTER_NONE) |
- (BIT(1) << HWTSTAMP_FILTER_ALL);
+ info->rx_filters = BIT(HWTSTAMP_FILTER_NONE) |
+ BIT(HWTSTAMP_FILTER_ALL);
return 0;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 41cd22a223dc..277f4de30375 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4241,7 +4241,8 @@ struct net_device *mlx5e_create_netdev(struct mlx5_core_dev *mdev,
return netdev;
err_cleanup_nic:
- profile->cleanup(priv);
+ if (profile->cleanup)
+ profile->cleanup(priv);
free_netdev(netdev);
return NULL;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 79462c0368a0..46984a52a94b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -791,6 +791,8 @@ static void mlx5e_build_rep_params(struct mlx5_core_dev *mdev,
params->tx_max_inline = mlx5e_get_max_inline_cap(mdev);
params->num_tc = 1;
params->lro_wqe_sz = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ;
+
+ mlx5_query_min_inline(mdev, &params->tx_min_inline_mode);
}
static void mlx5e_build_rep_netdev(struct net_device *netdev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c
index 02dd3a95ed8f..acf32fe952cd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c
@@ -183,28 +183,27 @@ static void mlx5e_am_exit_parking(struct mlx5e_rx_am *am)
mlx5e_am_step(am);
}
+#define IS_SIGNIFICANT_DIFF(val, ref) \
+ (((100 * abs((val) - (ref))) / (ref)) > 10) /* more than 10% difference */
+
static int mlx5e_am_stats_compare(struct mlx5e_rx_am_stats *curr,
struct mlx5e_rx_am_stats *prev)
{
- int diff;
-
- if (!prev->ppms)
- return curr->ppms ? MLX5E_AM_STATS_BETTER :
+ if (!prev->bpms)
+ return curr->bpms ? MLX5E_AM_STATS_BETTER :
MLX5E_AM_STATS_SAME;
- diff = curr->ppms - prev->ppms;
- if (((100 * abs(diff)) / prev->ppms) > 10) /* more than 10% diff */
- return (diff > 0) ? MLX5E_AM_STATS_BETTER :
- MLX5E_AM_STATS_WORSE;
+ if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms))
+ return (curr->bpms > prev->bpms) ? MLX5E_AM_STATS_BETTER :
+ MLX5E_AM_STATS_WORSE;
- if (!prev->epms)
- return curr->epms ? MLX5E_AM_STATS_WORSE :
- MLX5E_AM_STATS_SAME;
+ if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms))
+ return (curr->ppms > prev->ppms) ? MLX5E_AM_STATS_BETTER :
+ MLX5E_AM_STATS_WORSE;
- diff = curr->epms - prev->epms;
- if (((100 * abs(diff)) / prev->epms) > 10) /* more than 10% diff */
- return (diff < 0) ? MLX5E_AM_STATS_BETTER :
- MLX5E_AM_STATS_WORSE;
+ if (IS_SIGNIFICANT_DIFF(curr->epms, prev->epms))
+ return (curr->epms < prev->epms) ? MLX5E_AM_STATS_BETTER :
+ MLX5E_AM_STATS_WORSE;
return MLX5E_AM_STATS_SAME;
}
@@ -266,10 +265,13 @@ static void mlx5e_am_sample(struct mlx5e_rq *rq,
{
s->time = ktime_get();
s->pkt_ctr = rq->stats.packets;
+ s->byte_ctr = rq->stats.bytes;
s->event_ctr = rq->cq.event_ctr;
}
#define MLX5E_AM_NEVENTS 64
+#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
+#define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) & (BIT_ULL(bits) - 1))
static void mlx5e_am_calc_stats(struct mlx5e_rx_am_sample *start,
struct mlx5e_rx_am_sample *end,
@@ -277,13 +279,17 @@ static void mlx5e_am_calc_stats(struct mlx5e_rx_am_sample *start,
{
/* u32 holds up to 71 minutes, should be enough */
u32 delta_us = ktime_us_delta(end->time, start->time);
- unsigned int npkts = end->pkt_ctr - start->pkt_ctr;
+ u32 npkts = BIT_GAP(BITS_PER_TYPE(u32), end->pkt_ctr, start->pkt_ctr);
+ u32 nbytes = BIT_GAP(BITS_PER_TYPE(u32), end->byte_ctr,
+ start->byte_ctr);
if (!delta_us)
return;
- curr_stats->ppms = (npkts * USEC_PER_MSEC) / delta_us;
- curr_stats->epms = (MLX5E_AM_NEVENTS * USEC_PER_MSEC) / delta_us;
+ curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us);
+ curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us);
+ curr_stats->epms = DIV_ROUND_UP(MLX5E_AM_NEVENTS * USEC_PER_MSEC,
+ delta_us);
}
void mlx5e_rx_am_work(struct work_struct *work)
@@ -308,7 +314,8 @@ void mlx5e_rx_am(struct mlx5e_rq *rq)
switch (am->state) {
case MLX5E_AM_MEASURE_IN_PROGRESS:
- nevents = rq->cq.event_ctr - am->start_sample.event_ctr;
+ nevents = BIT_GAP(BITS_PER_TYPE(u16), rq->cq.event_ctr,
+ am->start_sample.event_ctr);
if (nevents < MLX5E_AM_NEVENTS)
break;
mlx5e_am_sample(rq, &end_sample);
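BIT_GAP() above takes the difference of two free-running counters modulo 2^bits, so the result stays correct across a counter wrap. A hypothetical worked example:

        /* end = 5, start = 65530, bits = 16:
         * ((5 - 65530) + 65536) & 0xffff = 11,
         * i.e. the counter advanced by 11 even though it wrapped.
         */
        u32 nevents = BIT_GAP(BITS_PER_TYPE(u16), end, start);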
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index 53e4992d6511..f81c3aa60b46 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -417,20 +417,13 @@ struct mlx5e_stats {
};
static const struct counter_desc mlx5e_pme_status_desc[] = {
- { "module_plug", 0 },
{ "module_unplug", 8 },
};
static const struct counter_desc mlx5e_pme_error_desc[] = {
- { "module_pwr_budget_exd", 0 }, /* power budget exceed */
- { "module_long_range", 8 }, /* long range for non MLNX cable */
- { "module_bus_stuck", 16 }, /* bus stuck (I2C or data shorted) */
- { "module_no_eeprom", 24 }, /* no eeprom/retry time out */
- { "module_enforce_part", 32 }, /* enforce part number list */
- { "module_unknown_id", 40 }, /* unknown identifier */
- { "module_high_temp", 48 }, /* high temperature */
+ { "module_bus_stuck", 16 }, /* bus stuck (I2C or data shorted) */
+ { "module_high_temp", 48 }, /* high temperature */
{ "module_bad_shorted", 56 }, /* bad or shorted cable/module */
- { "module_unknown_status", 64 },
};
#endif /* __MLX5_EN_STATS_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index ec63158ab643..9df9fc0d26f5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -895,7 +895,6 @@ static struct mlx5_fields fields[] = {
{MLX5_ACTION_IN_FIELD_OUT_SMAC_15_0, 2, offsetof(struct pedit_headers, eth.h_source[4])},
{MLX5_ACTION_IN_FIELD_OUT_ETHERTYPE, 2, offsetof(struct pedit_headers, eth.h_proto)},
- {MLX5_ACTION_IN_FIELD_OUT_IP_DSCP, 1, offsetof(struct pedit_headers, ip4.tos)},
{MLX5_ACTION_IN_FIELD_OUT_IP_TTL, 1, offsetof(struct pedit_headers, ip4.ttl)},
{MLX5_ACTION_IN_FIELD_OUT_SIPV4, 4, offsetof(struct pedit_headers, ip4.saddr)},
{MLX5_ACTION_IN_FIELD_OUT_DIPV4, 4, offsetof(struct pedit_headers, ip4.daddr)},
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index f991f669047e..a53e982a6863 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -906,21 +906,34 @@ static int esw_inline_mode_to_devlink(u8 mlx5_mode, u8 *mode)
return 0;
}
-int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode)
+static int mlx5_devlink_eswitch_check(struct devlink *devlink)
{
- struct mlx5_core_dev *dev;
- u16 cur_mlx5_mode, mlx5_mode = 0;
+ struct mlx5_core_dev *dev = devlink_priv(devlink);
- dev = devlink_priv(devlink);
+ if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH)
+ return -EOPNOTSUPP;
if (!MLX5_CAP_GEN(dev, vport_group_manager))
return -EOPNOTSUPP;
- cur_mlx5_mode = dev->priv.eswitch->mode;
-
- if (cur_mlx5_mode == SRIOV_NONE)
+ if (dev->priv.eswitch->mode == SRIOV_NONE)
return -EOPNOTSUPP;
+ return 0;
+}
+
+int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode)
+{
+ struct mlx5_core_dev *dev = devlink_priv(devlink);
+ u16 cur_mlx5_mode, mlx5_mode = 0;
+ int err;
+
+ err = mlx5_devlink_eswitch_check(devlink);
+ if (err)
+ return err;
+
+ cur_mlx5_mode = dev->priv.eswitch->mode;
+
if (esw_mode_from_devlink(mode, &mlx5_mode))
return -EINVAL;
@@ -937,15 +950,12 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode)
int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
{
- struct mlx5_core_dev *dev;
-
- dev = devlink_priv(devlink);
-
- if (!MLX5_CAP_GEN(dev, vport_group_manager))
- return -EOPNOTSUPP;
+ struct mlx5_core_dev *dev = devlink_priv(devlink);
+ int err;
- if (dev->priv.eswitch->mode == SRIOV_NONE)
- return -EOPNOTSUPP;
+ err = mlx5_devlink_eswitch_check(devlink);
+ if (err)
+ return err;
return esw_mode_to_devlink(dev->priv.eswitch->mode, mode);
}
@@ -954,15 +964,12 @@ int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode)
{
struct mlx5_core_dev *dev = devlink_priv(devlink);
struct mlx5_eswitch *esw = dev->priv.eswitch;
- int num_vports = esw->enabled_vports;
int err, vport;
u8 mlx5_mode;
- if (!MLX5_CAP_GEN(dev, vport_group_manager))
- return -EOPNOTSUPP;
-
- if (esw->mode == SRIOV_NONE)
- return -EOPNOTSUPP;
+ err = mlx5_devlink_eswitch_check(devlink);
+ if (err)
+ return err;
switch (MLX5_CAP_ETH(dev, wqe_inline_mode)) {
case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
@@ -985,7 +992,7 @@ int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode)
if (err)
goto out;
- for (vport = 1; vport < num_vports; vport++) {
+ for (vport = 1; vport < esw->enabled_vports; vport++) {
err = mlx5_modify_nic_vport_min_inline(dev, vport, mlx5_mode);
if (err) {
esw_warn(dev, "Failed to set min inline on vport %d\n",
@@ -1010,12 +1017,11 @@ int mlx5_devlink_eswitch_inline_mode_get(struct devlink *devlink, u8 *mode)
{
struct mlx5_core_dev *dev = devlink_priv(devlink);
struct mlx5_eswitch *esw = dev->priv.eswitch;
+ int err;
- if (!MLX5_CAP_GEN(dev, vport_group_manager))
- return -EOPNOTSUPP;
-
- if (esw->mode == SRIOV_NONE)
- return -EOPNOTSUPP;
+ err = mlx5_devlink_eswitch_check(devlink);
+ if (err)
+ return err;
return esw_inline_mode_to_devlink(esw->offloads.inline_mode, mode);
}
@@ -1062,11 +1068,9 @@ int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap)
struct mlx5_eswitch *esw = dev->priv.eswitch;
int err;
- if (!MLX5_CAP_GEN(dev, vport_group_manager))
- return -EOPNOTSUPP;
-
- if (esw->mode == SRIOV_NONE)
- return -EOPNOTSUPP;
+ err = mlx5_devlink_eswitch_check(devlink);
+ if (err)
+ return err;
if (encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE &&
(!MLX5_CAP_ESW_FLOWTABLE_FDB(dev, encap) ||
@@ -1105,12 +1109,11 @@ int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, u8 *encap)
{
struct mlx5_core_dev *dev = devlink_priv(devlink);
struct mlx5_eswitch *esw = dev->priv.eswitch;
+ int err;
- if (!MLX5_CAP_GEN(dev, vport_group_manager))
- return -EOPNOTSUPP;
-
- if (esw->mode == SRIOV_NONE)
- return -EOPNOTSUPP;
+ err = mlx5_devlink_eswitch_check(devlink);
+ if (err)
+ return err;
*encap = esw->offloads.encap;
return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 0e487e8ca634..8f5125ccd8d4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -862,7 +862,7 @@ struct mlx5_flow_table *mlx5_create_vport_flow_table(struct mlx5_flow_namespace
ft_attr.level = level;
ft_attr.prio = prio;
- return __mlx5_create_flow_table(ns, &ft_attr, FS_FT_OP_MOD_NORMAL, 0);
+ return __mlx5_create_flow_table(ns, &ft_attr, FS_FT_OP_MOD_NORMAL, vport);
}
struct mlx5_flow_table*
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 44f59b1d6f0f..f27f84ffbc85 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -275,10 +275,8 @@ static void poll_health(unsigned long data)
struct mlx5_core_health *health = &dev->priv.health;
u32 count;
- if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
- mod_timer(&health->timer, get_next_poll_jiffies());
- return;
- }
+ if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+ goto out;
count = ioread32be(health->health_counter);
if (count == health->prev)
@@ -290,8 +288,6 @@ static void poll_health(unsigned long data)
if (health->miss_counter == MAX_MISSES) {
dev_err(&dev->pdev->dev, "device's health compromised - reached miss count\n");
print_health_info(dev);
- } else {
- mod_timer(&health->timer, get_next_poll_jiffies());
}
if (in_fatal(dev) && !health->sick) {
@@ -305,6 +301,9 @@ static void poll_health(unsigned long data)
"new health works are not permitted at this stage\n");
spin_unlock(&health->wq_lock);
}
+
+out:
+ mod_timer(&health->timer, get_next_poll_jiffies());
}
void mlx5_start_health_poll(struct mlx5_core_dev *dev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index af945edfee19..13be264587f1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -175,8 +175,9 @@ static struct mlx5_profile profile[] = {
},
};
-#define FW_INIT_TIMEOUT_MILI 2000
-#define FW_INIT_WAIT_MS 2
+#define FW_INIT_TIMEOUT_MILI 2000
+#define FW_INIT_WAIT_MS 2
+#define FW_PRE_INIT_TIMEOUT_MILI 10000
static int wait_fw_init(struct mlx5_core_dev *dev, u32 max_wait_mili)
{
@@ -537,8 +538,10 @@ static int handle_hca_cap(struct mlx5_core_dev *dev)
/* disable cmdif checksum */
MLX5_SET(cmd_hca_cap, set_hca_cap, cmdif_checksum, 0);
- /* If the HCA supports 4K UARs use it */
- if (MLX5_CAP_GEN_MAX(dev, uar_4k))
+ /* Enable 4K UAR only when HCA supports it and page size is bigger
+ * than 4K.
+ */
+ if (MLX5_CAP_GEN_MAX(dev, uar_4k) && PAGE_SIZE > 4096)
MLX5_SET(cmd_hca_cap, set_hca_cap, uar_4k, 1);
MLX5_SET(cmd_hca_cap, set_hca_cap, log_uar_page_sz, PAGE_SHIFT - 12);
@@ -1011,6 +1014,15 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
*/
dev->state = MLX5_DEVICE_STATE_UP;
+ /* wait for firmware to accept initialization segment configurations
+ */
+ err = wait_fw_init(dev, FW_PRE_INIT_TIMEOUT_MILI);
+ if (err) {
+ dev_err(&dev->pdev->dev, "Firmware over %d MS in pre-initializing state, aborting\n",
+ FW_PRE_INIT_TIMEOUT_MILI);
+ goto out;
+ }
+
err = mlx5_cmd_init(dev);
if (err) {
dev_err(&pdev->dev, "Failed initializing command interface, aborting\n");
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 9f89c4137d21..0744452a0b18 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -3334,6 +3334,9 @@ static int mlxsw_sp_inetaddr_vlan_event(struct net_device *vlan_dev,
struct mlxsw_sp *mlxsw_sp = mlxsw_sp_lower_get(vlan_dev);
u16 vid = vlan_dev_vlan_id(vlan_dev);
+ if (netif_is_bridge_port(vlan_dev))
+ return 0;
+
if (mlxsw_sp_port_dev_check(real_dev))
return mlxsw_sp_inetaddr_vport_event(vlan_dev, real_dev, event,
vid);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c
index 483241b4b05d..a672f6a860dc 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_debug.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c
@@ -2956,7 +2956,7 @@ static u32 qed_grc_dump_ctx_data(struct qed_hwfn *p_hwfn,
qed_wr(p_hwfn,
p_ptt,
s_storm_defs[storm_id].cm_ctx_wr_addr,
- BIT(9) | lid);
+ (i << 9) | lid);
*(dump_buf + offset) = qed_rd(p_hwfn,
p_ptt,
rd_reg_addr);
diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c
index 2ae852454780..a9ce82d3e9cf 100644
--- a/drivers/net/ethernet/rocker/rocker_ofdpa.c
+++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c
@@ -1505,8 +1505,8 @@ static int ofdpa_port_ipv4_nh(struct ofdpa_port *ofdpa_port,
*index = entry->index;
resolved = false;
} else if (removing) {
- ofdpa_neigh_del(trans, found);
*index = found->index;
+ ofdpa_neigh_del(trans, found);
} else if (updating) {
ofdpa_neigh_update(found, trans, NULL, false);
resolved = !is_zero_ether_addr(found->eth_dst);
diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index 78efb2822b86..78f9e43420e0 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -4172,7 +4172,7 @@ found:
* recipients
*/
if (is_mc_recip) {
- MCDI_DECLARE_BUF(inbuf, MC_CMD_FILTER_OP_IN_LEN);
+ MCDI_DECLARE_BUF(inbuf, MC_CMD_FILTER_OP_EXT_IN_LEN);
unsigned int depth, i;
memset(inbuf, 0, sizeof(inbuf));
@@ -4320,7 +4320,7 @@ static int efx_ef10_filter_remove_internal(struct efx_nic *efx,
efx_ef10_filter_set_entry(table, filter_idx, NULL, 0);
} else {
efx_mcdi_display_error(efx, MC_CMD_FILTER_OP,
- MC_CMD_FILTER_OP_IN_LEN,
+ MC_CMD_FILTER_OP_EXT_IN_LEN,
NULL, 0, rc);
}
}
@@ -4453,7 +4453,7 @@ static s32 efx_ef10_filter_rfs_insert(struct efx_nic *efx,
struct efx_filter_spec *spec)
{
struct efx_ef10_filter_table *table = efx->filter_state;
- MCDI_DECLARE_BUF(inbuf, MC_CMD_FILTER_OP_IN_LEN);
+ MCDI_DECLARE_BUF(inbuf, MC_CMD_FILTER_OP_EXT_IN_LEN);
struct efx_filter_spec *saved_spec;
unsigned int hash, i, depth = 1;
bool replacing = false;
@@ -4940,7 +4940,7 @@ not_restored:
static void efx_ef10_filter_table_remove(struct efx_nic *efx)
{
struct efx_ef10_filter_table *table = efx->filter_state;
- MCDI_DECLARE_BUF(inbuf, MC_CMD_FILTER_OP_IN_LEN);
+ MCDI_DECLARE_BUF(inbuf, MC_CMD_FILTER_OP_EXT_IN_LEN);
struct efx_filter_spec *spec;
unsigned int filter_idx;
int rc;
@@ -5105,6 +5105,7 @@ static int efx_ef10_filter_insert_addr_list(struct efx_nic *efx,
/* Insert/renew filters */
for (i = 0; i < addr_count; i++) {
+ EFX_WARN_ON_PARANOID(ids[i] != EFX_EF10_FILTER_ID_INVALID);
efx_filter_init_rx(&spec, EFX_FILTER_PRI_AUTO, filter_flags, 0);
efx_filter_set_eth_local(&spec, vlan->vid, addr_list[i].addr);
rc = efx_ef10_filter_insert(efx, &spec, true);
@@ -5122,11 +5123,11 @@ static int efx_ef10_filter_insert_addr_list(struct efx_nic *efx,
}
return rc;
} else {
- /* mark as not inserted, and carry on */
- rc = EFX_EF10_FILTER_ID_INVALID;
+ /* keep invalid ID, and carry on */
}
+ } else {
+ ids[i] = efx_ef10_filter_get_unsafe_id(rc);
}
- ids[i] = efx_ef10_filter_get_unsafe_id(rc);
}
if (multicast && rollback) {
diff --git a/drivers/net/ethernet/sfc/ef10_sriov.c b/drivers/net/ethernet/sfc/ef10_sriov.c
index b7e4345c990d..019cef1d3cf7 100644
--- a/drivers/net/ethernet/sfc/ef10_sriov.c
+++ b/drivers/net/ethernet/sfc/ef10_sriov.c
@@ -661,8 +661,6 @@ restore_filters:
up_write(&vf->efx->filter_sem);
mutex_unlock(&vf->efx->mac_lock);
- up_write(&vf->efx->filter_sem);
-
rc2 = efx_net_open(vf->efx->net_dev);
if (rc2)
goto reset_nic;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
index aa6476439aee..e0ef02f9503b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
@@ -214,13 +214,13 @@ static int dwmac4_wrback_get_tx_timestamp_status(struct dma_desc *p)
{
/* Context type from W/B descriptor must be zero */
if (le32_to_cpu(p->des3) & TDES3_CONTEXT_TYPE)
- return -EINVAL;
+ return 0;
/* Tx Timestamp Status is 1 so des0 and des1'll have valid values */
if (le32_to_cpu(p->des3) & TDES3_TIMESTAMP_STATUS)
- return 0;
+ return 1;
- return 1;
+ return 0;
}
static inline u64 dwmac4_get_timestamp(void *desc, u32 ats)
@@ -282,7 +282,10 @@ static int dwmac4_wrback_get_rx_timestamp_status(void *desc, u32 ats)
}
}
exit:
- return ret;
+ if (likely(ret == 0))
+ return 1;
+
+ return 0;
}
static void dwmac4_rd_init_rx_desc(struct dma_desc *p, int disable_rx_ic,
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 12236daf7bb6..6e4cbc6ce0ef 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -434,14 +434,14 @@ static void stmmac_get_tx_hwtstamp(struct stmmac_priv *priv,
return;
/* check tx tstamp status */
- if (!priv->hw->desc->get_tx_timestamp_status(p)) {
+ if (priv->hw->desc->get_tx_timestamp_status(p)) {
/* get the valid tstamp */
ns = priv->hw->desc->get_timestamp(p, priv->adv_ts);
memset(&shhwtstamp, 0, sizeof(struct skb_shared_hwtstamps));
shhwtstamp.hwtstamp = ns_to_ktime(ns);
- netdev_info(priv->dev, "get valid TX hw timestamp %llu\n", ns);
+ netdev_dbg(priv->dev, "get valid TX hw timestamp %llu\n", ns);
/* pass tstamp to stack */
skb_tstamp_tx(skb, &shhwtstamp);
}
@@ -468,19 +468,19 @@ static void stmmac_get_rx_hwtstamp(struct stmmac_priv *priv, struct dma_desc *p,
return;
/* Check if timestamp is available */
- if (!priv->hw->desc->get_rx_timestamp_status(p, priv->adv_ts)) {
+ if (priv->hw->desc->get_rx_timestamp_status(p, priv->adv_ts)) {
/* For GMAC4, the valid timestamp is from CTX next desc. */
if (priv->plat->has_gmac4)
ns = priv->hw->desc->get_timestamp(np, priv->adv_ts);
else
ns = priv->hw->desc->get_timestamp(p, priv->adv_ts);
- netdev_info(priv->dev, "get valid RX hw timestamp %llu\n", ns);
+ netdev_dbg(priv->dev, "get valid RX hw timestamp %llu\n", ns);
shhwtstamp = skb_hwtstamps(skb);
memset(shhwtstamp, 0, sizeof(struct skb_shared_hwtstamps));
shhwtstamp->hwtstamp = ns_to_ktime(ns);
} else {
- netdev_err(priv->dev, "cannot get RX hw timestamp\n");
+ netdev_dbg(priv->dev, "cannot get RX hw timestamp\n");
}
}
@@ -546,7 +546,10 @@ static int stmmac_hwtstamp_ioctl(struct net_device *dev, struct ifreq *ifr)
/* PTP v1, UDP, any kind of event packet */
config.rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_EVENT;
/* take time stamp for all event messages */
- snap_type_sel = PTP_TCR_SNAPTYPSEL_1;
+ if (priv->plat->has_gmac4)
+ snap_type_sel = PTP_GMAC4_TCR_SNAPTYPSEL_1;
+ else
+ snap_type_sel = PTP_TCR_SNAPTYPSEL_1;
ptp_over_ipv4_udp = PTP_TCR_TSIPV4ENA;
ptp_over_ipv6_udp = PTP_TCR_TSIPV6ENA;
@@ -578,7 +581,10 @@ static int stmmac_hwtstamp_ioctl(struct net_device *dev, struct ifreq *ifr)
config.rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT;
ptp_v2 = PTP_TCR_TSVER2ENA;
/* take time stamp for all event messages */
- snap_type_sel = PTP_TCR_SNAPTYPSEL_1;
+ if (priv->plat->has_gmac4)
+ snap_type_sel = PTP_GMAC4_TCR_SNAPTYPSEL_1;
+ else
+ snap_type_sel = PTP_TCR_SNAPTYPSEL_1;
ptp_over_ipv4_udp = PTP_TCR_TSIPV4ENA;
ptp_over_ipv6_udp = PTP_TCR_TSIPV6ENA;
@@ -612,7 +618,10 @@ static int stmmac_hwtstamp_ioctl(struct net_device *dev, struct ifreq *ifr)
config.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
ptp_v2 = PTP_TCR_TSVER2ENA;
/* take time stamp for all event messages */
- snap_type_sel = PTP_TCR_SNAPTYPSEL_1;
+ if (priv->plat->has_gmac4)
+ snap_type_sel = PTP_GMAC4_TCR_SNAPTYPSEL_1;
+ else
+ snap_type_sel = PTP_TCR_SNAPTYPSEL_1;
ptp_over_ipv4_udp = PTP_TCR_TSIPV4ENA;
ptp_over_ipv6_udp = PTP_TCR_TSIPV6ENA;
@@ -2822,7 +2831,6 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
tx_q->tx_skbuff_dma[first_entry].buf = des;
tx_q->tx_skbuff_dma[first_entry].len = skb_headlen(skb);
- tx_q->tx_skbuff[first_entry] = skb;
first->des0 = cpu_to_le32(des);
@@ -2856,6 +2864,14 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
tx_q->tx_skbuff_dma[tx_q->cur_tx].last_segment = true;
+ /* Only the last descriptor gets to point to the skb. */
+ tx_q->tx_skbuff[tx_q->cur_tx] = skb;
+
+ /* We've used all descriptors we need for this skb, however,
+ * advance cur_tx so that it references a fresh descriptor.
+ * ndo_start_xmit will fill this descriptor the next time it's
+ * called and stmmac_tx_clean may clean up to this descriptor.
+ */
tx_q->cur_tx = STMMAC_GET_ENTRY(tx_q->cur_tx, DMA_TX_SIZE);
if (unlikely(stmmac_tx_avail(priv, queue) <= (MAX_SKB_FRAGS + 1))) {
@@ -2989,8 +3005,6 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
first = desc;
- tx_q->tx_skbuff[first_entry] = skb;
-
enh_desc = priv->plat->enh_desc;
/* To program the descriptors according to the size of the frame */
if (enh_desc)
@@ -3038,8 +3052,15 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
skb->len);
}
- entry = STMMAC_GET_ENTRY(entry, DMA_TX_SIZE);
+ /* Only the last descriptor gets to point to the skb. */
+ tx_q->tx_skbuff[entry] = skb;
+ /* We've used all descriptors we need for this skb, however,
+ * advance cur_tx so that it references a fresh descriptor.
+ * ndo_start_xmit will fill this descriptor the next time it's
+ * called and stmmac_tx_clean may clean up to this descriptor.
+ */
+ entry = STMMAC_GET_ENTRY(entry, DMA_TX_SIZE);
tx_q->cur_tx = entry;
if (netif_msg_pktdata(priv)) {
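The stmmac_tso_xmit()/stmmac_xmit() hunks above move the skb pointer from the first to the last descriptor of a frame and only then advance cur_tx. A minimal sketch of why the cleanup side depends on that convention (hypothetical ring-cleanup loop, not taken from the patch; struct tx_ring, desc_done() and unmap_desc() are placeholders):

	static void tx_clean_sketch(struct tx_ring *r)
	{
		/* Walk completed descriptors; free the skb only at the entry
		 * that owns it, i.e. after every descriptor carrying part of
		 * the frame has been handed back by the hardware.
		 */
		while (r->dirty != r->cur && desc_done(&r->desc[r->dirty])) {
			struct sk_buff *skb = r->skbuff[r->dirty];

			unmap_desc(r, r->dirty);	/* per-descriptor unmap */
			if (skb) {			/* set on the last desc only */
				dev_consume_skb_any(skb);
				r->skbuff[r->dirty] = NULL;
			}
			r->dirty = (r->dirty + 1) & (DMA_TX_SIZE - 1);
		}
	}

Had the first descriptor owned the skb, it could be freed while later descriptors of the same frame were still in flight.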
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h
index 48fb72fc423c..f4b31d69f60e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h
@@ -59,7 +59,8 @@
/* Enable Snapshot for Messages Relevant to Master */
#define PTP_TCR_TSMSTRENA BIT(15)
/* Select PTP packets for Taking Snapshots */
-#define PTP_TCR_SNAPTYPSEL_1 GENMASK(17, 16)
+#define PTP_TCR_SNAPTYPSEL_1 BIT(16)
+#define PTP_GMAC4_TCR_SNAPTYPSEL_1 GENMASK(17, 16)
/* Enable MAC address for PTP Frame Filtering */
#define PTP_TCR_TSENMACADDR BIT(18)
diff --git a/drivers/net/ethernet/ti/cpsw-common.c b/drivers/net/ethernet/ti/cpsw-common.c
index 1562ab4151e1..56ba411421f0 100644
--- a/drivers/net/ethernet/ti/cpsw-common.c
+++ b/drivers/net/ethernet/ti/cpsw-common.c
@@ -90,7 +90,7 @@ int ti_cm_get_macid(struct device *dev, int slave, u8 *mac_addr)
if (of_device_is_compatible(dev->of_node, "ti,dm816-emac"))
return cpsw_am33xx_cm_get_macid(dev, 0x30, slave, mac_addr);
- if (of_machine_is_compatible("ti,am4372"))
+ if (of_machine_is_compatible("ti,am43"))
return cpsw_am33xx_cm_get_macid(dev, 0x630, slave, mac_addr);
if (of_machine_is_compatible("ti,dra7"))
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 6ebb0f559a42..199459bd6961 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1007,7 +1007,7 @@ static void geneve_setup(struct net_device *dev)
dev->netdev_ops = &geneve_netdev_ops;
dev->ethtool_ops = &geneve_ethtool_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
SET_NETDEV_DEVTYPE(dev, &geneve_type);
diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index 7b652bb7ebe4..ca110cd2a4e4 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -611,7 +611,7 @@ static const struct net_device_ops gtp_netdev_ops = {
static void gtp_link_setup(struct net_device *dev)
{
dev->netdev_ops = &gtp_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->hard_header_len = 0;
dev->addr_len = 0;
diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c
index 922bf440e9f1..021a8ec411ab 100644
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -311,7 +311,7 @@ static void sp_setup(struct net_device *dev)
{
/* Finish setting up the DEVICE info. */
dev->netdev_ops = &sp_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->mtu = SIXP_MTU;
dev->hard_header_len = AX25_MAX_HEADER_LEN;
dev->header_ops = &ax25_header_ops;
diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c
index f62e7f325cf9..78a6414c5fd9 100644
--- a/drivers/net/hamradio/bpqether.c
+++ b/drivers/net/hamradio/bpqether.c
@@ -476,7 +476,7 @@ static const struct net_device_ops bpq_netdev_ops = {
static void bpq_setup(struct net_device *dev)
{
dev->netdev_ops = &bpq_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
memcpy(dev->broadcast, &ax25_bcast, AX25_ADDR_LEN);
memcpy(dev->dev_addr, &ax25_defaddr, AX25_ADDR_LEN);
diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 262b2ea576a3..6066f1bcaf2d 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -171,6 +171,8 @@ struct rndis_device {
spinlock_t request_lock;
struct list_head req_list;
+ struct work_struct mcast_work;
+
u8 hw_mac_adr[ETH_ALEN];
u8 rss_key[NETVSC_HASH_KEYLEN];
u16 ind_table[ITAB_NUM];
@@ -201,6 +203,7 @@ int rndis_filter_open(struct netvsc_device *nvdev);
int rndis_filter_close(struct netvsc_device *nvdev);
int rndis_filter_device_add(struct hv_device *dev,
struct netvsc_device_info *info);
+void rndis_filter_update(struct netvsc_device *nvdev);
void rndis_filter_device_remove(struct hv_device *dev,
struct netvsc_device *nvdev);
int rndis_filter_set_rss_param(struct rndis_device *rdev,
@@ -211,7 +214,6 @@ int rndis_filter_receive(struct net_device *ndev,
struct vmbus_channel *channel,
void *data, u32 buflen);
-int rndis_filter_set_packet_filter(struct rndis_device *dev, u32 new_filter);
int rndis_filter_set_device_mac(struct net_device *ndev, char *mac);
void netvsc_switch_datapath(struct net_device *nv_dev, bool vf);
@@ -696,7 +698,6 @@ struct net_device_context {
/* list protection */
spinlock_t lock;
- struct work_struct work;
u32 msg_enable; /* debug level */
u32 tx_checksum_mask;
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 4421a6d00375..643c539a08ba 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -56,37 +56,12 @@ static int debug = -1;
module_param(debug, int, S_IRUGO);
MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
-static void do_set_multicast(struct work_struct *w)
-{
- struct net_device_context *ndevctx =
- container_of(w, struct net_device_context, work);
- struct hv_device *device_obj = ndevctx->device_ctx;
- struct net_device *ndev = hv_get_drvdata(device_obj);
- struct netvsc_device *nvdev = rcu_dereference(ndevctx->nvdev);
- struct rndis_device *rdev;
-
- if (!nvdev)
- return;
-
- rdev = nvdev->extension;
- if (rdev == NULL)
- return;
-
- if (ndev->flags & IFF_PROMISC)
- rndis_filter_set_packet_filter(rdev,
- NDIS_PACKET_TYPE_PROMISCUOUS);
- else
- rndis_filter_set_packet_filter(rdev,
- NDIS_PACKET_TYPE_BROADCAST |
- NDIS_PACKET_TYPE_ALL_MULTICAST |
- NDIS_PACKET_TYPE_DIRECTED);
-}
-
static void netvsc_set_multicast_list(struct net_device *net)
{
struct net_device_context *net_device_ctx = netdev_priv(net);
+ struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
- schedule_work(&net_device_ctx->work);
+ rndis_filter_update(nvdev);
}
static int netvsc_open(struct net_device *net)
@@ -123,8 +98,6 @@ static int netvsc_close(struct net_device *net)
netif_tx_disable(net);
- /* Make sure netvsc_set_multicast_list doesn't re-enable filter! */
- cancel_work_sync(&net_device_ctx->work);
ret = rndis_filter_close(nvdev);
if (ret != 0) {
netdev_err(net, "unable to close device (ret %d).\n", ret);
@@ -803,7 +776,7 @@ static int netvsc_set_channels(struct net_device *net,
channels->rx_count || channels->tx_count || channels->other_count)
return -EINVAL;
- if (count > net->num_tx_queues || count > net->num_rx_queues)
+ if (count > net->num_tx_queues || count > VRSS_CHANNEL_MAX)
return -EINVAL;
if (!nvdev || nvdev->destroy)
@@ -1028,7 +1001,7 @@ static const struct {
static int netvsc_get_sset_count(struct net_device *dev, int string_set)
{
struct net_device_context *ndc = netdev_priv(dev);
- struct netvsc_device *nvdev = rcu_dereference(ndc->nvdev);
+ struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
if (!nvdev)
return -ENODEV;
@@ -1158,11 +1131,22 @@ netvsc_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info,
}
#ifdef CONFIG_NET_POLL_CONTROLLER
-static void netvsc_poll_controller(struct net_device *net)
+static void netvsc_poll_controller(struct net_device *dev)
{
- /* As netvsc_start_xmit() works synchronous we don't have to
- * trigger anything here.
- */
+ struct net_device_context *ndc = netdev_priv(dev);
+ struct netvsc_device *ndev;
+ int i;
+
+ rcu_read_lock();
+ ndev = rcu_dereference(ndc->nvdev);
+ if (ndev) {
+ for (i = 0; i < ndev->num_chn; i++) {
+ struct netvsc_channel *nvchan = &ndev->chan_table[i];
+
+ napi_schedule(&nvchan->napi);
+ }
+ }
+ rcu_read_unlock();
}
#endif
@@ -1219,7 +1203,7 @@ static int netvsc_set_rxfh(struct net_device *dev, const u32 *indir,
rndis_dev = ndev->extension;
if (indir) {
for (i = 0; i < ITAB_NUM; i++)
- if (indir[i] >= dev->num_rx_queues)
+ if (indir[i] >= VRSS_CHANNEL_MAX)
return -EINVAL;
for (i = 0; i < ITAB_NUM; i++)
@@ -1552,7 +1536,6 @@ static int netvsc_probe(struct hv_device *dev,
hv_set_drvdata(dev, net);
INIT_DELAYED_WORK(&net_device_ctx->dwork, netvsc_link_change);
- INIT_WORK(&net_device_ctx->work, do_set_multicast);
spin_lock_init(&net_device_ctx->lock);
INIT_LIST_HEAD(&net_device_ctx->reconfig_events);
@@ -1622,7 +1605,6 @@ static int netvsc_remove(struct hv_device *dev)
netif_device_detach(net);
cancel_delayed_work_sync(&ndev_ctx->dwork);
- cancel_work_sync(&ndev_ctx->work);
/*
* Call to the vsc driver to let it know that the device is being
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index f9d5b0b8209a..cb79cd081f42 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -31,6 +31,7 @@
#include "hyperv_net.h"
+static void rndis_set_multicast(struct work_struct *w);
#define RNDIS_EXT_LEN PAGE_SIZE
struct rndis_request {
@@ -76,6 +77,7 @@ static struct rndis_device *get_rndis_device(void)
spin_lock_init(&device->request_lock);
INIT_LIST_HEAD(&device->req_list);
+ INIT_WORK(&device->mcast_work, rndis_set_multicast);
device->state = RNDIS_DEV_UNINITIALIZED;
@@ -815,7 +817,8 @@ static int rndis_filter_query_link_speed(struct rndis_device *dev)
return ret;
}
-int rndis_filter_set_packet_filter(struct rndis_device *dev, u32 new_filter)
+static int rndis_filter_set_packet_filter(struct rndis_device *dev,
+ u32 new_filter)
{
struct rndis_request *request;
struct rndis_set_request *set;
@@ -846,6 +849,28 @@ int rndis_filter_set_packet_filter(struct rndis_device *dev, u32 new_filter)
return ret;
}
+static void rndis_set_multicast(struct work_struct *w)
+{
+ struct rndis_device *rdev
+ = container_of(w, struct rndis_device, mcast_work);
+
+ if (rdev->ndev->flags & IFF_PROMISC)
+ rndis_filter_set_packet_filter(rdev,
+ NDIS_PACKET_TYPE_PROMISCUOUS);
+ else
+ rndis_filter_set_packet_filter(rdev,
+ NDIS_PACKET_TYPE_BROADCAST |
+ NDIS_PACKET_TYPE_ALL_MULTICAST |
+ NDIS_PACKET_TYPE_DIRECTED);
+}
+
+void rndis_filter_update(struct netvsc_device *nvdev)
+{
+ struct rndis_device *rdev = nvdev->extension;
+
+ schedule_work(&rdev->mcast_work);
+}
+
static int rndis_filter_init_device(struct rndis_device *dev)
{
struct rndis_request *request;
@@ -973,6 +998,9 @@ static int rndis_filter_close_device(struct rndis_device *dev)
if (dev->state != RNDIS_DEV_DATAINITIALIZED)
return 0;
+ /* Make sure rndis_set_multicast doesn't re-enable filter! */
+ cancel_work_sync(&dev->mcast_work);
+
ret = rndis_filter_set_packet_filter(dev, 0);
if (ret == -ENODEV)
ret = 0;
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 312fce7302d3..144ea5ae8ab4 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -207,7 +207,6 @@ static void ifb_dev_free(struct net_device *dev)
__skb_queue_purge(&txp->tq);
}
kfree(dp->tx_private);
- free_netdev(dev);
}
static void ifb_setup(struct net_device *dev)
@@ -230,7 +229,8 @@ static void ifb_setup(struct net_device *dev)
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
netif_keep_dst(dev);
eth_hw_addr_random(dev);
- dev->destructor = ifb_dev_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ifb_dev_free;
}
static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
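Throughout these driver hunks dev->destructor is replaced by the needs_free_netdev flag plus, where the driver has teardown of its own, a priv_destructor callback; the driver no longer calls free_netdev() itself because the core does so when needs_free_netdev is set. A minimal sketch of the converted pattern for a hypothetical "foo" driver (foo_priv and its stats field are placeholders):

	static void foo_dev_free(struct net_device *dev)
	{
		struct foo_priv *p = netdev_priv(dev);

		free_percpu(p->stats);		/* driver-private state only */
		/* no free_netdev(dev) here any more */
	}

	static void foo_setup(struct net_device *dev)
	{
		dev->needs_free_netdev = true;		/* core frees the netdev */
		dev->priv_destructor = foo_dev_free;	/* private cleanup hook  */
	}

Drivers with nothing private to release, such as geneve or gtp above, set only needs_free_netdev and drop the callback entirely.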
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 618ed88fad0f..7c7680c8f0e3 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -632,7 +632,7 @@ void ipvlan_link_setup(struct net_device *dev)
dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
dev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE;
dev->netdev_ops = &ipvlan_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->header_ops = &ipvlan_header_ops;
dev->ethtool_ops = &ipvlan_ethtool_ops;
}
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 224f65cb576b..30612497643c 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -159,7 +159,6 @@ static void loopback_dev_free(struct net_device *dev)
{
dev_net(dev)->loopback_dev = NULL;
free_percpu(dev->lstats);
- free_netdev(dev);
}
static const struct net_device_ops loopback_ops = {
@@ -196,7 +195,8 @@ static void loopback_setup(struct net_device *dev)
dev->ethtool_ops = &loopback_ethtool_ops;
dev->header_ops = &eth_header_ops;
dev->netdev_ops = &loopback_ops;
- dev->destructor = loopback_dev_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = loopback_dev_free;
}
/* Setup and register the loopback device. */
diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index cdc347be68f2..79411675f0e6 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -2996,7 +2996,6 @@ static void macsec_free_netdev(struct net_device *dev)
free_percpu(macsec->secy.tx_sc.stats);
dev_put(real_dev);
- free_netdev(dev);
}
static void macsec_setup(struct net_device *dev)
@@ -3006,7 +3005,8 @@ static void macsec_setup(struct net_device *dev)
dev->max_mtu = ETH_MAX_MTU;
dev->priv_flags |= IFF_NO_QUEUE;
dev->netdev_ops = &macsec_netdev_ops;
- dev->destructor = macsec_free_netdev;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = macsec_free_netdev;
SET_NETDEV_DEVTYPE(dev, &macsec_type);
eth_zero_addr(dev->broadcast);
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 346ad2ff3998..72b801803aa4 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -39,16 +39,20 @@
#define MACVLAN_HASH_SIZE (1<<MACVLAN_HASH_BITS)
#define MACVLAN_BC_QUEUE_LEN 1000
+#define MACVLAN_F_PASSTHRU 1
+#define MACVLAN_F_ADDRCHANGE 2
+
struct macvlan_port {
struct net_device *dev;
struct hlist_head vlan_hash[MACVLAN_HASH_SIZE];
struct list_head vlans;
struct sk_buff_head bc_queue;
struct work_struct bc_work;
- bool passthru;
+ u32 flags;
int count;
struct hlist_head vlan_source_hash[MACVLAN_HASH_SIZE];
DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);
+ unsigned char perm_addr[ETH_ALEN];
};
struct macvlan_source_entry {
@@ -66,6 +70,31 @@ struct macvlan_skb_cb {
static void macvlan_port_destroy(struct net_device *dev);
+static inline bool macvlan_passthru(const struct macvlan_port *port)
+{
+ return port->flags & MACVLAN_F_PASSTHRU;
+}
+
+static inline void macvlan_set_passthru(struct macvlan_port *port)
+{
+ port->flags |= MACVLAN_F_PASSTHRU;
+}
+
+static inline bool macvlan_addr_change(const struct macvlan_port *port)
+{
+ return port->flags & MACVLAN_F_ADDRCHANGE;
+}
+
+static inline void macvlan_set_addr_change(struct macvlan_port *port)
+{
+ port->flags |= MACVLAN_F_ADDRCHANGE;
+}
+
+static inline void macvlan_clear_addr_change(struct macvlan_port *port)
+{
+ port->flags &= ~MACVLAN_F_ADDRCHANGE;
+}
+
/* Hash Ethernet address */
static u32 macvlan_eth_hash(const unsigned char *addr)
{
@@ -181,11 +210,12 @@ static void macvlan_hash_change_addr(struct macvlan_dev *vlan,
static bool macvlan_addr_busy(const struct macvlan_port *port,
const unsigned char *addr)
{
- /* Test to see if the specified multicast address is
+ /* Test to see if the specified address is
* currently in use by the underlying device or
* another macvlan.
*/
- if (ether_addr_equal_64bits(port->dev->dev_addr, addr))
+ if (!macvlan_passthru(port) && !macvlan_addr_change(port) &&
+ ether_addr_equal_64bits(port->dev->dev_addr, addr))
return true;
if (macvlan_hash_lookup(port, addr))
@@ -445,7 +475,7 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
}
macvlan_forward_source(skb, port, eth->h_source);
- if (port->passthru)
+ if (macvlan_passthru(port))
vlan = list_first_or_null_rcu(&port->vlans,
struct macvlan_dev, list);
else
@@ -574,7 +604,7 @@ static int macvlan_open(struct net_device *dev)
struct net_device *lowerdev = vlan->lowerdev;
int err;
- if (vlan->port->passthru) {
+ if (macvlan_passthru(vlan->port)) {
if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) {
err = dev_set_promiscuity(lowerdev, 1);
if (err < 0)
@@ -649,7 +679,7 @@ static int macvlan_stop(struct net_device *dev)
dev_uc_unsync(lowerdev, dev);
dev_mc_unsync(lowerdev, dev);
- if (vlan->port->passthru) {
+ if (macvlan_passthru(vlan->port)) {
if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC))
dev_set_promiscuity(lowerdev, -1);
goto hash_del;
@@ -672,6 +702,7 @@ static int macvlan_sync_address(struct net_device *dev, unsigned char *addr)
{
struct macvlan_dev *vlan = netdev_priv(dev);
struct net_device *lowerdev = vlan->lowerdev;
+ struct macvlan_port *port = vlan->port;
int err;
if (!(dev->flags & IFF_UP)) {
@@ -682,7 +713,7 @@ static int macvlan_sync_address(struct net_device *dev, unsigned char *addr)
if (macvlan_addr_busy(vlan->port, addr))
return -EBUSY;
- if (!vlan->port->passthru) {
+ if (!macvlan_passthru(port)) {
err = dev_uc_add(lowerdev, addr);
if (err)
return err;
@@ -692,6 +723,15 @@ static int macvlan_sync_address(struct net_device *dev, unsigned char *addr)
macvlan_hash_change_addr(vlan, addr);
}
+ if (macvlan_passthru(port) && !macvlan_addr_change(port)) {
+ /* Since addr_change isn't set, we are here due to lower
+ * device change. Save the lower-dev address so we can
+ * restore it later.
+ */
+ ether_addr_copy(vlan->port->perm_addr,
+ lowerdev->dev_addr);
+ }
+ macvlan_clear_addr_change(port);
return 0;
}
@@ -703,7 +743,12 @@ static int macvlan_set_mac_address(struct net_device *dev, void *p)
if (!is_valid_ether_addr(addr->sa_data))
return -EADDRNOTAVAIL;
+ /* If the addresses are the same, this is a no-op */
+ if (ether_addr_equal(dev->dev_addr, addr->sa_data))
+ return 0;
+
if (vlan->mode == MACVLAN_MODE_PASSTHRU) {
+ macvlan_set_addr_change(vlan->port);
dev_set_mac_address(vlan->lowerdev, addr);
return 0;
}
@@ -928,7 +973,7 @@ static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
/* Support unicast filter only on passthru devices.
* Multicast filter should be allowed on all devices.
*/
- if (!vlan->port->passthru && is_unicast_ether_addr(addr))
+ if (!macvlan_passthru(vlan->port) && is_unicast_ether_addr(addr))
return -EOPNOTSUPP;
if (flags & NLM_F_REPLACE)
@@ -952,7 +997,7 @@ static int macvlan_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
/* Support unicast filter only on passthru devices.
* Multicast filter should be allowed on all devices.
*/
- if (!vlan->port->passthru && is_unicast_ether_addr(addr))
+ if (!macvlan_passthru(vlan->port) && is_unicast_ether_addr(addr))
return -EOPNOTSUPP;
if (is_unicast_ether_addr(addr))
@@ -1092,7 +1137,7 @@ void macvlan_common_setup(struct net_device *dev)
netif_keep_dst(dev);
dev->priv_flags |= IFF_UNICAST_FLT;
dev->netdev_ops = &macvlan_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->header_ops = &macvlan_hard_header_ops;
dev->ethtool_ops = &macvlan_ethtool_ops;
}
@@ -1120,8 +1165,8 @@ static int macvlan_port_create(struct net_device *dev)
if (port == NULL)
return -ENOMEM;
- port->passthru = false;
port->dev = dev;
+ ether_addr_copy(port->perm_addr, dev->dev_addr);
INIT_LIST_HEAD(&port->vlans);
for (i = 0; i < MACVLAN_HASH_SIZE; i++)
INIT_HLIST_HEAD(&port->vlan_hash[i]);
@@ -1161,6 +1206,18 @@ static void macvlan_port_destroy(struct net_device *dev)
kfree_skb(skb);
}
+ /* If the lower device address has been changed by passthru
+ * macvlan, put it back.
+ */
+ if (macvlan_passthru(port) &&
+ !ether_addr_equal(port->dev->dev_addr, port->perm_addr)) {
+ struct sockaddr sa;
+
+ sa.sa_family = port->dev->type;
+ memcpy(&sa.sa_data, port->perm_addr, port->dev->addr_len);
+ dev_set_mac_address(port->dev, &sa);
+ }
+
kfree(port);
}
@@ -1326,7 +1383,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
port = macvlan_port_get_rtnl(lowerdev);
/* Only 1 macvlan device can be created in passthru mode */
- if (port->passthru) {
+ if (macvlan_passthru(port)) {
/* The macvlan port must be not created this time,
* still goto destroy_macvlan_port for readability.
*/
@@ -1352,7 +1409,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
err = -EINVAL;
goto destroy_macvlan_port;
}
- port->passthru = true;
+ macvlan_set_passthru(port);
eth_hw_addr_inherit(dev, lowerdev);
}
@@ -1434,7 +1491,7 @@ static int macvlan_changelink(struct net_device *dev,
if (data && data[IFLA_MACVLAN_FLAGS]) {
__u16 flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]);
bool promisc = (flags ^ vlan->flags) & MACVLAN_FLAG_NOPROMISC;
- if (vlan->port->passthru && promisc) {
+ if (macvlan_passthru(vlan->port) && promisc) {
int err;
if (flags & MACVLAN_FLAG_NOPROMISC)
@@ -1597,7 +1654,7 @@ static int macvlan_device_event(struct notifier_block *unused,
}
break;
case NETDEV_CHANGEADDR:
- if (!port->passthru)
+ if (!macvlan_passthru(port))
return NOTIFY_DONE;
vlan = list_first_entry_or_null(&port->vlans,
diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c
index 06ee6395117f..0e27920c2b6b 100644
--- a/drivers/net/netconsole.c
+++ b/drivers/net/netconsole.c
@@ -358,7 +358,7 @@ static ssize_t enabled_store(struct config_item *item,
if (err)
goto out_unlock;
- pr_info("netconsole: network logging started\n");
+ pr_info("network logging started\n");
} else { /* false */
/* We need to disable the netconsole before cleaning it up
* otherwise we might end up in write_msg() with
diff --git a/drivers/net/nlmon.c b/drivers/net/nlmon.c
index b91603835d26..c4b3362da4a2 100644
--- a/drivers/net/nlmon.c
+++ b/drivers/net/nlmon.c
@@ -113,7 +113,7 @@ static void nlmon_setup(struct net_device *dev)
dev->netdev_ops = &nlmon_ops;
dev->ethtool_ops = &nlmon_ethtool_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->features = NETIF_F_SG | NETIF_F_FRAGLIST |
NETIF_F_HIGHDMA | NETIF_F_LLTX;
diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index c360dd6ead22..3ab6c58d4be6 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -127,6 +127,7 @@ config MDIO_THUNDER
tristate "ThunderX SOCs MDIO buses"
depends on 64BIT
depends on PCI
+ depends on !(MDIO_DEVICE=y && PHYLIB=m)
select MDIO_CAVIUM
help
This driver supports the MDIO interfaces found on Cavium
diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c
index ed0d10f54f26..c3065236ffcc 100644
--- a/drivers/net/phy/dp83640.c
+++ b/drivers/net/phy/dp83640.c
@@ -908,7 +908,7 @@ static void decode_txts(struct dp83640_private *dp83640,
if (overflow) {
pr_debug("tx timestamp queue overflow, count %d\n", overflow);
while (skb) {
- skb_complete_tx_timestamp(skb, NULL);
+ kfree_skb(skb);
skb = skb_dequeue(&dp83640->tx_queue);
}
return;
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index b9252b8d81ff..8b2038844ba9 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -619,6 +619,8 @@ static int ksz9031_read_status(struct phy_device *phydev)
if ((regval & 0xFF) == 0xFF) {
phy_init_hw(phydev);
phydev->link = 0;
+ if (phydev->drv->config_intr && phy_interrupt_is_valid(phydev))
+ phydev->drv->config_intr(phydev);
}
return 0;
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 7524caa0f29d..eebb0e1c70ff 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -54,6 +54,8 @@ static const char *phy_speed_to_str(int speed)
return "5Gbps";
case SPEED_10000:
return "10Gbps";
+ case SPEED_14000:
+ return "14Gbps";
case SPEED_20000:
return "20Gbps";
case SPEED_25000:
diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c
index 1da31dc47f86..74b907206aa7 100644
--- a/drivers/net/slip/slip.c
+++ b/drivers/net/slip/slip.c
@@ -629,7 +629,7 @@ static void sl_uninit(struct net_device *dev)
static void sl_free_netdev(struct net_device *dev)
{
int i = dev->base_addr;
- free_netdev(dev);
+
slip_devs[i] = NULL;
}
@@ -651,7 +651,8 @@ static const struct net_device_ops sl_netdev_ops = {
static void sl_setup(struct net_device *dev)
{
dev->netdev_ops = &sl_netdev_ops;
- dev->destructor = sl_free_netdev;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = sl_free_netdev;
dev->hard_header_len = 0;
dev->addr_len = 0;
@@ -1369,8 +1370,6 @@ static void __exit slip_exit(void)
if (sl->tty) {
printk(KERN_ERR "%s: tty discipline still running\n",
dev->name);
- /* Intentionally leak the control block. */
- dev->destructor = NULL;
}
unregister_netdev(dev);
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 6c5d5ef46f75..fba8c136aa7c 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1643,7 +1643,6 @@ static void team_destructor(struct net_device *dev)
struct team *team = netdev_priv(dev);
free_percpu(team->pcpu_stats);
- free_netdev(dev);
}
static int team_open(struct net_device *dev)
@@ -2079,7 +2078,8 @@ static void team_setup(struct net_device *dev)
dev->netdev_ops = &team_netdev_ops;
dev->ethtool_ops = &team_ethtool_ops;
- dev->destructor = team_destructor;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = team_destructor;
dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
dev->priv_flags |= IFF_NO_QUEUE;
dev->priv_flags |= IFF_TEAM;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index bbd707b9ef7a..9ee7d4275640 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1560,7 +1560,6 @@ static void tun_free_netdev(struct net_device *dev)
free_percpu(tun->pcpu_stats);
tun_flow_uninit(tun);
security_tun_dev_free_security(tun->security);
- free_netdev(dev);
}
static void tun_setup(struct net_device *dev)
@@ -1571,7 +1570,8 @@ static void tun_setup(struct net_device *dev)
tun->group = INVALID_GID;
dev->ethtool_ops = &tun_ethtool_ops;
- dev->destructor = tun_free_netdev;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = tun_free_netdev;
/* We prefer our own queue length */
dev->tx_queue_len = TUN_READQ_SIZE;
}
diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c
index 51cf60092a18..4037ab27734a 100644
--- a/drivers/net/usb/ax88179_178a.c
+++ b/drivers/net/usb/ax88179_178a.c
@@ -1722,6 +1722,18 @@ static const struct driver_info lenovo_info = {
.tx_fixup = ax88179_tx_fixup,
};
+static const struct driver_info belkin_info = {
+ .description = "Belkin USB Ethernet Adapter",
+ .bind = ax88179_bind,
+ .unbind = ax88179_unbind,
+ .status = ax88179_status,
+ .link_reset = ax88179_link_reset,
+ .reset = ax88179_reset,
+ .flags = FLAG_ETHER | FLAG_FRAMING_AX,
+ .rx_fixup = ax88179_rx_fixup,
+ .tx_fixup = ax88179_tx_fixup,
+};
+
static const struct usb_device_id products[] = {
{
/* ASIX AX88179 10/100/1000 */
@@ -1751,6 +1763,10 @@ static const struct usb_device_id products[] = {
/* Lenovo OneLinkDock Gigabit LAN */
USB_DEVICE(0x17ef, 0x304b),
.driver_info = (unsigned long)&lenovo_info,
+}, {
+ /* Belkin B2B128 USB 3.0 Hub + Gigabit Ethernet Adapter */
+ USB_DEVICE(0x050d, 0x0128),
+ .driver_info = (unsigned long)&belkin_info,
},
{ },
};
diff --git a/drivers/net/usb/cdc-phonet.c b/drivers/net/usb/cdc-phonet.c
index eb52de8205f0..c7a350bbaaa7 100644
--- a/drivers/net/usb/cdc-phonet.c
+++ b/drivers/net/usb/cdc-phonet.c
@@ -298,7 +298,7 @@ static void usbpn_setup(struct net_device *dev)
dev->addr_len = 1;
dev->tx_queue_len = 3;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
}
/*
diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 8f923a147fa9..32a22f4e8356 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -123,7 +123,7 @@ static void qmimux_setup(struct net_device *dev)
dev->addr_len = 0;
dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
dev->netdev_ops = &qmimux_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
}
static struct net_device *qmimux_find_dev(struct usbnet *dev, u8 mux_id)
@@ -1192,6 +1192,8 @@ static const struct usb_device_id products[] = {
{QMI_FIXED_INTF(0x1199, 0x9056, 8)}, /* Sierra Wireless Modem */
{QMI_FIXED_INTF(0x1199, 0x9057, 8)},
{QMI_FIXED_INTF(0x1199, 0x9061, 8)}, /* Sierra Wireless Modem */
+ {QMI_FIXED_INTF(0x1199, 0x9063, 8)}, /* Sierra Wireless EM7305 */
+ {QMI_FIXED_INTF(0x1199, 0x9063, 10)}, /* Sierra Wireless EM7305 */
{QMI_FIXED_INTF(0x1199, 0x9071, 8)}, /* Sierra Wireless MC74xx */
{QMI_FIXED_INTF(0x1199, 0x9071, 10)}, /* Sierra Wireless MC74xx */
{QMI_FIXED_INTF(0x1199, 0x9079, 8)}, /* Sierra Wireless EM74xx */
@@ -1206,6 +1208,8 @@ static const struct usb_device_id products[] = {
{QMI_FIXED_INTF(0x1bc7, 0x1100, 3)}, /* Telit ME910 */
{QMI_FIXED_INTF(0x1bc7, 0x1200, 5)}, /* Telit LE920 */
{QMI_QUIRK_SET_DTR(0x1bc7, 0x1201, 2)}, /* Telit LE920, LE920A4 */
+ {QMI_FIXED_INTF(0x1c9e, 0x9801, 3)}, /* Telewell TW-3G HSPA+ */
+ {QMI_FIXED_INTF(0x1c9e, 0x9803, 4)}, /* Telewell TW-3G HSPA+ */
{QMI_FIXED_INTF(0x1c9e, 0x9b01, 3)}, /* XS Stick W100-2 from 4G Systems */
{QMI_FIXED_INTF(0x0b3c, 0xc000, 4)}, /* Olivetti Olicard 100 */
{QMI_FIXED_INTF(0x0b3c, 0xc001, 4)}, /* Olivetti Olicard 120 */
diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index ddc62cb69be8..1a419a45e2a2 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -4368,6 +4368,8 @@ static u8 rtl_get_version(struct usb_interface *intf)
break;
}
+ dev_dbg(&intf->dev, "Detected version 0x%04x\n", version);
+
return version;
}
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 38f0f03a29c8..364fa9d11d1a 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -222,7 +222,6 @@ static int veth_dev_init(struct net_device *dev)
static void veth_dev_free(struct net_device *dev)
{
free_percpu(dev->vstats);
- free_netdev(dev);
}
#ifdef CONFIG_NET_POLL_CONTROLLER
@@ -317,7 +316,8 @@ static void veth_setup(struct net_device *dev)
NETIF_F_HW_VLAN_STAG_TX |
NETIF_F_HW_VLAN_CTAG_RX |
NETIF_F_HW_VLAN_STAG_RX);
- dev->destructor = veth_dev_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = veth_dev_free;
dev->max_mtu = ETH_MAX_MTU;
dev->hw_features = VETH_FEATURES;
@@ -383,7 +383,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
tbp = tb;
}
- if (tbp[IFLA_IFNAME]) {
+ if (ifmp && tbp[IFLA_IFNAME]) {
nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
name_assign_type = NET_NAME_USER;
} else {
@@ -402,7 +402,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
return PTR_ERR(peer);
}
- if (tbp[IFLA_ADDRESS] == NULL)
+ if (!ifmp || !tbp[IFLA_ADDRESS])
eth_hw_addr_random(peer);
if (ifmp && (dev->ifindex != 0))
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index a871f45ecc79..143d8a95a60d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1797,6 +1797,7 @@ static void virtnet_freeze_down(struct virtio_device *vdev)
flush_work(&vi->config_work);
netif_device_detach(vi->dev);
+ netif_tx_disable(vi->dev);
cancel_delayed_work_sync(&vi->refill);
if (netif_running(vi->dev)) {
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index db882493875c..022c0b5f9844 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -36,12 +36,14 @@
#include <net/addrconf.h>
#include <net/l3mdev.h>
#include <net/fib_rules.h>
+#include <net/netns/generic.h>
#define DRV_NAME "vrf"
#define DRV_VERSION "1.0"
#define FIB_RULE_PREF 1000 /* default preference for FIB rules */
-static bool add_fib_rules = true;
+
+static unsigned int vrf_net_id;
struct net_vrf {
struct rtable __rcu *rth;
@@ -1348,7 +1350,7 @@ static void vrf_setup(struct net_device *dev)
dev->netdev_ops = &vrf_netdev_ops;
dev->l3mdev_ops = &vrf_l3mdev_ops;
dev->ethtool_ops = &vrf_ethtool_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
/* Fill in device structure with ethernet-generic values. */
eth_hw_addr_random(dev);
@@ -1394,6 +1396,8 @@ static int vrf_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
struct net_vrf *vrf = netdev_priv(dev);
+ bool *add_fib_rules;
+ struct net *net;
int err;
if (!data || !data[IFLA_VRF_TABLE])
@@ -1409,13 +1413,15 @@ static int vrf_newlink(struct net *src_net, struct net_device *dev,
if (err)
goto out;
- if (add_fib_rules) {
+ net = dev_net(dev);
+ add_fib_rules = net_generic(net, vrf_net_id);
+ if (*add_fib_rules) {
err = vrf_add_fib_rules(dev);
if (err) {
unregister_netdevice(dev);
goto out;
}
- add_fib_rules = false;
+ *add_fib_rules = false;
}
out:
@@ -1498,16 +1504,38 @@ static struct notifier_block vrf_notifier_block __read_mostly = {
.notifier_call = vrf_device_event,
};
+/* Initialize per network namespace state */
+static int __net_init vrf_netns_init(struct net *net)
+{
+ bool *add_fib_rules = net_generic(net, vrf_net_id);
+
+ *add_fib_rules = true;
+
+ return 0;
+}
+
+static struct pernet_operations vrf_net_ops __net_initdata = {
+ .init = vrf_netns_init,
+ .id = &vrf_net_id,
+ .size = sizeof(bool),
+};
+
static int __init vrf_init_module(void)
{
int rc;
register_netdevice_notifier(&vrf_notifier_block);
- rc = rtnl_link_register(&vrf_link_ops);
+ rc = register_pernet_subsys(&vrf_net_ops);
if (rc < 0)
goto error;
+ rc = rtnl_link_register(&vrf_link_ops);
+ if (rc < 0) {
+ unregister_pernet_subsys(&vrf_net_ops);
+ goto error;
+ }
+
return 0;
error:
diff --git a/drivers/net/vsockmon.c b/drivers/net/vsockmon.c
index 7f0136f2dd9d..c28bdce14fd5 100644
--- a/drivers/net/vsockmon.c
+++ b/drivers/net/vsockmon.c
@@ -135,7 +135,7 @@ static void vsockmon_setup(struct net_device *dev)
dev->netdev_ops = &vsockmon_ops;
dev->ethtool_ops = &vsockmon_ethtool_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->features = NETIF_F_SG | NETIF_F_FRAGLIST |
NETIF_F_HIGHDMA | NETIF_F_LLTX;
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index a6b5052c1d36..5fa798a5c9a6 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2611,7 +2611,7 @@ static void vxlan_setup(struct net_device *dev)
eth_hw_addr_random(dev);
ether_setup(dev);
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
SET_NETDEV_DEVTYPE(dev, &vxlan_type);
dev->features |= NETIF_F_LLTX;
diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c
index 65ee2a6f248c..a0d76f70c428 100644
--- a/drivers/net/wan/dlci.c
+++ b/drivers/net/wan/dlci.c
@@ -475,7 +475,7 @@ static void dlci_setup(struct net_device *dev)
dev->flags = 0;
dev->header_ops = &dlci_header_ops;
dev->netdev_ops = &dlci_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dlp->receive = dlci_receive;
diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c
index eb915281197e..78596e42a3f3 100644
--- a/drivers/net/wan/hdlc_fr.c
+++ b/drivers/net/wan/hdlc_fr.c
@@ -1106,7 +1106,7 @@ static int fr_add_pvc(struct net_device *frad, unsigned int dlci, int type)
return -EIO;
}
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
*get_dev_p(pvc, type) = dev;
if (!used) {
state(hdlc)->dce_changed = 1;
diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c
index 9df9ed62beff..63f749078a1f 100644
--- a/drivers/net/wan/lapbether.c
+++ b/drivers/net/wan/lapbether.c
@@ -306,7 +306,7 @@ static const struct net_device_ops lapbeth_netdev_ops = {
static void lapbeth_setup(struct net_device *dev)
{
dev->netdev_ops = &lapbeth_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->type = ARPHRD_X25;
dev->hard_header_len = 3;
dev->mtu = 1000;
diff --git a/drivers/net/wireless/ath/ath6kl/main.c b/drivers/net/wireless/ath/ath6kl/main.c
index 91ee542de3d7..b90c77ef792e 100644
--- a/drivers/net/wireless/ath/ath6kl/main.c
+++ b/drivers/net/wireless/ath/ath6kl/main.c
@@ -1287,7 +1287,7 @@ void init_netdev(struct net_device *dev)
struct ath6kl *ar = ath6kl_priv(dev);
dev->netdev_ops = &ath6kl_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->watchdog_timeo = ATH6KL_TX_TIMEOUT;
dev->needed_headroom = ETH_HLEN;
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index cd1d6730eab7..617199c0e5a0 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -5225,7 +5225,6 @@ void brcmf_cfg80211_free_netdev(struct net_device *ndev)
if (vif)
brcmf_free_vif(vif);
- free_netdev(ndev);
}
static bool brcmf_is_linkup(const struct brcmf_event_msg *e)
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c
index a3d82368f1a9..511d190c6cca 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c
@@ -624,7 +624,8 @@ struct brcmf_if *brcmf_add_if(struct brcmf_pub *drvr, s32 bsscfgidx, s32 ifidx,
if (!ndev)
return ERR_PTR(-ENOMEM);
- ndev->destructor = brcmf_cfg80211_free_netdev;
+ ndev->needs_free_netdev = true;
+ ndev->priv_destructor = brcmf_cfg80211_free_netdev;
ifp = netdev_priv(ndev);
ifp->ndev = ndev;
/* store mapping ifidx to bsscfgidx */
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.c
index c7c1e9906500..d231042f19d6 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.c
@@ -442,7 +442,7 @@ struct brcmf_fw {
const char *nvram_name;
u16 domain_nr;
u16 bus_nr;
- void (*done)(struct device *dev, const struct firmware *fw,
+ void (*done)(struct device *dev, int err, const struct firmware *fw,
void *nvram_image, u32 nvram_len);
};
@@ -477,52 +477,51 @@ static void brcmf_fw_request_nvram_done(const struct firmware *fw, void *ctx)
if (!nvram && !(fwctx->flags & BRCMF_FW_REQ_NV_OPTIONAL))
goto fail;
- fwctx->done(fwctx->dev, fwctx->code, nvram, nvram_length);
+ fwctx->done(fwctx->dev, 0, fwctx->code, nvram, nvram_length);
kfree(fwctx);
return;
fail:
brcmf_dbg(TRACE, "failed: dev=%s\n", dev_name(fwctx->dev));
release_firmware(fwctx->code);
- device_release_driver(fwctx->dev);
+ fwctx->done(fwctx->dev, -ENOENT, NULL, NULL, 0);
kfree(fwctx);
}
static void brcmf_fw_request_code_done(const struct firmware *fw, void *ctx)
{
struct brcmf_fw *fwctx = ctx;
- int ret;
+ int ret = 0;
brcmf_dbg(TRACE, "enter: dev=%s\n", dev_name(fwctx->dev));
- if (!fw)
+ if (!fw) {
+ ret = -ENOENT;
goto fail;
-
- /* only requested code so done here */
- if (!(fwctx->flags & BRCMF_FW_REQUEST_NVRAM)) {
- fwctx->done(fwctx->dev, fw, NULL, 0);
- kfree(fwctx);
- return;
}
+ /* only requested code so done here */
+ if (!(fwctx->flags & BRCMF_FW_REQUEST_NVRAM))
+ goto done;
+
fwctx->code = fw;
ret = request_firmware_nowait(THIS_MODULE, true, fwctx->nvram_name,
fwctx->dev, GFP_KERNEL, fwctx,
brcmf_fw_request_nvram_done);
- if (!ret)
- return;
-
- brcmf_fw_request_nvram_done(NULL, fwctx);
+ /* pass NULL to nvram callback for bcm47xx fallback */
+ if (ret)
+ brcmf_fw_request_nvram_done(NULL, fwctx);
return;
fail:
brcmf_dbg(TRACE, "failed: dev=%s\n", dev_name(fwctx->dev));
- device_release_driver(fwctx->dev);
+done:
+ fwctx->done(fwctx->dev, ret, fw, NULL, 0);
kfree(fwctx);
}
int brcmf_fw_get_firmwares_pcie(struct device *dev, u16 flags,
const char *code, const char *nvram,
- void (*fw_cb)(struct device *dev,
+ void (*fw_cb)(struct device *dev, int err,
const struct firmware *fw,
void *nvram_image, u32 nvram_len),
u16 domain_nr, u16 bus_nr)
@@ -555,7 +554,7 @@ int brcmf_fw_get_firmwares_pcie(struct device *dev, u16 flags,
int brcmf_fw_get_firmwares(struct device *dev, u16 flags,
const char *code, const char *nvram,
- void (*fw_cb)(struct device *dev,
+ void (*fw_cb)(struct device *dev, int err,
const struct firmware *fw,
void *nvram_image, u32 nvram_len))
{
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.h b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.h
index d3c9f0d52ae3..8fa4b7e1ab3d 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.h
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.h
@@ -73,13 +73,13 @@ void brcmf_fw_nvram_free(void *nvram);
*/
int brcmf_fw_get_firmwares_pcie(struct device *dev, u16 flags,
const char *code, const char *nvram,
- void (*fw_cb)(struct device *dev,
+ void (*fw_cb)(struct device *dev, int err,
const struct firmware *fw,
void *nvram_image, u32 nvram_len),
u16 domain_nr, u16 bus_nr);
int brcmf_fw_get_firmwares(struct device *dev, u16 flags,
const char *code, const char *nvram,
- void (*fw_cb)(struct device *dev,
+ void (*fw_cb)(struct device *dev, int err,
const struct firmware *fw,
void *nvram_image, u32 nvram_len));
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwsignal.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwsignal.c
index 72373e59308e..f59642b2c935 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwsignal.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwsignal.c
@@ -2145,7 +2145,7 @@ void brcmf_fws_add_interface(struct brcmf_if *ifp)
struct brcmf_fws_info *fws = drvr_to_fws(ifp->drvr);
struct brcmf_fws_mac_descriptor *entry;
- if (!ifp->ndev || fws->fcmode == BRCMF_FWS_FCMODE_NONE)
+ if (!ifp->ndev || !brcmf_fws_queue_skbs(fws))
return;
entry = &fws->desc.iface[ifp->ifidx];
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
index f36b96dc6acd..f878706613e6 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
@@ -1650,16 +1650,23 @@ static const struct brcmf_buscore_ops brcmf_pcie_buscore_ops = {
.write32 = brcmf_pcie_buscore_write32,
};
-static void brcmf_pcie_setup(struct device *dev, const struct firmware *fw,
+static void brcmf_pcie_setup(struct device *dev, int ret,
+ const struct firmware *fw,
void *nvram, u32 nvram_len)
{
- struct brcmf_bus *bus = dev_get_drvdata(dev);
- struct brcmf_pciedev *pcie_bus_dev = bus->bus_priv.pcie;
- struct brcmf_pciedev_info *devinfo = pcie_bus_dev->devinfo;
+ struct brcmf_bus *bus;
+ struct brcmf_pciedev *pcie_bus_dev;
+ struct brcmf_pciedev_info *devinfo;
struct brcmf_commonring **flowrings;
- int ret;
u32 i;
+ /* check firmware loading result */
+ if (ret)
+ goto fail;
+
+ bus = dev_get_drvdata(dev);
+ pcie_bus_dev = bus->bus_priv.pcie;
+ devinfo = pcie_bus_dev->devinfo;
brcmf_pcie_attach(devinfo);
/* Some of the firmwares have the size of the memory of the device
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
index e03450059b06..5653d6dd38f6 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
@@ -3982,21 +3982,26 @@ static const struct brcmf_bus_ops brcmf_sdio_bus_ops = {
.get_memdump = brcmf_sdio_bus_get_memdump,
};
-static void brcmf_sdio_firmware_callback(struct device *dev,
+static void brcmf_sdio_firmware_callback(struct device *dev, int err,
const struct firmware *code,
void *nvram, u32 nvram_len)
{
- struct brcmf_bus *bus_if = dev_get_drvdata(dev);
- struct brcmf_sdio_dev *sdiodev = bus_if->bus_priv.sdio;
- struct brcmf_sdio *bus = sdiodev->bus;
- int err = 0;
+ struct brcmf_bus *bus_if;
+ struct brcmf_sdio_dev *sdiodev;
+ struct brcmf_sdio *bus;
u8 saveclk;
- brcmf_dbg(TRACE, "Enter: dev=%s\n", dev_name(dev));
+ brcmf_dbg(TRACE, "Enter: dev=%s, err=%d\n", dev_name(dev), err);
+ bus_if = dev_get_drvdata(dev);
+ sdiodev = bus_if->bus_priv.sdio;
+ if (err)
+ goto fail;
if (!bus_if->drvr)
return;
+ bus = sdiodev->bus;
+
/* try to download image and nvram to the dongle */
bus->alp_only = true;
err = brcmf_sdio_download_firmware(bus, code, nvram, nvram_len);
@@ -4083,6 +4088,7 @@ release:
fail:
brcmf_dbg(TRACE, "failed: dev=%s, err=%d\n", dev_name(dev), err);
device_release_driver(dev);
+ device_release_driver(&sdiodev->func[2]->dev);
}
struct brcmf_sdio *brcmf_sdio_probe(struct brcmf_sdio_dev *sdiodev)
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
index e4d545f9edee..0eea48e73331 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
@@ -1159,17 +1159,18 @@ fail:
return ret;
}
-static void brcmf_usb_probe_phase2(struct device *dev,
+static void brcmf_usb_probe_phase2(struct device *dev, int ret,
const struct firmware *fw,
void *nvram, u32 nvlen)
{
struct brcmf_bus *bus = dev_get_drvdata(dev);
- struct brcmf_usbdev_info *devinfo;
- int ret;
+ struct brcmf_usbdev_info *devinfo = bus->bus_priv.usb->devinfo;
+
+ if (ret)
+ goto error;
brcmf_dbg(USB, "Start fw downloading\n");
- devinfo = bus->bus_priv.usb->devinfo;
ret = check_file(fw->data);
if (ret < 0) {
brcmf_err("invalid firmware\n");
diff --git a/drivers/net/wireless/cisco/airo.c b/drivers/net/wireless/cisco/airo.c
index 1b7e125a28e2..6a13303af2b7 100644
--- a/drivers/net/wireless/cisco/airo.c
+++ b/drivers/net/wireless/cisco/airo.c
@@ -3066,7 +3066,7 @@ static int airo_thread(void *data) {
if (ai->jobs) {
locked = down_interruptible(&ai->sem);
} else {
- wait_queue_t wait;
+ wait_queue_entry_t wait;
init_waitqueue_entry(&wait, current);
add_wait_queue(&ai->thr_wait, &wait);
diff --git a/drivers/net/wireless/intersil/hostap/hostap_ioctl.c b/drivers/net/wireless/intersil/hostap/hostap_ioctl.c
index b2c6b065b542..ff153ce29539 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_ioctl.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_ioctl.c
@@ -2544,7 +2544,7 @@ static int prism2_ioctl_priv_prism2_param(struct net_device *dev,
ret = -EINVAL;
}
if (local->iw_mode == IW_MODE_MASTER) {
- wait_queue_t __wait;
+ wait_queue_entry_t __wait;
init_waitqueue_entry(&__wait, current);
add_wait_queue(&local->hostscan_wq, &__wait);
set_current_state(TASK_INTERRUPTIBLE);
diff --git a/drivers/net/wireless/intersil/hostap/hostap_main.c b/drivers/net/wireless/intersil/hostap/hostap_main.c
index 544fc09dcb62..1372b20f931e 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_main.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_main.c
@@ -73,7 +73,7 @@ struct net_device * hostap_add_interface(struct local_info *local,
dev->mem_end = mdev->mem_end;
hostap_setup_dev(dev, local, type);
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
sprintf(dev->name, "%s%s", prefix, name);
if (!rtnl_locked)
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 002b25cff5b6..c854a557998b 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -2861,7 +2861,7 @@ static const struct net_device_ops hwsim_netdev_ops = {
static void hwsim_mon_setup(struct net_device *dev)
{
dev->netdev_ops = &hwsim_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
ether_setup(dev);
dev->priv_flags |= IFF_NO_QUEUE;
dev->type = ARPHRD_IEEE80211_RADIOTAP;
diff --git a/drivers/net/wireless/marvell/libertas/main.c b/drivers/net/wireless/marvell/libertas/main.c
index e3500203715c..dde065d0d5c1 100644
--- a/drivers/net/wireless/marvell/libertas/main.c
+++ b/drivers/net/wireless/marvell/libertas/main.c
@@ -453,7 +453,7 @@ static int lbs_thread(void *data)
{
struct net_device *dev = data;
struct lbs_private *priv = dev->ml_priv;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
lbs_deb_enter(LBS_DEB_THREAD);
diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c
index dd87b9ff64c3..39b6b5e3f6e0 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.c
+++ b/drivers/net/wireless/marvell/mwifiex/main.c
@@ -1280,7 +1280,7 @@ void mwifiex_init_priv_params(struct mwifiex_private *priv,
struct net_device *dev)
{
dev->netdev_ops = &mwifiex_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
/* Initialize private structure */
priv->current_key_index = 0;
priv->media_connected = false;
diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 530586be05b4..5b1d2e8402d9 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -199,6 +199,7 @@ struct xenvif_queue { /* Per-queue data for xenvif */
unsigned long remaining_credit;
struct timer_list credit_timeout;
u64 credit_window_start;
+ bool rate_limited;
/* Statistics */
struct xenvif_stats stats;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 8397f6c92451..e322a862ddfe 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -106,7 +106,11 @@ static int xenvif_poll(struct napi_struct *napi, int budget)
if (work_done < budget) {
napi_complete_done(napi, work_done);
- xenvif_napi_schedule_or_enable_events(queue);
+ /* If the queue is rate-limited, it shall be
+ * rescheduled in the timer callback.
+ */
+ if (likely(!queue->rate_limited))
+ xenvif_napi_schedule_or_enable_events(queue);
}
return work_done;
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 602d408fa25e..5042ff8d449a 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -180,6 +180,7 @@ static void tx_add_credit(struct xenvif_queue *queue)
max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
queue->remaining_credit = min(max_credit, max_burst);
+ queue->rate_limited = false;
}
void xenvif_tx_credit_callback(unsigned long data)
@@ -686,8 +687,10 @@ static bool tx_credit_exceeded(struct xenvif_queue *queue, unsigned size)
msecs_to_jiffies(queue->credit_usec / 1000);
/* Timer could already be pending in rare cases. */
- if (timer_pending(&queue->credit_timeout))
+ if (timer_pending(&queue->credit_timeout)) {
+ queue->rate_limited = true;
return true;
+ }
/* Passed the point where we can replenish credit? */
if (time_after_eq64(now, next_credit)) {
@@ -702,6 +705,7 @@ static bool tx_credit_exceeded(struct xenvif_queue *queue, unsigned size)
mod_timer(&queue->credit_timeout,
next_credit);
queue->credit_window_start = next_credit;
+ queue->rate_limited = true;
return true;
}
diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.c b/drivers/ntb/hw/intel/ntb_hw_intel.c
index c00238491673..7b3b6fd63d7d 100644
--- a/drivers/ntb/hw/intel/ntb_hw_intel.c
+++ b/drivers/ntb/hw/intel/ntb_hw_intel.c
@@ -2878,7 +2878,7 @@ static const struct intel_ntb_reg skx_reg = {
.link_is_up = xeon_link_is_up,
.db_ioread = skx_db_ioread,
.db_iowrite = skx_db_iowrite,
- .db_size = sizeof(u64),
+ .db_size = sizeof(u32),
.ntb_ctl = SKX_NTBCNTL_OFFSET,
.mw_bar = {2, 4},
};
diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
index 02ca45fdd892..10e5bf460139 100644
--- a/drivers/ntb/ntb_transport.c
+++ b/drivers/ntb/ntb_transport.c
@@ -177,14 +177,12 @@ struct ntb_transport_qp {
u64 rx_err_ver;
u64 rx_memcpy;
u64 rx_async;
- u64 dma_rx_prep_err;
u64 tx_bytes;
u64 tx_pkts;
u64 tx_ring_full;
u64 tx_err_no_buf;
u64 tx_memcpy;
u64 tx_async;
- u64 dma_tx_prep_err;
};
struct ntb_transport_mw {
@@ -254,8 +252,6 @@ enum {
#define QP_TO_MW(nt, qp) ((qp) % nt->mw_count)
#define NTB_QP_DEF_NUM_ENTRIES 100
#define NTB_LINK_DOWN_TIMEOUT 10
-#define DMA_RETRIES 20
-#define DMA_OUT_RESOURCE_TO msecs_to_jiffies(50)
static void ntb_transport_rxc_db(unsigned long data);
static const struct ntb_ctx_ops ntb_transport_ops;
@@ -516,12 +512,6 @@ static ssize_t debugfs_read(struct file *filp, char __user *ubuf, size_t count,
out_offset += snprintf(buf + out_offset, out_count - out_offset,
"free tx - \t%u\n",
ntb_transport_tx_free_entry(qp));
- out_offset += snprintf(buf + out_offset, out_count - out_offset,
- "DMA tx prep err - \t%llu\n",
- qp->dma_tx_prep_err);
- out_offset += snprintf(buf + out_offset, out_count - out_offset,
- "DMA rx prep err - \t%llu\n",
- qp->dma_rx_prep_err);
out_offset += snprintf(buf + out_offset, out_count - out_offset,
"\n");
@@ -623,7 +613,7 @@ static int ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt,
if (!mw->virt_addr)
return -ENOMEM;
- if (qp_count % mw_count && mw_num + 1 < qp_count / mw_count)
+ if (mw_num < qp_count % mw_count)
num_qps_mw = qp_count / mw_count + 1;
else
num_qps_mw = qp_count / mw_count;
@@ -768,8 +758,6 @@ static void ntb_qp_link_down_reset(struct ntb_transport_qp *qp)
qp->tx_err_no_buf = 0;
qp->tx_memcpy = 0;
qp->tx_async = 0;
- qp->dma_tx_prep_err = 0;
- qp->dma_rx_prep_err = 0;
}
static void ntb_qp_link_cleanup(struct ntb_transport_qp *qp)
@@ -1000,7 +988,7 @@ static int ntb_transport_init_queue(struct ntb_transport_ctx *nt,
qp->event_handler = NULL;
ntb_qp_link_down_reset(qp);
- if (qp_count % mw_count && mw_num + 1 < qp_count / mw_count)
+ if (mw_num < qp_count % mw_count)
num_qps_mw = qp_count / mw_count + 1;
else
num_qps_mw = qp_count / mw_count;
@@ -1128,8 +1116,8 @@ static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev)
qp_count = ilog2(qp_bitmap);
if (max_num_clients && max_num_clients < qp_count)
qp_count = max_num_clients;
- else if (mw_count < qp_count)
- qp_count = mw_count;
+ else if (nt->mw_count < qp_count)
+ qp_count = nt->mw_count;
qp_bitmap &= BIT_ULL(qp_count) - 1;
@@ -1317,7 +1305,6 @@ static int ntb_async_rx_submit(struct ntb_queue_entry *entry, void *offset)
struct dmaengine_unmap_data *unmap;
dma_cookie_t cookie;
void *buf = entry->buf;
- int retries = 0;
len = entry->len;
device = chan->device;
@@ -1346,22 +1333,11 @@ static int ntb_async_rx_submit(struct ntb_queue_entry *entry, void *offset)
unmap->from_cnt = 1;
- for (retries = 0; retries < DMA_RETRIES; retries++) {
- txd = device->device_prep_dma_memcpy(chan,
- unmap->addr[1],
- unmap->addr[0], len,
- DMA_PREP_INTERRUPT);
- if (txd)
- break;
-
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(DMA_OUT_RESOURCE_TO);
- }
-
- if (!txd) {
- qp->dma_rx_prep_err++;
+ txd = device->device_prep_dma_memcpy(chan, unmap->addr[1],
+ unmap->addr[0], len,
+ DMA_PREP_INTERRUPT);
+ if (!txd)
goto err_get_unmap;
- }
txd->callback_result = ntb_rx_copy_callback;
txd->callback_param = entry;
@@ -1606,7 +1582,6 @@ static int ntb_async_tx_submit(struct ntb_transport_qp *qp,
struct dmaengine_unmap_data *unmap;
dma_addr_t dest;
dma_cookie_t cookie;
- int retries = 0;
device = chan->device;
dest = qp->tx_mw_phys + qp->tx_max_frame * entry->tx_index;
@@ -1628,21 +1603,10 @@ static int ntb_async_tx_submit(struct ntb_transport_qp *qp,
unmap->to_cnt = 1;
- for (retries = 0; retries < DMA_RETRIES; retries++) {
- txd = device->device_prep_dma_memcpy(chan, dest,
- unmap->addr[0], len,
- DMA_PREP_INTERRUPT);
- if (txd)
- break;
-
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(DMA_OUT_RESOURCE_TO);
- }
-
- if (!txd) {
- qp->dma_tx_prep_err++;
+ txd = device->device_prep_dma_memcpy(chan, dest, unmap->addr[0], len,
+ DMA_PREP_INTERRUPT);
+ if (!txd)
goto err_get_unmap;
- }
txd->callback_result = ntb_tx_copy_callback;
txd->callback_param = entry;
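The two ntb_transport.c hunks that switch the test to "mw_num < qp_count % mw_count" fix how queue pairs are spread over memory windows: the first (qp_count % mw_count) windows take one extra queue pair. A worked sketch (hypothetical standalone helper mirroring the new condition):

	static unsigned int qps_for_mw(unsigned int qp_count,
				       unsigned int mw_count,
				       unsigned int mw_num)
	{
		if (mw_num < qp_count % mw_count)
			return qp_count / mw_count + 1;
		return qp_count / mw_count;
	}

For qp_count = 4 and mw_count = 3 this yields 2 + 1 + 1 queue pairs. The old test (mw_num + 1 < qp_count / mw_count, here mw_num + 1 < 1) was false for every window, so mw 0, which actually serves two queue pairs (QP_TO_MW maps both qp 0 and qp 3 to it), was sized as if it carried only one.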
diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
index 434e1d474f33..5cab2831ce99 100644
--- a/drivers/ntb/test/ntb_perf.c
+++ b/drivers/ntb/test/ntb_perf.c
@@ -90,11 +90,11 @@ MODULE_PARM_DESC(max_mw_size, "Limit size of large memory windows");
static unsigned int seg_order = 19; /* 512K */
module_param(seg_order, uint, 0644);
-MODULE_PARM_DESC(seg_order, "size order [n^2] of buffer segment for testing");
+MODULE_PARM_DESC(seg_order, "size order [2^n] of buffer segment for testing");
static unsigned int run_order = 32; /* 4G */
module_param(run_order, uint, 0644);
-MODULE_PARM_DESC(run_order, "size order [n^2] of total data to transfer");
+MODULE_PARM_DESC(run_order, "size order [2^n] of total data to transfer");
static bool use_dma; /* default to 0 */
module_param(use_dma, bool, 0644);
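The corrected descriptions reflect that both parameters are powers-of-two orders (illustrative arithmetic only):

	u64 seg_size = 1ULL << seg_order;	/* 19 -> 512 KiB per segment */
	u64 run_size = 1ULL << run_order;	/* 32 -> 4 GiB of total data */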
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index f12d23c49771..345acca576b3 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -106,7 +106,8 @@ static int nd_blk_rw_integrity(struct nd_namespace_blk *nsblk,
len -= cur_len;
dev_offset += cur_len;
- bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len);
+ if (!bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len))
+ return -EIO;
}
return err;
@@ -179,16 +180,8 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio)
int err = 0, rw;
bool do_acct;
- /*
- * bio_integrity_enabled also checks if the bio already has an
- * integrity payload attached. If it does, we *don't* do a
- * bio_integrity_prep here - the payload has been generated by
- * another kernel subsystem, and we just pass it through.
- */
- if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
- bio->bi_status = BLK_STS_IOERR;
- goto out;
- }
+ if (!bio_integrity_prep(bio))
+ return BLK_QC_T_NONE;
bip = bio_integrity(bio);
nsblk = q->queuedata;
@@ -212,7 +205,6 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio)
if (do_acct)
nd_iostat_end(bio, start);
- out:
bio_endio(bio);
return BLK_QC_T_NONE;
}
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index b6ba0618ea46..d00c10f382f0 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -985,7 +985,8 @@ static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip,
len -= cur_len;
meta_nsoff += cur_len;
- bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len);
+ if (!bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len))
+ return -EIO;
}
return ret;
@@ -1203,16 +1204,8 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
int err = 0;
bool do_acct;
- /*
- * bio_integrity_enabled also checks if the bio already has an
- * integrity payload attached. If it does, we *don't* do a
- * bio_integrity_prep here - the payload has been generated by
- * another kernel subsystem, and we just pass it through.
- */
- if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
- bio->bi_status = BLK_STS_IOERR;
- goto out;
- }
+ if (!bio_integrity_prep(bio))
+ return BLK_QC_T_NONE;
do_acct = nd_iostat_start(bio, &start);
bio_for_each_segment(bvec, bio, iter) {
@@ -1239,7 +1232,6 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
if (do_acct)
nd_iostat_end(bio, start);
-out:
bio_endio(bio);
return BLK_QC_T_NONE;
}
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 48d3ed3d48d1..7639ab67c7a2 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1992,7 +1992,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
if (pci_is_enabled(pdev)) {
u32 csts = readl(dev->bar + NVME_REG_CSTS);
- if (dev->ctrl.state == NVME_CTRL_LIVE)
+ if (dev->ctrl.state == NVME_CTRL_LIVE ||
+ dev->ctrl.state == NVME_CTRL_RESETTING)
nvme_start_freeze(&dev->ctrl);
dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
pdev->error_state != pci_channel_io_normal);
diff --git a/drivers/pci/access.c b/drivers/pci/access.c
index 74cf5fffb1e1..c80e37a69305 100644
--- a/drivers/pci/access.c
+++ b/drivers/pci/access.c
@@ -896,7 +896,7 @@ int pci_read_config_byte(const struct pci_dev *dev, int where, u8 *val)
{
if (pci_dev_is_disconnected(dev)) {
*val = ~0;
- return -ENODEV;
+ return PCIBIOS_DEVICE_NOT_FOUND;
}
return pci_bus_read_config_byte(dev->bus, dev->devfn, where, val);
}
@@ -906,7 +906,7 @@ int pci_read_config_word(const struct pci_dev *dev, int where, u16 *val)
{
if (pci_dev_is_disconnected(dev)) {
*val = ~0;
- return -ENODEV;
+ return PCIBIOS_DEVICE_NOT_FOUND;
}
return pci_bus_read_config_word(dev->bus, dev->devfn, where, val);
}
@@ -917,7 +917,7 @@ int pci_read_config_dword(const struct pci_dev *dev, int where,
{
if (pci_dev_is_disconnected(dev)) {
*val = ~0;
- return -ENODEV;
+ return PCIBIOS_DEVICE_NOT_FOUND;
}
return pci_bus_read_config_dword(dev->bus, dev->devfn, where, val);
}
@@ -926,7 +926,7 @@ EXPORT_SYMBOL(pci_read_config_dword);
int pci_write_config_byte(const struct pci_dev *dev, int where, u8 val)
{
if (pci_dev_is_disconnected(dev))
- return -ENODEV;
+ return PCIBIOS_DEVICE_NOT_FOUND;
return pci_bus_write_config_byte(dev->bus, dev->devfn, where, val);
}
EXPORT_SYMBOL(pci_write_config_byte);
@@ -934,7 +934,7 @@ EXPORT_SYMBOL(pci_write_config_byte);
int pci_write_config_word(const struct pci_dev *dev, int where, u16 val)
{
if (pci_dev_is_disconnected(dev))
- return -ENODEV;
+ return PCIBIOS_DEVICE_NOT_FOUND;
return pci_bus_write_config_word(dev->bus, dev->devfn, where, val);
}
EXPORT_SYMBOL(pci_write_config_word);
@@ -943,7 +943,7 @@ int pci_write_config_dword(const struct pci_dev *dev, int where,
u32 val)
{
if (pci_dev_is_disconnected(dev))
- return -ENODEV;
+ return PCIBIOS_DEVICE_NOT_FOUND;
return pci_bus_write_config_dword(dev->bus, dev->devfn, where, val);
}
EXPORT_SYMBOL(pci_write_config_dword);
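Aside, not part of the diff: with this change the disconnected-device path returns PCIBIOS_* codes like the rest of the config accessors, rather than a raw errno. A hypothetical caller that wants an errno can translate with pcibios_err_to_errno() (example_read_vendor is an invented name):

#include <linux/pci.h>

static int example_read_vendor(struct pci_dev *dev, u16 *vendor)
{
	int rc = pci_read_config_word(dev, PCI_VENDOR_ID, vendor);

	/* PCIBIOS_DEVICE_NOT_FOUND maps to -ENODEV, and so on. */
	return rc ? pcibios_err_to_errno(rc) : 0;
}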
diff --git a/drivers/pci/endpoint/functions/Kconfig b/drivers/pci/endpoint/functions/Kconfig
index 175edad42d2f..2942066607e0 100644
--- a/drivers/pci/endpoint/functions/Kconfig
+++ b/drivers/pci/endpoint/functions/Kconfig
@@ -5,6 +5,7 @@
config PCI_EPF_TEST
tristate "PCI Endpoint Test driver"
depends on PCI_ENDPOINT
+ select CRC32
help
Enable this configuration option to enable the test driver
for PCI Endpoint.
diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c
index 1482d132fbb8..e432ec887479 100644
--- a/drivers/pinctrl/pinctrl-amd.c
+++ b/drivers/pinctrl/pinctrl-amd.c
@@ -495,64 +495,54 @@ static struct irq_chip amd_gpio_irqchip = {
.flags = IRQCHIP_SKIP_SET_WAKE,
};
-static void amd_gpio_irq_handler(struct irq_desc *desc)
+#define PIN_IRQ_PENDING (BIT(INTERRUPT_STS_OFF) | BIT(WAKE_STS_OFF))
+
+static irqreturn_t amd_gpio_irq_handler(int irq, void *dev_id)
{
- u32 i;
- u32 off;
- u32 reg;
- u32 pin_reg;
- u64 reg64;
- int handled = 0;
- unsigned int irq;
+ struct amd_gpio *gpio_dev = dev_id;
+ struct gpio_chip *gc = &gpio_dev->gc;
+ irqreturn_t ret = IRQ_NONE;
+ unsigned int i, irqnr;
unsigned long flags;
- struct irq_chip *chip = irq_desc_get_chip(desc);
- struct gpio_chip *gc = irq_desc_get_handler_data(desc);
- struct amd_gpio *gpio_dev = gpiochip_get_data(gc);
+ u32 *regs, regval;
+ u64 status, mask;
- chained_irq_enter(chip, desc);
- /*enable GPIO interrupt again*/
+ /* Read the wake status */
raw_spin_lock_irqsave(&gpio_dev->lock, flags);
- reg = readl(gpio_dev->base + WAKE_INT_STATUS_REG1);
- reg64 = reg;
- reg64 = reg64 << 32;
-
- reg = readl(gpio_dev->base + WAKE_INT_STATUS_REG0);
- reg64 |= reg;
+ status = readl(gpio_dev->base + WAKE_INT_STATUS_REG1);
+ status <<= 32;
+ status |= readl(gpio_dev->base + WAKE_INT_STATUS_REG0);
raw_spin_unlock_irqrestore(&gpio_dev->lock, flags);
- /*
- * first 46 bits indicates interrupt status.
- * one bit represents four interrupt sources.
- */
- for (off = 0; off < 46 ; off++) {
- if (reg64 & BIT(off)) {
- for (i = 0; i < 4; i++) {
- pin_reg = readl(gpio_dev->base +
- (off * 4 + i) * 4);
- if ((pin_reg & BIT(INTERRUPT_STS_OFF)) ||
- (pin_reg & BIT(WAKE_STS_OFF))) {
- irq = irq_find_mapping(gc->irqdomain,
- off * 4 + i);
- generic_handle_irq(irq);
- writel(pin_reg,
- gpio_dev->base
- + (off * 4 + i) * 4);
- handled++;
- }
- }
+ /* Bit 0-45 contain the relevant status bits */
+ status &= (1ULL << 46) - 1;
+ regs = gpio_dev->base;
+ for (mask = 1, irqnr = 0; status; mask <<= 1, regs += 4, irqnr += 4) {
+ if (!(status & mask))
+ continue;
+ status &= ~mask;
+
+ /* Each status bit covers four pins */
+ for (i = 0; i < 4; i++) {
+ regval = readl(regs + i);
+ if (!(regval & PIN_IRQ_PENDING))
+ continue;
+ irq = irq_find_mapping(gc->irqdomain, irqnr + i);
+ generic_handle_irq(irq);
+ /* Clear interrupt */
+ writel(regval, regs + i);
+ ret = IRQ_HANDLED;
}
}
- if (handled == 0)
- handle_bad_irq(desc);
-
+ /* Signal EOI to the GPIO unit */
raw_spin_lock_irqsave(&gpio_dev->lock, flags);
- reg = readl(gpio_dev->base + WAKE_INT_MASTER_REG);
- reg |= EOI_MASK;
- writel(reg, gpio_dev->base + WAKE_INT_MASTER_REG);
+ regval = readl(gpio_dev->base + WAKE_INT_MASTER_REG);
+ regval |= EOI_MASK;
+ writel(regval, gpio_dev->base + WAKE_INT_MASTER_REG);
raw_spin_unlock_irqrestore(&gpio_dev->lock, flags);
- chained_irq_exit(chip, desc);
+ return ret;
}
static int amd_get_groups_count(struct pinctrl_dev *pctldev)
@@ -821,10 +811,11 @@ static int amd_gpio_probe(struct platform_device *pdev)
goto out2;
}
- gpiochip_set_chained_irqchip(&gpio_dev->gc,
- &amd_gpio_irqchip,
- irq_base,
- amd_gpio_irq_handler);
+ ret = devm_request_irq(&pdev->dev, irq_base, amd_gpio_irq_handler, 0,
+ KBUILD_MODNAME, gpio_dev);
+ if (ret)
+ goto out2;
+
platform_set_drvdata(pdev, gpio_dev);
dev_dbg(&pdev->dev, "amd gpio driver loaded\n");
diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c
index f141aa0430b1..9dd981ddbb17 100644
--- a/drivers/pinctrl/pinctrl-rockchip.c
+++ b/drivers/pinctrl/pinctrl-rockchip.c
@@ -143,9 +143,6 @@ struct rockchip_drv {
* @gpio_chip: gpiolib chip
* @grange: gpio range
* @slock: spinlock for the gpio bank
- * @irq_lock: bus lock for irq chip
- * @new_irqs: newly configured irqs which must be muxed as GPIOs in
- * irq_bus_sync_unlock()
*/
struct rockchip_pin_bank {
void __iomem *reg_base;
@@ -168,8 +165,6 @@ struct rockchip_pin_bank {
struct pinctrl_gpio_range grange;
raw_spinlock_t slock;
u32 toggle_edge_mode;
- struct mutex irq_lock;
- u32 new_irqs;
};
#define PIN_BANK(id, pins, label) \
@@ -2134,12 +2129,11 @@ static int rockchip_irq_set_type(struct irq_data *d, unsigned int type)
int ret;
/* make sure the pin is configured as gpio input */
- ret = rockchip_verify_mux(bank, d->hwirq, RK_FUNC_GPIO);
+ ret = rockchip_set_mux(bank, d->hwirq, RK_FUNC_GPIO);
if (ret < 0)
return ret;
- bank->new_irqs |= mask;
-
+ clk_enable(bank->clk);
raw_spin_lock_irqsave(&bank->slock, flags);
data = readl_relaxed(bank->reg_base + GPIO_SWPORT_DDR);
@@ -2197,6 +2191,7 @@ static int rockchip_irq_set_type(struct irq_data *d, unsigned int type)
default:
irq_gc_unlock(gc);
raw_spin_unlock_irqrestore(&bank->slock, flags);
+ clk_disable(bank->clk);
return -EINVAL;
}
@@ -2205,6 +2200,7 @@ static int rockchip_irq_set_type(struct irq_data *d, unsigned int type)
irq_gc_unlock(gc);
raw_spin_unlock_irqrestore(&bank->slock, flags);
+ clk_disable(bank->clk);
return 0;
}
@@ -2248,34 +2244,6 @@ static void rockchip_irq_disable(struct irq_data *d)
clk_disable(bank->clk);
}
-static void rockchip_irq_bus_lock(struct irq_data *d)
-{
- struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
- struct rockchip_pin_bank *bank = gc->private;
-
- clk_enable(bank->clk);
- mutex_lock(&bank->irq_lock);
-}
-
-static void rockchip_irq_bus_sync_unlock(struct irq_data *d)
-{
- struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
- struct rockchip_pin_bank *bank = gc->private;
-
- while (bank->new_irqs) {
- unsigned int irq = __ffs(bank->new_irqs);
- int ret;
-
- ret = rockchip_set_mux(bank, irq, RK_FUNC_GPIO);
- WARN_ON(ret < 0);
-
- bank->new_irqs &= ~BIT(irq);
- }
-
- mutex_unlock(&bank->irq_lock);
- clk_disable(bank->clk);
-}
-
static int rockchip_interrupts_register(struct platform_device *pdev,
struct rockchip_pinctrl *info)
{
@@ -2342,9 +2310,6 @@ static int rockchip_interrupts_register(struct platform_device *pdev,
gc->chip_types[0].chip.irq_suspend = rockchip_irq_suspend;
gc->chip_types[0].chip.irq_resume = rockchip_irq_resume;
gc->chip_types[0].chip.irq_set_type = rockchip_irq_set_type;
- gc->chip_types[0].chip.irq_bus_lock = rockchip_irq_bus_lock;
- gc->chip_types[0].chip.irq_bus_sync_unlock =
- rockchip_irq_bus_sync_unlock;
gc->wake_enabled = IRQ_MSK(bank->nr_pins);
irq_set_chained_handler_and_data(bank->irq,
@@ -2518,7 +2483,6 @@ static struct rockchip_pin_ctrl *rockchip_pinctrl_get_soc_data(
int bank_pins = 0;
raw_spin_lock_init(&bank->slock);
- mutex_init(&bank->irq_lock);
bank->drvdata = d;
bank->pin_base = ctrl->nr_pins;
ctrl->nr_pins += bank->nr_pins;
diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c
index d3c5f5dfbbd7..222b6685b09f 100644
--- a/drivers/pinctrl/stm32/pinctrl-stm32.c
+++ b/drivers/pinctrl/stm32/pinctrl-stm32.c
@@ -798,7 +798,7 @@ static int stm32_pconf_parse_conf(struct pinctrl_dev *pctldev,
break;
case PIN_CONFIG_OUTPUT:
__stm32_gpio_set(bank, offset, arg);
- ret = stm32_pmx_gpio_set_direction(pctldev, NULL, pin, false);
+ ret = stm32_pmx_gpio_set_direction(pctldev, range, pin, false);
break;
default:
ret = -EINVAL;
diff --git a/drivers/platform/x86/intel_telemetry_debugfs.c b/drivers/platform/x86/intel_telemetry_debugfs.c
index ef29f18b1951..4cc2f4ea0a25 100644
--- a/drivers/platform/x86/intel_telemetry_debugfs.c
+++ b/drivers/platform/x86/intel_telemetry_debugfs.c
@@ -97,11 +97,9 @@
} \
}
-#ifdef CONFIG_PM_SLEEP
static u8 suspend_prep_ok;
static u32 suspend_shlw_ctr_temp, suspend_deep_ctr_temp;
static u64 suspend_shlw_res_temp, suspend_deep_res_temp;
-#endif
struct telemetry_susp_stats {
u32 shlw_swake_ctr;
@@ -807,7 +805,6 @@ static const struct file_operations telem_ioss_trc_verb_ops = {
.release = single_release,
};
-#ifdef CONFIG_PM_SLEEP
static int pm_suspend_prep_cb(void)
{
struct telemetry_evtlog evtlog[TELEM_MAX_OS_ALLOCATED_EVENTS];
@@ -937,7 +934,6 @@ static int pm_notification(struct notifier_block *this,
static struct notifier_block pm_notifier = {
.notifier_call = pm_notification,
};
-#endif /* CONFIG_PM_SLEEP */
static int __init telemetry_debugfs_init(void)
{
@@ -960,14 +956,13 @@ static int __init telemetry_debugfs_init(void)
if (err < 0)
return -EINVAL;
-
-#ifdef CONFIG_PM_SLEEP
register_pm_notifier(&pm_notifier);
-#endif /* CONFIG_PM_SLEEP */
debugfs_conf->telemetry_dbg_dir = debugfs_create_dir("telemetry", NULL);
- if (!debugfs_conf->telemetry_dbg_dir)
- return -ENOMEM;
+ if (!debugfs_conf->telemetry_dbg_dir) {
+ err = -ENOMEM;
+ goto out_pm;
+ }
f = debugfs_create_file("pss_info", S_IFREG | S_IRUGO,
debugfs_conf->telemetry_dbg_dir, NULL,
@@ -1014,6 +1009,8 @@ static int __init telemetry_debugfs_init(void)
out:
debugfs_remove_recursive(debugfs_conf->telemetry_dbg_dir);
debugfs_conf->telemetry_dbg_dir = NULL;
+out_pm:
+ unregister_pm_notifier(&pm_notifier);
return err;
}
@@ -1022,6 +1019,7 @@ static void __exit telemetry_debugfs_exit(void)
{
debugfs_remove_recursive(debugfs_conf->telemetry_dbg_dir);
debugfs_conf->telemetry_dbg_dir = NULL;
+ unregister_pm_notifier(&pm_notifier);
}
late_initcall(telemetry_debugfs_init);
diff --git a/drivers/rtc/rtc-imxdi.c b/drivers/rtc/rtc-imxdi.c
index 6b54f6c24c5f..80931114c899 100644
--- a/drivers/rtc/rtc-imxdi.c
+++ b/drivers/rtc/rtc-imxdi.c
@@ -709,7 +709,7 @@ static irqreturn_t dryice_irq(int irq, void *dev_id)
/*If the write wait queue is empty then there is no pending
operations. It means the interrupt is for DryIce -Security.
IRQ must be returned as none.*/
- if (list_empty_careful(&imxdi->write_wait.task_list))
+ if (list_empty_careful(&imxdi->write_wait.head))
return rc;
/* DSR_WCF clears itself on DSR read */
diff --git a/drivers/s390/net/netiucv.c b/drivers/s390/net/netiucv.c
index dba94b486f05..fa732bd86729 100644
--- a/drivers/s390/net/netiucv.c
+++ b/drivers/s390/net/netiucv.c
@@ -1954,7 +1954,6 @@ static void netiucv_free_netdevice(struct net_device *dev)
privptr->conn = NULL; privptr->fsm = NULL;
/* privptr gets freed by free_netdev() */
}
- free_netdev(dev);
}
/**
@@ -1972,7 +1971,8 @@ static void netiucv_setup_netdevice(struct net_device *dev)
dev->mtu = NETIUCV_MTU_DEFAULT;
dev->min_mtu = 576;
dev->max_mtu = NETIUCV_MTU_MAX;
- dev->destructor = netiucv_free_netdevice;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = netiucv_free_netdevice;
dev->hard_header_len = NETIUCV_HDRLEN;
dev->addr_len = 0;
dev->type = ARPHRD_SLIP;
diff --git a/drivers/scsi/dpt/dpti_i2o.h b/drivers/scsi/dpt/dpti_i2o.h
index bd9e31e16249..16fc380b5512 100644
--- a/drivers/scsi/dpt/dpti_i2o.h
+++ b/drivers/scsi/dpt/dpti_i2o.h
@@ -48,7 +48,7 @@
#include <linux/wait.h>
typedef wait_queue_head_t adpt_wait_queue_head_t;
#define ADPT_DECLARE_WAIT_QUEUE_HEAD(wait) DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wait)
-typedef wait_queue_t adpt_wait_queue_t;
+typedef wait_queue_entry_t adpt_wait_queue_entry_t;
/*
* message structures
diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c
index 3419e1bcdff6..67621308eb9c 100644
--- a/drivers/scsi/ips.c
+++ b/drivers/scsi/ips.c
@@ -301,13 +301,13 @@ static uint32_t ips_statupd_copperhead_memio(ips_ha_t *);
static uint32_t ips_statupd_morpheus(ips_ha_t *);
static ips_scb_t *ips_getscb(ips_ha_t *);
static void ips_putq_scb_head(ips_scb_queue_t *, ips_scb_t *);
-static void ips_putq_wait_tail(ips_wait_queue_t *, struct scsi_cmnd *);
+static void ips_putq_wait_tail(ips_wait_queue_entry_t *, struct scsi_cmnd *);
static void ips_putq_copp_tail(ips_copp_queue_t *,
ips_copp_wait_item_t *);
static ips_scb_t *ips_removeq_scb_head(ips_scb_queue_t *);
static ips_scb_t *ips_removeq_scb(ips_scb_queue_t *, ips_scb_t *);
-static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_t *);
-static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_t *,
+static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_entry_t *);
+static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_entry_t *,
struct scsi_cmnd *);
static ips_copp_wait_item_t *ips_removeq_copp(ips_copp_queue_t *,
ips_copp_wait_item_t *);
@@ -2871,7 +2871,7 @@ ips_removeq_scb(ips_scb_queue_t * queue, ips_scb_t * item)
/* ASSUMED to be called from within the HA lock */
/* */
/****************************************************************************/
-static void ips_putq_wait_tail(ips_wait_queue_t *queue, struct scsi_cmnd *item)
+static void ips_putq_wait_tail(ips_wait_queue_entry_t *queue, struct scsi_cmnd *item)
{
METHOD_TRACE("ips_putq_wait_tail", 1);
@@ -2902,7 +2902,7 @@ static void ips_putq_wait_tail(ips_wait_queue_t *queue, struct scsi_cmnd *item)
/* ASSUMED to be called from within the HA lock */
/* */
/****************************************************************************/
-static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_t *queue)
+static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_entry_t *queue)
{
struct scsi_cmnd *item;
@@ -2936,7 +2936,7 @@ static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_t *queue)
/* ASSUMED to be called from within the HA lock */
/* */
/****************************************************************************/
-static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_t *queue,
+static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_entry_t *queue,
struct scsi_cmnd *item)
{
struct scsi_cmnd *p;
diff --git a/drivers/scsi/ips.h b/drivers/scsi/ips.h
index b782bb60baf0..366be3b2f9b4 100644
--- a/drivers/scsi/ips.h
+++ b/drivers/scsi/ips.h
@@ -989,7 +989,7 @@ typedef struct ips_wait_queue {
struct scsi_cmnd *head;
struct scsi_cmnd *tail;
int count;
-} ips_wait_queue_t;
+} ips_wait_queue_entry_t;
typedef struct ips_copp_wait_item {
struct scsi_cmnd *scsi_cmd;
@@ -1035,7 +1035,7 @@ typedef struct ips_ha {
ips_stat_t sp; /* Status packer pointer */
struct ips_scb *scbs; /* Array of all CCBS */
struct ips_scb *scb_freelist; /* SCB free list */
- ips_wait_queue_t scb_waitlist; /* Pending SCB list */
+ ips_wait_queue_entry_t scb_waitlist; /* Pending SCB list */
ips_copp_queue_t copp_waitlist; /* Pending PT list */
ips_scb_queue_t scb_activelist; /* Active SCB list */
IPS_IO_CMD *dummy; /* dummy command */
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c
index 54fd0c81ceaf..99d2e990b231 100644
--- a/drivers/scsi/lpfc/lpfc_scsi.c
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
@@ -26,6 +26,7 @@
#include <linux/export.h>
#include <linux/delay.h>
#include <asm/unaligned.h>
+#include <linux/t10-pi.h>
#include <linux/crc-t10dif.h>
#include <net/checksum.h>
@@ -2934,8 +2935,8 @@ lpfc_calc_bg_err(struct lpfc_hba *phba, struct lpfc_scsi_buf *lpfc_cmd)
* First check to see if a protection data
* check is valid
*/
- if ((src->ref_tag == 0xffffffff) ||
- (src->app_tag == 0xffff)) {
+ if ((src->ref_tag == T10_PI_REF_ESCAPE) ||
+ (src->app_tag == T10_PI_APP_ESCAPE)) {
start_ref_tag++;
goto skipit;
}
diff --git a/drivers/scsi/qedi/qedi_fw.c b/drivers/scsi/qedi/qedi_fw.c
index 8bc7ee1a8ca8..507512cc478b 100644
--- a/drivers/scsi/qedi/qedi_fw.c
+++ b/drivers/scsi/qedi/qedi_fw.c
@@ -870,7 +870,6 @@ static void qedi_process_cmd_cleanup_resp(struct qedi_ctx *qedi,
QEDI_ERR(&qedi->dbg_ctx,
"Delayed or untracked cleanup response, itt=0x%x, tid=0x%x, cid=0x%x, task=%p\n",
protoitt, cqe->itid, qedi_conn->iscsi_conn_id, task);
- WARN_ON(1);
}
}
diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c
index 09a294634bc7..879d3b7462f9 100644
--- a/drivers/scsi/qedi/qedi_main.c
+++ b/drivers/scsi/qedi/qedi_main.c
@@ -1499,11 +1499,9 @@ err_idx:
void qedi_clear_task_idx(struct qedi_ctx *qedi, int idx)
{
- if (!test_and_clear_bit(idx, qedi->task_idx_map)) {
+ if (!test_and_clear_bit(idx, qedi->task_idx_map))
QEDI_ERR(&qedi->dbg_ctx,
"FW task context, already cleared, tid=0x%x\n", idx);
- WARN_ON(1);
- }
}
void qedi_update_itt_map(struct qedi_ctx *qedi, u32 tid, u32 proto_itt,
diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
index 2572121b765b..de031aed94f6 100644
--- a/drivers/scsi/qla2xxx/qla_isr.c
+++ b/drivers/scsi/qla2xxx/qla_isr.c
@@ -1950,9 +1950,9 @@ qla2x00_handle_dif_error(srb_t *sp, struct sts_entry_24xx *sts24)
* For type 3: ref & app tag is all 'f's
* For type 0,1,2: app tag is all 'f's
*/
- if ((a_app_tag == 0xffff) &&
+ if ((a_app_tag == T10_PI_APP_ESCAPE) &&
((scsi_get_prot_type(cmd) != SCSI_PROT_DIF_TYPE3) ||
- (a_ref_tag == 0xffffffff))) {
+ (a_ref_tag == T10_PI_REF_ESCAPE))) {
uint32_t blocks_done, resid;
sector_t lba_s = scsi_get_lba(cmd);
@@ -1994,9 +1994,9 @@ qla2x00_handle_dif_error(srb_t *sp, struct sts_entry_24xx *sts24)
spt = page_address(sg_page(sg)) + sg->offset;
spt += j;
- spt->app_tag = 0xffff;
+ spt->app_tag = T10_PI_APP_ESCAPE;
if (scsi_get_prot_type(cmd) == SCSI_PROT_DIF_TYPE3)
- spt->ref_tag = 0xffffffff;
+ spt->ref_tag = T10_PI_REF_ESCAPE;
}
return 0;
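Aside, not part of the diff: the lpfc and qla2xxx hunks above (and the target_core_sbc hunk further down) replace open-coded 0xffff/0xffffffff with the shared escape constants from <linux/t10-pi.h>. A hypothetical helper using the same constants on a struct t10_pi_tuple; both sides of each comparison are big-endian, so no byte-swapping is needed:

#include <linux/t10-pi.h>

static bool example_pi_escaped(const struct t10_pi_tuple *pi)
{
	/* An all-ones app tag (and, for Type 3, ref tag) means "do not check". */
	return pi->app_tag == T10_PI_APP_ESCAPE ||
	       pi->ref_tag == T10_PI_REF_ESCAPE;
}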
diff --git a/drivers/staging/iio/cdc/ad7152.c b/drivers/staging/iio/cdc/ad7152.c
index dc6ecd824365..ff10d1f0a7e4 100644
--- a/drivers/staging/iio/cdc/ad7152.c
+++ b/drivers/staging/iio/cdc/ad7152.c
@@ -231,16 +231,12 @@ static int ad7152_write_raw_samp_freq(struct device *dev, int val)
if (i >= ARRAY_SIZE(ad7152_filter_rate_table))
i = ARRAY_SIZE(ad7152_filter_rate_table) - 1;
- mutex_lock(&chip->state_lock);
ret = i2c_smbus_write_byte_data(chip->client,
AD7152_REG_CFG2, AD7152_CFG2_OSR(i));
- if (ret < 0) {
- mutex_unlock(&chip->state_lock);
+ if (ret < 0)
return ret;
- }
chip->filter_rate_setup = i;
- mutex_unlock(&chip->state_lock);
return ret;
}
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
index 0db662d6abdd..85b242ec5f9b 100644
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
@@ -3267,7 +3267,7 @@ int
kiblnd_connd(void *arg)
{
spinlock_t *lock = &kiblnd_data.kib_connd_lock;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
unsigned long flags;
struct kib_conn *conn;
int timeout;
@@ -3521,7 +3521,7 @@ kiblnd_scheduler(void *arg)
long id = (long)arg;
struct kib_sched_info *sched;
struct kib_conn *conn;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
unsigned long flags;
struct ib_wc wc;
int did_something;
@@ -3656,7 +3656,7 @@ kiblnd_failover_thread(void *arg)
{
rwlock_t *glock = &kiblnd_data.kib_global_lock;
struct kib_dev *dev;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
unsigned long flags;
int rc;
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
index 3ed3b08c122c..6b38d5a8fe92 100644
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
@@ -2166,7 +2166,7 @@ ksocknal_connd(void *arg)
{
spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock;
struct ksock_connreq *cr;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
int nloops = 0;
int cons_retry = 0;
@@ -2554,7 +2554,7 @@ ksocknal_check_peer_timeouts(int idx)
int
ksocknal_reaper(void *arg)
{
- wait_queue_t wait;
+ wait_queue_entry_t wait;
struct ksock_conn *conn;
struct ksock_sched *sched;
struct list_head enomem_conns;
diff --git a/drivers/staging/lustre/lnet/libcfs/debug.c b/drivers/staging/lustre/lnet/libcfs/debug.c
index c56e9922cd5b..49deb448b044 100644
--- a/drivers/staging/lustre/lnet/libcfs/debug.c
+++ b/drivers/staging/lustre/lnet/libcfs/debug.c
@@ -361,7 +361,7 @@ static int libcfs_debug_dumplog_thread(void *arg)
void libcfs_debug_dumplog(void)
{
- wait_queue_t wait;
+ wait_queue_entry_t wait;
struct task_struct *dumper;
/* we're being careful to ensure that the kernel thread is
diff --git a/drivers/staging/lustre/lnet/libcfs/tracefile.c b/drivers/staging/lustre/lnet/libcfs/tracefile.c
index 9599b7441feb..27082d2f7938 100644
--- a/drivers/staging/lustre/lnet/libcfs/tracefile.c
+++ b/drivers/staging/lustre/lnet/libcfs/tracefile.c
@@ -990,7 +990,7 @@ static int tracefiled(void *arg)
complete(&tctl->tctl_start);
while (1) {
- wait_queue_t __wait;
+ wait_queue_entry_t __wait;
pc.pc_want_daemon_pages = 0;
collect_pages(&pc);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c
index ce4b83584e17..9ebba4ef5f90 100644
--- a/drivers/staging/lustre/lnet/lnet/lib-eq.c
+++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c
@@ -312,7 +312,7 @@ __must_hold(&the_lnet.ln_eq_wait_lock)
{
int tms = *timeout_ms;
int wait;
- wait_queue_t wl;
+ wait_queue_entry_t wl;
unsigned long now;
if (!tms)
diff --git a/drivers/staging/lustre/lnet/lnet/lib-socket.c b/drivers/staging/lustre/lnet/lnet/lib-socket.c
index 9fca8d225ee0..f075706bba6d 100644
--- a/drivers/staging/lustre/lnet/lnet/lib-socket.c
+++ b/drivers/staging/lustre/lnet/lnet/lib-socket.c
@@ -516,7 +516,7 @@ lnet_sock_listen(struct socket **sockp, __u32 local_ip, int local_port,
int
lnet_sock_accept(struct socket **newsockp, struct socket *sock)
{
- wait_queue_t wait;
+ wait_queue_entry_t wait;
struct socket *newsock;
int rc;
diff --git a/drivers/staging/lustre/lustre/fid/fid_request.c b/drivers/staging/lustre/lustre/fid/fid_request.c
index 999f250ceed0..bf31bc200d27 100644
--- a/drivers/staging/lustre/lustre/fid/fid_request.c
+++ b/drivers/staging/lustre/lustre/fid/fid_request.c
@@ -192,7 +192,7 @@ static int seq_client_alloc_seq(const struct lu_env *env,
}
static int seq_fid_alloc_prep(struct lu_client_seq *seq,
- wait_queue_t *link)
+ wait_queue_entry_t *link)
{
if (seq->lcs_update) {
add_wait_queue(&seq->lcs_waitq, link);
@@ -223,7 +223,7 @@ static void seq_fid_alloc_fini(struct lu_client_seq *seq)
int seq_client_alloc_fid(const struct lu_env *env,
struct lu_client_seq *seq, struct lu_fid *fid)
{
- wait_queue_t link;
+ wait_queue_entry_t link;
int rc;
LASSERT(seq);
@@ -290,7 +290,7 @@ EXPORT_SYMBOL(seq_client_alloc_fid);
*/
void seq_client_flush(struct lu_client_seq *seq)
{
- wait_queue_t link;
+ wait_queue_entry_t link;
LASSERT(seq);
init_waitqueue_entry(&link, current);
diff --git a/drivers/staging/lustre/lustre/include/lustre_lib.h b/drivers/staging/lustre/lustre/include/lustre_lib.h
index b04d613846ee..f24970da8323 100644
--- a/drivers/staging/lustre/lustre/include/lustre_lib.h
+++ b/drivers/staging/lustre/lustre/include/lustre_lib.h
@@ -201,7 +201,7 @@ struct l_wait_info {
sigmask(SIGALRM))
/**
- * wait_queue_t of Linux (version < 2.6.34) is a FIFO list for exclusively
+ * wait_queue_entry_t of Linux (version < 2.6.34) is a FIFO list for exclusively
* waiting threads, which is not always desirable because all threads will
* be waken up again and again, even user only needs a few of them to be
* active most time. This is not good for performance because cache can
@@ -228,7 +228,7 @@ struct l_wait_info {
*/
#define __l_wait_event(wq, condition, info, ret, l_add_wait) \
do { \
- wait_queue_t __wait; \
+ wait_queue_entry_t __wait; \
long __timeout = info->lwi_timeout; \
sigset_t __blocked; \
int __allow_intr = info->lwi_allow_intr; \
diff --git a/drivers/staging/lustre/lustre/llite/lcommon_cl.c b/drivers/staging/lustre/lustre/llite/lcommon_cl.c
index 8af611033e12..96515b839436 100644
--- a/drivers/staging/lustre/lustre/llite/lcommon_cl.c
+++ b/drivers/staging/lustre/lustre/llite/lcommon_cl.c
@@ -207,7 +207,7 @@ int cl_file_inode_init(struct inode *inode, struct lustre_md *md)
static void cl_object_put_last(struct lu_env *env, struct cl_object *obj)
{
struct lu_object_header *header = obj->co_lu.lo_header;
- wait_queue_t waiter;
+ wait_queue_entry_t waiter;
if (unlikely(atomic_read(&header->loh_ref) != 1)) {
struct lu_site *site = obj->co_lu.lo_dev->ld_site;
diff --git a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h
index 391c632365ae..e889d3a7de9c 100644
--- a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h
+++ b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h
@@ -370,7 +370,7 @@ struct lov_thread_info {
struct ost_lvb lti_lvb;
struct cl_2queue lti_cl2q;
struct cl_page_list lti_plist;
- wait_queue_t lti_waiter;
+ wait_queue_entry_t lti_waiter;
struct cl_attr lti_attr;
};
diff --git a/drivers/staging/lustre/lustre/lov/lov_object.c b/drivers/staging/lustre/lustre/lov/lov_object.c
index ab3ecfeeadc8..eddabbe31e5c 100644
--- a/drivers/staging/lustre/lustre/lov/lov_object.c
+++ b/drivers/staging/lustre/lustre/lov/lov_object.c
@@ -371,7 +371,7 @@ static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov,
struct lov_layout_raid0 *r0;
struct lu_site *site;
struct lu_site_bkt_data *bkt;
- wait_queue_t *waiter;
+ wait_queue_entry_t *waiter;
r0 = &lov->u.raid0;
LASSERT(r0->lo_sub[idx] == los);
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c
index abcf951208d2..76ae600ae2c8 100644
--- a/drivers/staging/lustre/lustre/obdclass/lu_object.c
+++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c
@@ -556,7 +556,7 @@ EXPORT_SYMBOL(lu_object_print);
static struct lu_object *htable_lookup(struct lu_site *s,
struct cfs_hash_bd *bd,
const struct lu_fid *f,
- wait_queue_t *waiter,
+ wait_queue_entry_t *waiter,
__u64 *version)
{
struct lu_site_bkt_data *bkt;
@@ -670,7 +670,7 @@ static struct lu_object *lu_object_find_try(const struct lu_env *env,
struct lu_device *dev,
const struct lu_fid *f,
const struct lu_object_conf *conf,
- wait_queue_t *waiter)
+ wait_queue_entry_t *waiter)
{
struct lu_object *o;
struct lu_object *shadow;
@@ -750,7 +750,7 @@ struct lu_object *lu_object_find_at(const struct lu_env *env,
{
struct lu_site_bkt_data *bkt;
struct lu_object *obj;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
while (1) {
obj = lu_object_find_try(env, dev, f, conf, &wait);
diff --git a/drivers/staging/rtl8188eu/os_dep/mon.c b/drivers/staging/rtl8188eu/os_dep/mon.c
index cfe37eb026d6..859d0d6051cd 100644
--- a/drivers/staging/rtl8188eu/os_dep/mon.c
+++ b/drivers/staging/rtl8188eu/os_dep/mon.c
@@ -152,7 +152,7 @@ static const struct net_device_ops mon_netdev_ops = {
static void mon_setup(struct net_device *dev)
{
dev->netdev_ops = &mon_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
ether_setup(dev);
dev->priv_flags |= IFF_NO_QUEUE;
dev->type = ARPHRD_IEEE80211;
diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c
index 36c3189fc4b7..bd4352fe2de3 100644
--- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c
+++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c
@@ -2667,7 +2667,8 @@ static int rtw_cfg80211_add_monitor_if (struct adapter *padapter, char *name, st
mon_ndev->type = ARPHRD_IEEE80211_RADIOTAP;
strncpy(mon_ndev->name, name, IFNAMSIZ);
mon_ndev->name[IFNAMSIZ - 1] = 0;
- mon_ndev->destructor = rtw_ndev_destructor;
+ mon_ndev->needs_free_netdev = true;
+ mon_ndev->priv_destructor = rtw_ndev_destructor;
mon_ndev->netdev_ops = &rtw_cfg80211_monitor_if_ops;
diff --git a/drivers/staging/rtl8723bs/os_dep/os_intfs.c b/drivers/staging/rtl8723bs/os_dep/os_intfs.c
index f83cfc76505c..021589913681 100644
--- a/drivers/staging/rtl8723bs/os_dep/os_intfs.c
+++ b/drivers/staging/rtl8723bs/os_dep/os_intfs.c
@@ -1207,8 +1207,6 @@ void rtw_ndev_destructor(struct net_device *ndev)
if (ndev->ieee80211_ptr)
kfree((u8 *)ndev->ieee80211_ptr);
-
- free_netdev(ndev);
}
void rtw_dev_unload(struct adapter *padapter)
diff --git a/drivers/staging/rtl8723bs/os_dep/osdep_service.c b/drivers/staging/rtl8723bs/os_dep/osdep_service.c
index 02db59e8b593..aa16d1ab955b 100644
--- a/drivers/staging/rtl8723bs/os_dep/osdep_service.c
+++ b/drivers/staging/rtl8723bs/os_dep/osdep_service.c
@@ -160,7 +160,7 @@ static int isFileReadable(char *path)
oldfs = get_fs(); set_fs(get_ds());
if (1!=readFile(fp, &buf, 1))
- ret = PTR_ERR(fp);
+ ret = -EINVAL;
set_fs(oldfs);
filp_close(fp, NULL);
diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c
index 0d8f81591bed..3fdca2cdd8da 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -1279,6 +1279,18 @@ iscsit_get_immediate_data(struct iscsi_cmd *cmd, struct iscsi_scsi_req *hdr,
*/
if (dump_payload)
goto after_immediate_data;
+ /*
+ * Check for underflow case where both EDTL and immediate data payload
+ * exceeds what is presented by CDB's TRANSFER LENGTH, and what has
+ * already been set in target_cmd_size_check() as se_cmd->data_length.
+ *
+ * For this special case, fail the command and dump the immediate data
+ * payload.
+ */
+ if (cmd->first_burst_len > cmd->se_cmd.data_length) {
+ cmd->sense_reason = TCM_INVALID_CDB_FIELD;
+ goto after_immediate_data;
+ }
immed_ret = iscsit_handle_immediate_data(cmd, hdr,
cmd->first_burst_len);
@@ -4423,8 +4435,11 @@ static void iscsit_logout_post_handler_closesession(
* always sleep waiting for RX/TX thread shutdown to complete
* within iscsit_close_connection().
*/
- if (!conn->conn_transport->rdma_shutdown)
+ if (!conn->conn_transport->rdma_shutdown) {
sleep = cmpxchg(&conn->tx_thread_active, true, false);
+ if (!sleep)
+ return;
+ }
atomic_set(&conn->conn_logout_remove, 0);
complete(&conn->conn_logout_comp);
@@ -4440,8 +4455,11 @@ static void iscsit_logout_post_handler_samecid(
{
int sleep = 1;
- if (!conn->conn_transport->rdma_shutdown)
+ if (!conn->conn_transport->rdma_shutdown) {
sleep = cmpxchg(&conn->tx_thread_active, true, false);
+ if (!sleep)
+ return;
+ }
atomic_set(&conn->conn_logout_remove, 0);
complete(&conn->conn_logout_comp);
diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h
index 9ab7090f7c83..0912de7c0cf8 100644
--- a/drivers/target/target_core_internal.h
+++ b/drivers/target/target_core_internal.h
@@ -136,7 +136,7 @@ int init_se_kmem_caches(void);
void release_se_kmem_caches(void);
u32 scsi_get_new_index(scsi_index_t);
void transport_subsystem_check_init(void);
-void transport_cmd_finish_abort(struct se_cmd *, int);
+int transport_cmd_finish_abort(struct se_cmd *, int);
unsigned char *transport_dump_cmd_direction(struct se_cmd *);
void transport_dump_dev_state(struct se_device *, char *, int *);
void transport_dump_dev_info(struct se_device *, struct se_lun *,
diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c
index 4316f7b65fb7..dc9456e7dac9 100644
--- a/drivers/target/target_core_sbc.c
+++ b/drivers/target/target_core_sbc.c
@@ -1450,7 +1450,7 @@ sbc_dif_verify(struct se_cmd *cmd, sector_t start, unsigned int sectors,
(unsigned long long)sector, sdt->guard_tag,
sdt->app_tag, be32_to_cpu(sdt->ref_tag));
- if (sdt->app_tag == cpu_to_be16(0xffff)) {
+ if (sdt->app_tag == T10_PI_APP_ESCAPE) {
dsg_off += block_size;
goto next;
}
diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c
index dce1e1b47316..13f47bf4d16b 100644
--- a/drivers/target/target_core_tmr.c
+++ b/drivers/target/target_core_tmr.c
@@ -75,7 +75,7 @@ void core_tmr_release_req(struct se_tmr_req *tmr)
kfree(tmr);
}
-static void core_tmr_handle_tas_abort(struct se_cmd *cmd, int tas)
+static int core_tmr_handle_tas_abort(struct se_cmd *cmd, int tas)
{
unsigned long flags;
bool remove = true, send_tas;
@@ -91,7 +91,7 @@ static void core_tmr_handle_tas_abort(struct se_cmd *cmd, int tas)
transport_send_task_abort(cmd);
}
- transport_cmd_finish_abort(cmd, remove);
+ return transport_cmd_finish_abort(cmd, remove);
}
static int target_check_cdb_and_preempt(struct list_head *list,
@@ -184,8 +184,8 @@ void core_tmr_abort_task(
cancel_work_sync(&se_cmd->work);
transport_wait_for_tasks(se_cmd);
- transport_cmd_finish_abort(se_cmd, true);
- target_put_sess_cmd(se_cmd);
+ if (!transport_cmd_finish_abort(se_cmd, true))
+ target_put_sess_cmd(se_cmd);
printk("ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for"
" ref_tag: %llu\n", ref_tag);
@@ -281,8 +281,8 @@ static void core_tmr_drain_tmr_list(
cancel_work_sync(&cmd->work);
transport_wait_for_tasks(cmd);
- transport_cmd_finish_abort(cmd, 1);
- target_put_sess_cmd(cmd);
+ if (!transport_cmd_finish_abort(cmd, 1))
+ target_put_sess_cmd(cmd);
}
}
@@ -380,8 +380,8 @@ static void core_tmr_drain_state_list(
cancel_work_sync(&cmd->work);
transport_wait_for_tasks(cmd);
- core_tmr_handle_tas_abort(cmd, tas);
- target_put_sess_cmd(cmd);
+ if (!core_tmr_handle_tas_abort(cmd, tas))
+ target_put_sess_cmd(cmd);
}
}
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 6025935036c9..f1b3a46bdcaf 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -651,9 +651,10 @@ static void transport_lun_remove_cmd(struct se_cmd *cmd)
percpu_ref_put(&lun->lun_ref);
}
-void transport_cmd_finish_abort(struct se_cmd *cmd, int remove)
+int transport_cmd_finish_abort(struct se_cmd *cmd, int remove)
{
bool ack_kref = (cmd->se_cmd_flags & SCF_ACK_KREF);
+ int ret = 0;
if (cmd->se_cmd_flags & SCF_SE_LUN_CMD)
transport_lun_remove_cmd(cmd);
@@ -665,9 +666,11 @@ void transport_cmd_finish_abort(struct se_cmd *cmd, int remove)
cmd->se_tfo->aborted_task(cmd);
if (transport_cmd_check_stop_to_fabric(cmd))
- return;
+ return 1;
if (remove && ack_kref)
- transport_put_cmd(cmd);
+ ret = transport_put_cmd(cmd);
+
+ return ret;
}
static void target_complete_failure_work(struct work_struct *work)
diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index 31885f20fc15..cc047de72e2a 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c
@@ -184,7 +184,7 @@ static void hdlcdev_exit(struct slgt_info *info);
struct cond_wait {
struct cond_wait *next;
wait_queue_head_t q;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
unsigned int data;
};
static void init_cond_wait(struct cond_wait *w, unsigned int data);
diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c
index 49d685ad0da9..45b554032332 100644
--- a/drivers/usb/gadget/composite.c
+++ b/drivers/usb/gadget/composite.c
@@ -315,6 +315,9 @@ void usb_remove_function(struct usb_configuration *c, struct usb_function *f)
list_del(&f->list);
if (f->unbind)
f->unbind(c, f);
+
+ if (f->bind_deactivated)
+ usb_function_activate(f);
}
EXPORT_SYMBOL_GPL(usb_remove_function);
@@ -956,12 +959,8 @@ static void remove_config(struct usb_composite_dev *cdev,
f = list_first_entry(&config->functions,
struct usb_function, list);
- list_del(&f->list);
- if (f->unbind) {
- DBG(cdev, "unbind function '%s'/%p\n", f->name, f);
- f->unbind(config, f);
- /* may free memory for "f" */
- }
+
+ usb_remove_function(config, f);
}
list_del(&config->list);
if (config->unbind) {
diff --git a/drivers/usb/gadget/function/f_phonet.c b/drivers/usb/gadget/function/f_phonet.c
index b4058f0000e4..6a1ce6a55158 100644
--- a/drivers/usb/gadget/function/f_phonet.c
+++ b/drivers/usb/gadget/function/f_phonet.c
@@ -281,7 +281,7 @@ static void pn_net_setup(struct net_device *dev)
dev->tx_queue_len = 1;
dev->netdev_ops = &pn_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->header_ops = &phonet_header_ops;
}
diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
index b9ca0a26cbd9..684900fcfe24 100644
--- a/drivers/usb/gadget/legacy/inode.c
+++ b/drivers/usb/gadget/legacy/inode.c
@@ -1183,8 +1183,10 @@ dev_release (struct inode *inode, struct file *fd)
/* closing ep0 === shutdown all */
- if (dev->gadget_registered)
+ if (dev->gadget_registered) {
usb_gadget_unregister_driver (&gadgetfs_driver);
+ dev->gadget_registered = false;
+ }
/* at this point "good" hardware has disconnected the
* device from USB; the host won't see it any more.
@@ -1677,9 +1679,10 @@ static void
gadgetfs_suspend (struct usb_gadget *gadget)
{
struct dev_data *dev = get_gadget_data (gadget);
+ unsigned long flags;
INFO (dev, "suspended from state %d\n", dev->state);
- spin_lock (&dev->lock);
+ spin_lock_irqsave(&dev->lock, flags);
switch (dev->state) {
case STATE_DEV_SETUP: // VERY odd... host died??
case STATE_DEV_CONNECTED:
@@ -1690,7 +1693,7 @@ gadgetfs_suspend (struct usb_gadget *gadget)
default:
break;
}
- spin_unlock (&dev->lock);
+ spin_unlock_irqrestore(&dev->lock, flags);
}
static struct usb_gadget_driver gadgetfs_driver = {
diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c
index ccabb51cb98d..7635fd7cc328 100644
--- a/drivers/usb/gadget/udc/dummy_hcd.c
+++ b/drivers/usb/gadget/udc/dummy_hcd.c
@@ -442,23 +442,16 @@ static void set_link_state(struct dummy_hcd *dum_hcd)
/* Report reset and disconnect events to the driver */
if (dum->driver && (disconnect || reset)) {
stop_activity(dum);
- spin_unlock(&dum->lock);
if (reset)
usb_gadget_udc_reset(&dum->gadget, dum->driver);
else
dum->driver->disconnect(&dum->gadget);
- spin_lock(&dum->lock);
}
} else if (dum_hcd->active != dum_hcd->old_active) {
- if (dum_hcd->old_active && dum->driver->suspend) {
- spin_unlock(&dum->lock);
+ if (dum_hcd->old_active && dum->driver->suspend)
dum->driver->suspend(&dum->gadget);
- spin_lock(&dum->lock);
- } else if (!dum_hcd->old_active && dum->driver->resume) {
- spin_unlock(&dum->lock);
+ else if (!dum_hcd->old_active && dum->driver->resume)
dum->driver->resume(&dum->gadget);
- spin_lock(&dum->lock);
- }
}
dum_hcd->old_status = dum_hcd->port_status;
@@ -983,7 +976,9 @@ static int dummy_udc_stop(struct usb_gadget *g)
struct dummy_hcd *dum_hcd = gadget_to_dummy_hcd(g);
struct dummy *dum = dum_hcd->dum;
+ spin_lock_irq(&dum->lock);
dum->driver = NULL;
+ spin_unlock_irq(&dum->lock);
return 0;
}
diff --git a/drivers/usb/gadget/udc/net2280.c b/drivers/usb/gadget/udc/net2280.c
index 6cf07857eaca..f2cbd7f8005e 100644
--- a/drivers/usb/gadget/udc/net2280.c
+++ b/drivers/usb/gadget/udc/net2280.c
@@ -2470,11 +2470,8 @@ static void stop_activity(struct net2280 *dev, struct usb_gadget_driver *driver)
nuke(&dev->ep[i]);
/* report disconnect; the driver is already quiesced */
- if (driver) {
- spin_unlock(&dev->lock);
+ if (driver)
driver->disconnect(&dev->gadget);
- spin_lock(&dev->lock);
- }
usb_reinit(dev);
}
@@ -3348,8 +3345,6 @@ next_endpoints:
BIT(PCI_RETRY_ABORT_INTERRUPT))
static void handle_stat1_irqs(struct net2280 *dev, u32 stat)
-__releases(dev->lock)
-__acquires(dev->lock)
{
struct net2280_ep *ep;
u32 tmp, num, mask, scratch;
@@ -3390,14 +3385,12 @@ __acquires(dev->lock)
if (disconnect || reset) {
stop_activity(dev, dev->driver);
ep0_start(dev);
- spin_unlock(&dev->lock);
if (reset)
usb_gadget_udc_reset
(&dev->gadget, dev->driver);
else
(dev->driver->disconnect)
(&dev->gadget);
- spin_lock(&dev->lock);
return;
}
}
diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c
index 1f1687e888d6..fddf2731f798 100644
--- a/drivers/usb/host/xhci-mem.c
+++ b/drivers/usb/host/xhci-mem.c
@@ -2119,11 +2119,12 @@ static void xhci_add_in_port(struct xhci_hcd *xhci, unsigned int num_ports,
{
u32 temp, port_offset, port_count;
int i;
- u8 major_revision;
+ u8 major_revision, minor_revision;
struct xhci_hub *rhub;
temp = readl(addr);
major_revision = XHCI_EXT_PORT_MAJOR(temp);
+ minor_revision = XHCI_EXT_PORT_MINOR(temp);
if (major_revision == 0x03) {
rhub = &xhci->usb3_rhub;
@@ -2137,7 +2138,9 @@ static void xhci_add_in_port(struct xhci_hcd *xhci, unsigned int num_ports,
return;
}
rhub->maj_rev = XHCI_EXT_PORT_MAJOR(temp);
- rhub->min_rev = XHCI_EXT_PORT_MINOR(temp);
+
+ if (rhub->min_rev < minor_revision)
+ rhub->min_rev = minor_revision;
/* Port offset and count in the third dword, see section 7.2 */
temp = readl(addr + 2);
diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index 4842be5687a7..783e6687bf4a 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -201,6 +201,9 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci)
if (pdev->vendor == PCI_VENDOR_ID_ASMEDIA &&
pdev->device == 0x1042)
xhci->quirks |= XHCI_BROKEN_STREAMS;
+ if (pdev->vendor == PCI_VENDOR_ID_ASMEDIA &&
+ pdev->device == 0x1142)
+ xhci->quirks |= XHCI_TRUST_TX_LENGTH;
if (pdev->vendor == PCI_VENDOR_ID_TI && pdev->device == 0x8241)
xhci->quirks |= XHCI_LIMIT_ENDPOINT_INTERVAL_7;
diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c
index 27c89cd5d70b..4797217e5e72 100644
--- a/drivers/vfio/virqfd.c
+++ b/drivers/vfio/virqfd.c
@@ -43,7 +43,7 @@ static void virqfd_deactivate(struct virqfd *virqfd)
queue_work(vfio_irqfd_cleanup_wq, &virqfd->shutdown);
}
-static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int virqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
struct virqfd *virqfd = container_of(wait, struct virqfd, wait);
unsigned long flags = (unsigned long)key;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 042030e5a035..e4613a3c362d 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -165,7 +165,7 @@ static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
add_wait_queue(wqh, &poll->wait);
}
-static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
+static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync,
void *key)
{
struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index f55671d53f28..f72095868b93 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -31,7 +31,7 @@ struct vhost_work {
struct vhost_poll {
poll_table table;
wait_queue_head_t *wqh;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
struct vhost_work work;
unsigned long mask;
struct vhost_dev *dev;
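Aside, not part of the diff: the wait_queue_t to wait_queue_entry_t churn throughout this series is a pure rename and the waitqueue API is otherwise unchanged. A hypothetical sleeper under the new spelling (example_wait is an invented name):

#include <linux/wait.h>
#include <linux/sched.h>

static void example_wait(wait_queue_head_t *wq)
{
	wait_queue_entry_t wait;

	init_waitqueue_entry(&wait, current);
	add_wait_queue(wq, &wait);
	set_current_state(TASK_INTERRUPTIBLE);
	schedule();			/* woken by wake_up(wq) */
	remove_wait_queue(wq, &wait);
}

The matching list-head rename, wait->task_list becoming wait->entry, shows up in the cachefiles hunk below.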
diff --git a/drivers/video/fbdev/core/fbmon.c b/drivers/video/fbdev/core/fbmon.c
index 687ebb053438..41d7979d81c5 100644
--- a/drivers/video/fbdev/core/fbmon.c
+++ b/drivers/video/fbdev/core/fbmon.c
@@ -1048,7 +1048,7 @@ void fb_edid_add_monspecs(unsigned char *edid, struct fb_monspecs *specs)
for (i = 0; i < (128 - edid[2]) / DETAILED_TIMING_DESCRIPTION_SIZE;
i++, block += DETAILED_TIMING_DESCRIPTION_SIZE)
- if (PIXEL_CLOCK)
+ if (PIXEL_CLOCK != 0)
edt[num++] = block - edid;
/* Yikes, EDID data is totally useless */
diff --git a/drivers/video/fbdev/smscufx.c b/drivers/video/fbdev/smscufx.c
index ec2e7e353685..449fceaf79d5 100644
--- a/drivers/video/fbdev/smscufx.c
+++ b/drivers/video/fbdev/smscufx.c
@@ -1646,8 +1646,9 @@ static int ufx_usb_probe(struct usb_interface *interface,
dev_dbg(dev->gdev, "%s %s - serial #%s\n",
usbdev->manufacturer, usbdev->product, usbdev->serial);
dev_dbg(dev->gdev, "vid_%04x&pid_%04x&rev_%04x driver's ufx_data struct at %p\n",
- usbdev->descriptor.idVendor, usbdev->descriptor.idProduct,
- usbdev->descriptor.bcdDevice, dev);
+ le16_to_cpu(usbdev->descriptor.idVendor),
+ le16_to_cpu(usbdev->descriptor.idProduct),
+ le16_to_cpu(usbdev->descriptor.bcdDevice), dev);
dev_dbg(dev->gdev, "console enable=%d\n", console);
dev_dbg(dev->gdev, "fb_defio enable=%d\n", fb_defio);
diff --git a/drivers/video/fbdev/udlfb.c b/drivers/video/fbdev/udlfb.c
index 6a3c353de7c3..05ef657235df 100644
--- a/drivers/video/fbdev/udlfb.c
+++ b/drivers/video/fbdev/udlfb.c
@@ -1105,8 +1105,8 @@ static int dlfb_ops_blank(int blank_mode, struct fb_info *info)
char *bufptr;
struct urb *urb;
- pr_info("/dev/fb%d FB_BLANK mode %d --> %d\n",
- info->node, dev->blank_mode, blank_mode);
+ pr_debug("/dev/fb%d FB_BLANK mode %d --> %d\n",
+ info->node, dev->blank_mode, blank_mode);
if ((dev->blank_mode == FB_BLANK_POWERDOWN) &&
(blank_mode != FB_BLANK_POWERDOWN)) {
@@ -1613,8 +1613,9 @@ static int dlfb_usb_probe(struct usb_interface *interface,
pr_info("%s %s - serial #%s\n",
usbdev->manufacturer, usbdev->product, usbdev->serial);
pr_info("vid_%04x&pid_%04x&rev_%04x driver's dlfb_data struct at %p\n",
- usbdev->descriptor.idVendor, usbdev->descriptor.idProduct,
- usbdev->descriptor.bcdDevice, dev);
+ le16_to_cpu(usbdev->descriptor.idVendor),
+ le16_to_cpu(usbdev->descriptor.idProduct),
+ le16_to_cpu(usbdev->descriptor.bcdDevice), dev);
pr_info("console enable=%d\n", console);
pr_info("fb_defio enable=%d\n", fb_defio);
pr_info("shadow enable=%d\n", shadow);
diff --git a/drivers/video/fbdev/via/viafbdev.c b/drivers/video/fbdev/via/viafbdev.c
index f9718f012aae..badee04ef496 100644
--- a/drivers/video/fbdev/via/viafbdev.c
+++ b/drivers/video/fbdev/via/viafbdev.c
@@ -1630,16 +1630,14 @@ static void viafb_init_proc(struct viafb_shared *shared)
}
static void viafb_remove_proc(struct viafb_shared *shared)
{
- struct proc_dir_entry *viafb_entry = shared->proc_entry,
- *iga1_entry = shared->iga1_proc_entry,
- *iga2_entry = shared->iga2_proc_entry;
+ struct proc_dir_entry *viafb_entry = shared->proc_entry;
if (!viafb_entry)
return;
- remove_proc_entry("output_devices", iga2_entry);
+ remove_proc_entry("output_devices", shared->iga2_proc_entry);
remove_proc_entry("iga2", viafb_entry);
- remove_proc_entry("output_devices", iga1_entry);
+ remove_proc_entry("output_devices", shared->iga1_proc_entry);
remove_proc_entry("iga1", viafb_entry);
remove_proc_entry("supported_output_devices", viafb_entry);
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 408c174ef0d5..22caf808bfab 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -663,6 +663,12 @@ static int virtballoon_restore(struct virtio_device *vdev)
}
#endif
+static int virtballoon_validate(struct virtio_device *vdev)
+{
+ __virtio_clear_bit(vdev, VIRTIO_F_IOMMU_PLATFORM);
+ return 0;
+}
+
static unsigned int features[] = {
VIRTIO_BALLOON_F_MUST_TELL_HOST,
VIRTIO_BALLOON_F_STATS_VQ,
@@ -675,6 +681,7 @@ static struct virtio_driver virtio_balloon_driver = {
.driver.name = KBUILD_MODNAME,
.driver.owner = THIS_MODULE,
.id_table = id_table,
+ .validate = virtballoon_validate,
.probe = virtballoon_probe,
.remove = virtballoon_remove,
.config_changed = virtballoon_changed,
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index c1ec8ee80924..9e35032351a0 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -190,6 +190,7 @@ static void do_poweroff(void)
{
switch (system_state) {
case SYSTEM_BOOTING:
+ case SYSTEM_SCHEDULING:
orderly_poweroff(true);
break;
case SYSTEM_RUNNING:
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index beef981aa54f..974f5346458a 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -83,7 +83,7 @@ struct autofs_info {
struct autofs_wait_queue {
wait_queue_head_t queue;
struct autofs_wait_queue *next;
- autofs_wqt_t wait_queue_token;
+ autofs_wqt_t wait_queue_entry_token;
/* We use the following to see what we are waiting for */
struct qstr name;
u32 dev;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 734cbf8d9676..dd9f1bebb5a3 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -344,7 +344,7 @@ static int autofs_dev_ioctl_fail(struct file *fp,
int status;
token = (autofs_wqt_t) param->fail.token;
- status = param->fail.status ? param->fail.status : -ENOENT;
+ status = param->fail.status < 0 ? param->fail.status : -ENOENT;
return autofs4_wait_release(sbi, token, status);
}
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 24a58bf9ca72..7071895b0678 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -104,7 +104,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
size_t pktsz;
pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n",
- (unsigned long) wq->wait_queue_token,
+ (unsigned long) wq->wait_queue_entry_token,
wq->name.len, wq->name.name, type);
memset(&pkt, 0, sizeof(pkt)); /* For security reasons */
@@ -120,7 +120,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
pktsz = sizeof(*mp);
- mp->wait_queue_token = wq->wait_queue_token;
+ mp->wait_queue_entry_token = wq->wait_queue_entry_token;
mp->len = wq->name.len;
memcpy(mp->name, wq->name.name, wq->name.len);
mp->name[wq->name.len] = '\0';
@@ -133,7 +133,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
pktsz = sizeof(*ep);
- ep->wait_queue_token = wq->wait_queue_token;
+ ep->wait_queue_entry_token = wq->wait_queue_entry_token;
ep->len = wq->name.len;
memcpy(ep->name, wq->name.name, wq->name.len);
ep->name[wq->name.len] = '\0';
@@ -153,7 +153,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
pktsz = sizeof(*packet);
- packet->wait_queue_token = wq->wait_queue_token;
+ packet->wait_queue_entry_token = wq->wait_queue_entry_token;
packet->len = wq->name.len;
memcpy(packet->name, wq->name.name, wq->name.len);
packet->name[wq->name.len] = '\0';
@@ -428,7 +428,7 @@ int autofs4_wait(struct autofs_sb_info *sbi,
return -ENOMEM;
}
- wq->wait_queue_token = autofs4_next_wait_queue;
+ wq->wait_queue_entry_token = autofs4_next_wait_queue;
if (++autofs4_next_wait_queue == 0)
autofs4_next_wait_queue = 1;
wq->next = sbi->queues;
@@ -461,7 +461,7 @@ int autofs4_wait(struct autofs_sb_info *sbi,
}
pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
- (unsigned long) wq->wait_queue_token, wq->name.len,
+ (unsigned long) wq->wait_queue_entry_token, wq->name.len,
wq->name.name, notify);
/*
@@ -471,7 +471,7 @@ int autofs4_wait(struct autofs_sb_info *sbi,
} else {
wq->wait_ctr++;
pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n",
- (unsigned long) wq->wait_queue_token, wq->name.len,
+ (unsigned long) wq->wait_queue_entry_token, wq->name.len,
wq->name.name, notify);
mutex_unlock(&sbi->wq_mutex);
kfree(qstr.name);
@@ -550,13 +550,13 @@ int autofs4_wait(struct autofs_sb_info *sbi,
}
-int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status)
+int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_entry_token, int status)
{
struct autofs_wait_queue *wq, **wql;
mutex_lock(&sbi->wq_mutex);
for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) {
- if (wq->wait_queue_token == wait_queue_token)
+ if (wq->wait_queue_entry_token == wait_queue_entry_token)
break;
}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2c5f08696fff..a7df151f8aba 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -264,7 +264,10 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
kfree(vecs);
if (unlikely(bio.bi_status))
- return blk_status_to_errno(bio.bi_status);
+ ret = blk_status_to_errno(bio.bi_status);
+
+ bio_uninit(&bio);
+
return ret;
}
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index a97fdc156a03..baacc1866861 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -38,6 +38,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
{
SHASH_DESC_ON_STACK(shash, tfm);
u32 *ctx = (u32 *)shash_desc_ctx(shash);
+ u32 retval;
int err;
shash->tfm = tfm;
@@ -47,5 +48,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
err = crypto_shash_update(shash, address, length);
BUG_ON(err);
- return *ctx;
+ retval = *ctx;
+ barrier_data(ctx);
+ return retval;
}
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 9bf90bcc56ac..bb3a02ca9da4 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -18,7 +18,7 @@
#include <linux/fscache-cache.h>
#include <linux/timer.h>
-#include <linux/wait.h>
+#include <linux/wait_bit.h>
#include <linux/cred.h>
#include <linux/workqueue.h>
#include <linux/security.h>
@@ -97,7 +97,7 @@ struct cachefiles_cache {
* backing file read tracking
*/
struct cachefiles_one_read {
- wait_queue_t monitor; /* link into monitored waitqueue */
+ wait_queue_entry_t monitor; /* link into monitored waitqueue */
struct page *back_page; /* backing file page we're waiting for */
struct page *netfs_page; /* netfs page we're going to fill */
struct fscache_retrieval *op; /* retrieval op covering this */
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 41df8a27d7eb..3978b324cbca 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -204,7 +204,7 @@ wait_for_old_object:
wait_queue_head_t *wq;
signed long timeout = 60 * HZ;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
bool requeue;
/* if the object we're waiting for is queued for processing,
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index afbdc418966d..18d7aa61ef0f 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -21,7 +21,7 @@
* - we use this to detect read completion of backing pages
* - the caller holds the waitqueue lock
*/
-static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
+static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
int sync, void *_key)
{
struct cachefiles_one_read *monitor =
@@ -48,7 +48,7 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
}
/* remove from the waitqueue */
- list_del(&wait->task_list);
+ list_del(&wait->entry);
/* move onto the action list and queue for FS-Cache thread pool */
ASSERT(monitor->op);
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 987044bca1c2..59cb307b15fb 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -131,6 +131,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
}
if (new_mode != old_mode) {
+ newattrs.ia_ctime = current_time(inode);
newattrs.ia_mode = new_mode;
newattrs.ia_valid = ATTR_MODE;
ret = __ceph_setattr(inode, &newattrs);
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e8f11fa565c5..7df550c13d7f 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -91,6 +91,10 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
ceph_mdsc_put_request(req);
if (!inode)
return ERR_PTR(-ESTALE);
+ if (inode->i_nlink == 0) {
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
}
return d_obtain_alias(inode);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index dcce79b84406..4de6cdddf059 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2022,7 +2022,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
attr->ia_size > inode->i_size) {
i_size_write(inode, attr->ia_size);
inode->i_blocks = calc_inode_blocks(attr->ia_size);
- inode->i_ctime = attr->ia_ctime;
ci->i_reported_size = attr->ia_size;
dirtied |= CEPH_CAP_FILE_EXCL;
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
@@ -2044,7 +2043,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
only ? "ctime only" : "ignored");
- inode->i_ctime = attr->ia_ctime;
if (only) {
/*
* if kernel wants to dirty ctime but nothing else,
@@ -2067,7 +2065,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
if (dirtied) {
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
&prealloc_cf);
- inode->i_ctime = current_time(inode);
+ inode->i_ctime = attr->ia_ctime;
}
release &= issued;
@@ -2085,6 +2083,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
req->r_inode_drop = release;
req->r_args.setattr.mask = cpu_to_le32(mask);
req->r_num_caps = 1;
+ req->r_stamp = attr->ia_ctime;
err = ceph_mdsc_do_request(mdsc, NULL, req);
}
dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index f38e56fa9712..0c05df44cc6c 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1687,7 +1687,6 @@ struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
{
struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
- struct timespec ts;
if (!req)
return ERR_PTR(-ENOMEM);
@@ -1706,8 +1705,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
init_completion(&req->r_safe_completion);
INIT_LIST_HEAD(&req->r_unsafe_item);
- ktime_get_real_ts(&ts);
- req->r_stamp = timespec_trunc(ts, mdsc->fsc->sb->s_time_gran);
+ req->r_stamp = timespec_trunc(current_kernel_time(), mdsc->fsc->sb->s_time_gran);
req->r_op = op;
req->r_direct_mode = mode;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0fd081bd2a2f..fcef70602b27 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3271,7 +3271,7 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
if (!is_sync_kiocb(iocb))
ctx->iocb = iocb;
- if (to->type & ITER_IOVEC)
+ if (to->type == ITER_IOVEC)
ctx->should_dirty = true;
rc = setup_aio_ctx_iter(ctx, to, READ);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 4d1fcd76d022..a8693632235f 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -24,6 +24,7 @@
#include <linux/pagemap.h>
#include <linux/freezer.h>
#include <linux/sched/signal.h>
+#include <linux/wait_bit.h>
#include <asm/div64.h>
#include "cifsfs.h"
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index b08531977daa..3b147dc6af63 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -810,7 +810,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
if (!pages) {
pages = vmalloc(max_pages * sizeof(struct page *));
- if (!bv) {
+ if (!pages) {
kvfree(bv);
return -ENOMEM;
}
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 27bc360c7ffd..a723df3e0197 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -849,8 +849,13 @@ cifs_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid *fid, __u16 search_flags,
struct cifs_search_info *srch_inf)
{
- return CIFSFindFirst(xid, tcon, path, cifs_sb,
- &fid->netfid, search_flags, srch_inf, true);
+ int rc;
+
+ rc = CIFSFindFirst(xid, tcon, path, cifs_sb,
+ &fid->netfid, search_flags, srch_inf, true);
+ if (rc)
+ cifs_dbg(FYI, "find first failed=%d\n", rc);
+ return rc;
}
static int
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index c58691834eb2..7e48561abd29 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -982,7 +982,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
kfree(utf16_path);
if (rc) {
- cifs_dbg(VFS, "open dir failed\n");
+ cifs_dbg(FYI, "open dir failed rc=%d\n", rc);
return rc;
}
@@ -992,7 +992,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_query_directory(xid, tcon, fid->persistent_fid,
fid->volatile_fid, 0, srch_inf);
if (rc) {
- cifs_dbg(VFS, "query directory failed\n");
+ cifs_dbg(FYI, "query directory failed rc=%d\n", rc);
SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
}
return rc;
@@ -1809,7 +1809,8 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc)
sg = init_sg(rqst, sign);
if (!sg) {
- cifs_dbg(VFS, "%s: Failed to init sg %d", __func__, rc);
+ cifs_dbg(VFS, "%s: Failed to init sg", __func__);
+ rc = -ENOMEM;
goto free_req;
}
@@ -1817,6 +1818,7 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc)
iv = kzalloc(iv_len, GFP_KERNEL);
if (!iv) {
cifs_dbg(VFS, "%s: Failed to alloc IV", __func__);
+ rc = -ENOMEM;
goto free_sg;
}
iv[0] = 3;
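
Both hunks above set rc to -ENOMEM before jumping to the cleanup label; without that, the function could return whatever stale value rc last held even though the allocation failed. A small self-contained sketch of the goto-cleanup convention this relies on (buffer names and sizes are made up):

    #include <errno.h>
    #include <stdlib.h>

    /* Every failure path must set the return code before jumping,
     * or the caller sees a stale (possibly success) value. */
    static int build_message(size_t n, char **out)
    {
        char *hdr = NULL, *body = NULL;
        int rc = 0;

        hdr = malloc(64);
        if (!hdr) {
            rc = -ENOMEM;            /* set rc *before* the jump */
            goto free_none;
        }

        body = malloc(n);
        if (!body) {
            rc = -ENOMEM;            /* the kind of assignment the hunks above add */
            goto free_hdr;
        }

        *out = body;
        free(hdr);
        return 0;

    free_hdr:
        free(hdr);
    free_none:
        return rc;
    }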
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 3cb5c9e2d4e7..de50e749ff05 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -188,8 +188,6 @@ static int cifs_creation_time_get(struct dentry *dentry, struct inode *inode,
pcreatetime = (__u64 *)value;
*pcreatetime = CIFS_I(inode)->createtime;
return sizeof(__u64);
-
- return rc;
}
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 8b2a994042dd..a66f6624d899 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -138,6 +138,14 @@ struct config_item *config_item_get(struct config_item *item)
}
EXPORT_SYMBOL(config_item_get);
+struct config_item *config_item_get_unless_zero(struct config_item *item)
+{
+ if (item && kref_get_unless_zero(&item->ci_kref))
+ return item;
+ return NULL;
+}
+EXPORT_SYMBOL(config_item_get_unless_zero);
+
static void config_item_cleanup(struct config_item *item)
{
struct config_item_type *t = item->ci_type;
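
config_item_get_unless_zero() only takes a reference when the refcount has not already hit zero, which is what lets callers such as the symlink path below defer the get until after their validity checks. A userspace analogue of the underlying kref_get_unless_zero() compare-and-swap loop, using C11 atomics (struct and function names here are illustrative):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct item {
        atomic_uint refcount;
        /* ... payload ... */
    };

    /* Analogue of kref_get_unless_zero(): succeed only if the object still
     * holds at least one reference, i.e. nobody is in the middle of freeing it. */
    static bool item_get_unless_zero(struct item *it)
    {
        unsigned int old = atomic_load(&it->refcount);

        while (old != 0) {
            if (atomic_compare_exchange_weak(&it->refcount, &old, old + 1))
                return true;          /* took a reference */
            /* old was reloaded by the failed CAS; retry */
        }
        return false;                 /* already dead, caller must not use it */
    }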
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index a6ab012a2c6a..c8aabba502f6 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -83,14 +83,13 @@ static int create_link(struct config_item *parent_item,
ret = -ENOMEM;
sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
if (sl) {
- sl->sl_target = config_item_get(item);
spin_lock(&configfs_dirent_lock);
if (target_sd->s_type & CONFIGFS_USET_DROPPING) {
spin_unlock(&configfs_dirent_lock);
- config_item_put(item);
kfree(sl);
return -ENOENT;
}
+ sl->sl_target = config_item_get(item);
list_add(&sl->sl_list, &target_sd->s_links);
spin_unlock(&configfs_dirent_lock);
ret = configfs_create_link(sl, parent_item->ci_dentry,
diff --git a/fs/dax.c b/fs/dax.c
index 2a6889b3585f..33d05aa02aad 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -84,7 +84,7 @@ struct exceptional_entry_key {
};
struct wait_exceptional_entry_queue {
- wait_queue_t wait;
+ wait_queue_entry_t wait;
struct exceptional_entry_key key;
};
@@ -108,7 +108,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
return wait_table + hash;
}
-static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
+static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode,
int sync, void *keyp)
{
struct exceptional_entry_key *key = keyp;
@@ -859,6 +859,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
if (ret < 0)
goto out;
}
+ start_index = indices[pvec.nr - 1] + 1;
}
out:
put_dax(dax_dev);
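
The added start_index update makes each pass of the writeback loop resume just past the last entry returned by the previous batch, so the scan always makes forward progress instead of re-finding the same entries. The same cursor pattern in a self-contained form (the find_batch() helper is a hypothetical stand-in for the pagevec lookup):

    #include <stddef.h>
    #include <stdio.h>

    #define NENTRIES 100
    #define BATCH    4

    static int present[NENTRIES];   /* sparse index space: nonzero slots are populated */

    /* Hypothetical lookup: fill idx[] with up to 'max' populated indices >= start,
     * return how many were found. */
    static size_t find_batch(size_t start, size_t end, size_t *idx, size_t max)
    {
        size_t n = 0;

        for (size_t i = start; i <= end && n < max; i++)
            if (present[i])
                idx[n++] = i;
        return n;
    }

    int main(void)
    {
        size_t start_index = 0, end_index = NENTRIES - 1;
        size_t idx[BATCH];

        present[3] = present[4] = present[40] = present[90] = 1;

        for (;;) {
            size_t n = find_batch(start_index, end_index, idx, BATCH);

            if (n == 0)
                break;
            for (size_t i = 0; i < n; i++)
                printf("flush entry %zu\n", idx[i]);
            /* resume past the last entry returned, as in the hunk above */
            start_index = idx[n - 1] + 1;
        }
        return 0;
    }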
diff --git a/fs/dcache.c b/fs/dcache.c
index cddf39777835..a9f995f6859e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1494,7 +1494,7 @@ static void check_and_drop(void *_data)
{
struct detach_data *data = _data;
- if (!data->mountpoint && !data->select.found)
+ if (!data->mountpoint && list_empty(&data->select.dispose))
__d_drop(data->select.start);
}
@@ -1536,17 +1536,15 @@ void d_invalidate(struct dentry *dentry)
d_walk(dentry, &data, detach_and_collect, check_and_drop);
- if (data.select.found)
+ if (!list_empty(&data.select.dispose))
shrink_dentry_list(&data.select.dispose);
+ else if (!data.mountpoint)
+ return;
if (data.mountpoint) {
detach_mounts(data.mountpoint);
dput(data.mountpoint);
}
-
- if (!data.mountpoint && !data.select.found)
- break;
-
cond_resched();
}
}
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 68b9fffcb2c8..9736df2ce89d 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -191,7 +191,7 @@ static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
* This is used to atomically remove a wait queue entry from the eventfd wait
* queue head, and read/reset the counter value.
*/
-int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
+int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
__u64 *cnt)
{
unsigned long flags;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 5420767c9b68..b1c8e23ddf65 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -244,7 +244,7 @@ struct eppoll_entry {
* Wait queue item that will be linked to the target file wait
* queue head.
*/
- wait_queue_t wait;
+ wait_queue_entry_t wait;
/* The wait queue head that linked the "wait" wait queue item */
wait_queue_head_t *whead;
@@ -347,13 +347,13 @@ static inline int ep_is_linked(struct list_head *p)
return !list_empty(p);
}
-static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p)
+static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
{
return container_of(p, struct eppoll_entry, wait);
}
/* Get the "struct epitem" from a wait queue pointer */
-static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
+static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
{
return container_of(p, struct eppoll_entry, wait)->base;
}
@@ -1078,7 +1078,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
* mechanism. It is called by the stored file descriptors when they
* have events to report.
*/
-static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
unsigned long flags;
@@ -1094,7 +1094,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
* can't use __remove_wait_queue(). whead->lock is held by
* the caller.
*/
- list_del_init(&wait->task_list);
+ list_del_init(&wait->entry);
}
spin_lock_irqsave(&ep->lock, flags);
@@ -1699,7 +1699,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int res = 0, eavail, timed_out = 0;
unsigned long flags;
u64 slack = 0;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
if (timeout > 0) {
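
ep_pwq_from_wait() and ep_item_from_wait() recover the enclosing eppoll_entry from the embedded wait queue entry that the wakeup path hands them; only the embedded type and field names change in this rename. The container_of() idiom they use, in a runnable userspace form (struct names are illustrative):

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct wait_entry {                 /* stands in for wait_queue_entry_t */
        int flags;
    };

    struct poll_item {                  /* stands in for struct eppoll_entry */
        int fd;
        struct wait_entry wait;         /* embedded entry handed to the callback */
    };

    /* The callback only sees the embedded entry, but can walk back to its owner. */
    static void wake(struct wait_entry *w)
    {
        struct poll_item *item = container_of(w, struct poll_item, wait);

        printf("woken for fd %d\n", item->fd);
    }

    int main(void)
    {
        struct poll_item item = { .fd = 42 };

        wake(&item.wait);
        return 0;
    }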
diff --git a/fs/exec.c b/fs/exec.c
index 72934df68471..904199086490 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -220,8 +220,26 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
if (write) {
unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
+ unsigned long ptr_size;
struct rlimit *rlim;
+ /*
+ * Since the stack will hold pointers to the strings, we
+ * must account for them as well.
+ *
+ * The size calculation is the entire vma while each arg page is
+ * built, so each time we get here it's calculating how far it
+ * is currently (rather than each call being just the newly
+ * added size from the arg page). As a result, we need to
+ * always add the entire size of the pointers, so that on the
+ * last call to get_arg_page() we'll actually have the entire
+ * correct size.
+ */
+ ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
+ if (ptr_size > ULONG_MAX - size)
+ goto fail;
+ size += ptr_size;
+
acct_arg_size(bprm, size / PAGE_SIZE);
/*
@@ -239,13 +257,15 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
* to work from.
*/
rlim = current->signal->rlim;
- if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) {
- put_page(page);
- return NULL;
- }
+ if (size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4)
+ goto fail;
}
return page;
+
+fail:
+ put_page(page);
+ return NULL;
}
static void put_arg_page(struct page *page)
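
The new ptr_size accounting charges the argv/envp pointer array against the stack limit as well, and the comparison against ULONG_MAX guards the addition itself from wrapping. The overflow test in isolation, as plain C with nothing kernel-specific:

    #include <limits.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Add the pointer-array size to the running total only if it cannot wrap. */
    static bool account_ptrs(unsigned long *size, unsigned long argc,
                             unsigned long envc)
    {
        unsigned long ptr_size = (argc + envc) * sizeof(void *);

        if (ptr_size > ULONG_MAX - *size)
            return false;          /* would overflow: treat as over the limit */
        *size += ptr_size;
        return true;
    }

    int main(void)
    {
        unsigned long size = 4096;

        printf("ok=%d size=%lu\n", account_ptrs(&size, 10, 20), size);
        size = ULONG_MAX - 8;
        printf("ok=%d\n", account_ptrs(&size, 10, 20));
        return 0;
    }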
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 2185c7a040a1..fd2e651bad6d 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1078,6 +1078,7 @@ static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address,
{
SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver);
u32 *ctx = (u32 *)shash_desc_ctx(shash);
+ u32 retval;
int err;
shash->tfm = sbi->s_chksum_driver;
@@ -1087,7 +1088,9 @@ static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address,
err = crypto_shash_update(shash, address, length);
BUG_ON(err);
- return *ctx;
+ retval = *ctx;
+ barrier_data(ctx);
+ return retval;
}
static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc,
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index 611b5408f6ec..e747b3d720ee 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -34,7 +34,7 @@ void pin_insert(struct fs_pin *pin, struct vfsmount *m)
void pin_kill(struct fs_pin *p)
{
- wait_queue_t wait;
+ wait_queue_entry_t wait;
if (!p) {
rcu_read_unlock();
@@ -61,7 +61,7 @@ void pin_kill(struct fs_pin *p)
rcu_read_unlock();
schedule();
rcu_read_lock();
- if (likely(list_empty(&wait.task_list)))
+ if (likely(list_empty(&wait.entry)))
break;
/* OK, we know p couldn't have been freed yet */
spin_lock_irq(&p->wait.lock);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index dde861387a40..d44f5456eb9b 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -200,7 +200,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
diff --git a/fs/inode.c b/fs/inode.c
index f0e5fc77e6a4..ab3b9a795c0b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1892,11 +1892,11 @@ static void __wait_on_freeing_inode(struct inode *inode)
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
wq = bit_waitqueue(&inode->i_state, __I_NEW);
- prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_hash_lock);
schedule();
- finish_wait(wq, &wait.wait);
+ finish_wait(wq, &wait.wq_entry);
spin_lock(&inode_hash_lock);
}
@@ -2039,11 +2039,11 @@ static void __inode_dio_wait(struct inode *inode)
DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
do {
- prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
if (atomic_read(&inode->i_dio_count))
schedule();
} while (atomic_read(&inode->i_dio_count));
- finish_wait(wq, &q.wait);
+ finish_wait(wq, &q.wq_entry);
}
/**
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index ebad34266bcf..7d5ef3bf3f3e 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2579,10 +2579,10 @@ restart:
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
- prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&journal->j_list_lock);
schedule();
- finish_wait(wq, &wait.wait);
+ finish_wait(wq, &wait.wq_entry);
goto restart;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index 8bd3e4d448b9..5a4438445bf7 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3488,6 +3488,8 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
return err;
}
+ put_mnt_ns(old_mnt_ns);
+
/* Update the pwd and root */
set_fs_pwd(fs, &root);
set_fs_root(fs, &root);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index c14758e08d73..390ac9c39c59 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -753,7 +753,6 @@ static void nfs4_callback_free_slot(struct nfs4_session *session,
* A single slot, so highest used slotid is either 0 or -1
*/
nfs4_free_slot(tbl, slot);
- nfs4_slot_tbl_drain_complete(tbl);
spin_unlock(&tbl->slot_tbl_lock);
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 32ccd7754f8a..2ac00bf4ecf1 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1946,29 +1946,6 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
}
EXPORT_SYMBOL_GPL(nfs_link);
-static void
-nfs_complete_rename(struct rpc_task *task, struct nfs_renamedata *data)
-{
- struct dentry *old_dentry = data->old_dentry;
- struct dentry *new_dentry = data->new_dentry;
- struct inode *old_inode = d_inode(old_dentry);
- struct inode *new_inode = d_inode(new_dentry);
-
- nfs_mark_for_revalidate(old_inode);
-
- switch (task->tk_status) {
- case 0:
- if (new_inode != NULL)
- nfs_drop_nlink(new_inode);
- d_move(old_dentry, new_dentry);
- nfs_set_verifier(new_dentry,
- nfs_save_change_attribute(data->new_dir));
- break;
- case -ENOENT:
- nfs_dentry_handle_enoent(old_dentry);
- }
-}
-
/*
* RENAME
* FIXME: Some nfsds, like the Linux user space nfsd, may generate a
@@ -1999,7 +1976,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
{
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
- struct dentry *dentry = NULL;
+ struct dentry *dentry = NULL, *rehash = NULL;
struct rpc_task *task;
int error = -EBUSY;
@@ -2022,8 +1999,10 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
* To prevent any new references to the target during the
* rename, we unhash the dentry in advance.
*/
- if (!d_unhashed(new_dentry))
+ if (!d_unhashed(new_dentry)) {
d_drop(new_dentry);
+ rehash = new_dentry;
+ }
if (d_count(new_dentry) > 2) {
int err;
@@ -2040,6 +2019,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out;
new_dentry = dentry;
+ rehash = NULL;
new_inode = NULL;
}
}
@@ -2048,8 +2028,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new_inode != NULL)
NFS_PROTO(new_inode)->return_delegation(new_inode);
- task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry,
- nfs_complete_rename);
+ task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
if (IS_ERR(task)) {
error = PTR_ERR(task);
goto out;
@@ -2059,9 +2038,27 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (error == 0)
error = task->tk_status;
rpc_put_task(task);
+ nfs_mark_for_revalidate(old_inode);
out:
+ if (rehash)
+ d_rehash(rehash);
trace_nfs_rename_exit(old_dir, old_dentry,
new_dir, new_dentry, error);
+ if (!error) {
+ if (new_inode != NULL)
+ nfs_drop_nlink(new_inode);
+ /*
+ * The d_move() should be here instead of in an async RPC completion
+ * handler because we need the proper locks to move the dentry. If
+ * we're interrupted by a signal, the async RPC completion handler
+ * should mark the directories for revalidation.
+ */
+ d_move(old_dentry, new_dentry);
+ nfs_set_verifier(new_dentry,
+ nfs_save_change_attribute(new_dir));
+ } else if (error == -ENOENT)
+ nfs_dentry_handle_enoent(old_dentry);
+
/* new dentry created? */
if (dentry)
dput(dentry);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 3e24392f2caa..8701d7617964 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -7,6 +7,7 @@
#include <linux/security.h>
#include <linux/crc32.h>
#include <linux/nfs_page.h>
+#include <linux/wait_bit.h>
#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c08c46a3b8cd..98b0b662af09 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2589,7 +2589,8 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata,
/* Except for MODE, it seems harmless to set these twice. */
if (opendata->o_arg.createmode != NFS4_CREATE_EXCLUSIVE &&
- attrset[1] & FATTR4_WORD1_MODE)
+ (attrset[1] & FATTR4_WORD1_MODE ||
+ attrset[2] & FATTR4_WORD2_MODE_UMASK))
sattr->ia_valid &= ~ATTR_MODE;
if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL)
@@ -6372,7 +6373,7 @@ struct nfs4_lock_waiter {
};
static int
-nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key)
+nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key)
{
int ret;
struct cb_notify_lock_args *cbnl = key;
@@ -6415,7 +6416,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
.inode = state->inode,
.owner = &owner,
.notified = false };
- wait_queue_t wait;
+ wait_queue_entry_t wait;
/* Don't bother with waitqueue if we don't expect a callback */
if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags))
@@ -8416,6 +8417,7 @@ static void nfs4_layoutget_release(void *calldata)
size_t max_pages = max_response_pages(server);
dprintk("--> %s\n", __func__);
+ nfs4_sequence_free_slot(&lgp->res.seq_res);
nfs4_free_pages(lgp->args.layout.pages, max_pages);
pnfs_put_layout_hdr(NFS_I(inode)->layout);
put_nfs_open_context(lgp->args.ctx);
@@ -8490,7 +8492,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags)
/* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
if (status == 0 && lgp->res.layoutp->len)
lseg = pnfs_layout_process(lgp);
- nfs4_sequence_free_slot(&lgp->res.seq_res);
rpc_put_task(task);
dprintk("<-- %s status=%d\n", __func__, status);
if (status)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index b34de036501b..cbf82b0d4467 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2134,6 +2134,8 @@ again:
put_rpccred(cred);
switch (status) {
case 0:
+ case -EINTR:
+ case -ERESTARTSYS:
break;
case -ETIMEDOUT:
if (clnt->cl_softrtry)
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index febed1217b3f..70ded52dc1dd 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2161,7 +2161,7 @@ void nilfs_flush_segment(struct super_block *sb, ino_t ino)
}
struct nilfs_segctor_wait_request {
- wait_queue_t wq;
+ wait_queue_entry_t wq;
__u32 seq;
int err;
atomic_t done;
@@ -2206,8 +2206,7 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
unsigned long flags;
spin_lock_irqsave(&sci->sc_wait_request.lock, flags);
- list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list,
- wq.task_list) {
+ list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.head, wq.entry) {
if (!atomic_read(&wrq->done) &&
nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) {
wrq->err = err;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 3b7c937a36b5..4689940a953c 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2591,6 +2591,10 @@ void ocfs2_inode_unlock_tracker(struct inode *inode,
struct ocfs2_lock_res *lockres;
lockres = &OCFS2_I(inode)->ip_inode_lockres;
+ /* had_lock means that the current process already holds the cluster
+ * lock from an earlier acquisition. If had_lock is 1, there is nothing to
+ * do here; the lock will be dropped where it was originally taken.
+ */
if (!had_lock) {
ocfs2_remove_holder(lockres, oh);
ocfs2_inode_unlock(inode, ex);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 3c5384d9b3a5..f70c3778d600 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1328,20 +1328,21 @@ static int ocfs2_xattr_get(struct inode *inode,
void *buffer,
size_t buffer_size)
{
- int ret;
+ int ret, had_lock;
struct buffer_head *di_bh = NULL;
+ struct ocfs2_lock_holder oh;
- ret = ocfs2_inode_lock(inode, &di_bh, 0);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
+ had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 0, &oh);
+ if (had_lock < 0) {
+ mlog_errno(had_lock);
+ return had_lock;
}
down_read(&OCFS2_I(inode)->ip_xattr_sem);
ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
name, buffer, buffer_size);
up_read(&OCFS2_I(inode)->ip_xattr_sem);
- ocfs2_inode_unlock(inode, 0);
+ ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
brelse(di_bh);
@@ -3537,11 +3538,12 @@ int ocfs2_xattr_set(struct inode *inode,
{
struct buffer_head *di_bh = NULL;
struct ocfs2_dinode *di;
- int ret, credits, ref_meta = 0, ref_credits = 0;
+ int ret, credits, had_lock, ref_meta = 0, ref_credits = 0;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct inode *tl_inode = osb->osb_tl_inode;
struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, NULL, };
struct ocfs2_refcount_tree *ref_tree = NULL;
+ struct ocfs2_lock_holder oh;
struct ocfs2_xattr_info xi = {
.xi_name_index = name_index,
@@ -3572,8 +3574,9 @@ int ocfs2_xattr_set(struct inode *inode,
return -ENOMEM;
}
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret < 0) {
+ had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 1, &oh);
+ if (had_lock < 0) {
+ ret = had_lock;
mlog_errno(ret);
goto cleanup_nolock;
}
@@ -3670,7 +3673,7 @@ cleanup:
if (ret)
mlog_errno(ret);
}
- ocfs2_inode_unlock(inode, 1);
+ ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
cleanup_nolock:
brelse(di_bh);
brelse(xbs.xattr_bh);
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index 83b506020718..038d67545d9f 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -46,8 +46,8 @@ static void run_down(struct slot_map *m)
spin_lock(&m->q.lock);
if (m->c != -1) {
for (;;) {
- if (likely(list_empty(&wait.task_list)))
- __add_wait_queue_tail(&m->q, &wait);
+ if (likely(list_empty(&wait.entry)))
+ __add_wait_queue_entry_tail(&m->q, &wait);
set_current_state(TASK_UNINTERRUPTIBLE);
if (m->c == -1)
@@ -84,8 +84,8 @@ static int wait_for_free(struct slot_map *m)
do {
long n = left, t;
- if (likely(list_empty(&wait.task_list)))
- __add_wait_queue_tail_exclusive(&m->q, &wait);
+ if (likely(list_empty(&wait.entry)))
+ __add_wait_queue_entry_tail_exclusive(&m->q, &wait);
set_current_state(TASK_INTERRUPTIBLE);
if (m->c > 0)
@@ -108,8 +108,8 @@ static int wait_for_free(struct slot_map *m)
left = -EINTR;
} while (left > 0);
- if (!list_empty(&wait.task_list))
- list_del(&wait.task_list);
+ if (!list_empty(&wait.entry))
+ list_del(&wait.entry);
else if (left <= 0 && waitqueue_active(&m->q))
__wake_up_locked_key(&m->q, TASK_INTERRUPTIBLE, NULL);
__set_current_state(TASK_RUNNING);
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 33fe6ca929f7..e5869f91b3ab 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -329,15 +329,9 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
.link = link
};
- upper = lookup_one_len(dentry->d_name.name, upperdir,
- dentry->d_name.len);
- err = PTR_ERR(upper);
- if (IS_ERR(upper))
- goto out;
-
err = security_inode_copy_up(dentry, &new_creds);
if (err < 0)
- goto out1;
+ goto out;
if (new_creds)
old_creds = override_creds(new_creds);
@@ -361,7 +355,7 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
}
if (err)
- goto out2;
+ goto out;
if (S_ISREG(stat->mode)) {
struct path upperpath;
@@ -397,10 +391,23 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
/*
* Store identifier of lower inode in upper inode xattr to
* allow lookup of the copy up origin inode.
+ *
+ * Don't set origin when we are breaking the association with a lower
+ * hard link.
*/
- err = ovl_set_origin(dentry, lowerpath->dentry, temp);
- if (err)
+ if (S_ISDIR(stat->mode) || stat->nlink == 1) {
+ err = ovl_set_origin(dentry, lowerpath->dentry, temp);
+ if (err)
+ goto out_cleanup;
+ }
+
+ upper = lookup_one_len(dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ if (IS_ERR(upper)) {
+ err = PTR_ERR(upper);
+ upper = NULL;
goto out_cleanup;
+ }
if (tmpfile)
err = ovl_do_link(temp, udir, upper, true);
@@ -415,17 +422,15 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, pstat);
-out2:
+out:
dput(temp);
-out1:
dput(upper);
-out:
return err;
out_cleanup:
if (!tmpfile)
ovl_cleanup(wdir, temp);
- goto out2;
+ goto out;
}
/*
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f0c8b33d99b1..520802da059c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -300,11 +300,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
/* We don't show the stack guard page in /proc/maps */
start = vma->vm_start;
- if (stack_guard_page_start(vma, start))
- start += PAGE_SIZE;
end = vma->vm_end;
- if (stack_guard_page_end(vma, end))
- end -= PAGE_SIZE;
seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
diff --git a/fs/read_write.c b/fs/read_write.c
index 53c816c61122..d591eeed061f 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1279,7 +1279,7 @@ static size_t compat_writev(struct file *file,
if (!(file->f_mode & FMODE_CAN_WRITE))
goto out;
- ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, 0);
+ ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, flags);
out:
if (ret > 0)
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 39bb1e838d8d..a11d773e5ff3 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2956,7 +2956,7 @@ void reiserfs_wait_on_write_block(struct super_block *s)
static void queue_log_writer(struct super_block *s)
{
- wait_queue_t wait;
+ wait_queue_entry_t wait;
struct reiserfs_journal *journal = SB_JOURNAL(s);
set_bit(J_WRITERS_QUEUED, &journal->j_state);
diff --git a/fs/select.c b/fs/select.c
index d6c652a31e99..5b524a977d91 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -180,7 +180,7 @@ static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
return table->entry++;
}
-static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
struct poll_wqueues *pwq = wait->private;
DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
@@ -206,7 +206,7 @@ static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
return default_wake_function(&dummy_wait, mode, sync, key);
}
-static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
struct poll_table_entry *entry;
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 7e3d71109f51..593b022ac11b 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -43,7 +43,7 @@ void signalfd_cleanup(struct sighand_struct *sighand)
if (likely(!waitqueue_active(wqh)))
return;
- /* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */
+ /* wait_queue_entry_t->func(POLLFREE) should do remove_wait_queue() */
wake_up_poll(wqh, POLLHUP | POLLFREE);
}
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index d642cc0a8271..f80be4c5df9d 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -400,10 +400,12 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
/*
* There is not enough space for user on the device
*/
- if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) {
- mutex_unlock(&UFS_SB(sb)->s_lock);
- UFSD("EXIT (FAILED)\n");
- return 0;
+ if (unlikely(ufs_freefrags(uspi) <= uspi->s_root_blocks)) {
+ if (!capable(CAP_SYS_RESOURCE)) {
+ mutex_unlock(&UFS_SB(sb)->s_lock);
+ UFSD("EXIT (FAILED)\n");
+ return 0;
+ }
}
if (goal >= uspi->s_size)
@@ -421,12 +423,12 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
if (result) {
ufs_clear_frags(inode, result + oldcount,
newcount - oldcount, locked_page != NULL);
+ *err = 0;
write_seqlock(&UFS_I(inode)->meta_lock);
ufs_cpu_to_data_ptr(sb, p, result);
- write_sequnlock(&UFS_I(inode)->meta_lock);
- *err = 0;
UFS_I(inode)->i_lastfrag =
max(UFS_I(inode)->i_lastfrag, fragment + count);
+ write_sequnlock(&UFS_I(inode)->meta_lock);
}
mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT, result %llu\n", (unsigned long long)result);
@@ -439,8 +441,10 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
result = ufs_add_fragments(inode, tmp, oldcount, newcount);
if (result) {
*err = 0;
+ read_seqlock_excl(&UFS_I(inode)->meta_lock);
UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
fragment + count);
+ read_sequnlock_excl(&UFS_I(inode)->meta_lock);
ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
locked_page != NULL);
mutex_unlock(&UFS_SB(sb)->s_lock);
@@ -451,39 +455,29 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
/*
* allocate new block and move data
*/
- switch (fs32_to_cpu(sb, usb1->fs_optim)) {
- case UFS_OPTSPACE:
+ if (fs32_to_cpu(sb, usb1->fs_optim) == UFS_OPTSPACE) {
request = newcount;
- if (uspi->s_minfree < 5 || uspi->cs_total.cs_nffree
- > uspi->s_dsize * uspi->s_minfree / (2 * 100))
- break;
- usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
- break;
- default:
- usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
-
- case UFS_OPTTIME:
+ if (uspi->cs_total.cs_nffree < uspi->s_space_to_time)
+ usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
+ } else {
request = uspi->s_fpb;
- if (uspi->cs_total.cs_nffree < uspi->s_dsize *
- (uspi->s_minfree - 2) / 100)
- break;
- usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
- break;
+ if (uspi->cs_total.cs_nffree > uspi->s_time_to_space)
+ usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTSPACE);
}
result = ufs_alloc_fragments (inode, cgno, goal, request, err);
if (result) {
ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
locked_page != NULL);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
ufs_change_blocknr(inode, fragment - oldcount, oldcount,
uspi->s_sbbase + tmp,
uspi->s_sbbase + result, locked_page);
+ *err = 0;
write_seqlock(&UFS_I(inode)->meta_lock);
ufs_cpu_to_data_ptr(sb, p, result);
- write_sequnlock(&UFS_I(inode)->meta_lock);
- *err = 0;
UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
fragment + count);
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ write_sequnlock(&UFS_I(inode)->meta_lock);
if (newcount < request)
ufs_free_fragments (inode, result + newcount, request - newcount);
ufs_free_fragments (inode, tmp, oldcount);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index da553ffec85b..f36d6a53687d 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -401,13 +401,20 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff
u64 phys64 = 0;
unsigned frag = fragment & uspi->s_fpbmask;
- if (!create) {
- phys64 = ufs_frag_map(inode, offsets, depth);
- if (phys64)
- map_bh(bh_result, sb, phys64 + frag);
- return 0;
- }
+ phys64 = ufs_frag_map(inode, offsets, depth);
+ if (!create)
+ goto done;
+ if (phys64) {
+ if (fragment >= UFS_NDIR_FRAGMENT)
+ goto done;
+ read_seqlock_excl(&UFS_I(inode)->meta_lock);
+ if (fragment < UFS_I(inode)->i_lastfrag) {
+ read_sequnlock_excl(&UFS_I(inode)->meta_lock);
+ goto done;
+ }
+ read_sequnlock_excl(&UFS_I(inode)->meta_lock);
+ }
/* This code entered only while writing ....? */
mutex_lock(&UFS_I(inode)->truncate_mutex);
@@ -451,6 +458,11 @@ out:
}
mutex_unlock(&UFS_I(inode)->truncate_mutex);
return err;
+
+done:
+ if (phys64)
+ map_bh(bh_result, sb, phys64 + frag);
+ return 0;
}
static int ufs_writepage(struct page *page, struct writeback_control *wbc)
@@ -554,10 +566,8 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
*/
inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode);
set_nlink(inode, fs16_to_cpu(sb, ufs_inode->ui_nlink));
- if (inode->i_nlink == 0) {
- ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
- return -1;
- }
+ if (inode->i_nlink == 0)
+ return -ESTALE;
/*
* Linux now has 32-bit uid and gid, so we can support EFT.
@@ -566,9 +576,9 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
i_gid_write(inode, ufs_get_inode_gid(sb, ufs_inode));
inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size);
- inode->i_atime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec);
- inode->i_ctime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec);
- inode->i_mtime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec);
+ inode->i_atime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec);
+ inode->i_ctime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec);
+ inode->i_mtime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec);
inode->i_mtime.tv_nsec = 0;
inode->i_atime.tv_nsec = 0;
inode->i_ctime.tv_nsec = 0;
@@ -602,10 +612,8 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
*/
inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode);
set_nlink(inode, fs16_to_cpu(sb, ufs2_inode->ui_nlink));
- if (inode->i_nlink == 0) {
- ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
- return -1;
- }
+ if (inode->i_nlink == 0)
+ return -ESTALE;
/*
* Linux now has 32-bit uid and gid, so we can support EFT.
@@ -645,7 +653,7 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino)
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
struct buffer_head * bh;
struct inode *inode;
- int err;
+ int err = -EIO;
UFSD("ENTER, ino %lu\n", ino);
@@ -680,9 +688,10 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino)
err = ufs1_read_inode(inode,
ufs_inode + ufs_inotofsbo(inode->i_ino));
}
-
+ brelse(bh);
if (err)
goto bad_inode;
+
inode->i_version++;
ufsi->i_lastfrag =
(inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
@@ -691,15 +700,13 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino)
ufs_set_inode_ops(inode);
- brelse(bh);
-
UFSD("EXIT\n");
unlock_new_inode(inode);
return inode;
bad_inode:
iget_failed(inode);
- return ERR_PTR(-EIO);
+ return ERR_PTR(err);
}
static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode)
@@ -874,7 +881,6 @@ static inline void free_data(struct to_free *ctx, u64 from, unsigned count)
ctx->to = from + count;
}
-#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift)
#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift)
static void ufs_trunc_direct(struct inode *inode)
@@ -1112,19 +1118,24 @@ static void ufs_truncate_blocks(struct inode *inode)
struct super_block *sb = inode->i_sb;
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
unsigned offsets[4];
- int depth = ufs_block_to_path(inode, DIRECT_BLOCK, offsets);
+ int depth;
int depth2;
unsigned i;
struct ufs_buffer_head *ubh[3];
void *p;
u64 block;
- if (!depth)
- return;
+ if (inode->i_size) {
+ sector_t last = (inode->i_size - 1) >> uspi->s_bshift;
+ depth = ufs_block_to_path(inode, last, offsets);
+ if (!depth)
+ return;
+ } else {
+ depth = 1;
+ }
- /* find the last non-zero in offsets[] */
for (depth2 = depth - 1; depth2; depth2--)
- if (offsets[depth2])
+ if (offsets[depth2] != uspi->s_apb - 1)
break;
mutex_lock(&ufsi->truncate_mutex);
@@ -1133,9 +1144,8 @@ static void ufs_truncate_blocks(struct inode *inode)
offsets[0] = UFS_IND_BLOCK;
} else {
/* get the blocks that should be partially emptied */
- p = ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]);
+ p = ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]++);
for (i = 0; i < depth2; i++) {
- offsets[i]++; /* next branch is fully freed */
block = ufs_data_ptr_to_cpu(sb, p);
if (!block)
break;
@@ -1146,7 +1156,7 @@ static void ufs_truncate_blocks(struct inode *inode)
write_sequnlock(&ufsi->meta_lock);
break;
}
- p = ubh_get_data_ptr(uspi, ubh[i], offsets[i + 1]);
+ p = ubh_get_data_ptr(uspi, ubh[i], offsets[i + 1]++);
}
while (i--)
free_branch_tail(inode, offsets[i + 1], ubh[i], depth - i - 1);
@@ -1161,7 +1171,9 @@ static void ufs_truncate_blocks(struct inode *inode)
free_full_branch(inode, block, i - UFS_IND_BLOCK + 1);
}
}
+ read_seqlock_excl(&ufsi->meta_lock);
ufsi->i_lastfrag = DIRECT_FRAGMENT;
+ read_sequnlock_excl(&ufsi->meta_lock);
mark_inode_dirty(inode);
mutex_unlock(&ufsi->truncate_mutex);
}
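
Casting the 32-bit on-disk seconds field to signed before it is widened is what keeps pre-1970 timestamps negative once stored in the 64-bit tv_sec; without the cast the value is zero-extended and turns into a date far in the future. A two-line demonstration of the difference:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t on_disk = 0xFFFFFFF6;                 /* -10 as a signed 32-bit value */
        int64_t wrong = (int64_t)on_disk;              /* zero-extended: 4294967286 */
        int64_t right = (int64_t)(int32_t)on_disk;     /* sign-extended: -10 */

        printf("wrong=%lld right=%lld\n", (long long)wrong, (long long)right);
        return 0;
    }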
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 878cc6264f1a..0a4f58a5073c 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -480,7 +480,7 @@ static void ufs_setup_cstotal(struct super_block *sb)
usb3 = ubh_get_usb_third(uspi);
if ((mtype == UFS_MOUNT_UFSTYPE_44BSD &&
- (usb1->fs_flags & UFS_FLAGS_UPDATED)) ||
+ (usb2->fs_un.fs_u2.fs_maxbsize == usb1->fs_bsize)) ||
mtype == UFS_MOUNT_UFSTYPE_UFS2) {
/* the statistics live in a different place than usual */
uspi->cs_total.cs_ndir = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir);
@@ -596,9 +596,7 @@ static void ufs_put_cstotal(struct super_block *sb)
usb2 = ubh_get_usb_second(uspi);
usb3 = ubh_get_usb_third(uspi);
- if ((mtype == UFS_MOUNT_UFSTYPE_44BSD &&
- (usb1->fs_flags & UFS_FLAGS_UPDATED)) ||
- mtype == UFS_MOUNT_UFSTYPE_UFS2) {
+ if (mtype == UFS_MOUNT_UFSTYPE_UFS2) {
/* the statistics live in a different place than usual */
usb2->fs_un.fs_u2.cs_ndir =
cpu_to_fs64(sb, uspi->cs_total.cs_ndir);
@@ -608,16 +606,26 @@ static void ufs_put_cstotal(struct super_block *sb)
cpu_to_fs64(sb, uspi->cs_total.cs_nifree);
usb3->fs_un1.fs_u2.cs_nffree =
cpu_to_fs64(sb, uspi->cs_total.cs_nffree);
- } else {
- usb1->fs_cstotal.cs_ndir =
- cpu_to_fs32(sb, uspi->cs_total.cs_ndir);
- usb1->fs_cstotal.cs_nbfree =
- cpu_to_fs32(sb, uspi->cs_total.cs_nbfree);
- usb1->fs_cstotal.cs_nifree =
- cpu_to_fs32(sb, uspi->cs_total.cs_nifree);
- usb1->fs_cstotal.cs_nffree =
- cpu_to_fs32(sb, uspi->cs_total.cs_nffree);
+ goto out;
+ }
+
+ if (mtype == UFS_MOUNT_UFSTYPE_44BSD &&
+ (usb2->fs_un.fs_u2.fs_maxbsize == usb1->fs_bsize)) {
+ /* store stats in both old and new places */
+ usb2->fs_un.fs_u2.cs_ndir =
+ cpu_to_fs64(sb, uspi->cs_total.cs_ndir);
+ usb2->fs_un.fs_u2.cs_nbfree =
+ cpu_to_fs64(sb, uspi->cs_total.cs_nbfree);
+ usb3->fs_un1.fs_u2.cs_nifree =
+ cpu_to_fs64(sb, uspi->cs_total.cs_nifree);
+ usb3->fs_un1.fs_u2.cs_nffree =
+ cpu_to_fs64(sb, uspi->cs_total.cs_nffree);
}
+ usb1->fs_cstotal.cs_ndir = cpu_to_fs32(sb, uspi->cs_total.cs_ndir);
+ usb1->fs_cstotal.cs_nbfree = cpu_to_fs32(sb, uspi->cs_total.cs_nbfree);
+ usb1->fs_cstotal.cs_nifree = cpu_to_fs32(sb, uspi->cs_total.cs_nifree);
+ usb1->fs_cstotal.cs_nffree = cpu_to_fs32(sb, uspi->cs_total.cs_nffree);
+out:
ubh_mark_buffer_dirty(USPI_UBH(uspi));
ufs_print_super_stuff(sb, usb1, usb2, usb3);
UFSD("EXIT\n");
@@ -996,6 +1004,13 @@ again:
flags |= UFS_ST_SUN;
}
+ if ((flags & UFS_ST_MASK) == UFS_ST_44BSD &&
+ uspi->s_postblformat == UFS_42POSTBLFMT) {
+ if (!silent)
+ pr_err("this is not a 44bsd filesystem");
+ goto failed;
+ }
+
/*
* Check ufs magic number
*/
@@ -1143,8 +1158,8 @@ magic_found:
uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask);
if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
- uspi->s_u2_size = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size);
- uspi->s_u2_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
+ uspi->s_size = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size);
+ uspi->s_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
} else {
uspi->s_size = fs32_to_cpu(sb, usb1->fs_size);
uspi->s_dsize = fs32_to_cpu(sb, usb1->fs_dsize);
@@ -1193,6 +1208,18 @@ magic_found:
uspi->s_postbloff = fs32_to_cpu(sb, usb3->fs_postbloff);
uspi->s_rotbloff = fs32_to_cpu(sb, usb3->fs_rotbloff);
+ uspi->s_root_blocks = mul_u64_u32_div(uspi->s_dsize,
+ uspi->s_minfree, 100);
+ if (uspi->s_minfree <= 5) {
+ uspi->s_time_to_space = ~0ULL;
+ uspi->s_space_to_time = 0;
+ usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTSPACE);
+ } else {
+ uspi->s_time_to_space = (uspi->s_root_blocks / 2) + 1;
+ uspi->s_space_to_time = mul_u64_u32_div(uspi->s_dsize,
+ uspi->s_minfree - 2, 100) - 1;
+ }
+
/*
* Compute other frequently used values
*/
@@ -1382,19 +1409,17 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
mutex_lock(&UFS_SB(sb)->s_lock);
usb3 = ubh_get_usb_third(uspi);
- if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
+ if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
buf->f_type = UFS2_MAGIC;
- buf->f_blocks = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
- } else {
+ else
buf->f_type = UFS_MAGIC;
- buf->f_blocks = uspi->s_dsize;
- }
- buf->f_bfree = ufs_blkstofrags(uspi->cs_total.cs_nbfree) +
- uspi->cs_total.cs_nffree;
+
+ buf->f_blocks = uspi->s_dsize;
+ buf->f_bfree = ufs_freefrags(uspi);
buf->f_ffree = uspi->cs_total.cs_nifree;
buf->f_bsize = sb->s_blocksize;
- buf->f_bavail = (buf->f_bfree > (((long)buf->f_blocks / 100) * uspi->s_minfree))
- ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0;
+ buf->f_bavail = (buf->f_bfree > uspi->s_root_blocks)
+ ? (buf->f_bfree - uspi->s_root_blocks) : 0;
buf->f_files = uspi->s_ncg * uspi->s_ipg;
buf->f_namelen = UFS_MAXNAMLEN;
buf->f_fsid.val[0] = (u32)id;
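
s_root_blocks and the two optimisation thresholds are now precomputed at mount time from s_dsize and s_minfree, so the allocator and statfs() no longer redo the percentage arithmetic on every call. A sketch of the same 64x32/32 computation done with a 128-bit intermediate, the way mul_u64_u32_div() avoids overflow (the input values are made up, and the sketch assumes a compiler that provides __int128):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Analogue of the kernel's mul_u64_u32_div(a, mul, div): (a * mul) / div
     * without overflowing 64 bits. */
    static uint64_t mul_u64_u32_div(uint64_t a, uint32_t mul, uint32_t div)
    {
        return (uint64_t)(((unsigned __int128)a * mul) / div);
    }

    int main(void)
    {
        uint64_t dsize = 123456789012ULL;   /* data fragments in the fs (made up) */
        uint32_t minfree = 8;               /* percent reserved for root */

        uint64_t root_blocks   = mul_u64_u32_div(dsize, minfree, 100);
        uint64_t time_to_space = root_blocks / 2 + 1;
        uint64_t space_to_time = mul_u64_u32_div(dsize, minfree - 2, 100) - 1;

        printf("root=%" PRIu64 " t2s=%" PRIu64 " s2t=%" PRIu64 "\n",
               root_blocks, time_to_space, space_to_time);
        return 0;
    }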
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 0cbd5d340b67..150eef6f1233 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -733,10 +733,8 @@ struct ufs_sb_private_info {
__u32 s_dblkno; /* offset of first data after cg */
__u32 s_cgoffset; /* cylinder group offset in cylinder */
__u32 s_cgmask; /* used to calc mod fs_ntrak */
- __u32 s_size; /* number of blocks (fragments) in fs */
- __u32 s_dsize; /* number of data blocks in fs */
- __u64 s_u2_size; /* ufs2: number of blocks (fragments) in fs */
- __u64 s_u2_dsize; /*ufs2: number of data blocks in fs */
+ __u64 s_size; /* number of blocks (fragments) in fs */
+ __u64 s_dsize; /* number of data blocks in fs */
__u32 s_ncg; /* number of cylinder groups */
__u32 s_bsize; /* size of basic blocks */
__u32 s_fsize; /* size of fragments */
@@ -793,6 +791,9 @@ struct ufs_sb_private_info {
__u32 s_maxsymlinklen;/* upper limit on fast symlinks' size */
__s32 fs_magic; /* filesystem magic */
unsigned int s_dirblksize;
+ __u64 s_root_blocks;
+ __u64 s_time_to_space;
+ __u64 s_space_to_time;
};
/*
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index f41ad0a6106f..02497a492eb2 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -243,9 +243,8 @@ ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev
struct page *ufs_get_locked_page(struct address_space *mapping,
pgoff_t index)
{
- struct page *page;
-
- page = find_lock_page(mapping, index);
+ struct inode *inode = mapping->host;
+ struct page *page = find_lock_page(mapping, index);
if (!page) {
page = read_mapping_page(mapping, index, NULL);
@@ -253,7 +252,7 @@ struct page *ufs_get_locked_page(struct address_space *mapping,
printk(KERN_ERR "ufs_change_blocknr: "
"read_mapping_page error: ino %lu, index: %lu\n",
mapping->host->i_ino, index);
- goto out;
+ return page;
}
lock_page(page);
@@ -262,8 +261,7 @@ struct page *ufs_get_locked_page(struct address_space *mapping,
/* Truncate got there first */
unlock_page(page);
put_page(page);
- page = NULL;
- goto out;
+ return NULL;
}
if (!PageUptodate(page) || PageError(page)) {
@@ -272,11 +270,12 @@ struct page *ufs_get_locked_page(struct address_space *mapping,
printk(KERN_ERR "ufs_change_blocknr: "
"can not read page: ino %lu, index: %lu\n",
- mapping->host->i_ino, index);
+ inode->i_ino, index);
- page = ERR_PTR(-EIO);
+ return ERR_PTR(-EIO);
}
}
-out:
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, 1 << inode->i_blkbits, 0);
return page;
}
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 398019fb1448..9fc7119a1551 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -350,16 +350,11 @@ static inline void *ubh_get_data_ptr(struct ufs_sb_private_info *uspi,
#define ubh_blkmap(ubh,begin,bit) \
((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb)))
-/*
- * Determine the number of available frags given a
- * percentage to hold in reserve.
- */
static inline u64
-ufs_freespace(struct ufs_sb_private_info *uspi, int percentreserved)
+ufs_freefrags(struct ufs_sb_private_info *uspi)
{
return ufs_blkstofrags(uspi->cs_total.cs_nbfree) +
- uspi->cs_total.cs_nffree -
- (uspi->s_dsize * (percentreserved) / 100);
+ uspi->cs_total.cs_nffree;
}
/*
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index f7555fc25877..6148ccd6cccf 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -81,7 +81,7 @@ struct userfaultfd_unmap_ctx {
struct userfaultfd_wait_queue {
struct uffd_msg msg;
- wait_queue_t wq;
+ wait_queue_entry_t wq;
struct userfaultfd_ctx *ctx;
bool waken;
};
@@ -91,7 +91,7 @@ struct userfaultfd_wake_range {
unsigned long len;
};
-static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
+static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
int wake_flags, void *key)
{
struct userfaultfd_wake_range *range = key;
@@ -129,7 +129,7 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
* wouldn't be enough, the smp_mb__before_spinlock is
* enough to avoid an explicit smp_mb() here.
*/
- list_del_init(&wq->task_list);
+ list_del_init(&wq->entry);
out:
return ret;
}
@@ -340,9 +340,28 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
bool must_wait, return_to_userland;
long blocking_state;
- BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
-
ret = VM_FAULT_SIGBUS;
+
+ /*
+ * We don't do userfault handling for the final child pid update.
+ *
+ * We also don't do userfault handling during
+ * coredumping. hugetlbfs has the special
+ * follow_hugetlb_page() to skip missing pages in the
+ * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
+ * the no_page_table() helper in follow_page_mask(), but the
+ * shmem_vm_ops->fault method is invoked even during
+ * coredumping without mmap_sem and it ends up here.
+ */
+ if (current->flags & (PF_EXITING|PF_DUMPCORE))
+ goto out;
+
+ /*
+ * Coredumping runs without mmap_sem so we can only check that
+ * the mmap_sem is held, if PF_DUMPCORE was not set.
+ */
+ WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
+
ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
if (!ctx)
goto out;
@@ -361,12 +380,6 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
goto out;
/*
- * We don't do userfault handling for the final child pid update.
- */
- if (current->flags & PF_EXITING)
- goto out;
-
- /*
* Check that we can return VM_FAULT_RETRY.
*
* NOTE: it should become possible to return VM_FAULT_RETRY
@@ -509,13 +522,13 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
* and it's fine not to block on the spinlock. The uwq on this
* kernel stack can be released after the list_del_init.
*/
- if (!list_empty_careful(&uwq.wq.task_list)) {
+ if (!list_empty_careful(&uwq.wq.entry)) {
spin_lock(&ctx->fault_pending_wqh.lock);
/*
* No need of list_del_init(), the uwq on the stack
* will be freed shortly anyway.
*/
- list_del(&uwq.wq.task_list);
+ list_del(&uwq.wq.entry);
spin_unlock(&ctx->fault_pending_wqh.lock);
}
@@ -847,7 +860,7 @@ wakeup:
static inline struct userfaultfd_wait_queue *find_userfault_in(
wait_queue_head_t *wqh)
{
- wait_queue_t *wq;
+ wait_queue_entry_t *wq;
struct userfaultfd_wait_queue *uwq;
VM_BUG_ON(!spin_is_locked(&wqh->lock));
@@ -856,7 +869,7 @@ static inline struct userfaultfd_wait_queue *find_userfault_in(
if (!waitqueue_active(wqh))
goto out;
/* walk in reverse to provide FIFO behavior to read userfaults */
- wq = list_last_entry(&wqh->task_list, typeof(*wq), task_list);
+ wq = list_last_entry(&wqh->head, typeof(*wq), entry);
uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
out:
return uwq;
@@ -990,14 +1003,14 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
* changes __remove_wait_queue() to use
* list_del_init() in turn breaking the
* !list_empty_careful() check in
- * handle_userfault(). The uwq->wq.task_list
+ * handle_userfault(). The uwq->wq.head list
* must never be empty at any time during the
* refile, or the waitqueue could disappear
* from under us. The "wait_queue_head_t"
* parameter of __remove_wait_queue() is unused
* anyway.
*/
- list_del(&uwq->wq.task_list);
+ list_del(&uwq->wq.entry);
__add_wait_queue(&ctx->fault_wqh, &uwq->wq);
write_seqcount_end(&ctx->refile_seq);
@@ -1019,7 +1032,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
fork_nctx = (struct userfaultfd_ctx *)
(unsigned long)
uwq->msg.arg.reserved.reserved1;
- list_move(&uwq->wq.task_list, &fork_event);
+ list_move(&uwq->wq.entry, &fork_event);
spin_unlock(&ctx->event_wqh.lock);
ret = 0;
break;
@@ -1056,8 +1069,8 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
if (!list_empty(&fork_event)) {
uwq = list_first_entry(&fork_event,
typeof(*uwq),
- wq.task_list);
- list_del(&uwq->wq.task_list);
+ wq.entry);
+ list_del(&uwq->wq.entry);
__add_wait_queue(&ctx->event_wqh, &uwq->wq);
userfaultfd_event_complete(ctx, uwq);
}
@@ -1734,17 +1747,17 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
struct userfaultfd_ctx *ctx = f->private_data;
- wait_queue_t *wq;
+ wait_queue_entry_t *wq;
struct userfaultfd_wait_queue *uwq;
unsigned long pending = 0, total = 0;
spin_lock(&ctx->fault_pending_wqh.lock);
- list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) {
+ list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
pending++;
total++;
}
- list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
+ list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
total++;
}
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 81f5bf7f0e72..d20c29b9c95b 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1319,9 +1319,12 @@ xfs_vm_bmap(
* The swap code (ab-)uses ->bmap to get a block mapping and then
* bypasses the file system for actual I/O. We really can't allow
* that on reflinks inodes, so we have to skip out here. And yes,
- * 0 is the magic code for a bmap error..
+ * 0 is the magic code for a bmap error.
+ *
+ * Since we don't pass back blockdev info, we can't return bmap
+ * information for rt files either.
*/
- if (xfs_is_reflink_inode(ip))
+ if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
return 0;
filemap_write_and_wait(mapping);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 290b58464043..438505f395e7 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -117,7 +117,7 @@ static inline void
__xfs_buf_ioacct_dec(
struct xfs_buf *bp)
{
- ASSERT(spin_is_locked(&bp->b_lock));
+ lockdep_assert_held(&bp->b_lock);
if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index f61c84f8e31a..b9c12e1cc23a 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -66,7 +66,6 @@ xfs_inode_alloc(
XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
- ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(!xfs_isiflocked(ip));
ASSERT(ip->i_ino == 0);
@@ -190,7 +189,7 @@ xfs_perag_set_reclaim_tag(
{
struct xfs_mount *mp = pag->pag_mount;
- ASSERT(spin_is_locked(&pag->pag_ici_lock));
+ lockdep_assert_held(&pag->pag_ici_lock);
if (pag->pag_ici_reclaimable++)
return;
@@ -212,7 +211,7 @@ xfs_perag_clear_reclaim_tag(
{
struct xfs_mount *mp = pag->pag_mount;
- ASSERT(spin_is_locked(&pag->pag_ici_lock));
+ lockdep_assert_held(&pag->pag_ici_lock);
if (--pag->pag_ici_reclaimable)
return;
@@ -270,12 +269,12 @@ xfs_inew_wait(
DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);
do {
- prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
if (!xfs_iflags_test(ip, XFS_INEW))
break;
schedule();
} while (true);
- finish_wait(wq, &wait.wait);
+ finish_wait(wq, &wait.wq_entry);
}
/*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ec9826c56500..c0a1e840a588 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -622,12 +622,12 @@ __xfs_iflock(
DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
do {
- prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
if (xfs_isiflocked(ip))
io_schedule();
} while (!xfs_iflock_nowait(ip));
- finish_wait(wq, &wait.wait);
+ finish_wait(wq, &wait.wq_entry);
}
STATIC uint
@@ -2486,11 +2486,11 @@ __xfs_iunpin_wait(
xfs_iunpin(ip);
do {
- prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
if (xfs_ipincount(ip))
io_schedule();
} while (xfs_ipincount(ip));
- finish_wait(wq, &wait.wait);
+ finish_wait(wq, &wait.wq_entry);
}
void
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index ea7df16e71a7..c1b163cb68b1 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -211,7 +211,8 @@ struct acpi_device_flags {
u32 of_compatible_ok:1;
u32 coherent_dma:1;
u32 cca_seen:1;
- u32 reserved:20;
+ u32 spi_i2c_slave:1;
+ u32 reserved:19;
};
/* File System */
diff --git a/include/acpi/actbl.h b/include/acpi/actbl.h
index d92543f3bbfd..bdc55c0da19c 100644
--- a/include/acpi/actbl.h
+++ b/include/acpi/actbl.h
@@ -374,6 +374,20 @@ struct acpi_table_desc {
u16 validation_count;
};
+/*
+ * Maximum value of the validation_count field in struct acpi_table_desc.
+ * When reached, validation_count cannot be changed any more and the table will
+ * be permanently regarded as validated.
+ *
+ * This is to prevent situations in which unbalanced table get/put operations
+ * may cause premature table unmapping in the OS.
+ *
+ * The maximum validation count can be defined to any value, but should be
+ * greater than the maximum number of OS early stage mapping slots to avoid
+ * leaking early stage table mappings to the late stage.
+ */
+#define ACPI_MAX_TABLE_VALIDATIONS ACPI_UINT16_MAX
+
/* Masks for Flags field above */
#define ACPI_TABLE_ORIGIN_EXTERNAL_VIRTUAL (0) /* Virtual address, external maintained */
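
The ACPI_MAX_TABLE_VALIDATIONS hunk caps validation_count so that unbalanced get/put pairs can never unmap a table that early-boot code may still rely on: once the counter saturates, the table stays mapped for good. A minimal userspace sketch of that saturating-counter idea (table_get/table_put are illustrative names, not the ACPICA API):

#include <stdint.h>
#include <stdio.h>

#define MAX_VALIDATIONS UINT16_MAX

struct table_desc {
	uint16_t validation_count;
	int mapped;
};

/* Take a reference; once the counter saturates it is never decremented again. */
static void table_get(struct table_desc *t)
{
	if (t->validation_count < MAX_VALIDATIONS)
		t->validation_count++;
	t->mapped = 1;
}

/* Drop a reference; a saturated table stays permanently mapped. */
static void table_put(struct table_desc *t)
{
	if (t->validation_count == 0 || t->validation_count >= MAX_VALIDATIONS)
		return;
	if (--t->validation_count == 0)
		t->mapped = 0;	/* the real code would unmap the table here */
}

int main(void)
{
	struct table_desc t = { 0 };

	table_get(&t);
	table_put(&t);
	printf("mapped=%d count=%u\n", t.mapped, t.validation_count);
	return 0;
}
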
diff --git a/include/dt-bindings/clock/sun50i-a64-ccu.h b/include/dt-bindings/clock/sun50i-a64-ccu.h
index 370c0a0473fc..d66432c6e675 100644
--- a/include/dt-bindings/clock/sun50i-a64-ccu.h
+++ b/include/dt-bindings/clock/sun50i-a64-ccu.h
@@ -43,6 +43,8 @@
#ifndef _DT_BINDINGS_CLK_SUN50I_A64_H_
#define _DT_BINDINGS_CLK_SUN50I_A64_H_
+#define CLK_PLL_PERIPH0 11
+
#define CLK_BUS_MIPI_DSI 28
#define CLK_BUS_CE 29
#define CLK_BUS_DMA 30
diff --git a/include/dt-bindings/clock/sun8i-h3-ccu.h b/include/dt-bindings/clock/sun8i-h3-ccu.h
index c2afc41d6964..e139fe5c62ec 100644
--- a/include/dt-bindings/clock/sun8i-h3-ccu.h
+++ b/include/dt-bindings/clock/sun8i-h3-ccu.h
@@ -43,6 +43,8 @@
#ifndef _DT_BINDINGS_CLK_SUN8I_H3_H_
#define _DT_BINDINGS_CLK_SUN8I_H3_H_
+#define CLK_PLL_PERIPH0 9
+
#define CLK_CPUX 14
#define CLK_BUS_CE 20
diff --git a/include/linux/bcm47xx_nvram.h b/include/linux/bcm47xx_nvram.h
index 2793652fbf66..a414a2b53e41 100644
--- a/include/linux/bcm47xx_nvram.h
+++ b/include/linux/bcm47xx_nvram.h
@@ -8,6 +8,7 @@
#ifndef __BCM47XX_NVRAM_H
#define __BCM47XX_NVRAM_H
+#include <linux/errno.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 4907bea03908..7b1cf4ba0902 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -165,10 +165,27 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
{
iter->bi_sector += bytes >> 9;
- if (bio_no_advance_iter(bio))
+ if (bio_no_advance_iter(bio)) {
iter->bi_size -= bytes;
- else
+ iter->bi_done += bytes;
+ } else {
bvec_iter_advance(bio->bi_io_vec, iter, bytes);
+ /* TODO: It is reasonable to complete bio with error here. */
+ }
+}
+
+static inline bool bio_rewind_iter(struct bio *bio, struct bvec_iter *iter,
+ unsigned int bytes)
+{
+ iter->bi_sector -= bytes >> 9;
+
+ if (bio_no_advance_iter(bio)) {
+ iter->bi_size += bytes;
+ iter->bi_done -= bytes;
+ return true;
+ }
+
+ return bvec_iter_rewind(bio->bi_io_vec, iter, bytes);
}
#define __bio_for_each_segment(bvl, bio, iter, start) \
@@ -303,8 +320,6 @@ struct bio_integrity_payload {
struct bvec_iter bip_iter;
- bio_end_io_t *bip_end_io; /* saved I/O completion fn */
-
unsigned short bip_slab; /* slab the bip came from */
unsigned short bip_vcnt; /* # of integrity bio_vecs */
unsigned short bip_max_vcnt; /* integrity bio_vec slots */
@@ -429,6 +444,7 @@ extern void bio_advance(struct bio *, unsigned);
extern void bio_init(struct bio *bio, struct bio_vec *table,
unsigned short max_vecs);
+extern void bio_uninit(struct bio *);
extern void bio_reset(struct bio *);
void bio_chain(struct bio *, struct bio *);
@@ -721,13 +737,10 @@ struct biovec_slab {
bip_for_each_vec(_bvl, _bio->bi_integrity, _iter)
extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int);
-extern void bio_integrity_free(struct bio *);
extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
-extern bool bio_integrity_enabled(struct bio *bio);
-extern int bio_integrity_prep(struct bio *);
-extern void bio_integrity_endio(struct bio *);
+extern bool bio_integrity_prep(struct bio *);
extern void bio_integrity_advance(struct bio *, unsigned int);
-extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int);
+extern void bio_integrity_trim(struct bio *);
extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t);
extern int bioset_integrity_create(struct bio_set *, int);
extern void bioset_integrity_free(struct bio_set *);
@@ -740,11 +753,6 @@ static inline void *bio_integrity(struct bio *bio)
return NULL;
}
-static inline bool bio_integrity_enabled(struct bio *bio)
-{
- return false;
-}
-
static inline int bioset_integrity_create(struct bio_set *bs, int pool_size)
{
return 0;
@@ -755,14 +763,9 @@ static inline void bioset_integrity_free (struct bio_set *bs)
return;
}
-static inline int bio_integrity_prep(struct bio *bio)
-{
- return 0;
-}
-
-static inline void bio_integrity_free(struct bio *bio)
+static inline bool bio_integrity_prep(struct bio *bio)
{
- return;
+ return true;
}
static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
@@ -777,8 +780,7 @@ static inline void bio_integrity_advance(struct bio *bio,
return;
}
-static inline void bio_integrity_trim(struct bio *bio, unsigned int offset,
- unsigned int sectors)
+static inline void bio_integrity_trim(struct bio *bio)
{
return;
}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 23d32ff0b462..14542308d25b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -33,7 +33,7 @@ struct blk_mq_hw_ctx {
struct blk_mq_ctx **ctxs;
unsigned int nr_ctx;
- wait_queue_t dispatch_wait;
+ wait_queue_entry_t dispatch_wait;
atomic_t wait_index;
struct blk_mq_tags *tags;
diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 89b65b82d98f..ec8a4d7af6bd 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -22,6 +22,7 @@
#include <linux/kernel.h>
#include <linux/bug.h>
+#include <linux/errno.h>
/*
* was unsigned short, but we might as well be ready for > 64kB I/O pages
@@ -39,6 +40,8 @@ struct bvec_iter {
unsigned int bi_idx; /* current index into bvl_vec */
+ unsigned int bi_done; /* number of bytes completed */
+
unsigned int bi_bvec_done; /* number of bytes completed in
current bvec */
};
@@ -66,12 +69,14 @@ struct bvec_iter {
.bv_offset = bvec_iter_offset((bvec), (iter)), \
})
-static inline void bvec_iter_advance(const struct bio_vec *bv,
- struct bvec_iter *iter,
- unsigned bytes)
+static inline bool bvec_iter_advance(const struct bio_vec *bv,
+ struct bvec_iter *iter, unsigned bytes)
{
- WARN_ONCE(bytes > iter->bi_size,
- "Attempted to advance past end of bvec iter\n");
+ if (WARN_ONCE(bytes > iter->bi_size,
+ "Attempted to advance past end of bvec iter\n")) {
+ iter->bi_size = 0;
+ return false;
+ }
while (bytes) {
unsigned iter_len = bvec_iter_len(bv, *iter);
@@ -80,12 +85,38 @@ static inline void bvec_iter_advance(const struct bio_vec *bv,
bytes -= len;
iter->bi_size -= len;
iter->bi_bvec_done += len;
+ iter->bi_done += len;
if (iter->bi_bvec_done == __bvec_iter_bvec(bv, *iter)->bv_len) {
iter->bi_bvec_done = 0;
iter->bi_idx++;
}
}
+ return true;
+}
+
+static inline bool bvec_iter_rewind(const struct bio_vec *bv,
+ struct bvec_iter *iter,
+ unsigned int bytes)
+{
+ while (bytes) {
+ unsigned len = min(bytes, iter->bi_bvec_done);
+
+ if (iter->bi_bvec_done == 0) {
+ if (WARN_ONCE(iter->bi_idx == 0,
+ "Attempted to rewind iter beyond "
+ "bvec's boundaries\n")) {
+ return false;
+ }
+ iter->bi_idx--;
+ iter->bi_bvec_done = __bvec_iter_bvec(bv, *iter)->bv_len;
+ continue;
+ }
+ bytes -= len;
+ iter->bi_size += len;
+ iter->bi_bvec_done -= len;
+ }
+ return true;
}
#define for_each_bvec(bvl, bio_vec, iter, start) \
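
The new bi_done counter and bvec_iter_rewind()/bio_rewind_iter() above let block-layer code step an iterator backwards after a partial completion, mirroring the bookkeeping that bvec_iter_advance() does going forward. A simplified userland model of the same arithmetic, as a sketch only (an array of segments stands in for the bio_vec array; helper names are illustrative):

#include <stdio.h>
#include <stdbool.h>

struct seg { unsigned len; };

struct iter {
	unsigned idx;		/* current segment                 */
	unsigned seg_done;	/* bytes consumed in that segment  */
	unsigned size;		/* bytes still to process          */
	unsigned done;		/* bytes already processed         */
};

static bool advance(const struct seg *s, struct iter *it, unsigned bytes)
{
	if (bytes > it->size)
		return false;			/* would run past the end */
	while (bytes) {
		unsigned len = s[it->idx].len - it->seg_done;

		if (len > bytes)
			len = bytes;
		bytes -= len;
		it->size -= len;
		it->done += len;
		it->seg_done += len;
		if (it->seg_done == s[it->idx].len) {
			it->seg_done = 0;
			it->idx++;
		}
	}
	return true;
}

static bool rewind_iter(const struct seg *s, struct iter *it, unsigned bytes)
{
	while (bytes) {
		unsigned len = bytes < it->seg_done ? bytes : it->seg_done;

		if (it->seg_done == 0) {
			if (it->idx == 0)
				return false;	/* beyond the first segment */
			it->idx--;
			it->seg_done = s[it->idx].len;
			continue;
		}
		bytes -= len;
		it->size += len;
		it->done -= len;
		it->seg_done -= len;
	}
	return true;
}

int main(void)
{
	struct seg segs[] = { { 512 }, { 1024 }, { 512 } };
	struct iter it = { .size = 2048 };

	advance(segs, &it, 1024);	/* ends up inside the second segment */
	rewind_iter(segs, &it, 600);	/* steps back into the first one      */
	printf("idx=%u seg_done=%u done=%u size=%u\n",
	       it.idx, it.seg_done, it.done, it.size);
	return 0;
}
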
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index f2b10d9ebd04..81490456c242 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -96,6 +96,7 @@ struct clocksource {
void (*suspend)(struct clocksource *cs);
void (*resume)(struct clocksource *cs);
void (*mark_unstable)(struct clocksource *cs);
+ void (*tick_stable)(struct clocksource *cs);
/* private: */
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index f8110051188f..707242fdbb89 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -17,11 +17,7 @@
# define __release(x) __context__(x,-1)
# define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0)
# define __percpu __attribute__((noderef, address_space(3)))
-#ifdef CONFIG_SPARSE_RCU_POINTER
# define __rcu __attribute__((noderef, address_space(4)))
-#else /* CONFIG_SPARSE_RCU_POINTER */
-# define __rcu
-#endif /* CONFIG_SPARSE_RCU_POINTER */
# define __private __attribute__((noderef))
extern void __chk_user_ptr(const volatile void __user *);
extern void __chk_io_ptr(const volatile void __iomem *);
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index 2319b8c108e8..c96709049683 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -74,7 +74,8 @@ extern void config_item_init_type_name(struct config_item *item,
const char *name,
struct config_item_type *type);
-extern struct config_item * config_item_get(struct config_item *);
+extern struct config_item *config_item_get(struct config_item *);
+extern struct config_item *config_item_get_unless_zero(struct config_item *);
extern void config_item_put(struct config_item *);
struct config_item_type {
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 2404ad238c0b..4bf4479a3a80 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -236,6 +236,23 @@ unsigned int cpumask_local_spread(unsigned int i, int node);
(cpu) = cpumask_next_zero((cpu), (mask)), \
(cpu) < nr_cpu_ids;)
+extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
+
+/**
+ * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
+ * @cpu: the (optionally unsigned) integer iterator
+ * @mask: the cpumask pointer
+ * @start: the start location
+ *
+ * The implementation does not assume any bit in @mask is set (including @start).
+ *
+ * After the loop, cpu is >= nr_cpu_ids.
+ */
+#define for_each_cpu_wrap(cpu, mask, start) \
+ for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false); \
+ (cpu) < nr_cpumask_bits; \
+ (cpu) = cpumask_next_wrap((cpu), (mask), (start), true))
+
/**
* for_each_cpu_and - iterate over every cpu in both masks
* @cpu: the (optionally unsigned) integer iterator
@@ -276,6 +293,12 @@ static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
set_bit(cpumask_check(cpu), cpumask_bits(dstp));
}
+static inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
+{
+ __set_bit(cpumask_check(cpu), cpumask_bits(dstp));
+}
+
+
/**
* cpumask_clear_cpu - clear a cpu in a cpumask
* @cpu: cpu number (< nr_cpu_ids)
@@ -286,6 +309,11 @@ static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp)
clear_bit(cpumask_check(cpu), cpumask_bits(dstp));
}
+static inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp)
+{
+ __clear_bit(cpumask_check(cpu), cpumask_bits(dstp));
+}
+
/**
* cpumask_test_cpu - test for a cpu in a cpumask
* @cpu: cpu number (< nr_cpu_ids)
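
for_each_cpu_wrap() visits every set bit exactly once, starting at @start and wrapping around; it assumes nothing about @start itself being set. A plain-C model of those semantics, as a sketch (a small fixed-width bitmask; the real macro avoids rescanning by letting cpumask_next_wrap() remember whether it has already wrapped):

#include <stdio.h>

#define NBITS 8

int main(void)
{
	unsigned mask = 0xA5;	/* bits 0, 2, 5, 7 set */
	int start = 3;

	/* Visit every set bit exactly once, beginning at 'start' and wrapping. */
	for (int i = 0; i < NBITS; i++) {
		int cpu = (start + i) % NBITS;

		if (mask & (1u << cpu))
			printf("cpu %d\n", cpu);	/* prints 5 7 0 2 */
	}
	return 0;
}
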
diff --git a/include/linux/dmi.h b/include/linux/dmi.h
index 5e9c74cf8894..9bbf21a516e4 100644
--- a/include/linux/dmi.h
+++ b/include/linux/dmi.h
@@ -136,7 +136,7 @@ static inline int dmi_name_in_vendors(const char *s) { return 0; }
static inline int dmi_name_in_serial(const char *s) { return 0; }
#define dmi_available 0
static inline int dmi_walk(void (*decode)(const struct dmi_header *, void *),
- void *private_data) { return -1; }
+ void *private_data) { return -ENXIO; }
static inline bool dmi_match(enum dmi_field f, const char *str)
{ return false; }
static inline void dmi_memdev_name(u16 handle, const char **bank,
diff --git a/include/linux/efi.h b/include/linux/efi.h
index ec36f42a2add..8269bcb8ccf7 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -137,6 +137,18 @@ struct efi_boot_memmap {
#define EFI_CAPSULE_POPULATE_SYSTEM_TABLE 0x00020000
#define EFI_CAPSULE_INITIATE_RESET 0x00040000
+struct capsule_info {
+ efi_capsule_header_t header;
+ int reset_type;
+ long index;
+ size_t count;
+ size_t total_size;
+ phys_addr_t *pages;
+ size_t page_bytes_remain;
+};
+
+int __efi_capsule_setup_info(struct capsule_info *cap_info);
+
/*
* Allocation types for calls to boottime->allocate_pages.
*/
@@ -1403,7 +1415,7 @@ extern int efi_capsule_supported(efi_guid_t guid, u32 flags,
size_t size, int *reset);
extern int efi_capsule_update(efi_capsule_header_t *capsule,
- struct page **pages);
+ phys_addr_t *pages);
#ifdef CONFIG_EFI_RUNTIME_MAP
int efi_runtime_map_init(struct kobject *);
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index ff0b981f078e..9e4befd95bc7 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -37,7 +37,7 @@ struct eventfd_ctx *eventfd_ctx_fdget(int fd);
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n);
ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt);
-int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
+int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
__u64 *cnt);
#else /* CONFIG_EVENTFD */
@@ -73,7 +73,7 @@ static inline ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait,
}
static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx,
- wait_queue_t *wait, __u64 *cnt)
+ wait_queue_entry_t *wait, __u64 *cnt)
{
return -ENOSYS;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 65adbddb3163..771fe1131467 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2,7 +2,7 @@
#define _LINUX_FS_H
#include <linux/linkage.h>
-#include <linux/wait.h>
+#include <linux/wait_bit.h>
#include <linux/kdev_t.h>
#include <linux/dcache.h>
#include <linux/path.h>
diff --git a/include/linux/hashtable.h b/include/linux/hashtable.h
index 661e5c2a8e2a..082dc1bd0801 100644
--- a/include/linux/hashtable.h
+++ b/include/linux/hashtable.h
@@ -167,7 +167,6 @@ static inline void hash_del_rcu(struct hlist_node *node)
/**
* hash_for_each_possible_rcu - iterate over all possible objects hashing to the
* same bucket in an rcu enabled hashtable
- * in a rcu enabled hashtable
* @name: hashtable to iterate
* @obj: the type * to use as a loop cursor for each entry
* @member: the name of the hlist_node within the struct
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 13bc08aba704..1c91f26e2996 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -490,9 +490,13 @@ extern int root_mountflags;
extern bool early_boot_irqs_disabled;
-/* Values used for system_state */
+/*
+ * Values used for system_state. Ordering of the states must not be changed
+ * as code checks for <, <=, >, >= STATE.
+ */
extern enum system_states {
SYSTEM_BOOTING,
+ SYSTEM_SCHEDULING,
SYSTEM_RUNNING,
SYSTEM_HALT,
SYSTEM_POWER_OFF,
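
The new comment makes the ordering of system_state values part of the contract, because call sites compare against states with relational operators rather than testing for equality. A small sketch of that idiom (the enum is abridged; SYSTEM_SCHEDULING sits between BOOTING and RUNNING as in the hunk above):

#include <stdio.h>
#include <stdbool.h>

/* Order matters: code below relies on <, <=, >, >= comparisons. */
enum system_states {
	SYSTEM_BOOTING,
	SYSTEM_SCHEDULING,
	SYSTEM_RUNNING,
	SYSTEM_HALT,
	SYSTEM_POWER_OFF,
};

static enum system_states system_state = SYSTEM_BOOTING;

/* "May I schedule yet?" style check: true once the scheduler is up. */
static bool can_schedule(void)
{
	return system_state >= SYSTEM_SCHEDULING;
}

int main(void)
{
	printf("%d\n", can_schedule());	/* 0: still booting              */
	system_state = SYSTEM_SCHEDULING;
	printf("%d\n", can_schedule());	/* 1: scheduler has been started */
	return 0;
}
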
diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
index 0c1de05098c8..76c2fbc59f35 100644
--- a/include/linux/kvm_irqfd.h
+++ b/include/linux/kvm_irqfd.h
@@ -46,7 +46,7 @@ struct kvm_kernel_irqfd_resampler {
struct kvm_kernel_irqfd {
/* Used for MSI fast-path */
struct kvm *kvm;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
/* Update side is protected by irqfds.lock */
struct kvm_kernel_irq_routing_entry irq_entry;
seqcount_t irq_entry_sc;
diff --git a/include/linux/llist.h b/include/linux/llist.h
index 171baa90f6f6..d11738110a7a 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -110,6 +110,25 @@ static inline void init_llist_head(struct llist_head *list)
for ((pos) = (node); pos; (pos) = (pos)->next)
/**
+ * llist_for_each_safe - iterate over some deleted entries of a lock-less list
+ * safe against removal of list entry
+ * @pos: the &struct llist_node to use as a loop cursor
+ * @n: another &struct llist_node to use as temporary storage
+ * @node: the first entry of deleted list entries
+ *
+ * In general, some entries of the lock-less list can be traversed
+ * safely only after being deleted from list, so start with an entry
+ * instead of list head.
+ *
+ * If being used on entries deleted from lock-less list directly, the
+ * traverse order is from the newest to the oldest added entry. If
+ * you want to traverse from the oldest to the newest, you must
+ * reverse the order by yourself before traversing.
+ */
+#define llist_for_each_safe(pos, n, node) \
+ for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n))
+
+/**
* llist_for_each_entry - iterate over some deleted entries of lock-less list of given type
* @pos: the type * to use as a loop cursor.
 * @node: the first entry of deleted list entries.
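
llist_for_each_safe() caches pos->next in @n before the loop body runs, so the body may free or re-queue the current node without breaking the traversal. A standalone sketch of the same pattern on an ordinary singly-linked list (the macro and names are illustrative, not the llist API):

#include <stdio.h>
#include <stdlib.h>

struct node {
	int val;
	struct node *next;
};

/* Grab 'next' before the body runs, so the body may free 'pos'. */
#define for_each_node_safe(pos, n, head) \
	for ((pos) = (head); (pos) && ((n) = (pos)->next, 1); (pos) = (n))

int main(void)
{
	struct node *head = NULL, *pos, *n;

	for (int i = 0; i < 3; i++) {		/* build the list 2 -> 1 -> 0 */
		struct node *nd = malloc(sizeof(*nd));
		nd->val = i;
		nd->next = head;
		head = nd;
	}

	for_each_node_safe(pos, n, head) {
		printf("%d\n", pos->val);
		free(pos);			/* safe: next was saved in n */
	}
	return 0;
}
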
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b892e95d4929..6f543a47fc92 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1393,12 +1393,6 @@ int clear_page_dirty_for_io(struct page *page);
int get_cmdline(struct task_struct *task, char *buffer, int buflen);
-/* Is the vma a continuation of the stack vma above it? */
-static inline int vma_growsdown(struct vm_area_struct *vma, unsigned long addr)
-{
- return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
-}
-
static inline bool vma_is_anonymous(struct vm_area_struct *vma)
{
return !vma->vm_ops;
@@ -1414,28 +1408,6 @@ bool vma_is_shmem(struct vm_area_struct *vma);
static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; }
#endif
-static inline int stack_guard_page_start(struct vm_area_struct *vma,
- unsigned long addr)
-{
- return (vma->vm_flags & VM_GROWSDOWN) &&
- (vma->vm_start == addr) &&
- !vma_growsdown(vma->vm_prev, addr);
-}
-
-/* Is the vma a continuation of the stack vma below it? */
-static inline int vma_growsup(struct vm_area_struct *vma, unsigned long addr)
-{
- return vma && (vma->vm_start == addr) && (vma->vm_flags & VM_GROWSUP);
-}
-
-static inline int stack_guard_page_end(struct vm_area_struct *vma,
- unsigned long addr)
-{
- return (vma->vm_flags & VM_GROWSUP) &&
- (vma->vm_end == addr) &&
- !vma_growsup(vma->vm_next, addr);
-}
-
int vma_is_stack_for_current(struct vm_area_struct *vma);
extern unsigned long move_page_tables(struct vm_area_struct *vma,
@@ -2222,6 +2194,7 @@ void page_cache_async_readahead(struct address_space *mapping,
pgoff_t offset,
unsigned long size);
+extern unsigned long stack_guard_gap;
/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
@@ -2250,6 +2223,30 @@ static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * m
return vma;
}
+static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
+{
+ unsigned long vm_start = vma->vm_start;
+
+ if (vma->vm_flags & VM_GROWSDOWN) {
+ vm_start -= stack_guard_gap;
+ if (vm_start > vma->vm_start)
+ vm_start = 0;
+ }
+ return vm_start;
+}
+
+static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
+{
+ unsigned long vm_end = vma->vm_end;
+
+ if (vma->vm_flags & VM_GROWSUP) {
+ vm_end += stack_guard_gap;
+ if (vm_end < vma->vm_end)
+ vm_end = -PAGE_SIZE;
+ }
+ return vm_end;
+}
+
static inline unsigned long vma_pages(struct vm_area_struct *vma)
{
return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
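
vm_start_gap()/vm_end_gap() above extend a stack VMA's apparent range by stack_guard_gap so that neighbouring mappings keep their distance, clamping when the arithmetic would wrap past the ends of the address space. A userspace sketch of the same clamping arithmetic (the gap constant is chosen for illustration only):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define GUARD_GAP	(256UL * PAGE_SIZE)	/* illustrative: 256 pages */

/* Grows-down stack: report a start lowered by the gap, clamped at 0. */
static unsigned long start_with_gap(unsigned long vm_start)
{
	unsigned long start = vm_start - GUARD_GAP;

	return start > vm_start ? 0 : start;	/* subtraction wrapped around */
}

/* Grows-up stack: report an end raised by the gap, clamped near the top. */
static unsigned long end_with_gap(unsigned long vm_end)
{
	unsigned long end = vm_end + GUARD_GAP;

	return end < vm_end ? -PAGE_SIZE : end;	/* addition wrapped around */
}

int main(void)
{
	printf("%#lx\n", start_with_gap(0x7fff00000000UL));
	printf("%#lx\n", start_with_gap(0x10000UL));		/* clamps to 0 */
	printf("%#lx\n", end_with_gap(0xffffffffffffe000UL));	/* clamps near top */
	return 0;
}
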
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index 136dfdf63ba1..fc412fbd80bd 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -14,6 +14,10 @@
#include <asm/page.h>
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+#include <asm/tlbbatch.h>
+#endif
+
#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \
IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))
@@ -67,12 +71,15 @@ struct page_frag {
struct tlbflush_unmap_batch {
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
- * Each bit set is a CPU that potentially has a TLB entry for one of
- * the PFNs being flushed. See set_tlb_ubc_flush_pending().
+ * The arch code makes the following promise: generic code can modify a
+ * PTE, then call arch_tlbbatch_add_mm() (which internally provides all
+ * needed barriers), then call arch_tlbbatch_flush(), and the entries
+ * will be flushed on all CPUs by the time that arch_tlbbatch_flush()
+ * returns.
*/
- struct cpumask cpumask;
+ struct arch_tlbflush_unmap_batch arch;
- /* True if any bit in cpumask is set */
+ /* True if a flush is needed. */
bool flush_required;
/*
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 6be1949ebcdf..1ee7b30dafec 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -457,7 +457,7 @@ enum hwparam_type {
hwparam_ioport, /* Module parameter configures an I/O port */
hwparam_iomem, /* Module parameter configures an I/O mem address */
hwparam_ioport_or_iomem, /* Module parameter could be either, depending on other option */
- hwparam_irq, /* Module parameter configures an I/O port */
+ hwparam_irq, /* Module parameter configures an IRQ */
hwparam_dma, /* Module parameter configures a DMA channel */
hwparam_dma_addr, /* Module parameter configures a DMA buffer address */
hwparam_other, /* Module parameter configures some other value */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3f39d27decf4..4ed952c17fc7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -914,8 +914,7 @@ struct xfrmdev_ops {
*
* int (*ndo_change_mtu)(struct net_device *dev, int new_mtu);
* Called when a user wants to change the Maximum Transfer Unit
- * of a device. If not defined, any request to change MTU will
- * will return an error.
+ * of a device.
*
* void (*ndo_tx_timeout)(struct net_device *dev);
* Callback used when the transmitter has not made any progress
@@ -1596,8 +1595,8 @@ enum netdev_priv_flags {
* @rtnl_link_state: This enum represents the phases of creating
* a new link
*
- * @destructor: Called from unregister,
- * can be used to call free_netdev
+ * @needs_free_netdev: Should unregister perform free_netdev?
+ * @priv_destructor: Called from unregister
* @npinfo: XXX: need comments on this one
* @nd_net: Network namespace this network device is inside
*
@@ -1858,7 +1857,8 @@ struct net_device {
RTNL_LINK_INITIALIZING,
} rtnl_link_state:16;
- void (*destructor)(struct net_device *dev);
+ bool needs_free_netdev;
+ void (*priv_destructor)(struct net_device *dev);
#ifdef CONFIG_NETPOLL
struct netpoll_info __rcu *npinfo;
@@ -4261,6 +4261,11 @@ static inline const char *netdev_name(const struct net_device *dev)
return dev->name;
}
+static inline bool netdev_unregistering(const struct net_device *dev)
+{
+ return dev->reg_state == NETREG_UNREGISTERING;
+}
+
static inline const char *netdev_reg_state(const struct net_device *dev)
{
switch (dev->reg_state) {
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 316a19f6b635..e7bbd9d4dc6c 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -524,7 +524,7 @@ void page_endio(struct page *page, bool is_write, int err);
/*
* Add an arbitrary waiter to a page's wait queue
*/
-extern void add_page_wait_queue(struct page *page, wait_queue_t *waiter);
+extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
/*
* Fault everything in given userspace address range in.
diff --git a/include/linux/poll.h b/include/linux/poll.h
index 75ffc5729e4c..2889f09a1c60 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -75,7 +75,7 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
struct poll_table_entry {
struct file *filp;
unsigned long key;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
wait_queue_head_t *wait_address;
};
diff --git a/include/linux/rcu_node_tree.h b/include/linux/rcu_node_tree.h
index 4b766b61e1a0..426cee67f0e2 100644
--- a/include/linux/rcu_node_tree.h
+++ b/include/linux/rcu_node_tree.h
@@ -7,6 +7,10 @@
* unlimited scalability while maintaining a constant level of contention
* on the root node.
*
+ * This seemingly RCU-private file must be available to SRCU users
+ * because the size of the TREE SRCU srcu_struct structure depends
+ * on these definitions.
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index ba4d2621d9ca..c3ad00e63556 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -1,6 +1,10 @@
/*
* RCU segmented callback lists
*
+ * This seemingly RCU-private file must be available to SRCU users
+ * because the size of the TREE SRCU srcu_struct structure depends
+ * on these definitions.
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index e1e5d002fdb9..f816fc72b51e 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -34,104 +34,15 @@
#define __LINUX_RCUPDATE_H
#include <linux/types.h>
-#include <linux/cache.h>
-#include <linux/spinlock.h>
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/seqlock.h>
-#include <linux/lockdep.h>
-#include <linux/debugobjects.h>
-#include <linux/bug.h>
#include <linux/compiler.h>
-#include <linux/ktime.h>
+#include <linux/atomic.h>
#include <linux/irqflags.h>
+#include <linux/preempt.h>
+#include <linux/bottom_half.h>
+#include <linux/lockdep.h>
+#include <asm/processor.h>
+#include <linux/cpumask.h>
-#include <asm/barrier.h>
-
-#ifndef CONFIG_TINY_RCU
-extern int rcu_expedited; /* for sysctl */
-extern int rcu_normal; /* also for sysctl */
-#endif /* #ifndef CONFIG_TINY_RCU */
-
-#ifdef CONFIG_TINY_RCU
-/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
-static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */
-{
- return true;
-}
-static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */
-{
- return false;
-}
-
-static inline void rcu_expedite_gp(void)
-{
-}
-
-static inline void rcu_unexpedite_gp(void)
-{
-}
-#else /* #ifdef CONFIG_TINY_RCU */
-bool rcu_gp_is_normal(void); /* Internal RCU use. */
-bool rcu_gp_is_expedited(void); /* Internal RCU use. */
-void rcu_expedite_gp(void);
-void rcu_unexpedite_gp(void);
-#endif /* #else #ifdef CONFIG_TINY_RCU */
-
-enum rcutorture_type {
- RCU_FLAVOR,
- RCU_BH_FLAVOR,
- RCU_SCHED_FLAVOR,
- RCU_TASKS_FLAVOR,
- SRCU_FLAVOR,
- INVALID_RCU_FLAVOR
-};
-
-#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
-void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
- unsigned long *gpnum, unsigned long *completed);
-void rcutorture_record_test_transition(void);
-void rcutorture_record_progress(unsigned long vernum);
-void do_trace_rcu_torture_read(const char *rcutorturename,
- struct rcu_head *rhp,
- unsigned long secs,
- unsigned long c_old,
- unsigned long c);
-bool rcu_irq_enter_disabled(void);
-#else
-static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
- int *flags,
- unsigned long *gpnum,
- unsigned long *completed)
-{
- *flags = 0;
- *gpnum = 0;
- *completed = 0;
-}
-static inline void rcutorture_record_test_transition(void)
-{
-}
-static inline void rcutorture_record_progress(unsigned long vernum)
-{
-}
-static inline bool rcu_irq_enter_disabled(void)
-{
- return false;
-}
-#ifdef CONFIG_RCU_TRACE
-void do_trace_rcu_torture_read(const char *rcutorturename,
- struct rcu_head *rhp,
- unsigned long secs,
- unsigned long c_old,
- unsigned long c);
-#else
-#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
- do { } while (0)
-#endif
-#endif
-
-#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
-#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))
#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
#define ulong2long(a) (*(long *)(&(a)))
@@ -139,115 +50,14 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
/* Exported common interfaces */
#ifdef CONFIG_PREEMPT_RCU
-
-/**
- * call_rcu() - Queue an RCU callback for invocation after a grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all pre-existing RCU read-side
- * critical sections have completed. However, the callback function
- * might well execute concurrently with RCU read-side critical sections
- * that started after call_rcu() was invoked. RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
- *
- * Note that all CPUs must agree that the grace period extended beyond
- * all pre-existing RCU read-side critical section. On systems with more
- * than one CPU, this means that when "func()" is invoked, each CPU is
- * guaranteed to have executed a full memory barrier since the end of its
- * last RCU read-side critical section whose beginning preceded the call
- * to call_rcu(). It also means that each CPU executing an RCU read-side
- * critical section that continues beyond the start of "func()" must have
- * executed a memory barrier after the call_rcu() but before the beginning
- * of that RCU read-side critical section. Note that these guarantees
- * include CPUs that are offline, idle, or executing in user mode, as
- * well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
- * resulting RCU callback function "func()", then both CPU A and CPU B are
- * guaranteed to execute a full memory barrier during the time interval
- * between the call to call_rcu() and the invocation of "func()" -- even
- * if CPU A and CPU B are the same CPU (but again only if the system has
- * more than one CPU).
- */
-void call_rcu(struct rcu_head *head,
- rcu_callback_t func);
-
+void call_rcu(struct rcu_head *head, rcu_callback_t func);
#else /* #ifdef CONFIG_PREEMPT_RCU */
-
-/* In classic RCU, call_rcu() is just call_rcu_sched(). */
#define call_rcu call_rcu_sched
-
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
-/**
- * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_bh() assumes
- * that the read-side critical sections end on completion of a softirq
- * handler. This means that read-side critical sections in process
- * context must not be interrupted by softirqs. This interface is to be
- * used when most of the read-side critical sections are in softirq context.
- * RCU read-side critical sections are delimited by :
- * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context.
- * OR
- * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
- * These may be nested.
- *
- * See the description of call_rcu() for more detailed information on
- * memory ordering guarantees.
- */
-void call_rcu_bh(struct rcu_head *head,
- rcu_callback_t func);
-
-/**
- * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_sched() assumes
- * that the read-side critical sections end on enabling of preemption
- * or on voluntary preemption.
- * RCU read-side critical sections are delimited by :
- * - rcu_read_lock_sched() and rcu_read_unlock_sched(),
- * OR
- * anything that disables preemption.
- * These may be nested.
- *
- * See the description of call_rcu() for more detailed information on
- * memory ordering guarantees.
- */
-void call_rcu_sched(struct rcu_head *head,
- rcu_callback_t func);
-
+void call_rcu_bh(struct rcu_head *head, rcu_callback_t func);
+void call_rcu_sched(struct rcu_head *head, rcu_callback_t func);
void synchronize_sched(void);
-
-/**
- * call_rcu_tasks() - Queue an RCU for invocation task-based grace period
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_tasks() assumes
- * that the read-side critical sections end at a voluntary context
- * switch (not a preemption!), entry into idle, or transition to usermode
- * execution. As such, there are no read-side primitives analogous to
- * rcu_read_lock() and rcu_read_unlock() because this primitive is intended
- * to determine that all tasks have passed through a safe state, not so
- * much for data-strcuture synchronization.
- *
- * See the description of call_rcu() for more detailed information on
- * memory ordering guarantees.
- */
void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func);
void synchronize_rcu_tasks(void);
void rcu_barrier_tasks(void);
@@ -301,22 +111,12 @@ void rcu_check_callbacks(int user);
void rcu_report_dead(unsigned int cpu);
void rcu_cpu_starting(unsigned int cpu);
-#ifndef CONFIG_TINY_RCU
-void rcu_end_inkernel_boot(void);
-#else /* #ifndef CONFIG_TINY_RCU */
-static inline void rcu_end_inkernel_boot(void) { }
-#endif /* #ifndef CONFIG_TINY_RCU */
-
#ifdef CONFIG_RCU_STALL_COMMON
void rcu_sysrq_start(void);
void rcu_sysrq_end(void);
#else /* #ifdef CONFIG_RCU_STALL_COMMON */
-static inline void rcu_sysrq_start(void)
-{
-}
-static inline void rcu_sysrq_end(void)
-{
-}
+static inline void rcu_sysrq_start(void) { }
+static inline void rcu_sysrq_end(void) { }
#endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */
#ifdef CONFIG_NO_HZ_FULL
@@ -330,9 +130,7 @@ static inline void rcu_user_exit(void) { }
#ifdef CONFIG_RCU_NOCB_CPU
void rcu_init_nohz(void);
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-static inline void rcu_init_nohz(void)
-{
-}
+static inline void rcu_init_nohz(void) { }
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
/**
@@ -397,10 +195,6 @@ do { \
rcu_note_voluntary_context_switch(current); \
} while (0)
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP)
-bool __rcu_is_watching(void);
-#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */
-
/*
* Infrastructure to implement the synchronize_() primitives in
* TREE_RCU and rcu_barrier_() primitives in TINY_RCU.
@@ -414,10 +208,6 @@ bool __rcu_is_watching(void);
#error "Unknown RCU implementation specified to kernel configuration"
#endif
-#define RCU_SCHEDULER_INACTIVE 0
-#define RCU_SCHEDULER_INIT 1
-#define RCU_SCHEDULER_RUNNING 2
-
/*
* init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic
* initialization and destruction of rcu_head on the stack. rcu_head structures
@@ -430,30 +220,16 @@ void destroy_rcu_head(struct rcu_head *head);
void init_rcu_head_on_stack(struct rcu_head *head);
void destroy_rcu_head_on_stack(struct rcu_head *head);
#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
-static inline void init_rcu_head(struct rcu_head *head)
-{
-}
-
-static inline void destroy_rcu_head(struct rcu_head *head)
-{
-}
-
-static inline void init_rcu_head_on_stack(struct rcu_head *head)
-{
-}
-
-static inline void destroy_rcu_head_on_stack(struct rcu_head *head)
-{
-}
+static inline void init_rcu_head(struct rcu_head *head) { }
+static inline void destroy_rcu_head(struct rcu_head *head) { }
+static inline void init_rcu_head_on_stack(struct rcu_head *head) { }
+static inline void destroy_rcu_head_on_stack(struct rcu_head *head) { }
#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU)
bool rcu_lockdep_current_cpu_online(void);
#else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */
-static inline bool rcu_lockdep_current_cpu_online(void)
-{
- return true;
-}
+static inline bool rcu_lockdep_current_cpu_online(void) { return true; }
#endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -473,18 +249,8 @@ extern struct lockdep_map rcu_bh_lock_map;
extern struct lockdep_map rcu_sched_lock_map;
extern struct lockdep_map rcu_callback_map;
int debug_lockdep_rcu_enabled(void);
-
int rcu_read_lock_held(void);
int rcu_read_lock_bh_held(void);
-
-/**
- * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
- *
- * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
- * RCU-sched read-side critical section. In absence of
- * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
- * critical section unless it can prove otherwise.
- */
int rcu_read_lock_sched_held(void);
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
@@ -531,9 +297,7 @@ static inline void rcu_preempt_sleep_check(void)
"Illegal context switch in RCU read-side critical section");
}
#else /* #ifdef CONFIG_PROVE_RCU */
-static inline void rcu_preempt_sleep_check(void)
-{
-}
+static inline void rcu_preempt_sleep_check(void) { }
#endif /* #else #ifdef CONFIG_PROVE_RCU */
#define rcu_sleep_check() \
@@ -1084,52 +848,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
#define kfree_rcu(ptr, rcu_head) \
__kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
-#ifdef CONFIG_TINY_RCU
-static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt)
-{
- *nextevt = KTIME_MAX;
- return 0;
-}
-#endif /* #ifdef CONFIG_TINY_RCU */
-
-#if defined(CONFIG_RCU_NOCB_CPU_ALL)
-static inline bool rcu_is_nocb_cpu(int cpu) { return true; }
-#elif defined(CONFIG_RCU_NOCB_CPU)
-bool rcu_is_nocb_cpu(int cpu);
-#else
-static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
-#endif
-
-
-/* Only for use by adaptive-ticks code. */
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-bool rcu_sys_is_idle(void);
-void rcu_sysidle_force_exit(void);
-#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-
-static inline bool rcu_sys_is_idle(void)
-{
- return false;
-}
-
-static inline void rcu_sysidle_force_exit(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-
-
-/*
- * Dump the ftrace buffer, but only one time per callsite per boot.
- */
-#define rcu_ftrace_dump(oops_dump_mode) \
-do { \
- static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \
- \
- if (!atomic_read(&___rfd_beenhere) && \
- !atomic_xchg(&___rfd_beenhere, 1)) \
- ftrace_dump(oops_dump_mode); \
-} while (0)
/*
* Place this after a lock-acquisition primitive to guarantee that
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 74d9c3a1feee..5becbbccb998 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -25,7 +25,7 @@
#ifndef __LINUX_TINY_H
#define __LINUX_TINY_H
-#include <linux/cache.h>
+#include <linux/ktime.h>
struct rcu_dynticks;
static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
@@ -33,10 +33,8 @@ static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
return 0;
}
-static inline bool rcu_eqs_special_set(int cpu)
-{
- return false; /* Never flag non-existent other CPUs! */
-}
+/* Never flag non-existent other CPUs! */
+static inline bool rcu_eqs_special_set(int cpu) { return false; }
static inline unsigned long get_state_synchronize_rcu(void)
{
@@ -98,159 +96,38 @@ static inline void kfree_call_rcu(struct rcu_head *head,
rcu_note_voluntary_context_switch_lite(current); \
} while (0)
-/*
- * Take advantage of the fact that there is only one CPU, which
- * allows us to ignore virtualization-based context switches.
- */
-static inline void rcu_virt_note_context_switch(int cpu)
-{
-}
-
-/*
- * Return the number of grace periods started.
- */
-static inline unsigned long rcu_batches_started(void)
-{
- return 0;
-}
-
-/*
- * Return the number of bottom-half grace periods started.
- */
-static inline unsigned long rcu_batches_started_bh(void)
-{
- return 0;
-}
-
-/*
- * Return the number of sched grace periods started.
- */
-static inline unsigned long rcu_batches_started_sched(void)
-{
- return 0;
-}
-
-/*
- * Return the number of grace periods completed.
- */
-static inline unsigned long rcu_batches_completed(void)
-{
- return 0;
-}
-
-/*
- * Return the number of bottom-half grace periods completed.
- */
-static inline unsigned long rcu_batches_completed_bh(void)
-{
- return 0;
-}
-
-/*
- * Return the number of sched grace periods completed.
- */
-static inline unsigned long rcu_batches_completed_sched(void)
+static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
+ *nextevt = KTIME_MAX;
return 0;
}
/*
- * Return the number of expedited grace periods completed.
- */
-static inline unsigned long rcu_exp_batches_completed(void)
-{
- return 0;
-}
-
-/*
- * Return the number of expedited sched grace periods completed.
+ * Take advantage of the fact that there is only one CPU, which
+ * allows us to ignore virtualization-based context switches.
*/
-static inline unsigned long rcu_exp_batches_completed_sched(void)
-{
- return 0;
-}
-
-static inline void rcu_force_quiescent_state(void)
-{
-}
-
-static inline void rcu_bh_force_quiescent_state(void)
-{
-}
-
-static inline void rcu_sched_force_quiescent_state(void)
-{
-}
-
-static inline void show_rcu_gp_kthreads(void)
-{
-}
-
-static inline void rcu_cpu_stall_reset(void)
-{
-}
-
-static inline void rcu_idle_enter(void)
-{
-}
-
-static inline void rcu_idle_exit(void)
-{
-}
-
-static inline void rcu_irq_enter(void)
-{
-}
-
-static inline void rcu_irq_exit_irqson(void)
-{
-}
-
-static inline void rcu_irq_enter_irqson(void)
-{
-}
-
-static inline void rcu_irq_exit(void)
-{
-}
-
-static inline void exit_rcu(void)
-{
-}
+static inline void rcu_virt_note_context_switch(int cpu) { }
+static inline void rcu_cpu_stall_reset(void) { }
+static inline void rcu_idle_enter(void) { }
+static inline void rcu_idle_exit(void) { }
+static inline void rcu_irq_enter(void) { }
+static inline bool rcu_irq_enter_disabled(void) { return false; }
+static inline void rcu_irq_exit_irqson(void) { }
+static inline void rcu_irq_enter_irqson(void) { }
+static inline void rcu_irq_exit(void) { }
+static inline void exit_rcu(void) { }
#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
extern int rcu_scheduler_active __read_mostly;
void rcu_scheduler_starting(void);
#else /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
-static inline void rcu_scheduler_starting(void)
-{
-}
+static inline void rcu_scheduler_starting(void) { }
#endif /* #else #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
+static inline void rcu_end_inkernel_boot(void) { }
+static inline bool rcu_is_watching(void) { return true; }
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
-
-static inline bool rcu_is_watching(void)
-{
- return __rcu_is_watching();
-}
-
-#else /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
-
-static inline bool rcu_is_watching(void)
-{
- return true;
-}
-
-#endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
-
-static inline void rcu_request_urgent_qs_task(struct task_struct *t)
-{
-}
-
-static inline void rcu_all_qs(void)
-{
- barrier(); /* Avoid RCU read-side critical sections leaking across. */
-}
+/* Avoid RCU read-side critical sections leaking across. */
+static inline void rcu_all_qs(void) { barrier(); }
/* RCUtree hotplug events */
#define rcutree_prepare_cpu NULL
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 0bacb6b2af69..37d6fd3b7ff8 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -79,37 +79,20 @@ void cond_synchronize_rcu(unsigned long oldstate);
unsigned long get_state_synchronize_sched(void);
void cond_synchronize_sched(unsigned long oldstate);
-extern unsigned long rcutorture_testseq;
-extern unsigned long rcutorture_vernum;
-unsigned long rcu_batches_started(void);
-unsigned long rcu_batches_started_bh(void);
-unsigned long rcu_batches_started_sched(void);
-unsigned long rcu_batches_completed(void);
-unsigned long rcu_batches_completed_bh(void);
-unsigned long rcu_batches_completed_sched(void);
-unsigned long rcu_exp_batches_completed(void);
-unsigned long rcu_exp_batches_completed_sched(void);
-void show_rcu_gp_kthreads(void);
-
-void rcu_force_quiescent_state(void);
-void rcu_bh_force_quiescent_state(void);
-void rcu_sched_force_quiescent_state(void);
-
void rcu_idle_enter(void);
void rcu_idle_exit(void);
void rcu_irq_enter(void);
void rcu_irq_exit(void);
void rcu_irq_enter_irqson(void);
void rcu_irq_exit_irqson(void);
+bool rcu_irq_enter_disabled(void);
void exit_rcu(void);
void rcu_scheduler_starting(void);
extern int rcu_scheduler_active __read_mostly;
-
+void rcu_end_inkernel_boot(void);
bool rcu_is_watching(void);
-void rcu_request_urgent_qs_task(struct task_struct *t);
-
void rcu_all_qs(void);
/* RCUtree hotplug events */
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index b34aa649d204..591792c8e5b0 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -41,6 +41,7 @@ static inline unsigned int refcount_read(const refcount_t *r)
return atomic_read(&r->refs);
}
+#ifdef CONFIG_REFCOUNT_FULL
extern __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r);
extern void refcount_add(unsigned int i, refcount_t *r);
@@ -48,10 +49,45 @@ extern __must_check bool refcount_inc_not_zero(refcount_t *r);
extern void refcount_inc(refcount_t *r);
extern __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r);
-extern void refcount_sub(unsigned int i, refcount_t *r);
extern __must_check bool refcount_dec_and_test(refcount_t *r);
extern void refcount_dec(refcount_t *r);
+#else
+static inline __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r)
+{
+ return atomic_add_unless(&r->refs, i, 0);
+}
+
+static inline void refcount_add(unsigned int i, refcount_t *r)
+{
+ atomic_add(i, &r->refs);
+}
+
+static inline __must_check bool refcount_inc_not_zero(refcount_t *r)
+{
+ return atomic_add_unless(&r->refs, 1, 0);
+}
+
+static inline void refcount_inc(refcount_t *r)
+{
+ atomic_inc(&r->refs);
+}
+
+static inline __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r)
+{
+ return atomic_sub_and_test(i, &r->refs);
+}
+
+static inline __must_check bool refcount_dec_and_test(refcount_t *r)
+{
+ return atomic_dec_and_test(&r->refs);
+}
+
+static inline void refcount_dec(refcount_t *r)
+{
+ atomic_dec(&r->refs);
+}
+#endif /* CONFIG_REFCOUNT_FULL */
extern __must_check bool refcount_dec_if_one(refcount_t *r);
extern __must_check bool refcount_dec_not_one(refcount_t *r);
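
With CONFIG_REFCOUNT_FULL disabled, the refcount API above falls back to plain atomics, but the *_not_zero variants keep their "never resurrect a zero count" semantics via add-unless-zero. A C11 sketch of that core primitive only (inc_not_zero is an illustrative name, not the kernel function):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Increment the counter unless it is already zero (object being freed). */
static bool inc_not_zero(atomic_uint *refs)
{
	unsigned int old = atomic_load(refs);

	do {
		if (old == 0)
			return false;	/* too late: do not resurrect */
	} while (!atomic_compare_exchange_weak(refs, &old, old + 1));

	return true;
}

int main(void)
{
	atomic_uint live = 2, dead = 0;

	printf("%d %d\n", inc_not_zero(&live), inc_not_zero(&dead));	/* 1 0 */
	printf("%u\n", (unsigned int)atomic_load(&live));		/* 3   */
	return 0;
}
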
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index 1abba5ce2a2f..44fd002f7cd5 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -37,6 +37,9 @@ struct rt_mutex {
int line;
void *magic;
#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
};
struct rt_mutex_waiter;
@@ -58,19 +61,33 @@ struct hrtimer_sleeper;
#ifdef CONFIG_DEBUG_RT_MUTEXES
# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
, .name = #mutexname, .file = __FILE__, .line = __LINE__
-# define rt_mutex_init(mutex) __rt_mutex_init(mutex, __func__)
+
+# define rt_mutex_init(mutex) \
+do { \
+ static struct lock_class_key __key; \
+ __rt_mutex_init(mutex, __func__, &__key); \
+} while (0)
+
extern void rt_mutex_debug_task_free(struct task_struct *tsk);
#else
# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
-# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL)
+# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL, NULL)
# define rt_mutex_debug_task_free(t) do { } while (0)
#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) \
+ , .dep_map = { .name = #mutexname }
+#else
+#define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)
+#endif
+
#define __RT_MUTEX_INITIALIZER(mutexname) \
{ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
, .waiters = RB_ROOT \
, .owner = NULL \
- __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
+ __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
+ __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)}
#define DEFINE_RT_MUTEX(mutexname) \
struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
@@ -86,7 +103,7 @@ static inline int rt_mutex_is_locked(struct rt_mutex *lock)
return lock->owner != NULL;
}
-extern void __rt_mutex_init(struct rt_mutex *lock, const char *name);
+extern void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key);
extern void rt_mutex_destroy(struct rt_mutex *lock);
extern void rt_mutex_lock(struct rt_mutex *lock);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2b69fc650201..1f0f427e0292 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -421,7 +421,8 @@ struct sched_dl_entity {
u64 dl_runtime; /* Maximum runtime for each instance */
u64 dl_deadline; /* Relative deadline of each instance */
u64 dl_period; /* Separation of two instances (period) */
- u64 dl_bw; /* dl_runtime / dl_deadline */
+ u64 dl_bw; /* dl_runtime / dl_period */
+ u64 dl_density; /* dl_runtime / dl_deadline */
/*
* Actual scheduling parameters. Initialized with the values above,
@@ -445,16 +446,33 @@ struct sched_dl_entity {
*
* @dl_yielded tells if task gave up the CPU before consuming
* all its available runtime during the last job.
+ *
+ * @dl_non_contending tells if the task is inactive while still
+ * contributing to the active utilization. In other words, it
+ * indicates if the inactive timer has been armed and its handler
+ * has not been executed yet. This flag is useful to avoid race
+ * conditions between the inactive timer handler and the wakeup
+ * code.
*/
int dl_throttled;
int dl_boosted;
int dl_yielded;
+ int dl_non_contending;
/*
* Bandwidth enforcement timer. Each -deadline task has its
* own bandwidth to be enforced, thus we need one timer per task.
*/
struct hrtimer dl_timer;
+
+ /*
+ * Inactive timer, responsible for decreasing the active utilization
+ * at the "0-lag time". When a -deadline task blocks, it contributes
+ * to GRUB's active utilization until the "0-lag time", hence a
+ * timer is needed to decrease the active utilization at the correct
+ * time.
+ */
+ struct hrtimer inactive_timer;
};
union rcu_special {
@@ -1096,8 +1114,6 @@ static inline struct pid *task_session(struct task_struct *task)
* current.
* task_xid_nr_ns() : id seen from the ns specified;
*
- * set_task_vxid() : assigns a virtual id to a task;
- *
* see also pid_nr() etc in include/linux/pid.h
*/
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);
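
The sched_dl_entity hunk splits dl_bw (runtime/period, the task's utilisation) from dl_density (runtime/deadline), both stored as fixed-point ratios. A small arithmetic sketch using a 20-bit fraction, which is an assumption made for illustration rather than a statement about the scheduler's internal format:

#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT 20	/* fixed-point fraction bits (illustrative) */

static uint64_t to_ratio(uint64_t denom, uint64_t runtime)
{
	return (runtime << BW_SHIFT) / denom;
}

int main(void)
{
	/* Example -deadline parameters: 10ms runtime, 30ms deadline, 100ms period. */
	uint64_t runtime = 10000000, deadline = 30000000, period = 100000000;

	uint64_t dl_bw      = to_ratio(period, runtime);	/* utilisation ~0.10 */
	uint64_t dl_density = to_ratio(deadline, runtime);	/* density     ~0.33 */

	printf("dl_bw      = %llu (%.2f)\n",
	       (unsigned long long)dl_bw, (double)dl_bw / (1 << BW_SHIFT));
	printf("dl_density = %llu (%.2f)\n",
	       (unsigned long long)dl_density, (double)dl_density / (1 << BW_SHIFT));
	return 0;
}
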
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index 34fe92ce1ebd..a55600ffdf4b 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -23,10 +23,6 @@ extern u64 sched_clock_cpu(int cpu);
extern void sched_clock_init(void);
#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static inline void sched_clock_init_late(void)
-{
-}
-
static inline void sched_clock_tick(void)
{
}
@@ -39,7 +35,7 @@ static inline void sched_clock_idle_sleep_event(void)
{
}
-static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
+static inline void sched_clock_idle_wakeup_event(void)
{
}
@@ -53,7 +49,6 @@ static inline u64 local_clock(void)
return sched_clock();
}
#else
-extern void sched_clock_init_late(void);
extern int sched_clock_stable(void);
extern void clear_sched_clock_stable(void);
@@ -63,10 +58,10 @@ extern void clear_sched_clock_stable(void);
*/
extern u64 __sched_clock_offset;
-
extern void sched_clock_tick(void);
+extern void sched_clock_tick_stable(void);
extern void sched_clock_idle_sleep_event(void);
-extern void sched_clock_idle_wakeup_event(u64 delta_ns);
+extern void sched_clock_idle_wakeup_event(void);
/*
* As outlined in clock.c, provides a fast, high resolution, nanosecond
diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
index 4995b717500b..7d3f75db23e5 100644
--- a/include/linux/sched/nohz.h
+++ b/include/linux/sched/nohz.h
@@ -23,11 +23,11 @@ static inline void set_cpu_sd_state_idle(void) { }
#endif
#ifdef CONFIG_NO_HZ_COMMON
-void calc_load_enter_idle(void);
-void calc_load_exit_idle(void);
+void calc_load_nohz_start(void);
+void calc_load_nohz_stop(void);
#else
-static inline void calc_load_enter_idle(void) { }
-static inline void calc_load_exit_idle(void) { }
+static inline void calc_load_nohz_start(void) { }
+static inline void calc_load_nohz_stop(void) { }
#endif /* CONFIG_NO_HZ_COMMON */
#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index a978d7189cfd..f0f065c5afcf 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -95,8 +95,6 @@ static inline void put_task_struct(struct task_struct *t)
}
struct task_struct *task_rcu_dereference(struct task_struct **ptask);
-struct task_struct *try_get_task_struct(struct task_struct **ptask);
-
#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
extern int arch_task_struct_size __read_mostly;
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 07ef550c6627..93315d6b21a8 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -84,6 +84,7 @@ struct kmem_cache {
int red_left_pad; /* Left redzone padding size */
#ifdef CONFIG_SYSFS
struct kobject kobj; /* For sysfs */
+ struct work_struct kobj_remove_work;
#endif
#ifdef CONFIG_MEMCG
struct memcg_cache_params memcg_params;
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 59248dcc6ef3..d9510e8522d4 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -369,6 +369,26 @@ static __always_inline int spin_trylock_irq(spinlock_t *lock)
raw_spin_trylock_irqsave(spinlock_check(lock), flags); \
})
+/**
+ * spin_unlock_wait - Interpose between successive critical sections
+ * @lock: the spinlock whose critical sections are to be interposed.
+ *
+ * Semantically this is equivalent to a spin_lock() immediately
+ * followed by a spin_unlock(). However, most architectures have
+ * more efficient implementations in which the spin_unlock_wait()
+ * cannot block concurrent lock acquisition, and in some cases
+ * where spin_unlock_wait() does not write to the lock variable.
+ * Nevertheless, spin_unlock_wait() can have high overhead, so if
+ * you feel the need to use it, please check to see if there is
+ * a better way to get your job done.
+ *
+ * The ordering guarantees provided by spin_unlock_wait() are:
+ *
+ * 1. All accesses preceding the spin_unlock_wait() happen before
+ * any accesses in later critical sections for this same lock.
+ * 2. All accesses following the spin_unlock_wait() happen after
+ * any accesses in earlier critical sections for this same lock.
+ */
static __always_inline void spin_unlock_wait(spinlock_t *lock)
{
raw_spin_unlock_wait(&lock->rlock);
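
The new kernel-doc describes spin_unlock_wait() as semantically a lock immediately followed by an unlock, used to interpose between other CPUs' critical sections. A pthread sketch of that semantic equivalent only (a real spin_unlock_wait() avoids actually taking the lock; this merely illustrates the ordering the comment promises):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int shared;

/*
 * Semantic model: accesses before this call happen before later critical
 * sections on 'l', and accesses after it happen after earlier ones.
 */
static void unlock_wait(pthread_mutex_t *l)
{
	pthread_mutex_lock(l);
	pthread_mutex_unlock(l);
}

static void *writer(void *arg)
{
	pthread_mutex_lock(&lock);
	shared = 42;
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, writer, NULL);
	unlock_wait(&lock);	/* waits out any critical section in progress */
	pthread_join(t, NULL);
	printf("%d\n", shared);
	return 0;
}
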
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 4c1d5f7e62c4..39af9bc0f653 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -60,32 +60,15 @@ int init_srcu_struct(struct srcu_struct *sp);
#include <linux/srcutiny.h>
#elif defined(CONFIG_TREE_SRCU)
#include <linux/srcutree.h>
-#elif defined(CONFIG_CLASSIC_SRCU)
-#include <linux/srcuclassic.h>
-#else
+#elif defined(CONFIG_SRCU)
#error "Unknown SRCU implementation specified to kernel configuration"
+#else
+/* Dummy definition for things like notifiers. Actual use gets link error. */
+struct srcu_struct { };
#endif
-/**
- * call_srcu() - Queue a callback for invocation after an SRCU grace period
- * @sp: srcu_struct in queue the callback
- * @head: structure to be used for queueing the SRCU callback.
- * @func: function to be invoked after the SRCU grace period
- *
- * The callback function will be invoked some time after a full SRCU
- * grace period elapses, in other words after all pre-existing SRCU
- * read-side critical sections have completed. However, the callback
- * function might well execute concurrently with other SRCU read-side
- * critical sections that started after call_srcu() was invoked. SRCU
- * read-side critical sections are delimited by srcu_read_lock() and
- * srcu_read_unlock(), and may be nested.
- *
- * The callback will be invoked from process context, but must nevertheless
- * be fast and must not block.
- */
void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
void (*func)(struct rcu_head *head));
-
void cleanup_srcu_struct(struct srcu_struct *sp);
int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
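/*
 * Illustrative sketch (not part of the patch): queueing an SRCU callback
 * with call_srcu(), matching the kernel-doc that this hunk drops from the
 * header.  example_srcu, struct example_item and example_free_cb() are
 * invented names; <linux/slab.h> is assumed for kfree().
 */
DEFINE_SRCU(example_srcu);

struct example_item {
	struct rcu_head	rh;
	int		payload;
};

static void example_free_cb(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct example_item, rh));
}

static void example_retire_item(struct example_item *p)
{
	/* Callback runs from process context once pre-existing readers are done. */
	call_srcu(&example_srcu, &p->rh, example_free_cb);
}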
diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h
deleted file mode 100644
index 5753f7322262..000000000000
--- a/include/linux/srcuclassic.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Sleepable Read-Copy Update mechanism for mutual exclusion,
- * classic v4.11 variant.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- * Copyright (C) IBM Corporation, 2017
- *
- * Author: Paul McKenney <paulmck@us.ibm.com>
- */
-
-#ifndef _LINUX_SRCU_CLASSIC_H
-#define _LINUX_SRCU_CLASSIC_H
-
-struct srcu_array {
- unsigned long lock_count[2];
- unsigned long unlock_count[2];
-};
-
-struct rcu_batch {
- struct rcu_head *head, **tail;
-};
-
-#define RCU_BATCH_INIT(name) { NULL, &(name.head) }
-
-struct srcu_struct {
- unsigned long completed;
- struct srcu_array __percpu *per_cpu_ref;
- spinlock_t queue_lock; /* protect ->batch_queue, ->running */
- bool running;
- /* callbacks just queued */
- struct rcu_batch batch_queue;
- /* callbacks try to do the first check_zero */
- struct rcu_batch batch_check0;
- /* callbacks done with the first check_zero and the flip */
- struct rcu_batch batch_check1;
- struct rcu_batch batch_done;
- struct delayed_work work;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-};
-
-void process_srcu(struct work_struct *work);
-
-#define __SRCU_STRUCT_INIT(name) \
- { \
- .completed = -300, \
- .per_cpu_ref = &name##_srcu_array, \
- .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \
- .running = false, \
- .batch_queue = RCU_BATCH_INIT(name.batch_queue), \
- .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \
- .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \
- .batch_done = RCU_BATCH_INIT(name.batch_done), \
- .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\
- __SRCU_DEP_MAP_INIT(name) \
- }
-
-/*
- * Define and initialize a srcu struct at build time.
- * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it.
- *
- * Note that although DEFINE_STATIC_SRCU() hides the name from other
- * files, the per-CPU variable rules nevertheless require that the
- * chosen name be globally unique. These rules also prohibit use of
- * DEFINE_STATIC_SRCU() within a function. If these rules are too
- * restrictive, declare the srcu_struct manually. For example, in
- * each file:
- *
- * static struct srcu_struct my_srcu;
- *
- * Then, before the first use of each my_srcu, manually initialize it:
- *
- * init_srcu_struct(&my_srcu);
- *
- * See include/linux/percpu-defs.h for the rules on per-CPU variables.
- */
-#define __DEFINE_SRCU(name, is_static) \
- static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\
- is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
-#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
-#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
-
-void synchronize_srcu_expedited(struct srcu_struct *sp);
-void srcu_barrier(struct srcu_struct *sp);
-unsigned long srcu_batches_completed(struct srcu_struct *sp);
-
-static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
- struct srcu_struct *sp, int *flags,
- unsigned long *gpnum,
- unsigned long *completed)
-{
- if (test_type != SRCU_FLAVOR)
- return;
- *flags = 0;
- *completed = sp->completed;
- *gpnum = *completed;
- if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check0.head)
- (*gpnum)++;
-}
-
-#endif
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 42311ee0334f..cfbfc540cafc 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -27,15 +27,14 @@
#include <linux/swait.h>
struct srcu_struct {
- int srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */
+ short srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */
+ short srcu_idx; /* Current reader array element. */
+ u8 srcu_gp_running; /* GP workqueue running? */
+ u8 srcu_gp_waiting; /* GP waiting for readers? */
struct swait_queue_head srcu_wq;
/* Last srcu_read_unlock() wakes GP. */
- unsigned long srcu_gp_seq; /* GP seq # for callback tagging. */
- struct rcu_segcblist srcu_cblist;
- /* Pending SRCU callbacks. */
- int srcu_idx; /* Current reader array element. */
- bool srcu_gp_running; /* GP workqueue running? */
- bool srcu_gp_waiting; /* GP waiting for readers? */
+ struct rcu_head *srcu_cb_head; /* Pending callbacks: Head. */
+ struct rcu_head **srcu_cb_tail; /* Pending callbacks: Tail. */
struct work_struct srcu_work; /* For driving grace periods. */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
@@ -47,7 +46,7 @@ void srcu_drive_gp(struct work_struct *wp);
#define __SRCU_STRUCT_INIT(name) \
{ \
.srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \
- .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist), \
+ .srcu_cb_tail = &name.srcu_cb_head, \
.srcu_work = __WORK_INITIALIZER(name.srcu_work, srcu_drive_gp), \
__SRCU_DEP_MAP_INIT(name) \
}
@@ -63,31 +62,29 @@ void srcu_drive_gp(struct work_struct *wp);
void synchronize_srcu(struct srcu_struct *sp);
-static inline void synchronize_srcu_expedited(struct srcu_struct *sp)
+/*
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct. Can be invoked from irq/bh handlers, but the matching
+ * __srcu_read_unlock() must be in the same handler instance. Returns an
+ * index that must be passed to the matching srcu_read_unlock().
+ */
+static inline int __srcu_read_lock(struct srcu_struct *sp)
{
- synchronize_srcu(sp);
-}
+ int idx;
-static inline void srcu_barrier(struct srcu_struct *sp)
-{
- synchronize_srcu(sp);
+ idx = READ_ONCE(sp->srcu_idx);
+ WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1);
+ return idx;
}
-static inline unsigned long srcu_batches_completed(struct srcu_struct *sp)
+static inline void synchronize_srcu_expedited(struct srcu_struct *sp)
{
- return 0;
+ synchronize_srcu(sp);
}
-static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
- struct srcu_struct *sp, int *flags,
- unsigned long *gpnum,
- unsigned long *completed)
+static inline void srcu_barrier(struct srcu_struct *sp)
{
- if (test_type != SRCU_FLAVOR)
- return;
- *flags = 0;
- *completed = sp->srcu_gp_seq;
- *gpnum = *completed;
+ synchronize_srcu(sp);
}
#endif
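/*
 * Illustrative sketch (not part of the patch): the read-side protocol the
 * Tiny SRCU fields above implement.  __srcu_read_lock() bumps
 * srcu_lock_nesting[] for the current ->srcu_idx and returns that index;
 * the matching srcu_read_unlock() must be passed the same index so the
 * grace-period work can tell old readers from new ones.  example_srcu,
 * example_value and example_read() are invented names.
 */
DEFINE_STATIC_SRCU(example_srcu);
static int example_value;

static int example_read(void)
{
	int idx, val;

	idx = srcu_read_lock(&example_srcu);	/* wraps __srcu_read_lock() */
	val = READ_ONCE(example_value);		/* read-side critical section */
	srcu_read_unlock(&example_srcu, idx);	/* wraps __srcu_read_unlock() */
	return val;
}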
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 32e86d85fd11..42973f787e7e 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -40,7 +40,7 @@ struct srcu_data {
unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */
/* Update-side state. */
- spinlock_t lock ____cacheline_internodealigned_in_smp;
+ raw_spinlock_t __private lock ____cacheline_internodealigned_in_smp;
struct rcu_segcblist srcu_cblist; /* List of callbacks.*/
unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */
unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */
@@ -58,7 +58,7 @@ struct srcu_data {
* Node in SRCU combining tree, similar in function to rcu_data.
*/
struct srcu_node {
- spinlock_t lock;
+ raw_spinlock_t __private lock;
unsigned long srcu_have_cbs[4]; /* GP seq for children */
/* having CBs, but only */
 /* is > ->srcu_gp_seq. */
@@ -78,7 +78,7 @@ struct srcu_struct {
struct srcu_node *level[RCU_NUM_LVLS + 1];
/* First node at each level. */
struct mutex srcu_cb_mutex; /* Serialize CB preparation. */
- spinlock_t gp_lock; /* protect ->srcu_cblist */
+ raw_spinlock_t __private lock; /* Protect counters */
struct mutex srcu_gp_mutex; /* Serialize GP work. */
unsigned int srcu_idx; /* Current rdr array element. */
unsigned long srcu_gp_seq; /* Grace-period seq #. */
@@ -109,7 +109,7 @@ void process_srcu(struct work_struct *work);
#define __SRCU_STRUCT_INIT(name) \
{ \
.sda = &name##_srcu_data, \
- .gp_lock = __SPIN_LOCK_UNLOCKED(name.gp_lock), \
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
.srcu_gp_seq_needed = 0 - 1, \
__SRCU_DEP_MAP_INIT(name) \
}
@@ -141,10 +141,5 @@ void process_srcu(struct work_struct *work);
void synchronize_srcu_expedited(struct srcu_struct *sp);
void srcu_barrier(struct srcu_struct *sp);
-unsigned long srcu_batches_completed(struct srcu_struct *sp);
-
-void srcutorture_get_gp_data(enum rcutorture_type test_type,
- struct srcu_struct *sp, int *flags,
- unsigned long *gpnum, unsigned long *completed);
#endif
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 7ba040c797ec..9d7529ffc4ce 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -13,7 +13,7 @@
#include <linux/ktime.h>
#include <linux/sunrpc/types.h>
#include <linux/spinlock.h>
-#include <linux/wait.h>
+#include <linux/wait_bit.h>
#include <linux/workqueue.h>
#include <linux/sunrpc/xdr.h>
diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h
index 9375d23a24e7..635a3c5706bd 100644
--- a/include/linux/t10-pi.h
+++ b/include/linux/t10-pi.h
@@ -33,6 +33,8 @@ struct t10_pi_tuple {
__be32 ref_tag; /* Target LBA or indirect LBA */
};
+#define T10_PI_APP_ESCAPE cpu_to_be16(0xffff)
+#define T10_PI_REF_ESCAPE cpu_to_be32(0xffffffff)
extern const struct blk_integrity_profile t10_pi_type1_crc;
extern const struct blk_integrity_profile t10_pi_type1_ip;
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 110f4532188c..f7043ccca81c 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -29,7 +29,6 @@
*/
struct tk_read_base {
struct clocksource *clock;
- u64 (*read)(struct clocksource *cs);
u64 mask;
u64 cycle_last;
u32 mult;
@@ -58,7 +57,7 @@ struct tk_read_base {
* interval.
* @xtime_remainder: Shifted nano seconds left over when rounding
* @cycle_interval
- * @raw_interval: Raw nano seconds accumulated per NTP interval.
+ * @raw_interval: Shifted raw nano seconds accumulated per NTP interval.
* @ntp_error: Difference between accumulated time and NTP time in ntp
* shifted nano seconds.
* @ntp_error_shift: Shift conversion between clock shifted nano seconds and
@@ -100,7 +99,7 @@ struct timekeeper {
u64 cycle_interval;
u64 xtime_interval;
s64 xtime_remainder;
- u32 raw_interval;
+ u64 raw_interval;
/* The ntp_tick_length() value currently being used.
* This cached copy ensures we consistently apply the tick
* length for an entire tick, as ntp_tick_length may change
diff --git a/include/linux/uuid.h b/include/linux/uuid.h
index 75f7182d5360..d1defe4ab167 100644
--- a/include/linux/uuid.h
+++ b/include/linux/uuid.h
@@ -48,7 +48,7 @@ static inline void guid_copy(guid_t *dst, const guid_t *src)
memcpy(dst, src, sizeof(guid_t));
}
-static inline bool guid_is_null(guid_t *guid)
+static inline bool guid_is_null(const guid_t *guid)
{
return guid_equal(guid, &guid_null);
}
@@ -63,7 +63,7 @@ static inline void uuid_copy(uuid_t *dst, const uuid_t *src)
memcpy(dst, src, sizeof(uuid_t));
}
-static inline bool uuid_is_null(uuid_t *uuid)
+static inline bool uuid_is_null(const uuid_t *uuid)
{
return uuid_equal(uuid, &uuid_null);
}
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index edf9b2cad277..f57076b958b7 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -183,7 +183,7 @@ struct virqfd {
void (*thread)(void *, void *);
void *data;
struct work_struct inject;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
poll_table pt;
struct work_struct shutdown;
struct virqfd **pvirqfd;
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index d84ae90ccd5c..be3ab2d13adf 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -93,10 +93,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
#endif
#endif
#ifdef CONFIG_DEBUG_TLBFLUSH
-#ifdef CONFIG_SMP
NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */
NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */
-#endif /* CONFIG_SMP */
NR_TLB_LOCAL_FLUSH_ALL,
NR_TLB_LOCAL_FLUSH_ONE,
#endif /* CONFIG_DEBUG_TLBFLUSH */
diff --git a/include/linux/wait.h b/include/linux/wait.h
index db076ca7f11d..b289c96151ee 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -10,38 +10,30 @@
#include <asm/current.h>
#include <uapi/linux/wait.h>
-typedef struct __wait_queue wait_queue_t;
-typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
-int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key);
+typedef struct wait_queue_entry wait_queue_entry_t;
-/* __wait_queue::flags */
+typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);
+int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);
+
+/* wait_queue_entry::flags */
#define WQ_FLAG_EXCLUSIVE 0x01
#define WQ_FLAG_WOKEN 0x02
-struct __wait_queue {
+/*
+ * A single wait-queue entry structure:
+ */
+struct wait_queue_entry {
unsigned int flags;
void *private;
wait_queue_func_t func;
- struct list_head task_list;
-};
-
-struct wait_bit_key {
- void *flags;
- int bit_nr;
-#define WAIT_ATOMIC_T_BIT_NR -1
- unsigned long timeout;
+ struct list_head entry;
};
-struct wait_bit_queue {
- struct wait_bit_key key;
- wait_queue_t wait;
-};
-
-struct __wait_queue_head {
+struct wait_queue_head {
spinlock_t lock;
- struct list_head task_list;
+ struct list_head head;
};
-typedef struct __wait_queue_head wait_queue_head_t;
+typedef struct wait_queue_head wait_queue_head_t;
struct task_struct;
@@ -49,82 +41,76 @@ struct task_struct;
 * Macros for declaration and initialisation of the datatypes
*/
-#define __WAITQUEUE_INITIALIZER(name, tsk) { \
- .private = tsk, \
- .func = default_wake_function, \
- .task_list = { NULL, NULL } }
+#define __WAITQUEUE_INITIALIZER(name, tsk) { \
+ .private = tsk, \
+ .func = default_wake_function, \
+ .entry = { NULL, NULL } }
-#define DECLARE_WAITQUEUE(name, tsk) \
- wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk)
+#define DECLARE_WAITQUEUE(name, tsk) \
+ struct wait_queue_entry name = __WAITQUEUE_INITIALIZER(name, tsk)
-#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
- .lock = __SPIN_LOCK_UNLOCKED(name.lock), \
- .task_list = { &(name).task_list, &(name).task_list } }
+#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
+ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \
+ .head = { &(name).head, &(name).head } }
#define DECLARE_WAIT_QUEUE_HEAD(name) \
- wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
-
-#define __WAIT_BIT_KEY_INITIALIZER(word, bit) \
- { .flags = word, .bit_nr = bit, }
+ struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
-#define __WAIT_ATOMIC_T_KEY_INITIALIZER(p) \
- { .flags = p, .bit_nr = WAIT_ATOMIC_T_BIT_NR, }
+extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *);
-extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *);
-
-#define init_waitqueue_head(q) \
- do { \
- static struct lock_class_key __key; \
- \
- __init_waitqueue_head((q), #q, &__key); \
+#define init_waitqueue_head(wq_head) \
+ do { \
+ static struct lock_class_key __key; \
+ \
+ __init_waitqueue_head((wq_head), #wq_head, &__key); \
} while (0)
#ifdef CONFIG_LOCKDEP
# define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \
({ init_waitqueue_head(&name); name; })
# define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) \
- wait_queue_head_t name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name)
+ struct wait_queue_head name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name)
#else
# define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name)
#endif
-static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
+static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p)
{
- q->flags = 0;
- q->private = p;
- q->func = default_wake_function;
+ wq_entry->flags = 0;
+ wq_entry->private = p;
+ wq_entry->func = default_wake_function;
}
static inline void
-init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func)
+init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func)
{
- q->flags = 0;
- q->private = NULL;
- q->func = func;
+ wq_entry->flags = 0;
+ wq_entry->private = NULL;
+ wq_entry->func = func;
}
/**
* waitqueue_active -- locklessly test for waiters on the queue
- * @q: the waitqueue to test for waiters
+ * @wq_head: the waitqueue to test for waiters
*
* returns true if the wait list is not empty
*
* NOTE: this function is lockless and requires care, incorrect usage _will_
* lead to sporadic and non-obvious failure.
*
- * Use either while holding wait_queue_head_t::lock or when used for wakeups
+ * Use either while holding wait_queue_head::lock or when used for wakeups
* with an extra smp_mb() like:
*
* CPU0 - waker CPU1 - waiter
*
* for (;;) {
- * @cond = true; prepare_to_wait(&wq, &wait, state);
+ * @cond = true; prepare_to_wait(&wq_head, &wait, state);
* smp_mb(); // smp_mb() from set_current_state()
- * if (waitqueue_active(wq)) if (@cond)
- * wake_up(wq); break;
+ * if (waitqueue_active(wq_head)) if (@cond)
+ * wake_up(wq_head); break;
* schedule();
* }
- * finish_wait(&wq, &wait);
+ * finish_wait(&wq_head, &wait);
*
* Because without the explicit smp_mb() it's possible for the
* waitqueue_active() load to get hoisted over the @cond store such that we'll
@@ -133,20 +119,20 @@ init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func)
* Also note that this 'optimization' trades a spin_lock() for an smp_mb(),
* which (when the lock is uncontended) are of roughly equal cost.
*/
-static inline int waitqueue_active(wait_queue_head_t *q)
+static inline int waitqueue_active(struct wait_queue_head *wq_head)
{
- return !list_empty(&q->task_list);
+ return !list_empty(&wq_head->head);
}
/**
* wq_has_sleeper - check if there are any waiting processes
- * @wq: wait queue head
+ * @wq_head: wait queue head
*
- * Returns true if wq has waiting processes
+ * Returns true if wq_head has waiting processes
*
* Please refer to the comment for waitqueue_active.
*/
-static inline bool wq_has_sleeper(wait_queue_head_t *wq)
+static inline bool wq_has_sleeper(struct wait_queue_head *wq_head)
{
/*
* We need to be sure we are in sync with the
@@ -156,63 +142,51 @@ static inline bool wq_has_sleeper(wait_queue_head_t *wq)
* waiting side.
*/
smp_mb();
- return waitqueue_active(wq);
+ return waitqueue_active(wq_head);
}
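/*
 * Illustrative sketch (not part of the patch): the waker/waiter pairing
 * described in the waitqueue_active() comment above, written out as plain C.
 * The waker's smp_mb() pairs with the barrier in set_current_state() issued
 * from prepare_to_wait(), so the store to the condition cannot be reordered
 * past the waitqueue_active() check.  example_wq, example_cond and both
 * functions are invented names.
 */
static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static bool example_cond;

static void example_waker(void)
{
	WRITE_ONCE(example_cond, true);
	smp_mb();				/* pairs with set_current_state() */
	if (waitqueue_active(&example_wq))
		wake_up(&example_wq);
}

static void example_waiter(void)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&example_wq, &wait, TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(example_cond))
			break;
		schedule();
	}
	finish_wait(&example_wq, &wait);
}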
-extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
-extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait);
-extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
+extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
+extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
+extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
-static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
+static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
- list_add(&new->task_list, &head->task_list);
+ list_add(&wq_entry->entry, &wq_head->head);
}
/*
* Used for wake-one threads:
*/
static inline void
-__add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
+__add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
- wait->flags |= WQ_FLAG_EXCLUSIVE;
- __add_wait_queue(q, wait);
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+ __add_wait_queue(wq_head, wq_entry);
}
-static inline void __add_wait_queue_tail(wait_queue_head_t *head,
- wait_queue_t *new)
+static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
- list_add_tail(&new->task_list, &head->task_list);
+ list_add_tail(&wq_entry->entry, &wq_head->head);
}
static inline void
-__add_wait_queue_tail_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
+__add_wait_queue_entry_tail_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
- wait->flags |= WQ_FLAG_EXCLUSIVE;
- __add_wait_queue_tail(q, wait);
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+ __add_wait_queue_entry_tail(wq_head, wq_entry);
}
static inline void
-__remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
+__remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
- list_del(&old->task_list);
+ list_del(&wq_entry->entry);
}
-typedef int wait_bit_action_f(struct wait_bit_key *, int mode);
-void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
-void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
-void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr);
-void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
-void __wake_up_bit(wait_queue_head_t *, void *, int);
-int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, wait_bit_action_f *, unsigned);
-int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, wait_bit_action_f *, unsigned);
-void wake_up_bit(void *, int);
-void wake_up_atomic_t(atomic_t *);
-int out_of_line_wait_on_bit(void *, int, wait_bit_action_f *, unsigned);
-int out_of_line_wait_on_bit_timeout(void *, int, wait_bit_action_f *, unsigned, unsigned long);
-int out_of_line_wait_on_bit_lock(void *, int, wait_bit_action_f *, unsigned);
-int out_of_line_wait_on_atomic_t(atomic_t *, int (*)(atomic_t *), unsigned);
-wait_queue_head_t *bit_waitqueue(void *, int);
+void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
+void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
+void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
+void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
+void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr);
#define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL)
#define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL)
@@ -228,28 +202,28 @@ wait_queue_head_t *bit_waitqueue(void *, int);
/*
* Wakeup macros to be used to report events to the targets.
*/
-#define wake_up_poll(x, m) \
+#define wake_up_poll(x, m) \
__wake_up(x, TASK_NORMAL, 1, (void *) (m))
-#define wake_up_locked_poll(x, m) \
+#define wake_up_locked_poll(x, m) \
__wake_up_locked_key((x), TASK_NORMAL, (void *) (m))
-#define wake_up_interruptible_poll(x, m) \
+#define wake_up_interruptible_poll(x, m) \
__wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m))
-#define wake_up_interruptible_sync_poll(x, m) \
+#define wake_up_interruptible_sync_poll(x, m) \
__wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m))
-#define ___wait_cond_timeout(condition) \
-({ \
- bool __cond = (condition); \
- if (__cond && !__ret) \
- __ret = 1; \
- __cond || !__ret; \
+#define ___wait_cond_timeout(condition) \
+({ \
+ bool __cond = (condition); \
+ if (__cond && !__ret) \
+ __ret = 1; \
+ __cond || !__ret; \
})
-#define ___wait_is_interruptible(state) \
- (!__builtin_constant_p(state) || \
- state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \
+#define ___wait_is_interruptible(state) \
+ (!__builtin_constant_p(state) || \
+ state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \
-extern void init_wait_entry(wait_queue_t *__wait, int flags);
+extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags);
/*
* The below macro ___wait_event() has an explicit shadow of the __ret
@@ -263,108 +237,108 @@ extern void init_wait_entry(wait_queue_t *__wait, int flags);
* otherwise.
*/
-#define ___wait_event(wq, condition, state, exclusive, ret, cmd) \
-({ \
- __label__ __out; \
- wait_queue_t __wait; \
- long __ret = ret; /* explicit shadow */ \
- \
- init_wait_entry(&__wait, exclusive ? WQ_FLAG_EXCLUSIVE : 0); \
- for (;;) { \
- long __int = prepare_to_wait_event(&wq, &__wait, state);\
- \
- if (condition) \
- break; \
- \
- if (___wait_is_interruptible(state) && __int) { \
- __ret = __int; \
- goto __out; \
- } \
- \
- cmd; \
- } \
- finish_wait(&wq, &__wait); \
-__out: __ret; \
+#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) \
+({ \
+ __label__ __out; \
+ struct wait_queue_entry __wq_entry; \
+ long __ret = ret; /* explicit shadow */ \
+ \
+ init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0); \
+ for (;;) { \
+ long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\
+ \
+ if (condition) \
+ break; \
+ \
+ if (___wait_is_interruptible(state) && __int) { \
+ __ret = __int; \
+ goto __out; \
+ } \
+ \
+ cmd; \
+ } \
+ finish_wait(&wq_head, &__wq_entry); \
+__out: __ret; \
})
-#define __wait_event(wq, condition) \
- (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
+#define __wait_event(wq_head, condition) \
+ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
schedule())
/**
* wait_event - sleep until a condition gets true
- * @wq: the waitqueue to wait on
+ * @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
*
* The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
* @condition evaluates to true. The @condition is checked each time
- * the waitqueue @wq is woken up.
+ * the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*/
-#define wait_event(wq, condition) \
-do { \
- might_sleep(); \
- if (condition) \
- break; \
- __wait_event(wq, condition); \
+#define wait_event(wq_head, condition) \
+do { \
+ might_sleep(); \
+ if (condition) \
+ break; \
+ __wait_event(wq_head, condition); \
} while (0)
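/*
 * Illustrative sketch (not part of the patch): the minimal producer/consumer
 * pairing for wait_event()/wake_up() as documented above -- update the
 * condition first, then wake the queue.  example_ev_wq, example_ready and
 * both functions are invented names.
 */
static DECLARE_WAIT_QUEUE_HEAD(example_ev_wq);
static bool example_ready;

static void example_producer(void)
{
	WRITE_ONCE(example_ready, true);	/* change the condition ... */
	wake_up(&example_ev_wq);		/* ... then wake the waiters */
}

static void example_consumer(void)
{
	/* Sleeps in TASK_UNINTERRUPTIBLE until example_ready reads true. */
	wait_event(example_ev_wq, READ_ONCE(example_ready));
}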
-#define __io_wait_event(wq, condition) \
- (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
+#define __io_wait_event(wq_head, condition) \
+ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
io_schedule())
/*
* io_wait_event() -- like wait_event() but with io_schedule()
*/
-#define io_wait_event(wq, condition) \
-do { \
- might_sleep(); \
- if (condition) \
- break; \
- __io_wait_event(wq, condition); \
+#define io_wait_event(wq_head, condition) \
+do { \
+ might_sleep(); \
+ if (condition) \
+ break; \
+ __io_wait_event(wq_head, condition); \
} while (0)
-#define __wait_event_freezable(wq, condition) \
- ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
+#define __wait_event_freezable(wq_head, condition) \
+ ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \
schedule(); try_to_freeze())
/**
* wait_event_freezable - sleep (or freeze) until a condition gets true
- * @wq: the waitqueue to wait on
+ * @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
*
* The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute
* to system load) until the @condition evaluates to true. The
- * @condition is checked each time the waitqueue @wq is woken up.
+ * @condition is checked each time the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*/
-#define wait_event_freezable(wq, condition) \
-({ \
- int __ret = 0; \
- might_sleep(); \
- if (!(condition)) \
- __ret = __wait_event_freezable(wq, condition); \
- __ret; \
+#define wait_event_freezable(wq_head, condition) \
+({ \
+ int __ret = 0; \
+ might_sleep(); \
+ if (!(condition)) \
+ __ret = __wait_event_freezable(wq_head, condition); \
+ __ret; \
})
-#define __wait_event_timeout(wq, condition, timeout) \
- ___wait_event(wq, ___wait_cond_timeout(condition), \
- TASK_UNINTERRUPTIBLE, 0, timeout, \
+#define __wait_event_timeout(wq_head, condition, timeout) \
+ ___wait_event(wq_head, ___wait_cond_timeout(condition), \
+ TASK_UNINTERRUPTIBLE, 0, timeout, \
__ret = schedule_timeout(__ret))
/**
* wait_event_timeout - sleep until a condition gets true or a timeout elapses
- * @wq: the waitqueue to wait on
+ * @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @timeout: timeout, in jiffies
*
* The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
* @condition evaluates to true. The @condition is checked each time
- * the waitqueue @wq is woken up.
+ * the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
@@ -375,83 +349,83 @@ do { \
* or the remaining jiffies (at least 1) if the @condition evaluated
* to %true before the @timeout elapsed.
*/
-#define wait_event_timeout(wq, condition, timeout) \
-({ \
- long __ret = timeout; \
- might_sleep(); \
- if (!___wait_cond_timeout(condition)) \
- __ret = __wait_event_timeout(wq, condition, timeout); \
- __ret; \
+#define wait_event_timeout(wq_head, condition, timeout) \
+({ \
+ long __ret = timeout; \
+ might_sleep(); \
+ if (!___wait_cond_timeout(condition)) \
+ __ret = __wait_event_timeout(wq_head, condition, timeout); \
+ __ret; \
})
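/*
 * Illustrative sketch (not part of the patch): interpreting the
 * wait_event_timeout() return value described above.  example_tmo_wq and
 * example_tmo_done are invented names; HZ / 10 is an arbitrary 100ms budget.
 */
static DECLARE_WAIT_QUEUE_HEAD(example_tmo_wq);
static bool example_tmo_done;

static int example_wait_with_budget(void)
{
	long left = wait_event_timeout(example_tmo_wq,
				       READ_ONCE(example_tmo_done), HZ / 10);

	if (!left)
		return -ETIMEDOUT;	/* condition still false when time ran out */
	return 0;			/* condition true, 'left' jiffies remained */
}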
-#define __wait_event_freezable_timeout(wq, condition, timeout) \
- ___wait_event(wq, ___wait_cond_timeout(condition), \
- TASK_INTERRUPTIBLE, 0, timeout, \
+#define __wait_event_freezable_timeout(wq_head, condition, timeout) \
+ ___wait_event(wq_head, ___wait_cond_timeout(condition), \
+ TASK_INTERRUPTIBLE, 0, timeout, \
__ret = schedule_timeout(__ret); try_to_freeze())
/*
* like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid
* increasing load and is freezable.
*/
-#define wait_event_freezable_timeout(wq, condition, timeout) \
-({ \
- long __ret = timeout; \
- might_sleep(); \
- if (!___wait_cond_timeout(condition)) \
- __ret = __wait_event_freezable_timeout(wq, condition, timeout); \
- __ret; \
+#define wait_event_freezable_timeout(wq_head, condition, timeout) \
+({ \
+ long __ret = timeout; \
+ might_sleep(); \
+ if (!___wait_cond_timeout(condition)) \
+ __ret = __wait_event_freezable_timeout(wq_head, condition, timeout); \
+ __ret; \
})
-#define __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2) \
- (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 1, 0, \
+#define __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2) \
+ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 1, 0, \
cmd1; schedule(); cmd2)
/*
* Just like wait_event_cmd(), except it sets exclusive flag
*/
-#define wait_event_exclusive_cmd(wq, condition, cmd1, cmd2) \
-do { \
- if (condition) \
- break; \
- __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2); \
+#define wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2) \
+do { \
+ if (condition) \
+ break; \
+ __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2); \
} while (0)
-#define __wait_event_cmd(wq, condition, cmd1, cmd2) \
- (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
+#define __wait_event_cmd(wq_head, condition, cmd1, cmd2) \
+ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
cmd1; schedule(); cmd2)
/**
* wait_event_cmd - sleep until a condition gets true
- * @wq: the waitqueue to wait on
+ * @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @cmd1: the command will be executed before sleep
* @cmd2: the command will be executed after sleep
*
* The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
* @condition evaluates to true. The @condition is checked each time
- * the waitqueue @wq is woken up.
+ * the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*/
-#define wait_event_cmd(wq, condition, cmd1, cmd2) \
-do { \
- if (condition) \
- break; \
- __wait_event_cmd(wq, condition, cmd1, cmd2); \
+#define wait_event_cmd(wq_head, condition, cmd1, cmd2) \
+do { \
+ if (condition) \
+ break; \
+ __wait_event_cmd(wq_head, condition, cmd1, cmd2); \
} while (0)
-#define __wait_event_interruptible(wq, condition) \
- ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
+#define __wait_event_interruptible(wq_head, condition) \
+ ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \
schedule())
/**
* wait_event_interruptible - sleep until a condition gets true
- * @wq: the waitqueue to wait on
+ * @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
*
* The process is put to sleep (TASK_INTERRUPTIBLE) until the
* @condition evaluates to true or a signal is received.
- * The @condition is checked each time the waitqueue @wq is woken up.
+ * The @condition is checked each time the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
@@ -459,29 +433,29 @@ do { \
* The function will return -ERESTARTSYS if it was interrupted by a
* signal and 0 if @condition evaluated to true.
*/
-#define wait_event_interruptible(wq, condition) \
-({ \
- int __ret = 0; \
- might_sleep(); \
- if (!(condition)) \
- __ret = __wait_event_interruptible(wq, condition); \
- __ret; \
+#define wait_event_interruptible(wq_head, condition) \
+({ \
+ int __ret = 0; \
+ might_sleep(); \
+ if (!(condition)) \
+ __ret = __wait_event_interruptible(wq_head, condition); \
+ __ret; \
})
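/*
 * Illustrative sketch (not part of the patch): the usual -ERESTARTSYS
 * handling for wait_event_interruptible() per the return-value rules above.
 * example_intr_wq, example_intr_cond and the function are invented names.
 */
static DECLARE_WAIT_QUEUE_HEAD(example_intr_wq);
static bool example_intr_cond;

static int example_wait_interruptible(void)
{
	int ret = wait_event_interruptible(example_intr_wq,
					   READ_ONCE(example_intr_cond));

	if (ret)		/* -ERESTARTSYS: a signal arrived first */
		return ret;	/* let the caller restart or fail the syscall */
	return 0;		/* condition became true */
}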
-#define __wait_event_interruptible_timeout(wq, condition, timeout) \
- ___wait_event(wq, ___wait_cond_timeout(condition), \
- TASK_INTERRUPTIBLE, 0, timeout, \
+#define __wait_event_interruptible_timeout(wq_head, condition, timeout) \
+ ___wait_event(wq_head, ___wait_cond_timeout(condition), \
+ TASK_INTERRUPTIBLE, 0, timeout, \
__ret = schedule_timeout(__ret))
/**
* wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses
- * @wq: the waitqueue to wait on
+ * @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @timeout: timeout, in jiffies
*
* The process is put to sleep (TASK_INTERRUPTIBLE) until the
* @condition evaluates to true or a signal is received.
- * The @condition is checked each time the waitqueue @wq is woken up.
+ * The @condition is checked each time the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
@@ -493,50 +467,49 @@ do { \
* to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
* interrupted by a signal.
*/
-#define wait_event_interruptible_timeout(wq, condition, timeout) \
-({ \
- long __ret = timeout; \
- might_sleep(); \
- if (!___wait_cond_timeout(condition)) \
- __ret = __wait_event_interruptible_timeout(wq, \
- condition, timeout); \
- __ret; \
+#define wait_event_interruptible_timeout(wq_head, condition, timeout) \
+({ \
+ long __ret = timeout; \
+ might_sleep(); \
+ if (!___wait_cond_timeout(condition)) \
+ __ret = __wait_event_interruptible_timeout(wq_head, \
+ condition, timeout); \
+ __ret; \
})
-#define __wait_event_hrtimeout(wq, condition, timeout, state) \
-({ \
- int __ret = 0; \
- struct hrtimer_sleeper __t; \
- \
- hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, \
- HRTIMER_MODE_REL); \
- hrtimer_init_sleeper(&__t, current); \
- if ((timeout) != KTIME_MAX) \
- hrtimer_start_range_ns(&__t.timer, timeout, \
- current->timer_slack_ns, \
- HRTIMER_MODE_REL); \
- \
- __ret = ___wait_event(wq, condition, state, 0, 0, \
- if (!__t.task) { \
- __ret = -ETIME; \
- break; \
- } \
- schedule()); \
- \
- hrtimer_cancel(&__t.timer); \
- destroy_hrtimer_on_stack(&__t.timer); \
- __ret; \
+#define __wait_event_hrtimeout(wq_head, condition, timeout, state) \
+({ \
+ int __ret = 0; \
+ struct hrtimer_sleeper __t; \
+ \
+ hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); \
+ hrtimer_init_sleeper(&__t, current); \
+ if ((timeout) != KTIME_MAX) \
+ hrtimer_start_range_ns(&__t.timer, timeout, \
+ current->timer_slack_ns, \
+ HRTIMER_MODE_REL); \
+ \
+ __ret = ___wait_event(wq_head, condition, state, 0, 0, \
+ if (!__t.task) { \
+ __ret = -ETIME; \
+ break; \
+ } \
+ schedule()); \
+ \
+ hrtimer_cancel(&__t.timer); \
+ destroy_hrtimer_on_stack(&__t.timer); \
+ __ret; \
})
/**
* wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses
- * @wq: the waitqueue to wait on
+ * @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @timeout: timeout, as a ktime_t
*
* The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
* @condition evaluates to true or a signal is received.
- * The @condition is checked each time the waitqueue @wq is woken up.
+ * The @condition is checked each time the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
@@ -544,25 +517,25 @@ do { \
* The function returns 0 if @condition became true, or -ETIME if the timeout
* elapsed.
*/
-#define wait_event_hrtimeout(wq, condition, timeout) \
-({ \
- int __ret = 0; \
- might_sleep(); \
- if (!(condition)) \
- __ret = __wait_event_hrtimeout(wq, condition, timeout, \
- TASK_UNINTERRUPTIBLE); \
- __ret; \
+#define wait_event_hrtimeout(wq_head, condition, timeout) \
+({ \
+ int __ret = 0; \
+ might_sleep(); \
+ if (!(condition)) \
+ __ret = __wait_event_hrtimeout(wq_head, condition, timeout, \
+ TASK_UNINTERRUPTIBLE); \
+ __ret; \
})
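/*
 * Illustrative sketch (not part of the patch): wait_event_hrtimeout() takes
 * a ktime_t budget and returns 0 or -ETIME as documented above.
 * example_hr_wq and example_hr_done are invented names; ms_to_ktime() is the
 * existing ktime helper.
 */
static DECLARE_WAIT_QUEUE_HEAD(example_hr_wq);
static bool example_hr_done;

static int example_wait_5ms(void)
{
	/* 0 if the condition became true within 5ms, -ETIME otherwise. */
	return wait_event_hrtimeout(example_hr_wq, READ_ONCE(example_hr_done),
				    ms_to_ktime(5));
}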
/**
* wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses
- * @wq: the waitqueue to wait on
+ * @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @timeout: timeout, as a ktime_t
*
* The process is put to sleep (TASK_INTERRUPTIBLE) until the
* @condition evaluates to true or a signal is received.
- * The @condition is checked each time the waitqueue @wq is woken up.
+ * The @condition is checked each time the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
@@ -570,73 +543,73 @@ do { \
* The function returns 0 if @condition became true, -ERESTARTSYS if it was
* interrupted by a signal, or -ETIME if the timeout elapsed.
*/
-#define wait_event_interruptible_hrtimeout(wq, condition, timeout) \
-({ \
- long __ret = 0; \
- might_sleep(); \
- if (!(condition)) \
- __ret = __wait_event_hrtimeout(wq, condition, timeout, \
- TASK_INTERRUPTIBLE); \
- __ret; \
+#define wait_event_interruptible_hrtimeout(wq, condition, timeout) \
+({ \
+ long __ret = 0; \
+ might_sleep(); \
+ if (!(condition)) \
+ __ret = __wait_event_hrtimeout(wq, condition, timeout, \
+ TASK_INTERRUPTIBLE); \
+ __ret; \
})
-#define __wait_event_interruptible_exclusive(wq, condition) \
- ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \
+#define __wait_event_interruptible_exclusive(wq, condition) \
+ ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \
schedule())
-#define wait_event_interruptible_exclusive(wq, condition) \
-({ \
- int __ret = 0; \
- might_sleep(); \
- if (!(condition)) \
- __ret = __wait_event_interruptible_exclusive(wq, condition);\
- __ret; \
+#define wait_event_interruptible_exclusive(wq, condition) \
+({ \
+ int __ret = 0; \
+ might_sleep(); \
+ if (!(condition)) \
+ __ret = __wait_event_interruptible_exclusive(wq, condition); \
+ __ret; \
})
-#define __wait_event_killable_exclusive(wq, condition) \
- ___wait_event(wq, condition, TASK_KILLABLE, 1, 0, \
+#define __wait_event_killable_exclusive(wq, condition) \
+ ___wait_event(wq, condition, TASK_KILLABLE, 1, 0, \
schedule())
-#define wait_event_killable_exclusive(wq, condition) \
-({ \
- int __ret = 0; \
- might_sleep(); \
- if (!(condition)) \
- __ret = __wait_event_killable_exclusive(wq, condition); \
- __ret; \
+#define wait_event_killable_exclusive(wq, condition) \
+({ \
+ int __ret = 0; \
+ might_sleep(); \
+ if (!(condition)) \
+ __ret = __wait_event_killable_exclusive(wq, condition); \
+ __ret; \
})
-#define __wait_event_freezable_exclusive(wq, condition) \
- ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \
+#define __wait_event_freezable_exclusive(wq, condition) \
+ ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \
schedule(); try_to_freeze())
-#define wait_event_freezable_exclusive(wq, condition) \
-({ \
- int __ret = 0; \
- might_sleep(); \
- if (!(condition)) \
- __ret = __wait_event_freezable_exclusive(wq, condition);\
- __ret; \
+#define wait_event_freezable_exclusive(wq, condition) \
+({ \
+ int __ret = 0; \
+ might_sleep(); \
+ if (!(condition)) \
+ __ret = __wait_event_freezable_exclusive(wq, condition); \
+ __ret; \
})
-extern int do_wait_intr(wait_queue_head_t *, wait_queue_t *);
-extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *);
-
-#define __wait_event_interruptible_locked(wq, condition, exclusive, fn) \
-({ \
- int __ret; \
- DEFINE_WAIT(__wait); \
- if (exclusive) \
- __wait.flags |= WQ_FLAG_EXCLUSIVE; \
- do { \
- __ret = fn(&(wq), &__wait); \
- if (__ret) \
- break; \
- } while (!(condition)); \
- __remove_wait_queue(&(wq), &__wait); \
- __set_current_state(TASK_RUNNING); \
- __ret; \
+extern int do_wait_intr(wait_queue_head_t *, wait_queue_entry_t *);
+extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *);
+
+#define __wait_event_interruptible_locked(wq, condition, exclusive, fn) \
+({ \
+ int __ret; \
+ DEFINE_WAIT(__wait); \
+ if (exclusive) \
+ __wait.flags |= WQ_FLAG_EXCLUSIVE; \
+ do { \
+ __ret = fn(&(wq), &__wait); \
+ if (__ret) \
+ break; \
+ } while (!(condition)); \
+ __remove_wait_queue(&(wq), &__wait); \
+ __set_current_state(TASK_RUNNING); \
+ __ret; \
})
@@ -663,8 +636,8 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *);
* The function will return -ERESTARTSYS if it was interrupted by a
* signal and 0 if @condition evaluated to true.
*/
-#define wait_event_interruptible_locked(wq, condition) \
- ((condition) \
+#define wait_event_interruptible_locked(wq, condition) \
+ ((condition) \
? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr))
/**
@@ -690,8 +663,8 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *);
* The function will return -ERESTARTSYS if it was interrupted by a
* signal and 0 if @condition evaluated to true.
*/
-#define wait_event_interruptible_locked_irq(wq, condition) \
- ((condition) \
+#define wait_event_interruptible_locked_irq(wq, condition) \
+ ((condition) \
? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr_irq))
/**
@@ -721,8 +694,8 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *);
* The function will return -ERESTARTSYS if it was interrupted by a
* signal and 0 if @condition evaluated to true.
*/
-#define wait_event_interruptible_exclusive_locked(wq, condition) \
- ((condition) \
+#define wait_event_interruptible_exclusive_locked(wq, condition) \
+ ((condition) \
? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr))
/**
@@ -752,12 +725,12 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *);
* The function will return -ERESTARTSYS if it was interrupted by a
* signal and 0 if @condition evaluated to true.
*/
-#define wait_event_interruptible_exclusive_locked_irq(wq, condition) \
- ((condition) \
+#define wait_event_interruptible_exclusive_locked_irq(wq, condition) \
+ ((condition) \
? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr_irq))
-#define __wait_event_killable(wq, condition) \
+#define __wait_event_killable(wq, condition) \
___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule())
/**
@@ -775,21 +748,21 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *);
* The function will return -ERESTARTSYS if it was interrupted by a
* signal and 0 if @condition evaluated to true.
*/
-#define wait_event_killable(wq, condition) \
-({ \
- int __ret = 0; \
- might_sleep(); \
- if (!(condition)) \
- __ret = __wait_event_killable(wq, condition); \
- __ret; \
+#define wait_event_killable(wq_head, condition) \
+({ \
+ int __ret = 0; \
+ might_sleep(); \
+ if (!(condition)) \
+ __ret = __wait_event_killable(wq_head, condition); \
+ __ret; \
})
-#define __wait_event_lock_irq(wq, condition, lock, cmd) \
- (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
- spin_unlock_irq(&lock); \
- cmd; \
- schedule(); \
+#define __wait_event_lock_irq(wq_head, condition, lock, cmd) \
+ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
+ spin_unlock_irq(&lock); \
+ cmd; \
+ schedule(); \
spin_lock_irq(&lock))
/**
@@ -797,7 +770,7 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *);
* condition is checked under the lock. This
* is expected to be called with the lock
* taken.
- * @wq: the waitqueue to wait on
+ * @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @lock: a locked spinlock_t, which will be released before cmd
* and schedule() and reacquired afterwards.
@@ -806,7 +779,7 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *);
*
* The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
* @condition evaluates to true. The @condition is checked each time
- * the waitqueue @wq is woken up.
+ * the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
@@ -815,11 +788,11 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *);
* dropped before invoking the cmd and going to sleep and is reacquired
* afterwards.
*/
-#define wait_event_lock_irq_cmd(wq, condition, lock, cmd) \
-do { \
- if (condition) \
- break; \
- __wait_event_lock_irq(wq, condition, lock, cmd); \
+#define wait_event_lock_irq_cmd(wq_head, condition, lock, cmd) \
+do { \
+ if (condition) \
+ break; \
+ __wait_event_lock_irq(wq_head, condition, lock, cmd); \
} while (0)
/**
@@ -827,14 +800,14 @@ do { \
* condition is checked under the lock. This
* is expected to be called with the lock
* taken.
- * @wq: the waitqueue to wait on
+ * @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @lock: a locked spinlock_t, which will be released before schedule()
* and reacquired afterwards.
*
* The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
* @condition evaluates to true. The @condition is checked each time
- * the waitqueue @wq is woken up.
+ * the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
@@ -842,26 +815,26 @@ do { \
* This is supposed to be called while holding the lock. The lock is
* dropped before going to sleep and is reacquired afterwards.
*/
-#define wait_event_lock_irq(wq, condition, lock) \
-do { \
- if (condition) \
- break; \
- __wait_event_lock_irq(wq, condition, lock, ); \
+#define wait_event_lock_irq(wq_head, condition, lock) \
+do { \
+ if (condition) \
+ break; \
+ __wait_event_lock_irq(wq_head, condition, lock, ); \
} while (0)
-#define __wait_event_interruptible_lock_irq(wq, condition, lock, cmd) \
- ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
- spin_unlock_irq(&lock); \
- cmd; \
- schedule(); \
+#define __wait_event_interruptible_lock_irq(wq_head, condition, lock, cmd) \
+ ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \
+ spin_unlock_irq(&lock); \
+ cmd; \
+ schedule(); \
spin_lock_irq(&lock))
/**
* wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true.
* The condition is checked under the lock. This is expected to
* be called with the lock taken.
- * @wq: the waitqueue to wait on
+ * @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @lock: a locked spinlock_t, which will be released before cmd and
* schedule() and reacquired afterwards.
@@ -870,7 +843,7 @@ do { \
*
* The process is put to sleep (TASK_INTERRUPTIBLE) until the
* @condition evaluates to true or a signal is received. The @condition is
- * checked each time the waitqueue @wq is woken up.
+ * checked each time the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
@@ -882,27 +855,27 @@ do { \
* The macro will return -ERESTARTSYS if it was interrupted by a signal
* and 0 if @condition evaluated to true.
*/
-#define wait_event_interruptible_lock_irq_cmd(wq, condition, lock, cmd) \
-({ \
- int __ret = 0; \
- if (!(condition)) \
- __ret = __wait_event_interruptible_lock_irq(wq, \
- condition, lock, cmd); \
- __ret; \
+#define wait_event_interruptible_lock_irq_cmd(wq_head, condition, lock, cmd) \
+({ \
+ int __ret = 0; \
+ if (!(condition)) \
+ __ret = __wait_event_interruptible_lock_irq(wq_head, \
+ condition, lock, cmd); \
+ __ret; \
})
/**
* wait_event_interruptible_lock_irq - sleep until a condition gets true.
* The condition is checked under the lock. This is expected
* to be called with the lock taken.
- * @wq: the waitqueue to wait on
+ * @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @lock: a locked spinlock_t, which will be released before schedule()
* and reacquired afterwards.
*
* The process is put to sleep (TASK_INTERRUPTIBLE) until the
* @condition evaluates to true or signal is received. The @condition is
- * checked each time the waitqueue @wq is woken up.
+ * checked each time the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
@@ -913,28 +886,28 @@ do { \
* The macro will return -ERESTARTSYS if it was interrupted by a signal
* and 0 if @condition evaluated to true.
*/
-#define wait_event_interruptible_lock_irq(wq, condition, lock) \
-({ \
- int __ret = 0; \
- if (!(condition)) \
- __ret = __wait_event_interruptible_lock_irq(wq, \
- condition, lock,); \
- __ret; \
+#define wait_event_interruptible_lock_irq(wq_head, condition, lock) \
+({ \
+ int __ret = 0; \
+ if (!(condition)) \
+ __ret = __wait_event_interruptible_lock_irq(wq_head, \
+ condition, lock,); \
+ __ret; \
})
-#define __wait_event_interruptible_lock_irq_timeout(wq, condition, \
- lock, timeout) \
- ___wait_event(wq, ___wait_cond_timeout(condition), \
- TASK_INTERRUPTIBLE, 0, timeout, \
- spin_unlock_irq(&lock); \
- __ret = schedule_timeout(__ret); \
+#define __wait_event_interruptible_lock_irq_timeout(wq_head, condition, \
+ lock, timeout) \
+ ___wait_event(wq_head, ___wait_cond_timeout(condition), \
+ TASK_INTERRUPTIBLE, 0, timeout, \
+ spin_unlock_irq(&lock); \
+ __ret = schedule_timeout(__ret); \
spin_lock_irq(&lock));
/**
* wait_event_interruptible_lock_irq_timeout - sleep until a condition gets
* true or a timeout elapses. The condition is checked under
* the lock. This is expected to be called with the lock taken.
- * @wq: the waitqueue to wait on
+ * @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @lock: a locked spinlock_t, which will be released before schedule()
* and reacquired afterwards.
@@ -942,7 +915,7 @@ do { \
*
* The process is put to sleep (TASK_INTERRUPTIBLE) until the
* @condition evaluates to true or signal is received. The @condition is
- * checked each time the waitqueue @wq is woken up.
+ * checked each time the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
@@ -954,263 +927,42 @@ do { \
* was interrupted by a signal, and the remaining jiffies otherwise
* if the condition evaluated to true before the timeout elapsed.
*/
-#define wait_event_interruptible_lock_irq_timeout(wq, condition, lock, \
- timeout) \
-({ \
- long __ret = timeout; \
- if (!___wait_cond_timeout(condition)) \
- __ret = __wait_event_interruptible_lock_irq_timeout( \
- wq, condition, lock, timeout); \
- __ret; \
+#define wait_event_interruptible_lock_irq_timeout(wq_head, condition, lock, \
+ timeout) \
+({ \
+ long __ret = timeout; \
+ if (!___wait_cond_timeout(condition)) \
+ __ret = __wait_event_interruptible_lock_irq_timeout( \
+ wq_head, condition, lock, timeout); \
+ __ret; \
})
/*
* Waitqueues which are removed from the waitqueue_head at wakeup time
*/
-void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
-void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state);
-long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state);
-void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
-long wait_woken(wait_queue_t *wait, unsigned mode, long timeout);
-int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
-int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
-int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
-
-#define DEFINE_WAIT_FUNC(name, function) \
- wait_queue_t name = { \
- .private = current, \
- .func = function, \
- .task_list = LIST_HEAD_INIT((name).task_list), \
+void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
+void prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
+long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
+void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
+long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
+int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
+int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
+
+#define DEFINE_WAIT_FUNC(name, function) \
+ struct wait_queue_entry name = { \
+ .private = current, \
+ .func = function, \
+ .entry = LIST_HEAD_INIT((name).entry), \
}
#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)
-#define DEFINE_WAIT_BIT(name, word, bit) \
- struct wait_bit_queue name = { \
- .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \
- .wait = { \
- .private = current, \
- .func = wake_bit_function, \
- .task_list = \
- LIST_HEAD_INIT((name).wait.task_list), \
- }, \
- }
-
-#define init_wait(wait) \
- do { \
- (wait)->private = current; \
- (wait)->func = autoremove_wake_function; \
- INIT_LIST_HEAD(&(wait)->task_list); \
- (wait)->flags = 0; \
+#define init_wait(wait) \
+ do { \
+ (wait)->private = current; \
+ (wait)->func = autoremove_wake_function; \
+ INIT_LIST_HEAD(&(wait)->entry); \
+ (wait)->flags = 0; \
} while (0)
-
-extern int bit_wait(struct wait_bit_key *, int);
-extern int bit_wait_io(struct wait_bit_key *, int);
-extern int bit_wait_timeout(struct wait_bit_key *, int);
-extern int bit_wait_io_timeout(struct wait_bit_key *, int);
-
-/**
- * wait_on_bit - wait for a bit to be cleared
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
- * @mode: the task state to sleep in
- *
- * There is a standard hashed waitqueue table for generic use. This
- * is the part of the hashtable's accessor API that waits on a bit.
- * For instance, if one were to have waiters on a bitflag, one would
- * call wait_on_bit() in threads waiting for the bit to clear.
- * One uses wait_on_bit() where one is waiting for the bit to clear,
- * but has no intention of setting it.
- * Returned value will be zero if the bit was cleared, or non-zero
- * if the process received a signal and the mode permitted wakeup
- * on that signal.
- */
-static inline int
-wait_on_bit(unsigned long *word, int bit, unsigned mode)
-{
- might_sleep();
- if (!test_bit(bit, word))
- return 0;
- return out_of_line_wait_on_bit(word, bit,
- bit_wait,
- mode);
-}
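/*
 * Illustrative sketch (not part of the patch): the documented wait_on_bit()
 * pairing.  One side waits for EXAMPLE_BUSY to clear, the other clears it
 * and wakes the hashed waitqueue.  EXAMPLE_BUSY, example_flags and both
 * functions are invented names.
 */
#define EXAMPLE_BUSY	0
static unsigned long example_flags;

static int example_wait_idle(void)
{
	/* 0 once the bit is clear, non-zero if a signal woke us first. */
	return wait_on_bit(&example_flags, EXAMPLE_BUSY, TASK_INTERRUPTIBLE);
}

static void example_finish(void)
{
	clear_bit(EXAMPLE_BUSY, &example_flags);
	smp_mb__after_atomic();		/* order the clear against waiter checks */
	wake_up_bit(&example_flags, EXAMPLE_BUSY);
}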
-
-/**
- * wait_on_bit_io - wait for a bit to be cleared
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
- * @mode: the task state to sleep in
- *
- * Use the standard hashed waitqueue table to wait for a bit
- * to be cleared. This is similar to wait_on_bit(), but calls
- * io_schedule() instead of schedule() for the actual waiting.
- *
- * Returned value will be zero if the bit was cleared, or non-zero
- * if the process received a signal and the mode permitted wakeup
- * on that signal.
- */
-static inline int
-wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
-{
- might_sleep();
- if (!test_bit(bit, word))
- return 0;
- return out_of_line_wait_on_bit(word, bit,
- bit_wait_io,
- mode);
-}
-
-/**
- * wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
- * @mode: the task state to sleep in
- * @timeout: timeout, in jiffies
- *
- * Use the standard hashed waitqueue table to wait for a bit
- * to be cleared. This is similar to wait_on_bit(), except also takes a
- * timeout parameter.
- *
- * Returned value will be zero if the bit was cleared before the
- * @timeout elapsed, or non-zero if the @timeout elapsed or process
- * received a signal and the mode permitted wakeup on that signal.
- */
-static inline int
-wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
- unsigned long timeout)
-{
- might_sleep();
- if (!test_bit(bit, word))
- return 0;
- return out_of_line_wait_on_bit_timeout(word, bit,
- bit_wait_timeout,
- mode, timeout);
-}
-
-/**
- * wait_on_bit_action - wait for a bit to be cleared
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
- * @action: the function used to sleep, which may take special actions
- * @mode: the task state to sleep in
- *
- * Use the standard hashed waitqueue table to wait for a bit
- * to be cleared, and allow the waiting action to be specified.
- * This is like wait_on_bit() but allows fine control of how the waiting
- * is done.
- *
- * Returned value will be zero if the bit was cleared, or non-zero
- * if the process received a signal and the mode permitted wakeup
- * on that signal.
- */
-static inline int
-wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
- unsigned mode)
-{
- might_sleep();
- if (!test_bit(bit, word))
- return 0;
- return out_of_line_wait_on_bit(word, bit, action, mode);
-}
-
-/**
- * wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
- * @mode: the task state to sleep in
- *
- * There is a standard hashed waitqueue table for generic use. This
- * is the part of the hashtable's accessor API that waits on a bit
- * when one intends to set it, for instance, trying to lock bitflags.
- * For instance, if one were to have waiters trying to set bitflag
- * and waiting for it to clear before setting it, one would call
- * wait_on_bit() in threads waiting to be able to set the bit.
- * One uses wait_on_bit_lock() where one is waiting for the bit to
- * clear with the intention of setting it, and when done, clearing it.
- *
- * Returns zero if the bit was (eventually) found to be clear and was
- * set. Returns non-zero if a signal was delivered to the process and
- * the @mode allows that signal to wake the process.
- */
-static inline int
-wait_on_bit_lock(unsigned long *word, int bit, unsigned mode)
-{
- might_sleep();
- if (!test_and_set_bit(bit, word))
- return 0;
- return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode);
-}
-
-/**
- * wait_on_bit_lock_io - wait for a bit to be cleared, when wanting to set it
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
- * @mode: the task state to sleep in
- *
- * Use the standard hashed waitqueue table to wait for a bit
- * to be cleared and then to atomically set it. This is similar
- * to wait_on_bit(), but calls io_schedule() instead of schedule()
- * for the actual waiting.
- *
- * Returns zero if the bit was (eventually) found to be clear and was
- * set. Returns non-zero if a signal was delivered to the process and
- * the @mode allows that signal to wake the process.
- */
-static inline int
-wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode)
-{
- might_sleep();
- if (!test_and_set_bit(bit, word))
- return 0;
- return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode);
-}
-
-/**
- * wait_on_bit_lock_action - wait for a bit to be cleared, when wanting to set it
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
- * @action: the function used to sleep, which may take special actions
- * @mode: the task state to sleep in
- *
- * Use the standard hashed waitqueue table to wait for a bit
- * to be cleared and then to set it, and allow the waiting action
- * to be specified.
- * This is like wait_on_bit() but allows fine control of how the waiting
- * is done.
- *
- * Returns zero if the bit was (eventually) found to be clear and was
- * set. Returns non-zero if a signal was delivered to the process and
- * the @mode allows that signal to wake the process.
- */
-static inline int
-wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
- unsigned mode)
-{
- might_sleep();
- if (!test_and_set_bit(bit, word))
- return 0;
- return out_of_line_wait_on_bit_lock(word, bit, action, mode);
-}
-
-/**
- * wait_on_atomic_t - Wait for an atomic_t to become 0
- * @val: The atomic value being waited on, a kernel virtual address
- * @action: the function used to sleep, which may take special actions
- * @mode: the task state to sleep in
- *
- * Wait for an atomic_t to become 0. We abuse the bit-wait waitqueue table for
- * the purpose of getting a waitqueue, but we set the key to a bit number
- * outside of the target 'word'.
- */
-static inline
-int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode)
-{
- might_sleep();
- if (atomic_read(val) == 0)
- return 0;
- return out_of_line_wait_on_atomic_t(val, action, mode);
-}
-
#endif /* _LINUX_WAIT_H */
diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
new file mode 100644
index 000000000000..12b26660d7e9
--- /dev/null
+++ b/include/linux/wait_bit.h
@@ -0,0 +1,261 @@
+#ifndef _LINUX_WAIT_BIT_H
+#define _LINUX_WAIT_BIT_H
+
+/*
+ * Linux wait-bit related types and methods:
+ */
+#include <linux/wait.h>
+
+struct wait_bit_key {
+ void *flags;
+ int bit_nr;
+#define WAIT_ATOMIC_T_BIT_NR -1
+ unsigned long timeout;
+};
+
+struct wait_bit_queue_entry {
+ struct wait_bit_key key;
+ struct wait_queue_entry wq_entry;
+};
+
+#define __WAIT_BIT_KEY_INITIALIZER(word, bit) \
+ { .flags = word, .bit_nr = bit, }
+
+#define __WAIT_ATOMIC_T_KEY_INITIALIZER(p) \
+ { .flags = p, .bit_nr = WAIT_ATOMIC_T_BIT_NR, }
+
+typedef int wait_bit_action_f(struct wait_bit_key *key, int mode);
+void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit);
+int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
+int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
+void wake_up_bit(void *word, int bit);
+void wake_up_atomic_t(atomic_t *p);
+int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode);
+int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout);
+int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode);
+int out_of_line_wait_on_atomic_t(atomic_t *p, int (*)(atomic_t *), unsigned int mode);
+struct wait_queue_head *bit_waitqueue(void *word, int bit);
+extern void __init wait_bit_init(void);
+
+int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
+
+#define DEFINE_WAIT_BIT(name, word, bit) \
+ struct wait_bit_queue_entry name = { \
+ .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \
+ .wq_entry = { \
+ .private = current, \
+ .func = wake_bit_function, \
+ .entry = \
+ LIST_HEAD_INIT((name).wq_entry.entry), \
+ }, \
+ }
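Roughly how DEFINE_WAIT_BIT() is expected to be consumed by the out-of-line helpers declared above — a sketch along the lines of kernel/sched/wait_bit.c, not a verbatim copy:

#include <linux/wait_bit.h>

int example_wait_on_bit(void *word, int bit, wait_bit_action_f *action, unsigned mode)
{
	struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
	DEFINE_WAIT_BIT(wq_entry, word, bit);

	/* Sleep via 'action' until the bit is observed clear, then dequeue. */
	return __wait_on_bit(wq_head, &wq_entry, action, mode);
}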
+
+extern int bit_wait(struct wait_bit_key *key, int bit);
+extern int bit_wait_io(struct wait_bit_key *key, int bit);
+extern int bit_wait_timeout(struct wait_bit_key *key, int bit);
+extern int bit_wait_io_timeout(struct wait_bit_key *key, int bit);
+
+/**
+ * wait_on_bit - wait for a bit to be cleared
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @mode: the task state to sleep in
+ *
+ * There is a standard hashed waitqueue table for generic use. This
+ * is the part of the hashtable's accessor API that waits on a bit.
+ * For instance, if one were to have waiters on a bitflag, one would
+ * call wait_on_bit() in threads waiting for the bit to clear.
+ * One uses wait_on_bit() where one is waiting for the bit to clear,
+ * but has no intention of setting it.
+ * Returned value will be zero if the bit was cleared, or non-zero
+ * if the process received a signal and the mode permitted wakeup
+ * on that signal.
+ */
+static inline int
+wait_on_bit(unsigned long *word, int bit, unsigned mode)
+{
+ might_sleep();
+ if (!test_bit(bit, word))
+ return 0;
+ return out_of_line_wait_on_bit(word, bit,
+ bit_wait,
+ mode);
+}
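A hedged usage sketch pairing wait_on_bit() with the waker side; struct my_obj and MY_OBJ_BUSY are made-up names:

#include <linux/wait_bit.h>

#define MY_OBJ_BUSY	0

struct my_obj {
	unsigned long flags;
};

static void my_obj_wait_idle(struct my_obj *obj)
{
	/* Sleep until another context clears MY_OBJ_BUSY. */
	wait_on_bit(&obj->flags, MY_OBJ_BUSY, TASK_UNINTERRUPTIBLE);
}

static void my_obj_done(struct my_obj *obj)
{
	clear_bit(MY_OBJ_BUSY, &obj->flags);
	/* Full barrier between clearing the bit and checking for waiters. */
	smp_mb__after_atomic();
	wake_up_bit(&obj->flags, MY_OBJ_BUSY);
}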
+
+/**
+ * wait_on_bit_io - wait for a bit to be cleared
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @mode: the task state to sleep in
+ *
+ * Use the standard hashed waitqueue table to wait for a bit
+ * to be cleared. This is similar to wait_on_bit(), but calls
+ * io_schedule() instead of schedule() for the actual waiting.
+ *
+ * Returned value will be zero if the bit was cleared, or non-zero
+ * if the process received a signal and the mode permitted wakeup
+ * on that signal.
+ */
+static inline int
+wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
+{
+ might_sleep();
+ if (!test_bit(bit, word))
+ return 0;
+ return out_of_line_wait_on_bit(word, bit,
+ bit_wait_io,
+ mode);
+}
+
+/**
+ * wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @mode: the task state to sleep in
+ * @timeout: timeout, in jiffies
+ *
+ * Use the standard hashed waitqueue table to wait for a bit
+ * to be cleared. This is similar to wait_on_bit(), except also takes a
+ * timeout parameter.
+ *
+ * Returned value will be zero if the bit was cleared before the
+ * @timeout elapsed, or non-zero if the @timeout elapsed or process
+ * received a signal and the mode permitted wakeup on that signal.
+ */
+static inline int
+wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
+ unsigned long timeout)
+{
+ might_sleep();
+ if (!test_bit(bit, word))
+ return 0;
+ return out_of_line_wait_on_bit_timeout(word, bit,
+ bit_wait_timeout,
+ mode, timeout);
+}
+
+/**
+ * wait_on_bit_action - wait for a bit to be cleared
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @action: the function used to sleep, which may take special actions
+ * @mode: the task state to sleep in
+ *
+ * Use the standard hashed waitqueue table to wait for a bit
+ * to be cleared, and allow the waiting action to be specified.
+ * This is like wait_on_bit() but allows fine control of how the waiting
+ * is done.
+ *
+ * Returned value will be zero if the bit was cleared, or non-zero
+ * if the process received a signal and the mode permitted wakeup
+ * on that signal.
+ */
+static inline int
+wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
+ unsigned mode)
+{
+ might_sleep();
+ if (!test_bit(bit, word))
+ return 0;
+ return out_of_line_wait_on_bit(word, bit, action, mode);
+}
+
+/**
+ * wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @mode: the task state to sleep in
+ *
+ * There is a standard hashed waitqueue table for generic use. This
+ * is the part of the hashtable's accessor API that waits on a bit
+ * when one intends to set it, for instance, trying to lock bitflags.
+ * For instance, if one were to have waiters trying to set a bitflag
+ * and waiting for it to clear before setting it, one would call
+ * wait_on_bit_lock() in threads waiting to be able to set the bit.
+ * One uses wait_on_bit_lock() where one is waiting for the bit to
+ * clear with the intention of setting it, and when done, clearing it.
+ *
+ * Returns zero if the bit was (eventually) found to be clear and was
+ * set. Returns non-zero if a signal was delivered to the process and
+ * the @mode allows that signal to wake the process.
+ */
+static inline int
+wait_on_bit_lock(unsigned long *word, int bit, unsigned mode)
+{
+ might_sleep();
+ if (!test_and_set_bit(bit, word))
+ return 0;
+ return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode);
+}
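And, reusing the struct my_obj sketch above, how wait_on_bit_lock() can serve as a simple bit lock; MY_OBJ_LOCK is again illustrative:

#define MY_OBJ_LOCK	1

static void my_obj_lock(struct my_obj *obj)
{
	/* Returns once the bit was clear and has been atomically set by us. */
	wait_on_bit_lock(&obj->flags, MY_OBJ_LOCK, TASK_UNINTERRUPTIBLE);
}

static void my_obj_unlock(struct my_obj *obj)
{
	/* clear_bit_unlock() gives release semantics for the bit lock. */
	clear_bit_unlock(MY_OBJ_LOCK, &obj->flags);
	smp_mb__after_atomic();
	wake_up_bit(&obj->flags, MY_OBJ_LOCK);
}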
+
+/**
+ * wait_on_bit_lock_io - wait for a bit to be cleared, when wanting to set it
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @mode: the task state to sleep in
+ *
+ * Use the standard hashed waitqueue table to wait for a bit
+ * to be cleared and then to atomically set it. This is similar
+ * to wait_on_bit(), but calls io_schedule() instead of schedule()
+ * for the actual waiting.
+ *
+ * Returns zero if the bit was (eventually) found to be clear and was
+ * set. Returns non-zero if a signal was delivered to the process and
+ * the @mode allows that signal to wake the process.
+ */
+static inline int
+wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode)
+{
+ might_sleep();
+ if (!test_and_set_bit(bit, word))
+ return 0;
+ return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode);
+}
+
+/**
+ * wait_on_bit_lock_action - wait for a bit to be cleared, when wanting to set it
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @action: the function used to sleep, which may take special actions
+ * @mode: the task state to sleep in
+ *
+ * Use the standard hashed waitqueue table to wait for a bit
+ * to be cleared and then to set it, and allow the waiting action
+ * to be specified.
+ * This is like wait_on_bit() but allows fine control of how the waiting
+ * is done.
+ *
+ * Returns zero if the bit was (eventually) found to be clear and was
+ * set. Returns non-zero if a signal was delivered to the process and
+ * the @mode allows that signal to wake the process.
+ */
+static inline int
+wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
+ unsigned mode)
+{
+ might_sleep();
+ if (!test_and_set_bit(bit, word))
+ return 0;
+ return out_of_line_wait_on_bit_lock(word, bit, action, mode);
+}
+
+/**
+ * wait_on_atomic_t - Wait for an atomic_t to become 0
+ * @val: The atomic value being waited on, a kernel virtual address
+ * @action: the function used to sleep, which may take special actions
+ * @mode: the task state to sleep in
+ *
+ * Wait for an atomic_t to become 0. We abuse the bit-wait waitqueue table for
+ * the purpose of getting a waitqueue, but we set the key to a bit number
+ * outside of the target 'word'.
+ */
+static inline
+int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode)
+{
+ might_sleep();
+ if (atomic_read(val) == 0)
+ return 0;
+ return out_of_line_wait_on_atomic_t(val, action, mode);
+}
+
+#endif /* _LINUX_WAIT_BIT_H */
diff --git a/include/media/cec-notifier.h b/include/media/cec-notifier.h
index 413335c8cb52..298f996969df 100644
--- a/include/media/cec-notifier.h
+++ b/include/media/cec-notifier.h
@@ -106,6 +106,16 @@ static inline void cec_notifier_set_phys_addr_from_edid(struct cec_notifier *n,
{
}
+static inline void cec_notifier_register(struct cec_notifier *n,
+ struct cec_adapter *adap,
+ void (*callback)(struct cec_adapter *adap, u16 pa))
+{
+}
+
+static inline void cec_notifier_unregister(struct cec_notifier *n)
+{
+}
+
#endif
#endif
diff --git a/include/media/cec.h b/include/media/cec.h
index bfa88d4d67e1..201f060978da 100644
--- a/include/media/cec.h
+++ b/include/media/cec.h
@@ -206,7 +206,7 @@ static inline bool cec_is_sink(const struct cec_adapter *adap)
#define cec_phys_addr_exp(pa) \
((pa) >> 12), ((pa) >> 8) & 0xf, ((pa) >> 4) & 0xf, (pa) & 0xf
-#if IS_ENABLED(CONFIG_CEC_CORE)
+#if IS_REACHABLE(CONFIG_CEC_CORE)
struct cec_adapter *cec_allocate_adapter(const struct cec_adap_ops *ops,
void *priv, const char *name, u32 caps, u8 available_las);
int cec_register_adapter(struct cec_adapter *adap, struct device *parent);
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index fd60eccb59a6..75e612a45824 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -62,7 +62,7 @@ struct unix_sock {
#define UNIX_GC_CANDIDATE 0
#define UNIX_GC_MAYBE_CYCLE 1
struct socket_wq peer_wq;
- wait_queue_t peer_wake;
+ wait_queue_entry_t peer_wake;
};
static inline struct unix_sock *unix_sk(const struct sock *sk)
diff --git a/include/net/wext.h b/include/net/wext.h
index 345911965dbb..454ff763eeba 100644
--- a/include/net/wext.h
+++ b/include/net/wext.h
@@ -6,7 +6,7 @@
struct net;
#ifdef CONFIG_WEXT_CORE
-int wext_handle_ioctl(struct net *net, struct ifreq *ifr, unsigned int cmd,
+int wext_handle_ioctl(struct net *net, struct iwreq *iwr, unsigned int cmd,
void __user *arg);
int compat_wext_handle_ioctl(struct net *net, unsigned int cmd,
unsigned long arg);
@@ -14,7 +14,7 @@ int compat_wext_handle_ioctl(struct net *net, unsigned int cmd,
struct iw_statistics *get_wireless_stats(struct net_device *dev);
int call_commit_handler(struct net_device *dev);
#else
-static inline int wext_handle_ioctl(struct net *net, struct ifreq *ifr, unsigned int cmd,
+static inline int wext_handle_ioctl(struct net *net, struct iwreq *iwr, unsigned int cmd,
void __user *arg)
{
return -EINVAL;
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 7e7e2b0d2915..62f5a259e597 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1850,8 +1850,9 @@ static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb)
}
#endif
-#ifdef CONFIG_XFRM_OFFLOAD
void __net_init xfrm_dev_init(void);
+
+#ifdef CONFIG_XFRM_OFFLOAD
int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features);
int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
struct xfrm_user_offload *xuo);
@@ -1877,10 +1878,6 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x)
}
}
#else
-static inline void __net_init xfrm_dev_init(void)
-{
-}
-
static inline int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features)
{
return 0;
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index e3facb356838..91dc089d65b7 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -742,6 +742,7 @@ TRACE_EVENT(rcu_torture_read,
* "OnlineQ": _rcu_barrier() found online CPU with callbacks.
* "OnlineNQ": _rcu_barrier() found online CPU, no callbacks.
* "IRQ": An rcu_barrier_callback() callback posted on remote CPU.
+ * "IRQNQ": An rcu_barrier_callback() callback found no callbacks.
* "CB": An rcu_barrier_callback() invoked a callback, not the last.
* "LastCB": An rcu_barrier_callback() invoked the last callback.
* "Inc2": _rcu_barrier() piggyback check counter incremented.
diff --git a/include/uapi/linux/a.out.h b/include/uapi/linux/a.out.h
index 7caf44c7fa51..295cd3ef6330 100644
--- a/include/uapi/linux/a.out.h
+++ b/include/uapi/linux/a.out.h
@@ -112,24 +112,7 @@ enum machine_type {
#define N_TXTADDR(x) (N_MAGIC(x) == QMAGIC ? PAGE_SIZE : 0)
#endif
-/* Address of data segment in memory after it is loaded.
- Note that it is up to you to define SEGMENT_SIZE
- on machines not listed here. */
-#if defined(vax) || defined(hp300) || defined(pyr)
-#define SEGMENT_SIZE page_size
-#endif
-#ifdef sony
-#define SEGMENT_SIZE 0x2000
-#endif /* Sony. */
-#ifdef is68k
-#define SEGMENT_SIZE 0x20000
-#endif
-#if defined(m68k) && defined(PORTAR)
-#define PAGE_SIZE 0x400
-#define SEGMENT_SIZE PAGE_SIZE
-#endif
-
-#ifdef linux
+/* Address of data segment in memory after it is loaded. */
#ifndef __KERNEL__
#include <unistd.h>
#endif
@@ -142,7 +125,6 @@ enum machine_type {
#endif
#endif
#endif
-#endif
#define _N_SEGMENT_ROUND(x) ALIGN(x, SEGMENT_SIZE)
@@ -260,13 +242,7 @@ struct relocation_info
unsigned int r_extern:1;
/* Four bits that aren't used, but when writing an object file
it is desirable to clear them. */
-#ifdef NS32K
- unsigned r_bsr:1;
- unsigned r_disp:1;
- unsigned r_pad:2;
-#else
unsigned int r_pad:4;
-#endif
};
#endif /* no N_RELOCATION_INFO_DECLARED. */
diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h
index aa63451ef20a..1953f8d6063b 100644
--- a/include/uapi/linux/auto_fs.h
+++ b/include/uapi/linux/auto_fs.h
@@ -26,7 +26,7 @@
#define AUTOFS_MIN_PROTO_VERSION AUTOFS_PROTO_VERSION
/*
- * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed
+ * The wait_queue_entry_token (autofs_wqt_t) is part of a structure which is passed
* back to the kernel via ioctl from userspace. On architectures where 32- and
* 64-bit userspace binaries can be executed it's important that the size of
* autofs_wqt_t stays constant between 32- and 64-bit Linux kernels so that we
@@ -49,7 +49,7 @@ struct autofs_packet_hdr {
struct autofs_packet_missing {
struct autofs_packet_hdr hdr;
- autofs_wqt_t wait_queue_token;
+ autofs_wqt_t wait_queue_entry_token;
int len;
char name[NAME_MAX+1];
};
diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h
index 7c6da423d54e..65b72d0222e7 100644
--- a/include/uapi/linux/auto_fs4.h
+++ b/include/uapi/linux/auto_fs4.h
@@ -108,7 +108,7 @@ enum autofs_notify {
/* v4 multi expire (via pipe) */
struct autofs_packet_expire_multi {
struct autofs_packet_hdr hdr;
- autofs_wqt_t wait_queue_token;
+ autofs_wqt_t wait_queue_entry_token;
int len;
char name[NAME_MAX+1];
};
@@ -123,7 +123,7 @@ union autofs_packet_union {
/* autofs v5 common packet struct */
struct autofs_v5_packet {
struct autofs_packet_hdr hdr;
- autofs_wqt_t wait_queue_token;
+ autofs_wqt_t wait_queue_entry_token;
__u32 dev;
__u64 ino;
__u32 uid;
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index d179d7767f51..7d4a594d5d58 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1486,8 +1486,10 @@ enum ethtool_link_mode_bit_indices {
* it was forced up into this mode or autonegotiated.
*/
-/* The forced speed, in units of 1Mb. All values 0 to INT_MAX are legal. */
-/* Update drivers/net/phy/phy.c:phy_speed_to_str() when adding new values */
+/* The forced speed, in units of 1Mb. All values 0 to INT_MAX are legal.
+ * Update drivers/net/phy/phy.c:phy_speed_to_str() and
+ * drivers/net/bonding/bond_3ad.c:__get_link_speed() when adding new values.
+ */
#define SPEED_10 10
#define SPEED_100 100
#define SPEED_1000 1000
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 61b7d36dfe34..156ee4cab82e 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -343,6 +343,7 @@ enum ovs_key_attr {
#define OVS_KEY_ATTR_MAX (__OVS_KEY_ATTR_MAX - 1)
enum ovs_tunnel_key_attr {
+ /* OVS_TUNNEL_KEY_ATTR_NONE, standard nl API requires this attribute! */
OVS_TUNNEL_KEY_ATTR_ID, /* be64 Tunnel ID */
OVS_TUNNEL_KEY_ATTR_IPV4_SRC, /* be32 src IP address. */
OVS_TUNNEL_KEY_ATTR_IPV4_DST, /* be32 dst IP address. */
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 5f0fe019a720..e2a6c7b3510b 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -47,5 +47,6 @@
* For the sched_{set,get}attr() calls
*/
#define SCHED_FLAG_RESET_ON_FORK 0x01
+#define SCHED_FLAG_RECLAIM 0x02
#endif /* _UAPI_LINUX_SCHED_H */
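SCHED_FLAG_RECLAIM enables runtime reclaiming (GRUB) for SCHED_DEADLINE tasks. A hedged userspace sketch of passing it via sched_setattr(); the locally defined struct sched_attr mirrors the layout documented in sched-deadline.txt and is not part of this header:

#define _GNU_SOURCE
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/sched.h>

#ifndef SCHED_FLAG_RECLAIM
#define SCHED_FLAG_RECLAIM	0x02	/* assumes headers matching this patch */
#endif

struct sched_attr {			/* layout as documented in sched-deadline.txt */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

static int become_deadline_with_reclaim(void)
{
	struct sched_attr attr = {
		.size		= sizeof(attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_flags	= SCHED_FLAG_RECLAIM,
		.sched_runtime	= 10 * 1000 * 1000,	/* 10 ms */
		.sched_deadline	= 30 * 1000 * 1000,	/* 30 ms */
		.sched_period	= 30 * 1000 * 1000,	/* 30 ms */
	};

	return syscall(SYS_sched_setattr, 0, &attr, 0);
}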
diff --git a/init/Kconfig b/init/Kconfig
index 1d3475fc9496..ee0f03b69d11 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -472,354 +472,7 @@ config TASK_IO_ACCOUNTING
endmenu # "CPU/Task time and stats accounting"
-menu "RCU Subsystem"
-
-config TREE_RCU
- bool
- default y if !PREEMPT && SMP
- help
- This option selects the RCU implementation that is
- designed for very large SMP system with hundreds or
- thousands of CPUs. It also scales down nicely to
- smaller systems.
-
-config PREEMPT_RCU
- bool
- default y if PREEMPT
- help
- This option selects the RCU implementation that is
- designed for very large SMP systems with hundreds or
- thousands of CPUs, but for which real-time response
- is also required. It also scales down nicely to
- smaller systems.
-
- Select this option if you are unsure.
-
-config TINY_RCU
- bool
- default y if !PREEMPT && !SMP
- help
- This option selects the RCU implementation that is
- designed for UP systems from which real-time response
- is not required. This option greatly reduces the
- memory footprint of RCU.
-
-config RCU_EXPERT
- bool "Make expert-level adjustments to RCU configuration"
- default n
- help
- This option needs to be enabled if you wish to make
- expert-level adjustments to RCU configuration. By default,
- no such adjustments can be made, which has the often-beneficial
- side-effect of preventing "make oldconfig" from asking you all
- sorts of detailed questions about how you would like numerous
- obscure RCU options to be set up.
-
- Say Y if you need to make expert-level adjustments to RCU.
-
- Say N if you are unsure.
-
-config SRCU
- bool
- default y
- help
- This option selects the sleepable version of RCU. This version
- permits arbitrary sleeping or blocking within RCU read-side critical
- sections.
-
-config CLASSIC_SRCU
- bool "Use v4.11 classic SRCU implementation"
- default n
- depends on RCU_EXPERT && SRCU
- help
- This option selects the traditional well-tested classic SRCU
- implementation from v4.11, as might be desired for enterprise
- Linux distributions. Without this option, the shiny new
- Tiny SRCU and Tree SRCU implementations are used instead.
- At some point, it is hoped that Tiny SRCU and Tree SRCU
- will accumulate enough test time and confidence to allow
- Classic SRCU to be dropped entirely.
-
- Say Y if you need a rock-solid SRCU.
-
- Say N if you would like help test Tree SRCU.
-
-config TINY_SRCU
- bool
- default y if SRCU && TINY_RCU && !CLASSIC_SRCU
- help
- This option selects the single-CPU non-preemptible version of SRCU.
-
-config TREE_SRCU
- bool
- default y if SRCU && !TINY_RCU && !CLASSIC_SRCU
- help
- This option selects the full-fledged version of SRCU.
-
-config TASKS_RCU
- bool
- default n
- select SRCU
- help
- This option enables a task-based RCU implementation that uses
- only voluntary context switch (not preemption!), idle, and
- user-mode execution as quiescent states.
-
-config RCU_STALL_COMMON
- def_bool ( TREE_RCU || PREEMPT_RCU || RCU_TRACE )
- help
- This option enables RCU CPU stall code that is common between
- the TINY and TREE variants of RCU. The purpose is to allow
- the tiny variants to disable RCU CPU stall warnings, while
- making these warnings mandatory for the tree variants.
-
-config RCU_NEED_SEGCBLIST
- def_bool ( TREE_RCU || PREEMPT_RCU || TINY_SRCU || TREE_SRCU )
-
-config CONTEXT_TRACKING
- bool
-
-config CONTEXT_TRACKING_FORCE
- bool "Force context tracking"
- depends on CONTEXT_TRACKING
- default y if !NO_HZ_FULL
- help
- The major pre-requirement for full dynticks to work is to
- support the context tracking subsystem. But there are also
- other dependencies to provide in order to make the full
- dynticks working.
-
- This option stands for testing when an arch implements the
- context tracking backend but doesn't yet fullfill all the
- requirements to make the full dynticks feature working.
- Without the full dynticks, there is no way to test the support
- for context tracking and the subsystems that rely on it: RCU
- userspace extended quiescent state and tickless cputime
- accounting. This option copes with the absence of the full
- dynticks subsystem by forcing the context tracking on all
- CPUs in the system.
-
- Say Y only if you're working on the development of an
- architecture backend for the context tracking.
-
- Say N otherwise, this option brings an overhead that you
- don't want in production.
-
-
-config RCU_FANOUT
- int "Tree-based hierarchical RCU fanout value"
- range 2 64 if 64BIT
- range 2 32 if !64BIT
- depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
- default 64 if 64BIT
- default 32 if !64BIT
- help
- This option controls the fanout of hierarchical implementations
- of RCU, allowing RCU to work efficiently on machines with
- large numbers of CPUs. This value must be at least the fourth
- root of NR_CPUS, which allows NR_CPUS to be insanely large.
- The default value of RCU_FANOUT should be used for production
- systems, but if you are stress-testing the RCU implementation
- itself, small RCU_FANOUT values allow you to test large-system
- code paths on small(er) systems.
-
- Select a specific number if testing RCU itself.
- Take the default if unsure.
-
-config RCU_FANOUT_LEAF
- int "Tree-based hierarchical RCU leaf-level fanout value"
- range 2 64 if 64BIT
- range 2 32 if !64BIT
- depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
- default 16
- help
- This option controls the leaf-level fanout of hierarchical
- implementations of RCU, and allows trading off cache misses
- against lock contention. Systems that synchronize their
- scheduling-clock interrupts for energy-efficiency reasons will
- want the default because the smaller leaf-level fanout keeps
- lock contention levels acceptably low. Very large systems
- (hundreds or thousands of CPUs) will instead want to set this
- value to the maximum value possible in order to reduce the
- number of cache misses incurred during RCU's grace-period
- initialization. These systems tend to run CPU-bound, and thus
- are not helped by synchronized interrupts, and thus tend to
- skew them, which reduces lock contention enough that large
- leaf-level fanouts work well. That said, setting leaf-level
- fanout to a large number will likely cause problematic
- lock contention on the leaf-level rcu_node structures unless
- you boot with the skew_tick kernel parameter.
-
- Select a specific number if testing RCU itself.
-
- Select the maximum permissible value for large systems, but
- please understand that you may also need to set the skew_tick
- kernel boot parameter to avoid contention on the rcu_node
- structure's locks.
-
- Take the default if unsure.
-
-config RCU_FAST_NO_HZ
- bool "Accelerate last non-dyntick-idle CPU's grace periods"
- depends on NO_HZ_COMMON && SMP && RCU_EXPERT
- default n
- help
- This option permits CPUs to enter dynticks-idle state even if
- they have RCU callbacks queued, and prevents RCU from waking
- these CPUs up more than roughly once every four jiffies (by
- default, you can adjust this using the rcutree.rcu_idle_gp_delay
- parameter), thus improving energy efficiency. On the other
- hand, this option increases the duration of RCU grace periods,
- for example, slowing down synchronize_rcu().
-
- Say Y if energy efficiency is critically important, and you
- don't care about increased grace-period durations.
-
- Say N if you are unsure.
-
-config TREE_RCU_TRACE
- def_bool RCU_TRACE && ( TREE_RCU || PREEMPT_RCU )
- select DEBUG_FS
- help
- This option provides tracing for the TREE_RCU and
- PREEMPT_RCU implementations, permitting Makefile to
- trivially select kernel/rcutree_trace.c.
-
-config RCU_BOOST
- bool "Enable RCU priority boosting"
- depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
- default n
- help
- This option boosts the priority of preempted RCU readers that
- block the current preemptible RCU grace period for too long.
- This option also prevents heavy loads from blocking RCU
- callback invocation for all flavors of RCU.
-
- Say Y here if you are working with real-time apps or heavy loads
- Say N here if you are unsure.
-
-config RCU_KTHREAD_PRIO
- int "Real-time priority to use for RCU worker threads"
- range 1 99 if RCU_BOOST
- range 0 99 if !RCU_BOOST
- default 1 if RCU_BOOST
- default 0 if !RCU_BOOST
- depends on RCU_EXPERT
- help
- This option specifies the SCHED_FIFO priority value that will be
- assigned to the rcuc/n and rcub/n threads and is also the value
- used for RCU_BOOST (if enabled). If you are working with a
- real-time application that has one or more CPU-bound threads
- running at a real-time priority level, you should set
- RCU_KTHREAD_PRIO to a priority higher than the highest-priority
- real-time CPU-bound application thread. The default RCU_KTHREAD_PRIO
- value of 1 is appropriate in the common case, which is real-time
- applications that do not have any CPU-bound threads.
-
- Some real-time applications might not have a single real-time
- thread that saturates a given CPU, but instead might have
- multiple real-time threads that, taken together, fully utilize
- that CPU. In this case, you should set RCU_KTHREAD_PRIO to
- a priority higher than the lowest-priority thread that is
- conspiring to prevent the CPU from running any non-real-time
- tasks. For example, if one thread at priority 10 and another
- thread at priority 5 are between themselves fully consuming
- the CPU time on a given CPU, then RCU_KTHREAD_PRIO should be
- set to priority 6 or higher.
-
- Specify the real-time priority, or take the default if unsure.
-
-config RCU_BOOST_DELAY
- int "Milliseconds to delay boosting after RCU grace-period start"
- range 0 3000
- depends on RCU_BOOST
- default 500
- help
- This option specifies the time to wait after the beginning of
- a given grace period before priority-boosting preempted RCU
- readers blocking that grace period. Note that any RCU reader
- blocking an expedited RCU grace period is boosted immediately.
-
- Accept the default if unsure.
-
-config RCU_NOCB_CPU
- bool "Offload RCU callback processing from boot-selected CPUs"
- depends on TREE_RCU || PREEMPT_RCU
- depends on RCU_EXPERT || NO_HZ_FULL
- default n
- help
- Use this option to reduce OS jitter for aggressive HPC or
- real-time workloads. It can also be used to offload RCU
- callback invocation to energy-efficient CPUs in battery-powered
- asymmetric multiprocessors.
-
- This option offloads callback invocation from the set of
- CPUs specified at boot time by the rcu_nocbs parameter.
- For each such CPU, a kthread ("rcuox/N") will be created to
- invoke callbacks, where the "N" is the CPU being offloaded,
- and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and
- "s" for RCU-sched. Nothing prevents this kthread from running
- on the specified CPUs, but (1) the kthreads may be preempted
- between each callback, and (2) affinity or cgroups can be used
- to force the kthreads to run on whatever set of CPUs is desired.
-
- Say Y here if you want to help to debug reduced OS jitter.
- Say N here if you are unsure.
-
-choice
- prompt "Build-forced no-CBs CPUs"
- default RCU_NOCB_CPU_NONE
- depends on RCU_NOCB_CPU
- help
- This option allows no-CBs CPUs (whose RCU callbacks are invoked
- from kthreads rather than from softirq context) to be specified
- at build time. Additional no-CBs CPUs may be specified by
- the rcu_nocbs= boot parameter.
-
-config RCU_NOCB_CPU_NONE
- bool "No build_forced no-CBs CPUs"
- help
- This option does not force any of the CPUs to be no-CBs CPUs.
- Only CPUs designated by the rcu_nocbs= boot parameter will be
- no-CBs CPUs, whose RCU callbacks will be invoked by per-CPU
- kthreads whose names begin with "rcuo". All other CPUs will
- invoke their own RCU callbacks in softirq context.
-
- Select this option if you want to choose no-CBs CPUs at
- boot time, for example, to allow testing of different no-CBs
- configurations without having to rebuild the kernel each time.
-
-config RCU_NOCB_CPU_ZERO
- bool "CPU 0 is a build_forced no-CBs CPU"
- help
- This option forces CPU 0 to be a no-CBs CPU, so that its RCU
- callbacks are invoked by a per-CPU kthread whose name begins
- with "rcuo". Additional CPUs may be designated as no-CBs
- CPUs using the rcu_nocbs= boot parameter will be no-CBs CPUs.
- All other CPUs will invoke their own RCU callbacks in softirq
- context.
-
- Select this if CPU 0 needs to be a no-CBs CPU for real-time
- or energy-efficiency reasons, but the real reason it exists
- is to ensure that randconfig testing covers mixed systems.
-
-config RCU_NOCB_CPU_ALL
- bool "All CPUs are build_forced no-CBs CPUs"
- help
- This option forces all CPUs to be no-CBs CPUs. The rcu_nocbs=
- boot parameter will be ignored. All CPUs' RCU callbacks will
- be executed in the context of per-CPU rcuo kthreads created for
- this purpose. Assuming that the kthreads whose names start with
- "rcuo" are bound to "housekeeping" CPUs, this reduces OS jitter
- on the remaining CPUs, but might decrease memory locality during
- RCU-callback invocation, thus potentially degrading throughput.
-
- Select this if all CPUs need to be no-CBs CPUs for real-time
- or energy-efficiency reasons.
-
-endchoice
-
-endmenu # "RCU Subsystem"
+source "kernel/rcu/Kconfig"
config BUILD_BIN2C
bool
@@ -1156,6 +809,7 @@ config CGROUP_HUGETLB
config CPUSETS
bool "Cpuset controller"
+ depends on SMP
help
This option will let you create and manage CPUSETs which
allow dynamically partitioning a system into sets of CPUs and
diff --git a/init/main.c b/init/main.c
index f866510472d7..df58a416dd1d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -389,6 +389,7 @@ static __initdata DECLARE_COMPLETION(kthreadd_done);
static noinline void __ref rest_init(void)
{
+ struct task_struct *tsk;
int pid;
rcu_scheduler_starting();
@@ -397,12 +398,32 @@ static noinline void __ref rest_init(void)
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
- kernel_thread(kernel_init, NULL, CLONE_FS);
+ pid = kernel_thread(kernel_init, NULL, CLONE_FS);
+ /*
+ * Pin init on the boot CPU. Task migration does not work properly
+ * until sched_init_smp() has run; it will then set the allowed
+ * CPUs for init to the non-isolated CPUs.
+ */
+ rcu_read_lock();
+ tsk = find_task_by_pid_ns(pid, &init_pid_ns);
+ set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
+ rcu_read_unlock();
+
numa_default_policy();
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
rcu_read_lock();
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
rcu_read_unlock();
+
+ /*
+ * Enable might_sleep() and smp_processor_id() checks.
+ * They cannot be enabled earlier because with CONFIG_PREEMPT=y
+ * kernel_thread() would trigger might_sleep() splats. With
+ * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled
+ * already, but it's stuck on the kthreadd_done completion.
+ */
+ system_state = SYSTEM_SCHEDULING;
+
complete(&kthreadd_done);
/*
@@ -1015,10 +1036,6 @@ static noinline void __init kernel_init_freeable(void)
* init can allocate pages on any node
*/
set_mems_allowed(node_states[N_MEMORY]);
- /*
- * init can run on any cpu.
- */
- set_cpus_allowed_ptr(current, cpu_all_mask);
cad_pid = task_pid(current);
diff --git a/kernel/async.c b/kernel/async.c
index d2edd6efec56..2cbd3dd5940d 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -114,14 +114,14 @@ static void async_run_entry_fn(struct work_struct *work)
ktime_t uninitialized_var(calltime), delta, rettime;
/* 1) run (and print duration) */
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
pr_debug("calling %lli_%pF @ %i\n",
(long long)entry->cookie,
entry->func, task_pid_nr(current));
calltime = ktime_get();
}
entry->func(entry->data, entry->cookie);
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
@@ -284,14 +284,14 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
{
ktime_t uninitialized_var(starttime), delta, endtime;
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
pr_debug("async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
}
wait_event(async_done, lowest_in_progress(domain) >= cookie);
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
endtime = ktime_get();
delta = ktime_sub(endtime, starttime);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 339c8a1371de..a8a725697bed 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -989,6 +989,11 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d leaks addr into mem\n", insn->src_reg);
+ return -EACCES;
+ }
+
/* check whether atomic_add can read the memory */
err = check_mem_access(env, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, -1);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6c4e523dc1e2..bc63f8db1b0d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -925,11 +925,6 @@ static inline int is_cgroup_event(struct perf_event *event)
return 0;
}
-static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
-{
- return 0;
-}
-
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}
@@ -5729,9 +5724,6 @@ static void perf_output_read_one(struct perf_output_handle *handle,
__output_copy(handle, values, n * sizeof(u64));
}
-/*
- * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
- */
static void perf_output_read_group(struct perf_output_handle *handle,
struct perf_event *event,
u64 enabled, u64 running)
@@ -5776,6 +5768,13 @@ static void perf_output_read_group(struct perf_output_handle *handle,
#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
PERF_FORMAT_TOTAL_TIME_RUNNING)
+/*
+ * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
+ *
+ * The problem is that its both hard and excessively expensive to iterate the
+ * child list, not to mention that its impossible to IPI the children running
+ * on another CPU, from interrupt/NMI context.
+ */
static void perf_output_read(struct perf_output_handle *handle,
struct perf_event *event)
{
@@ -9193,7 +9192,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
static struct pmu *perf_init_event(struct perf_event *event)
{
- struct pmu *pmu = NULL;
+ struct pmu *pmu;
int idx;
int ret;
@@ -9462,9 +9461,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
local64_set(&hwc->period_left, hwc->sample_period);
/*
- * we currently do not support PERF_FORMAT_GROUP on inherited events
+ * We currently do not support PERF_SAMPLE_READ on inherited events.
+ * See perf_output_read().
*/
- if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
+ if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
goto err_ns;
if (!has_branch_stack(event))
@@ -9477,9 +9477,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
pmu = perf_init_event(event);
- if (!pmu)
- goto err_ns;
- else if (IS_ERR(pmu)) {
+ if (IS_ERR(pmu)) {
err = PTR_ERR(pmu);
goto err_ns;
}
@@ -9492,8 +9490,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
sizeof(unsigned long),
GFP_KERNEL);
- if (!event->addr_filters_offs)
+ if (!event->addr_filters_offs) {
+ err = -ENOMEM;
goto err_per_task;
+ }
/* force hw sync on the address filters */
event->addr_filters_gen = 1;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 2831480c63a2..ee97196bb151 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -580,7 +580,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
int ret = -ENOMEM, max_order = 0;
if (!has_aux(event))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
/*
diff --git a/kernel/exit.c b/kernel/exit.c
index 516acdb0e0ec..c63226283aef 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -318,19 +318,6 @@ void rcuwait_wake_up(struct rcuwait *w)
rcu_read_unlock();
}
-struct task_struct *try_get_task_struct(struct task_struct **ptask)
-{
- struct task_struct *task;
-
- rcu_read_lock();
- task = task_rcu_dereference(ptask);
- if (task)
- get_task_struct(task);
- rcu_read_unlock();
-
- return task;
-}
-
/*
* Determine if a process group is "orphaned", according to the POSIX
* definition in 2.2.2.52. Orphaned process groups are not to be affected
@@ -1004,7 +991,7 @@ struct wait_opts {
int __user *wo_stat;
struct rusage __user *wo_rusage;
- wait_queue_t child_wait;
+ wait_queue_entry_t child_wait;
int notask_error;
};
@@ -1541,7 +1528,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
return 0;
}
-static int child_wait_callback(wait_queue_t *wait, unsigned mode,
+static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
int sync, void *key)
{
struct wait_opts *wo = container_of(wait, struct wait_opts,
diff --git a/kernel/extable.c b/kernel/extable.c
index 2676d7f8baf6..0fbdd8582f08 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -75,7 +75,7 @@ int core_kernel_text(unsigned long addr)
addr < (unsigned long)_etext)
return 1;
- if (system_state == SYSTEM_BOOTING &&
+ if (system_state < SYSTEM_RUNNING &&
init_kernel_text(addr))
return 1;
return 0;
diff --git a/kernel/futex.c b/kernel/futex.c
index 357348a6cf6b..d6cf71d08f21 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -225,7 +225,7 @@ struct futex_pi_state {
* @requeue_pi_key: the requeue_pi target futex key
* @bitset: bitset for the optional bitmasked wakeup
*
- * We use this hashed waitqueue, instead of a normal wait_queue_t, so
+ * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
* we can wake only the relevant ones (hashed queues may be shared).
*
* A futex_q has a woken state, just like tasks have TASK_RUNNING.
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 070be980c37a..425170d4439b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1312,8 +1312,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
ret = __irq_set_trigger(desc,
new->flags & IRQF_TRIGGER_MASK);
- if (ret)
+ if (ret) {
+ irq_release_resources(desc);
goto out_mask;
+ }
}
desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index ae1a3ba24df5..154ffb489b93 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -38,6 +38,7 @@
#include <linux/syscore_ops.h>
#include <linux/compiler.h>
#include <linux/hugetlb.h>
+#include <linux/frame.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -874,7 +875,7 @@ int kexec_load_disabled;
* only when panic_cpu holds the current CPU number; this is the only CPU
* which processes crash_kexec routines.
*/
-void __crash_kexec(struct pt_regs *regs)
+void __noclone __crash_kexec(struct pt_regs *regs)
{
/* Take the kexec_mutex here to prevent sys_kexec_load
* running on one cpu from replacing the crash kernel
@@ -896,6 +897,7 @@ void __crash_kexec(struct pt_regs *regs)
mutex_unlock(&kexec_mutex);
}
}
+STACK_FRAME_NON_STANDARD(__crash_kexec);
void crash_kexec(struct pt_regs *regs)
{
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index f8269036bf0b..52c4e907c14b 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -59,7 +59,11 @@ static void notrace klp_ftrace_handler(unsigned long ip,
ops = container_of(fops, struct klp_ops, fops);
- rcu_read_lock();
+ /*
+ * A variant of synchronize_sched() is used to allow patching functions
+ * where RCU is not watching, see klp_synchronize_transition().
+ */
+ preempt_disable_notrace();
func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
stack_node);
@@ -115,7 +119,7 @@ static void notrace klp_ftrace_handler(unsigned long ip,
klp_arch_set_pc(regs, (unsigned long)func->new_func);
unlock:
- rcu_read_unlock();
+ preempt_enable_notrace();
}
/*
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index adc0cc64aa4b..b004a1fb6032 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -49,6 +49,28 @@ static void klp_transition_work_fn(struct work_struct *work)
static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn);
/*
+ * This function is just a stub to implement a hard force
+ * of synchronize_sched(), which requires synchronizing
+ * tasks even in userspace and in idle.
+ */
+static void klp_sync(struct work_struct *work)
+{
+}
+
+/*
+ * We also allow patching functions where RCU is not watching,
+ * e.g. before user_exit(). We cannot rely on the RCU infrastructure
+ * to do the synchronization. Instead, hard-force the sched synchronization.
+ *
+ * This approach allows RCU functions to be used for manipulating
+ * func_stack safely.
+ */
+static void klp_synchronize_transition(void)
+{
+ schedule_on_each_cpu(klp_sync);
+}
+
+/*
* The transition to the target patch state is complete. Clean up the data
* structures.
*/
@@ -73,7 +95,7 @@ static void klp_complete_transition(void)
* func->transition gets cleared, the handler may choose a
* removed function.
*/
- synchronize_rcu();
+ klp_synchronize_transition();
}
if (klp_transition_patch->immediate)
@@ -92,7 +114,7 @@ static void klp_complete_transition(void)
/* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */
if (klp_target_state == KLP_PATCHED)
- synchronize_rcu();
+ klp_synchronize_transition();
read_lock(&tasklist_lock);
for_each_process_thread(g, task) {
@@ -136,7 +158,11 @@ void klp_cancel_transition(void)
*/
void klp_update_patch_state(struct task_struct *task)
{
- rcu_read_lock();
+ /*
+ * A variant of synchronize_sched() is used to allow patching functions
+ * where RCU is not watching, see klp_synchronize_transition().
+ */
+ preempt_disable_notrace();
/*
* This test_and_clear_tsk_thread_flag() call also serves as a read
@@ -153,7 +179,7 @@ void klp_update_patch_state(struct task_struct *task)
if (test_and_clear_tsk_thread_flag(task, TIF_PATCH_PENDING))
task->patch_state = READ_ONCE(klp_target_state);
- rcu_read_unlock();
+ preempt_enable_notrace();
}
/*
@@ -539,7 +565,7 @@ void klp_reverse_transition(void)
clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING);
/* Let any remaining calls to klp_update_patch_state() complete */
- synchronize_rcu();
+ klp_synchronize_transition();
klp_start_transition();
}
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index c0e31bfee25c..7d2499bec5fe 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1157,18 +1157,18 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
if (debug_locks_silent)
return 0;
- printk("\n");
+ pr_warn("\n");
pr_warn("======================================================\n");
pr_warn("WARNING: possible circular locking dependency detected\n");
print_kernel_ident();
pr_warn("------------------------------------------------------\n");
- printk("%s/%d is trying to acquire lock:\n",
+ pr_warn("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(check_src);
- printk("\nbut task is already holding lock:\n");
+ pr_warn("\nbut task is already holding lock:\n");
print_lock(check_tgt);
- printk("\nwhich lock already depends on the new lock.\n\n");
- printk("\nthe existing dependency chain (in reverse order) is:\n");
+ pr_warn("\nwhich lock already depends on the new lock.\n\n");
+ pr_warn("\nthe existing dependency chain (in reverse order) is:\n");
print_circular_bug_entry(entry, depth);
@@ -1495,13 +1495,13 @@ print_bad_irq_dependency(struct task_struct *curr,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- printk("\n");
+ pr_warn("\n");
pr_warn("=====================================================\n");
pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n",
irqclass, irqclass);
print_kernel_ident();
pr_warn("-----------------------------------------------------\n");
- printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
+ pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
curr->comm, task_pid_nr(curr),
curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
@@ -1509,46 +1509,46 @@ print_bad_irq_dependency(struct task_struct *curr,
curr->softirqs_enabled);
print_lock(next);
- printk("\nand this task is already holding:\n");
+ pr_warn("\nand this task is already holding:\n");
print_lock(prev);
- printk("which would create a new lock dependency:\n");
+ pr_warn("which would create a new lock dependency:\n");
print_lock_name(hlock_class(prev));
- printk(KERN_CONT " ->");
+ pr_cont(" ->");
print_lock_name(hlock_class(next));
- printk(KERN_CONT "\n");
+ pr_cont("\n");
- printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
+ pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n",
irqclass);
print_lock_name(backwards_entry->class);
- printk("\n... which became %s-irq-safe at:\n", irqclass);
+ pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
- printk("\nto a %s-irq-unsafe lock:\n", irqclass);
+ pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
print_lock_name(forwards_entry->class);
- printk("\n... which became %s-irq-unsafe at:\n", irqclass);
- printk("...");
+ pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
+ pr_warn("...");
print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
- printk("\nother info that might help us debug this:\n\n");
+ pr_warn("\nother info that might help us debug this:\n\n");
print_irq_lock_scenario(backwards_entry, forwards_entry,
hlock_class(prev), hlock_class(next));
lockdep_print_held_locks(curr);
- printk("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
+ pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
if (!save_trace(&prev_root->trace))
return 0;
print_shortest_lock_dependencies(backwards_entry, prev_root);
- printk("\nthe dependencies between the lock to be acquired");
- printk(" and %s-irq-unsafe lock:\n", irqclass);
+ pr_warn("\nthe dependencies between the lock to be acquired");
+ pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
if (!save_trace(&next_root->trace))
return 0;
print_shortest_lock_dependencies(forwards_entry, next_root);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
return 0;
@@ -1724,22 +1724,22 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- printk("\n");
+ pr_warn("\n");
pr_warn("============================================\n");
pr_warn("WARNING: possible recursive locking detected\n");
print_kernel_ident();
pr_warn("--------------------------------------------\n");
- printk("%s/%d is trying to acquire lock:\n",
+ pr_warn("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(next);
- printk("\nbut task is already holding lock:\n");
+ pr_warn("\nbut task is already holding lock:\n");
print_lock(prev);
- printk("\nother info that might help us debug this:\n");
+ pr_warn("\nother info that might help us debug this:\n");
print_deadlock_scenario(next, prev);
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
return 0;
@@ -2074,21 +2074,21 @@ static void print_collision(struct task_struct *curr,
struct held_lock *hlock_next,
struct lock_chain *chain)
{
- printk("\n");
+ pr_warn("\n");
pr_warn("============================\n");
pr_warn("WARNING: chain_key collision\n");
print_kernel_ident();
pr_warn("----------------------------\n");
- printk("%s/%d: ", current->comm, task_pid_nr(current));
- printk("Hash chain already cached but the contents don't match!\n");
+ pr_warn("%s/%d: ", current->comm, task_pid_nr(current));
+ pr_warn("Hash chain already cached but the contents don't match!\n");
- printk("Held locks:");
+ pr_warn("Held locks:");
print_chain_keys_held_locks(curr, hlock_next);
- printk("Locks in cached chain:");
+ pr_warn("Locks in cached chain:");
print_chain_keys_chain(chain);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
}
#endif
@@ -2373,16 +2373,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- printk("\n");
+ pr_warn("\n");
pr_warn("================================\n");
pr_warn("WARNING: inconsistent lock state\n");
print_kernel_ident();
pr_warn("--------------------------------\n");
- printk("inconsistent {%s} -> {%s} usage.\n",
+ pr_warn("inconsistent {%s} -> {%s} usage.\n",
usage_str[prev_bit], usage_str[new_bit]);
- printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
+ pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
curr->comm, task_pid_nr(curr),
trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
@@ -2390,16 +2390,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
trace_softirqs_enabled(curr));
print_lock(this);
- printk("{%s} state was registered at:\n", usage_str[prev_bit]);
+ pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]);
print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
print_irqtrace_events(curr);
- printk("\nother info that might help us debug this:\n");
+ pr_warn("\nother info that might help us debug this:\n");
print_usage_bug_scenario(this);
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
return 0;
@@ -2438,28 +2438,28 @@ print_irq_inversion_bug(struct task_struct *curr,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- printk("\n");
+ pr_warn("\n");
pr_warn("========================================================\n");
pr_warn("WARNING: possible irq lock inversion dependency detected\n");
print_kernel_ident();
pr_warn("--------------------------------------------------------\n");
- printk("%s/%d just changed the state of lock:\n",
+ pr_warn("%s/%d just changed the state of lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(this);
if (forwards)
- printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
+ pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
else
- printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
+ pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
print_lock_name(other->class);
- printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
+ pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n");
- printk("\nother info that might help us debug this:\n");
+ pr_warn("\nother info that might help us debug this:\n");
/* Find a middle lock (if one exists) */
depth = get_lock_depth(other);
do {
if (depth == 0 && (entry != root)) {
- printk("lockdep:%s bad path found in chain graph\n", __func__);
+ pr_warn("lockdep:%s bad path found in chain graph\n", __func__);
break;
}
middle = entry;
@@ -2475,12 +2475,12 @@ print_irq_inversion_bug(struct task_struct *curr,
lockdep_print_held_locks(curr);
- printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
+ pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
if (!save_trace(&root->trace))
return 0;
print_shortest_lock_dependencies(other, root);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
return 0;
@@ -3189,25 +3189,25 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
if (debug_locks_silent)
return 0;
- printk("\n");
+ pr_warn("\n");
pr_warn("==================================\n");
pr_warn("WARNING: Nested lock was not taken\n");
print_kernel_ident();
pr_warn("----------------------------------\n");
- printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
+ pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
print_lock(hlock);
- printk("\nbut this task is not holding:\n");
- printk("%s\n", hlock->nest_lock->name);
+ pr_warn("\nbut this task is not holding:\n");
+ pr_warn("%s\n", hlock->nest_lock->name);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
- printk("\nother info that might help us debug this:\n");
+ pr_warn("\nother info that might help us debug this:\n");
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
return 0;
@@ -3402,21 +3402,21 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
if (debug_locks_silent)
return 0;
- printk("\n");
+ pr_warn("\n");
pr_warn("=====================================\n");
pr_warn("WARNING: bad unlock balance detected!\n");
print_kernel_ident();
pr_warn("-------------------------------------\n");
- printk("%s/%d is trying to release lock (",
+ pr_warn("%s/%d is trying to release lock (",
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
- printk(KERN_CONT ") at:\n");
+ pr_cont(") at:\n");
print_ip_sym(ip);
- printk("but there are no more locks to release!\n");
- printk("\nother info that might help us debug this:\n");
+ pr_warn("but there are no more locks to release!\n");
+ pr_warn("\nother info that might help us debug this:\n");
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
return 0;
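An illustrative sketch of the KERN_CONT-to-pr_cont() pattern used in the conversions above (not part of the patch; i, n and nr_devices are hypothetical locals): pr_warn() opens a KERN_WARNING line and pr_cont() appends to whatever line was printed last.

	int i, n = nr_devices;			/* nr_devices is hypothetical */

	pr_warn("scanning %d devices:", n);	/* opens a KERN_WARNING line */
	for (i = 0; i < n; i++)
		pr_cont(" dev%d", i);		/* appends to that same line */
	pr_cont("\n");				/* terminates it */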
@@ -3974,21 +3974,21 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
if (debug_locks_silent)
return 0;
- printk("\n");
+ pr_warn("\n");
pr_warn("=================================\n");
pr_warn("WARNING: bad contention detected!\n");
print_kernel_ident();
pr_warn("---------------------------------\n");
- printk("%s/%d is trying to contend lock (",
+ pr_warn("%s/%d is trying to contend lock (",
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
- printk(KERN_CONT ") at:\n");
+ pr_cont(") at:\n");
print_ip_sym(ip);
- printk("but there are no locks held!\n");
- printk("\nother info that might help us debug this:\n");
+ pr_warn("but there are no locks held!\n");
+ pr_warn("\nother info that might help us debug this:\n");
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
return 0;
@@ -4318,17 +4318,17 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
if (debug_locks_silent)
return;
- printk("\n");
+ pr_warn("\n");
pr_warn("=========================\n");
pr_warn("WARNING: held lock freed!\n");
print_kernel_ident();
pr_warn("-------------------------\n");
- printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
+ pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
print_lock(hlock);
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
}
@@ -4376,14 +4376,14 @@ static void print_held_locks_bug(void)
if (debug_locks_silent)
return;
- printk("\n");
+ pr_warn("\n");
pr_warn("====================================\n");
pr_warn("WARNING: %s/%d still has locks held!\n",
current->comm, task_pid_nr(current));
print_kernel_ident();
pr_warn("------------------------------------\n");
lockdep_print_held_locks(current);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
}
@@ -4402,10 +4402,10 @@ void debug_show_all_locks(void)
int unlock = 1;
if (unlikely(!debug_locks)) {
- printk("INFO: lockdep is turned off.\n");
+ pr_warn("INFO: lockdep is turned off.\n");
return;
}
- printk("\nShowing all locks held in the system:\n");
+ pr_warn("\nShowing all locks held in the system:\n");
/*
* Here we try to get the tasklist_lock as hard as possible,
@@ -4416,18 +4416,18 @@ void debug_show_all_locks(void)
retry:
if (!read_trylock(&tasklist_lock)) {
if (count == 10)
- printk("hm, tasklist_lock locked, retrying... ");
+ pr_warn("hm, tasklist_lock locked, retrying... ");
if (count) {
count--;
- printk(" #%d", 10-count);
+ pr_cont(" #%d", 10-count);
mdelay(200);
goto retry;
}
- printk(" ignoring it.\n");
+ pr_cont(" ignoring it.\n");
unlock = 0;
} else {
if (count != 10)
- printk(KERN_CONT " locked it.\n");
+ pr_cont(" locked it.\n");
}
do_each_thread(g, p) {
@@ -4445,7 +4445,7 @@ retry:
unlock = 1;
} while_each_thread(g, p);
- printk("\n");
+ pr_warn("\n");
pr_warn("=============================================\n\n");
if (unlock)
@@ -4475,12 +4475,12 @@ asmlinkage __visible void lockdep_sys_exit(void)
if (unlikely(curr->lockdep_depth)) {
if (!debug_locks_off())
return;
- printk("\n");
+ pr_warn("\n");
pr_warn("================================================\n");
pr_warn("WARNING: lock held when returning to user space!\n");
print_kernel_ident();
pr_warn("------------------------------------------------\n");
- printk("%s/%d is leaving the kernel with locks still held!\n",
+ pr_warn("%s/%d is leaving the kernel with locks still held!\n",
curr->comm, curr->pid);
lockdep_print_held_locks(curr);
}
@@ -4490,19 +4490,15 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
struct task_struct *curr = current;
-#ifndef CONFIG_PROVE_RCU_REPEATEDLY
- if (!debug_locks_off())
- return;
-#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
/* Note: the following can be executed concurrently, so be careful. */
- printk("\n");
+ pr_warn("\n");
pr_warn("=============================\n");
pr_warn("WARNING: suspicious RCU usage\n");
print_kernel_ident();
pr_warn("-----------------------------\n");
- printk("%s:%d %s!\n", file, line, s);
- printk("\nother info that might help us debug this:\n\n");
- printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
+ pr_warn("%s:%d %s!\n", file, line, s);
+ pr_warn("\nother info that might help us debug this:\n\n");
+ pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
!rcu_lockdep_current_cpu_online()
? "RCU used illegally from offline CPU!\n"
: !rcu_is_watching()
@@ -4529,10 +4525,10 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
* rcu_read_lock_bh() and so on from extended quiescent states.
*/
if (!rcu_is_watching())
- printk("RCU used illegally from extended quiescent state!\n");
+ pr_warn("RCU used illegally from extended quiescent state!\n");
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
}
EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
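For context on lockdep_rcu_suspicious(): with PROVE_RCU it is reached via RCU_LOCKDEP_WARN() when, for example, rcu_dereference() is used without a matching read-side critical section. An illustrative sketch of the kind of usage that trips it (gp and p are hypothetical, not from this patch):

	rcu_read_lock();
	p = rcu_dereference(gp);	/* fine: inside the critical section */
	rcu_read_unlock();

	p = rcu_dereference(gp);	/* with PROVE_RCU, rcu_dereference_check()
					 * sees no rcu_read_lock() held and reports
					 * it through lockdep_rcu_suspicious() */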
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 58e366ad36f4..ac35e648b0e5 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -166,12 +166,16 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
memset(waiter, 0x22, sizeof(*waiter));
}
-void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
+void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key)
{
/*
* Make sure we are not reinitializing a held lock:
*/
debug_check_no_locks_freed((void *)lock, sizeof(*lock));
lock->name = name;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
}
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index b585af9a1b50..5078c6ddf4a5 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -11,7 +11,7 @@
extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
-extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
+extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key);
extern void debug_rt_mutex_lock(struct rt_mutex *lock);
extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 28cd09e635ed..78069895032a 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1481,6 +1481,7 @@ void __sched rt_mutex_lock(struct rt_mutex *lock)
{
might_sleep();
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock);
@@ -1496,9 +1497,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock);
*/
int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
{
+ int ret;
+
might_sleep();
- return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+
+ return ret;
}
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
@@ -1526,11 +1534,18 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
int
rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
{
+ int ret;
+
might_sleep();
- return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
RT_MUTEX_MIN_CHAINWALK,
rt_mutex_slowlock);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+
+ return ret;
}
EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
@@ -1547,10 +1562,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
*/
int __sched rt_mutex_trylock(struct rt_mutex *lock)
{
+ int ret;
+
if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
return 0;
- return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
+ ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
+ if (ret)
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
}
EXPORT_SYMBOL_GPL(rt_mutex_trylock);
@@ -1561,6 +1582,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock);
*/
void __sched rt_mutex_unlock(struct rt_mutex *lock)
{
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
}
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
@@ -1620,7 +1642,6 @@ void rt_mutex_destroy(struct rt_mutex *lock)
lock->magic = NULL;
#endif
}
-
EXPORT_SYMBOL_GPL(rt_mutex_destroy);
/**
@@ -1632,14 +1653,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
*
* Initializing of a locked rt lock is not allowed
*/
-void __rt_mutex_init(struct rt_mutex *lock, const char *name)
+void __rt_mutex_init(struct rt_mutex *lock, const char *name,
+ struct lock_class_key *key)
{
lock->owner = NULL;
raw_spin_lock_init(&lock->wait_lock);
lock->waiters = RB_ROOT;
lock->waiters_leftmost = NULL;
- debug_rt_mutex_init(lock, name);
+ if (name && key)
+ debug_rt_mutex_init(lock, name, key);
}
EXPORT_SYMBOL_GPL(__rt_mutex_init);
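__rt_mutex_init() now takes a struct lock_class_key so that each static initialization site gets its own lockdep class. The header side of this change is not shown in this hunk, but a caller would be expected to look roughly like the following sketch (my_lock is hypothetical, and the macro shape is assumed):

	/* assumed init macro shape, capturing one key per callsite */
	#define rt_mutex_init(mutex)					\
	do {								\
		static struct lock_class_key __key;			\
		__rt_mutex_init(mutex, #mutex, &__key);			\
	} while (0)

	static struct rt_mutex my_lock;

	rt_mutex_init(&my_lock);	/* lockdep name "&my_lock", distinct class per site */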
@@ -1660,7 +1683,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init);
void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner)
{
- __rt_mutex_init(lock, NULL);
+ __rt_mutex_init(lock, NULL, NULL);
debug_rt_mutex_proxy_lock(lock, proxy_owner);
rt_mutex_set_owner(lock, proxy_owner);
}
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index 6607802efa8b..5c253caffe91 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -17,7 +17,7 @@
#define debug_rt_mutex_proxy_lock(l,p) do { } while (0)
#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
#define debug_rt_mutex_unlock(l) do { } while (0)
-#define debug_rt_mutex_init(m, n) do { } while (0)
+#define debug_rt_mutex_init(m, n, k) do { } while (0)
#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
#define debug_rt_mutex_print_deadlock(w) do { } while (0)
#define debug_rt_mutex_reset_waiter(w) do { } while (0)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a1db38abac5b..bd53ea579dc8 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1175,7 +1175,7 @@ static void boot_delay_msec(int level)
unsigned long long k;
unsigned long timeout;
- if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
+ if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING)
|| suppress_message_printing(level)) {
return;
}
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
new file mode 100644
index 000000000000..be90c945063f
--- /dev/null
+++ b/kernel/rcu/Kconfig
@@ -0,0 +1,242 @@
+#
+# RCU-related configuration options
+#
+
+menu "RCU Subsystem"
+
+config TREE_RCU
+ bool
+ default y if !PREEMPT && SMP
+ help
+ This option selects the RCU implementation that is
+ designed for very large SMP systems with hundreds or
+ thousands of CPUs. It also scales down nicely to
+ smaller systems.
+
+config PREEMPT_RCU
+ bool
+ default y if PREEMPT
+ help
+ This option selects the RCU implementation that is
+ designed for very large SMP systems with hundreds or
+ thousands of CPUs, but for which real-time response
+ is also required. It also scales down nicely to
+ smaller systems.
+
+ Select this option if you are unsure.
+
+config TINY_RCU
+ bool
+ default y if !PREEMPT && !SMP
+ help
+ This option selects the RCU implementation that is
+ designed for UP systems on which real-time response
+ is not required. This option greatly reduces the
+ memory footprint of RCU.
+
+config RCU_EXPERT
+ bool "Make expert-level adjustments to RCU configuration"
+ default n
+ help
+ This option needs to be enabled if you wish to make
+ expert-level adjustments to RCU configuration. By default,
+ no such adjustments can be made, which has the often-beneficial
+ side-effect of preventing "make oldconfig" from asking you all
+ sorts of detailed questions about how you would like numerous
+ obscure RCU options to be set up.
+
+ Say Y if you need to make expert-level adjustments to RCU.
+
+ Say N if you are unsure.
+
+config SRCU
+ bool
+ help
+ This option selects the sleepable version of RCU. This version
+ permits arbitrary sleeping or blocking within RCU read-side critical
+ sections.
+
+config TINY_SRCU
+ bool
+ default y if SRCU && TINY_RCU
+ help
+ This option selects the single-CPU non-preemptible version of SRCU.
+
+config TREE_SRCU
+ bool
+ default y if SRCU && !TINY_RCU
+ help
+ This option selects the full-fledged version of SRCU.
+
+config TASKS_RCU
+ bool
+ default n
+ select SRCU
+ help
+ This option enables a task-based RCU implementation that uses
+ only voluntary context switch (not preemption!), idle, and
+ user-mode execution as quiescent states.
+
+config RCU_STALL_COMMON
+ def_bool ( TREE_RCU || PREEMPT_RCU )
+ help
+ This option enables RCU CPU stall code that is common between
+ the TINY and TREE variants of RCU. The purpose is to allow
+ the tiny variants to disable RCU CPU stall warnings, while
+ making these warnings mandatory for the tree variants.
+
+config RCU_NEED_SEGCBLIST
+ def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU )
+
+config CONTEXT_TRACKING
+ bool
+
+config CONTEXT_TRACKING_FORCE
+ bool "Force context tracking"
+ depends on CONTEXT_TRACKING
+ default y if !NO_HZ_FULL
+ help
+ The major prerequisite for full dynticks to work is support
+ for the context tracking subsystem. But there are also
+ other dependencies to provide in order to make full
+ dynticks work.
+
+ This option is for testing when an arch implements the
+ context tracking backend but doesn't yet fulfill all the
+ requirements to make the full dynticks feature work.
+ Without the full dynticks, there is no way to test the support
+ for context tracking and the subsystems that rely on it: RCU
+ userspace extended quiescent state and tickless cputime
+ accounting. This option copes with the absence of the full
+ dynticks subsystem by forcing the context tracking on all
+ CPUs in the system.
+
+ Say Y only if you're working on the development of an
+ architecture backend for the context tracking.
+
+ Say N otherwise; this option brings overhead that you
+ don't want in production.
+
+
+config RCU_FANOUT
+ int "Tree-based hierarchical RCU fanout value"
+ range 2 64 if 64BIT
+ range 2 32 if !64BIT
+ depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
+ default 64 if 64BIT
+ default 32 if !64BIT
+ help
+ This option controls the fanout of hierarchical implementations
+ of RCU, allowing RCU to work efficiently on machines with
+ large numbers of CPUs. This value must be at least the fourth
+ root of NR_CPUS, which allows NR_CPUS to be insanely large.
+ The default value of RCU_FANOUT should be used for production
+ systems, but if you are stress-testing the RCU implementation
+ itself, small RCU_FANOUT values allow you to test large-system
+ code paths on small(er) systems.
+
+ Select a specific number if testing RCU itself.
+ Take the default if unsure.
+
+config RCU_FANOUT_LEAF
+ int "Tree-based hierarchical RCU leaf-level fanout value"
+ range 2 64 if 64BIT
+ range 2 32 if !64BIT
+ depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
+ default 16
+ help
+ This option controls the leaf-level fanout of hierarchical
+ implementations of RCU, and allows trading off cache misses
+ against lock contention. Systems that synchronize their
+ scheduling-clock interrupts for energy-efficiency reasons will
+ want the default because the smaller leaf-level fanout keeps
+ lock contention levels acceptably low. Very large systems
+ (hundreds or thousands of CPUs) will instead want to set this
+ value to the maximum value possible in order to reduce the
+ number of cache misses incurred during RCU's grace-period
+ initialization. These systems tend to run CPU-bound, and thus
+ are not helped by synchronized interrupts, and thus tend to
+ skew them, which reduces lock contention enough that large
+ leaf-level fanouts work well. That said, setting leaf-level
+ fanout to a large number will likely cause problematic
+ lock contention on the leaf-level rcu_node structures unless
+ you boot with the skew_tick kernel parameter.
+
+ Select a specific number if testing RCU itself.
+
+ Select the maximum permissible value for large systems, but
+ please understand that you may also need to set the skew_tick
+ kernel boot parameter to avoid contention on the rcu_node
+ structure's locks.
+
+ Take the default if unsure.
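A rough worked example of how the two fanout values combine (arithmetic only, not from the patch): leaf nodes fan out by RCU_FANOUT_LEAF and interior levels by RCU_FANOUT, so with the 64-bit defaults of 16 and 64:

	NR_CPUS <= 16                    one rcu_node, single level
	NR_CPUS <= 16 * 64   = 1024      two levels
	NR_CPUS <= 16 * 64^2 = 65536     three levels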
+
+config RCU_FAST_NO_HZ
+ bool "Accelerate last non-dyntick-idle CPU's grace periods"
+ depends on NO_HZ_COMMON && SMP && RCU_EXPERT
+ default n
+ help
+ This option permits CPUs to enter dynticks-idle state even if
+ they have RCU callbacks queued, and prevents RCU from waking
+ these CPUs up more than roughly once every four jiffies (by
+ default, you can adjust this using the rcutree.rcu_idle_gp_delay
+ parameter), thus improving energy efficiency. On the other
+ hand, this option increases the duration of RCU grace periods,
+ for example, slowing down synchronize_rcu().
+
+ Say Y if energy efficiency is critically important, and you
+ don't care about increased grace-period durations.
+
+ Say N if you are unsure.
+
+config RCU_BOOST
+ bool "Enable RCU priority boosting"
+ depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
+ default n
+ help
+ This option boosts the priority of preempted RCU readers that
+ block the current preemptible RCU grace period for too long.
+ This option also prevents heavy loads from blocking RCU
+ callback invocation for all flavors of RCU.
+
+ Say Y here if you are working with real-time apps or heavy loads.
+ Say N here if you are unsure.
+
+config RCU_BOOST_DELAY
+ int "Milliseconds to delay boosting after RCU grace-period start"
+ range 0 3000
+ depends on RCU_BOOST
+ default 500
+ help
+ This option specifies the time to wait after the beginning of
+ a given grace period before priority-boosting preempted RCU
+ readers blocking that grace period. Note that any RCU reader
+ blocking an expedited RCU grace period is boosted immediately.
+
+ Accept the default if unsure.
+
+config RCU_NOCB_CPU
+ bool "Offload RCU callback processing from boot-selected CPUs"
+ depends on TREE_RCU || PREEMPT_RCU
+ depends on RCU_EXPERT || NO_HZ_FULL
+ default n
+ help
+ Use this option to reduce OS jitter for aggressive HPC or
+ real-time workloads. It can also be used to offload RCU
+ callback invocation to energy-efficient CPUs in battery-powered
+ asymmetric multiprocessors.
+
+ This option offloads callback invocation from the set of
+ CPUs specified at boot time by the rcu_nocbs parameter.
+ For each such CPU, a kthread ("rcuox/N") will be created to
+ invoke callbacks, where the "N" is the CPU being offloaded,
+ and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and
+ "s" for RCU-sched. Nothing prevents this kthread from running
+ on the specified CPUs, but (1) the kthreads may be preempted
+ between each callback, and (2) affinity or cgroups can be used
+ to force the kthreads to run on whatever set of CPUs is desired.
+
+ Say Y here if you want to help to debug reduced OS jitter.
+ Say N here if you are unsure.
+
+endmenu # "RCU Subsystem"
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
new file mode 100644
index 000000000000..0ec7d1d33a14
--- /dev/null
+++ b/kernel/rcu/Kconfig.debug
@@ -0,0 +1,82 @@
+#
+# RCU-related debugging configuration options
+#
+
+menu "RCU Debugging"
+
+config PROVE_RCU
+ def_bool PROVE_LOCKING
+
+config TORTURE_TEST
+ tristate
+ default n
+
+config RCU_PERF_TEST
+ tristate "performance tests for RCU"
+ depends on DEBUG_KERNEL
+ select TORTURE_TEST
+ select SRCU
+ select TASKS_RCU
+ default n
+ help
+ This option provides a kernel module that runs performance
+ tests on the RCU infrastructure. The kernel module may be built
+ after the fact on the running kernel to be tested, if desired.
+
+ Say Y here if you want RCU performance tests to be built into
+ the kernel.
+ Say M if you want the RCU performance tests to build as a module.
+ Say N if you are unsure.
+
+config RCU_TORTURE_TEST
+ tristate "torture tests for RCU"
+ depends on DEBUG_KERNEL
+ select TORTURE_TEST
+ select SRCU
+ select TASKS_RCU
+ default n
+ help
+ This option provides a kernel module that runs torture tests
+ on the RCU infrastructure. The kernel module may be built
+ after the fact on the running kernel to be tested, if desired.
+
+ Say Y here if you want RCU torture tests to be built into
+ the kernel.
+ Say M if you want the RCU torture tests to build as a module.
+ Say N if you are unsure.
+
+config RCU_CPU_STALL_TIMEOUT
+ int "RCU CPU stall timeout in seconds"
+ depends on RCU_STALL_COMMON
+ range 3 300
+ default 21
+ help
+ If a given RCU grace period extends more than the specified
+ number of seconds, a CPU stall warning is printed. If the
+ RCU grace period persists, additional CPU stall warnings are
+ printed at more widely spaced intervals.
+
+config RCU_TRACE
+ bool "Enable tracing for RCU"
+ depends on DEBUG_KERNEL
+ default y if TREE_RCU
+ select TRACE_CLOCK
+ help
+ This option enables additional tracepoints for ftrace-style
+ event tracing.
+
+ Say Y here if you want to enable RCU tracing.
+ Say N if you are unsure.
+
+config RCU_EQS_DEBUG
+ bool "Provide debugging asserts for adding NO_HZ support to an arch"
+ depends on DEBUG_KERNEL
+ help
+ This option provides consistency checks in RCU's handling of
+ NO_HZ. These checks have proven quite helpful in detecting
+ bugs in arch-specific NO_HZ code.
+
+ Say N here if you need ultimate kernel/user switch latencies
+ Say Y if you are unsure
+
+endmenu # "RCU Debugging"
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 23803c7d5180..13c0fc852767 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -3,13 +3,11 @@
KCOV_INSTRUMENT := n
obj-y += update.o sync.o
-obj-$(CONFIG_CLASSIC_SRCU) += srcu.o
obj-$(CONFIG_TREE_SRCU) += srcutree.o
obj-$(CONFIG_TINY_SRCU) += srcutiny.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
obj-$(CONFIG_TREE_RCU) += tree.o
obj-$(CONFIG_PREEMPT_RCU) += tree.o
-obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
obj-$(CONFIG_TINY_RCU) += tiny.o
obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 73e16ec4054b..808b8c85f626 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -212,6 +212,18 @@ int rcu_jiffies_till_stall_check(void);
*/
#define TPS(x) tracepoint_string(x)
+/*
+ * Dump the ftrace buffer, but only one time per callsite per boot.
+ */
+#define rcu_ftrace_dump(oops_dump_mode) \
+do { \
+ static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \
+ \
+ if (!atomic_read(&___rfd_beenhere) && \
+ !atomic_xchg(&___rfd_beenhere, 1)) \
+ ftrace_dump(oops_dump_mode); \
+} while (0)
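A usage note on the helper above (mine, not from the patch): the argument is an ftrace dump mode, so an RCU error path wanting a one-shot dump would do roughly:

	rcu_ftrace_dump(DUMP_ALL);	/* dumps the ftrace buffer at most once per callsite per boot */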
+
void rcu_early_boot_tests(void);
void rcu_test_sync_prims(void);
@@ -291,6 +303,271 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
cpu <= rnp->grphi; \
cpu = cpumask_next((cpu), cpu_possible_mask))
+/*
+ * Wrappers for the rcu_node::lock acquire and release.
+ *
+ * Because the rcu_nodes form a tree, the tree traversal locking will observe
+ * different lock values; this in turn means that an UNLOCK of one level
+ * followed by a LOCK of another level does not imply a full memory barrier,
+ * and most importantly transitivity is lost.
+ *
+ * In order to restore full ordering between tree levels, augment the regular
+ * lock acquire functions with smp_mb__after_unlock_lock().
+ *
+ * Because ->lock of struct rcu_node is a __private field, one should use
+ * these wrappers rather than directly calling raw_spin_{lock,unlock}* on ->lock.
+ */
+#define raw_spin_lock_rcu_node(p) \
+do { \
+ raw_spin_lock(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock))
+
+#define raw_spin_lock_irq_rcu_node(p) \
+do { \
+ raw_spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define raw_spin_unlock_irq_rcu_node(p) \
+ raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
+
+#define raw_spin_lock_irqsave_rcu_node(p, flags) \
+do { \
+ raw_spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define raw_spin_unlock_irqrestore_rcu_node(p, flags) \
+ raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
+
+#define raw_spin_trylock_rcu_node(p) \
+({ \
+ bool ___locked = raw_spin_trylock(&ACCESS_PRIVATE(p, lock)); \
+ \
+ if (___locked) \
+ smp_mb__after_unlock_lock(); \
+ ___locked; \
+})
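A short usage sketch for the wrappers above (rnp and flags are hypothetical locals); the point is that the acquire side adds smp_mb__after_unlock_lock() while the release side is a plain unlock:

	struct rcu_node *rnp;	/* some node in the combining tree */
	unsigned long flags;

	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	/* ... updates to rnp here are fully ordered against the prior lock holder ... */
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);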
+
#endif /* #if defined(SRCU) || !defined(TINY_RCU) */
+#ifdef CONFIG_TINY_RCU
+/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
+static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */
+{
+ return true;
+}
+static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */
+{
+ return false;
+}
+
+static inline void rcu_expedite_gp(void)
+{
+}
+
+static inline void rcu_unexpedite_gp(void)
+{
+}
+#else /* #ifdef CONFIG_TINY_RCU */
+bool rcu_gp_is_normal(void); /* Internal RCU use. */
+bool rcu_gp_is_expedited(void); /* Internal RCU use. */
+void rcu_expedite_gp(void);
+void rcu_unexpedite_gp(void);
+void rcupdate_announce_bootup_oddness(void);
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
+#define RCU_SCHEDULER_INACTIVE 0
+#define RCU_SCHEDULER_INIT 1
+#define RCU_SCHEDULER_RUNNING 2
+
+#ifdef CONFIG_TINY_RCU
+static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
+#else /* #ifdef CONFIG_TINY_RCU */
+void rcu_request_urgent_qs_task(struct task_struct *t);
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
+enum rcutorture_type {
+ RCU_FLAVOR,
+ RCU_BH_FLAVOR,
+ RCU_SCHED_FLAVOR,
+ RCU_TASKS_FLAVOR,
+ SRCU_FLAVOR,
+ INVALID_RCU_FLAVOR
+};
+
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
+void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
+ unsigned long *gpnum, unsigned long *completed);
+void rcutorture_record_test_transition(void);
+void rcutorture_record_progress(unsigned long vernum);
+void do_trace_rcu_torture_read(const char *rcutorturename,
+ struct rcu_head *rhp,
+ unsigned long secs,
+ unsigned long c_old,
+ unsigned long c);
+#else
+static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
+ int *flags,
+ unsigned long *gpnum,
+ unsigned long *completed)
+{
+ *flags = 0;
+ *gpnum = 0;
+ *completed = 0;
+}
+static inline void rcutorture_record_test_transition(void)
+{
+}
+static inline void rcutorture_record_progress(unsigned long vernum)
+{
+}
+#ifdef CONFIG_RCU_TRACE
+void do_trace_rcu_torture_read(const char *rcutorturename,
+ struct rcu_head *rhp,
+ unsigned long secs,
+ unsigned long c_old,
+ unsigned long c);
+#else
+#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
+ do { } while (0)
+#endif
+#endif
+
+#ifdef CONFIG_TINY_SRCU
+
+static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
+ struct srcu_struct *sp, int *flags,
+ unsigned long *gpnum,
+ unsigned long *completed)
+{
+ if (test_type != SRCU_FLAVOR)
+ return;
+ *flags = 0;
+ *completed = sp->srcu_idx;
+ *gpnum = *completed;
+}
+
+#elif defined(CONFIG_TREE_SRCU)
+
+void srcutorture_get_gp_data(enum rcutorture_type test_type,
+ struct srcu_struct *sp, int *flags,
+ unsigned long *gpnum, unsigned long *completed);
+
+#endif
+
+#ifdef CONFIG_TINY_RCU
+
+/*
+ * Return the number of grace periods started.
+ */
+static inline unsigned long rcu_batches_started(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of bottom-half grace periods started.
+ */
+static inline unsigned long rcu_batches_started_bh(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of sched grace periods started.
+ */
+static inline unsigned long rcu_batches_started_sched(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of grace periods completed.
+ */
+static inline unsigned long rcu_batches_completed(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of bottom-half grace periods completed.
+ */
+static inline unsigned long rcu_batches_completed_bh(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of sched grace periods completed.
+ */
+static inline unsigned long rcu_batches_completed_sched(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of expedited grace periods completed.
+ */
+static inline unsigned long rcu_exp_batches_completed(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of expedited sched grace periods completed.
+ */
+static inline unsigned long rcu_exp_batches_completed_sched(void)
+{
+ return 0;
+}
+
+static inline unsigned long srcu_batches_completed(struct srcu_struct *sp)
+{
+ return 0;
+}
+
+static inline void rcu_force_quiescent_state(void)
+{
+}
+
+static inline void rcu_bh_force_quiescent_state(void)
+{
+}
+
+static inline void rcu_sched_force_quiescent_state(void)
+{
+}
+
+static inline void show_rcu_gp_kthreads(void)
+{
+}
+
+#else /* #ifdef CONFIG_TINY_RCU */
+extern unsigned long rcutorture_testseq;
+extern unsigned long rcutorture_vernum;
+unsigned long rcu_batches_started(void);
+unsigned long rcu_batches_started_bh(void);
+unsigned long rcu_batches_started_sched(void);
+unsigned long rcu_batches_completed(void);
+unsigned long rcu_batches_completed_bh(void);
+unsigned long rcu_batches_completed_sched(void);
+unsigned long rcu_exp_batches_completed(void);
+unsigned long rcu_exp_batches_completed_sched(void);
+unsigned long srcu_batches_completed(struct srcu_struct *sp);
+void show_rcu_gp_kthreads(void);
+void rcu_force_quiescent_state(void);
+void rcu_bh_force_quiescent_state(void);
+void rcu_sched_force_quiescent_state(void);
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
+#ifdef CONFIG_RCU_NOCB_CPU
+bool rcu_is_nocb_cpu(int cpu);
+#else
+static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
+#endif
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index a4a86fb47e4a..3cc18110b612 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -48,6 +48,8 @@
#include <linux/torture.h>
#include <linux/vmalloc.h>
+#include "rcu.h"
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
@@ -59,12 +61,16 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
#define VERBOSE_PERFOUT_ERRSTRING(s) \
do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
+torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
+torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
-torture_param(int, nreaders, -1, "Number of RCU reader threads");
+torture_param(int, nreaders, 0, "Number of RCU reader threads");
torture_param(int, nwriters, -1, "Number of RCU updater threads");
-torture_param(bool, shutdown, false, "Shutdown at end of performance tests.");
+torture_param(bool, shutdown, !IS_ENABLED(MODULE),
+ "Shutdown at end of performance tests.");
torture_param(bool, verbose, true, "Enable verbose debugging printk()s");
+torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
static char *perf_type = "rcu";
module_param(perf_type, charp, 0444);
@@ -86,13 +92,16 @@ static u64 t_rcu_perf_writer_started;
static u64 t_rcu_perf_writer_finished;
static unsigned long b_rcu_perf_writer_started;
static unsigned long b_rcu_perf_writer_finished;
+static DEFINE_PER_CPU(atomic_t, n_async_inflight);
static int rcu_perf_writer_state;
#define RTWS_INIT 0
-#define RTWS_EXP_SYNC 1
-#define RTWS_SYNC 2
-#define RTWS_IDLE 2
-#define RTWS_STOPPING 3
+#define RTWS_ASYNC 1
+#define RTWS_BARRIER 2
+#define RTWS_EXP_SYNC 3
+#define RTWS_SYNC 4
+#define RTWS_IDLE 5
+#define RTWS_STOPPING 6
#define MAX_MEAS 10000
#define MIN_MEAS 100
@@ -114,6 +123,8 @@ struct rcu_perf_ops {
unsigned long (*started)(void);
unsigned long (*completed)(void);
unsigned long (*exp_completed)(void);
+ void (*async)(struct rcu_head *head, rcu_callback_t func);
+ void (*gp_barrier)(void);
void (*sync)(void);
void (*exp_sync)(void);
const char *name;
@@ -153,6 +164,8 @@ static struct rcu_perf_ops rcu_ops = {
.started = rcu_batches_started,
.completed = rcu_batches_completed,
.exp_completed = rcu_exp_batches_completed,
+ .async = call_rcu,
+ .gp_barrier = rcu_barrier,
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
.name = "rcu"
@@ -181,6 +194,8 @@ static struct rcu_perf_ops rcu_bh_ops = {
.started = rcu_batches_started_bh,
.completed = rcu_batches_completed_bh,
.exp_completed = rcu_exp_batches_completed_sched,
+ .async = call_rcu_bh,
+ .gp_barrier = rcu_barrier_bh,
.sync = synchronize_rcu_bh,
.exp_sync = synchronize_rcu_bh_expedited,
.name = "rcu_bh"
@@ -208,6 +223,16 @@ static unsigned long srcu_perf_completed(void)
return srcu_batches_completed(srcu_ctlp);
}
+static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+ call_srcu(srcu_ctlp, head, func);
+}
+
+static void srcu_rcu_barrier(void)
+{
+ srcu_barrier(srcu_ctlp);
+}
+
static void srcu_perf_synchronize(void)
{
synchronize_srcu(srcu_ctlp);
@@ -226,11 +251,42 @@ static struct rcu_perf_ops srcu_ops = {
.started = NULL,
.completed = srcu_perf_completed,
.exp_completed = srcu_perf_completed,
+ .async = srcu_call_rcu,
+ .gp_barrier = srcu_rcu_barrier,
.sync = srcu_perf_synchronize,
.exp_sync = srcu_perf_synchronize_expedited,
.name = "srcu"
};
+static struct srcu_struct srcud;
+
+static void srcu_sync_perf_init(void)
+{
+ srcu_ctlp = &srcud;
+ init_srcu_struct(srcu_ctlp);
+}
+
+static void srcu_sync_perf_cleanup(void)
+{
+ cleanup_srcu_struct(srcu_ctlp);
+}
+
+static struct rcu_perf_ops srcud_ops = {
+ .ptype = SRCU_FLAVOR,
+ .init = srcu_sync_perf_init,
+ .cleanup = srcu_sync_perf_cleanup,
+ .readlock = srcu_perf_read_lock,
+ .readunlock = srcu_perf_read_unlock,
+ .started = NULL,
+ .completed = srcu_perf_completed,
+ .exp_completed = srcu_perf_completed,
+ .async = srcu_call_rcu,
+ .gp_barrier = srcu_rcu_barrier,
+ .sync = srcu_perf_synchronize,
+ .exp_sync = srcu_perf_synchronize_expedited,
+ .name = "srcud"
+};
+
/*
* Definitions for sched perf testing.
*/
@@ -254,6 +310,8 @@ static struct rcu_perf_ops sched_ops = {
.started = rcu_batches_started_sched,
.completed = rcu_batches_completed_sched,
.exp_completed = rcu_exp_batches_completed_sched,
+ .async = call_rcu_sched,
+ .gp_barrier = rcu_barrier_sched,
.sync = synchronize_sched,
.exp_sync = synchronize_sched_expedited,
.name = "sched"
@@ -281,6 +339,8 @@ static struct rcu_perf_ops tasks_ops = {
.readunlock = tasks_perf_read_unlock,
.started = rcu_no_completed,
.completed = rcu_no_completed,
+ .async = call_rcu_tasks,
+ .gp_barrier = rcu_barrier_tasks,
.sync = synchronize_rcu_tasks,
.exp_sync = synchronize_rcu_tasks,
.name = "tasks"
@@ -344,6 +404,15 @@ rcu_perf_reader(void *arg)
}
/*
+ * Callback function for asynchronous grace periods from rcu_perf_writer().
+ */
+static void rcu_perf_async_cb(struct rcu_head *rhp)
+{
+ atomic_dec(this_cpu_ptr(&n_async_inflight));
+ kfree(rhp);
+}
+
+/*
* RCU perf writer kthread. Repeatedly does a grace period.
*/
static int
@@ -352,6 +421,7 @@ rcu_perf_writer(void *arg)
int i = 0;
int i_max;
long me = (long)arg;
+ struct rcu_head *rhp = NULL;
struct sched_param sp;
bool started = false, done = false, alldone = false;
u64 t;
@@ -380,9 +450,27 @@ rcu_perf_writer(void *arg)
}
do {
+ if (writer_holdoff)
+ udelay(writer_holdoff);
wdp = &wdpp[i];
*wdp = ktime_get_mono_fast_ns();
- if (gp_exp) {
+ if (gp_async) {
+retry:
+ if (!rhp)
+ rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+ if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) {
+ rcu_perf_writer_state = RTWS_ASYNC;
+ atomic_inc(this_cpu_ptr(&n_async_inflight));
+ cur_ops->async(rhp, rcu_perf_async_cb);
+ rhp = NULL;
+ } else if (!kthread_should_stop()) {
+ rcu_perf_writer_state = RTWS_BARRIER;
+ cur_ops->gp_barrier();
+ goto retry;
+ } else {
+ kfree(rhp); /* Because we are stopping. */
+ }
+ } else if (gp_exp) {
rcu_perf_writer_state = RTWS_EXP_SYNC;
cur_ops->exp_sync();
} else {
@@ -429,6 +517,10 @@ rcu_perf_writer(void *arg)
i++;
rcu_perf_wait_shutdown();
} while (!torture_must_stop());
+ if (gp_async) {
+ rcu_perf_writer_state = RTWS_BARRIER;
+ cur_ops->gp_barrier();
+ }
rcu_perf_writer_state = RTWS_STOPPING;
writer_n_durations[me] = i_max;
torture_kthread_stopping("rcu_perf_writer");
@@ -452,6 +544,17 @@ rcu_perf_cleanup(void)
u64 *wdp;
u64 *wdpp;
+ /*
+ * Would like warning at start, but everything is expedited
+ * during the mid-boot phase, so have to wait till the end.
+ */
+ if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp)
+ VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
+ if (rcu_gp_is_normal() && gp_exp)
+ VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
+ if (gp_exp && gp_async)
+ VERBOSE_PERFOUT_ERRSTRING("No expedited async GPs, so went with async!");
+
if (torture_cleanup_begin())
return;
@@ -554,7 +657,7 @@ rcu_perf_init(void)
long i;
int firsterr = 0;
static struct rcu_perf_ops *perf_ops[] = {
- &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops,
+ &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops,
RCUPERF_TASKS_OPS
};
@@ -624,16 +727,6 @@ rcu_perf_init(void)
firsterr = -ENOMEM;
goto unwind;
}
- if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) {
- VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
- firsterr = -EINVAL;
- goto unwind;
- }
- if (rcu_gp_is_normal() && gp_exp) {
- VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
- firsterr = -EINVAL;
- goto unwind;
- }
for (i = 0; i < nrealwriters; i++) {
writer_durations[i] =
kcalloc(MAX_MEAS, sizeof(*writer_durations[i]),
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index ae6e574d4cf5..b8f7f8ce8575 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -52,6 +52,8 @@
#include <linux/torture.h>
#include <linux/vmalloc.h>
+#include "rcu.h"
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
@@ -562,31 +564,19 @@ static void srcu_torture_stats(void)
int __maybe_unused cpu;
int idx;
-#if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU)
#ifdef CONFIG_TREE_SRCU
idx = srcu_ctlp->srcu_idx & 0x1;
-#else /* #ifdef CONFIG_TREE_SRCU */
- idx = srcu_ctlp->completed & 0x1;
-#endif /* #else #ifdef CONFIG_TREE_SRCU */
pr_alert("%s%s Tree SRCU per-CPU(idx=%d):",
torture_type, TORTURE_FLAG, idx);
for_each_possible_cpu(cpu) {
unsigned long l0, l1;
unsigned long u0, u1;
long c0, c1;
-#ifdef CONFIG_TREE_SRCU
struct srcu_data *counts;
counts = per_cpu_ptr(srcu_ctlp->sda, cpu);
u0 = counts->srcu_unlock_count[!idx];
u1 = counts->srcu_unlock_count[idx];
-#else /* #ifdef CONFIG_TREE_SRCU */
- struct srcu_array *counts;
-
- counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
- u0 = counts->unlock_count[!idx];
- u1 = counts->unlock_count[idx];
-#endif /* #else #ifdef CONFIG_TREE_SRCU */
/*
* Make sure that a lock is always counted if the corresponding
@@ -594,13 +584,8 @@ static void srcu_torture_stats(void)
*/
smp_rmb();
-#ifdef CONFIG_TREE_SRCU
l0 = counts->srcu_lock_count[!idx];
l1 = counts->srcu_lock_count[idx];
-#else /* #ifdef CONFIG_TREE_SRCU */
- l0 = counts->lock_count[!idx];
- l1 = counts->lock_count[idx];
-#endif /* #else #ifdef CONFIG_TREE_SRCU */
c0 = l0 - u0;
c1 = l1 - u1;
@@ -609,7 +594,7 @@ static void srcu_torture_stats(void)
pr_cont("\n");
#elif defined(CONFIG_TINY_SRCU)
idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1;
- pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n",
+ pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
torture_type, TORTURE_FLAG, idx,
READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]),
READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx]));
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
deleted file mode 100644
index dea03614263f..000000000000
--- a/kernel/rcu/srcu.c
+++ /dev/null
@@ -1,661 +0,0 @@
-/*
- * Sleepable Read-Copy Update mechanism for mutual exclusion.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- * Copyright (C) IBM Corporation, 2006
- * Copyright (C) Fujitsu, 2012
- *
- * Author: Paul McKenney <paulmck@us.ibm.com>
- * Lai Jiangshan <laijs@cn.fujitsu.com>
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- * Documentation/RCU/ *.txt
- *
- */
-
-#include <linux/export.h>
-#include <linux/mutex.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/rcupdate_wait.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
-#include <linux/delay.h>
-#include <linux/srcu.h>
-
-#include "rcu.h"
-
-/*
- * Initialize an rcu_batch structure to empty.
- */
-static inline void rcu_batch_init(struct rcu_batch *b)
-{
- b->head = NULL;
- b->tail = &b->head;
-}
-
-/*
- * Enqueue a callback onto the tail of the specified rcu_batch structure.
- */
-static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head)
-{
- *b->tail = head;
- b->tail = &head->next;
-}
-
-/*
- * Is the specified rcu_batch structure empty?
- */
-static inline bool rcu_batch_empty(struct rcu_batch *b)
-{
- return b->tail == &b->head;
-}
-
-/*
- * Remove the callback at the head of the specified rcu_batch structure
- * and return a pointer to it, or return NULL if the structure is empty.
- */
-static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b)
-{
- struct rcu_head *head;
-
- if (rcu_batch_empty(b))
- return NULL;
-
- head = b->head;
- b->head = head->next;
- if (b->tail == &head->next)
- rcu_batch_init(b);
-
- return head;
-}
-
-/*
- * Move all callbacks from the rcu_batch structure specified by "from" to
- * the structure specified by "to".
- */
-static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
-{
- if (!rcu_batch_empty(from)) {
- *to->tail = from->head;
- to->tail = from->tail;
- rcu_batch_init(from);
- }
-}
-
-static int init_srcu_struct_fields(struct srcu_struct *sp)
-{
- sp->completed = 0;
- spin_lock_init(&sp->queue_lock);
- sp->running = false;
- rcu_batch_init(&sp->batch_queue);
- rcu_batch_init(&sp->batch_check0);
- rcu_batch_init(&sp->batch_check1);
- rcu_batch_init(&sp->batch_done);
- INIT_DELAYED_WORK(&sp->work, process_srcu);
- sp->per_cpu_ref = alloc_percpu(struct srcu_array);
- return sp->per_cpu_ref ? 0 : -ENOMEM;
-}
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
-int __init_srcu_struct(struct srcu_struct *sp, const char *name,
- struct lock_class_key *key)
-{
- /* Don't re-initialize a lock while it is held. */
- debug_check_no_locks_freed((void *)sp, sizeof(*sp));
- lockdep_init_map(&sp->dep_map, name, key, 0);
- return init_srcu_struct_fields(sp);
-}
-EXPORT_SYMBOL_GPL(__init_srcu_struct);
-
-#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-/**
- * init_srcu_struct - initialize a sleep-RCU structure
- * @sp: structure to initialize.
- *
- * Must invoke this on a given srcu_struct before passing that srcu_struct
- * to any other function. Each srcu_struct represents a separate domain
- * of SRCU protection.
- */
-int init_srcu_struct(struct srcu_struct *sp)
-{
- return init_srcu_struct_fields(sp);
-}
-EXPORT_SYMBOL_GPL(init_srcu_struct);
-
-#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-/*
- * Returns approximate total of the readers' ->lock_count[] values for the
- * rank of per-CPU counters specified by idx.
- */
-static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
-{
- int cpu;
- unsigned long sum = 0;
-
- for_each_possible_cpu(cpu) {
- struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
-
- sum += READ_ONCE(cpuc->lock_count[idx]);
- }
- return sum;
-}
-
-/*
- * Returns approximate total of the readers' ->unlock_count[] values for the
- * rank of per-CPU counters specified by idx.
- */
-static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
-{
- int cpu;
- unsigned long sum = 0;
-
- for_each_possible_cpu(cpu) {
- struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
-
- sum += READ_ONCE(cpuc->unlock_count[idx]);
- }
- return sum;
-}
-
-/*
- * Return true if the number of pre-existing readers is determined to
- * be zero.
- */
-static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
-{
- unsigned long unlocks;
-
- unlocks = srcu_readers_unlock_idx(sp, idx);
-
- /*
- * Make sure that a lock is always counted if the corresponding unlock
- * is counted. Needs to be a smp_mb() as the read side may contain a
- * read from a variable that is written to before the synchronize_srcu()
- * in the write side. In this case smp_mb()s A and B act like the store
- * buffering pattern.
- *
- * This smp_mb() also pairs with smp_mb() C to prevent accesses after the
- * synchronize_srcu() from being executed before the grace period ends.
- */
- smp_mb(); /* A */
-
- /*
- * If the locks are the same as the unlocks, then there must have
- * been no readers on this index at some time in between. This does not
- * mean that there are no more readers, as one could have read the
- * current index but not have incremented the lock counter yet.
- *
- * Possible bug: There is no guarantee that there haven't been ULONG_MAX
- * increments of ->lock_count[] since the unlocks were counted, meaning
- * that this could return true even if there are still active readers.
- * Since there are no memory barriers around srcu_flip(), the CPU is not
- * required to increment ->completed before running
- * srcu_readers_unlock_idx(), which means that there could be an
- * arbitrarily large number of critical sections that execute after
- * srcu_readers_unlock_idx() but use the old value of ->completed.
- */
- return srcu_readers_lock_idx(sp, idx) == unlocks;
-}
-
-/**
- * srcu_readers_active - returns true if there are readers. and false
- * otherwise
- * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
- *
- * Note that this is not an atomic primitive, and can therefore suffer
- * severe errors when invoked on an active srcu_struct. That said, it
- * can be useful as an error check at cleanup time.
- */
-static bool srcu_readers_active(struct srcu_struct *sp)
-{
- int cpu;
- unsigned long sum = 0;
-
- for_each_possible_cpu(cpu) {
- struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
-
- sum += READ_ONCE(cpuc->lock_count[0]);
- sum += READ_ONCE(cpuc->lock_count[1]);
- sum -= READ_ONCE(cpuc->unlock_count[0]);
- sum -= READ_ONCE(cpuc->unlock_count[1]);
- }
- return sum;
-}
-
-/**
- * cleanup_srcu_struct - deconstruct a sleep-RCU structure
- * @sp: structure to clean up.
- *
- * Must invoke this only after you are finished using a given srcu_struct
- * that was initialized via init_srcu_struct(). This code does some
- * probabalistic checking, spotting late uses of srcu_read_lock(),
- * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu().
- * If any such late uses are detected, the per-CPU memory associated with
- * the srcu_struct is simply leaked and WARN_ON() is invoked. If the
- * caller frees the srcu_struct itself, a use-after-free crash will likely
- * ensue, but at least there will be a warning printed.
- */
-void cleanup_srcu_struct(struct srcu_struct *sp)
-{
- if (WARN_ON(srcu_readers_active(sp)))
- return; /* Leakage unless caller handles error. */
- free_percpu(sp->per_cpu_ref);
- sp->per_cpu_ref = NULL;
-}
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
-
-/*
- * Counts the new reader in the appropriate per-CPU element of the
- * srcu_struct.
- * Returns an index that must be passed to the matching srcu_read_unlock().
- */
-int __srcu_read_lock(struct srcu_struct *sp)
-{
- int idx;
-
- idx = READ_ONCE(sp->completed) & 0x1;
- this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
- smp_mb(); /* B */ /* Avoid leaking the critical section. */
- return idx;
-}
-EXPORT_SYMBOL_GPL(__srcu_read_lock);
-
-/*
- * Removes the count for the old reader from the appropriate per-CPU
- * element of the srcu_struct. Note that this may well be a different
- * CPU than that which was incremented by the corresponding srcu_read_lock().
- */
-void __srcu_read_unlock(struct srcu_struct *sp, int idx)
-{
- smp_mb(); /* C */ /* Avoid leaking the critical section. */
- this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]);
-}
-EXPORT_SYMBOL_GPL(__srcu_read_unlock);
-
-/*
- * We use an adaptive strategy for synchronize_srcu() and especially for
- * synchronize_srcu_expedited(). We spin for a fixed time period
- * (defined below) to allow SRCU readers to exit their read-side critical
- * sections. If there are still some readers after 10 microseconds,
- * we repeatedly block for 1-millisecond time periods. This approach
- * has done well in testing, so there is no need for a config parameter.
- */
-#define SRCU_RETRY_CHECK_DELAY 5
-#define SYNCHRONIZE_SRCU_TRYCOUNT 2
-#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12
-
-/*
- * @@@ Wait until all pre-existing readers complete. Such readers
- * will have used the index specified by "idx".
- * the caller should ensures the ->completed is not changed while checking
- * and idx = (->completed & 1) ^ 1
- */
-static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
-{
- for (;;) {
- if (srcu_readers_active_idx_check(sp, idx))
- return true;
- if (--trycount <= 0)
- return false;
- udelay(SRCU_RETRY_CHECK_DELAY);
- }
-}
-
-/*
- * Increment the ->completed counter so that future SRCU readers will
- * use the other rank of the ->(un)lock_count[] arrays. This allows
- * us to wait for pre-existing readers in a starvation-free manner.
- */
-static void srcu_flip(struct srcu_struct *sp)
-{
- WRITE_ONCE(sp->completed, sp->completed + 1);
-
- /*
- * Ensure that if the updater misses an __srcu_read_unlock()
- * increment, that task's next __srcu_read_lock() will see the
- * above counter update. Note that both this memory barrier
- * and the one in srcu_readers_active_idx_check() provide the
- * guarantee for __srcu_read_lock().
- */
- smp_mb(); /* D */ /* Pairs with C. */
-}
-
-/*
- * Enqueue an SRCU callback on the specified srcu_struct structure,
- * initiating grace-period processing if it is not already running.
- *
- * Note that all CPUs must agree that the grace period extended beyond
- * all pre-existing SRCU read-side critical section. On systems with
- * more than one CPU, this means that when "func()" is invoked, each CPU
- * is guaranteed to have executed a full memory barrier since the end of
- * its last corresponding SRCU read-side critical section whose beginning
- * preceded the call to call_rcu(). It also means that each CPU executing
- * an SRCU read-side critical section that continues beyond the start of
- * "func()" must have executed a memory barrier after the call_rcu()
- * but before the beginning of that SRCU read-side critical section.
- * Note that these guarantees include CPUs that are offline, idle, or
- * executing in user mode, as well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
- * resulting SRCU callback function "func()", then both CPU A and CPU
- * B are guaranteed to execute a full memory barrier during the time
- * interval between the call to call_rcu() and the invocation of "func()".
- * This guarantee applies even if CPU A and CPU B are the same CPU (but
- * again only if the system has more than one CPU).
- *
- * Of course, these guarantees apply only for invocations of call_srcu(),
- * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
- * srcu_struct structure.
- */
-void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
- rcu_callback_t func)
-{
- unsigned long flags;
-
- head->next = NULL;
- head->func = func;
- spin_lock_irqsave(&sp->queue_lock, flags);
- smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */
- rcu_batch_queue(&sp->batch_queue, head);
- if (!sp->running) {
- sp->running = true;
- queue_delayed_work(system_power_efficient_wq, &sp->work, 0);
- }
- spin_unlock_irqrestore(&sp->queue_lock, flags);
-}
-EXPORT_SYMBOL_GPL(call_srcu);
-
-static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
-static void srcu_reschedule(struct srcu_struct *sp);
-
-/*
- * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
- */
-static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
-{
- struct rcu_synchronize rcu;
- struct rcu_head *head = &rcu.head;
- bool done = false;
-
- RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
- lock_is_held(&rcu_bh_lock_map) ||
- lock_is_held(&rcu_lock_map) ||
- lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
-
- might_sleep();
- init_completion(&rcu.completion);
-
- head->next = NULL;
- head->func = wakeme_after_rcu;
- spin_lock_irq(&sp->queue_lock);
- smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */
- if (!sp->running) {
- /* steal the processing owner */
- sp->running = true;
- rcu_batch_queue(&sp->batch_check0, head);
- spin_unlock_irq(&sp->queue_lock);
-
- srcu_advance_batches(sp, trycount);
- if (!rcu_batch_empty(&sp->batch_done)) {
- BUG_ON(sp->batch_done.head != head);
- rcu_batch_dequeue(&sp->batch_done);
- done = true;
- }
- /* give the processing owner to work_struct */
- srcu_reschedule(sp);
- } else {
- rcu_batch_queue(&sp->batch_queue, head);
- spin_unlock_irq(&sp->queue_lock);
- }
-
- if (!done) {
- wait_for_completion(&rcu.completion);
- smp_mb(); /* Caller's later accesses after GP. */
- }
-
-}
-
-/**
- * synchronize_srcu - wait for prior SRCU read-side critical-section completion
- * @sp: srcu_struct with which to synchronize.
- *
- * Wait for the counts of both indexes to drain to zero. To avoid
- * possible starvation of synchronize_srcu(), it first waits for the
- * count of index = ((->completed & 1) ^ 1) to drain to zero,
- * and then flips ->completed and waits for the count of the other index.
- *
- * Can block; must be called from process context.
- *
- * Note that it is illegal to call synchronize_srcu() from the corresponding
- * SRCU read-side critical section; doing so will result in deadlock.
- * However, it is perfectly legal to call synchronize_srcu() on one
- * srcu_struct from some other srcu_struct's read-side critical section,
- * as long as the resulting graph of srcu_structs is acyclic.
- *
- * There are memory-ordering constraints implied by synchronize_srcu().
- * On systems with more than one CPU, when synchronize_srcu() returns,
- * each CPU is guaranteed to have executed a full memory barrier since
- * the end of its last corresponding SRCU read-side critical section
- * whose beginning preceded the call to synchronize_srcu(). In addition,
- * each CPU having an SRCU read-side critical section that extends beyond
- * the return from synchronize_srcu() is guaranteed to have executed a
- * full memory barrier after the beginning of synchronize_srcu() and before
- * the beginning of that SRCU read-side critical section. Note that these
- * guarantees include CPUs that are offline, idle, or executing in user mode,
- * as well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked synchronize_srcu(), which returned
- * to its caller on CPU B, then both CPU A and CPU B are guaranteed
- * to have executed a full memory barrier during the execution of
- * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
- * are the same CPU, but again only if the system has more than one CPU.
- *
- * Of course, these memory-ordering guarantees apply only when
- * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
- * passed the same srcu_struct structure.
- */
-void synchronize_srcu(struct srcu_struct *sp)
-{
- __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal())
- ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
- : SYNCHRONIZE_SRCU_TRYCOUNT);
-}
-EXPORT_SYMBOL_GPL(synchronize_srcu);
-
-/**
- * synchronize_srcu_expedited - Brute-force SRCU grace period
- * @sp: srcu_struct with which to synchronize.
- *
- * Wait for an SRCU grace period to elapse, but be more aggressive about
- * spinning rather than blocking when waiting.
- *
- * Note that synchronize_srcu_expedited() has the same deadlock and
- * memory-ordering properties as does synchronize_srcu().
- */
-void synchronize_srcu_expedited(struct srcu_struct *sp)
-{
- __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT);
-}
-EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
-
-/**
- * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
- * @sp: srcu_struct on which to wait for in-flight callbacks.
- */
-void srcu_barrier(struct srcu_struct *sp)
-{
- synchronize_srcu(sp);
-}
-EXPORT_SYMBOL_GPL(srcu_barrier);
-
-/**
- * srcu_batches_completed - return batches completed.
- * @sp: srcu_struct on which to report batch completion.
- *
- * Report the number of batches, correlated with, but not necessarily
- * precisely the same as, the number of grace periods that have elapsed.
- */
-unsigned long srcu_batches_completed(struct srcu_struct *sp)
-{
- return sp->completed;
-}
-EXPORT_SYMBOL_GPL(srcu_batches_completed);
-
-#define SRCU_CALLBACK_BATCH 10
-#define SRCU_INTERVAL 1
-
-/*
- * Move any new SRCU callbacks to the first stage of the SRCU grace
- * period pipeline.
- */
-static void srcu_collect_new(struct srcu_struct *sp)
-{
- if (!rcu_batch_empty(&sp->batch_queue)) {
- spin_lock_irq(&sp->queue_lock);
- rcu_batch_move(&sp->batch_check0, &sp->batch_queue);
- spin_unlock_irq(&sp->queue_lock);
- }
-}
-
-/*
- * Core SRCU state machine. Advance callbacks from ->batch_check0 to
- * ->batch_check1 and then to ->batch_done as readers drain.
- */
-static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
-{
- int idx = 1 ^ (sp->completed & 1);
-
- /*
- * Because readers might be delayed for an extended period after
- * fetching ->completed for their index, at any point in time there
- * might well be readers using both idx=0 and idx=1. We therefore
- * need to wait for readers to clear from both index values before
- * invoking a callback.
- */
-
- if (rcu_batch_empty(&sp->batch_check0) &&
- rcu_batch_empty(&sp->batch_check1))
- return; /* no callbacks need to be advanced */
-
- if (!try_check_zero(sp, idx, trycount))
- return; /* failed to advance, will try after SRCU_INTERVAL */
-
- /*
- * The callbacks in ->batch_check1 already had their first zero check
- * and counter flip performed back when they were enqueued on
- * ->batch_check0 in a previous invocation of srcu_advance_batches().
- * (Presumably try_check_zero() returned false during that
- * invocation, leaving the callbacks stranded on ->batch_check1.)
- * They are therefore ready to invoke, so move them to ->batch_done.
- */
- rcu_batch_move(&sp->batch_done, &sp->batch_check1);
-
- if (rcu_batch_empty(&sp->batch_check0))
- return; /* no callbacks need to be advanced */
- srcu_flip(sp);
-
- /*
- * The callbacks in ->batch_check0 have just finished their
- * first zero check and the flip, so move them to ->batch_check1
- * for future checking on the other idx.
- */
- rcu_batch_move(&sp->batch_check1, &sp->batch_check0);
-
- /*
- * SRCU read-side critical sections are normally short, so check
- * at least twice in quick succession after a flip.
- */
- trycount = trycount < 2 ? 2 : trycount;
- if (!try_check_zero(sp, idx^1, trycount))
- return; /* failed to advance, will try after SRCU_INTERVAL */
-
- /*
- * The callbacks in ->batch_check1 have now waited for all
- * pre-existing readers using both idx values. They are therefore
- * ready to invoke, so move them to ->batch_done.
- */
- rcu_batch_move(&sp->batch_done, &sp->batch_check1);
-}
-
-/*
- * Invoke a limited number of SRCU callbacks that have passed through
- * their grace period. If there are more to do, SRCU will reschedule
- * the workqueue. Note that needed memory barriers have been executed
- * in this task's context by srcu_readers_active_idx_check().
- */
-static void srcu_invoke_callbacks(struct srcu_struct *sp)
-{
- int i;
- struct rcu_head *head;
-
- for (i = 0; i < SRCU_CALLBACK_BATCH; i++) {
- head = rcu_batch_dequeue(&sp->batch_done);
- if (!head)
- break;
- local_bh_disable();
- head->func(head);
- local_bh_enable();
- }
-}
-
-/*
- * Finished one round of SRCU grace period. Start another if there are
- * more SRCU callbacks queued, otherwise put SRCU into not-running state.
- */
-static void srcu_reschedule(struct srcu_struct *sp)
-{
- bool pending = true;
-
- if (rcu_batch_empty(&sp->batch_done) &&
- rcu_batch_empty(&sp->batch_check1) &&
- rcu_batch_empty(&sp->batch_check0) &&
- rcu_batch_empty(&sp->batch_queue)) {
- spin_lock_irq(&sp->queue_lock);
- if (rcu_batch_empty(&sp->batch_done) &&
- rcu_batch_empty(&sp->batch_check1) &&
- rcu_batch_empty(&sp->batch_check0) &&
- rcu_batch_empty(&sp->batch_queue)) {
- sp->running = false;
- pending = false;
- }
- spin_unlock_irq(&sp->queue_lock);
- }
-
- if (pending)
- queue_delayed_work(system_power_efficient_wq,
- &sp->work, SRCU_INTERVAL);
-}
-
-/*
- * This is the work-queue function that handles SRCU grace periods.
- */
-void process_srcu(struct work_struct *work)
-{
- struct srcu_struct *sp;
-
- sp = container_of(work, struct srcu_struct, work.work);
-
- srcu_collect_new(sp);
- srcu_advance_batches(sp, 1);
- srcu_invoke_callbacks(sp);
- srcu_reschedule(sp);
-}
-EXPORT_SYMBOL_GPL(process_srcu);
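
For reference, the flip-and-wait scheme that the classic implementation deleted
above is built around can be sketched in ordinary C. This is an illustrative
user-space sketch only, not kernel code: the toy_* names are invented, the
per-CPU counters are collapsed into plain atomics, and the memory barriers
(A through D above) are omitted.

#include <stdatomic.h>

/* Two counter banks; the low bit of ->completed selects the active bank. */
struct toy_srcu {
	atomic_ulong lock_count[2];
	atomic_ulong unlock_count[2];
	unsigned long completed;
};

static int toy_read_lock(struct toy_srcu *sp)
{
	int idx = sp->completed & 1;

	atomic_fetch_add(&sp->lock_count[idx], 1);
	return idx;			/* reader passes idx to unlock */
}

static void toy_read_unlock(struct toy_srcu *sp, int idx)
{
	atomic_fetch_add(&sp->unlock_count[idx], 1);
}

/* Grace period: drain the inactive bank, flip, then drain the other bank. */
static void toy_synchronize(struct toy_srcu *sp)
{
	int idx = (sp->completed & 1) ^ 1;

	while (atomic_load(&sp->lock_count[idx]) !=
	       atomic_load(&sp->unlock_count[idx]))
		;			/* real code sleeps instead of spinning */
	sp->completed++;		/* srcu_flip() analogue */
	idx ^= 1;
	while (atomic_load(&sp->lock_count[idx]) !=
	       atomic_load(&sp->unlock_count[idx]))
		;
}

Draining the inactive bank first, then flipping and draining the other, is what
keeps synchronize_srcu() starvation-free in the real implementation.
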
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 32798eb14853..1a1c1047d2ed 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -38,8 +38,8 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
sp->srcu_lock_nesting[0] = 0;
sp->srcu_lock_nesting[1] = 0;
init_swait_queue_head(&sp->srcu_wq);
- sp->srcu_gp_seq = 0;
- rcu_segcblist_init(&sp->srcu_cblist);
+ sp->srcu_cb_head = NULL;
+ sp->srcu_cb_tail = &sp->srcu_cb_head;
sp->srcu_gp_running = false;
sp->srcu_gp_waiting = false;
sp->srcu_idx = 0;
@@ -88,30 +88,14 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
{
WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
flush_work(&sp->srcu_work);
- WARN_ON(rcu_seq_state(sp->srcu_gp_seq));
WARN_ON(sp->srcu_gp_running);
WARN_ON(sp->srcu_gp_waiting);
- WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist));
+ WARN_ON(sp->srcu_cb_head);
+ WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail);
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
/*
- * Counts the new reader in the appropriate per-CPU element of the
- * srcu_struct. Can be invoked from irq/bh handlers, but the matching
- * __srcu_read_unlock() must be in the same handler instance. Returns an
- * index that must be passed to the matching srcu_read_unlock().
- */
-int __srcu_read_lock(struct srcu_struct *sp)
-{
- int idx;
-
- idx = READ_ONCE(sp->srcu_idx);
- WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1);
- return idx;
-}
-EXPORT_SYMBOL_GPL(__srcu_read_lock);
-
-/*
* Removes the count for the old reader from the appropriate element of
* the srcu_struct.
*/
@@ -133,52 +117,44 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
void srcu_drive_gp(struct work_struct *wp)
{
int idx;
- struct rcu_cblist ready_cbs;
- struct srcu_struct *sp;
+ struct rcu_head *lh;
struct rcu_head *rhp;
+ struct srcu_struct *sp;
sp = container_of(wp, struct srcu_struct, srcu_work);
- if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist))
+ if (sp->srcu_gp_running || !READ_ONCE(sp->srcu_cb_head))
return; /* Already running or nothing to do. */
- /* Tag recently arrived callbacks and wait for readers. */
+ /* Remove recently arrived callbacks and wait for readers. */
WRITE_ONCE(sp->srcu_gp_running, true);
- rcu_segcblist_accelerate(&sp->srcu_cblist,
- rcu_seq_snap(&sp->srcu_gp_seq));
- rcu_seq_start(&sp->srcu_gp_seq);
+ local_irq_disable();
+ lh = sp->srcu_cb_head;
+ sp->srcu_cb_head = NULL;
+ sp->srcu_cb_tail = &sp->srcu_cb_head;
+ local_irq_enable();
idx = sp->srcu_idx;
WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
- rcu_seq_end(&sp->srcu_gp_seq);
-
- /* Update callback list based on GP, and invoke ready callbacks. */
- rcu_segcblist_advance(&sp->srcu_cblist,
- rcu_seq_current(&sp->srcu_gp_seq));
- if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) {
- rcu_cblist_init(&ready_cbs);
- local_irq_disable();
- rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs);
- local_irq_enable();
- rhp = rcu_cblist_dequeue(&ready_cbs);
- for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
- local_bh_disable();
- rhp->func(rhp);
- local_bh_enable();
- }
- local_irq_disable();
- rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs);
- local_irq_enable();
+
+ /* Invoke the callbacks we removed above. */
+ while (lh) {
+ rhp = lh;
+ lh = lh->next;
+ local_bh_disable();
+ rhp->func(rhp);
+ local_bh_enable();
}
- WRITE_ONCE(sp->srcu_gp_running, false);
/*
- * If more callbacks, reschedule ourselves. This can race with
- * a call_srcu() at interrupt level, but the ->srcu_gp_running
- * checks will straighten that out.
+ * Enable rescheduling, and if there are more callbacks,
+ * reschedule ourselves. This can race with a call_srcu()
+ * at interrupt level, but the ->srcu_gp_running checks will
+ * straighten that out.
*/
- if (!rcu_segcblist_empty(&sp->srcu_cblist))
+ WRITE_ONCE(sp->srcu_gp_running, false);
+ if (READ_ONCE(sp->srcu_cb_head))
schedule_work(&sp->srcu_work);
}
EXPORT_SYMBOL_GPL(srcu_drive_gp);
@@ -187,14 +163,16 @@ EXPORT_SYMBOL_GPL(srcu_drive_gp);
* Enqueue an SRCU callback on the specified srcu_struct structure,
* initiating grace-period processing if it is not already running.
*/
-void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
+void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
rcu_callback_t func)
{
unsigned long flags;
- head->func = func;
+ rhp->func = func;
+ rhp->next = NULL;
local_irq_save(flags);
- rcu_segcblist_enqueue(&sp->srcu_cblist, head, false);
+ *sp->srcu_cb_tail = rhp;
+ sp->srcu_cb_tail = &rhp->next;
local_irq_restore(flags);
if (!READ_ONCE(sp->srcu_gp_running))
schedule_work(&sp->srcu_work);
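
The srcu_cb_head/srcu_cb_tail pair used above is the usual singly linked list
with a tail pointer: enqueue is O(1) and the grace-period worker detaches the
whole list in one shot. A stand-alone sketch of those mechanics (toy_cb names
invented; interrupt disabling and the swait handshake are omitted):

#include <stddef.h>

struct toy_cb {
	struct toy_cb *next;
	void (*func)(struct toy_cb *);
};

static struct toy_cb *cb_head;
static struct toy_cb **cb_tail = &cb_head;	/* -> last callback's ->next */

/* call_srcu() analogue: O(1) enqueue at the tail. */
static void cb_enqueue(struct toy_cb *rhp, void (*func)(struct toy_cb *))
{
	rhp->func = func;
	rhp->next = NULL;
	*cb_tail = rhp;
	cb_tail = &rhp->next;
}

/* srcu_drive_gp() analogue: detach the whole list, then invoke it. */
static void cb_drain(void)
{
	struct toy_cb *lh = cb_head;

	cb_head = NULL;
	cb_tail = &cb_head;
	while (lh) {
		struct toy_cb *rhp = lh;

		lh = lh->next;
		rhp->func(rhp);
	}
}
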
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 157654fa436a..d0ca524bf042 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -40,9 +40,15 @@
#include "rcu.h"
#include "rcu_segcblist.h"
-ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */
+/* Holdoff in nanoseconds for auto-expediting. */
+#define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000)
+static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF;
module_param(exp_holdoff, ulong, 0444);
+/* Overflow-check frequency. N bits roughly says every 2**N grace periods. */
+static ulong counter_wrap_check = (ULONG_MAX >> 2);
+module_param(counter_wrap_check, ulong, 0444);
+
static void srcu_invoke_callbacks(struct work_struct *work);
static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
@@ -70,7 +76,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
/* Each pass through this loop initializes one srcu_node structure. */
rcu_for_each_node_breadth_first(sp, snp) {
- spin_lock_init(&snp->lock);
+ raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock));
WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
ARRAY_SIZE(snp->srcu_data_have_cbs));
for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
@@ -104,7 +110,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
snp_first = sp->level[level];
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(sp->sda, cpu);
- spin_lock_init(&sdp->lock);
+ raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
rcu_segcblist_init(&sdp->srcu_cblist);
sdp->srcu_cblist_invoking = false;
sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
@@ -163,7 +169,7 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name,
/* Don't re-initialize a lock while it is held. */
debug_check_no_locks_freed((void *)sp, sizeof(*sp));
lockdep_init_map(&sp->dep_map, name, key, 0);
- spin_lock_init(&sp->gp_lock);
+ raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
return init_srcu_struct_fields(sp, false);
}
EXPORT_SYMBOL_GPL(__init_srcu_struct);
@@ -180,7 +186,7 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
*/
int init_srcu_struct(struct srcu_struct *sp)
{
- spin_lock_init(&sp->gp_lock);
+ raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
return init_srcu_struct_fields(sp, false);
}
EXPORT_SYMBOL_GPL(init_srcu_struct);
@@ -191,7 +197,7 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
* First-use initialization of statically allocated srcu_struct
* structure. Wiring up the combining tree is more than can be
* done with compile-time initialization, so this check is added
- * to each update-side SRCU primitive. Use ->gp_lock, which -is-
+ * to each update-side SRCU primitive. Use sp->lock, which -is-
* compile-time initialized, to resolve races involving multiple
* CPUs trying to garner first-use privileges.
*/
@@ -203,13 +209,13 @@ static void check_init_srcu_struct(struct srcu_struct *sp)
/* The smp_load_acquire() pairs with the smp_store_release(). */
if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
return; /* Already initialized. */
- spin_lock_irqsave(&sp->gp_lock, flags);
+ raw_spin_lock_irqsave_rcu_node(sp, flags);
if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
- spin_unlock_irqrestore(&sp->gp_lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sp, flags);
return;
}
init_srcu_struct_fields(sp, true);
- spin_unlock_irqrestore(&sp->gp_lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sp, flags);
}
/*
@@ -275,15 +281,20 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
* not mean that there are no more readers, as one could have read
* the current index but not have incremented the lock counter yet.
*
- * Possible bug: There is no guarantee that there haven't been
- * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were
- * counted, meaning that this could return true even if there are
- * still active readers. Since there are no memory barriers around
- * srcu_flip(), the CPU is not required to increment ->srcu_idx
- * before running srcu_readers_unlock_idx(), which means that there
- * could be an arbitrarily large number of critical sections that
- * execute after srcu_readers_unlock_idx() but use the old value
- * of ->srcu_idx.
+ * So suppose that the updater is preempted here for so long
+ * that more than ULONG_MAX non-nested readers come and go in
+ * the meantime. It turns out that this cannot result in overflow
+ * because if a reader modifies its unlock count after we read it
+ * above, then that reader's next load of ->srcu_idx is guaranteed
+ * to get the new value, which will cause it to operate on the
+ * other bank of counters, where it cannot contribute to the
+ * overflow of these counters. This means that there is a maximum
+ * of 2*NR_CPUS increments, which cannot overflow given current
+ * systems, especially not on 64-bit systems.
+ *
+ * OK, how about nesting? This does impose a limit on nesting
+ * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient,
+ * especially on 64-bit systems.
*/
return srcu_readers_lock_idx(sp, idx) == unlocks;
}
@@ -400,8 +411,7 @@ static void srcu_gp_start(struct srcu_struct *sp)
struct srcu_data *sdp = this_cpu_ptr(sp->sda);
int state;
- RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock),
- "Invoked srcu_gp_start() without ->gp_lock!");
+ lockdep_assert_held(&sp->lock);
WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&sp->srcu_gp_seq));
@@ -489,17 +499,20 @@ static void srcu_gp_end(struct srcu_struct *sp)
{
unsigned long cbdelay;
bool cbs;
+ int cpu;
+ unsigned long flags;
unsigned long gpseq;
int idx;
int idxnext;
unsigned long mask;
+ struct srcu_data *sdp;
struct srcu_node *snp;
/* Prevent more than one additional grace period. */
mutex_lock(&sp->srcu_cb_mutex);
/* End the current grace period. */
- spin_lock_irq(&sp->gp_lock);
+ raw_spin_lock_irq_rcu_node(sp);
idx = rcu_seq_state(sp->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
cbdelay = srcu_get_delay(sp);
@@ -508,7 +521,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
gpseq = rcu_seq_current(&sp->srcu_gp_seq);
if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq))
sp->srcu_gp_seq_needed_exp = gpseq;
- spin_unlock_irq(&sp->gp_lock);
+ raw_spin_unlock_irq_rcu_node(sp);
mutex_unlock(&sp->srcu_gp_mutex);
/* A new grace period can start at this point. But only one. */
@@ -516,7 +529,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
rcu_for_each_node_breadth_first(sp, snp) {
- spin_lock_irq(&snp->lock);
+ raw_spin_lock_irq_rcu_node(snp);
cbs = false;
if (snp >= sp->level[rcu_num_lvls - 1])
cbs = snp->srcu_have_cbs[idx] == gpseq;
@@ -526,28 +539,37 @@ static void srcu_gp_end(struct srcu_struct *sp)
snp->srcu_gp_seq_needed_exp = gpseq;
mask = snp->srcu_data_have_cbs[idx];
snp->srcu_data_have_cbs[idx] = 0;
- spin_unlock_irq(&snp->lock);
- if (cbs) {
- smp_mb(); /* GP end before CB invocation. */
+ raw_spin_unlock_irq_rcu_node(snp);
+ if (cbs)
srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
- }
+
+ /* Occasionally prevent srcu_data counter wrap. */
+ if (!(gpseq & counter_wrap_check))
+ for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
+ sdp = per_cpu_ptr(sp->sda, cpu);
+ raw_spin_lock_irqsave_rcu_node(sdp, flags);
+ if (ULONG_CMP_GE(gpseq,
+ sdp->srcu_gp_seq_needed + 100))
+ sdp->srcu_gp_seq_needed = gpseq;
+ raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
+ }
}
/* Callback initiation done, allow grace periods after next. */
mutex_unlock(&sp->srcu_cb_mutex);
/* Start a new grace period if needed. */
- spin_lock_irq(&sp->gp_lock);
+ raw_spin_lock_irq_rcu_node(sp);
gpseq = rcu_seq_current(&sp->srcu_gp_seq);
if (!rcu_seq_state(gpseq) &&
ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
srcu_gp_start(sp);
- spin_unlock_irq(&sp->gp_lock);
+ raw_spin_unlock_irq_rcu_node(sp);
/* Throttle expedited grace periods: Should be rare! */
srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
? 0 : SRCU_INTERVAL);
} else {
- spin_unlock_irq(&sp->gp_lock);
+ raw_spin_unlock_irq_rcu_node(sp);
}
}
@@ -567,18 +589,18 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
if (rcu_seq_done(&sp->srcu_gp_seq, s) ||
ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
return;
- spin_lock_irqsave(&snp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(snp, flags);
if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
- spin_unlock_irqrestore(&snp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
return;
}
WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore(&snp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
}
- spin_lock_irqsave(&sp->gp_lock, flags);
+ raw_spin_lock_irqsave_rcu_node(sp, flags);
if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
sp->srcu_gp_seq_needed_exp = s;
- spin_unlock_irqrestore(&sp->gp_lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sp, flags);
}
/*
@@ -600,14 +622,13 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
for (; snp != NULL; snp = snp->srcu_parent) {
if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
return; /* GP already done and CBs recorded. */
- spin_lock_irqsave(&snp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(snp, flags);
if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
snp_seq = snp->srcu_have_cbs[idx];
if (snp == sdp->mynode && snp_seq == s)
snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
- spin_unlock_irqrestore(&snp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
if (snp == sdp->mynode && snp_seq != s) {
- smp_mb(); /* CBs after GP! */
srcu_schedule_cbs_sdp(sdp, do_norm
? SRCU_INTERVAL
: 0);
@@ -622,11 +643,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
snp->srcu_gp_seq_needed_exp = s;
- spin_unlock_irqrestore(&snp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(snp, flags);
}
/* Top of tree, must ensure the grace period will be started. */
- spin_lock_irqsave(&sp->gp_lock, flags);
+ raw_spin_lock_irqsave_rcu_node(sp, flags);
if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
/*
* Record need for grace period s. Pair with load
@@ -645,7 +666,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
queue_delayed_work(system_power_efficient_wq, &sp->work,
srcu_get_delay(sp));
}
- spin_unlock_irqrestore(&sp->gp_lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sp, flags);
}
/*
@@ -671,6 +692,16 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
*/
static void srcu_flip(struct srcu_struct *sp)
{
+ /*
+ * Ensure that if this updater saw a given reader's increment
+ * from __srcu_read_lock(), that reader was using an old value
+ * of ->srcu_idx. Also ensure that if a given reader sees the
+ * new value of ->srcu_idx, this updater's earlier scans cannot
+ * have seen that reader's increments (which is OK, because this
+ * grace period need not wait on that reader).
+ */
+ smp_mb(); /* E */ /* Pairs with B and C. */
+
WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1);
/*
@@ -745,6 +776,13 @@ static bool srcu_might_be_idle(struct srcu_struct *sp)
}
/*
+ * SRCU callback function to leak a callback.
+ */
+static void srcu_leak_callback(struct rcu_head *rhp)
+{
+}
+
+/*
* Enqueue an SRCU callback on the srcu_data structure associated with
* the current CPU and the specified srcu_struct structure, initiating
* grace-period processing if it is not already running.
@@ -782,10 +820,16 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
struct srcu_data *sdp;
check_init_srcu_struct(sp);
+ if (debug_rcu_head_queue(rhp)) {
+ /* Probable double call_srcu(), so leak the callback. */
+ WRITE_ONCE(rhp->func, srcu_leak_callback);
+ WARN_ONCE(1, "call_srcu(): Leaked duplicate callback\n");
+ return;
+ }
rhp->func = func;
local_irq_save(flags);
sdp = this_cpu_ptr(sp->sda);
- spin_lock(&sdp->lock);
+ raw_spin_lock_rcu_node(sdp);
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&sp->srcu_gp_seq));
@@ -799,13 +843,30 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
sdp->srcu_gp_seq_needed_exp = s;
needexp = true;
}
- spin_unlock_irqrestore(&sdp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
if (needgp)
srcu_funnel_gp_start(sp, sdp, s, do_norm);
else if (needexp)
srcu_funnel_exp_start(sp, sdp->mynode, s);
}
+/**
+ * call_srcu() - Queue a callback for invocation after an SRCU grace period
+ * @sp: srcu_struct on which to queue the callback
+ * @rhp: structure to be used for queueing the SRCU callback.
+ * @func: function to be invoked after the SRCU grace period
+ *
+ * The callback function will be invoked some time after a full SRCU
+ * grace period elapses, in other words after all pre-existing SRCU
+ * read-side critical sections have completed. However, the callback
+ * function might well execute concurrently with other SRCU read-side
+ * critical sections that started after call_srcu() was invoked. SRCU
+ * read-side critical sections are delimited by srcu_read_lock() and
+ * srcu_read_unlock(), and may be nested.
+ *
+ * The callback will be invoked from process context, but must nevertheless
+ * be fast and must not block.
+ */
void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
rcu_callback_t func)
{
@@ -953,13 +1014,16 @@ void srcu_barrier(struct srcu_struct *sp)
*/
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(sp->sda, cpu);
- spin_lock_irq(&sdp->lock);
+ raw_spin_lock_irq_rcu_node(sdp);
atomic_inc(&sp->srcu_barrier_cpu_cnt);
sdp->srcu_barrier_head.func = srcu_barrier_cb;
+ debug_rcu_head_queue(&sdp->srcu_barrier_head);
if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
- &sdp->srcu_barrier_head, 0))
+ &sdp->srcu_barrier_head, 0)) {
+ debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
atomic_dec(&sp->srcu_barrier_cpu_cnt);
- spin_unlock_irq(&sdp->lock);
+ }
+ raw_spin_unlock_irq_rcu_node(sdp);
}
/* Remove the initial count, at which point reaching zero can happen. */
@@ -1008,17 +1072,17 @@ static void srcu_advance_state(struct srcu_struct *sp)
*/
idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
if (idx == SRCU_STATE_IDLE) {
- spin_lock_irq(&sp->gp_lock);
+ raw_spin_lock_irq_rcu_node(sp);
if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
- spin_unlock_irq(&sp->gp_lock);
+ raw_spin_unlock_irq_rcu_node(sp);
mutex_unlock(&sp->srcu_gp_mutex);
return;
}
idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
if (idx == SRCU_STATE_IDLE)
srcu_gp_start(sp);
- spin_unlock_irq(&sp->gp_lock);
+ raw_spin_unlock_irq_rcu_node(sp);
if (idx != SRCU_STATE_IDLE) {
mutex_unlock(&sp->srcu_gp_mutex);
return; /* Someone else started the grace period. */
@@ -1067,22 +1131,22 @@ static void srcu_invoke_callbacks(struct work_struct *work)
sdp = container_of(work, struct srcu_data, work.work);
sp = sdp->sp;
rcu_cblist_init(&ready_cbs);
- spin_lock_irq(&sdp->lock);
- smp_mb(); /* Old grace periods before callback invocation! */
+ raw_spin_lock_irq_rcu_node(sdp);
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&sp->srcu_gp_seq));
if (sdp->srcu_cblist_invoking ||
!rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
- spin_unlock_irq(&sdp->lock);
+ raw_spin_unlock_irq_rcu_node(sdp);
return; /* Someone else on the job or nothing to do. */
}
/* We are on the job! Extract and invoke ready callbacks. */
sdp->srcu_cblist_invoking = true;
rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
- spin_unlock_irq(&sdp->lock);
+ raw_spin_unlock_irq_rcu_node(sdp);
rhp = rcu_cblist_dequeue(&ready_cbs);
for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
+ debug_rcu_head_unqueue(rhp);
local_bh_disable();
rhp->func(rhp);
local_bh_enable();
@@ -1092,13 +1156,13 @@ static void srcu_invoke_callbacks(struct work_struct *work)
* Update counts, accelerate new callbacks, and if needed,
* schedule another round of callback invocation.
*/
- spin_lock_irq(&sdp->lock);
+ raw_spin_lock_irq_rcu_node(sdp);
rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
rcu_seq_snap(&sp->srcu_gp_seq));
sdp->srcu_cblist_invoking = false;
more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
- spin_unlock_irq(&sdp->lock);
+ raw_spin_unlock_irq_rcu_node(sdp);
if (more)
srcu_schedule_cbs_sdp(sdp, 0);
}
@@ -1111,7 +1175,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
{
bool pushgp = true;
- spin_lock_irq(&sp->gp_lock);
+ raw_spin_lock_irq_rcu_node(sp);
if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
/* All requests fulfilled, time to go idle. */
@@ -1121,7 +1185,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
/* Outstanding request and no GP. Start one. */
srcu_gp_start(sp);
}
- spin_unlock_irq(&sp->gp_lock);
+ raw_spin_unlock_irq_rcu_node(sp);
if (pushgp)
queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
@@ -1152,3 +1216,12 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
*gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed);
}
EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
+
+static int __init srcu_bootup_announce(void)
+{
+ pr_info("Hierarchical SRCU implementation.\n");
+ if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
+ pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
+ return 0;
+}
+early_initcall(srcu_bootup_announce);
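
The new counter_wrap_check parameter is used above as a mask: the per-CPU
->srcu_gp_seq_needed values are resynchronized only when all mask bits of the
grace-period sequence number are clear, so a mask with N low-order bits set
fires roughly every 2**N grace periods. A quick user-space check of that
cadence (illustrative only, with an invented 8-bit mask):

#include <stdio.h>

int main(void)
{
	unsigned long mask = (1UL << 8) - 1;	/* counter_wrap_check analogue */
	unsigned long gpseq;
	int fired = 0;

	for (gpseq = 1; gpseq <= 1024; gpseq++)
		if (!(gpseq & mask))		/* same test as in srcu_gp_end() */
			fired++;

	printf("fired %d times in 1024 grace periods\n", fired);	/* prints 4 */
	return 0;
}
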
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index e5385731e391..f8488965250f 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -35,15 +35,26 @@
#include <linux/time.h>
#include <linux/cpu.h>
#include <linux/prefetch.h>
-#include <linux/trace_events.h>
#include "rcu.h"
-/* Forward declarations for tiny_plugin.h. */
-struct rcu_ctrlblk;
-static void __call_rcu(struct rcu_head *head,
- rcu_callback_t func,
- struct rcu_ctrlblk *rcp);
+/* Global control variables for rcupdate callback mechanism. */
+struct rcu_ctrlblk {
+ struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
+ struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
+ struct rcu_head **curtail; /* ->next pointer of last CB. */
+};
+
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_sched_ctrlblk = {
+ .donetail = &rcu_sched_ctrlblk.rcucblist,
+ .curtail = &rcu_sched_ctrlblk.rcucblist,
+};
+
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+ .donetail = &rcu_bh_ctrlblk.rcucblist,
+ .curtail = &rcu_bh_ctrlblk.rcucblist,
+};
#include "tiny_plugin.h"
@@ -59,19 +70,6 @@ void rcu_barrier_sched(void)
}
EXPORT_SYMBOL(rcu_barrier_sched);
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
-
-/*
- * Test whether RCU thinks that the current CPU is idle.
- */
-bool notrace __rcu_is_watching(void)
-{
- return true;
-}
-EXPORT_SYMBOL(__rcu_is_watching);
-
-#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
-
/*
* Helper function for rcu_sched_qs() and rcu_bh_qs().
* Also irqs are disabled to avoid confusion due to interrupt handlers
@@ -79,7 +77,6 @@ EXPORT_SYMBOL(__rcu_is_watching);
*/
static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
{
- RCU_TRACE(reset_cpu_stall_ticks(rcp);)
if (rcp->donetail != rcp->curtail) {
rcp->donetail = rcp->curtail;
return 1;
@@ -125,7 +122,6 @@ void rcu_bh_qs(void)
*/
void rcu_check_callbacks(int user)
{
- RCU_TRACE(check_cpu_stalls();)
if (user)
rcu_sched_qs();
else if (!in_softirq())
@@ -140,10 +136,8 @@ void rcu_check_callbacks(int user)
*/
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
{
- const char *rn = NULL;
struct rcu_head *next, *list;
unsigned long flags;
- RCU_TRACE(int cb_count = 0;)
/* Move the ready-to-invoke callbacks to a local list. */
local_irq_save(flags);
@@ -152,7 +146,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
local_irq_restore(flags);
return;
}
- RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);)
list = rcp->rcucblist;
rcp->rcucblist = *rcp->donetail;
*rcp->donetail = NULL;
@@ -162,22 +155,15 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
local_irq_restore(flags);
/* Invoke the callbacks on the local list. */
- RCU_TRACE(rn = rcp->name;)
while (list) {
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
local_bh_disable();
- __rcu_reclaim(rn, list);
+ __rcu_reclaim("", list);
local_bh_enable();
list = next;
- RCU_TRACE(cb_count++;)
}
- RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);)
- RCU_TRACE(trace_rcu_batch_end(rcp->name,
- cb_count, 0, need_resched(),
- is_idle_task(current),
- false));
}
static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
@@ -221,7 +207,6 @@ static void __call_rcu(struct rcu_head *head,
local_irq_save(flags);
*rcp->curtail = head;
rcp->curtail = &head->next;
- RCU_TRACE(rcp->qlen++;)
local_irq_restore(flags);
if (unlikely(is_idle_task(current))) {
@@ -254,8 +239,5 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
- RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);)
- RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);)
-
rcu_early_boot_tests();
}
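
The rcu_ctrlblk layout moved into tiny.c above keeps a single callback list
with two tail pointers: ->curtail marks the end of everything queued, and
->donetail marks the end of the portion whose grace period has elapsed. A
stand-alone sketch of those mechanics (toy_* names invented; interrupt
disabling omitted):

#include <stddef.h>

struct toy_head {
	struct toy_head *next;
	void (*func)(struct toy_head *);
};

/* Mirrors struct rcu_ctrlblk: one list, two tail pointers. */
struct toy_ctrlblk {
	struct toy_head *rcucblist;	/* all pending callbacks */
	struct toy_head **donetail;	/* end of the "done" portion */
	struct toy_head **curtail;	/* end of the whole list */
};

static struct toy_ctrlblk tcb = {
	.donetail = &tcb.rcucblist,
	.curtail = &tcb.rcucblist,
};

/* __call_rcu() analogue: append at the current tail. */
static void toy_call(struct toy_head *head, void (*func)(struct toy_head *))
{
	head->func = func;
	head->next = NULL;
	*tcb.curtail = head;
	tcb.curtail = &head->next;
}

/* rcu_qsctr_help() analogue: a quiescent state marks everything queued
 * so far as done. */
static void toy_qs(void)
{
	tcb.donetail = tcb.curtail;
}

/* __rcu_process_callbacks() analogue: detach and invoke the done portion. */
static void toy_process(void)
{
	struct toy_head *list = tcb.rcucblist;

	if (tcb.donetail == &tcb.rcucblist)
		return;			/* nothing has reached "done" yet */
	tcb.rcucblist = *tcb.donetail;
	*tcb.donetail = NULL;
	if (tcb.curtail == tcb.donetail)
		tcb.curtail = &tcb.rcucblist;
	tcb.donetail = &tcb.rcucblist;
	while (list) {
		struct toy_head *next = list->next;

		list->func(list);
		list = next;
	}
}
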
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 371034e77f87..f0a01b2a3062 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -22,36 +22,6 @@
* Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
*/
-#include <linux/kthread.h>
-#include <linux/init.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
- struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
- struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
- struct rcu_head **curtail; /* ->next pointer of last CB. */
- RCU_TRACE(long qlen); /* Number of pending CBs. */
- RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
- RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
- RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
- RCU_TRACE(const char *name); /* Name of RCU type. */
-};
-
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_sched_ctrlblk = {
- .donetail = &rcu_sched_ctrlblk.rcucblist,
- .curtail = &rcu_sched_ctrlblk.rcucblist,
- RCU_TRACE(.name = "rcu_sched")
-};
-
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
- .donetail = &rcu_bh_ctrlblk.rcucblist,
- .curtail = &rcu_bh_ctrlblk.rcucblist,
- RCU_TRACE(.name = "rcu_bh")
-};
-
#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
#include <linux/kernel_stat.h>
@@ -75,96 +45,3 @@ void __init rcu_scheduler_starting(void)
}
#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
-
-#ifdef CONFIG_RCU_TRACE
-
-static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rcp->qlen -= n;
- local_irq_restore(flags);
-}
-
-/*
- * Dump statistics for TINY_RCU, such as they are.
- */
-static int show_tiny_stats(struct seq_file *m, void *unused)
-{
- seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
- seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
- return 0;
-}
-
-static int show_tiny_stats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_tiny_stats, NULL);
-}
-
-static const struct file_operations show_tiny_stats_fops = {
- .owner = THIS_MODULE,
- .open = show_tiny_stats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static struct dentry *rcudir;
-
-static int __init rcutiny_trace_init(void)
-{
- struct dentry *retval;
-
- rcudir = debugfs_create_dir("rcu", NULL);
- if (!rcudir)
- goto free_out;
- retval = debugfs_create_file("rcudata", 0444, rcudir,
- NULL, &show_tiny_stats_fops);
- if (!retval)
- goto free_out;
- return 0;
-free_out:
- debugfs_remove_recursive(rcudir);
- return 1;
-}
-device_initcall(rcutiny_trace_init);
-
-static void check_cpu_stall(struct rcu_ctrlblk *rcp)
-{
- unsigned long j;
- unsigned long js;
-
- if (rcu_cpu_stall_suppress)
- return;
- rcp->ticks_this_gp++;
- j = jiffies;
- js = READ_ONCE(rcp->jiffies_stall);
- if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
- pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
- rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
- jiffies - rcp->gp_start, rcp->qlen);
- dump_stack();
- WRITE_ONCE(rcp->jiffies_stall,
- jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
- } else if (ULONG_CMP_GE(j, js)) {
- WRITE_ONCE(rcp->jiffies_stall,
- jiffies + rcu_jiffies_till_stall_check());
- }
-}
-
-static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
-{
- rcp->ticks_this_gp = 0;
- rcp->gp_start = jiffies;
- WRITE_ONCE(rcp->jiffies_stall,
- jiffies + rcu_jiffies_till_stall_check());
-}
-
-static void check_cpu_stalls(void)
-{
- RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);)
- RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);)
-}
-
-#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e354e475e645..51d4c3acf32d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -168,35 +168,17 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp,
static void sync_sched_exp_online_cleanup(int cpu);
/* rcuc/rcub kthread realtime priority */
-#ifdef CONFIG_RCU_KTHREAD_PRIO
-static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
-#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
-#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */
module_param(kthread_prio, int, 0644);
/* Delay in jiffies for grace-period initialization delays, debug only. */
-#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT
-static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY;
-module_param(gp_preinit_delay, int, 0644);
-#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
-static const int gp_preinit_delay;
-#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
-
-#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT
-static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY;
-module_param(gp_init_delay, int, 0644);
-#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
-static const int gp_init_delay;
-#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
-
-#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP
-static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY;
-module_param(gp_cleanup_delay, int, 0644);
-#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
-static const int gp_cleanup_delay;
-#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
+static int gp_preinit_delay;
+module_param(gp_preinit_delay, int, 0444);
+static int gp_init_delay;
+module_param(gp_init_delay, int, 0444);
+static int gp_cleanup_delay;
+module_param(gp_cleanup_delay, int, 0444);
/*
* Number of grace periods between delays, normalized by the duration of
@@ -250,6 +232,7 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
*/
void rcu_sched_qs(void)
{
+ RCU_LOCKDEP_WARN(preemptible(), "rcu_sched_qs() invoked with preemption enabled!!!");
if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s))
return;
trace_rcu_grace_period(TPS("rcu_sched"),
@@ -265,6 +248,7 @@ void rcu_sched_qs(void)
void rcu_bh_qs(void)
{
+ RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!");
if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
trace_rcu_grace_period(TPS("rcu_bh"),
__this_cpu_read(rcu_bh_data.gpnum),
@@ -286,10 +270,6 @@ void rcu_bh_qs(void)
static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
- .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
- .dynticks_idle = ATOMIC_INIT(1),
-#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
};
/*
@@ -478,7 +458,7 @@ void rcu_note_context_switch(bool preempt)
barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch"));
rcu_sched_qs();
- rcu_preempt_note_context_switch();
+ rcu_preempt_note_context_switch(preempt);
/* Load rcu_urgent_qs before other flags. */
if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs)))
goto out;
@@ -534,9 +514,12 @@ void rcu_all_qs(void)
}
EXPORT_SYMBOL_GPL(rcu_all_qs);
-static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
-static long qhimark = 10000; /* If this many pending, ignore blimit. */
-static long qlowmark = 100; /* Once only this many pending, use blimit. */
+#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */
+static long blimit = DEFAULT_RCU_BLIMIT;
+#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */
+static long qhimark = DEFAULT_RCU_QHIMARK;
+#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */
+static long qlowmark = DEFAULT_RCU_QLOMARK;
module_param(blimit, long, 0444);
module_param(qhimark, long, 0444);
@@ -559,10 +542,7 @@ module_param(jiffies_till_sched_qs, ulong, 0644);
static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp);
-static void force_qs_rnp(struct rcu_state *rsp,
- int (*f)(struct rcu_data *rsp, bool *isidle,
- unsigned long *maxj),
- bool *isidle, unsigned long *maxj);
+static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp));
static void force_quiescent_state(struct rcu_state *rsp);
static int rcu_pending(void);
@@ -757,6 +737,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
int *fp = &rnp->need_future_gp[idx];
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!");
return READ_ONCE(*fp);
}
@@ -768,6 +749,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
static bool
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!");
if (rcu_gp_in_progress(rsp))
return false; /* No, a grace period is already in progress. */
if (rcu_future_needs_gp(rsp))
@@ -794,6 +776,7 @@ static void rcu_eqs_enter_common(bool user)
struct rcu_data *rdp;
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!");
trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0);
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
!user && !is_idle_task(current)) {
@@ -864,7 +847,6 @@ void rcu_idle_enter(void)
local_irq_save(flags);
rcu_eqs_enter(false);
- rcu_sysidle_enter(0);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -914,7 +896,6 @@ void rcu_irq_exit(void)
trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1);
rdtp->dynticks_nesting--;
}
- rcu_sysidle_enter(1);
}
/*
@@ -967,6 +948,7 @@ static void rcu_eqs_exit(bool user)
struct rcu_dynticks *rdtp;
long long oldval;
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!");
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
@@ -995,7 +977,6 @@ void rcu_idle_exit(void)
local_irq_save(flags);
rcu_eqs_exit(false);
- rcu_sysidle_exit(0);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -1047,7 +1028,6 @@ void rcu_irq_enter(void)
trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
else
rcu_eqs_exit_common(oldval, true);
- rcu_sysidle_exit(1);
}
/*
@@ -1130,22 +1110,11 @@ void rcu_nmi_exit(void)
}
/**
- * __rcu_is_watching - are RCU read-side critical sections safe?
- *
- * Return true if RCU is watching the running CPU, which means that
- * this CPU can safely enter RCU read-side critical sections. Unlike
- * rcu_is_watching(), the caller of __rcu_is_watching() must have at
- * least disabled preemption.
- */
-bool notrace __rcu_is_watching(void)
-{
- return !rcu_dynticks_curr_cpu_in_eqs();
-}
-
-/**
* rcu_is_watching - see if RCU thinks that the current CPU is idle
*
- * If the current CPU is in its idle loop and is neither in an interrupt
+ * Return true if RCU is watching the running CPU, which means that this
+ * CPU can safely enter RCU read-side critical sections. In other words,
+ * if the current CPU is in its idle loop and is neither in an interrupt
* or NMI handler, return true.
*/
bool notrace rcu_is_watching(void)
@@ -1153,7 +1122,7 @@ bool notrace rcu_is_watching(void)
bool ret;
preempt_disable_notrace();
- ret = __rcu_is_watching();
+ ret = !rcu_dynticks_curr_cpu_in_eqs();
preempt_enable_notrace();
return ret;
}
@@ -1237,11 +1206,9 @@ static int rcu_is_cpu_rrupt_from_idle(void)
* credit them with an implicit quiescent state. Return 1 if this CPU
* is in dynticks idle mode, which is an extended quiescent state.
*/
-static int dyntick_save_progress_counter(struct rcu_data *rdp,
- bool *isidle, unsigned long *maxj)
+static int dyntick_save_progress_counter(struct rcu_data *rdp)
{
rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks);
- rcu_sysidle_check_cpu(rdp, isidle, maxj);
if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
@@ -1258,8 +1225,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
* idle state since the last call to dyntick_save_progress_counter()
* for this same CPU, or by virtue of having been offline.
*/
-static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
- bool *isidle, unsigned long *maxj)
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
{
unsigned long jtsq;
bool *rnhqp;
@@ -1674,6 +1640,8 @@ void rcu_cpu_stall_reset(void)
static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
struct rcu_node *rnp)
{
+ lockdep_assert_held(&rnp->lock);
+
/*
* If RCU is idle, we just wait for the next grace period.
* But we can only be sure that RCU is idle if we are looking
@@ -1719,6 +1687,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
bool ret = false;
struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+ lockdep_assert_held(&rnp->lock);
+
/*
* Pick up grace-period number for new callbacks. If this
* grace period is already marked as needed, return to the caller.
@@ -1845,6 +1815,8 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
{
bool ret = false;
+ lockdep_assert_held(&rnp->lock);
+
/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
if (!rcu_segcblist_pend_cbs(&rdp->cblist))
return false;
@@ -1883,6 +1855,8 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
+ lockdep_assert_held(&rnp->lock);
+
/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
if (!rcu_segcblist_pend_cbs(&rdp->cblist))
return false;
@@ -1909,6 +1883,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
bool ret;
bool need_gp;
+ lockdep_assert_held(&rnp->lock);
+
/* Handle the ends of any preceding grace periods first. */
if (rdp->completed == rnp->completed &&
!unlikely(READ_ONCE(rdp->gpwrap))) {
@@ -2115,25 +2091,16 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
*/
static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
{
- bool isidle = false;
- unsigned long maxj;
struct rcu_node *rnp = rcu_get_root(rsp);
WRITE_ONCE(rsp->gp_activity, jiffies);
rsp->n_force_qs++;
if (first_time) {
/* Collect dyntick-idle snapshots. */
- if (is_sysidle_rcu_state(rsp)) {
- isidle = true;
- maxj = jiffies - ULONG_MAX / 4;
- }
- force_qs_rnp(rsp, dyntick_save_progress_counter,
- &isidle, &maxj);
- rcu_sysidle_report_gp(rsp, isidle, maxj);
+ force_qs_rnp(rsp, dyntick_save_progress_counter);
} else {
/* Handle dyntick-idle and offline CPUs. */
- isidle = true;
- force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
+ force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
}
/* Clear flag to prevent immediate re-entry. */
if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
@@ -2341,6 +2308,7 @@ static bool
rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
+ lockdep_assert_held(&rnp->lock);
if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
/*
* Either we have not yet spawned the grace-period
@@ -2402,6 +2370,7 @@ static bool rcu_start_gp(struct rcu_state *rsp)
static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
{
+ lockdep_assert_held(&rcu_get_root(rsp)->lock);
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
@@ -2426,6 +2395,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
unsigned long oldmask = 0;
struct rcu_node *rnp_c;
+ lockdep_assert_held(&rnp->lock);
+
/* Walk up the rcu_node hierarchy. */
for (;;) {
if (!(rnp->qsmask & mask) || rnp->gpnum != gps) {
@@ -2486,6 +2457,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
unsigned long mask;
struct rcu_node *rnp_p;
+ lockdep_assert_held(&rnp->lock);
if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -2599,6 +2571,8 @@ static void
rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
struct rcu_node *rnp, struct rcu_data *rdp)
{
+ lockdep_assert_held(&rsp->orphan_lock);
+
/* No-CBs CPUs do not have orphanable callbacks. */
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu))
return;
@@ -2639,6 +2613,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
{
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
+ lockdep_assert_held(&rsp->orphan_lock);
+
/* No-CBs CPUs are handled specially. */
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
@@ -2705,6 +2681,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
long mask;
struct rcu_node *rnp = rnp_leaf;
+ lockdep_assert_held(&rnp->lock);
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
return;
@@ -2895,10 +2872,7 @@ void rcu_check_callbacks(int user)
*
* The caller must have suppressed start of new grace periods.
*/
-static void force_qs_rnp(struct rcu_state *rsp,
- int (*f)(struct rcu_data *rsp, bool *isidle,
- unsigned long *maxj),
- bool *isidle, unsigned long *maxj)
+static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp))
{
int cpu;
unsigned long flags;
@@ -2937,7 +2911,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
for_each_leaf_node_possible_cpu(rnp, cpu) {
unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
if ((rnp->qsmask & bit) != 0) {
- if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
+ if (f(per_cpu_ptr(rsp->rda, cpu)))
mask |= bit;
}
}
@@ -3143,9 +3117,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
if (debug_rcu_head_queue(head)) {
- /* Probable double call_rcu(), so leak the callback. */
+ /*
+ * Probable double call_rcu(), so leak the callback.
+ * Use rcu:rcu_callback trace event to find the previous
+ * time callback was passed to __call_rcu().
+ */
+ WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n",
+ head, head->func);
WRITE_ONCE(head->func, rcu_leak_callback);
- WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
return;
}
head->func = func;
@@ -3194,8 +3173,24 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
local_irq_restore(flags);
}
-/*
- * Queue an RCU-sched callback for invocation after a grace period.
+/**
+ * call_rcu_sched() - Queue an RCU callback for invocation after a sched grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_sched() assumes
+ * that the read-side critical sections end on enabling of preemption
+ * or on voluntary preemption.
+ * RCU read-side critical sections are delimited by:
+ * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
+ * - anything that disables preemption.
+ *
+ * These may be nested.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
*/
void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
{
@@ -3203,8 +3198,26 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
}
EXPORT_SYMBOL_GPL(call_rcu_sched);
-/*
- * Queue an RCU callback for invocation after a quicker grace period.
+/**
+ * call_rcu_bh() - Queue an RCU callback for invocation after a quicker grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_bh() assumes
+ * that the read-side critical sections end on completion of a softirq
+ * handler. This means that read-side critical sections in process
+ * context must not be interrupted by softirqs. This interface is to be
+ * used when most of the read-side critical sections are in softirq context.
+ * RCU read-side critical sections are delimited by:
+ * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context.
+ * OR
+ * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
+ * These may be nested.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
*/
void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
{
@@ -3280,12 +3293,6 @@ static inline int rcu_blocking_is_gp(void)
* to have executed a full memory barrier during the execution of
* synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
* again only if the system has more than one CPU).
- *
- * This primitive provides the guarantees made by the (now removed)
- * synchronize_kernel() API. In contrast, synchronize_rcu() only
- * guarantees that rcu_read_lock() sections will have completed.
- * In "classic RCU", these two guarantees happen to be one and
- * the same, but can differ in realtime RCU implementations.
*/
void synchronize_sched(void)
{
@@ -3578,8 +3585,14 @@ static void rcu_barrier_func(void *type)
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
_rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);
- atomic_inc(&rsp->barrier_cpu_count);
- rsp->call(&rdp->barrier_head, rcu_barrier_callback);
+ rdp->barrier_head.func = rcu_barrier_callback;
+ debug_rcu_head_queue(&rdp->barrier_head);
+ if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
+ atomic_inc(&rsp->barrier_cpu_count);
+ } else {
+ debug_rcu_head_unqueue(&rdp->barrier_head);
+ _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence);
+ }
}
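
The entrain logic above is what lets rcu_barrier() count only CPUs that actually have a callback queued. From the caller's side, the typical use, sketched here with hypothetical helpers, is a module exit path that must not return while any of its callbacks are still pending:

static struct kmem_cache *example_cache;	/* hypothetical cache */
extern void example_remove_all(void);		/* hypothetical: each removal does a call_rcu() */

static void __exit example_exit(void)
{
	example_remove_all();	/* stop queueing new callbacks first ... */
	rcu_barrier();		/* ... then wait for every already-queued one */
	kmem_cache_destroy(example_cache);
}
module_exit(example_exit);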
/*
@@ -3698,6 +3711,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
long mask;
struct rcu_node *rnp = rnp_leaf;
+ lockdep_assert_held(&rnp->lock);
for (;;) {
mask = rnp->grpmask;
rnp = rnp->parent;
@@ -3753,7 +3767,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
!init_nocb_callback_list(rdp))
rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
- rcu_sysidle_init_percpu_data(rdp->dynticks);
rcu_dynticks_eqs_online();
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index ba38262c3554..9af0f31d6847 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -45,14 +45,6 @@ struct rcu_dynticks {
bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */
unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */
bool rcu_urgent_qs; /* GP old need light quiescent state. */
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
- long long dynticks_idle_nesting;
- /* irq/process nesting level from idle. */
- atomic_t dynticks_idle; /* Even value for idle, else odd. */
- /* "Idle" excludes userspace execution. */
- unsigned long dynticks_idle_jiffies;
- /* End of last non-NMI non-idle period. */
-#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
#ifdef CONFIG_RCU_FAST_NO_HZ
bool all_lazy; /* Are all CPU's CBs lazy? */
unsigned long nonlazy_posted;
@@ -160,19 +152,6 @@ struct rcu_node {
/* Number of tasks boosted for expedited GP. */
unsigned long n_normal_boosts;
/* Number of tasks boosted for normal GP. */
- unsigned long n_balk_blkd_tasks;
- /* Refused to boost: no blocked tasks. */
- unsigned long n_balk_exp_gp_tasks;
- /* Refused to boost: nothing blocking GP. */
- unsigned long n_balk_boost_tasks;
- /* Refused to boost: already boosting. */
- unsigned long n_balk_notblocked;
- /* Refused to boost: RCU RS CS still running. */
- unsigned long n_balk_notyet;
- /* Refused to boost: not yet time. */
- unsigned long n_balk_nos;
- /* Refused to boost: not sure why, though. */
- /* This can happen due to race conditions. */
#ifdef CONFIG_RCU_NOCB_CPU
struct swait_queue_head nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
@@ -312,9 +291,9 @@ struct rcu_data {
};
/* Values for nocb_defer_wakeup field in struct rcu_data. */
-#define RCU_NOGP_WAKE_NOT 0
-#define RCU_NOGP_WAKE 1
-#define RCU_NOGP_WAKE_FORCE 2
+#define RCU_NOCB_WAKE_NOT 0
+#define RCU_NOCB_WAKE 1
+#define RCU_NOCB_WAKE_FORCE 2
#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
/* For jiffies_till_first_fqs and */
@@ -477,7 +456,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
/* Forward declarations for rcutree_plugin.h */
static void rcu_bootup_announce(void);
-static void rcu_preempt_note_context_switch(void);
+static void rcu_preempt_note_context_switch(bool preempt);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
@@ -529,15 +508,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp);
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
static bool init_nocb_callback_list(struct rcu_data *rdp);
-static void rcu_sysidle_enter(int irq);
-static void rcu_sysidle_exit(int irq);
-static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
- unsigned long *maxj);
-static bool is_sysidle_rcu_state(struct rcu_state *rsp);
-static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
- unsigned long maxj);
static void rcu_bind_gp_kthread(void);
-static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
static void rcu_dynticks_task_enter(void);
static void rcu_dynticks_task_exit(void);
@@ -551,75 +522,3 @@ void srcu_offline_cpu(unsigned int cpu) { }
#endif /* #else #ifdef CONFIG_SRCU */
#endif /* #ifndef RCU_TREE_NONCORE */
-
-#ifdef CONFIG_RCU_TRACE
-/* Read out queue lengths for tracing. */
-static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
-{
-#ifdef CONFIG_RCU_NOCB_CPU
- *ql = atomic_long_read(&rdp->nocb_q_count);
- *qll = atomic_long_read(&rdp->nocb_q_count_lazy);
-#else /* #ifdef CONFIG_RCU_NOCB_CPU */
- *ql = 0;
- *qll = 0;
-#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
-}
-#endif /* #ifdef CONFIG_RCU_TRACE */
-
-/*
- * Wrappers for the rcu_node::lock acquire and release.
- *
- * Because the rcu_nodes form a tree, the tree traversal locking will observe
- * different lock values, this in turn means that an UNLOCK of one level
- * followed by a LOCK of another level does not imply a full memory barrier;
- * and most importantly transitivity is lost.
- *
- * In order to restore full ordering between tree levels, augment the regular
- * lock acquire functions with smp_mb__after_unlock_lock().
- *
- * As ->lock of struct rcu_node is a __private field, therefore one should use
- * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock.
- */
-static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp)
-{
- raw_spin_lock(&ACCESS_PRIVATE(rnp, lock));
- smp_mb__after_unlock_lock();
-}
-
-static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp)
-{
- raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock));
-}
-
-static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp)
-{
- raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock));
- smp_mb__after_unlock_lock();
-}
-
-static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp)
-{
- raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock));
-}
-
-#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \
-do { \
- typecheck(unsigned long, flags); \
- raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \
- smp_mb__after_unlock_lock(); \
-} while (0)
-
-#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \
-do { \
- typecheck(unsigned long, flags); \
- raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \
-} while (0)
-
-static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp)
-{
- bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock));
-
- if (locked)
- smp_mb__after_unlock_lock();
- return locked;
-}
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index e513b4ab1197..dd21ca47e4b4 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -147,7 +147,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
*
* Caller must hold the rcu_state's exp_mutex.
*/
-static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
+static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
{
return rnp->exp_tasks == NULL &&
READ_ONCE(rnp->expmask) == 0;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c9a48657512a..908b309d60d7 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -70,7 +70,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
static void __init rcu_bootup_announce_oddness(void)
{
if (IS_ENABLED(CONFIG_RCU_TRACE))
- pr_info("\tRCU debugfs-based tracing is enabled.\n");
+ pr_info("\tRCU event tracing is enabled.\n");
if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
(!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
@@ -90,8 +90,32 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
if (nr_cpu_ids != NR_CPUS)
pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
- if (IS_ENABLED(CONFIG_RCU_BOOST))
- pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
+#ifdef CONFIG_RCU_BOOST
+ pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY);
+#endif
+ if (blimit != DEFAULT_RCU_BLIMIT)
+ pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit);
+ if (qhimark != DEFAULT_RCU_QHIMARK)
+ pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark);
+ if (qlowmark != DEFAULT_RCU_QLOMARK)
+ pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark);
+ if (jiffies_till_first_fqs != ULONG_MAX)
+ pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs);
+ if (jiffies_till_next_fqs != ULONG_MAX)
+ pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs);
+ if (rcu_kick_kthreads)
+ pr_info("\tKick kthreads if too-long grace period.\n");
+ if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
+ pr_info("\tRCU callback double-/use-after-free debug enabled.\n");
+ if (gp_preinit_delay)
+ pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
+ if (gp_init_delay)
+ pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
+ if (gp_cleanup_delay)
+ pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay);
+ if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
+ pr_info("\tRCU debug extended QS entry/exit.\n");
+ rcupdate_announce_bootup_oddness();
}
#ifdef CONFIG_PREEMPT_RCU
@@ -155,6 +179,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
(rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
struct task_struct *t = current;
+ lockdep_assert_held(&rnp->lock);
+
/*
* Decide where to queue the newly blocked task. In theory,
* this could be an if-statement. In practice, when I tried
@@ -263,6 +289,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
*/
static void rcu_preempt_qs(void)
{
+ RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n");
if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) {
trace_rcu_grace_period(TPS("rcu_preempt"),
__this_cpu_read(rcu_data_p->gpnum),
@@ -286,12 +313,14 @@ static void rcu_preempt_qs(void)
*
* Caller must disable interrupts.
*/
-static void rcu_preempt_note_context_switch(void)
+static void rcu_preempt_note_context_switch(bool preempt)
{
struct task_struct *t = current;
struct rcu_data *rdp;
struct rcu_node *rnp;
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n");
+ WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
if (t->rcu_read_lock_nesting > 0 &&
!t->rcu_read_unlock_special.b.blocked) {
@@ -607,6 +636,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
*/
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
+ RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
if (rcu_preempt_has_tasks(rnp))
rnp->gp_tasks = rnp->blkd_tasks.next;
@@ -643,8 +673,37 @@ static void rcu_preempt_do_callbacks(void)
#endif /* #ifdef CONFIG_RCU_BOOST */
-/*
- * Queue a preemptible-RCU callback for invocation after a grace period.
+/**
+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed. However, the callback function
+ * might well execute concurrently with RCU read-side critical sections
+ * that started after call_rcu() was invoked. RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing RCU read-side critical section. On systems with more
+ * than one CPU, this means that when "func()" is invoked, each CPU is
+ * guaranteed to have executed a full memory barrier since the end of its
+ * last RCU read-side critical section whose beginning preceded the call
+ * to call_rcu(). It also means that each CPU executing an RCU read-side
+ * critical section that continues beyond the start of "func()" must have
+ * executed a memory barrier after the call_rcu() but before the beginning
+ * of that RCU read-side critical section. Note that these guarantees
+ * include CPUs that are offline, idle, or executing in user mode, as
+ * well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
+ * resulting RCU callback function "func()", then both CPU A and CPU B are
+ * guaranteed to execute a full memory barrier during the time interval
+ * between the call to call_rcu() and the invocation of "func()" -- even
+ * if CPU A and CPU B are the same CPU (but again only if the system has
+ * more than one CPU).
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
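
The canonical use of the API documented above is retiring an element of an RCU-protected list; the types and helpers below are illustrative only. Readers traverse under rcu_read_lock(), and the memory is reclaimed only after all pre-existing readers are done:

struct entry {
	struct list_head node;
	struct rcu_head rh;
	int key;
};

static LIST_HEAD(entries);	/* illustrative list, updates under a lock */

static void entry_reclaim(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct entry, rh));
}

static void entry_remove(struct entry *e)
{
	list_del_rcu(&e->node);			/* readers may still hold a reference */
	call_rcu(&e->rh, entry_reclaim);	/* freed once they have all finished */
}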
@@ -663,8 +722,13 @@ EXPORT_SYMBOL_GPL(call_rcu);
* synchronize_rcu() was waiting. RCU read-side critical sections are
* delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
*
- * See the description of synchronize_sched() for more detailed information
- * on memory ordering guarantees.
+ * See the description of synchronize_sched() for more detailed
+ * information on memory-ordering guarantees. However, please note
+ * that -only- the memory-ordering guarantees apply. For example,
+ * synchronize_rcu() is -not- guaranteed to wait on things like code
+ * protected by preempt_disable(), instead, synchronize_rcu() is -only-
+ * guaranteed to wait on RCU read-side critical sections, that is, sections
+ * of code protected by rcu_read_lock().
*/
void synchronize_rcu(void)
{
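
The caveat above, that only rcu_read_lock() sections are waited for, matters most to synchronous updaters such as the illustrative one below, which must not assume that preempt_disable() regions are also covered on preemptible kernels:

struct config {
	int threshold;
};

static struct config __rcu *cur_cfg;	/* illustrative global */

static void config_update(struct config *newc)	/* updates serialized by caller */
{
	struct config *old = rcu_dereference_protected(cur_cfg, 1);

	rcu_assign_pointer(cur_cfg, newc);
	synchronize_rcu();	/* waits only for rcu_read_lock() readers */
	kfree(old);
}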
@@ -738,7 +802,7 @@ static void __init rcu_bootup_announce(void)
* Because preemptible RCU does not exist, we never have to check for
* CPUs being in quiescent states.
*/
-static void rcu_preempt_note_context_switch(void)
+static void rcu_preempt_note_context_switch(bool preempt)
{
}
@@ -835,33 +899,6 @@ void exit_rcu(void)
#include "../locking/rtmutex_common.h"
-#ifdef CONFIG_RCU_TRACE
-
-static void rcu_initiate_boost_trace(struct rcu_node *rnp)
-{
- if (!rcu_preempt_has_tasks(rnp))
- rnp->n_balk_blkd_tasks++;
- else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
- rnp->n_balk_exp_gp_tasks++;
- else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
- rnp->n_balk_boost_tasks++;
- else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
- rnp->n_balk_notblocked++;
- else if (rnp->gp_tasks != NULL &&
- ULONG_CMP_LT(jiffies, rnp->boost_time))
- rnp->n_balk_notyet++;
- else
- rnp->n_balk_nos++;
-}
-
-#else /* #ifdef CONFIG_RCU_TRACE */
-
-static void rcu_initiate_boost_trace(struct rcu_node *rnp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_TRACE */
-
static void rcu_wake_cond(struct task_struct *t, int status)
{
/*
@@ -992,8 +1029,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
{
struct task_struct *t;
+ lockdep_assert_held(&rnp->lock);
if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
- rnp->n_balk_exp_gp_tasks++;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
@@ -1009,7 +1046,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
if (t)
rcu_wake_cond(t, rnp->boost_kthread_status);
} else {
- rcu_initiate_boost_trace(rnp);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -1260,8 +1296,7 @@ static void rcu_prepare_kthreads(int cpu)
int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
*nextevt = KTIME_MAX;
- return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
- ? 0 : rcu_cpu_has_callbacks(NULL);
+ return rcu_cpu_has_callbacks(NULL);
}
/*
@@ -1372,10 +1407,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
unsigned long dj;
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) {
- *nextevt = KTIME_MAX;
- return 0;
- }
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!");
/* Snapshot to detect later posting of non-lazy callback. */
rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
@@ -1424,8 +1456,8 @@ static void rcu_prepare_for_idle(void)
struct rcu_state *rsp;
int tne;
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
- rcu_is_nocb_cpu(smp_processor_id()))
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!");
+ if (rcu_is_nocb_cpu(smp_processor_id()))
return;
/* Handle nohz enablement switches conservatively. */
@@ -1479,8 +1511,8 @@ static void rcu_prepare_for_idle(void)
*/
static void rcu_cleanup_after_idle(void)
{
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
- rcu_is_nocb_cpu(smp_processor_id()))
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!");
+ if (rcu_is_nocb_cpu(smp_processor_id()))
return;
if (rcu_try_advance_all_cbs())
invoke_rcu_core();
@@ -1747,7 +1779,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
init_swait_queue_head(&rnp->nocb_gp_wq[1]);
}
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
/* Is the specified CPU a no-CBs CPU? */
bool rcu_is_nocb_cpu(int cpu)
{
@@ -1755,7 +1786,6 @@ bool rcu_is_nocb_cpu(int cpu)
return cpumask_test_cpu(cpu, rcu_nocb_mask);
return false;
}
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
/*
* Kick the leader kthread for this NOCB group.
@@ -1769,6 +1799,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
/* Prior smp_mb__after_atomic() orders against prior enqueue. */
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
+ smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
swake_up(&rdp_leader->nocb_wq);
}
}
@@ -1860,7 +1891,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE);
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE);
/* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
@@ -1874,7 +1905,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeOvf"));
} else {
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE);
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE);
/* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
@@ -2023,6 +2054,7 @@ wait_again:
* nocb_gp_head, where they await a grace period.
*/
gotcbs = false;
+ smp_mb(); /* wakeup before ->nocb_head reads. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
if (!rdp->nocb_gp_head)
@@ -2201,8 +2233,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
if (!rcu_nocb_need_deferred_wakeup(rdp))
return;
ndw = READ_ONCE(rdp->nocb_defer_wakeup);
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT);
- wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
}
@@ -2212,10 +2244,6 @@ void __init rcu_init_nohz(void)
bool need_rcu_nocb_mask = true;
struct rcu_state *rsp;
-#ifdef CONFIG_RCU_NOCB_CPU_NONE
- need_rcu_nocb_mask = false;
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
-
#if defined(CONFIG_NO_HZ_FULL)
if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
need_rcu_nocb_mask = true;
@@ -2231,14 +2259,6 @@ void __init rcu_init_nohz(void)
if (!have_rcu_nocb_mask)
return;
-#ifdef CONFIG_RCU_NOCB_CPU_ZERO
- pr_info("\tOffload RCU callbacks from CPU 0\n");
- cpumask_set_cpu(0, rcu_nocb_mask);
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
-#ifdef CONFIG_RCU_NOCB_CPU_ALL
- pr_info("\tOffload RCU callbacks from all CPUs\n");
- cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
#if defined(CONFIG_NO_HZ_FULL)
if (tick_nohz_full_running)
cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
@@ -2491,421 +2511,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
#endif /* #ifdef CONFIG_NO_HZ_FULL */
}
-
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-
-static int full_sysidle_state; /* Current system-idle state. */
-#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
-#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
-#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */
-#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */
-#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */
-
-/*
- * Invoked to note exit from irq or task transition to idle. Note that
- * usermode execution does -not- count as idle here! After all, we want
- * to detect full-system idle states, not RCU quiescent states and grace
- * periods. The caller must have disabled interrupts.
- */
-static void rcu_sysidle_enter(int irq)
-{
- unsigned long j;
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
-
- /* If there are no nohz_full= CPUs, no need to track this. */
- if (!tick_nohz_full_enabled())
- return;
-
- /* Adjust nesting, check for fully idle. */
- if (irq) {
- rdtp->dynticks_idle_nesting--;
- WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
- if (rdtp->dynticks_idle_nesting != 0)
- return; /* Still not fully idle. */
- } else {
- if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
- DYNTICK_TASK_NEST_VALUE) {
- rdtp->dynticks_idle_nesting = 0;
- } else {
- rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
- WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
- return; /* Still not fully idle. */
- }
- }
-
- /* Record start of fully idle period. */
- j = jiffies;
- WRITE_ONCE(rdtp->dynticks_idle_jiffies, j);
- smp_mb__before_atomic();
- atomic_inc(&rdtp->dynticks_idle);
- smp_mb__after_atomic();
- WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
-}
-
-/*
- * Unconditionally force exit from full system-idle state. This is
- * invoked when a normal CPU exits idle, but must be called separately
- * for the timekeeping CPU (tick_do_timer_cpu). The reason for this
- * is that the timekeeping CPU is permitted to take scheduling-clock
- * interrupts while the system is in system-idle state, and of course
- * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
- * interrupt from any other type of interrupt.
- */
-void rcu_sysidle_force_exit(void)
-{
- int oldstate = READ_ONCE(full_sysidle_state);
- int newoldstate;
-
- /*
- * Each pass through the following loop attempts to exit full
- * system-idle state. If contention proves to be a problem,
- * a trylock-based contention tree could be used here.
- */
- while (oldstate > RCU_SYSIDLE_SHORT) {
- newoldstate = cmpxchg(&full_sysidle_state,
- oldstate, RCU_SYSIDLE_NOT);
- if (oldstate == newoldstate &&
- oldstate == RCU_SYSIDLE_FULL_NOTED) {
- rcu_kick_nohz_cpu(tick_do_timer_cpu);
- return; /* We cleared it, done! */
- }
- oldstate = newoldstate;
- }
- smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
-}
-
-/*
- * Invoked to note entry to irq or task transition from idle. Note that
- * usermode execution does -not- count as idle here! The caller must
- * have disabled interrupts.
- */
-static void rcu_sysidle_exit(int irq)
-{
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
-
- /* If there are no nohz_full= CPUs, no need to track this. */
- if (!tick_nohz_full_enabled())
- return;
-
- /* Adjust nesting, check for already non-idle. */
- if (irq) {
- rdtp->dynticks_idle_nesting++;
- WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
- if (rdtp->dynticks_idle_nesting != 1)
- return; /* Already non-idle. */
- } else {
- /*
- * Allow for irq misnesting. Yes, it really is possible
- * to enter an irq handler then never leave it, and maybe
- * also vice versa. Handle both possibilities.
- */
- if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
- rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
- WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
- return; /* Already non-idle. */
- } else {
- rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
- }
- }
-
- /* Record end of idle period. */
- smp_mb__before_atomic();
- atomic_inc(&rdtp->dynticks_idle);
- smp_mb__after_atomic();
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
-
- /*
- * If we are the timekeeping CPU, we are permitted to be non-idle
- * during a system-idle state. This must be the case, because
- * the timekeeping CPU has to take scheduling-clock interrupts
- * during the time that the system is transitioning to full
- * system-idle state. This means that the timekeeping CPU must
- * invoke rcu_sysidle_force_exit() directly if it does anything
- * more than take a scheduling-clock interrupt.
- */
- if (smp_processor_id() == tick_do_timer_cpu)
- return;
-
- /* Update system-idle state: We are clearly no longer fully idle! */
- rcu_sysidle_force_exit();
-}
-
-/*
- * Check to see if the current CPU is idle. Note that usermode execution
- * does not count as idle. The caller must have disabled interrupts,
- * and must be running on tick_do_timer_cpu.
- */
-static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
- unsigned long *maxj)
-{
- int cur;
- unsigned long j;
- struct rcu_dynticks *rdtp = rdp->dynticks;
-
- /* If there are no nohz_full= CPUs, don't check system-wide idleness. */
- if (!tick_nohz_full_enabled())
- return;
-
- /*
- * If some other CPU has already reported non-idle, if this is
- * not the flavor of RCU that tracks sysidle state, or if this
- * is an offline or the timekeeping CPU, nothing to do.
- */
- if (!*isidle || rdp->rsp != rcu_state_p ||
- cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
- return;
- /* Verify affinity of current kthread. */
- WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
-
- /* Pick up current idle and NMI-nesting counter and check. */
- cur = atomic_read(&rdtp->dynticks_idle);
- if (cur & 0x1) {
- *isidle = false; /* We are not idle! */
- return;
- }
- smp_mb(); /* Read counters before timestamps. */
-
- /* Pick up timestamps. */
- j = READ_ONCE(rdtp->dynticks_idle_jiffies);
- /* If this CPU entered idle more recently, update maxj timestamp. */
- if (ULONG_CMP_LT(*maxj, j))
- *maxj = j;
-}
-
-/*
- * Is this the flavor of RCU that is handling full-system idle?
- */
-static bool is_sysidle_rcu_state(struct rcu_state *rsp)
-{
- return rsp == rcu_state_p;
-}
-
-/*
- * Return a delay in jiffies based on the number of CPUs, rcu_node
- * leaf fanout, and jiffies tick rate. The idea is to allow larger
- * systems more time to transition to full-idle state in order to
- * avoid the cache thrashing that otherwise occur on the state variable.
- * Really small systems (less than a couple of tens of CPUs) should
- * instead use a single global atomically incremented counter, and later
- * versions of this will automatically reconfigure themselves accordingly.
- */
-static unsigned long rcu_sysidle_delay(void)
-{
- if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
- return 0;
- return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
-}
-
-/*
- * Advance the full-system-idle state. This is invoked when all of
- * the non-timekeeping CPUs are idle.
- */
-static void rcu_sysidle(unsigned long j)
-{
- /* Check the current state. */
- switch (READ_ONCE(full_sysidle_state)) {
- case RCU_SYSIDLE_NOT:
-
- /* First time all are idle, so note a short idle period. */
- WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT);
- break;
-
- case RCU_SYSIDLE_SHORT:
-
- /*
- * Idle for a bit, time to advance to next state?
- * cmpxchg failure means race with non-idle, let them win.
- */
- if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
- (void)cmpxchg(&full_sysidle_state,
- RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
- break;
-
- case RCU_SYSIDLE_LONG:
-
- /*
- * Do an additional check pass before advancing to full.
- * cmpxchg failure means race with non-idle, let them win.
- */
- if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
- (void)cmpxchg(&full_sysidle_state,
- RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
- break;
-
- default:
- break;
- }
-}
-
-/*
- * Found a non-idle non-timekeeping CPU, so kick the system-idle state
- * back to the beginning.
- */
-static void rcu_sysidle_cancel(void)
-{
- smp_mb();
- if (full_sysidle_state > RCU_SYSIDLE_SHORT)
- WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT);
-}
-
-/*
- * Update the sysidle state based on the results of a force-quiescent-state
- * scan of the CPUs' dyntick-idle state.
- */
-static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
- unsigned long maxj, bool gpkt)
-{
- if (rsp != rcu_state_p)
- return; /* Wrong flavor, ignore. */
- if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
- return; /* Running state machine from timekeeping CPU. */
- if (isidle)
- rcu_sysidle(maxj); /* More idle! */
- else
- rcu_sysidle_cancel(); /* Idle is over. */
-}
-
-/*
- * Wrapper for rcu_sysidle_report() when called from the grace-period
- * kthread's context.
- */
-static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
- unsigned long maxj)
-{
- /* If there are no nohz_full= CPUs, no need to track this. */
- if (!tick_nohz_full_enabled())
- return;
-
- rcu_sysidle_report(rsp, isidle, maxj, true);
-}
-
-/* Callback and function for forcing an RCU grace period. */
-struct rcu_sysidle_head {
- struct rcu_head rh;
- int inuse;
-};
-
-static void rcu_sysidle_cb(struct rcu_head *rhp)
-{
- struct rcu_sysidle_head *rshp;
-
- /*
- * The following memory barrier is needed to replace the
- * memory barriers that would normally be in the memory
- * allocator.
- */
- smp_mb(); /* grace period precedes setting inuse. */
-
- rshp = container_of(rhp, struct rcu_sysidle_head, rh);
- WRITE_ONCE(rshp->inuse, 0);
-}
-
-/*
- * Check to see if the system is fully idle, other than the timekeeping CPU.
- * The caller must have disabled interrupts. This is not intended to be
- * called unless tick_nohz_full_enabled().
- */
-bool rcu_sys_is_idle(void)
-{
- static struct rcu_sysidle_head rsh;
- int rss = READ_ONCE(full_sysidle_state);
-
- if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
- return false;
-
- /* Handle small-system case by doing a full scan of CPUs. */
- if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
- int oldrss = rss - 1;
-
- /*
- * One pass to advance to each state up to _FULL.
- * Give up if any pass fails to advance the state.
- */
- while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
- int cpu;
- bool isidle = true;
- unsigned long maxj = jiffies - ULONG_MAX / 4;
- struct rcu_data *rdp;
-
- /* Scan all the CPUs looking for nonidle CPUs. */
- for_each_possible_cpu(cpu) {
- rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
- rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
- if (!isidle)
- break;
- }
- rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
- oldrss = rss;
- rss = READ_ONCE(full_sysidle_state);
- }
- }
-
- /* If this is the first observation of an idle period, record it. */
- if (rss == RCU_SYSIDLE_FULL) {
- rss = cmpxchg(&full_sysidle_state,
- RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
- return rss == RCU_SYSIDLE_FULL;
- }
-
- smp_mb(); /* ensure rss load happens before later caller actions. */
-
- /* If already fully idle, tell the caller (in case of races). */
- if (rss == RCU_SYSIDLE_FULL_NOTED)
- return true;
-
- /*
- * If we aren't there yet, and a grace period is not in flight,
- * initiate a grace period. Either way, tell the caller that
- * we are not there yet. We use an xchg() rather than an assignment
- * to make up for the memory barriers that would otherwise be
- * provided by the memory allocator.
- */
- if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
- !rcu_gp_in_progress(rcu_state_p) &&
- !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
- call_rcu(&rsh.rh, rcu_sysidle_cb);
- return false;
-}
-
-/*
- * Initialize dynticks sysidle state for CPUs coming online.
- */
-static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
-{
- rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
-}
-
-#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-
-static void rcu_sysidle_enter(int irq)
-{
-}
-
-static void rcu_sysidle_exit(int irq)
-{
-}
-
-static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
- unsigned long *maxj)
-{
-}
-
-static bool is_sysidle_rcu_state(struct rcu_state *rsp)
-{
- return false;
-}
-
-static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
- unsigned long maxj)
-{
-}
-
-static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-
/*
* Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
* grace-period kthread will do force_quiescent_state() processing?
@@ -2936,13 +2541,7 @@ static void rcu_bind_gp_kthread(void)
if (!tick_nohz_full_enabled())
return;
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
- cpu = tick_do_timer_cpu;
- if (cpu >= 0 && cpu < nr_cpu_ids)
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
-#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
housekeeping_affine(current);
-#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
}
/* Record the current task on dyntick-idle entry. */
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
deleted file mode 100644
index 6cea17a1ea30..000000000000
--- a/kernel/rcu/tree_trace.c
+++ /dev/null
@@ -1,494 +0,0 @@
-/*
- * Read-Copy Update tracing for hierarchical implementation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- * Copyright IBM Corporation, 2008
- * Author: Paul E. McKenney
- *
- * Papers: http://www.rdrop.com/users/paulmck/RCU
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- * Documentation/RCU
- *
- */
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/rcupdate.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <linux/atomic.h>
-#include <linux/bitops.h>
-#include <linux/completion.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/mutex.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-#include <linux/prefetch.h>
-
-#define RCU_TREE_NONCORE
-#include "tree.h"
-#include "rcu.h"
-
-static int r_open(struct inode *inode, struct file *file,
- const struct seq_operations *op)
-{
- int ret = seq_open(file, op);
- if (!ret) {
- struct seq_file *m = (struct seq_file *)file->private_data;
- m->private = inode->i_private;
- }
- return ret;
-}
-
-static void *r_start(struct seq_file *m, loff_t *pos)
-{
- struct rcu_state *rsp = (struct rcu_state *)m->private;
- *pos = cpumask_next(*pos - 1, cpu_possible_mask);
- if ((*pos) < nr_cpu_ids)
- return per_cpu_ptr(rsp->rda, *pos);
- return NULL;
-}
-
-static void *r_next(struct seq_file *m, void *v, loff_t *pos)
-{
- (*pos)++;
- return r_start(m, pos);
-}
-
-static void r_stop(struct seq_file *m, void *v)
-{
-}
-
-static int show_rcubarrier(struct seq_file *m, void *v)
-{
- struct rcu_state *rsp = (struct rcu_state *)m->private;
- seq_printf(m, "bcc: %d bseq: %lu\n",
- atomic_read(&rsp->barrier_cpu_count),
- rsp->barrier_sequence);
- return 0;
-}
-
-static int rcubarrier_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcubarrier, inode->i_private);
-}
-
-static const struct file_operations rcubarrier_fops = {
- .owner = THIS_MODULE,
- .open = rcubarrier_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-#ifdef CONFIG_RCU_BOOST
-
-static char convert_kthread_status(unsigned int kthread_status)
-{
- if (kthread_status > RCU_KTHREAD_MAX)
- return '?';
- return "SRWOY"[kthread_status];
-}
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
-{
- long ql, qll;
-
- if (!rdp->beenonline)
- return;
- seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d",
- rdp->cpu,
- cpu_is_offline(rdp->cpu) ? '!' : ' ',
- ulong2long(rdp->completed), ulong2long(rdp->gpnum),
- rdp->cpu_no_qs.b.norm,
- rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu),
- rdp->core_needs_qs);
- seq_printf(m, " dt=%d/%llx/%d df=%lu",
- rcu_dynticks_snap(rdp->dynticks),
- rdp->dynticks->dynticks_nesting,
- rdp->dynticks->dynticks_nmi_nesting,
- rdp->dynticks_fqs);
- seq_printf(m, " of=%lu", rdp->offline_fqs);
- rcu_nocb_q_lengths(rdp, &ql, &qll);
- qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist);
- ql += rcu_segcblist_n_cbs(&rdp->cblist);
- seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
- qll, ql,
- ".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)],
- ".R"[!rcu_segcblist_segempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL)],
- ".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)],
- ".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]);
-#ifdef CONFIG_RCU_BOOST
- seq_printf(m, " kt=%d/%c ktl=%x",
- per_cpu(rcu_cpu_has_work, rdp->cpu),
- convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
- rdp->cpu)),
- per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
-#endif /* #ifdef CONFIG_RCU_BOOST */
- seq_printf(m, " b=%ld", rdp->blimit);
- seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n",
- rdp->n_cbs_invoked, rdp->n_nocbs_invoked,
- rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
-}
-
-static int show_rcudata(struct seq_file *m, void *v)
-{
- print_one_rcu_data(m, (struct rcu_data *)v);
- return 0;
-}
-
-static const struct seq_operations rcudate_op = {
- .start = r_start,
- .next = r_next,
- .stop = r_stop,
- .show = show_rcudata,
-};
-
-static int rcudata_open(struct inode *inode, struct file *file)
-{
- return r_open(inode, file, &rcudate_op);
-}
-
-static const struct file_operations rcudata_fops = {
- .owner = THIS_MODULE,
- .open = rcudata_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = seq_release,
-};
-
-static int show_rcuexp(struct seq_file *m, void *v)
-{
- int cpu;
- struct rcu_state *rsp = (struct rcu_state *)m->private;
- struct rcu_data *rdp;
- unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0;
-
- for_each_possible_cpu(cpu) {
- rdp = per_cpu_ptr(rsp->rda, cpu);
- s0 += atomic_long_read(&rdp->exp_workdone0);
- s1 += atomic_long_read(&rdp->exp_workdone1);
- s2 += atomic_long_read(&rdp->exp_workdone2);
- s3 += atomic_long_read(&rdp->exp_workdone3);
- }
- seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu enq=%d sc=%lu\n",
- rsp->expedited_sequence, s0, s1, s2, s3,
- atomic_read(&rsp->expedited_need_qs),
- rsp->expedited_sequence / 2);
- return 0;
-}
-
-static int rcuexp_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcuexp, inode->i_private);
-}
-
-static const struct file_operations rcuexp_fops = {
- .owner = THIS_MODULE,
- .open = rcuexp_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-#ifdef CONFIG_RCU_BOOST
-
-static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
-{
- seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ",
- rnp->grplo, rnp->grphi,
- "T."[list_empty(&rnp->blkd_tasks)],
- "N."[!rnp->gp_tasks],
- "E."[!rnp->exp_tasks],
- "B."[!rnp->boost_tasks],
- convert_kthread_status(rnp->boost_kthread_status),
- rnp->n_tasks_boosted, rnp->n_exp_boosts,
- rnp->n_normal_boosts);
- seq_printf(m, "j=%04x bt=%04x\n",
- (int)(jiffies & 0xffff),
- (int)(rnp->boost_time & 0xffff));
- seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
- rnp->n_balk_blkd_tasks,
- rnp->n_balk_exp_gp_tasks,
- rnp->n_balk_boost_tasks,
- rnp->n_balk_notblocked,
- rnp->n_balk_notyet,
- rnp->n_balk_nos);
-}
-
-static int show_rcu_node_boost(struct seq_file *m, void *unused)
-{
- struct rcu_node *rnp;
-
- rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
- print_one_rcu_node_boost(m, rnp);
- return 0;
-}
-
-static int rcu_node_boost_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcu_node_boost, NULL);
-}
-
-static const struct file_operations rcu_node_boost_fops = {
- .owner = THIS_MODULE,
- .open = rcu_node_boost_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
-{
- unsigned long gpnum;
- int level = 0;
- struct rcu_node *rnp;
-
- gpnum = rsp->gpnum;
- seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
- ulong2long(rsp->completed), ulong2long(gpnum),
- rsp->gp_state,
- (long)(rsp->jiffies_force_qs - jiffies),
- (int)(jiffies & 0xffff));
- seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
- rsp->n_force_qs, rsp->n_force_qs_ngp,
- rsp->n_force_qs - rsp->n_force_qs_ngp,
- READ_ONCE(rsp->n_force_qs_lh),
- rsp->orphan_done.len_lazy,
- rsp->orphan_done.len);
- for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
- if (rnp->level != level) {
- seq_puts(m, "\n");
- level = rnp->level;
- }
- seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ",
- rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext,
- ".G"[rnp->gp_tasks != NULL],
- ".E"[rnp->exp_tasks != NULL],
- ".T"[!list_empty(&rnp->blkd_tasks)],
- rnp->grplo, rnp->grphi, rnp->grpnum);
- }
- seq_puts(m, "\n");
-}
-
-static int show_rcuhier(struct seq_file *m, void *v)
-{
- struct rcu_state *rsp = (struct rcu_state *)m->private;
- print_one_rcu_state(m, rsp);
- return 0;
-}
-
-static int rcuhier_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcuhier, inode->i_private);
-}
-
-static const struct file_operations rcuhier_fops = {
- .owner = THIS_MODULE,
- .open = rcuhier_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
-{
- unsigned long flags;
- unsigned long completed;
- unsigned long gpnum;
- unsigned long gpage;
- unsigned long gpmax;
- struct rcu_node *rnp = &rsp->node[0];
-
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- completed = READ_ONCE(rsp->completed);
- gpnum = READ_ONCE(rsp->gpnum);
- if (completed == gpnum)
- gpage = 0;
- else
- gpage = jiffies - rsp->gp_start;
- gpmax = rsp->gp_max;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n",
- ulong2long(completed), ulong2long(gpnum), gpage, gpmax);
-}
-
-static int show_rcugp(struct seq_file *m, void *v)
-{
- struct rcu_state *rsp = (struct rcu_state *)m->private;
- show_one_rcugp(m, rsp);
- return 0;
-}
-
-static int rcugp_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcugp, inode->i_private);
-}
-
-static const struct file_operations rcugp_fops = {
- .owner = THIS_MODULE,
- .open = rcugp_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
-{
- if (!rdp->beenonline)
- return;
- seq_printf(m, "%3d%cnp=%ld ",
- rdp->cpu,
- cpu_is_offline(rdp->cpu) ? '!' : ' ',
- rdp->n_rcu_pending);
- seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
- rdp->n_rp_core_needs_qs,
- rdp->n_rp_report_qs,
- rdp->n_rp_cb_ready,
- rdp->n_rp_cpu_needs_gp);
- seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw%ld\n",
- rdp->n_rp_gp_completed,
- rdp->n_rp_gp_started,
- rdp->n_rp_nocb_defer_wakeup,
- rdp->n_rp_need_nothing);
-}
-
-static int show_rcu_pending(struct seq_file *m, void *v)
-{
- print_one_rcu_pending(m, (struct rcu_data *)v);
- return 0;
-}
-
-static const struct seq_operations rcu_pending_op = {
- .start = r_start,
- .next = r_next,
- .stop = r_stop,
- .show = show_rcu_pending,
-};
-
-static int rcu_pending_open(struct inode *inode, struct file *file)
-{
- return r_open(inode, file, &rcu_pending_op);
-}
-
-static const struct file_operations rcu_pending_fops = {
- .owner = THIS_MODULE,
- .open = rcu_pending_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = seq_release,
-};
-
-static int show_rcutorture(struct seq_file *m, void *unused)
-{
- seq_printf(m, "rcutorture test sequence: %lu %s\n",
- rcutorture_testseq >> 1,
- (rcutorture_testseq & 0x1) ? "(test in progress)" : "");
- seq_printf(m, "rcutorture update version number: %lu\n",
- rcutorture_vernum);
- return 0;
-}
-
-static int rcutorture_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcutorture, NULL);
-}
-
-static const struct file_operations rcutorture_fops = {
- .owner = THIS_MODULE,
- .open = rcutorture_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static struct dentry *rcudir;
-
-static int __init rcutree_trace_init(void)
-{
- struct rcu_state *rsp;
- struct dentry *retval;
- struct dentry *rspdir;
-
- rcudir = debugfs_create_dir("rcu", NULL);
- if (!rcudir)
- goto free_out;
-
- for_each_rcu_flavor(rsp) {
- rspdir = debugfs_create_dir(rsp->name, rcudir);
- if (!rspdir)
- goto free_out;
-
- retval = debugfs_create_file("rcudata", 0444,
- rspdir, rsp, &rcudata_fops);
- if (!retval)
- goto free_out;
-
- retval = debugfs_create_file("rcuexp", 0444,
- rspdir, rsp, &rcuexp_fops);
- if (!retval)
- goto free_out;
-
- retval = debugfs_create_file("rcu_pending", 0444,
- rspdir, rsp, &rcu_pending_fops);
- if (!retval)
- goto free_out;
-
- retval = debugfs_create_file("rcubarrier", 0444,
- rspdir, rsp, &rcubarrier_fops);
- if (!retval)
- goto free_out;
-
-#ifdef CONFIG_RCU_BOOST
- if (rsp == &rcu_preempt_state) {
- retval = debugfs_create_file("rcuboost", 0444,
- rspdir, NULL, &rcu_node_boost_fops);
- if (!retval)
- goto free_out;
- }
-#endif
-
- retval = debugfs_create_file("rcugp", 0444,
- rspdir, rsp, &rcugp_fops);
- if (!retval)
- goto free_out;
-
- retval = debugfs_create_file("rcuhier", 0444,
- rspdir, rsp, &rcuhier_fops);
- if (!retval)
- goto free_out;
- }
-
- retval = debugfs_create_file("rcutorture", 0444, rcudir,
- NULL, &rcutorture_fops);
- if (!retval)
- goto free_out;
- return 0;
-free_out:
- debugfs_remove_recursive(rcudir);
- return 1;
-}
-device_initcall(rcutree_trace_init);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 273e869ca21d..00e77c470017 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -62,7 +62,9 @@
#define MODULE_PARAM_PREFIX "rcupdate."
#ifndef CONFIG_TINY_RCU
+extern int rcu_expedited; /* from sysctl */
module_param(rcu_expedited, int, 0);
+extern int rcu_normal; /* from sysctl */
module_param(rcu_normal, int, 0);
static int rcu_normal_after_boot;
module_param(rcu_normal_after_boot, int, 0);
@@ -379,6 +381,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
struct rcu_synchronize *rs_array)
{
int i;
+ int j;
/* Initialize and register callbacks for each flavor specified. */
for (i = 0; i < n; i++) {
@@ -390,7 +393,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
}
init_rcu_head_on_stack(&rs_array[i].head);
init_completion(&rs_array[i].completion);
- (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
+ for (j = 0; j < i; j++)
+ if (crcu_array[j] == crcu_array[i])
+ break;
+ if (j == i)
+ (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
}
/* Wait for all callbacks to be invoked. */
@@ -399,7 +406,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
(crcu_array[i] == call_rcu ||
crcu_array[i] == call_rcu_bh))
continue;
- wait_for_completion(&rs_array[i].completion);
+ for (j = 0; j < i; j++)
+ if (crcu_array[j] == crcu_array[i])
+ break;
+ if (j == i)
+ wait_for_completion(&rs_array[i].completion);
destroy_rcu_head_on_stack(&rs_array[i].head);
}
}
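
The two j-loops added above skip array slots whose callback function duplicates an earlier slot, so only one callback per distinct flavor is posted and waited on. A hedged sketch of a caller that benefits (the wrapper function is invented for illustration):

static void example_wait_for_both(void)
{
	/*
	 * On !CONFIG_PREEMPT kernels call_rcu is the same function as
	 * call_rcu_sched; with the dedup above this posts and waits for
	 * a single grace period instead of a redundant second one.
	 */
	synchronize_rcu_mult(call_rcu, call_rcu_sched);
}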
@@ -560,15 +571,30 @@ static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
DEFINE_SRCU(tasks_rcu_exit_srcu);
/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
-static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10;
+#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
+static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
module_param(rcu_task_stall_timeout, int, 0644);
static void rcu_spawn_tasks_kthread(void);
static struct task_struct *rcu_tasks_kthread_ptr;
-/*
- * Post an RCU-tasks callback. First call must be from process context
- * after the scheduler if fully operational.
+/**
+ * call_rcu_tasks() - Queue an RCU callback for invocation after a task-based grace period.
+ * @rhp: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_tasks() assumes
+ * that the read-side critical sections end at a voluntary context
+ * switch (not a preemption!), entry into idle, or transition to usermode
+ * execution. As such, there are no read-side primitives analogous to
+ * rcu_read_lock() and rcu_read_unlock() because this primitive is intended
+ * to determine that all tasks have passed through a safe state, not so
+ * much for data-structure synchronization.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
*/
void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
{
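
A sketch of the intended use of the primitive documented above, with invented names: freeing memory that a task might still be executing or sleeping in (for example a tracing trampoline) only after every task has passed through a voluntary context switch, idle, or usermode execution:

struct trampoline {
	struct rcu_head rh;
	void *text;		/* illustrative: executable stub */
};

extern void example_free_stub(void *text);	/* hypothetical helper */

static void trampoline_reclaim(struct rcu_head *rhp)
{
	struct trampoline *tp = container_of(rhp, struct trampoline, rh);

	/* No task can still be executing in tp->text at this point. */
	example_free_stub(tp->text);
	kfree(tp);
}

static void trampoline_retire(struct trampoline *tp)
{
	call_rcu_tasks(&tp->rh, trampoline_reclaim);
}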
@@ -851,6 +877,23 @@ static void rcu_spawn_tasks_kthread(void)
#endif /* #ifdef CONFIG_TASKS_RCU */
+#ifndef CONFIG_TINY_RCU
+
+/*
+ * Print any non-default Tasks RCU settings.
+ */
+static void __init rcu_tasks_bootup_oddness(void)
+{
+#ifdef CONFIG_TASKS_RCU
+ if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT)
+ pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout);
+ else
+ pr_info("\tTasks RCU enabled.\n");
+#endif /* #ifdef CONFIG_TASKS_RCU */
+}
+
+#endif /* #ifndef CONFIG_TINY_RCU */
+
#ifdef CONFIG_PROVE_RCU
/*
@@ -935,3 +978,25 @@ late_initcall(rcu_verify_early_boot_tests);
#else
void rcu_early_boot_tests(void) {}
#endif /* CONFIG_PROVE_RCU */
+
+#ifndef CONFIG_TINY_RCU
+
+/*
+ * Print any significant non-default boot-time settings.
+ */
+void __init rcupdate_announce_bootup_oddness(void)
+{
+ if (rcu_normal)
+ pr_info("\tNo expedited grace period (rcu_normal).\n");
+ else if (rcu_normal_after_boot)
+ pr_info("\tNo expedited grace period (rcu_normal_after_boot).\n");
+ else if (rcu_expedited)
+ pr_info("\tAll grace periods are expedited (rcu_expedited).\n");
+ if (rcu_cpu_stall_suppress)
+ pr_info("\tRCU CPU stall warnings suppressed (rcu_cpu_stall_suppress).\n");
+ if (rcu_cpu_stall_timeout != CONFIG_RCU_CPU_STALL_TIMEOUT)
+ pr_info("\tRCU CPU stall warnings timeout set to %d (rcu_cpu_stall_timeout).\n", rcu_cpu_stall_timeout);
+ rcu_tasks_bootup_oddness();
+}
+
+#endif /* #ifndef CONFIG_TINY_RCU */
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 89ab6758667b..53f0164ed362 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,9 +16,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o swait.o completion.o idle.o
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o
+obj-y += idle_task.o fair.o rt.o deadline.o
+obj-y += wait.o wait_bit.o swait.o completion.o idle.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 00a45c45beca..ca0f8fc945c6 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -64,6 +64,7 @@
#include <linux/workqueue.h>
#include <linux/compiler.h>
#include <linux/tick.h>
+#include <linux/init.h>
/*
* Scheduler clock - returns current time in nanosec units.
@@ -124,14 +125,27 @@ int sched_clock_stable(void)
return static_branch_likely(&__sched_clock_stable);
}
+static void __scd_stamp(struct sched_clock_data *scd)
+{
+ scd->tick_gtod = ktime_get_ns();
+ scd->tick_raw = sched_clock();
+}
+
static void __set_sched_clock_stable(void)
{
- struct sched_clock_data *scd = this_scd();
+ struct sched_clock_data *scd;
/*
+ * Since we're still unstable and the tick is already running, we have
+ * to disable IRQs in order to get a consistent scd->tick* reading.
+ */
+ local_irq_disable();
+ scd = this_scd();
+ /*
* Attempt to make the (initial) unstable->stable transition continuous.
*/
__sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);
+ local_irq_enable();
printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
scd->tick_gtod, __gtod_offset,
@@ -141,8 +155,38 @@ static void __set_sched_clock_stable(void)
tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
}
+/*
+ * If we ever get here, we're screwed, because we found out -- typically after
+ * the fact -- that TSC wasn't good. This means all our clocksources (including
+ * ktime) could have reported wrong values.
+ *
+ * What we do here is an attempt to fix up and continue sort of where we left
+ * off in a coherent manner.
+ *
+ * The only way to fully avoid random clock jumps is to boot with:
+ * "tsc=unstable".
+ */
static void __sched_clock_work(struct work_struct *work)
{
+ struct sched_clock_data *scd;
+ int cpu;
+
+ /* take a current timestamp and set 'now' */
+ preempt_disable();
+ scd = this_scd();
+ __scd_stamp(scd);
+ scd->clock = scd->tick_gtod + __gtod_offset;
+ preempt_enable();
+
+ /* clone to all CPUs */
+ for_each_possible_cpu(cpu)
+ per_cpu(sched_clock_data, cpu) = *scd;
+
+ printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n");
+ printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
+ scd->tick_gtod, __gtod_offset,
+ scd->tick_raw, __sched_clock_offset);
+
static_branch_disable(&__sched_clock_stable);
}
@@ -150,27 +194,11 @@ static DECLARE_WORK(sched_clock_work, __sched_clock_work);
static void __clear_sched_clock_stable(void)
{
- struct sched_clock_data *scd = this_scd();
-
- /*
- * Attempt to make the stable->unstable transition continuous.
- *
- * Trouble is, this is typically called from the TSC watchdog
- * timer, which is late per definition. This means the tick
- * values can already be screwy.
- *
- * Still do what we can.
- */
- __gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod);
-
- printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
- scd->tick_gtod, __gtod_offset,
- scd->tick_raw, __sched_clock_offset);
+ if (!sched_clock_stable())
+ return;
tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
-
- if (sched_clock_stable())
- schedule_work(&sched_clock_work);
+ schedule_work(&sched_clock_work);
}
void clear_sched_clock_stable(void)
@@ -183,7 +211,11 @@ void clear_sched_clock_stable(void)
__clear_sched_clock_stable();
}
-void sched_clock_init_late(void)
+/*
+ * We run this as late_initcall() such that it runs after all built-in drivers,
+ * notably: acpi_processor and intel_idle, which can mark the TSC as unstable.
+ */
+static int __init sched_clock_init_late(void)
{
sched_clock_running = 2;
/*
@@ -197,7 +229,10 @@ void sched_clock_init_late(void)
if (__sched_clock_stable_early)
__set_sched_clock_stable();
+
+ return 0;
}
+late_initcall(sched_clock_init_late);
/*
* min, max except they take wrapping into account
@@ -347,21 +382,38 @@ void sched_clock_tick(void)
{
struct sched_clock_data *scd;
+ if (sched_clock_stable())
+ return;
+
+ if (unlikely(!sched_clock_running))
+ return;
+
WARN_ON_ONCE(!irqs_disabled());
+ scd = this_scd();
+ __scd_stamp(scd);
+ sched_clock_local(scd);
+}
+
+void sched_clock_tick_stable(void)
+{
+ u64 gtod, clock;
+
+ if (!sched_clock_stable())
+ return;
+
/*
- * Update these values even if sched_clock_stable(), because it can
- * become unstable at any point in time at which point we need some
- * values to fall back on.
+ * Called under watchdog_lock.
*
- * XXX arguably we can skip this if we expose tsc_clocksource_reliable
+ * The watchdog just found this TSC to (still) be stable, so now is a
+ * good moment to update our __gtod_offset. Because once we find the
+ * TSC to be unstable, any computation will be computing crap.
*/
- scd = this_scd();
- scd->tick_raw = sched_clock();
- scd->tick_gtod = ktime_get_ns();
-
- if (!sched_clock_stable() && likely(sched_clock_running))
- sched_clock_local(scd);
+ local_irq_disable();
+ gtod = ktime_get_ns();
+ clock = sched_clock();
+ __gtod_offset = (clock + __sched_clock_offset) - gtod;
+ local_irq_enable();
}
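
As a quick illustration of the offset bookkeeping in __set_sched_clock_stable() and sched_clock_tick_stable() above, here is a minimal userspace sketch (toy nanosecond values, with plain local variables standing in for the __sched_clock_offset/__gtod_offset globals); it only mirrors the arithmetic shown in this hunk, it is not kernel code:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* readings taken at the unstable->stable transition (toy values) */
        uint64_t tick_raw  = 100;       /* sched_clock()  */
        uint64_t tick_gtod = 400;       /* ktime_get_ns() */
        uint64_t gtod_offset = 0;

        /* __set_sched_clock_stable(): make the raw clock line up with gtod */
        uint64_t sched_clock_offset = (tick_gtod + gtod_offset) - tick_raw;

        /*
         * Later, while still stable, sched_clock_tick_stable() refreshes
         * __gtod_offset from fresh readings of both clocks.
         */
        uint64_t clock = 200;           /* sched_clock() now  */
        uint64_t gtod  = 480;           /* ktime_get_ns() now */
        gtod_offset = (clock + sched_clock_offset) - gtod;

        /* both expressions now report the same time base */
        printf("stable: %llu, fallback: %llu\n",
               (unsigned long long)(clock + sched_clock_offset),
               (unsigned long long)(gtod + gtod_offset));
        return 0;
}
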
/*
@@ -374,15 +426,21 @@ void sched_clock_idle_sleep_event(void)
EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
/*
- * We just idled delta nanoseconds (called with irqs disabled):
+ * We just idled; resync with ktime.
*/
-void sched_clock_idle_wakeup_event(u64 delta_ns)
+void sched_clock_idle_wakeup_event(void)
{
- if (timekeeping_suspended)
+ unsigned long flags;
+
+ if (sched_clock_stable())
+ return;
+
+ if (unlikely(timekeeping_suspended))
return;
+ local_irq_save(flags);
sched_clock_tick();
- touch_softlockup_watchdog_sched();
+ local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 53f9558fa925..13fc5ae9bf2f 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -66,7 +66,7 @@ do_wait_for_common(struct completion *x,
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);
- __add_wait_queue_tail_exclusive(&x->wait, &wait);
+ __add_wait_queue_entry_tail_exclusive(&x->wait, &wait);
do {
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 803c3bc274c4..17c667b427b4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10,6 +10,7 @@
#include <uapi/linux/sched/types.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/hotplug.h>
+#include <linux/wait_bit.h>
#include <linux/cpuset.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
@@ -788,36 +789,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dequeue_task(rq, p, flags);
}
-void sched_set_stop_task(int cpu, struct task_struct *stop)
-{
- struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
- struct task_struct *old_stop = cpu_rq(cpu)->stop;
-
- if (stop) {
- /*
- * Make it appear like a SCHED_FIFO task, its something
- * userspace knows about and won't get confused about.
- *
- * Also, it will make PI more or less work without too
- * much confusion -- but then, stop work should not
- * rely on PI working anyway.
- */
- sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
-
- stop->sched_class = &stop_sched_class;
- }
-
- cpu_rq(cpu)->stop = stop;
-
- if (old_stop) {
- /*
- * Reset it back to a normal scheduling class so that
- * it can die in pieces.
- */
- old_stop->sched_class = &rt_sched_class;
- }
-}
-
/*
* __normal_prio - return the priority that is based on the static prio
*/
@@ -1588,6 +1559,36 @@ static void update_avg(u64 *avg, u64 sample)
*avg += diff >> 3;
}
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+ struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+ if (stop) {
+ /*
+ * Make it appear like a SCHED_FIFO task, it's something
+ * userspace knows about and won't get confused about.
+ *
+ * Also, it will make PI more or less work without too
+ * much confusion -- but then, stop work should not
+ * rely on PI working anyway.
+ */
+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+
+ stop->sched_class = &stop_sched_class;
+ }
+
+ cpu_rq(cpu)->stop = stop;
+
+ if (old_stop) {
+ /*
+ * Reset it back to a normal scheduling class so that
+ * it can die in pieces.
+ */
+ old_stop->sched_class = &rt_sched_class;
+ }
+}
+
#else
static inline int __set_cpus_allowed_ptr(struct task_struct *p,
@@ -1731,7 +1732,7 @@ void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
- struct task_struct *p;
+ struct task_struct *p, *t;
struct rq_flags rf;
if (!llist)
@@ -1740,17 +1741,8 @@ void sched_ttwu_pending(void)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- while (llist) {
- int wake_flags = 0;
-
- p = llist_entry(llist, struct task_struct, wake_entry);
- llist = llist_next(llist);
-
- if (p->sched_remote_wakeup)
- wake_flags = WF_MIGRATED;
-
- ttwu_do_activate(rq, p, wake_flags, &rf);
- }
+ llist_for_each_entry_safe(p, t, llist, wake_entry)
+ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
rq_unlock_irqrestore(rq, &rf);
}
@@ -2148,23 +2140,6 @@ int wake_up_state(struct task_struct *p, unsigned int state)
}
/*
- * This function clears the sched_dl_entity static params.
- */
-void __dl_clear_params(struct task_struct *p)
-{
- struct sched_dl_entity *dl_se = &p->dl;
-
- dl_se->dl_runtime = 0;
- dl_se->dl_deadline = 0;
- dl_se->dl_period = 0;
- dl_se->flags = 0;
- dl_se->dl_bw = 0;
-
- dl_se->dl_throttled = 0;
- dl_se->dl_yielded = 0;
-}
-
-/*
* Perform scheduler related setup for a newly forked process p.
* p is forked by current.
*
@@ -2193,6 +2168,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
RB_CLEAR_NODE(&p->dl.rb_node);
init_dl_task_timer(&p->dl);
+ init_dl_inactive_task_timer(&p->dl);
__dl_clear_params(p);
INIT_LIST_HEAD(&p->rt.run_list);
@@ -2430,7 +2406,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
- return 1ULL << 20;
+ return BW_UNIT;
/*
* Doing this here saves a lot of checks in all
@@ -2440,93 +2416,9 @@ unsigned long to_ratio(u64 period, u64 runtime)
if (period == 0)
return 0;
- return div64_u64(runtime << 20, period);
-}
-
-#ifdef CONFIG_SMP
-inline struct dl_bw *dl_bw_of(int i)
-{
- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
- "sched RCU must be held");
- return &cpu_rq(i)->rd->dl_bw;
-}
-
-static inline int dl_bw_cpus(int i)
-{
- struct root_domain *rd = cpu_rq(i)->rd;
- int cpus = 0;
-
- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
- "sched RCU must be held");
- for_each_cpu_and(i, rd->span, cpu_active_mask)
- cpus++;
-
- return cpus;
-}
-#else
-inline struct dl_bw *dl_bw_of(int i)
-{
- return &cpu_rq(i)->dl.dl_bw;
-}
-
-static inline int dl_bw_cpus(int i)
-{
- return 1;
-}
-#endif
-
-/*
- * We must be sure that accepting a new task (or allowing changing the
- * parameters of an existing one) is consistent with the bandwidth
- * constraints. If yes, this function also accordingly updates the currently
- * allocated bandwidth to reflect the new situation.
- *
- * This function is called while holding p's rq->lock.
- *
- * XXX we should delay bw change until the task's 0-lag point, see
- * __setparam_dl().
- */
-static int dl_overflow(struct task_struct *p, int policy,
- const struct sched_attr *attr)
-{
-
- struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
- u64 period = attr->sched_period ?: attr->sched_deadline;
- u64 runtime = attr->sched_runtime;
- u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
- int cpus, err = -1;
-
- /* !deadline task may carry old deadline bandwidth */
- if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
- return 0;
-
- /*
- * Either if a task, enters, leave, or stays -deadline but changes
- * its parameters, we may need to update accordingly the total
- * allocated bandwidth of the container.
- */
- raw_spin_lock(&dl_b->lock);
- cpus = dl_bw_cpus(task_cpu(p));
- if (dl_policy(policy) && !task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, 0, new_bw)) {
- __dl_add(dl_b, new_bw);
- err = 0;
- } else if (dl_policy(policy) && task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
- __dl_clear(dl_b, p->dl.dl_bw);
- __dl_add(dl_b, new_bw);
- err = 0;
- } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
- __dl_clear(dl_b, p->dl.dl_bw);
- err = 0;
- }
- raw_spin_unlock(&dl_b->lock);
-
- return err;
+ return div64_u64(runtime << BW_SHIFT, period);
}
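
Since to_ratio() now expresses bandwidth in BW_UNIT units (1 << BW_SHIFT; the shift is 20 throughout this series, as the replaced '<< 20' shows), a small standalone sketch of the fixed-point conversion may help. The function below copies only the arithmetic and leaves out the RUNTIME_INF special case:

#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT        20
#define BW_UNIT         (1ULL << BW_SHIFT)

/* same computation as to_ratio(): utilization in BW_UNIT units */
static uint64_t to_ratio_sketch(uint64_t period, uint64_t runtime)
{
        if (period == 0)
                return 0;
        return (runtime << BW_SHIFT) / period;
}

int main(void)
{
        /* 10ms of runtime every 100ms -> 0.1 of a CPU */
        uint64_t bw = to_ratio_sketch(100000000ULL, 10000000ULL);

        printf("bw = %llu (%.3f CPUs)\n",
               (unsigned long long)bw, (double)bw / BW_UNIT);
        return 0;
}
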
-extern void init_dl_bw(struct dl_bw *dl_b);
-
/*
* wake_up_new_task - wake up a newly created task for the first time.
*
@@ -3687,7 +3579,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
exception_exit(prev_state);
}
-int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
return try_to_wake_up(curr->private, mode, wake_flags);
@@ -4009,46 +3901,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
}
/*
- * This function initializes the sched_dl_entity of a newly becoming
- * SCHED_DEADLINE task.
- *
- * Only the static values are considered here, the actual runtime and the
- * absolute deadline will be properly calculated when the task is enqueued
- * for the first time with its new policy.
- */
-static void
-__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
-{
- struct sched_dl_entity *dl_se = &p->dl;
-
- dl_se->dl_runtime = attr->sched_runtime;
- dl_se->dl_deadline = attr->sched_deadline;
- dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
- dl_se->flags = attr->sched_flags;
- dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
-
- /*
- * Changing the parameters of a task is 'tricky' and we're not doing
- * the correct thing -- also see task_dead_dl() and switched_from_dl().
- *
- * What we SHOULD do is delay the bandwidth release until the 0-lag
- * point. This would include retaining the task_struct until that time
- * and change dl_overflow() to not immediately decrement the current
- * amount.
- *
- * Instead we retain the current runtime/deadline and let the new
- * parameters take effect after the current reservation period lapses.
- * This is safe (albeit pessimistic) because the 0-lag point is always
- * before the current scheduling deadline.
- *
- * We can still have temporary overloads because we do not delay the
- * change in bandwidth until that time; so admission control is
- * not on the safe side. It does however guarantee tasks will never
- * consume more than promised.
- */
-}
-
-/*
* sched_setparam() passes in -1 for its policy, to let the functions
* it calls know not to change it.
*/
@@ -4101,59 +3953,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
p->sched_class = &fair_sched_class;
}
-static void
-__getparam_dl(struct task_struct *p, struct sched_attr *attr)
-{
- struct sched_dl_entity *dl_se = &p->dl;
-
- attr->sched_priority = p->rt_priority;
- attr->sched_runtime = dl_se->dl_runtime;
- attr->sched_deadline = dl_se->dl_deadline;
- attr->sched_period = dl_se->dl_period;
- attr->sched_flags = dl_se->flags;
-}
-
-/*
- * This function validates the new parameters of a -deadline task.
- * We ask for the deadline not being zero, and greater or equal
- * than the runtime, as well as the period of being zero or
- * greater than deadline. Furthermore, we have to be sure that
- * user parameters are above the internal resolution of 1us (we
- * check sched_runtime only since it is always the smaller one) and
- * below 2^63 ns (we have to check both sched_deadline and
- * sched_period, as the latter can be zero).
- */
-static bool
-__checkparam_dl(const struct sched_attr *attr)
-{
- /* deadline != 0 */
- if (attr->sched_deadline == 0)
- return false;
-
- /*
- * Since we truncate DL_SCALE bits, make sure we're at least
- * that big.
- */
- if (attr->sched_runtime < (1ULL << DL_SCALE))
- return false;
-
- /*
- * Since we use the MSB for wrap-around and sign issues, make
- * sure it's not set (mind that period can be equal to zero).
- */
- if (attr->sched_deadline & (1ULL << 63) ||
- attr->sched_period & (1ULL << 63))
- return false;
-
- /* runtime <= deadline <= period (if period != 0) */
- if ((attr->sched_period != 0 &&
- attr->sched_period < attr->sched_deadline) ||
- attr->sched_deadline < attr->sched_runtime)
- return false;
-
- return true;
-}
-
/*
* Check the target process has a UID that matches the current process's:
*/
@@ -4170,19 +3969,6 @@ static bool check_same_owner(struct task_struct *p)
return match;
}
-static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
-{
- struct sched_dl_entity *dl_se = &p->dl;
-
- if (dl_se->dl_runtime != attr->sched_runtime ||
- dl_se->dl_deadline != attr->sched_deadline ||
- dl_se->dl_period != attr->sched_period ||
- dl_se->flags != attr->sched_flags)
- return true;
-
- return false;
-}
-
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
@@ -4197,8 +3983,8 @@ static int __sched_setscheduler(struct task_struct *p,
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq *rq;
- /* May grab non-irq protected spin_locks: */
- BUG_ON(in_interrupt());
+ /* The pi code expects interrupts enabled */
+ BUG_ON(pi && in_interrupt());
recheck:
/* Double check policy once rq lock held: */
if (policy < 0) {
@@ -4211,7 +3997,8 @@ recheck:
return -EINVAL;
}
- if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
+ if (attr->sched_flags &
+ ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
return -EINVAL;
/*
@@ -4362,7 +4149,7 @@ change:
* of a SCHED_DEADLINE task) we need to check if enough bandwidth
* is available.
*/
- if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
+ if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
task_rq_unlock(rq, p, &rf);
return -EBUSY;
}
@@ -5463,26 +5250,17 @@ void init_idle(struct task_struct *idle, int cpu)
#endif
}
+#ifdef CONFIG_SMP
+
int cpuset_cpumask_can_shrink(const struct cpumask *cur,
const struct cpumask *trial)
{
- int ret = 1, trial_cpus;
- struct dl_bw *cur_dl_b;
- unsigned long flags;
+ int ret = 1;
if (!cpumask_weight(cur))
return ret;
- rcu_read_lock_sched();
- cur_dl_b = dl_bw_of(cpumask_any(cur));
- trial_cpus = cpumask_weight(trial);
-
- raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
- if (cur_dl_b->bw != -1 &&
- cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
- ret = 0;
- raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
- rcu_read_unlock_sched();
+ ret = dl_cpuset_cpumask_can_shrink(cur, trial);
return ret;
}
@@ -5506,43 +5284,14 @@ int task_can_attach(struct task_struct *p,
goto out;
}
-#ifdef CONFIG_SMP
if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
- cs_cpus_allowed)) {
- unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
- cs_cpus_allowed);
- struct dl_bw *dl_b;
- bool overflow;
- int cpus;
- unsigned long flags;
-
- rcu_read_lock_sched();
- dl_b = dl_bw_of(dest_cpu);
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(dest_cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
- if (overflow)
- ret = -EBUSY;
- else {
- /*
- * We reserve space for this task in the destination
- * root_domain, as we can't fail after this point.
- * We will free resources in the source root_domain
- * later on (see set_cpus_allowed_dl()).
- */
- __dl_add(dl_b, p->dl.dl_bw);
- }
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
- rcu_read_unlock_sched();
+ cs_cpus_allowed))
+ ret = dl_task_can_attach(p, cs_cpus_allowed);
- }
-#endif
out:
return ret;
}
-#ifdef CONFIG_SMP
-
bool sched_smp_initialized __read_mostly;
#ifdef CONFIG_NUMA_BALANCING
@@ -5605,7 +5354,7 @@ void idle_task_exit(void)
BUG_ON(cpu_online(smp_processor_id()));
if (mm != &init_mm) {
- switch_mm_irqs_off(mm, &init_mm, current);
+ switch_mm(mm, &init_mm, current);
finish_arch_post_lock_switch();
}
mmdrop(mm);
@@ -5805,23 +5554,8 @@ static void cpuset_cpu_active(void)
static int cpuset_cpu_inactive(unsigned int cpu)
{
- unsigned long flags;
- struct dl_bw *dl_b;
- bool overflow;
- int cpus;
-
if (!cpuhp_tasks_frozen) {
- rcu_read_lock_sched();
- dl_b = dl_bw_of(cpu);
-
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, 0);
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
- rcu_read_unlock_sched();
-
- if (overflow)
+ if (dl_cpu_busy(cpu))
return -EBUSY;
cpuset_update_active_cpus();
} else {
@@ -5874,15 +5608,9 @@ int sched_cpu_deactivate(unsigned int cpu)
* users of this state to go away such that all new such users will
* observe it.
*
- * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
- * not imply sync_sched(), so wait for both.
- *
* Do sync before park smpboot threads to take care the rcu boost case.
*/
- if (IS_ENABLED(CONFIG_PREEMPT))
- synchronize_rcu_mult(call_rcu, call_rcu_sched);
- else
- synchronize_rcu();
+ synchronize_rcu_mult(call_rcu, call_rcu_sched);
if (!sched_smp_initialized)
return 0;
@@ -5958,7 +5686,6 @@ void __init sched_init_smp(void)
cpumask_var_t non_isolated_cpus;
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
- alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
sched_init_numa();
@@ -5968,7 +5695,7 @@ void __init sched_init_smp(void)
* happen.
*/
mutex_lock(&sched_domains_mutex);
- init_sched_domains(cpu_active_mask);
+ sched_init_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -5984,7 +5711,6 @@ void __init sched_init_smp(void)
init_sched_dl_class();
sched_init_smt();
- sched_clock_init_late();
sched_smp_initialized = true;
}
@@ -6000,7 +5726,6 @@ early_initcall(migration_init);
void __init sched_init_smp(void)
{
sched_init_granularity();
- sched_clock_init_late();
}
#endif /* CONFIG_SMP */
@@ -6026,28 +5751,13 @@ static struct kmem_cache *task_group_cache __read_mostly;
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
-#define WAIT_TABLE_BITS 8
-#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
-static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
-
-wait_queue_head_t *bit_waitqueue(void *word, int bit)
-{
- const int shift = BITS_PER_LONG == 32 ? 5 : 6;
- unsigned long val = (unsigned long)word << shift | bit;
-
- return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
-}
-EXPORT_SYMBOL(bit_waitqueue);
-
void __init sched_init(void)
{
int i, j;
unsigned long alloc_size = 0, ptr;
sched_clock_init();
-
- for (i = 0; i < WAIT_TABLE_SIZE; i++)
- init_waitqueue_head(bit_wait_table + i);
+ wait_bit_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
alloc_size += 2 * nr_cpu_ids * sizeof(void **);
@@ -6199,7 +5909,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
#ifdef CONFIG_SMP
- zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
@@ -6251,8 +5960,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
!is_idle_task(current)) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
+ oops_in_progress)
return;
+
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
prev_jiffy = jiffies;
@@ -6507,385 +6218,6 @@ void sched_move_task(struct task_struct *tsk)
task_rq_unlock(rq, tsk, &rf);
}
-#endif /* CONFIG_CGROUP_SCHED */
-
-#ifdef CONFIG_RT_GROUP_SCHED
-/*
- * Ensure that the real time constraints are schedulable.
- */
-static DEFINE_MUTEX(rt_constraints_mutex);
-
-/* Must be called with tasklist_lock held */
-static inline int tg_has_rt_tasks(struct task_group *tg)
-{
- struct task_struct *g, *p;
-
- /*
- * Autogroups do not have RT tasks; see autogroup_create().
- */
- if (task_group_is_autogroup(tg))
- return 0;
-
- for_each_process_thread(g, p) {
- if (rt_task(p) && task_group(p) == tg)
- return 1;
- }
-
- return 0;
-}
-
-struct rt_schedulable_data {
- struct task_group *tg;
- u64 rt_period;
- u64 rt_runtime;
-};
-
-static int tg_rt_schedulable(struct task_group *tg, void *data)
-{
- struct rt_schedulable_data *d = data;
- struct task_group *child;
- unsigned long total, sum = 0;
- u64 period, runtime;
-
- period = ktime_to_ns(tg->rt_bandwidth.rt_period);
- runtime = tg->rt_bandwidth.rt_runtime;
-
- if (tg == d->tg) {
- period = d->rt_period;
- runtime = d->rt_runtime;
- }
-
- /*
- * Cannot have more runtime than the period.
- */
- if (runtime > period && runtime != RUNTIME_INF)
- return -EINVAL;
-
- /*
- * Ensure we don't starve existing RT tasks.
- */
- if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
- return -EBUSY;
-
- total = to_ratio(period, runtime);
-
- /*
- * Nobody can have more than the global setting allows.
- */
- if (total > to_ratio(global_rt_period(), global_rt_runtime()))
- return -EINVAL;
-
- /*
- * The sum of our children's runtime should not exceed our own.
- */
- list_for_each_entry_rcu(child, &tg->children, siblings) {
- period = ktime_to_ns(child->rt_bandwidth.rt_period);
- runtime = child->rt_bandwidth.rt_runtime;
-
- if (child == d->tg) {
- period = d->rt_period;
- runtime = d->rt_runtime;
- }
-
- sum += to_ratio(period, runtime);
- }
-
- if (sum > total)
- return -EINVAL;
-
- return 0;
-}
-
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-{
- int ret;
-
- struct rt_schedulable_data data = {
- .tg = tg,
- .rt_period = period,
- .rt_runtime = runtime,
- };
-
- rcu_read_lock();
- ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
- rcu_read_unlock();
-
- return ret;
-}
-
-static int tg_set_rt_bandwidth(struct task_group *tg,
- u64 rt_period, u64 rt_runtime)
-{
- int i, err = 0;
-
- /*
- * Disallowing the root group RT runtime is BAD, it would disallow the
- * kernel creating (and or operating) RT threads.
- */
- if (tg == &root_task_group && rt_runtime == 0)
- return -EINVAL;
-
- /* No period doesn't make any sense. */
- if (rt_period == 0)
- return -EINVAL;
-
- mutex_lock(&rt_constraints_mutex);
- read_lock(&tasklist_lock);
- err = __rt_schedulable(tg, rt_period, rt_runtime);
- if (err)
- goto unlock;
-
- raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
- tg->rt_bandwidth.rt_runtime = rt_runtime;
-
- for_each_possible_cpu(i) {
- struct rt_rq *rt_rq = tg->rt_rq[i];
-
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_runtime = rt_runtime;
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- }
- raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
-unlock:
- read_unlock(&tasklist_lock);
- mutex_unlock(&rt_constraints_mutex);
-
- return err;
-}
-
-static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
-{
- u64 rt_runtime, rt_period;
-
- rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
- rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
- if (rt_runtime_us < 0)
- rt_runtime = RUNTIME_INF;
-
- return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
-}
-
-static long sched_group_rt_runtime(struct task_group *tg)
-{
- u64 rt_runtime_us;
-
- if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
- return -1;
-
- rt_runtime_us = tg->rt_bandwidth.rt_runtime;
- do_div(rt_runtime_us, NSEC_PER_USEC);
- return rt_runtime_us;
-}
-
-static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
-{
- u64 rt_runtime, rt_period;
-
- rt_period = rt_period_us * NSEC_PER_USEC;
- rt_runtime = tg->rt_bandwidth.rt_runtime;
-
- return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
-}
-
-static long sched_group_rt_period(struct task_group *tg)
-{
- u64 rt_period_us;
-
- rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
- do_div(rt_period_us, NSEC_PER_USEC);
- return rt_period_us;
-}
-#endif /* CONFIG_RT_GROUP_SCHED */
-
-#ifdef CONFIG_RT_GROUP_SCHED
-static int sched_rt_global_constraints(void)
-{
- int ret = 0;
-
- mutex_lock(&rt_constraints_mutex);
- read_lock(&tasklist_lock);
- ret = __rt_schedulable(NULL, 0, 0);
- read_unlock(&tasklist_lock);
- mutex_unlock(&rt_constraints_mutex);
-
- return ret;
-}
-
-static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
-{
- /* Don't accept realtime tasks when there is no way for them to run */
- if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
- return 0;
-
- return 1;
-}
-
-#else /* !CONFIG_RT_GROUP_SCHED */
-static int sched_rt_global_constraints(void)
-{
- unsigned long flags;
- int i;
-
- raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
- for_each_possible_cpu(i) {
- struct rt_rq *rt_rq = &cpu_rq(i)->rt;
-
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_runtime = global_rt_runtime();
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- }
- raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
-
- return 0;
-}
-#endif /* CONFIG_RT_GROUP_SCHED */
-
-static int sched_dl_global_validate(void)
-{
- u64 runtime = global_rt_runtime();
- u64 period = global_rt_period();
- u64 new_bw = to_ratio(period, runtime);
- struct dl_bw *dl_b;
- int cpu, ret = 0;
- unsigned long flags;
-
- /*
- * Here we want to check the bandwidth not being set to some
- * value smaller than the currently allocated bandwidth in
- * any of the root_domains.
- *
- * FIXME: Cycling on all the CPUs is overdoing, but simpler than
- * cycling on root_domains... Discussion on different/better
- * solutions is welcome!
- */
- for_each_possible_cpu(cpu) {
- rcu_read_lock_sched();
- dl_b = dl_bw_of(cpu);
-
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- if (new_bw < dl_b->total_bw)
- ret = -EBUSY;
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
- rcu_read_unlock_sched();
-
- if (ret)
- break;
- }
-
- return ret;
-}
-
-static void sched_dl_do_global(void)
-{
- u64 new_bw = -1;
- struct dl_bw *dl_b;
- int cpu;
- unsigned long flags;
-
- def_dl_bandwidth.dl_period = global_rt_period();
- def_dl_bandwidth.dl_runtime = global_rt_runtime();
-
- if (global_rt_runtime() != RUNTIME_INF)
- new_bw = to_ratio(global_rt_period(), global_rt_runtime());
-
- /*
- * FIXME: As above...
- */
- for_each_possible_cpu(cpu) {
- rcu_read_lock_sched();
- dl_b = dl_bw_of(cpu);
-
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- dl_b->bw = new_bw;
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
- rcu_read_unlock_sched();
- }
-}
-
-static int sched_rt_global_validate(void)
-{
- if (sysctl_sched_rt_period <= 0)
- return -EINVAL;
-
- if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
- (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
- return -EINVAL;
-
- return 0;
-}
-
-static void sched_rt_do_global(void)
-{
- def_rt_bandwidth.rt_runtime = global_rt_runtime();
- def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
-}
-
-int sched_rt_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
-{
- int old_period, old_runtime;
- static DEFINE_MUTEX(mutex);
- int ret;
-
- mutex_lock(&mutex);
- old_period = sysctl_sched_rt_period;
- old_runtime = sysctl_sched_rt_runtime;
-
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
-
- if (!ret && write) {
- ret = sched_rt_global_validate();
- if (ret)
- goto undo;
-
- ret = sched_dl_global_validate();
- if (ret)
- goto undo;
-
- ret = sched_rt_global_constraints();
- if (ret)
- goto undo;
-
- sched_rt_do_global();
- sched_dl_do_global();
- }
- if (0) {
-undo:
- sysctl_sched_rt_period = old_period;
- sysctl_sched_rt_runtime = old_runtime;
- }
- mutex_unlock(&mutex);
-
- return ret;
-}
-
-int sched_rr_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
-{
- int ret;
- static DEFINE_MUTEX(mutex);
-
- mutex_lock(&mutex);
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
- /*
- * Make sure that internally we keep jiffies.
- * Also, writing zero resets the timeslice to default:
- */
- if (!ret && write) {
- sched_rr_timeslice =
- sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
- msecs_to_jiffies(sysctl_sched_rr_timeslice);
- }
- mutex_unlock(&mutex);
- return ret;
-}
-
-#ifdef CONFIG_CGROUP_SCHED
static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
{
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 622eed1b7658..076a2e31951c 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -101,9 +101,6 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
if (sg_policy->next_freq == next_freq)
return;
- if (sg_policy->next_freq > next_freq)
- next_freq = (sg_policy->next_freq + next_freq) >> 1;
-
sg_policy->next_freq = next_freq;
sg_policy->last_freq_update_time = time;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index aea3135c5d90..67c70e287647 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -615,19 +615,13 @@ static void cputime_adjust(struct task_cputime *curr,
 * userspace. Once a task gets some ticks, the monotonicity code at
* 'update' will ensure things converge to the observed ratio.
*/
- if (stime == 0) {
- utime = rtime;
- goto update;
+ if (stime != 0) {
+ if (utime == 0)
+ stime = rtime;
+ else
+ stime = scale_stime(stime, rtime, stime + utime);
}
- if (utime == 0) {
- stime = rtime;
- goto update;
- }
-
- stime = scale_stime(stime, rtime, stime + utime);
-
-update:
/*
* Make sure stime doesn't go backwards; this preserves monotonicity
* for utime because rtime is monotonic.
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a2ce59015642..a84299f44b5d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -17,6 +17,7 @@
#include "sched.h"
#include <linux/slab.h>
+#include <uapi/linux/sched/types.h>
struct dl_bandwidth def_dl_bandwidth;
@@ -43,6 +44,254 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
return !RB_EMPTY_NODE(&dl_se->rb_node);
}
+#ifdef CONFIG_SMP
+static inline struct dl_bw *dl_bw_of(int i)
+{
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+ return &cpu_rq(i)->rd->dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+ struct root_domain *rd = cpu_rq(i)->rd;
+ int cpus = 0;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+ for_each_cpu_and(i, rd->span, cpu_active_mask)
+ cpus++;
+
+ return cpus;
+}
+#else
+static inline struct dl_bw *dl_bw_of(int i)
+{
+ return &cpu_rq(i)->dl.dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+ return 1;
+}
+#endif
+
+static inline
+void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->running_bw;
+
+ lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ dl_rq->running_bw += dl_bw;
+ SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
+ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+}
+
+static inline
+void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->running_bw;
+
+ lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ dl_rq->running_bw -= dl_bw;
+ SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
+ if (dl_rq->running_bw > old)
+ dl_rq->running_bw = 0;
+}
+
+static inline
+void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->this_bw;
+
+ lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ dl_rq->this_bw += dl_bw;
+ SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */
+}
+
+static inline
+void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->this_bw;
+
+ lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ dl_rq->this_bw -= dl_bw;
+ SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */
+ if (dl_rq->this_bw > old)
+ dl_rq->this_bw = 0;
+ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+}
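
The four helpers above maintain two per-runqueue counters in the same BW_UNIT fixed point: this_bw for every task admitted on the runqueue and running_bw for the currently "active" subset, with overflow/underflow clamped. A hedged userspace sketch of that bookkeeping follows (assert() stands in for SCHED_WARN_ON() and no locking is shown):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct dl_rq_sketch {
        uint64_t running_bw;    /* utilization of active tasks        */
        uint64_t this_bw;       /* utilization of all tasks on the rq */
};

static void add_running_bw(uint64_t bw, struct dl_rq_sketch *dl)
{
        dl->running_bw += bw;
        assert(dl->running_bw <= dl->this_bw); /* never exceed this_bw */
}

static void sub_running_bw(uint64_t bw, struct dl_rq_sketch *dl)
{
        uint64_t old = dl->running_bw;

        dl->running_bw -= bw;
        if (dl->running_bw > old)       /* underflow: clamp to zero */
                dl->running_bw = 0;
}

int main(void)
{
        struct dl_rq_sketch dl = { 0, 0 };

        dl.this_bw = 104857;            /* one 0.1-CPU task admitted   */
        add_running_bw(104857, &dl);    /* it wakes up and contends    */
        sub_running_bw(104857, &dl);    /* its 0-lag time passes       */
        printf("running %llu / this %llu\n",
               (unsigned long long)dl.running_bw,
               (unsigned long long)dl.this_bw);
        return 0;
}
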
+
+void dl_change_utilization(struct task_struct *p, u64 new_bw)
+{
+ struct rq *rq;
+
+ if (task_on_rq_queued(p))
+ return;
+
+ rq = task_rq(p);
+ if (p->dl.dl_non_contending) {
+ sub_running_bw(p->dl.dl_bw, &rq->dl);
+ p->dl.dl_non_contending = 0;
+ /*
+ * If the timer handler is currently running and the
+ * timer cannot be cancelled, inactive_task_timer()
+ * will see that dl_not_contending is not set, and
+ * will not touch the rq's active utilization,
+ * so we are still safe.
+ */
+ if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
+ put_task_struct(p);
+ }
+ sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ add_rq_bw(new_bw, &rq->dl);
+}
+
+/*
+ * The utilization of a task cannot be immediately removed from
+ * the rq active utilization (running_bw) when the task blocks.
+ * Instead, we have to wait for the so called "0-lag time".
+ *
+ * If a task blocks before the "0-lag time", a timer (the inactive
+ * timer) is armed, and running_bw is decreased when the timer
+ * fires.
+ *
+ * If the task wakes up again before the inactive timer fires,
+ * the timer is cancelled, whereas if the task wakes up after the
+ * inactive timer fired (and running_bw has been decreased) the
+ * task's utilization has to be added to running_bw again.
+ * A flag in the deadline scheduling entity (dl_non_contending)
+ * is used to avoid race conditions between the inactive timer handler
+ * and task wakeups.
+ *
+ * The following diagram shows how running_bw is updated. A task is
+ * "ACTIVE" when its utilization contributes to running_bw; an
+ * "ACTIVE contending" task is in the TASK_RUNNING state, while an
+ * "ACTIVE non contending" task is a blocked task for which the "0-lag time"
+ * has not passed yet. An "INACTIVE" task is a task for which the "0-lag"
+ * time already passed, which does not contribute to running_bw anymore.
+ * +------------------+
+ * wakeup | ACTIVE |
+ * +------------------>+ contending |
+ * | add_running_bw | |
+ * | +----+------+------+
+ * | | ^
+ * | dequeue | |
+ * +--------+-------+ | |
+ * | | t >= 0-lag | | wakeup
+ * | INACTIVE |<---------------+ |
+ * | | sub_running_bw | |
+ * +--------+-------+ | |
+ * ^ | |
+ * | t < 0-lag | |
+ * | | |
+ * | V |
+ * | +----+------+------+
+ * | sub_running_bw | ACTIVE |
+ * +-------------------+ |
+ * inactive timer | non contending |
+ * fired +------------------+
+ *
+ * The task_non_contending() function is invoked when a task
+ * blocks, and checks if the 0-lag time already passed or
+ * not (in the first case, it directly updates running_bw;
+ * in the second case, it arms the inactive timer).
+ *
+ * The task_contending() function is invoked when a task wakes
+ * up, and checks if the task is still in the "ACTIVE non contending"
+ * state or not (in the second case, it updates running_bw).
+ */
+static void task_non_contending(struct task_struct *p)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct hrtimer *timer = &dl_se->inactive_timer;
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+ s64 zerolag_time;
+
+ /*
+ * If this is a non-deadline task that has been boosted,
+ * do nothing
+ */
+ if (dl_se->dl_runtime == 0)
+ return;
+
+ WARN_ON(hrtimer_active(&dl_se->inactive_timer));
+ WARN_ON(dl_se->dl_non_contending);
+
+ zerolag_time = dl_se->deadline -
+ div64_long((dl_se->runtime * dl_se->dl_period),
+ dl_se->dl_runtime);
+
+ /*
+ * Using relative times instead of the absolute "0-lag time"
+ * allows us to simplify the code
+ */
+ zerolag_time -= rq_clock(rq);
+
+ /*
+ * If the "0-lag time" already passed, decrease the active
+ * utilization now, instead of starting a timer
+ */
+ if (zerolag_time < 0) {
+ if (dl_task(p))
+ sub_running_bw(dl_se->dl_bw, dl_rq);
+ if (!dl_task(p) || p->state == TASK_DEAD) {
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+ if (p->state == TASK_DEAD)
+ sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ raw_spin_lock(&dl_b->lock);
+ __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ __dl_clear_params(p);
+ raw_spin_unlock(&dl_b->lock);
+ }
+
+ return;
+ }
+
+ dl_se->dl_non_contending = 1;
+ get_task_struct(p);
+ hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL);
+}
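
To make the zerolag_time expression above concrete, here is a worked example with toy numbers (standalone sketch, not the kernel helper): a task with dl_runtime = 10ms and dl_period = 100ms blocks with 4ms of runtime left and its absolute deadline 50ms away, so its 0-lag point is 50ms - 4ms * (100/10) = 10ms in the future and the inactive timer would be armed for that long.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* static parameters of the reservation */
        int64_t dl_runtime  =  10000000;        /* 10ms per period */
        int64_t dl_period   = 100000000;        /* 100ms           */

        /* dynamic state at the moment the task blocks */
        int64_t runtime     =   4000000;        /* 4ms left        */
        int64_t deadline    =  50000000;        /* absolute, in ns */
        int64_t rq_clock    =         0;        /* "now"           */

        /* same expression as in task_non_contending() */
        int64_t zerolag = deadline - (runtime * dl_period) / dl_runtime;

        zerolag -= rq_clock;
        if (zerolag < 0)
                printf("0-lag already passed: drop running_bw now\n");
        else
                printf("arm inactive timer for %lld ns\n", (long long)zerolag);
        return 0;
}
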
+
+static void task_contending(struct sched_dl_entity *dl_se, int flags)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+ /*
+ * If this is a non-deadline task that has been boosted,
+ * do nothing
+ */
+ if (dl_se->dl_runtime == 0)
+ return;
+
+ if (flags & ENQUEUE_MIGRATED)
+ add_rq_bw(dl_se->dl_bw, dl_rq);
+
+ if (dl_se->dl_non_contending) {
+ dl_se->dl_non_contending = 0;
+ /*
+ * If the timer handler is currently running and the
+ * timer cannot be cancelled, inactive_task_timer()
+ * will see that dl_not_contending is not set, and
+ * will not touch the rq's active utilization,
+ * so we are still safe.
+ */
+ if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1)
+ put_task_struct(dl_task_of(dl_se));
+ } else {
+ /*
+ * Since "dl_non_contending" is not set, the
+ * task's utilization has already been removed from
+ * active utilization (either when the task blocked,
+ * or when the "inactive timer" fired).
+ * So, add it back.
+ */
+ add_running_bw(dl_se->dl_bw, dl_rq);
+ }
+}
+
static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
{
struct sched_dl_entity *dl_se = &p->dl;
@@ -83,6 +332,10 @@ void init_dl_rq(struct dl_rq *dl_rq)
#else
init_dl_bw(&dl_rq->dl_bw);
#endif
+
+ dl_rq->running_bw = 0;
+ dl_rq->this_bw = 0;
+ init_dl_rq_bw_ratio(dl_rq);
}
#ifdef CONFIG_SMP
@@ -484,13 +737,84 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
}
/*
- * When a -deadline entity is queued back on the runqueue, its runtime and
- * deadline might need updating.
+ * Revised wakeup rule [1]: For self-suspending tasks, rather than
+ * re-initializing the task's runtime and deadline, the revised wakeup
+ * rule adjusts the task's runtime so that the task does not overrun its
+ * density.
+ *
+ * Reasoning: a task may overrun the density if:
+ * runtime / (deadline - t) > dl_runtime / dl_deadline
+ *
+ * Therefore, runtime can be adjusted to:
+ * runtime = (dl_runtime / dl_deadline) * (deadline - t)
+ *
+ * In such a way that the runtime will be equal to the maximum density
+ * the task can use without breaking any rule.
+ *
+ * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant
+ * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24.
+ */
+static void
+update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq)
+{
+ u64 laxity = dl_se->deadline - rq_clock(rq);
+
+ /*
+ * If the task has deadline < period, and the deadline is in the past,
+ * it should already be throttled before this check.
+ *
+ * See update_dl_entity() comments for further details.
+ */
+ WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq)));
+
+ dl_se->runtime = (dl_se->dl_density * laxity) >> BW_SHIFT;
+}
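
A worked example of the revised wakeup rule above (standalone sketch; BW_SHIFT is assumed to be 20 and dl_density is computed as in __setparam_dl() later in this patch): a task with 5ms of runtime over a 10ms deadline that wakes 6ms before its old deadline gets 0.5 * 6ms = 3ms of runtime back.

#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT        20

int main(void)
{
        uint64_t dl_runtime  =  5000000;        /* 5ms  */
        uint64_t dl_deadline = 10000000;        /* 10ms */

        /* dl_density = to_ratio(dl_deadline, dl_runtime) */
        uint64_t dl_density = (dl_runtime << BW_SHIFT) / dl_deadline;

        /* the task wakes 6ms before its old absolute deadline */
        uint64_t laxity = 6000000;

        /* update_dl_revised_wakeup(): runtime = density * laxity */
        uint64_t runtime = (dl_density * laxity) >> BW_SHIFT;

        printf("adjusted runtime: %llu ns (density %.2f)\n",
               (unsigned long long)runtime,
               (double)dl_density / (1 << BW_SHIFT));
        return 0;
}
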
+
+/*
+ * Regarding the deadline, a task with implicit deadline has a relative
+ * deadline == relative period. A task with constrained deadline has a
+ * relative deadline <= relative period.
+ *
+ * We support constrained deadline tasks. However, there are some restrictions
+ * applied only for tasks which do not have an implicit deadline. See
+ * update_dl_entity() to know more about such restrictions.
+ *
+ * dl_is_implicit() returns true if the task has an implicit deadline.
+ */
+static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
+{
+ return dl_se->dl_deadline == dl_se->dl_period;
+}
+
+/*
+ * When a deadline entity is placed in the runqueue, its runtime and deadline
+ * might need to be updated. This is done by a CBS wake up rule. There are two
+ * different rules: 1) the original CBS; and 2) the Revisited CBS.
+ *
+ * When the task is starting a new period, the Original CBS is used. In this
+ * case, the runtime is replenished and a new absolute deadline is set.
+ *
+ * When a task is queued before the beginning of the next period, using the
+ * remaining runtime and deadline could make the entity overflow; see
+ * dl_entity_overflow() to find out more about runtime overflow. When such a
+ * case is detected, the runtime and deadline need to be updated.
+ *
+ * If the task has an implicit deadline, i.e., deadline == period, the Original
+ * CBS is applied: the runtime is replenished and a new absolute deadline is
+ * set, as in the previous cases.
+ *
+ * However, the Original CBS does not work properly for tasks with
+ * deadline < period, which are said to have a constrained deadline. By
+ * applying the Original CBS, a constrained deadline task would be able to run
+ * runtime/deadline in a period. With deadline < period, the task would
+ * overrun the runtime/period allowed bandwidth, breaking the admission test.
*
- * The policy here is that we update the deadline of the entity only if:
- * - the current deadline is in the past,
- * - using the remaining runtime with the current deadline would make
- * the entity exceed its bandwidth.
+ * In order to prevent this misbehavior, the Revisited CBS is used for
+ * constrained deadline tasks when a runtime overflow is detected. In the
+ * Revisited CBS, rather than replenishing & setting a new absolute deadline,
+ * the remaining runtime of the task is reduced to avoid runtime overflow.
+ * Please refer to the comments in the update_dl_revised_wakeup() function to
+ * find out more about the Revised CBS rule.
*/
static void update_dl_entity(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se)
@@ -500,6 +824,14 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
+
+ if (unlikely(!dl_is_implicit(dl_se) &&
+ !dl_time_before(dl_se->deadline, rq_clock(rq)) &&
+ !dl_se->dl_boosted)){
+ update_dl_revised_wakeup(dl_se, rq);
+ return;
+ }
+
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
}
@@ -593,10 +925,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* The task might have changed its scheduling policy to something
* different than SCHED_DEADLINE (through switched_from_dl()).
*/
- if (!dl_task(p)) {
- __dl_clear_params(p);
+ if (!dl_task(p))
goto unlock;
- }
/*
* The task might have been boosted by someone else and might be in the
@@ -723,6 +1053,8 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
return;
dl_se->dl_throttled = 1;
+ if (dl_se->runtime > 0)
+ dl_se->runtime = 0;
}
}
@@ -735,6 +1067,47 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
/*
+ * This function implements the GRUB accounting rule:
+ * according to the GRUB reclaiming algorithm, the runtime is
+ * not decreased as "dq = -dt", but as
+ * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt",
+ * where u is the utilization of the task, Umax is the maximum reclaimable
+ * utilization, Uinact is the (per-runqueue) inactive utilization, computed
+ * as the difference between the "total runqueue utilization" and the
+ * runqueue active utilization, and Uextra is the (per runqueue) extra
+ * reclaimable utilization.
+ * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
+ * multiplied by 2^BW_SHIFT, the result has to be shifted right by
+ * BW_SHIFT.
+ * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT,
+ * dl_bw is multiplied by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
+ * Since delta is a 64 bit variable, to have an overflow its value
+ * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
+ * So, overflow is not an issue here.
+ */
+u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
+{
+ u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
+ u64 u_act;
+ u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT;
+
+ /*
+ * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)},
+ * we compare u_inact + rq->dl.extra_bw with
+ * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because
+ * u_inact + rq->dl.extra_bw can be larger than
+ * 1 (so, 1 - u_inact - rq->dl.extra_bw would be negative
+ * leading to wrong results)
+ */
+ if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min)
+ u_act = u_act_min;
+ else
+ u_act = BW_UNIT - u_inact - rq->dl.extra_bw;
+
+ return (delta * u_act) >> BW_SHIFT;
+}
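
To see the fixed-point arithmetic of grub_reclaim() with numbers, here is a standalone sketch (BW_SHIFT = 20 and RATIO_SHIFT = 8 as implied by the '2^(64 - 20 - 8)' remark above; the per-runqueue values are illustrative, not derived from a real root domain):

#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT        20
#define BW_UNIT         (1ULL << BW_SHIFT)
#define RATIO_SHIFT     8

int main(void)
{
        /* illustrative per-runqueue state, all in BW_UNIT fixed point */
        uint64_t running_bw = BW_UNIT / 4;        /* Uact: one 0.25 task running      */
        uint64_t this_bw    = BW_UNIT / 2;        /* Utot: a second 0.25 task admitted */
        uint64_t extra_bw   = BW_UNIT * 45 / 100; /* Uextra                           */
        uint64_t bw_ratio   = ((1 << RATIO_SHIFT) * 100) / 95; /* 1/Umax, Umax = 0.95 */

        uint64_t dl_bw = BW_UNIT / 4;   /* utilization of the running task */
        uint64_t delta = 1000000;       /* it just ran for 1ms             */

        /* same steps as grub_reclaim() */
        uint64_t u_inact = this_bw - running_bw;
        uint64_t u_act_min = (dl_bw * bw_ratio) >> RATIO_SHIFT;
        uint64_t u_act;

        if (u_inact + extra_bw > BW_UNIT - u_act_min)
                u_act = u_act_min;
        else
                u_act = BW_UNIT - u_inact - extra_bw;

        printf("charge %llu ns of runtime for %llu ns of execution\n",
               (unsigned long long)((delta * u_act) >> BW_SHIFT),
               (unsigned long long)delta);
        return 0;
}
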
+
+/*
* Update the current task's runtime statistics (provided it is still
* a -deadline task and has not been removed from the dl_rq).
*/
@@ -776,6 +1149,8 @@ static void update_curr_dl(struct rq *rq)
sched_rt_avg_update(rq, delta_exec);
+ if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM))
+ delta_exec = grub_reclaim(delta_exec, rq, &curr->dl);
dl_se->runtime -= delta_exec;
throttle:
@@ -815,6 +1190,56 @@ throttle:
}
}
+static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
+{
+ struct sched_dl_entity *dl_se = container_of(timer,
+ struct sched_dl_entity,
+ inactive_timer);
+ struct task_struct *p = dl_task_of(dl_se);
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+
+ if (!dl_task(p) || p->state == TASK_DEAD) {
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+ if (p->state == TASK_DEAD && dl_se->dl_non_contending) {
+ sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
+ sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
+ dl_se->dl_non_contending = 0;
+ }
+
+ raw_spin_lock(&dl_b->lock);
+ __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ raw_spin_unlock(&dl_b->lock);
+ __dl_clear_params(p);
+
+ goto unlock;
+ }
+ if (dl_se->dl_non_contending == 0)
+ goto unlock;
+
+ sched_clock_tick();
+ update_rq_clock(rq);
+
+ sub_running_bw(dl_se->dl_bw, &rq->dl);
+ dl_se->dl_non_contending = 0;
+unlock:
+ task_rq_unlock(rq, p, &rf);
+ put_task_struct(p);
+
+ return HRTIMER_NORESTART;
+}
+
+void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
+{
+ struct hrtimer *timer = &dl_se->inactive_timer;
+
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ timer->function = inactive_task_timer;
+}
+
#ifdef CONFIG_SMP
static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
@@ -946,10 +1371,12 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
* parameters of the task might need updating. Otherwise,
* we want a replenishment of its runtime.
*/
- if (flags & ENQUEUE_WAKEUP)
+ if (flags & ENQUEUE_WAKEUP) {
+ task_contending(dl_se, flags);
update_dl_entity(dl_se, pi_se);
- else if (flags & ENQUEUE_REPLENISH)
+ } else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se, pi_se);
+ }
__enqueue_dl_entity(dl_se);
}
@@ -959,11 +1386,6 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
__dequeue_dl_entity(dl_se);
}
-static inline bool dl_is_constrained(struct sched_dl_entity *dl_se)
-{
- return dl_se->dl_deadline < dl_se->dl_period;
-}
-
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *pi_task = rt_mutex_get_top_task(p);
@@ -995,17 +1417,32 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* If that is the case, the task will be throttled and
* the replenishment timer will be set to the next period.
*/
- if (!p->dl.dl_throttled && dl_is_constrained(&p->dl))
+ if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl))
dl_check_constrained_dl(&p->dl);
+ if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
+ add_rq_bw(p->dl.dl_bw, &rq->dl);
+ add_running_bw(p->dl.dl_bw, &rq->dl);
+ }
+
/*
- * If p is throttled, we do nothing. In fact, if it exhausted
+ * If p is throttled, we do not enqueue it. In fact, if it exhausted
* its budget it needs a replenishment and, since it now is on
* its rq, the bandwidth timer callback (which clearly has not
* run yet) will take care of this.
+ * However, the active utilization does not depend on whether the
+ * task is on the runqueue (but depends on the
+ * task's state - in GRUB parlance, "inactive" vs "active contending").
+ * In other words, even if a task is throttled its utilization must
+ * be counted in the active utilization; hence, we need to call
+ * add_running_bw().
*/
- if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
+ if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
+ if (flags & ENQUEUE_WAKEUP)
+ task_contending(&p->dl, flags);
+
return;
+ }
enqueue_dl_entity(&p->dl, pi_se, flags);
@@ -1023,6 +1460,23 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
update_curr_dl(rq);
__dequeue_task_dl(rq, p, flags);
+
+ if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) {
+ sub_running_bw(p->dl.dl_bw, &rq->dl);
+ sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ }
+
+ /*
+ * This check allows us to start the inactive timer (or to immediately
+ * decrease the active utilization, if needed) in two cases:
+ * when the task blocks and when it is terminating
+ * (p->state == TASK_DEAD). We can handle the two cases in the same
+ * way, because from GRUB's point of view the same thing is happening
+ * (the task moves from "active contending" to "active non contending"
+ * or "inactive")
+ */
+ if (flags & DEQUEUE_SLEEP)
+ task_non_contending(p);
}
/*
@@ -1100,6 +1554,37 @@ out:
return cpu;
}
+static void migrate_task_rq_dl(struct task_struct *p)
+{
+ struct rq *rq;
+
+ if (p->state != TASK_WAKING)
+ return;
+
+ rq = task_rq(p);
+ /*
+ * Since p->state == TASK_WAKING, set_task_cpu() has been called
+ * from try_to_wake_up(). Hence, p->pi_lock is locked, but
+ * rq->lock is not... So, lock it
+ */
+ raw_spin_lock(&rq->lock);
+ if (p->dl.dl_non_contending) {
+ sub_running_bw(p->dl.dl_bw, &rq->dl);
+ p->dl.dl_non_contending = 0;
+ /*
+ * If the timer handler is currently running and the
+ * timer cannot be cancelled, inactive_task_timer()
+ * will see that dl_not_contending is not set, and
+ * will not touch the rq's active utilization,
+ * so we are still safe.
+ */
+ if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
+ put_task_struct(p);
+ }
+ sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ raw_spin_unlock(&rq->lock);
+}
+
static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
{
/*
@@ -1255,19 +1740,6 @@ static void task_fork_dl(struct task_struct *p)
*/
}
-static void task_dead_dl(struct task_struct *p)
-{
- struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
-
- /*
- * Since we are TASK_DEAD we won't slip out of the domain!
- */
- raw_spin_lock_irq(&dl_b->lock);
- /* XXX we should retain the bw until 0-lag */
- dl_b->total_bw -= p->dl.dl_bw;
- raw_spin_unlock_irq(&dl_b->lock);
-}
-
static void set_curr_task_dl(struct rq *rq)
{
struct task_struct *p = rq->curr;
@@ -1533,7 +2005,7 @@ retry:
* then possible that next_task has migrated.
*/
task = pick_next_pushable_dl_task(rq);
- if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ if (task == next_task) {
/*
* The task is still there. We don't try
* again, some other cpu will pull it when ready.
@@ -1551,7 +2023,11 @@ retry:
}
deactivate_task(rq, next_task, 0);
+ sub_running_bw(next_task->dl.dl_bw, &rq->dl);
+ sub_rq_bw(next_task->dl.dl_bw, &rq->dl);
set_task_cpu(next_task, later_rq->cpu);
+ add_rq_bw(next_task->dl.dl_bw, &later_rq->dl);
+ add_running_bw(next_task->dl.dl_bw, &later_rq->dl);
activate_task(later_rq, next_task, 0);
ret = 1;
@@ -1639,7 +2115,11 @@ static void pull_dl_task(struct rq *this_rq)
resched = true;
deactivate_task(src_rq, p, 0);
+ sub_running_bw(p->dl.dl_bw, &src_rq->dl);
+ sub_rq_bw(p->dl.dl_bw, &src_rq->dl);
set_task_cpu(p, this_cpu);
+ add_rq_bw(p->dl.dl_bw, &this_rq->dl);
+ add_running_bw(p->dl.dl_bw, &this_rq->dl);
activate_task(this_rq, p, 0);
dmin = p->dl.deadline;
@@ -1695,7 +2175,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
* until we complete the update.
*/
raw_spin_lock(&src_dl_b->lock);
- __dl_clear(src_dl_b, p->dl.dl_bw);
+ __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&src_dl_b->lock);
}
@@ -1737,13 +2217,26 @@ void __init init_sched_dl_class(void)
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
/*
- * Start the deadline timer; if we switch back to dl before this we'll
- * continue consuming our current CBS slice. If we stay outside of
- * SCHED_DEADLINE until the deadline passes, the timer will reset the
- * task.
+ * task_non_contending() can start the "inactive timer" (if the 0-lag
+ * time is in the future). If the task switches back to dl before
+ * the "inactive timer" fires, it can continue to consume its current
+ * runtime using its current deadline. If it stays outside of
+ * SCHED_DEADLINE until the 0-lag time passes, inactive_task_timer()
+ * will reset the task parameters.
*/
- if (!start_dl_timer(p))
- __dl_clear_params(p);
+ if (task_on_rq_queued(p) && p->dl.dl_runtime)
+ task_non_contending(p);
+
+ if (!task_on_rq_queued(p))
+ sub_rq_bw(p->dl.dl_bw, &rq->dl);
+
+ /*
+ * We cannot use inactive_task_timer() to invoke sub_running_bw()
+ * at the 0-lag time, because the task could have been migrated
+ * while it was SCHED_OTHER in the meantime.
+ */
+ if (p->dl.dl_non_contending)
+ p->dl.dl_non_contending = 0;
/*
* Since this might be the only -deadline task on the rq,
@@ -1762,11 +2255,15 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
*/
static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
+ if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
+ put_task_struct(p);
/* If p is not queued we will update its parameters at next wakeup. */
- if (!task_on_rq_queued(p))
- return;
+ if (!task_on_rq_queued(p)) {
+ add_rq_bw(p->dl.dl_bw, &rq->dl);
+ return;
+ }
/*
* If p is boosted we already updated its params in
* rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
@@ -1836,6 +2333,7 @@ const struct sched_class dl_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_dl,
+ .migrate_task_rq = migrate_task_rq_dl,
.set_cpus_allowed = set_cpus_allowed_dl,
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
@@ -1845,7 +2343,6 @@ const struct sched_class dl_sched_class = {
.set_curr_task = set_curr_task_dl,
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
- .task_dead = task_dead_dl,
.prio_changed = prio_changed_dl,
.switched_from = switched_from_dl,
@@ -1854,6 +2351,317 @@ const struct sched_class dl_sched_class = {
.update_curr = update_curr_dl,
};
+int sched_dl_global_validate(void)
+{
+ u64 runtime = global_rt_runtime();
+ u64 period = global_rt_period();
+ u64 new_bw = to_ratio(period, runtime);
+ struct dl_bw *dl_b;
+ int cpu, ret = 0;
+ unsigned long flags;
+
+ /*
+ * Here we want to check the bandwidth not being set to some
+ * value smaller than the currently allocated bandwidth in
+ * any of the root_domains.
+ *
+ * FIXME: Cycling on all the CPUs is overdoing, but simpler than
+ * cycling on root_domains... Discussion on different/better
+ * solutions is welcome!
+ */
+ for_each_possible_cpu(cpu) {
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ if (new_bw < dl_b->total_bw)
+ ret = -EBUSY;
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ rcu_read_unlock_sched();
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
+{
+ if (global_rt_runtime() == RUNTIME_INF) {
+ dl_rq->bw_ratio = 1 << RATIO_SHIFT;
+ dl_rq->extra_bw = 1 << BW_SHIFT;
+ } else {
+ dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
+ global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
+ dl_rq->extra_bw = to_ratio(global_rt_period(),
+ global_rt_runtime());
+ }
+}
+
+void sched_dl_do_global(void)
+{
+ u64 new_bw = -1;
+ struct dl_bw *dl_b;
+ int cpu;
+ unsigned long flags;
+
+ def_dl_bandwidth.dl_period = global_rt_period();
+ def_dl_bandwidth.dl_runtime = global_rt_runtime();
+
+ if (global_rt_runtime() != RUNTIME_INF)
+ new_bw = to_ratio(global_rt_period(), global_rt_runtime());
+
+ /*
+ * FIXME: As above...
+ */
+ for_each_possible_cpu(cpu) {
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ dl_b->bw = new_bw;
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ rcu_read_unlock_sched();
+ init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
+ }
+}
+
+/*
+ * We must be sure that accepting a new task (or allowing changing the
+ * parameters of an existing one) is consistent with the bandwidth
+ * constraints. If yes, this function also accordingly updates the currently
+ * allocated bandwidth to reflect the new situation.
+ *
+ * This function is called while holding p's rq->lock.
+ */
+int sched_dl_overflow(struct task_struct *p, int policy,
+ const struct sched_attr *attr)
+{
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+ u64 period = attr->sched_period ?: attr->sched_deadline;
+ u64 runtime = attr->sched_runtime;
+ u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
+ int cpus, err = -1;
+
+ /* !deadline task may carry old deadline bandwidth */
+ if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
+ return 0;
+
+ /*
+ * Whether a task enters, leaves, or stays -deadline but changes
+ * its parameters, we may need to update the total allocated
+ * bandwidth of the container accordingly.
+ */
+ raw_spin_lock(&dl_b->lock);
+ cpus = dl_bw_cpus(task_cpu(p));
+ if (dl_policy(policy) && !task_has_dl_policy(p) &&
+ !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+ if (hrtimer_active(&p->dl.inactive_timer))
+ __dl_clear(dl_b, p->dl.dl_bw, cpus);
+ __dl_add(dl_b, new_bw, cpus);
+ err = 0;
+ } else if (dl_policy(policy) && task_has_dl_policy(p) &&
+ !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+ /*
+ * XXX this is slightly incorrect: when the task
+ * utilization decreases, we should delay the total
+ * utilization change until the task's 0-lag point.
+ * But this would require to set the task's "inactive
+ * timer" when the task is not inactive.
+ */
+ __dl_clear(dl_b, p->dl.dl_bw, cpus);
+ __dl_add(dl_b, new_bw, cpus);
+ dl_change_utilization(p, new_bw);
+ err = 0;
+ } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
+ /*
+ * Do not decrease the total deadline utilization here,
+ * switched_from_dl() will take care to do it at the correct
+ * (0-lag) time.
+ */
+ err = 0;
+ }
+ raw_spin_unlock(&dl_b->lock);
+
+ return err;
+}
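
The admission decision above ultimately reduces to the __dl_overflow() predicate declared later in sched.h: the configured per-CPU limit, scaled by the number of CPUs in the root domain, must cover the already-allocated bandwidth minus the old reservation plus the new one. A hedged stand-alone rendition with made-up numbers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT 20
#define BW_UNIT  (1 << BW_SHIFT)

/* dl_bw_bw == (uint64_t)-1 means "no limit", mirroring the RUNTIME_INF handling. */
static bool dl_overflow(uint64_t dl_bw_bw, uint64_t total_bw, int cpus,
			uint64_t old_bw, uint64_t new_bw)
{
	return dl_bw_bw != (uint64_t)-1 &&
	       dl_bw_bw * cpus < total_bw - old_bw + new_bw;
}

int main(void)
{
	uint64_t limit = (95 * BW_UNIT) / 100;	/* default 95% per CPU */
	uint64_t total = 3 * (BW_UNIT / 2);	/* 1.5 CPUs' worth already admitted */
	uint64_t want  = BW_UNIT / 2;		/* new task asks for 0.5 CPU */

	/* On a 2-CPU root domain this overflows; on 4 CPUs it fits. */
	printf("2 cpus: %d, 4 cpus: %d\n",
	       dl_overflow(limit, total, 2, 0, want),
	       dl_overflow(limit, total, 4, 0, want));
	return 0;
}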
+
+/*
+ * This function initializes the sched_dl_entity of a task that is
+ * becoming a SCHED_DEADLINE task.
+ *
+ * Only the static values are considered here, the actual runtime and the
+ * absolute deadline will be properly calculated when the task is enqueued
+ * for the first time with its new policy.
+ */
+void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ dl_se->dl_runtime = attr->sched_runtime;
+ dl_se->dl_deadline = attr->sched_deadline;
+ dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
+ dl_se->flags = attr->sched_flags;
+ dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
+}
+
+void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ attr->sched_priority = p->rt_priority;
+ attr->sched_runtime = dl_se->dl_runtime;
+ attr->sched_deadline = dl_se->dl_deadline;
+ attr->sched_period = dl_se->dl_period;
+ attr->sched_flags = dl_se->flags;
+}
+
+/*
+ * This function validates the new parameters of a -deadline task.
+ * We require the deadline to be non-zero and greater than or equal
+ * to the runtime, and the period to be either zero or greater than
+ * or equal to the deadline. Furthermore, we have to be sure that
+ * user parameters are above the internal resolution of 1us (we
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
+ */
+bool __checkparam_dl(const struct sched_attr *attr)
+{
+ /* deadline != 0 */
+ if (attr->sched_deadline == 0)
+ return false;
+
+ /*
+ * Since we truncate DL_SCALE bits, make sure we're at least
+ * that big.
+ */
+ if (attr->sched_runtime < (1ULL << DL_SCALE))
+ return false;
+
+ /*
+ * Since we use the MSB for wrap-around and sign issues, make
+ * sure it's not set (mind that period can be equal to zero).
+ */
+ if (attr->sched_deadline & (1ULL << 63) ||
+ attr->sched_period & (1ULL << 63))
+ return false;
+
+ /* runtime <= deadline <= period (if period != 0) */
+ if ((attr->sched_period != 0 &&
+ attr->sched_period < attr->sched_deadline) ||
+ attr->sched_deadline < attr->sched_runtime)
+ return false;
+
+ return true;
+}
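
For illustration, a user-space rendition of exactly these checks with one accepted and one rejected parameter set; DL_SCALE is assumed to be 10 (the ~1us resolution mentioned above):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DL_SCALE 10	/* assumed: matches the kernel's internal resolution */

struct dl_params { uint64_t runtime, deadline, period; };

static bool checkparam_dl(const struct dl_params *a)
{
	if (a->deadline == 0)
		return false;
	if (a->runtime < (1ULL << DL_SCALE))
		return false;
	if ((a->deadline & (1ULL << 63)) || (a->period & (1ULL << 63)))
		return false;
	if ((a->period != 0 && a->period < a->deadline) ||
	    a->deadline < a->runtime)
		return false;
	return true;
}

int main(void)
{
	struct dl_params ok  = { 10000000, 30000000, 100000000 };	/* 10/30/100 ms */
	struct dl_params bad = { 500, 30000000, 100000000 };		/* runtime < 1us */

	printf("ok=%d bad=%d\n", checkparam_dl(&ok), checkparam_dl(&bad));
	return 0;
}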
+
+/*
+ * This function clears the sched_dl_entity static params.
+ */
+void __dl_clear_params(struct task_struct *p)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ dl_se->dl_runtime = 0;
+ dl_se->dl_deadline = 0;
+ dl_se->dl_period = 0;
+ dl_se->flags = 0;
+ dl_se->dl_bw = 0;
+ dl_se->dl_density = 0;
+
+ dl_se->dl_throttled = 0;
+ dl_se->dl_yielded = 0;
+ dl_se->dl_non_contending = 0;
+}
+
+bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ if (dl_se->dl_runtime != attr->sched_runtime ||
+ dl_se->dl_deadline != attr->sched_deadline ||
+ dl_se->dl_period != attr->sched_period ||
+ dl_se->flags != attr->sched_flags)
+ return true;
+
+ return false;
+}
+
+#ifdef CONFIG_SMP
+int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
+{
+ unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
+ cs_cpus_allowed);
+ struct dl_bw *dl_b;
+ bool overflow;
+ int cpus, ret;
+ unsigned long flags;
+
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(dest_cpu);
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(dest_cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+ if (overflow)
+ ret = -EBUSY;
+ else {
+ /*
+ * We reserve space for this task in the destination
+ * root_domain, as we can't fail after this point.
+ * We will free resources in the source root_domain
+ * later on (see set_cpus_allowed_dl()).
+ */
+ __dl_add(dl_b, p->dl.dl_bw, cpus);
+ ret = 0;
+ }
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ rcu_read_unlock_sched();
+ return ret;
+}
+
+int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
+ const struct cpumask *trial)
+{
+ int ret = 1, trial_cpus;
+ struct dl_bw *cur_dl_b;
+ unsigned long flags;
+
+ rcu_read_lock_sched();
+ cur_dl_b = dl_bw_of(cpumask_any(cur));
+ trial_cpus = cpumask_weight(trial);
+
+ raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
+ if (cur_dl_b->bw != -1 &&
+ cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
+ ret = 0;
+ raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
+ rcu_read_unlock_sched();
+ return ret;
+}
+
+bool dl_cpu_busy(unsigned int cpu)
+{
+ unsigned long flags;
+ struct dl_bw *dl_b;
+ bool overflow;
+ int cpus;
+
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, 0);
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ rcu_read_unlock_sched();
+ return overflow;
+}
+#endif
+
#ifdef CONFIG_SCHED_DEBUG
extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 38f019324f1a..4fa66de52bd6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -552,15 +552,21 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
#define P(x) \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
+#define PU(x) \
+ SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(rt_rq->x))
#define PN(x) \
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
- P(rt_nr_running);
+ PU(rt_nr_running);
+#ifdef CONFIG_SMP
+ PU(rt_nr_migratory);
+#endif
P(rt_throttled);
PN(rt_time);
PN(rt_runtime);
#undef PN
+#undef PU
#undef P
}
@@ -569,14 +575,21 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
struct dl_bw *dl_bw;
SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
- SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
+
+#define PU(x) \
+ SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x))
+
+ PU(dl_nr_running);
#ifdef CONFIG_SMP
+ PU(dl_nr_migratory);
dl_bw = &cpu_rq(cpu)->rd->dl_bw;
#else
dl_bw = &dl_rq->dl_bw;
#endif
SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw);
SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);
+
+#undef PU
}
extern __read_mostly int sched_clock_running;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d71109321841..008c514dc241 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}
/* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+ list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
+ leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
static inline struct cfs_rq *
@@ -463,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+ for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
@@ -1381,7 +1382,6 @@ static unsigned long weighted_cpuload(const int cpu);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long capacity_of(int cpu);
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
/* Cached statistics for all CPUs within a node */
struct numa_stats {
@@ -2469,7 +2469,8 @@ void task_numa_work(struct callback_head *work)
return;
- down_read(&mm->mmap_sem);
+ if (!down_read_trylock(&mm->mmap_sem))
+ return;
vma = find_vma(mm, start);
if (!vma) {
reset_ptenuma_scan(p);
@@ -2584,6 +2585,60 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
}
}
}
+
+/*
+ * Can a task be moved from prev_cpu to this_cpu without causing a load
+ * imbalance that would trigger the load balancer?
+ */
+static inline bool numa_wake_affine(struct sched_domain *sd,
+ struct task_struct *p, int this_cpu,
+ int prev_cpu, int sync)
+{
+ struct numa_stats prev_load, this_load;
+ s64 this_eff_load, prev_eff_load;
+
+ update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
+ update_numa_stats(&this_load, cpu_to_node(this_cpu));
+
+ /*
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current CPU:
+ */
+ if (sync) {
+ unsigned long current_load = task_h_load(current);
+
+ if (this_load.load > current_load)
+ this_load.load -= current_load;
+ else
+ this_load.load = 0;
+ }
+
+ /*
+ * In low-load situations, where this_cpu's node is idle due to the
+ * sync cause above having dropped this_load.load to 0, move the task.
+ * Moving to an idle socket will not create a bad imbalance.
+ *
+ * Otherwise check if the nodes are near enough in load to allow this
+ * task to be woken on this_cpu's node.
+ */
+ if (this_load.load > 0) {
+ unsigned long task_load = task_h_load(p);
+
+ this_eff_load = 100;
+ this_eff_load *= prev_load.compute_capacity;
+
+ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= this_load.compute_capacity;
+
+ this_eff_load *= this_load.load + task_load;
+ prev_eff_load *= prev_load.load - task_load;
+
+ return this_eff_load <= prev_eff_load;
+ }
+
+ return true;
+}
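
The comparison above is the classic wake_affine() load test redone per NUMA node: each side is weighted by the other node's compute capacity, and the previous node additionally gets an imbalance_pct head start. A hedged sketch with made-up numbers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch of the numa_wake_affine() comparison; all inputs are invented. */
static bool numa_wake_affine_sketch(uint64_t this_load, uint64_t this_cap,
				    uint64_t prev_load, uint64_t prev_cap,
				    uint64_t task_load, unsigned int imbalance_pct)
{
	int64_t this_eff, prev_eff;

	if (this_load == 0)
		return true;			/* destination node is idle: move */

	this_eff  = 100;
	this_eff *= prev_cap;
	this_eff *= this_load + task_load;

	prev_eff  = 100 + (imbalance_pct - 100) / 2;
	prev_eff *= this_cap;
	prev_eff *= prev_load - task_load;

	return this_eff <= prev_eff;
}

int main(void)
{
	/* e.g. imbalance_pct = 125 gives the previous node a 12.5% margin */
	printf("%d\n", numa_wake_affine_sketch(2048, 4096, 3072, 4096, 512, 125));
	return 0;
}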
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
@@ -2596,6 +2651,15 @@ static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}
+
+#ifdef CONFIG_SMP
+static inline bool numa_wake_affine(struct sched_domain *sd,
+ struct task_struct *p, int this_cpu,
+ int prev_cpu, int sync)
+{
+ return true;
+}
+#endif /* CONFIG_SMP */
#endif /* CONFIG_NUMA_BALANCING */
static void
@@ -2916,12 +2980,12 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
/*
* Step 2: update *_avg.
*/
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
if (cfs_rq) {
cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+ div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
}
- sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+ sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
return 1;
}
@@ -2982,8 +3046,7 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
* differential update where we store the last value we propagated. This in
* turn allows skipping updates if the differential is 'small'.
*
- * Updating tg's load_avg is necessary before update_cfs_share() (which is
- * done) and effective_load() (which is not done because it is too costly).
+ * Updating tg's load_avg is necessary before update_cfs_share().
*/
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
@@ -3563,7 +3626,7 @@ static inline void check_schedstat_required(void)
trace_sched_stat_runtime_enabled()) {
printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
"stat_blocked and stat_runtime require the "
- "kernel parameter schedstats=enabled or "
+ "kernel parameter schedstats=enable or "
"kernel.sched_schedstats=1\n");
}
#endif
@@ -4642,24 +4705,43 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
hrtimer_cancel(&cfs_b->slack_timer);
}
+/*
+ * Both these cpu hotplug callbacks race against unregister_fair_sched_group()
+ *
+ * The race is harmless, since modifying bandwidth settings of unhooked group
+ * bits doesn't do much.
+ */
+
+/* cpu online callback */
static void __maybe_unused update_runtime_enabled(struct rq *rq)
{
- struct cfs_rq *cfs_rq;
+ struct task_group *tg;
- for_each_leaf_cfs_rq(rq, cfs_rq) {
- struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+ lockdep_assert_held(&rq->lock);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
raw_spin_lock(&cfs_b->lock);
cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
raw_spin_unlock(&cfs_b->lock);
}
+ rcu_read_unlock();
}
+/* cpu offline callback */
static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
- struct cfs_rq *cfs_rq;
+ struct task_group *tg;
+
+ lockdep_assert_held(&rq->lock);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- for_each_leaf_cfs_rq(rq, cfs_rq) {
if (!cfs_rq->runtime_enabled)
continue;
@@ -4677,6 +4759,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
}
+ rcu_read_unlock();
}
#else /* CONFIG_CFS_BANDWIDTH */
@@ -5215,126 +5298,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
return 0;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * effective_load() calculates the load change as seen from the root_task_group
- *
- * Adding load to a group doesn't make a group heavier, but can cause movement
- * of group shares between cpus. Assuming the shares were perfectly aligned one
- * can calculate the shift in shares.
- *
- * Calculate the effective load difference if @wl is added (subtracted) to @tg
- * on this @cpu and results in a total addition (subtraction) of @wg to the
- * total group weight.
- *
- * Given a runqueue weight distribution (rw_i) we can compute a shares
- * distribution (s_i) using:
- *
- * s_i = rw_i / \Sum rw_j (1)
- *
- * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
- * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
- * shares distribution (s_i):
- *
- * rw_i = { 2, 4, 1, 0 }
- * s_i = { 2/7, 4/7, 1/7, 0 }
- *
- * As per wake_affine() we're interested in the load of two CPUs (the CPU the
- * task used to run on and the CPU the waker is running on), we need to
- * compute the effect of waking a task on either CPU and, in case of a sync
- * wakeup, compute the effect of the current task going to sleep.
- *
- * So for a change of @wl to the local @cpu with an overall group weight change
- * of @wl we can compute the new shares distribution (s'_i) using:
- *
- * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
- *
- * Suppose we're interested in CPUs 0 and 1, and want to compute the load
- * differences in waking a task to CPU 0. The additional task changes the
- * weight and shares distributions like:
- *
- * rw'_i = { 3, 4, 1, 0 }
- * s'_i = { 3/8, 4/8, 1/8, 0 }
- *
- * We can then compute the difference in effective weight by using:
- *
- * dw_i = S * (s'_i - s_i) (3)
- *
- * Where 'S' is the group weight as seen by its parent.
- *
- * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
- * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
- * 4/7) times the weight of the group.
- */
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
-{
- struct sched_entity *se = tg->se[cpu];
-
- if (!tg->parent) /* the trivial, non-cgroup case */
- return wl;
-
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = se->my_q;
- long W, w = cfs_rq_load_avg(cfs_rq);
-
- tg = cfs_rq->tg;
-
- /*
- * W = @wg + \Sum rw_j
- */
- W = wg + atomic_long_read(&tg->load_avg);
-
- /* Ensure \Sum rw_j >= rw_i */
- W -= cfs_rq->tg_load_avg_contrib;
- W += w;
-
- /*
- * w = rw_i + @wl
- */
- w += wl;
-
- /*
- * wl = S * s'_i; see (2)
- */
- if (W > 0 && w < W)
- wl = (w * (long)scale_load_down(tg->shares)) / W;
- else
- wl = scale_load_down(tg->shares);
-
- /*
- * Per the above, wl is the new se->load.weight value; since
- * those are clipped to [MIN_SHARES, ...) do so now. See
- * calc_cfs_shares().
- */
- if (wl < MIN_SHARES)
- wl = MIN_SHARES;
-
- /*
- * wl = dw_i = S * (s'_i - s_i); see (3)
- */
- wl -= se->avg.load_avg;
-
- /*
- * Recursively apply this logic to all parent groups to compute
- * the final effective load change on the root group. Since
- * only the @tg group gets extra weight, all parent groups can
- * only redistribute existing shares. @wl is the shift in shares
- * resulting from this level per the above.
- */
- wg = 0;
- }
-
- return wl;
-}
-#else
-
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
-{
- return wl;
-}
-
-#endif
-
static void record_wakee(struct task_struct *p)
{
/*
@@ -5385,67 +5348,25 @@ static int wake_wide(struct task_struct *p)
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
- s64 this_load, load;
- s64 this_eff_load, prev_eff_load;
- int idx, this_cpu;
- struct task_group *tg;
- unsigned long weight;
- int balanced;
-
- idx = sd->wake_idx;
- this_cpu = smp_processor_id();
- load = source_load(prev_cpu, idx);
- this_load = target_load(this_cpu, idx);
-
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current CPU:
- */
- if (sync) {
- tg = task_group(current);
- weight = current->se.avg.load_avg;
-
- this_load += effective_load(tg, this_cpu, -weight, -weight);
- load += effective_load(tg, prev_cpu, 0, -weight);
- }
-
- tg = task_group(p);
- weight = p->se.avg.load_avg;
+ int this_cpu = smp_processor_id();
+ bool affine = false;
/*
- * In low-load situations, where prev_cpu is idle and this_cpu is idle
- * due to the sync cause above having dropped this_load to 0, we'll
- * always have an imbalance, but there's really nothing you can do
- * about that, so that's good too.
- *
- * Otherwise check if either cpus are near enough in load to allow this
- * task to be woken on this_cpu.
+ * Common case: CPUs are in the same socket, and select_idle_sibling()
+ * will do its thing regardless of what we return:
*/
- this_eff_load = 100;
- this_eff_load *= capacity_of(prev_cpu);
-
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= capacity_of(this_cpu);
-
- if (this_load > 0) {
- this_eff_load *= this_load +
- effective_load(tg, this_cpu, weight, weight);
-
- prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
- }
-
- balanced = this_eff_load <= prev_eff_load;
+ if (cpus_share_cache(prev_cpu, this_cpu))
+ affine = true;
+ else
+ affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
+ if (affine) {
+ schedstat_inc(sd->ttwu_move_affine);
+ schedstat_inc(p->se.statistics.nr_wakeups_affine);
+ }
- if (!balanced)
- return 0;
-
- schedstat_inc(sd->ttwu_move_affine);
- schedstat_inc(p->se.statistics.nr_wakeups_affine);
-
- return 1;
+ return affine;
}
static inline int task_util(struct task_struct *p);
@@ -5484,12 +5405,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int i;
/* Skip over this group if it has no CPUs allowed */
- if (!cpumask_intersects(sched_group_cpus(group),
+ if (!cpumask_intersects(sched_group_span(group),
&p->cpus_allowed))
continue;
local_group = cpumask_test_cpu(this_cpu,
- sched_group_cpus(group));
+ sched_group_span(group));
/*
* Tally up the load of all CPUs in the group and find
@@ -5499,7 +5420,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
runnable_load = 0;
max_spare_cap = 0;
- for_each_cpu(i, sched_group_cpus(group)) {
+ for_each_cpu(i, sched_group_span(group)) {
/* Bias balancing toward cpus of our domain */
if (local_group)
load = source_load(i, load_idx);
@@ -5602,10 +5523,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
/* Check if we have any choice: */
if (group->group_weight == 1)
- return cpumask_first(sched_group_cpus(group));
+ return cpumask_first(sched_group_span(group));
/* Traverse only the allowed CPUs */
- for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
+ for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
if (idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
@@ -5640,43 +5561,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
-/*
- * Implement a for_each_cpu() variant that starts the scan at a given cpu
- * (@start), and wraps around.
- *
- * This is used to scan for idle CPUs; such that not all CPUs looking for an
- * idle CPU find the same CPU. The down-side is that tasks tend to cycle
- * through the LLC domain.
- *
- * Especially tbench is found sensitive to this.
- */
-
-static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
-{
- int next;
-
-again:
- next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
-
- if (*wrapped) {
- if (next >= start)
- return nr_cpumask_bits;
- } else {
- if (next >= nr_cpumask_bits) {
- *wrapped = 1;
- n = -1;
- goto again;
- }
- }
-
- return next;
-}
-
-#define for_each_cpu_wrap(cpu, mask, start, wrap) \
- for ((wrap) = 0, (cpu) = (start)-1; \
- (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
- (cpu) < nr_cpumask_bits; )
-
#ifdef CONFIG_SCHED_SMT
static inline void set_idle_cores(int cpu, int val)
@@ -5736,7 +5620,7 @@ unlock:
static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
- int core, cpu, wrap;
+ int core, cpu;
if (!static_branch_likely(&sched_smt_present))
return -1;
@@ -5746,7 +5630,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
- for_each_cpu_wrap(core, cpus, target, wrap) {
+ for_each_cpu_wrap(core, cpus, target) {
bool idle = true;
for_each_cpu(cpu, cpu_smt_mask(core)) {
@@ -5809,27 +5693,38 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
struct sched_domain *this_sd;
- u64 avg_cost, avg_idle = this_rq()->avg_idle;
+ u64 avg_cost, avg_idle;
u64 time, cost;
s64 delta;
- int cpu, wrap;
+ int cpu, nr = INT_MAX;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
return -1;
- avg_cost = this_sd->avg_scan_cost;
-
/*
* Due to large variance we need a large fuzz factor; hackbench in
* particular is sensitive here.
*/
- if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost)
+ avg_idle = this_rq()->avg_idle / 512;
+ avg_cost = this_sd->avg_scan_cost + 1;
+
+ if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
return -1;
+ if (sched_feat(SIS_PROP)) {
+ u64 span_avg = sd->span_weight * avg_idle;
+ if (span_avg > 4*avg_cost)
+ nr = div_u64(span_avg, avg_cost);
+ else
+ nr = 4;
+ }
+
time = local_clock();
- for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+ for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+ if (!--nr)
+ return -1;
if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
if (idle_cpu(cpu))
@@ -6011,11 +5906,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (affine_sd) {
sd = NULL; /* Prefer wake_affine over balance flags */
- if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
+ if (cpu == prev_cpu)
+ goto pick_cpu;
+
+ if (wake_affine(affine_sd, p, prev_cpu, sync))
new_cpu = cpu;
}
if (!sd) {
+ pick_cpu:
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
@@ -6168,8 +6067,11 @@ static void set_last_buddy(struct sched_entity *se)
if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
return;
- for_each_sched_entity(se)
+ for_each_sched_entity(se) {
+ if (SCHED_WARN_ON(!se->on_rq))
+ return;
cfs_rq_of(se)->last = se;
+ }
}
static void set_next_buddy(struct sched_entity *se)
@@ -6177,8 +6079,11 @@ static void set_next_buddy(struct sched_entity *se)
if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
return;
- for_each_sched_entity(se)
+ for_each_sched_entity(se) {
+ if (SCHED_WARN_ON(!se->on_rq))
+ return;
cfs_rq_of(se)->next = se;
+ }
}
static void set_skip_buddy(struct sched_entity *se)
@@ -6686,6 +6591,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
if (dst_nid == p->numa_preferred_nid)
return 0;
+ /* Leaving a core idle is often worse than degrading locality. */
+ if (env->idle != CPU_NOT_IDLE)
+ return -1;
+
if (numa_group) {
src_faults = group_faults(p, src_nid);
dst_faults = group_faults(p, dst_nid);
@@ -6970,10 +6879,28 @@ static void attach_tasks(struct lb_env *env)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->load.weight)
+ return false;
+
+ if (cfs_rq->avg.load_sum)
+ return false;
+
+ if (cfs_rq->avg.util_sum)
+ return false;
+
+ if (cfs_rq->runnable_load_sum)
+ return false;
+
+ return true;
+}
+
static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *pos;
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
@@ -6983,7 +6910,7 @@ static void update_blocked_averages(int cpu)
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
- for_each_leaf_cfs_rq(rq, cfs_rq) {
+ for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
struct sched_entity *se;
/* throttled entities do not contribute to load */
@@ -6997,6 +6924,13 @@ static void update_blocked_averages(int cpu)
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
update_load_avg(se, 0);
+
+ /*
+ * There can be a lot of idle CPU cgroups. Don't let fully
+ * decayed cfs_rqs linger on the list.
+ */
+ if (cfs_rq_is_decayed(cfs_rq))
+ list_del_leaf_cfs_rq(cfs_rq);
}
rq_unlock_irqrestore(rq, &rf);
}
@@ -7229,7 +7163,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
* span the current group.
*/
- for_each_cpu(cpu, sched_group_cpus(sdg)) {
+ for_each_cpu(cpu, sched_group_span(sdg)) {
struct sched_group_capacity *sgc;
struct rq *rq = cpu_rq(cpu);
@@ -7408,7 +7342,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
memset(sgs, 0, sizeof(*sgs));
- for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
+ for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
/* Bias balancing toward cpus of our domain */
@@ -7572,7 +7506,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
- local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
if (local_group) {
sds->local = sg;
sgs = local;
@@ -7927,7 +7861,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
unsigned long busiest_load = 0, busiest_capacity = 1;
int i;
- for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
+ for_each_cpu_and(i, sched_group_span(group), env->cpus) {
unsigned long capacity, wl;
enum fbq_type rt;
@@ -8033,7 +7967,6 @@ static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
- struct cpumask *sg_cpus, *sg_mask;
int cpu, balance_cpu = -1;
/*
@@ -8043,11 +7976,9 @@ static int should_we_balance(struct lb_env *env)
if (env->idle == CPU_NEWLY_IDLE)
return 1;
- sg_cpus = sched_group_cpus(sg);
- sg_mask = sched_group_mask(sg);
/* Try to find first idle cpu */
- for_each_cpu_and(cpu, sg_cpus, env->cpus) {
- if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+ for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
+ if (!idle_cpu(cpu))
continue;
balance_cpu = cpu;
@@ -8083,7 +8014,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
- .dst_grpmask = sched_group_cpus(sd->groups),
+ .dst_grpmask = sched_group_span(sd->groups),
.idle = idle,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
@@ -8659,6 +8590,10 @@ void nohz_balance_enter_idle(int cpu)
if (!cpu_active(cpu))
return;
+ /* Spare idle load balancing on CPUs that don't want to be disturbed: */
+ if (!is_housekeeping_cpu(cpu))
+ return;
+
if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
return;
@@ -9523,10 +9458,10 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *pos;
rcu_read_lock();
- for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+ for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
print_cfs_rq(m, cpu, cfs_rq);
rcu_read_unlock();
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 11192e0cb122..d3fb15555291 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -55,6 +55,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
* When doing wakeups, attempt to limit superfluous scans of the LLC domain.
*/
SCHED_FEAT(SIS_AVG_CPU, false)
+SCHED_FEAT(SIS_PROP, true)
/*
* Issue a WARN when we do multiple update_rq_clock() calls
@@ -76,7 +77,6 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
SCHED_FEAT(RT_PUSH_IPI, true)
#endif
-SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index ef63adce0c9c..6c23e30c0e5c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -219,6 +219,7 @@ static void do_idle(void)
*/
__current_set_polling();
+ quiet_vmstat();
tick_nohz_idle_enter();
while (!need_resched()) {
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index f15fb2bdbc0d..f14716a3522f 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -117,7 +117,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
* load-average relies on per-cpu sampling from the tick, it is affected by
* NO_HZ.
*
- * The basic idea is to fold the nr_active delta into a global idle-delta upon
+ * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
* entering NO_HZ state such that we can include this as an 'extra' cpu delta
* when we read the global state.
*
@@ -126,7 +126,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
* - When we go NO_HZ idle during the window, we can negate our sample
* contribution, causing under-accounting.
*
- * We avoid this by keeping two idle-delta counters and flipping them
+ * We avoid this by keeping two NO_HZ-delta counters and flipping them
* when the window starts, thus separating old and new NO_HZ load.
*
* The only trick is the slight shift in index flip for read vs write.
@@ -137,22 +137,22 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
* r:0 0 1 1 0 0 1 1 0
* w:0 1 1 0 0 1 1 0 0
*
- * This ensures we'll fold the old idle contribution in this window while
+ * This ensures we'll fold the old NO_HZ contribution in this window while
* accumulating the new one.
*
- * - When we wake up from NO_HZ idle during the window, we push up our
+ * - When we wake up from NO_HZ during the window, we push up our
* contribution, since we effectively move our sample point to a known
* busy state.
*
* This is solved by pushing the window forward, and thus skipping the
- * sample, for this cpu (effectively using the idle-delta for this cpu which
+ * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which
* was in effect at the time the window opened). This also solves the issue
- * of having to deal with a cpu having been in NOHZ idle for multiple
- * LOAD_FREQ intervals.
+ * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ
+ * intervals.
*
* When making the ILB scale, we should try to pull this in as well.
*/
-static atomic_long_t calc_load_idle[2];
+static atomic_long_t calc_load_nohz[2];
static int calc_load_idx;
static inline int calc_load_write_idx(void)
@@ -167,7 +167,7 @@ static inline int calc_load_write_idx(void)
/*
* If the folding window started, make sure we start writing in the
- * next idle-delta.
+ * next NO_HZ-delta.
*/
if (!time_before(jiffies, READ_ONCE(calc_load_update)))
idx++;
@@ -180,24 +180,24 @@ static inline int calc_load_read_idx(void)
return calc_load_idx & 1;
}
-void calc_load_enter_idle(void)
+void calc_load_nohz_start(void)
{
struct rq *this_rq = this_rq();
long delta;
/*
- * We're going into NOHZ mode, if there's any pending delta, fold it
- * into the pending idle delta.
+ * We're going into NO_HZ mode, if there's any pending delta, fold it
+ * into the pending NO_HZ delta.
*/
delta = calc_load_fold_active(this_rq, 0);
if (delta) {
int idx = calc_load_write_idx();
- atomic_long_add(delta, &calc_load_idle[idx]);
+ atomic_long_add(delta, &calc_load_nohz[idx]);
}
}
-void calc_load_exit_idle(void)
+void calc_load_nohz_stop(void)
{
struct rq *this_rq = this_rq();
@@ -217,13 +217,13 @@ void calc_load_exit_idle(void)
this_rq->calc_load_update += LOAD_FREQ;
}
-static long calc_load_fold_idle(void)
+static long calc_load_nohz_fold(void)
{
int idx = calc_load_read_idx();
long delta = 0;
- if (atomic_long_read(&calc_load_idle[idx]))
- delta = atomic_long_xchg(&calc_load_idle[idx], 0);
+ if (atomic_long_read(&calc_load_nohz[idx]))
+ delta = atomic_long_xchg(&calc_load_nohz[idx], 0);
return delta;
}
@@ -299,9 +299,9 @@ calc_load_n(unsigned long load, unsigned long exp,
/*
* NO_HZ can leave us missing all per-cpu ticks calling
- * calc_load_account_active(), but since an idle CPU folds its delta into
- * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
- * in the pending idle delta if our idle period crossed a load cycle boundary.
+ * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
+ * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
+ * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
*
* Once we've updated the global active value, we need to apply the exponential
* weights adjusted to the number of cycles missed.
@@ -330,7 +330,7 @@ static void calc_global_nohz(void)
}
/*
- * Flip the idle index...
+ * Flip the NO_HZ index...
*
* Make sure we first write the new time then flip the index, so that
* calc_load_write_idx() will see the new time when it reads the new
@@ -341,7 +341,7 @@ static void calc_global_nohz(void)
}
#else /* !CONFIG_NO_HZ_COMMON */
-static inline long calc_load_fold_idle(void) { return 0; }
+static inline long calc_load_nohz_fold(void) { return 0; }
static inline void calc_global_nohz(void) { }
#endif /* CONFIG_NO_HZ_COMMON */
@@ -362,9 +362,9 @@ void calc_global_load(unsigned long ticks)
return;
/*
- * Fold the 'old' idle-delta to include all NO_HZ cpus.
+ * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus.
*/
- delta = calc_load_fold_idle();
+ delta = calc_load_nohz_fold();
if (delta)
atomic_long_add(delta, &calc_load_tasks);
@@ -378,7 +378,8 @@ void calc_global_load(unsigned long ticks)
WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);
/*
- * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
+ * In case we went to NO_HZ for multiple LOAD_FREQ intervals
+ * catch up in bulk.
*/
calc_global_nohz();
}
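
The read/write index selection described in the table above can be exercised in isolation. A hedged user-space sketch (jiffies and calc_load_update reduced to plain integers; the flip itself is what calc_global_nohz() does after folding):

#include <stdio.h>

static int calc_load_idx;		/* flipped once per LOAD_FREQ window */

/* Readers always use the "current" slot... */
static int read_idx(void)  { return calc_load_idx & 1; }

/* ...writers move to the next slot once the fold window has opened. */
static int write_idx(long now, long window_start)
{
	int idx = calc_load_idx;

	if (now >= window_start)	/* window started: write into the next slot */
		idx++;
	return idx & 1;
}

int main(void)
{
	long window = 10;

	for (long now = 8; now <= 12; now++)
		printf("t=%ld read=%d write=%d\n", now, read_idx(), write_idx(now, window));

	calc_load_idx++;		/* the flip done after folding the old delta */
	printf("after flip: read=%d write=%d\n", read_idx(), write_idx(12, 20));
	return 0;
}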
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 979b7341008a..45caf937ef90 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -840,6 +840,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
struct rq *rq = rq_of_rt_rq(rt_rq);
+ int skip;
+
+ /*
+ * When span == cpu_online_mask, taking each rq->lock
+ * can be time-consuming. Try to avoid it when possible.
+ */
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ if (skip)
+ continue;
raw_spin_lock(&rq->lock);
if (rt_rq->rt_time) {
@@ -1819,7 +1830,7 @@ retry:
* pushing.
*/
task = pick_next_pushable_task(rq);
- if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ if (task == next_task) {
/*
* The task hasn't migrated, and is still the next
* eligible task, but we failed to find a run-queue
@@ -2438,6 +2449,316 @@ const struct sched_class rt_sched_class = {
.update_curr = update_curr_rt,
};
+#ifdef CONFIG_RT_GROUP_SCHED
+/*
+ * Ensure that the real time constraints are schedulable.
+ */
+static DEFINE_MUTEX(rt_constraints_mutex);
+
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
+{
+ struct task_struct *g, *p;
+
+ /*
+ * Autogroups do not have RT tasks; see autogroup_create().
+ */
+ if (task_group_is_autogroup(tg))
+ return 0;
+
+ for_each_process_thread(g, p) {
+ if (rt_task(p) && task_group(p) == tg)
+ return 1;
+ }
+
+ return 0;
+}
+
+struct rt_schedulable_data {
+ struct task_group *tg;
+ u64 rt_period;
+ u64 rt_runtime;
+};
+
+static int tg_rt_schedulable(struct task_group *tg, void *data)
+{
+ struct rt_schedulable_data *d = data;
+ struct task_group *child;
+ unsigned long total, sum = 0;
+ u64 period, runtime;
+
+ period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ runtime = tg->rt_bandwidth.rt_runtime;
+
+ if (tg == d->tg) {
+ period = d->rt_period;
+ runtime = d->rt_runtime;
+ }
+
+ /*
+ * Cannot have more runtime than the period.
+ */
+ if (runtime > period && runtime != RUNTIME_INF)
+ return -EINVAL;
+
+ /*
+ * Ensure we don't starve existing RT tasks.
+ */
+ if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+ return -EBUSY;
+
+ total = to_ratio(period, runtime);
+
+ /*
+ * Nobody can have more than the global setting allows.
+ */
+ if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+ return -EINVAL;
+
+ /*
+ * The sum of our children's runtime should not exceed our own.
+ */
+ list_for_each_entry_rcu(child, &tg->children, siblings) {
+ period = ktime_to_ns(child->rt_bandwidth.rt_period);
+ runtime = child->rt_bandwidth.rt_runtime;
+
+ if (child == d->tg) {
+ period = d->rt_period;
+ runtime = d->rt_runtime;
+ }
+
+ sum += to_ratio(period, runtime);
+ }
+
+ if (sum > total)
+ return -EINVAL;
+
+ return 0;
+}
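
Concretely, every group's (period, runtime) pair becomes a fixed-point utilization via to_ratio(), and the children's sum may not exceed the group's own value. A hedged numeric sketch, reusing the same to_ratio() stand-in as the deadline example earlier:

#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT 20

static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return period ? (runtime << BW_SHIFT) / period : 0;
}

int main(void)
{
	/* Parent group: 500ms runtime every 1s -> 0.5 of a CPU. */
	uint64_t parent = to_ratio(1000000000ULL, 500000000ULL);

	/* Two children asking for 0.30 and 0.25 of a CPU. */
	uint64_t sum = to_ratio(1000000000ULL, 300000000ULL) +
		       to_ratio(1000000000ULL, 250000000ULL);

	/* 0.55 > 0.50: tg_rt_schedulable() would reject the second child. */
	printf("parent=%llu children=%llu ok=%d\n",
	       (unsigned long long)parent, (unsigned long long)sum, sum <= parent);
	return 0;
}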
+
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+{
+ int ret;
+
+ struct rt_schedulable_data data = {
+ .tg = tg,
+ .rt_period = period,
+ .rt_runtime = runtime,
+ };
+
+ rcu_read_lock();
+ ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int tg_set_rt_bandwidth(struct task_group *tg,
+ u64 rt_period, u64 rt_runtime)
+{
+ int i, err = 0;
+
+ /*
+ * Disallowing the root group RT runtime is BAD; it would prevent the
+ * kernel from creating (and/or operating) RT threads.
+ */
+ if (tg == &root_task_group && rt_runtime == 0)
+ return -EINVAL;
+
+ /* No period doesn't make any sense. */
+ if (rt_period == 0)
+ return -EINVAL;
+
+ mutex_lock(&rt_constraints_mutex);
+ read_lock(&tasklist_lock);
+ err = __rt_schedulable(tg, rt_period, rt_runtime);
+ if (err)
+ goto unlock;
+
+ raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
+ tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
+ tg->rt_bandwidth.rt_runtime = rt_runtime;
+
+ for_each_possible_cpu(i) {
+ struct rt_rq *rt_rq = tg->rt_rq[i];
+
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ rt_rq->rt_runtime = rt_runtime;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ }
+ raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
+unlock:
+ read_unlock(&tasklist_lock);
+ mutex_unlock(&rt_constraints_mutex);
+
+ return err;
+}
+
+int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+{
+ u64 rt_runtime, rt_period;
+
+ rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
+ if (rt_runtime_us < 0)
+ rt_runtime = RUNTIME_INF;
+
+ return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
+}
+
+long sched_group_rt_runtime(struct task_group *tg)
+{
+ u64 rt_runtime_us;
+
+ if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
+ return -1;
+
+ rt_runtime_us = tg->rt_bandwidth.rt_runtime;
+ do_div(rt_runtime_us, NSEC_PER_USEC);
+ return rt_runtime_us;
+}
+
+int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
+{
+ u64 rt_runtime, rt_period;
+
+ rt_period = rt_period_us * NSEC_PER_USEC;
+ rt_runtime = tg->rt_bandwidth.rt_runtime;
+
+ return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
+}
+
+long sched_group_rt_period(struct task_group *tg)
+{
+ u64 rt_period_us;
+
+ rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ do_div(rt_period_us, NSEC_PER_USEC);
+ return rt_period_us;
+}
+
+static int sched_rt_global_constraints(void)
+{
+ int ret = 0;
+
+ mutex_lock(&rt_constraints_mutex);
+ read_lock(&tasklist_lock);
+ ret = __rt_schedulable(NULL, 0, 0);
+ read_unlock(&tasklist_lock);
+ mutex_unlock(&rt_constraints_mutex);
+
+ return ret;
+}
+
+int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+{
+ /* Don't accept realtime tasks when there is no way for them to run */
+ if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+ return 0;
+
+ return 1;
+}
+
+#else /* !CONFIG_RT_GROUP_SCHED */
+static int sched_rt_global_constraints(void)
+{
+ unsigned long flags;
+ int i;
+
+ raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
+ for_each_possible_cpu(i) {
+ struct rt_rq *rt_rq = &cpu_rq(i)->rt;
+
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ rt_rq->rt_runtime = global_rt_runtime();
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ }
+ raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
+
+ return 0;
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static int sched_rt_global_validate(void)
+{
+ if (sysctl_sched_rt_period <= 0)
+ return -EINVAL;
+
+ if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
+ (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void sched_rt_do_global(void)
+{
+ def_rt_bandwidth.rt_runtime = global_rt_runtime();
+ def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
+}
+
+int sched_rt_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int old_period, old_runtime;
+ static DEFINE_MUTEX(mutex);
+ int ret;
+
+ mutex_lock(&mutex);
+ old_period = sysctl_sched_rt_period;
+ old_runtime = sysctl_sched_rt_runtime;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (!ret && write) {
+ ret = sched_rt_global_validate();
+ if (ret)
+ goto undo;
+
+ ret = sched_dl_global_validate();
+ if (ret)
+ goto undo;
+
+ ret = sched_rt_global_constraints();
+ if (ret)
+ goto undo;
+
+ sched_rt_do_global();
+ sched_dl_do_global();
+ }
+ if (0) {
+undo:
+ sysctl_sched_rt_period = old_period;
+ sysctl_sched_rt_runtime = old_runtime;
+ }
+ mutex_unlock(&mutex);
+
+ return ret;
+}
+
+int sched_rr_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ static DEFINE_MUTEX(mutex);
+
+ mutex_lock(&mutex);
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ /*
+ * Make sure that internally we keep jiffies.
+ * Also, writing zero resets the timeslice to default:
+ */
+ if (!ret && write) {
+ sched_rr_timeslice =
+ sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
+ msecs_to_jiffies(sysctl_sched_rr_timeslice);
+ }
+ mutex_unlock(&mutex);
+ return ret;
+}
+
#ifdef CONFIG_SCHED_DEBUG
extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6dda2aab731e..eeef1a3086d1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -39,9 +39,9 @@
#include "cpuacct.h"
#ifdef CONFIG_SCHED_DEBUG
-#define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
+# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
#else
-#define SCHED_WARN_ON(x) ((void)(x))
+# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
#endif
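
The new !SCHED_DEBUG stub is a GNU statement expression that still evaluates to 0, which is what lets call sites such as set_next_buddy() above write "if (SCHED_WARN_ON(!se->on_rq)) return;" in both configurations. A minimal sketch of the difference (illustrative, not kernel code):

#include <stdio.h>

/* The !SCHED_DEBUG stub: evaluates x for side effects, yields 0 (GNU extension). */
#define SCHED_WARN_ON(x) ({ (void)(x), 0; })

int main(void)
{
	int on_rq = 0;

	/* With the old ((void)(x)) stub this condition would not even compile. */
	if (SCHED_WARN_ON(!on_rq))
		printf("bail out\n");
	else
		printf("debug disabled: warning compiled away, carry on\n");
	return 0;
}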
struct rq;
@@ -218,23 +218,25 @@ static inline int dl_bandwidth_enabled(void)
return sysctl_sched_rt_runtime >= 0;
}
-extern struct dl_bw *dl_bw_of(int i);
-
struct dl_bw {
raw_spinlock_t lock;
u64 bw, total_bw;
};
+static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
+
static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
{
dl_b->total_bw -= tsk_bw;
+ __dl_update(dl_b, (s32)tsk_bw / cpus);
}
static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
{
dl_b->total_bw += tsk_bw;
+ __dl_update(dl_b, -((s32)tsk_bw / cpus));
}
static inline
@@ -244,7 +246,22 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}
+void dl_change_utilization(struct task_struct *p, u64 new_bw);
extern void init_dl_bw(struct dl_bw *dl_b);
+extern int sched_dl_global_validate(void);
+extern void sched_dl_do_global(void);
+extern int sched_dl_overflow(struct task_struct *p, int policy,
+ const struct sched_attr *attr);
+extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
+extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
+extern bool __checkparam_dl(const struct sched_attr *attr);
+extern void __dl_clear_params(struct task_struct *p);
+extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
+extern int dl_task_can_attach(struct task_struct *p,
+ const struct cpumask *cs_cpus_allowed);
+extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
+ const struct cpumask *trial);
+extern bool dl_cpu_busy(unsigned int cpu);
#ifdef CONFIG_CGROUP_SCHED
@@ -366,6 +383,11 @@ extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent
extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
struct sched_rt_entity *rt_se, int cpu,
struct sched_rt_entity *parent);
+extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us);
+extern int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us);
+extern long sched_group_rt_runtime(struct task_group *tg);
+extern long sched_group_rt_period(struct task_group *tg);
+extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
extern struct task_group *sched_create_group(struct task_group *parent);
extern void sched_online_group(struct task_group *tg,
@@ -558,6 +580,30 @@ struct dl_rq {
#else
struct dl_bw dl_bw;
#endif
+ /*
+ * "Active utilization" for this runqueue: increased when a
+ * task wakes up (becomes TASK_RUNNING) and decreased when a
+ * task blocks
+ */
+ u64 running_bw;
+
+ /*
+ * Utilization of the tasks "assigned" to this runqueue (including
+ * the tasks that are in runqueue and the tasks that executed on this
+ * CPU and blocked). Increased when a task moves to this runqueue, and
+ * decreased when the task moves away (migrates, changes scheduling
+ * policy, or terminates).
+ * This is needed to compute the "inactive utilization" for the
+ * runqueue (inactive utilization = this_bw - running_bw).
+ */
+ u64 this_bw;
+ u64 extra_bw;
+
+ /*
+ * Inverse of the fraction of CPU utilization that can be reclaimed
+ * by the GRUB algorithm.
+ */
+ u64 bw_ratio;
};
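
These fields drive GRUB runtime reclaiming in deadline.c: the wall-clock delta consumed by a running task is charged in proportion to the bandwidth that is neither inactive (this_bw - running_bw) nor spare (extra_bw). The following is a deliberately simplified, hedged sketch of that scaling; the real grub_reclaim() additionally clamps with bw_ratio so a task never reclaims past the configured maximum utilization:

#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT 20
#define BW_UNIT  (1 << BW_SHIFT)

/* All bandwidths are BW_SHIFT-bit fixed-point fractions of one CPU. */
static uint64_t grub_scale(uint64_t delta, uint64_t this_bw,
			   uint64_t running_bw, uint64_t extra_bw)
{
	uint64_t u_inact = this_bw - running_bw;	/* admitted but currently blocked */
	uint64_t u_act;

	if (u_inact + extra_bw >= BW_UNIT)
		u_act = 0;				/* the kernel clamps this via bw_ratio */
	else
		u_act = BW_UNIT - u_inact - extra_bw;

	return (delta * u_act) >> BW_SHIFT;		/* runtime actually charged */
}

int main(void)
{
	/* 0.6 CPU admitted, 0.4 currently running, 0.2 spare in the root domain. */
	uint64_t charged = grub_scale(1000000, 6 * BW_UNIT / 10,
				      4 * BW_UNIT / 10, 2 * BW_UNIT / 10);

	/* Roughly 600000: 1ms of wall clock costs ~0.6ms of reserved runtime. */
	printf("1ms of runtime is charged as %lluns\n", (unsigned long long)charged);
	return 0;
}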
#ifdef CONFIG_SMP
@@ -606,11 +652,9 @@ struct root_domain {
extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;
-extern cpumask_var_t fallback_doms;
-extern cpumask_var_t sched_domains_tmpmask;
extern void init_defrootdomain(void);
-extern int init_sched_domains(const struct cpumask *cpu_map);
+extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
#endif /* CONFIG_SMP */
@@ -1025,7 +1069,11 @@ struct sched_group_capacity {
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
- unsigned long cpumask[0]; /* iteration mask */
+#ifdef CONFIG_SCHED_DEBUG
+ int id;
+#endif
+
+ unsigned long cpumask[0]; /* balance mask */
};
struct sched_group {
@@ -1046,16 +1094,15 @@ struct sched_group {
unsigned long cpumask[0];
};
-static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+static inline struct cpumask *sched_group_span(struct sched_group *sg)
{
return to_cpumask(sg->cpumask);
}
/*
- * cpumask masking which cpus in the group are allowed to iterate up the domain
- * tree.
+ * See build_balance_mask().
*/
-static inline struct cpumask *sched_group_mask(struct sched_group *sg)
+static inline struct cpumask *group_balance_mask(struct sched_group *sg)
{
return to_cpumask(sg->sgc->cpumask);
}
@@ -1066,7 +1113,7 @@ static inline struct cpumask *sched_group_mask(struct sched_group *sg)
*/
static inline unsigned int group_first_cpu(struct sched_group *group)
{
- return cpumask_first(sched_group_cpus(group));
+ return cpumask_first(sched_group_span(group));
}
extern int group_balance_cpu(struct sched_group *sg);
@@ -1422,7 +1469,11 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
curr->sched_class->set_curr_task(rq);
}
+#ifdef CONFIG_SMP
#define sched_class_highest (&stop_sched_class)
+#else
+#define sched_class_highest (&dl_sched_class)
+#endif
#define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next)
@@ -1486,7 +1537,12 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
extern struct dl_bandwidth def_dl_bandwidth;
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
+extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
+extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
+#define BW_SHIFT 20
+#define BW_UNIT (1 << BW_SHIFT)
+#define RATIO_SHIFT 8
unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
@@ -1928,6 +1984,33 @@ extern void nohz_balance_exit_idle(unsigned int cpu);
static inline void nohz_balance_exit_idle(unsigned int cpu) { }
#endif
+
+#ifdef CONFIG_SMP
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+ struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
+ int i;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+ for_each_cpu_and(i, rd->span, cpu_active_mask) {
+ struct rq *rq = cpu_rq(i);
+
+ rq->dl.extra_bw += bw;
+ }
+}
+#else
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+ struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
+
+ dl->extra_bw += bw;
+}
+#endif
+
+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
u64 total;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 1b0b4fb12837..79895aec281e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -10,6 +10,7 @@ DEFINE_MUTEX(sched_domains_mutex);
/* Protected by sched_domains_mutex: */
cpumask_var_t sched_domains_tmpmask;
+cpumask_var_t sched_domains_tmpmask2;
#ifdef CONFIG_SCHED_DEBUG
@@ -35,7 +36,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_clear(groupmask);
- printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
+ printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
if (!(sd->flags & SD_LOAD_BALANCE)) {
printk("does not load-balance\n");
@@ -45,14 +46,14 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
return -1;
}
- printk(KERN_CONT "span %*pbl level %s\n",
+ printk(KERN_CONT "span=%*pbl level=%s\n",
cpumask_pr_args(sched_domain_span(sd)), sd->name);
if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
printk(KERN_ERR "ERROR: domain->span does not contain "
"CPU%d\n", cpu);
}
- if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
+ if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
printk(KERN_ERR "ERROR: domain->groups does not contain"
" CPU%d\n", cpu);
}
@@ -65,29 +66,47 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!cpumask_weight(sched_group_cpus(group))) {
+ if (!cpumask_weight(sched_group_span(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: empty group\n");
break;
}
if (!(sd->flags & SD_OVERLAP) &&
- cpumask_intersects(groupmask, sched_group_cpus(group))) {
+ cpumask_intersects(groupmask, sched_group_span(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: repeated CPUs\n");
break;
}
- cpumask_or(groupmask, groupmask, sched_group_cpus(group));
+ cpumask_or(groupmask, groupmask, sched_group_span(group));
- printk(KERN_CONT " %*pbl",
- cpumask_pr_args(sched_group_cpus(group)));
- if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
- printk(KERN_CONT " (cpu_capacity = %lu)",
- group->sgc->capacity);
+ printk(KERN_CONT " %d:{ span=%*pbl",
+ group->sgc->id,
+ cpumask_pr_args(sched_group_span(group)));
+
+ if ((sd->flags & SD_OVERLAP) &&
+ !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
+ printk(KERN_CONT " mask=%*pbl",
+ cpumask_pr_args(group_balance_mask(group)));
+ }
+
+ if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
+ printk(KERN_CONT " cap=%lu", group->sgc->capacity);
+
+ if (group == sd->groups && sd->child &&
+ !cpumask_equal(sched_domain_span(sd->child),
+ sched_group_span(group))) {
+ printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
}
+ printk(KERN_CONT " }");
+
group = group->next;
+
+ if (group != sd->groups)
+ printk(KERN_CONT ",");
+
} while (group != sd->groups);
printk(KERN_CONT "\n");
@@ -113,7 +132,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
return;
}
- printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+ printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
for (;;) {
if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
@@ -477,46 +496,214 @@ enum s_alloc {
};
/*
- * Build an iteration mask that can exclude certain CPUs from the upwards
- * domain traversal.
+ * Return the canonical balance CPU for this group, this is the first CPU
+ * of this group that's also in the balance mask.
*
- * Asymmetric node setups can result in situations where the domain tree is of
- * unequal depth, make sure to skip domains that already cover the entire
- * range.
+ * The balance mask is all those CPUs that could actually end up at this
+ * group. See build_balance_mask().
*
- * In that case build_sched_domains() will have terminated the iteration early
- * and our sibling sd spans will be empty. Domains should always include the
- * CPU they're built on, so check that.
+ * Also see should_we_balance().
*/
-static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+int group_balance_cpu(struct sched_group *sg)
{
- const struct cpumask *span = sched_domain_span(sd);
+ return cpumask_first(group_balance_mask(sg));
+}
+
+
+/*
+ * NUMA topology (first read the regular topology blurb below)
+ *
+ * Given a node-distance table, for example:
+ *
+ * node 0 1 2 3
+ * 0: 10 20 30 20
+ * 1: 20 10 20 30
+ * 2: 30 20 10 20
+ * 3: 20 30 20 10
+ *
+ * which represents a 4 node ring topology like:
+ *
+ * 0 ----- 1
+ * | |
+ * | |
+ * | |
+ * 3 ----- 2
+ *
+ * We want to construct domains and groups to represent this. The way we go
+ * about doing this is to build the domains on 'hops'. For each NUMA level we
+ * construct the mask of all nodes reachable in @level hops.
+ *
+ * For the above NUMA topology that gives 3 levels:
+ *
+ * NUMA-2 0-3 0-3 0-3 0-3
+ * groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2}
+ *
+ * NUMA-1 0-1,3 0-2 1-3 0,2-3
+ * groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3}
+ *
+ * NUMA-0 0 1 2 3
+ *
+ *
+ * As can be seen, things don't line up as nicely as with the regular topology.
+ * When we iterate a domain in child domain chunks some nodes can be
+ * represented multiple times -- hence the "overlap" naming for this part of
+ * the topology.
+ *
+ * In order to minimize this overlap, we only build enough groups to cover the
+ * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
+ *
+ * Because:
+ *
+ * - the first group of each domain is its child domain; this
+ * gets us the first 0-1,3
+ * - the only uncovered node is 2, whose child domain is 1-3.
+ *
+ * However, because of the overlap, computing a unique CPU for each group is
+ * more complicated. Consider for instance the groups of NODE-1 NUMA-2: both
+ * groups include the CPUs of Node-0, while those CPUs would in fact never
+ * end up at those groups (they would end up in group: 0-1,3).
+ *
+ * To correct this we have to introduce the group balance mask. This mask
+ * will contain those CPUs in the group that can reach this group given the
+ * (child) domain tree.
+ *
+ * With this we can once again compute balance_cpu and sched_group_capacity
+ * relations.
+ *
+ * XXX include words on how balance_cpu is unique and therefore can be
+ * used for sched_group_capacity links.
+ *
+ *
+ * Another 'interesting' topology is:
+ *
+ * node 0 1 2 3
+ * 0: 10 20 20 30
+ * 1: 20 10 20 20
+ * 2: 20 20 10 20
+ * 3: 30 20 20 10
+ *
+ * Which looks a little like:
+ *
+ * 0 ----- 1
+ * | / |
+ * | / |
+ * | / |
+ * 2 ----- 3
+ *
+ * This topology is asymmetric: nodes 1,2 are fully connected, but nodes 0,3
+ * are not.
+ *
+ * This leads to a few particularly weird cases where the number of
+ * sched_domains is not the same for each CPU. Consider:
+ *
+ * NUMA-2 0-3 0-3
+ * groups: {0-2},{1-3} {1-3},{0-2}
+ *
+ * NUMA-1 0-2 0-3 0-3 1-3
+ *
+ * NUMA-0 0 1 2 3
+ *
+ */
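Editorial illustration, not part of this patch: the per-hop construction described above can be reproduced in user-space C. The sketch below takes the 4-node ring distance table quoted in this comment and prints, per level, the set of nodes within the k-th smallest distance of each node; the table, node count and the small insertion sort are assumptions of the sketch, not kernel code or kernel data.

/*
 * User-space sketch: derive per-hop node masks from the ring distance table.
 * Level k contains, for each node, all nodes whose distance is at most the
 * k-th smallest distinct distance found in the table.
 */
#include <stdio.h>

#define N 4

static const int dist[N][N] = {
	{ 10, 20, 30, 20 },
	{ 20, 10, 20, 30 },
	{ 30, 20, 10, 20 },
	{ 20, 30, 20, 10 },
};

int main(void)
{
	int levels[N * N], nr_levels = 0;

	/* Collect the distinct distances, sorted ascending (10, 20, 30). */
	for (int i = 0; i < N; i++) {
		for (int j = 0; j < N; j++) {
			int d = dist[i][j], k;

			for (k = 0; k < nr_levels && levels[k] < d; k++)
				;
			if (k < nr_levels && levels[k] == d)
				continue;
			for (int m = nr_levels++; m > k; m--)
				levels[m] = levels[m - 1];
			levels[k] = d;
		}
	}

	/* Print, for each level, the nodes reachable from each node. */
	for (int k = 0; k < nr_levels; k++) {
		printf("NUMA-%d:", k);
		for (int i = 0; i < N; i++) {
			printf("  node%d={", i);
			for (int j = 0; j < N; j++)
				if (dist[i][j] <= levels[k])
					printf(" %d", j);
			printf(" }");
		}
		printf("\n");
	}
	return 0;
}

Run against the ring table this prints the NUMA-0/1/2 spans listed above (e.g. node 0 at NUMA-1 covers {0,1,3}, and every node at NUMA-2 covers {0-3}).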
+
+
+/*
+ * Build the balance mask; it contains only those CPUs that can arrive at this
+ * group and should be considered to continue balancing.
+ *
+ * We do this during the group creation pass; therefore the group information
+ * isn't complete yet. However, since each group represents a (child) domain,
+ * we can fully construct this from the sched_domain bits (which are already
+ * complete).
+ */
+static void
+build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
+{
+ const struct cpumask *sg_span = sched_group_span(sg);
struct sd_data *sdd = sd->private;
struct sched_domain *sibling;
int i;
- for_each_cpu(i, span) {
+ cpumask_clear(mask);
+
+ for_each_cpu(i, sg_span) {
sibling = *per_cpu_ptr(sdd->sd, i);
- if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+
+ /*
+ * This can happen in the asymmetric case, where these siblings are
+ * unused. The mask will not be empty because those CPUs that
+ * do have the top domain _should_ span the domain.
+ */
+ if (!sibling->child)
continue;
- cpumask_set_cpu(i, sched_group_mask(sg));
+ /* If we would not end up here, we can't continue from here */
+ if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
+ continue;
+
+ cpumask_set_cpu(i, mask);
}
+
+ /* We must not have empty masks here */
+ WARN_ON_ONCE(cpumask_empty(mask));
}
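Editorial sketch, not from the patch: the rule above applied to the earlier ring example, with one CPU per node and plain bitfields standing in for the kernel's cpumask API; the span values come straight from the NUMA-1 spans listed in the topology comment.

/* User-space illustration of the balance-mask rule (bit i = node i). */
#include <stdio.h>

#define NCPUS 4

int main(void)
{
	/* NUMA-1 child spans from the ring example:
	 * node0 = {0,1,3}, node1 = {0,1,2}, node2 = {1,2,3}, node3 = {0,2,3} */
	const unsigned child_span[NCPUS] = { 0xB, 0x7, 0xE, 0xD };
	const unsigned sg_span = 0x7;	/* node 1's first NUMA-2 group: {0,1,2} */
	unsigned mask = 0;

	for (int i = 0; i < NCPUS; i++) {
		if (!((sg_span >> i) & 1))
			continue;
		/* only CPUs whose own child span equals the group span qualify */
		if (child_span[i] == sg_span)
			mask |= 1u << i;
	}

	printf("group span 0x%x -> balance mask 0x%x\n", sg_span, mask);
	return 0;	/* prints 0x7 -> 0x2: only node 1 can arrive at this group */
}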
/*
- * Return the canonical balance CPU for this group, this is the first CPU
- * of this group that's also in the iteration mask.
+ * XXX: This creates per-node group entries; since the load-balancer will
+ * immediately access remote memory to construct this group's load-balance
+ * statistics, having the groups node-local is of dubious benefit.
*/
-int group_balance_cpu(struct sched_group *sg)
+static struct sched_group *
+build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
{
- return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+ struct sched_group *sg;
+ struct cpumask *sg_span;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(cpu));
+
+ if (!sg)
+ return NULL;
+
+ sg_span = sched_group_span(sg);
+ if (sd->child)
+ cpumask_copy(sg_span, sched_domain_span(sd->child));
+ else
+ cpumask_copy(sg_span, sched_domain_span(sd));
+
+ return sg;
+}
+
+static void init_overlap_sched_group(struct sched_domain *sd,
+ struct sched_group *sg)
+{
+ struct cpumask *mask = sched_domains_tmpmask2;
+ struct sd_data *sdd = sd->private;
+ struct cpumask *sg_span;
+ int cpu;
+
+ build_balance_mask(sd, sg, mask);
+ cpu = cpumask_first_and(sched_group_span(sg), mask);
+
+ sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+ if (atomic_inc_return(&sg->sgc->ref) == 1)
+ cpumask_copy(group_balance_mask(sg), mask);
+ else
+ WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
+
+ /*
+ * Initialize sgc->capacity such that even if we mess up the
+ * domains and no possible iteration will get us here, we won't
+ * die on a /0 trap.
+ */
+ sg_span = sched_group_span(sg);
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
}
static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
- struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+ struct sched_group *first = NULL, *last = NULL, *sg;
const struct cpumask *span = sched_domain_span(sd);
struct cpumask *covered = sched_domains_tmpmask;
struct sd_data *sdd = sd->private;
@@ -525,7 +712,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_clear(covered);
- for_each_cpu(i, span) {
+ for_each_cpu_wrap(i, span, cpu) {
struct cpumask *sg_span;
if (cpumask_test_cpu(i, covered))
@@ -533,44 +720,27 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
sibling = *per_cpu_ptr(sdd->sd, i);
- /* See the comment near build_group_mask(). */
+ /*
+ * Asymmetric node setups can result in situations where the
+ * domain tree is of unequal depth; make sure to skip domains
+ * that already cover the entire range.
+ *
+ * In that case build_sched_domains() will have terminated the
+ * iteration early and our sibling sd spans will be empty.
+ * Domains should always include the CPU they're built on, so
+ * check that.
+ */
if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
continue;
- sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(cpu));
-
+ sg = build_group_from_child_sched_domain(sibling, cpu);
if (!sg)
goto fail;
- sg_span = sched_group_cpus(sg);
- if (sibling->child)
- cpumask_copy(sg_span, sched_domain_span(sibling->child));
- else
- cpumask_set_cpu(i, sg_span);
-
+ sg_span = sched_group_span(sg);
cpumask_or(covered, covered, sg_span);
- sg->sgc = *per_cpu_ptr(sdd->sgc, i);
- if (atomic_inc_return(&sg->sgc->ref) == 1)
- build_group_mask(sd, sg);
-
- /*
- * Initialize sgc->capacity such that even if we mess up the
- * domains and no possible iteration will get us here, we won't
- * die on a /0 trap.
- */
- sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
- sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
-
- /*
- * Make sure the first group of this domain contains the
- * canonical balance CPU. Otherwise the sched_domain iteration
- * breaks. See update_sg_lb_stats().
- */
- if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
- group_balance_cpu(sg) == cpu)
- groups = sg;
+ init_overlap_sched_group(sd, sg);
if (!first)
first = sg;
@@ -579,7 +749,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
last = sg;
last->next = first;
}
- sd->groups = groups;
+ sd->groups = first;
return 0;
@@ -589,23 +759,106 @@ fail:
return -ENOMEM;
}
-static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
+
+/*
+ * Package topology (also see the load-balance blurb in fair.c)
+ *
+ * The scheduler builds a tree structure to represent a number of important
+ * topology features. By default (default_topology[]) these include:
+ *
+ * - Simultaneous multithreading (SMT)
+ * - Multi-Core Cache (MC)
+ * - Package (DIE)
+ *
+ * Where the last one more or less denotes everything up to a NUMA node.
+ *
+ * The tree consists of 3 primary data structures:
+ *
+ * sched_domain -> sched_group -> sched_group_capacity
+ * ^ ^ ^ ^
+ * `-' `-'
+ *
+ * The sched_domains are per-cpu and have a two-way link (parent & child) and
+ * denote the ever-growing mask of CPUs belonging to that level of topology.
+ *
+ * Each sched_domain has a circular (doubly) linked list of sched_groups, each
+ * denoting the domains of the level below (or individual CPUs in case of the
+ * first domain level). The sched_group linked by a sched_domain includes the
+ * CPU of that sched_domain [*].
+ *
+ * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
+ *
+ * CPU 0 1 2 3 4 5 6 7
+ *
+ * DIE [ ]
+ * MC [ ] [ ]
+ * SMT [ ] [ ] [ ] [ ]
+ *
+ * - or -
+ *
+ * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
+ * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
+ * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
+ *
+ * CPU 0 1 2 3 4 5 6 7
+ *
+ * One way to think about it is: sched_domain moves you up and down among these
+ * topology levels, while sched_group moves you sideways through it, at child
+ * domain granularity.
+ *
+ * sched_group_capacity ensures each unique sched_group has shared storage.
+ *
+ * There are two related construction problems, both of which require a CPU
+ * that uniquely identifies each group (for a given domain):
+ *
+ * - The first is the balance_cpu (see should_we_balance() and the
+ * load-balance blurb in fair.c); for each group we only want 1 CPU to
+ * continue balancing at a higher domain.
+ *
+ * - The second is the sched_group_capacity; we want all identical groups
+ * to share a single sched_group_capacity.
+ *
+ * These topologies are exclusive by construction: it is impossible for an
+ * SMT thread to belong to multiple cores, or for a core to be part of
+ * multiple caches. There is a very clear and unique location for each CPU
+ * in the hierarchy.
+ *
+ * Therefore computing a unique CPU for each group is trivial (the iteration
+ * mask is redundant and set to all 1s; all CPUs in a group will end up at
+ * _that_ group); we can simply pick the first CPU in each group.
+ *
+ *
+ * [*] in other words, the first group of each domain is its child domain.
+ */
+
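As an editorial aside, the "sideways" walk can be modelled in a few lines of user-space C; the struct and the masks below are stand-ins for the kernel's sched_group/cpumask types (not the real layout), using the MC level of the 8-CPU example for CPUs 0-3.

/* Toy model of one domain's circular group list: each group's span is the
 * child (SMT) domain span, and the balance CPU is simply its first CPU. */
#include <stdio.h>

struct group {
	unsigned span;		/* CPUs in this group (bit i = CPU i) */
	struct group *next;	/* circular, singly linked for the sketch */
};

int main(void)
{
	struct group smt01 = { .span = 0x03 };	/* CPUs 0-1 */
	struct group smt23 = { .span = 0x0c };	/* CPUs 2-3 */
	struct group *g, *first = &smt01;

	smt01.next = &smt23;
	smt23.next = &smt01;	/* last->next = first closes the ring */

	g = first;
	do {
		int balance_cpu = __builtin_ctz(g->span);	/* first CPU */

		printf("group span=0x%x balance_cpu=%d\n", g->span, balance_cpu);
		g = g->next;
	} while (g != first);

	return 0;
}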
+static struct sched_group *get_group(int cpu, struct sd_data *sdd)
{
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
struct sched_domain *child = sd->child;
+ struct sched_group *sg;
if (child)
cpu = cpumask_first(sched_domain_span(child));
- if (sg) {
- *sg = *per_cpu_ptr(sdd->sg, cpu);
- (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+ sg = *per_cpu_ptr(sdd->sg, cpu);
+ sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+
+ /* For claim_allocations: */
+ atomic_inc(&sg->ref);
+ atomic_inc(&sg->sgc->ref);
- /* For claim_allocations: */
- atomic_set(&(*sg)->sgc->ref, 1);
+ if (child) {
+ cpumask_copy(sched_group_span(sg), sched_domain_span(child));
+ cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
+ } else {
+ cpumask_set_cpu(cpu, sched_group_span(sg));
+ cpumask_set_cpu(cpu, group_balance_mask(sg));
}
- return cpu;
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+
+ return sg;
}
/*
@@ -624,34 +877,20 @@ build_sched_groups(struct sched_domain *sd, int cpu)
struct cpumask *covered;
int i;
- get_group(cpu, sdd, &sd->groups);
- atomic_inc(&sd->groups->ref);
-
- if (cpu != cpumask_first(span))
- return 0;
-
lockdep_assert_held(&sched_domains_mutex);
covered = sched_domains_tmpmask;
cpumask_clear(covered);
- for_each_cpu(i, span) {
+ for_each_cpu_wrap(i, span, cpu) {
struct sched_group *sg;
- int group, j;
if (cpumask_test_cpu(i, covered))
continue;
- group = get_group(i, sdd, &sg);
- cpumask_setall(sched_group_mask(sg));
+ sg = get_group(i, sdd);
- for_each_cpu(j, span) {
- if (get_group(j, sdd, NULL) != group)
- continue;
-
- cpumask_set_cpu(j, covered);
- cpumask_set_cpu(j, sched_group_cpus(sg));
- }
+ cpumask_or(covered, covered, sched_group_span(sg));
if (!first)
first = sg;
@@ -660,6 +899,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
last = sg;
}
last->next = first;
+ sd->groups = first;
return 0;
}
@@ -683,12 +923,12 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
do {
int cpu, max_cpu = -1;
- sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+ sg->group_weight = cpumask_weight(sched_group_span(sg));
if (!(sd->flags & SD_ASYM_PACKING))
goto next;
- for_each_cpu(cpu, sched_group_cpus(sg)) {
+ for_each_cpu(cpu, sched_group_span(sg)) {
if (max_cpu < 0)
max_cpu = cpu;
else if (sched_asym_prefer(cpu, max_cpu))
@@ -1308,6 +1548,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sgc)
return -ENOMEM;
+#ifdef CONFIG_SCHED_DEBUG
+ sgc->id = j;
+#endif
+
*per_cpu_ptr(sdd->sgc, j) = sgc;
}
}
@@ -1407,7 +1651,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
sd = build_sched_domain(tl, cpu_map, attr, sd, i);
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
- if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+ if (tl->flags & SDTL_OVERLAP)
sd->flags |= SD_OVERLAP;
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
@@ -1478,7 +1722,7 @@ static struct sched_domain_attr *dattr_cur;
* cpumask) fails, then fallback to a single sched domain,
* as determined by the single cpumask fallback_doms.
*/
-cpumask_var_t fallback_doms;
+static cpumask_var_t fallback_doms;
/*
* arch_update_cpu_topology lets virtualized architectures update the
@@ -1520,10 +1764,14 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
* For now this just excludes isolated CPUs, but could be used to
* exclude other special cases in the future.
*/
-int init_sched_domains(const struct cpumask *cpu_map)
+int sched_init_domains(const struct cpumask *cpu_map)
{
int err;
+ zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
+ zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
+ zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+
arch_update_cpu_topology();
ndoms_cur = 1;
doms_cur = alloc_sched_domains(ndoms_cur);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index b8c84c6dee64..17f11c6b0a9f 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -12,44 +12,44 @@
#include <linux/hash.h>
#include <linux/kthread.h>
-void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
+void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
{
- spin_lock_init(&q->lock);
- lockdep_set_class_and_name(&q->lock, key, name);
- INIT_LIST_HEAD(&q->task_list);
+ spin_lock_init(&wq_head->lock);
+ lockdep_set_class_and_name(&wq_head->lock, key, name);
+ INIT_LIST_HEAD(&wq_head->head);
}
EXPORT_SYMBOL(__init_waitqueue_head);
-void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
- wait->flags &= ~WQ_FLAG_EXCLUSIVE;
- spin_lock_irqsave(&q->lock, flags);
- __add_wait_queue(q, wait);
- spin_unlock_irqrestore(&q->lock, flags);
+ wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ __add_wait_queue_entry_tail(wq_head, wq_entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL(add_wait_queue);
-void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
+void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
- wait->flags |= WQ_FLAG_EXCLUSIVE;
- spin_lock_irqsave(&q->lock, flags);
- __add_wait_queue_tail(q, wait);
- spin_unlock_irqrestore(&q->lock, flags);
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ __add_wait_queue_entry_tail(wq_head, wq_entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL(add_wait_queue_exclusive);
-void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
- spin_lock_irqsave(&q->lock, flags);
- __remove_wait_queue(q, wait);
- spin_unlock_irqrestore(&q->lock, flags);
+ spin_lock_irqsave(&wq_head->lock, flags);
+ __remove_wait_queue(wq_head, wq_entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL(remove_wait_queue);
@@ -63,12 +63,12 @@ EXPORT_SYMBOL(remove_wait_queue);
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
-static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, int wake_flags, void *key)
{
- wait_queue_t *curr, *next;
+ wait_queue_entry_t *curr, *next;
- list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
+ list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
unsigned flags = curr->flags;
if (curr->func(curr, mode, wake_flags, key) &&
@@ -79,7 +79,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
/**
* __wake_up - wake up threads blocked on a waitqueue.
- * @q: the waitqueue
+ * @wq_head: the waitqueue
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: is directly passed to the wakeup function
@@ -87,35 +87,35 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
*/
-void __wake_up(wait_queue_head_t *q, unsigned int mode,
+void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, void *key)
{
unsigned long flags;
- spin_lock_irqsave(&q->lock, flags);
- __wake_up_common(q, mode, nr_exclusive, 0, key);
- spin_unlock_irqrestore(&q->lock, flags);
+ spin_lock_irqsave(&wq_head->lock, flags);
+ __wake_up_common(wq_head, mode, nr_exclusive, 0, key);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL(__wake_up);
/*
* Same as __wake_up but called with the spinlock in wait_queue_head_t held.
*/
-void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
+void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
{
- __wake_up_common(q, mode, nr, 0, NULL);
+ __wake_up_common(wq_head, mode, nr, 0, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
{
- __wake_up_common(q, mode, 1, 0, key);
+ __wake_up_common(wq_head, mode, 1, 0, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
/**
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
- * @q: the waitqueue
+ * @wq_head: the waitqueue
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: opaque value to be passed to wakeup targets
@@ -130,30 +130,30 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key);
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
*/
-void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, void *key)
{
unsigned long flags;
int wake_flags = 1; /* XXX WF_SYNC */
- if (unlikely(!q))
+ if (unlikely(!wq_head))
return;
if (unlikely(nr_exclusive != 1))
wake_flags = 0;
- spin_lock_irqsave(&q->lock, flags);
- __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
- spin_unlock_irqrestore(&q->lock, flags);
+ spin_lock_irqsave(&wq_head->lock, flags);
+ __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL_GPL(__wake_up_sync_key);
/*
* __wake_up_sync - see __wake_up_sync_key()
*/
-void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive)
{
- __wake_up_sync_key(q, mode, nr_exclusive, NULL);
+ __wake_up_sync_key(wq_head, mode, nr_exclusive, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
@@ -170,48 +170,48 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
* loads to move into the critical region).
*/
void
-prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
+prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
{
unsigned long flags;
- wait->flags &= ~WQ_FLAG_EXCLUSIVE;
- spin_lock_irqsave(&q->lock, flags);
- if (list_empty(&wait->task_list))
- __add_wait_queue(q, wait);
+ wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ if (list_empty(&wq_entry->entry))
+ __add_wait_queue(wq_head, wq_entry);
set_current_state(state);
- spin_unlock_irqrestore(&q->lock, flags);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL(prepare_to_wait);
void
-prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
+prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
{
unsigned long flags;
- wait->flags |= WQ_FLAG_EXCLUSIVE;
- spin_lock_irqsave(&q->lock, flags);
- if (list_empty(&wait->task_list))
- __add_wait_queue_tail(q, wait);
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ if (list_empty(&wq_entry->entry))
+ __add_wait_queue_entry_tail(wq_head, wq_entry);
set_current_state(state);
- spin_unlock_irqrestore(&q->lock, flags);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);
-void init_wait_entry(wait_queue_t *wait, int flags)
+void init_wait_entry(struct wait_queue_entry *wq_entry, int flags)
{
- wait->flags = flags;
- wait->private = current;
- wait->func = autoremove_wake_function;
- INIT_LIST_HEAD(&wait->task_list);
+ wq_entry->flags = flags;
+ wq_entry->private = current;
+ wq_entry->func = autoremove_wake_function;
+ INIT_LIST_HEAD(&wq_entry->entry);
}
EXPORT_SYMBOL(init_wait_entry);
-long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
+long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
{
unsigned long flags;
long ret = 0;
- spin_lock_irqsave(&q->lock, flags);
+ spin_lock_irqsave(&wq_head->lock, flags);
if (unlikely(signal_pending_state(state, current))) {
/*
* Exclusive waiter must not fail if it was selected by wakeup,
@@ -219,24 +219,24 @@ long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
*
* The caller will recheck the condition and return success if
* we were already woken up, we can not miss the event because
- * wakeup locks/unlocks the same q->lock.
+ * wakeup locks/unlocks the same wq_head->lock.
*
* But we need to ensure that set-condition + wakeup after that
* can't see us, it should wake up another exclusive waiter if
* we fail.
*/
- list_del_init(&wait->task_list);
+ list_del_init(&wq_entry->entry);
ret = -ERESTARTSYS;
} else {
- if (list_empty(&wait->task_list)) {
- if (wait->flags & WQ_FLAG_EXCLUSIVE)
- __add_wait_queue_tail(q, wait);
+ if (list_empty(&wq_entry->entry)) {
+ if (wq_entry->flags & WQ_FLAG_EXCLUSIVE)
+ __add_wait_queue_entry_tail(wq_head, wq_entry);
else
- __add_wait_queue(q, wait);
+ __add_wait_queue(wq_head, wq_entry);
}
set_current_state(state);
}
- spin_unlock_irqrestore(&q->lock, flags);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
return ret;
}
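For context, an editorial sketch (not part of this patch) of the open-coded loop that prepare_to_wait_event() serves, spelled with the renamed types; my_wq_head and my_condition() are placeholders, and the ___wait_event() macro in <linux/wait.h> expands to essentially this shape.

/* Sketch only: wait for my_condition(), interruptible by signals. */
static int wait_for_my_condition(struct wait_queue_head *my_wq_head)
{
	struct wait_queue_entry wait;
	long ret;

	init_wait_entry(&wait, 0);
	for (;;) {
		ret = prepare_to_wait_event(my_wq_head, &wait, TASK_INTERRUPTIBLE);
		if (my_condition()) {
			ret = 0;
			break;
		}
		if (ret)	/* -ERESTARTSYS: a signal is pending */
			break;
		schedule();
	}
	finish_wait(my_wq_head, &wait);

	return ret;
}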
@@ -249,10 +249,10 @@ EXPORT_SYMBOL(prepare_to_wait_event);
* condition in the caller before they add the wait
* entry to the wake queue.
*/
-int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait)
+int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait)
{
- if (likely(list_empty(&wait->task_list)))
- __add_wait_queue_tail(wq, wait);
+ if (likely(list_empty(&wait->entry)))
+ __add_wait_queue_entry_tail(wq, wait);
set_current_state(TASK_INTERRUPTIBLE);
if (signal_pending(current))
@@ -265,10 +265,10 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait)
}
EXPORT_SYMBOL(do_wait_intr);
-int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_t *wait)
+int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait)
{
- if (likely(list_empty(&wait->task_list)))
- __add_wait_queue_tail(wq, wait);
+ if (likely(list_empty(&wait->entry)))
+ __add_wait_queue_entry_tail(wq, wait);
set_current_state(TASK_INTERRUPTIBLE);
if (signal_pending(current))
@@ -283,14 +283,14 @@ EXPORT_SYMBOL(do_wait_intr_irq);
/**
* finish_wait - clean up after waiting in a queue
- * @q: waitqueue waited on
- * @wait: wait descriptor
+ * @wq_head: waitqueue waited on
+ * @wq_entry: wait descriptor
*
* Sets current thread back to running state and removes
* the wait descriptor from the given waitqueue if still
* queued.
*/
-void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
+void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
@@ -308,20 +308,20 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
* have _one_ other CPU that looks at or modifies
* the list).
*/
- if (!list_empty_careful(&wait->task_list)) {
- spin_lock_irqsave(&q->lock, flags);
- list_del_init(&wait->task_list);
- spin_unlock_irqrestore(&q->lock, flags);
+ if (!list_empty_careful(&wq_entry->entry)) {
+ spin_lock_irqsave(&wq_head->lock, flags);
+ list_del_init(&wq_entry->entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
}
}
EXPORT_SYMBOL(finish_wait);
-int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
{
- int ret = default_wake_function(wait, mode, sync, key);
+ int ret = default_wake_function(wq_entry, mode, sync, key);
if (ret)
- list_del_init(&wait->task_list);
+ list_del_init(&wq_entry->entry);
return ret;
}
EXPORT_SYMBOL(autoremove_wake_function);
@@ -334,24 +334,24 @@ static inline bool is_kthread_should_stop(void)
/*
* DEFINE_WAIT_FUNC(wait, woken_wake_func);
*
- * add_wait_queue(&wq, &wait);
+ * add_wait_queue(&wq_head, &wait);
* for (;;) {
* if (condition)
* break;
*
* p->state = mode; condition = true;
* smp_mb(); // A smp_wmb(); // C
- * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN;
+ * if (!wq_entry->flags & WQ_FLAG_WOKEN) wq_entry->flags |= WQ_FLAG_WOKEN;
* schedule() try_to_wake_up();
* p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~
- * wait->flags &= ~WQ_FLAG_WOKEN; condition = true;
+ * wq_entry->flags &= ~WQ_FLAG_WOKEN; condition = true;
* smp_mb() // B smp_wmb(); // C
- * wait->flags |= WQ_FLAG_WOKEN;
+ * wq_entry->flags |= WQ_FLAG_WOKEN;
* }
- * remove_wait_queue(&wq, &wait);
+ * remove_wait_queue(&wq_head, &wait);
*
*/
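An editorial sketch of a waiter consuming this protocol (not part of the patch; my_wq_head, my_condition() and the 100 ms timeout are placeholders):

static int wait_for_event_woken(struct wait_queue_head *my_wq_head)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	long timeout = msecs_to_jiffies(100);	/* arbitrary example timeout */
	int ret = 0;

	add_wait_queue(my_wq_head, &wait);
	while (!my_condition()) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		/* sets the task state, sleeps, and clears WQ_FLAG_WOKEN again */
		timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
		if (!timeout) {
			ret = -ETIMEDOUT;
			break;
		}
	}
	remove_wait_queue(my_wq_head, &wait);

	return ret;
}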
-long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
+long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
{
set_current_state(mode); /* A */
/*
@@ -359,7 +359,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
* woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
* also observe all state before the wakeup.
*/
- if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
+ if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
timeout = schedule_timeout(timeout);
__set_current_state(TASK_RUNNING);
@@ -369,13 +369,13 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
* condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
* an event.
*/
- smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
+ smp_store_mb(wq_entry->flags, wq_entry->flags & ~WQ_FLAG_WOKEN); /* B */
return timeout;
}
EXPORT_SYMBOL(wait_woken);
-int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
{
/*
* Although this function is called under waitqueue lock, LOCK
@@ -385,267 +385,8 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
* and is paired with smp_store_mb() in wait_woken().
*/
smp_wmb(); /* C */
- wait->flags |= WQ_FLAG_WOKEN;
+ wq_entry->flags |= WQ_FLAG_WOKEN;
- return default_wake_function(wait, mode, sync, key);
+ return default_wake_function(wq_entry, mode, sync, key);
}
EXPORT_SYMBOL(woken_wake_function);
-
-int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
-{
- struct wait_bit_key *key = arg;
- struct wait_bit_queue *wait_bit
- = container_of(wait, struct wait_bit_queue, wait);
-
- if (wait_bit->key.flags != key->flags ||
- wait_bit->key.bit_nr != key->bit_nr ||
- test_bit(key->bit_nr, key->flags))
- return 0;
- else
- return autoremove_wake_function(wait, mode, sync, key);
-}
-EXPORT_SYMBOL(wake_bit_function);
-
-/*
- * To allow interruptible waiting and asynchronous (i.e. nonblocking)
- * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
- * permitted return codes. Nonzero return codes halt waiting and return.
- */
-int __sched
-__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
- wait_bit_action_f *action, unsigned mode)
-{
- int ret = 0;
-
- do {
- prepare_to_wait(wq, &q->wait, mode);
- if (test_bit(q->key.bit_nr, q->key.flags))
- ret = (*action)(&q->key, mode);
- } while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
- finish_wait(wq, &q->wait);
- return ret;
-}
-EXPORT_SYMBOL(__wait_on_bit);
-
-int __sched out_of_line_wait_on_bit(void *word, int bit,
- wait_bit_action_f *action, unsigned mode)
-{
- wait_queue_head_t *wq = bit_waitqueue(word, bit);
- DEFINE_WAIT_BIT(wait, word, bit);
-
- return __wait_on_bit(wq, &wait, action, mode);
-}
-EXPORT_SYMBOL(out_of_line_wait_on_bit);
-
-int __sched out_of_line_wait_on_bit_timeout(
- void *word, int bit, wait_bit_action_f *action,
- unsigned mode, unsigned long timeout)
-{
- wait_queue_head_t *wq = bit_waitqueue(word, bit);
- DEFINE_WAIT_BIT(wait, word, bit);
-
- wait.key.timeout = jiffies + timeout;
- return __wait_on_bit(wq, &wait, action, mode);
-}
-EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
-
-int __sched
-__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
- wait_bit_action_f *action, unsigned mode)
-{
- int ret = 0;
-
- for (;;) {
- prepare_to_wait_exclusive(wq, &q->wait, mode);
- if (test_bit(q->key.bit_nr, q->key.flags)) {
- ret = action(&q->key, mode);
- /*
- * See the comment in prepare_to_wait_event().
- * finish_wait() does not necessarily takes wq->lock,
- * but test_and_set_bit() implies mb() which pairs with
- * smp_mb__after_atomic() before wake_up_page().
- */
- if (ret)
- finish_wait(wq, &q->wait);
- }
- if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) {
- if (!ret)
- finish_wait(wq, &q->wait);
- return 0;
- } else if (ret) {
- return ret;
- }
- }
-}
-EXPORT_SYMBOL(__wait_on_bit_lock);
-
-int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
- wait_bit_action_f *action, unsigned mode)
-{
- wait_queue_head_t *wq = bit_waitqueue(word, bit);
- DEFINE_WAIT_BIT(wait, word, bit);
-
- return __wait_on_bit_lock(wq, &wait, action, mode);
-}
-EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
-
-void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
-{
- struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
- if (waitqueue_active(wq))
- __wake_up(wq, TASK_NORMAL, 1, &key);
-}
-EXPORT_SYMBOL(__wake_up_bit);
-
-/**
- * wake_up_bit - wake up a waiter on a bit
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
- *
- * There is a standard hashed waitqueue table for generic use. This
- * is the part of the hashtable's accessor API that wakes up waiters
- * on a bit. For instance, if one were to have waiters on a bitflag,
- * one would call wake_up_bit() after clearing the bit.
- *
- * In order for this to function properly, as it uses waitqueue_active()
- * internally, some kind of memory barrier must be done prior to calling
- * this. Typically, this will be smp_mb__after_atomic(), but in some
- * cases where bitflags are manipulated non-atomically under a lock, one
- * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
- * because spin_unlock() does not guarantee a memory barrier.
- */
-void wake_up_bit(void *word, int bit)
-{
- __wake_up_bit(bit_waitqueue(word, bit), word, bit);
-}
-EXPORT_SYMBOL(wake_up_bit);
-
-/*
- * Manipulate the atomic_t address to produce a better bit waitqueue table hash
- * index (we're keying off bit -1, but that would produce a horrible hash
- * value).
- */
-static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
-{
- if (BITS_PER_LONG == 64) {
- unsigned long q = (unsigned long)p;
- return bit_waitqueue((void *)(q & ~1), q & 1);
- }
- return bit_waitqueue(p, 0);
-}
-
-static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync,
- void *arg)
-{
- struct wait_bit_key *key = arg;
- struct wait_bit_queue *wait_bit
- = container_of(wait, struct wait_bit_queue, wait);
- atomic_t *val = key->flags;
-
- if (wait_bit->key.flags != key->flags ||
- wait_bit->key.bit_nr != key->bit_nr ||
- atomic_read(val) != 0)
- return 0;
- return autoremove_wake_function(wait, mode, sync, key);
-}
-
-/*
- * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
- * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero
- * return codes halt waiting and return.
- */
-static __sched
-int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q,
- int (*action)(atomic_t *), unsigned mode)
-{
- atomic_t *val;
- int ret = 0;
-
- do {
- prepare_to_wait(wq, &q->wait, mode);
- val = q->key.flags;
- if (atomic_read(val) == 0)
- break;
- ret = (*action)(val);
- } while (!ret && atomic_read(val) != 0);
- finish_wait(wq, &q->wait);
- return ret;
-}
-
-#define DEFINE_WAIT_ATOMIC_T(name, p) \
- struct wait_bit_queue name = { \
- .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \
- .wait = { \
- .private = current, \
- .func = wake_atomic_t_function, \
- .task_list = \
- LIST_HEAD_INIT((name).wait.task_list), \
- }, \
- }
-
-__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
- unsigned mode)
-{
- wait_queue_head_t *wq = atomic_t_waitqueue(p);
- DEFINE_WAIT_ATOMIC_T(wait, p);
-
- return __wait_on_atomic_t(wq, &wait, action, mode);
-}
-EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
-
-/**
- * wake_up_atomic_t - Wake up a waiter on a atomic_t
- * @p: The atomic_t being waited on, a kernel virtual address
- *
- * Wake up anyone waiting for the atomic_t to go to zero.
- *
- * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
- * check is done by the waiter's wake function, not the by the waker itself).
- */
-void wake_up_atomic_t(atomic_t *p)
-{
- __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
-}
-EXPORT_SYMBOL(wake_up_atomic_t);
-
-__sched int bit_wait(struct wait_bit_key *word, int mode)
-{
- schedule();
- if (signal_pending_state(mode, current))
- return -EINTR;
- return 0;
-}
-EXPORT_SYMBOL(bit_wait);
-
-__sched int bit_wait_io(struct wait_bit_key *word, int mode)
-{
- io_schedule();
- if (signal_pending_state(mode, current))
- return -EINTR;
- return 0;
-}
-EXPORT_SYMBOL(bit_wait_io);
-
-__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
-{
- unsigned long now = READ_ONCE(jiffies);
- if (time_after_eq(now, word->timeout))
- return -EAGAIN;
- schedule_timeout(word->timeout - now);
- if (signal_pending_state(mode, current))
- return -EINTR;
- return 0;
-}
-EXPORT_SYMBOL_GPL(bit_wait_timeout);
-
-__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
-{
- unsigned long now = READ_ONCE(jiffies);
- if (time_after_eq(now, word->timeout))
- return -EAGAIN;
- io_schedule_timeout(word->timeout - now);
- if (signal_pending_state(mode, current))
- return -EINTR;
- return 0;
-}
-EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
new file mode 100644
index 000000000000..f8159698aa4d
--- /dev/null
+++ b/kernel/sched/wait_bit.c
@@ -0,0 +1,286 @@
+/*
+ * The implementation of the wait_bit*() and related waiting APIs:
+ */
+#include <linux/wait_bit.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
+#include <linux/hash.h>
+
+#define WAIT_TABLE_BITS 8
+#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
+
+static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
+
+wait_queue_head_t *bit_waitqueue(void *word, int bit)
+{
+ const int shift = BITS_PER_LONG == 32 ? 5 : 6;
+ unsigned long val = (unsigned long)word << shift | bit;
+
+ return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
+}
+EXPORT_SYMBOL(bit_waitqueue);
+
+int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg)
+{
+ struct wait_bit_key *key = arg;
+ struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
+
+ if (wait_bit->key.flags != key->flags ||
+ wait_bit->key.bit_nr != key->bit_nr ||
+ test_bit(key->bit_nr, key->flags))
+ return 0;
+ else
+ return autoremove_wake_function(wq_entry, mode, sync, key);
+}
+EXPORT_SYMBOL(wake_bit_function);
+
+/*
+ * To allow interruptible waiting and asynchronous (i.e. nonblocking)
+ * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
+ * permitted return codes. Nonzero return codes halt waiting and return.
+ */
+int __sched
+__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
+ wait_bit_action_f *action, unsigned mode)
+{
+ int ret = 0;
+
+ do {
+ prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
+ if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
+ ret = (*action)(&wbq_entry->key, mode);
+ } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+ finish_wait(wq_head, &wbq_entry->wq_entry);
+ return ret;
+}
+EXPORT_SYMBOL(__wait_on_bit);
+
+int __sched out_of_line_wait_on_bit(void *word, int bit,
+ wait_bit_action_f *action, unsigned mode)
+{
+ struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wq_entry, word, bit);
+
+ return __wait_on_bit(wq_head, &wq_entry, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit);
+
+int __sched out_of_line_wait_on_bit_timeout(
+ void *word, int bit, wait_bit_action_f *action,
+ unsigned mode, unsigned long timeout)
+{
+ struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wq_entry, word, bit);
+
+ wq_entry.key.timeout = jiffies + timeout;
+ return __wait_on_bit(wq_head, &wq_entry, action, mode);
+}
+EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
+
+int __sched
+__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
+ wait_bit_action_f *action, unsigned mode)
+{
+ int ret = 0;
+
+ for (;;) {
+ prepare_to_wait_exclusive(wq_head, &wbq_entry->wq_entry, mode);
+ if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) {
+ ret = action(&wbq_entry->key, mode);
+ /*
+ * See the comment in prepare_to_wait_event().
+ * finish_wait() does not necessarily take wq_head->lock,
+ * but test_and_set_bit() implies mb() which pairs with
+ * smp_mb__after_atomic() before wake_up_page().
+ */
+ if (ret)
+ finish_wait(wq_head, &wbq_entry->wq_entry);
+ }
+ if (!test_and_set_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) {
+ if (!ret)
+ finish_wait(wq_head, &wbq_entry->wq_entry);
+ return 0;
+ } else if (ret) {
+ return ret;
+ }
+ }
+}
+EXPORT_SYMBOL(__wait_on_bit_lock);
+
+int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
+ wait_bit_action_f *action, unsigned mode)
+{
+ struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wq_entry, word, bit);
+
+ return __wait_on_bit_lock(wq_head, &wq_entry, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
+
+void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
+{
+ struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
+ if (waitqueue_active(wq_head))
+ __wake_up(wq_head, TASK_NORMAL, 1, &key);
+}
+EXPORT_SYMBOL(__wake_up_bit);
+
+/**
+ * wake_up_bit - wake up a waiter on a bit
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ *
+ * There is a standard hashed waitqueue table for generic use. This
+ * is the part of the hashtable's accessor API that wakes up waiters
+ * on a bit. For instance, if one were to have waiters on a bitflag,
+ * one would call wake_up_bit() after clearing the bit.
+ *
+ * In order for this to function properly, as it uses waitqueue_active()
+ * internally, some kind of memory barrier must be done prior to calling
+ * this. Typically, this will be smp_mb__after_atomic(), but in some
+ * cases where bitflags are manipulated non-atomically under a lock, one
+ * may need to use a less regular barrier, such as fs/inode.c's smp_mb(),
+ * because spin_unlock() does not guarantee a memory barrier.
+ */
+void wake_up_bit(void *word, int bit)
+{
+ __wake_up_bit(bit_waitqueue(word, bit), word, bit);
+}
+EXPORT_SYMBOL(wake_up_bit);
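An editorial sketch of the pairing this comment describes (not part of the patch): word is a placeholder unsigned long, MY_BIT a placeholder bit number, and the waiter side uses the wait_on_bit() wrapper that sits on top of the out-of-line helpers in this file.

	/* Waiter: sleep until MY_BIT in 'word' is cleared; a nonzero return
	 * means the (interruptible) sleep was broken by a signal. */
	if (wait_on_bit(&word, MY_BIT, TASK_INTERRUPTIBLE))
		return -EINTR;

	/* Waker: clear the bit, order the store against the waitqueue_active()
	 * check inside __wake_up_bit(), then kick any sleepers. */
	clear_bit(MY_BIT, &word);
	smp_mb__after_atomic();
	wake_up_bit(&word, MY_BIT);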
+
+/*
+ * Manipulate the atomic_t address to produce a better bit waitqueue table hash
+ * index (we're keying off bit -1, but that would produce a horrible hash
+ * value).
+ */
+static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
+{
+ if (BITS_PER_LONG == 64) {
+ unsigned long q = (unsigned long)p;
+ return bit_waitqueue((void *)(q & ~1), q & 1);
+ }
+ return bit_waitqueue(p, 0);
+}
+
+static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync,
+ void *arg)
+{
+ struct wait_bit_key *key = arg;
+ struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
+ atomic_t *val = key->flags;
+
+ if (wait_bit->key.flags != key->flags ||
+ wait_bit->key.bit_nr != key->bit_nr ||
+ atomic_read(val) != 0)
+ return 0;
+ return autoremove_wake_function(wq_entry, mode, sync, key);
+}
+
+/*
+ * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
+ * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero
+ * return codes halt waiting and return.
+ */
+static __sched
+int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
+ int (*action)(atomic_t *), unsigned mode)
+{
+ atomic_t *val;
+ int ret = 0;
+
+ do {
+ prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
+ val = wbq_entry->key.flags;
+ if (atomic_read(val) == 0)
+ break;
+ ret = (*action)(val);
+ } while (!ret && atomic_read(val) != 0);
+ finish_wait(wq_head, &wbq_entry->wq_entry);
+ return ret;
+}
+
+#define DEFINE_WAIT_ATOMIC_T(name, p) \
+ struct wait_bit_queue_entry name = { \
+ .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \
+ .wq_entry = { \
+ .private = current, \
+ .func = wake_atomic_t_function, \
+ .entry = \
+ LIST_HEAD_INIT((name).wq_entry.entry), \
+ }, \
+ }
+
+__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
+ unsigned mode)
+{
+ struct wait_queue_head *wq_head = atomic_t_waitqueue(p);
+ DEFINE_WAIT_ATOMIC_T(wq_entry, p);
+
+ return __wait_on_atomic_t(wq_head, &wq_entry, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
+
+/**
+ * wake_up_atomic_t - Wake up a waiter on an atomic_t
+ * @p: The atomic_t being waited on, a kernel virtual address
+ *
+ * Wake up anyone waiting for the atomic_t to go to zero.
+ *
+ * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
+ * check is done by the waiter's wake function, not by the waker itself).
+ */
+void wake_up_atomic_t(atomic_t *p)
+{
+ __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
+}
+EXPORT_SYMBOL(wake_up_atomic_t);
+
+__sched int bit_wait(struct wait_bit_key *word, int mode)
+{
+ schedule();
+ if (signal_pending_state(mode, current))
+ return -EINTR;
+ return 0;
+}
+EXPORT_SYMBOL(bit_wait);
+
+__sched int bit_wait_io(struct wait_bit_key *word, int mode)
+{
+ io_schedule();
+ if (signal_pending_state(mode, current))
+ return -EINTR;
+ return 0;
+}
+EXPORT_SYMBOL(bit_wait_io);
+
+__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
+{
+ unsigned long now = READ_ONCE(jiffies);
+ if (time_after_eq(now, word->timeout))
+ return -EAGAIN;
+ schedule_timeout(word->timeout - now);
+ if (signal_pending_state(mode, current))
+ return -EINTR;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bit_wait_timeout);
+
+__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
+{
+ unsigned long now = READ_ONCE(jiffies);
+ if (time_after_eq(now, word->timeout))
+ return -EAGAIN;
+ io_schedule_timeout(word->timeout - now);
+ if (signal_pending_state(mode, current))
+ return -EINTR;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
+
+void __init wait_bit_init(void)
+{
+ int i;
+
+ for (i = 0; i < WAIT_TABLE_SIZE; i++)
+ init_waitqueue_head(bit_wait_table + i);
+}
diff --git a/kernel/signal.c b/kernel/signal.c
index ca92bcfeb322..45b4c1ffe14e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -510,7 +510,8 @@ int unhandled_signal(struct task_struct *tsk, int sig)
return !tsk->ptrace;
}
-static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
+static void collect_signal(int sig, struct sigpending *list, siginfo_t *info,
+ bool *resched_timer)
{
struct sigqueue *q, *first = NULL;
@@ -532,6 +533,12 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
still_pending:
list_del_init(&first->list);
copy_siginfo(info, &first->info);
+
+ *resched_timer =
+ (first->flags & SIGQUEUE_PREALLOC) &&
+ (info->si_code == SI_TIMER) &&
+ (info->si_sys_private);
+
__sigqueue_free(first);
} else {
/*
@@ -548,12 +555,12 @@ still_pending:
}
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
- siginfo_t *info)
+ siginfo_t *info, bool *resched_timer)
{
int sig = next_signal(pending, mask);
if (sig)
- collect_signal(sig, pending, info);
+ collect_signal(sig, pending, info, resched_timer);
return sig;
}
@@ -565,15 +572,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
*/
int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
{
+ bool resched_timer = false;
int signr;
/* We only dequeue private signals from ourselves, we don't let
* signalfd steal them
*/
- signr = __dequeue_signal(&tsk->pending, mask, info);
+ signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);
if (!signr) {
signr = __dequeue_signal(&tsk->signal->shared_pending,
- mask, info);
+ mask, info, &resched_timer);
#ifdef CONFIG_POSIX_TIMERS
/*
* itimer signal ?
@@ -621,7 +629,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
current->jobctl |= JOBCTL_STOP_DEQUEUED;
}
#ifdef CONFIG_POSIX_TIMERS
- if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
+ if (resched_timer) {
/*
* Release the siglock to ensure proper locking order
* of timer locks outside of siglocks. Note, we leave
diff --git a/kernel/smp.c b/kernel/smp.c
index a817769b53c0..3061483cb3ad 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -30,6 +30,7 @@ enum {
struct call_function_data {
struct call_single_data __percpu *csd;
cpumask_var_t cpumask;
+ cpumask_var_t cpumask_ipi;
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
@@ -45,9 +46,15 @@ int smpcfd_prepare_cpu(unsigned int cpu)
if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
cpu_to_node(cpu)))
return -ENOMEM;
+ if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
+ cpu_to_node(cpu))) {
+ free_cpumask_var(cfd->cpumask);
+ return -ENOMEM;
+ }
cfd->csd = alloc_percpu(struct call_single_data);
if (!cfd->csd) {
free_cpumask_var(cfd->cpumask);
+ free_cpumask_var(cfd->cpumask_ipi);
return -ENOMEM;
}
@@ -59,6 +66,7 @@ int smpcfd_dead_cpu(unsigned int cpu)
struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
free_cpumask_var(cfd->cpumask);
+ free_cpumask_var(cfd->cpumask_ipi);
free_percpu(cfd->csd);
return 0;
}
@@ -428,12 +436,13 @@ void smp_call_function_many(const struct cpumask *mask,
cfd = this_cpu_ptr(&cfd_data);
cpumask_and(cfd->cpumask, mask, cpu_online_mask);
- cpumask_clear_cpu(this_cpu, cfd->cpumask);
+ __cpumask_clear_cpu(this_cpu, cfd->cpumask);
/* Some callers race with other cpus changing the passed mask */
if (unlikely(!cpumask_weight(cfd->cpumask)))
return;
+ cpumask_clear(cfd->cpumask_ipi);
for_each_cpu(cpu, cfd->cpumask) {
struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
@@ -442,11 +451,12 @@ void smp_call_function_many(const struct cpumask *mask,
csd->flags |= CSD_FLAG_SYNCHRONOUS;
csd->func = func;
csd->info = info;
- llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
+ if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
+ __cpumask_set_cpu(cpu, cfd->cpumask_ipi);
}
/* Send a message to all CPUs in the map */
- arch_send_call_function_ipi_mask(cfd->cpumask);
+ arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
if (wait) {
for_each_cpu(cpu, cfd->cpumask) {
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 4008d9f95dd7..ac09bc29eb08 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -126,56 +126,6 @@ config NO_HZ_FULL_ALL
Note the boot CPU will still be kept outside the range to
handle the timekeeping duty.
-config NO_HZ_FULL_SYSIDLE
- bool "Detect full-system idle state for full dynticks system"
- depends on NO_HZ_FULL
- default n
- help
- At least one CPU must keep the scheduling-clock tick running for
- timekeeping purposes whenever there is a non-idle CPU, where
- "non-idle" also includes dynticks CPUs as long as they are
- running non-idle tasks. Because the underlying adaptive-tick
- support cannot distinguish between all CPUs being idle and
- all CPUs each running a single task in dynticks mode, the
- underlying support simply ensures that there is always a CPU
- handling the scheduling-clock tick, whether or not all CPUs
- are idle. This Kconfig option enables scalable detection of
- the all-CPUs-idle state, thus allowing the scheduling-clock
- tick to be disabled when all CPUs are idle. Note that scalable
- detection of the all-CPUs-idle state means that larger systems
- will be slower to declare the all-CPUs-idle state.
-
- Say Y if you would like to help debug all-CPUs-idle detection.
-
- Say N if you are unsure.
-
-config NO_HZ_FULL_SYSIDLE_SMALL
- int "Number of CPUs above which large-system approach is used"
- depends on NO_HZ_FULL_SYSIDLE
- range 1 NR_CPUS
- default 8
- help
- The full-system idle detection mechanism takes a lazy approach
- on large systems, as is required to attain decent scalability.
- However, on smaller systems, scalability is not anywhere near as
- large a concern as is energy efficiency. The sysidle subsystem
- therefore uses a fast but non-scalable algorithm for small
- systems and a lazier but scalable algorithm for large systems.
- This Kconfig parameter defines the number of CPUs in the largest
- system that will be considered to be "small".
-
- The default value will be fine in most cases. Battery-powered
- systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger
- numbers of CPUs, and (3) are suffering from battery-lifetime
- problems due to long sysidle latencies might wish to experiment
- with larger values for this Kconfig parameter. On the other
- hand, they might be even better served by disabling NO_HZ_FULL
- entirely, given that NO_HZ_FULL is intended for HPC and
- real-time workloads that at present do not tend to be run on
- battery-powered systems.
-
- Take the default if you are unsure.
-
config NO_HZ
bool "Old Idle dynticks config"
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 5cb5b0008d97..ee2f4202d82a 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -387,7 +387,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- start = ktime_add(start, base->gettime());
+ start = ktime_add_safe(start, base->gettime());
alarm_start(alarm, start);
}
EXPORT_SYMBOL_GPL(alarm_start_relative);
@@ -475,7 +475,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
overrun++;
}
- alarm->node.expires = ktime_add(alarm->node.expires, interval);
+ alarm->node.expires = ktime_add_safe(alarm->node.expires, interval);
return overrun;
}
EXPORT_SYMBOL_GPL(alarm_forward);
@@ -660,13 +660,21 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
/* start the timer */
timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval);
+
+ /*
+ * Rate limit to the tick as a hot fix to prevent DOS. Will be
+ * mopped up later.
+ */
+ if (timr->it.alarm.interval < TICK_NSEC)
+ timr->it.alarm.interval = TICK_NSEC;
+
exp = timespec64_to_ktime(new_setting->it_value);
/* Convert (if necessary) to absolute time */
if (flags != TIMER_ABSTIME) {
ktime_t now;
now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
- exp = ktime_add(now, exp);
+ exp = ktime_add_safe(now, exp);
}
alarm_start(&timr->it.alarm.alarmtimer, exp);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 93621ae718d3..03918a19cf2d 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -233,6 +233,9 @@ static void clocksource_watchdog(unsigned long data)
continue;
}
+ if (cs == curr_clocksource && cs->tick_stable)
+ cs->tick_stable(cs);
+
if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
(watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 987e496bb51a..b398c2ea69b2 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -37,9 +37,11 @@ static int tick_broadcast_forced;
static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
#ifdef CONFIG_TICK_ONESHOT
+static void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
static void tick_broadcast_clear_oneshot(int cpu);
static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
#else
+static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }
static inline void tick_broadcast_clear_oneshot(int cpu) { }
static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }
#endif
@@ -867,7 +869,7 @@ static void tick_broadcast_init_next_event(struct cpumask *mask,
/**
* tick_broadcast_setup_oneshot - setup the broadcast device
*/
-void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
+static void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
{
int cpu = smp_processor_id();
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f738251000fe..be0ac01f2e12 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -126,7 +126,6 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
/* Functions related to oneshot broadcasting */
#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
-extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
extern void tick_broadcast_switch_to_oneshot(void);
extern void tick_shutdown_broadcast_oneshot(unsigned int cpu);
extern int tick_broadcast_oneshot_active(void);
@@ -134,7 +133,6 @@ extern void tick_check_oneshot_broadcast_this_cpu(void);
bool tick_broadcast_oneshot_available(void);
extern struct cpumask *tick_get_broadcast_oneshot_mask(void);
#else /* !(BROADCAST && ONESHOT): */
-static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }
static inline void tick_broadcast_switch_to_oneshot(void) { }
static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { }
static inline int tick_broadcast_oneshot_active(void) { return 0; }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 64c97fc130c4..c7a899c5ce64 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,6 +150,12 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
touch_softlockup_watchdog_sched();
if (is_idle_task(current))
ts->idle_jiffies++;
+ /*
+ * In case the current tick fired too early past its expected
+ * expiration, make sure we don't bypass the next clock reprogramming
+ * to the same deadline.
+ */
+ ts->next_tick = 0;
}
#endif
update_process_times(user_mode(regs));
@@ -554,7 +560,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
update_ts_time_stats(smp_processor_id(), ts, now, NULL);
ts->idle_active = 0;
- sched_clock_idle_wakeup_event(0);
+ sched_clock_idle_wakeup_event();
}
static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
@@ -660,6 +666,12 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
else
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+
+ /*
+ * Reset to make sure the next tick stop doesn't get fooled by a past
+ * cached clock deadline.
+ */
+ ts->next_tick = 0;
}
static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
@@ -701,8 +713,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
*/
delta = next_tick - basemono;
if (delta <= (u64)TICK_NSEC) {
- tick = 0;
-
/*
* Tell the timer code that the base is not idle, i.e. undo
* the effect of get_next_timer_interrupt():
@@ -712,23 +722,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
* We've not stopped the tick yet, and there's a timer in the
* next period, so no point in stopping it either, bail.
*/
- if (!ts->tick_stopped)
- goto out;
-
- /*
- * If, OTOH, we did stop it, but there's a pending (expired)
- * timer reprogram the timer hardware to fire now.
- *
- * We will not restart the tick proper, just prod the timer
- * hardware into firing an interrupt to process the pending
- * timers. Just like tick_irq_exit() will not restart the tick
- * for 'normal' interrupts.
- *
- * Only once we exit the idle loop will we re-enable the tick,
- * see tick_nohz_idle_exit().
- */
- if (delta == 0) {
- tick_nohz_restart(ts, now);
+ if (!ts->tick_stopped) {
+ tick = 0;
goto out;
}
}
@@ -771,8 +766,16 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
tick = expires;
	/* Skip reprogram of event if it's not changed */
- if (ts->tick_stopped && (expires == dev->next_event))
- goto out;
+ if (ts->tick_stopped && (expires == ts->next_tick)) {
+ /* Sanity check: make sure clockevent is actually programmed */
+ if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
+ goto out;
+
+ WARN_ON_ONCE(1);
+ printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n",
+ basemono, ts->next_tick, dev->next_event,
+ hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer));
+ }
/*
* nohz_stop_sched_tick can be called several times before
@@ -782,8 +785,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
* the scheduler tick in nohz_restart_sched_tick.
*/
if (!ts->tick_stopped) {
- nohz_balance_enter_idle(cpu);
- calc_load_enter_idle();
+ calc_load_nohz_start();
cpu_load_update_nohz_start();
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
@@ -791,6 +793,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
trace_tick_stop(1, TICK_DEP_MASK_NONE);
}
+ ts->next_tick = tick;
+
/*
* If the expiration time == KTIME_MAX, then we simply stop
* the tick timer.
@@ -801,12 +805,17 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
goto out;
}
+ hrtimer_set_expires(&ts->sched_timer, tick);
+
if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
- hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
else
tick_program_event(tick, 1);
out:
- /* Update the estimated sleep length */
+ /*
+ * Update the estimated sleep length until the next timer
+ * (not only the tick).
+ */
ts->sleep_length = ktime_sub(dev->next_event, now);
return tick;
}
@@ -823,7 +832,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
*/
timer_clear_idle();
- calc_load_exit_idle();
+ calc_load_nohz_stop();
touch_softlockup_watchdog_sched();
/*
* Cancel the scheduled timer and restore the tick
@@ -864,6 +873,11 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
if (unlikely(!cpu_online(cpu))) {
if (cpu == tick_do_timer_cpu)
tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ /*
+	 * Make sure the CPU doesn't get fooled by an obsolete tick
+	 * deadline if it comes back online later.
+ */
+ ts->next_tick = 0;
return false;
}
@@ -923,8 +937,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
ts->idle_expires = expires;
}
- if (!was_stopped && ts->tick_stopped)
+ if (!was_stopped && ts->tick_stopped) {
ts->idle_jiffies = ts->last_jiffies;
+ nohz_balance_enter_idle(cpu);
+ }
}
}
@@ -1172,6 +1188,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
*/
if (regs)
tick_sched_handle(ts, regs);
+ else
+ ts->next_tick = 0;
/* No need to reprogram if we are in idle or full dynticks mode */
if (unlikely(ts->tick_stopped))
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index bf38226e5c17..075444e3d48e 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -27,6 +27,7 @@ enum tick_nohz_mode {
* timer is modified for nohz sleeps. This is necessary
* to resume the tick timer operation in the timeline
* when the CPU returns from nohz sleep.
+ * @next_tick: Next tick to be fired when in dynticks mode.
* @tick_stopped: Indicator that the idle tick has been stopped
* @idle_jiffies: jiffies at the entry to idle for idle time accounting
* @idle_calls: Total number of idle calls
@@ -44,6 +45,7 @@ struct tick_sched {
unsigned long check_clocks;
enum tick_nohz_mode nohz_mode;
ktime_t last_tick;
+ ktime_t next_tick;
int inidle;
int tick_stopped;
unsigned long idle_jiffies;
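
The tick-sched changes above cache the deadline most recently handed to the clockevent device in the new ts->next_tick field, so tick_nohz_stop_sched_tick() can skip a redundant hardware reprogram when the next expiry has not moved; the various "ts->next_tick = 0" assignments invalidate that cache whenever the hardware state may no longer match it. The following self-contained C sketch (invented sketch_* names, not the kernel implementation) shows the cache-and-invalidate pattern in isolation.

#include <stdint.h>
#include <stdbool.h>

struct sketch_tick_state {
	uint64_t next_tick;	/* last deadline programmed, 0 == unknown */
	bool tick_stopped;
};

/* Stand-in for tick_program_event()/hrtimer_start_expires(). */
static void sketch_program_event(uint64_t expires)
{
	(void)expires;
}

static void sketch_stop_tick(struct sketch_tick_state *ts, uint64_t expires)
{
	/* Skip the reprogram if the tick is stopped and the deadline is unchanged. */
	if (ts->tick_stopped && expires == ts->next_tick)
		return;

	ts->tick_stopped = true;
	ts->next_tick = expires;
	sketch_program_event(expires);
}

/* Mirrors the "ts->next_tick = 0" resets: force the next call to reprogram. */
static void sketch_invalidate_next_tick(struct sketch_tick_state *ts)
{
	ts->next_tick = 0;
}
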
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 9652bc57fd09..b602c48cb841 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -118,6 +118,26 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
tk->offs_boot = ktime_add(tk->offs_boot, delta);
}
+/*
+ * tk_clock_read - atomic clocksource read() helper
+ *
+ * This helper is needed in the read paths because, while the seqlock
+ * ensures we don't return a bad value while structures are updated, it
+ * doesn't protect against potential crashes. There is the possibility
+ * that the tkr's clocksource may change between the read reference and
+ * the clock reference passed to the read function. This can cause
+ * crashes if the wrong clocksource is passed to the wrong read function.
+ * This helper is not needed when holding the timekeeper_lock or when
+ * reading the fast-timekeeper tkrs, which are protected by their own
+ * locking and update logic.
+ */
+static inline u64 tk_clock_read(struct tk_read_base *tkr)
+{
+ struct clocksource *clock = READ_ONCE(tkr->clock);
+
+ return clock->read(clock);
+}
+
#ifdef CONFIG_DEBUG_TIMEKEEPING
#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
@@ -175,7 +195,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr)
*/
do {
seq = read_seqcount_begin(&tk_core.seq);
- now = tkr->read(tkr->clock);
+ now = tk_clock_read(tkr);
last = tkr->cycle_last;
mask = tkr->mask;
max = tkr->clock->max_cycles;
@@ -209,7 +229,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr)
u64 cycle_now, delta;
/* read clocksource */
- cycle_now = tkr->read(tkr->clock);
+ cycle_now = tk_clock_read(tkr);
/* calculate the delta since the last update_wall_time */
delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
@@ -238,12 +258,10 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
++tk->cs_was_changed_seq;
old_clock = tk->tkr_mono.clock;
tk->tkr_mono.clock = clock;
- tk->tkr_mono.read = clock->read;
tk->tkr_mono.mask = clock->mask;
- tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock);
+ tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono);
tk->tkr_raw.clock = clock;
- tk->tkr_raw.read = clock->read;
tk->tkr_raw.mask = clock->mask;
tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;
@@ -262,7 +280,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
/* Go back from cycles -> shifted ns */
tk->xtime_interval = interval * clock->mult;
tk->xtime_remainder = ntpinterval - tk->xtime_interval;
- tk->raw_interval = (interval * clock->mult) >> clock->shift;
+ tk->raw_interval = interval * clock->mult;
/* if changing clocks, convert xtime_nsec shift units */
if (old_clock) {
@@ -404,7 +422,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
now += timekeeping_delta_to_ns(tkr,
clocksource_delta(
- tkr->read(tkr->clock),
+ tk_clock_read(tkr),
tkr->cycle_last,
tkr->mask));
} while (read_seqcount_retry(&tkf->seq, seq));
@@ -461,6 +479,10 @@ static u64 dummy_clock_read(struct clocksource *cs)
return cycles_at_suspend;
}
+static struct clocksource dummy_clock = {
+ .read = dummy_clock_read,
+};
+
/**
* halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
* @tk: Timekeeper to snapshot.
@@ -477,13 +499,13 @@ static void halt_fast_timekeeper(struct timekeeper *tk)
struct tk_read_base *tkr = &tk->tkr_mono;
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
- cycles_at_suspend = tkr->read(tkr->clock);
- tkr_dummy.read = dummy_clock_read;
+ cycles_at_suspend = tk_clock_read(tkr);
+ tkr_dummy.clock = &dummy_clock;
update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
tkr = &tk->tkr_raw;
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
- tkr_dummy.read = dummy_clock_read;
+ tkr_dummy.clock = &dummy_clock;
update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}
@@ -649,11 +671,10 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
*/
static void timekeeping_forward_now(struct timekeeper *tk)
{
- struct clocksource *clock = tk->tkr_mono.clock;
u64 cycle_now, delta;
u64 nsec;
- cycle_now = tk->tkr_mono.read(clock);
+ cycle_now = tk_clock_read(&tk->tkr_mono);
delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
tk->tkr_mono.cycle_last = cycle_now;
tk->tkr_raw.cycle_last = cycle_now;
@@ -929,8 +950,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
do {
seq = read_seqcount_begin(&tk_core.seq);
-
- now = tk->tkr_mono.read(tk->tkr_mono.clock);
+ now = tk_clock_read(&tk->tkr_mono);
systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
base_real = ktime_add(tk->tkr_mono.base,
@@ -1108,7 +1128,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
* Check whether the system counter value provided by the
* device driver is on the current timekeeping interval.
*/
- now = tk->tkr_mono.read(tk->tkr_mono.clock);
+ now = tk_clock_read(&tk->tkr_mono);
interval_start = tk->tkr_mono.cycle_last;
if (!cycle_between(interval_start, cycles, now)) {
clock_was_set_seq = tk->clock_was_set_seq;
@@ -1629,7 +1649,7 @@ void timekeeping_resume(void)
* The less preferred source will only be tried if there is no better
* usable source. The rtc part is handled separately in rtc core code.
*/
- cycle_now = tk->tkr_mono.read(clock);
+ cycle_now = tk_clock_read(&tk->tkr_mono);
if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
cycle_now > tk->tkr_mono.cycle_last) {
u64 nsec, cyc_delta;
@@ -1976,7 +1996,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
u32 shift, unsigned int *clock_set)
{
u64 interval = tk->cycle_interval << shift;
- u64 raw_nsecs;
+ u64 snsec_per_sec;
/* If the offset is smaller than a shifted interval, do nothing */
if (offset < interval)
@@ -1991,14 +2011,15 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
*clock_set |= accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
- raw_nsecs = (u64)tk->raw_interval << shift;
- raw_nsecs += tk->raw_time.tv_nsec;
- if (raw_nsecs >= NSEC_PER_SEC) {
- u64 raw_secs = raw_nsecs;
- raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
- tk->raw_time.tv_sec += raw_secs;
+ tk->tkr_raw.xtime_nsec += (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift;
+ tk->tkr_raw.xtime_nsec += tk->raw_interval << shift;
+ snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
+ while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) {
+ tk->tkr_raw.xtime_nsec -= snsec_per_sec;
+ tk->raw_time.tv_sec++;
}
- tk->raw_time.tv_nsec = raw_nsecs;
+ tk->raw_time.tv_nsec = tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift;
+ tk->tkr_raw.xtime_nsec -= (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift;
/* Accumulate error between NTP and clock interval */
tk->ntp_error += tk->ntp_tick << shift;
@@ -2030,7 +2051,7 @@ void update_wall_time(void)
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
offset = real_tk->cycle_interval;
#else
- offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock),
+ offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
#endif
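
The timekeeping hunks above replace every tkr->read(tkr->clock) call with tk_clock_read(), which loads the clocksource pointer exactly once so the ->read() callback is always invoked on the same object it was fetched from, even if the timekeeper switches clocksources concurrently. A self-contained sketch of that idea (illustrative names and types, not the kernel's) follows.

#include <stdint.h>

struct sketch_clocksource {
	uint64_t (*read)(struct sketch_clocksource *cs);
};

struct sketch_tk_read_base {
	struct sketch_clocksource *clock;
};

/* Poor man's READ_ONCE(): force a single, non-reloaded read of the pointer. */
#define SKETCH_READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static inline uint64_t sketch_tk_clock_read(struct sketch_tk_read_base *tkr)
{
	struct sketch_clocksource *clock = SKETCH_READ_ONCE(tkr->clock);

	/* Pointer and callback always agree, even if tkr->clock changes. */
	return clock->read(clock);
}
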
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9e5841dc14b5..b308be30dfb9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4337,9 +4337,6 @@ static int ftrace_process_regex(struct ftrace_iterator *iter,
command = strsep(&next, ":");
- if (WARN_ON_ONCE(!tr))
- return -EINVAL;
-
mutex_lock(&ftrace_cmd_mutex);
list_for_each_entry(p, &ftrace_commands, list) {
if (strcmp(p->name, command) == 0) {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1122f151466f..091e801145c9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6881,6 +6881,9 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
char *number;
int ret;
+ if (!tr)
+ return -ENODEV;
+
/* hash funcs only work with set_ftrace_filter */
if (!enable)
return -EINVAL;
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index a3bddbfd0874..a0910c0cdf2e 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -654,6 +654,9 @@ ftrace_trace_onoff_callback(struct trace_array *tr, struct ftrace_hash *hash,
{
struct ftrace_probe_ops *ops;
+ if (!tr)
+ return -ENODEV;
+
/* we register both traceon and traceoff to this callback */
if (strcmp(cmd, "traceon") == 0)
ops = param ? &traceon_count_probe_ops : &traceon_probe_ops;
@@ -670,6 +673,9 @@ ftrace_stacktrace_callback(struct trace_array *tr, struct ftrace_hash *hash,
{
struct ftrace_probe_ops *ops;
+ if (!tr)
+ return -ENODEV;
+
ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops;
return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd,
@@ -682,6 +688,9 @@ ftrace_dump_callback(struct trace_array *tr, struct ftrace_hash *hash,
{
struct ftrace_probe_ops *ops;
+ if (!tr)
+ return -ENODEV;
+
ops = &dump_probe_ops;
/* Only dump once. */
@@ -695,6 +704,9 @@ ftrace_cpudump_callback(struct trace_array *tr, struct ftrace_hash *hash,
{
struct ftrace_probe_ops *ops;
+ if (!tr)
+ return -ENODEV;
+
ops = &cpudump_probe_ops;
/* Only dump once. */
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index c129fca6ec99..b53c8d369163 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -707,20 +707,16 @@ static int create_trace_kprobe(int argc, char **argv)
pr_info("Probe point is not specified.\n");
return -EINVAL;
}
- if (isdigit(argv[1][0])) {
- /* an address specified */
- ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);
- if (ret) {
- pr_info("Failed to parse address.\n");
- return ret;
- }
- } else {
+
+	/* Try to parse an address. If that fails, try to read the
+	 * input as a symbol. */
+ if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) {
/* a symbol specified */
symbol = argv[1];
/* TODO: support .init module functions */
ret = traceprobe_split_symbol_offset(symbol, &offset);
if (ret) {
- pr_info("Failed to parse symbol.\n");
+ pr_info("Failed to parse either an address or a symbol.\n");
return ret;
}
if (offset && is_return &&
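
The create_trace_kprobe() change above stops keying on a leading digit and instead tries a full numeric parse first, falling back to symbol handling only when that fails, which lets probes target symbols in modules whose names start with a digit. A user-space sketch of that ordering (hypothetical helper name, not the tracing code):

#include <stdlib.h>

/*
 * Try the token as a plain address first ("0xffffffff81000000"); only
 * if the whole token is not numeric, hand it back as a symbol name
 * ("do_sys_open+0") for further parsing.
 */
static int sketch_parse_probe_point(const char *tok, unsigned long *addr,
				    const char **symbol)
{
	char *end;
	unsigned long val = strtoul(tok, &end, 0);

	if (end != tok && *end == '\0') {
		*addr = val;
		*symbol = NULL;
		return 0;
	}

	*addr = 0;
	*symbol = tok;
	return 0;
}
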
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 76aa04d4c925..b4a751e8f9d6 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -409,7 +409,9 @@ static const struct file_operations stack_trace_fops = {
static int
stack_trace_filter_open(struct inode *inode, struct file *file)
{
- return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER,
+ struct ftrace_ops *ops = inode->i_private;
+
+ return ftrace_regex_open(ops, FTRACE_ITER_FILTER,
inode, file);
}
@@ -476,7 +478,7 @@ static __init int stack_trace_init(void)
NULL, &stack_trace_fops);
trace_create_file("stack_trace_filter", 0444, d_tracer,
- NULL, &stack_trace_filter_fops);
+ &trace_ops, &stack_trace_filter_fops);
if (stack_trace_filter_buf[0])
ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c74bf39ef764..a86688fabc55 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2864,11 +2864,11 @@ bool flush_work(struct work_struct *work)
EXPORT_SYMBOL_GPL(flush_work);
struct cwt_wait {
- wait_queue_t wait;
+ wait_queue_entry_t wait;
struct work_struct *work;
};
-static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e4587ebe52c7..9c5d40a50930 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1052,6 +1052,7 @@ config DEBUG_LOCK_ALLOC
depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
select DEBUG_SPINLOCK
select DEBUG_MUTEXES
+ select DEBUG_RT_MUTEXES if RT_MUTEXES
select LOCKDEP
help
This feature will check whether any held lock (spinlock, rwlock,
@@ -1067,6 +1068,7 @@ config PROVE_LOCKING
select LOCKDEP
select DEBUG_SPINLOCK
select DEBUG_MUTEXES
+ select DEBUG_RT_MUTEXES if RT_MUTEXES
select DEBUG_LOCK_ALLOC
select TRACE_IRQFLAGS
default n
@@ -1121,6 +1123,7 @@ config LOCK_STAT
select LOCKDEP
select DEBUG_SPINLOCK
select DEBUG_MUTEXES
+ select DEBUG_RT_MUTEXES if RT_MUTEXES
select DEBUG_LOCK_ALLOC
default n
help
@@ -1301,189 +1304,7 @@ config DEBUG_CREDENTIALS
If unsure, say N.
-menu "RCU Debugging"
-
-config PROVE_RCU
- def_bool PROVE_LOCKING
-
-config PROVE_RCU_REPEATEDLY
- bool "RCU debugging: don't disable PROVE_RCU on first splat"
- depends on PROVE_RCU
- default n
- help
- By itself, PROVE_RCU will disable checking upon issuing the
- first warning (or "splat"). This feature prevents such
- disabling, allowing multiple RCU-lockdep warnings to be printed
- on a single reboot.
-
- Say Y to allow multiple RCU-lockdep warnings per boot.
-
- Say N if you are unsure.
-
-config SPARSE_RCU_POINTER
- bool "RCU debugging: sparse-based checks for pointer usage"
- default n
- help
- This feature enables the __rcu sparse annotation for
- RCU-protected pointers. This annotation will cause sparse
- to flag any non-RCU used of annotated pointers. This can be
- helpful when debugging RCU usage. Please note that this feature
- is not intended to enforce code cleanliness; it is instead merely
- a debugging aid.
-
- Say Y to make sparse flag questionable use of RCU-protected pointers
-
- Say N if you are unsure.
-
-config TORTURE_TEST
- tristate
- default n
-
-config RCU_PERF_TEST
- tristate "performance tests for RCU"
- depends on DEBUG_KERNEL
- select TORTURE_TEST
- select SRCU
- select TASKS_RCU
- default n
- help
- This option provides a kernel module that runs performance
- tests on the RCU infrastructure. The kernel module may be built
- after the fact on the running kernel to be tested, if desired.
-
- Say Y here if you want RCU performance tests to be built into
- the kernel.
- Say M if you want the RCU performance tests to build as a module.
- Say N if you are unsure.
-
-config RCU_TORTURE_TEST
- tristate "torture tests for RCU"
- depends on DEBUG_KERNEL
- select TORTURE_TEST
- select SRCU
- select TASKS_RCU
- default n
- help
- This option provides a kernel module that runs torture tests
- on the RCU infrastructure. The kernel module may be built
- after the fact on the running kernel to be tested, if desired.
-
- Say Y here if you want RCU torture tests to be built into
- the kernel.
- Say M if you want the RCU torture tests to build as a module.
- Say N if you are unsure.
-
-config RCU_TORTURE_TEST_SLOW_PREINIT
- bool "Slow down RCU grace-period pre-initialization to expose races"
- depends on RCU_TORTURE_TEST
- help
- This option delays grace-period pre-initialization (the
- propagation of CPU-hotplug changes up the rcu_node combining
- tree) for a few jiffies between initializing each pair of
- consecutive rcu_node structures. This helps to expose races
- involving grace-period pre-initialization, in other words, it
- makes your kernel less stable. It can also greatly increase
- grace-period latency, especially on systems with large numbers
- of CPUs. This is useful when torture-testing RCU, but in
- almost no other circumstance.
-
- Say Y here if you want your system to crash and hang more often.
- Say N if you want a sane system.
-
-config RCU_TORTURE_TEST_SLOW_PREINIT_DELAY
- int "How much to slow down RCU grace-period pre-initialization"
- range 0 5
- default 3
- depends on RCU_TORTURE_TEST_SLOW_PREINIT
- help
- This option specifies the number of jiffies to wait between
- each rcu_node structure pre-initialization step.
-
-config RCU_TORTURE_TEST_SLOW_INIT
- bool "Slow down RCU grace-period initialization to expose races"
- depends on RCU_TORTURE_TEST
- help
- This option delays grace-period initialization for a few
- jiffies between initializing each pair of consecutive
- rcu_node structures. This helps to expose races involving
- grace-period initialization, in other words, it makes your
- kernel less stable. It can also greatly increase grace-period
- latency, especially on systems with large numbers of CPUs.
- This is useful when torture-testing RCU, but in almost no
- other circumstance.
-
- Say Y here if you want your system to crash and hang more often.
- Say N if you want a sane system.
-
-config RCU_TORTURE_TEST_SLOW_INIT_DELAY
- int "How much to slow down RCU grace-period initialization"
- range 0 5
- default 3
- depends on RCU_TORTURE_TEST_SLOW_INIT
- help
- This option specifies the number of jiffies to wait between
- each rcu_node structure initialization.
-
-config RCU_TORTURE_TEST_SLOW_CLEANUP
- bool "Slow down RCU grace-period cleanup to expose races"
- depends on RCU_TORTURE_TEST
- help
- This option delays grace-period cleanup for a few jiffies
- between cleaning up each pair of consecutive rcu_node
- structures. This helps to expose races involving grace-period
- cleanup, in other words, it makes your kernel less stable.
- It can also greatly increase grace-period latency, especially
- on systems with large numbers of CPUs. This is useful when
- torture-testing RCU, but in almost no other circumstance.
-
- Say Y here if you want your system to crash and hang more often.
- Say N if you want a sane system.
-
-config RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY
- int "How much to slow down RCU grace-period cleanup"
- range 0 5
- default 3
- depends on RCU_TORTURE_TEST_SLOW_CLEANUP
- help
- This option specifies the number of jiffies to wait between
- each rcu_node structure cleanup operation.
-
-config RCU_CPU_STALL_TIMEOUT
- int "RCU CPU stall timeout in seconds"
- depends on RCU_STALL_COMMON
- range 3 300
- default 21
- help
- If a given RCU grace period extends more than the specified
- number of seconds, a CPU stall warning is printed. If the
- RCU grace period persists, additional CPU stall warnings are
- printed at more widely spaced intervals.
-
-config RCU_TRACE
- bool "Enable tracing for RCU"
- depends on DEBUG_KERNEL
- default y if TREE_RCU
- select TRACE_CLOCK
- help
- This option provides tracing in RCU which presents stats
- in debugfs for debugging RCU implementation. It also enables
- additional tracepoints for ftrace-style event tracing.
-
- Say Y here if you want to enable RCU tracing
- Say N if you are unsure.
-
-config RCU_EQS_DEBUG
- bool "Provide debugging asserts for adding NO_HZ support to an arch"
- depends on DEBUG_KERNEL
- help
- This option provides consistency checks in RCU's handling of
- NO_HZ. These checks have proven quite helpful in detecting
- bugs in arch-specific NO_HZ code.
-
- Say N here if you need ultimate kernel/user switch latencies
- Say Y if you are unsure
-
-endmenu # "RCU Debugging"
+source "kernel/rcu/Kconfig.debug"
config DEBUG_WQ_FORCE_RR_CPU
bool "Force round-robin CPU selection for unbound work items"
diff --git a/lib/Makefile b/lib/Makefile
index 0166fbc0fa81..07fbe6a75692 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -25,9 +25,6 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
earlycpio.o seq_buf.o siphash.o \
nmi_backtrace.o nodemask.o win_minmax.o
-CFLAGS_radix-tree.o += -DCONFIG_SPARSE_RCU_POINTER
-CFLAGS_idr.o += -DCONFIG_SPARSE_RCU_POINTER
-
lib-$(CONFIG_MMU) += ioremap.o
lib-$(CONFIG_SMP) += cpumask.o
lib-$(CONFIG_DMA_NOOP_OPS) += dma-noop.o
diff --git a/lib/cmdline.c b/lib/cmdline.c
index 3c6432df7e63..4c0888c4a68d 100644
--- a/lib/cmdline.c
+++ b/lib/cmdline.c
@@ -23,14 +23,14 @@
* the values[M, M+1, ..., N] into the ints array in get_options.
*/
-static int get_range(char **str, int *pint)
+static int get_range(char **str, int *pint, int n)
{
int x, inc_counter, upper_range;
(*str)++;
upper_range = simple_strtol((*str), NULL, 0);
inc_counter = upper_range - *pint;
- for (x = *pint; x < upper_range; x++)
+ for (x = *pint; n && x < upper_range; x++, n--)
*pint++ = x;
return inc_counter;
}
@@ -97,7 +97,7 @@ char *get_options(const char *str, int nints, int *ints)
break;
if (res == 3) {
int range_nums;
- range_nums = get_range((char **)&str, ints + i);
+ range_nums = get_range((char **)&str, ints + i, nints - i);
if (range_nums < 0)
break;
/*
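
The get_range() change above threads the number of remaining slots through to the expansion loop, so a command-line range such as "1-100000" can no longer write past the caller's ints[] array, while still reporting how many values the range described. A stand-alone sketch of that bounded expansion (hypothetical names):

/*
 * Expand the inclusive range [first, last] into out[], writing at most
 * n entries; the return value is the size the range asked for, which a
 * caller can compare against n to detect truncation.
 */
static int sketch_fill_range(int *out, int n, int first, int last)
{
	int x, written = 0;

	for (x = first; x <= last && written < n; x++)
		out[written++] = x;

	return last - first + 1;
}
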
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 81dedaab36cc..4731a0895760 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -43,6 +43,38 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
}
EXPORT_SYMBOL(cpumask_any_but);
+/**
+ * cpumask_next_wrap - helper to implement for_each_cpu_wrap
+ * @n: the cpu prior to the place to search
+ * @mask: the cpumask pointer
+ * @start: the start point of the iteration
+ * @wrap: assume @n crossing @start terminates the iteration
+ *
+ * Returns >= nr_cpu_ids on completion
+ *
+ * Note: the @wrap argument is required for the start condition when
+ * we cannot assume @start is set in @mask.
+ */
+int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
+{
+ int next;
+
+again:
+ next = cpumask_next(n, mask);
+
+ if (wrap && n < start && next >= start) {
+ return nr_cpumask_bits;
+
+ } else if (next >= nr_cpumask_bits) {
+ wrap = true;
+ n = -1;
+ goto again;
+ }
+
+ return next;
+}
+EXPORT_SYMBOL(cpumask_next_wrap);
+
/* These are not inline because of header tangles. */
#ifdef CONFIG_CPUMASK_OFFSTACK
/**
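
cpumask_next_wrap(), added above, is the building block for a wrapping iterator (for_each_cpu_wrap): start the walk at an arbitrary CPU, continue to the end of the mask, wrap to the beginning, and stop once the walk crosses the start point again. The plain-C sketch below (an unsigned long standing in for a real cpumask, sketch_* names invented) shows the same wrap-and-terminate logic outside the kernel.

#include <stdbool.h>

static void sketch_walk_wrapping(unsigned long mask, int nbits, int start)
{
	int n = start - 1;
	bool wrapped = false;

	for (;;) {
		int next = n + 1;

		/* Find the next set bit strictly after n. */
		while (next < nbits && !(mask & (1UL << next)))
			next++;

		if (next >= nbits) {
			if (wrapped)
				break;		/* nothing left after wrapping */
			wrapped = true;
			n = -1;			/* restart the search from bit 0 */
			continue;
		}

		if (wrapped && next >= start)
			break;			/* completed the full circle */

		/* "Visit" CPU next here. */
		n = next;
	}
}
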
diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c
index 74a54b7f2562..9f79547d1b97 100644
--- a/lib/libcrc32c.c
+++ b/lib/libcrc32c.c
@@ -43,7 +43,7 @@ static struct crypto_shash *tfm;
u32 crc32c(u32 crc, const void *address, unsigned int length)
{
SHASH_DESC_ON_STACK(shash, tfm);
- u32 *ctx = (u32 *)shash_desc_ctx(shash);
+ u32 ret, *ctx = (u32 *)shash_desc_ctx(shash);
int err;
shash->tfm = tfm;
@@ -53,7 +53,9 @@ u32 crc32c(u32 crc, const void *address, unsigned int length)
err = crypto_shash_update(shash, address, length);
BUG_ON(err);
- return *ctx;
+ ret = *ctx;
+ barrier_data(ctx);
+ return ret;
}
EXPORT_SYMBOL(crc32c);
diff --git a/lib/locking-selftest-rtmutex.h b/lib/locking-selftest-rtmutex.h
new file mode 100644
index 000000000000..e3cb83989d16
--- /dev/null
+++ b/lib/locking-selftest-rtmutex.h
@@ -0,0 +1,11 @@
+#undef LOCK
+#define LOCK RTL
+
+#undef UNLOCK
+#define UNLOCK RTU
+
+#undef RLOCK
+#undef WLOCK
+
+#undef INIT
+#define INIT RTI
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index f3a217ea0388..6f2b135dc5e8 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -21,6 +21,7 @@
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
#include <linux/irqflags.h>
+#include <linux/rtmutex.h>
/*
* Change this to 1 if you want to see the failure printouts:
@@ -46,6 +47,7 @@ __setup("debug_locks_verbose=", setup_debug_locks_verbose);
#define LOCKTYPE_MUTEX 0x4
#define LOCKTYPE_RWSEM 0x8
#define LOCKTYPE_WW 0x10
+#define LOCKTYPE_RTMUTEX 0x20
static struct ww_acquire_ctx t, t2;
static struct ww_mutex o, o2, o3;
@@ -74,6 +76,15 @@ static DECLARE_RWSEM(rwsem_B);
static DECLARE_RWSEM(rwsem_C);
static DECLARE_RWSEM(rwsem_D);
+#ifdef CONFIG_RT_MUTEXES
+
+static DEFINE_RT_MUTEX(rtmutex_A);
+static DEFINE_RT_MUTEX(rtmutex_B);
+static DEFINE_RT_MUTEX(rtmutex_C);
+static DEFINE_RT_MUTEX(rtmutex_D);
+
+#endif
+
/*
* Locks that we initialize dynamically as well so that
* e.g. X1 and X2 becomes two instances of the same class,
@@ -108,6 +119,17 @@ static DECLARE_RWSEM(rwsem_Y2);
static DECLARE_RWSEM(rwsem_Z1);
static DECLARE_RWSEM(rwsem_Z2);
+#ifdef CONFIG_RT_MUTEXES
+
+static DEFINE_RT_MUTEX(rtmutex_X1);
+static DEFINE_RT_MUTEX(rtmutex_X2);
+static DEFINE_RT_MUTEX(rtmutex_Y1);
+static DEFINE_RT_MUTEX(rtmutex_Y2);
+static DEFINE_RT_MUTEX(rtmutex_Z1);
+static DEFINE_RT_MUTEX(rtmutex_Z2);
+
+#endif
+
/*
* non-inlined runtime initializers, to let separate locks share
* the same lock-class:
@@ -129,6 +151,17 @@ INIT_CLASS_FUNC(Z)
static void init_shared_classes(void)
{
+#ifdef CONFIG_RT_MUTEXES
+ static struct lock_class_key rt_X, rt_Y, rt_Z;
+
+ __rt_mutex_init(&rtmutex_X1, __func__, &rt_X);
+ __rt_mutex_init(&rtmutex_X2, __func__, &rt_X);
+ __rt_mutex_init(&rtmutex_Y1, __func__, &rt_Y);
+ __rt_mutex_init(&rtmutex_Y2, __func__, &rt_Y);
+ __rt_mutex_init(&rtmutex_Z1, __func__, &rt_Z);
+ __rt_mutex_init(&rtmutex_Z2, __func__, &rt_Z);
+#endif
+
init_class_X(&lock_X1, &rwlock_X1, &mutex_X1, &rwsem_X1);
init_class_X(&lock_X2, &rwlock_X2, &mutex_X2, &rwsem_X2);
@@ -193,6 +226,10 @@ static void init_shared_classes(void)
#define MU(x) mutex_unlock(&mutex_##x)
#define MI(x) mutex_init(&mutex_##x)
+#define RTL(x) rt_mutex_lock(&rtmutex_##x)
+#define RTU(x) rt_mutex_unlock(&rtmutex_##x)
+#define RTI(x) rt_mutex_init(&rtmutex_##x)
+
#define WSL(x) down_write(&rwsem_##x)
#define WSU(x) up_write(&rwsem_##x)
@@ -264,6 +301,11 @@ GENERATE_TESTCASE(AA_wsem)
#include "locking-selftest-rsem.h"
GENERATE_TESTCASE(AA_rsem)
+#ifdef CONFIG_RT_MUTEXES
+#include "locking-selftest-rtmutex.h"
+GENERATE_TESTCASE(AA_rtmutex);
+#endif
+
#undef E
/*
@@ -345,6 +387,11 @@ GENERATE_TESTCASE(ABBA_wsem)
#include "locking-selftest-rsem.h"
GENERATE_TESTCASE(ABBA_rsem)
+#ifdef CONFIG_RT_MUTEXES
+#include "locking-selftest-rtmutex.h"
+GENERATE_TESTCASE(ABBA_rtmutex);
+#endif
+
#undef E
/*
@@ -373,6 +420,11 @@ GENERATE_TESTCASE(ABBCCA_wsem)
#include "locking-selftest-rsem.h"
GENERATE_TESTCASE(ABBCCA_rsem)
+#ifdef CONFIG_RT_MUTEXES
+#include "locking-selftest-rtmutex.h"
+GENERATE_TESTCASE(ABBCCA_rtmutex);
+#endif
+
#undef E
/*
@@ -401,6 +453,11 @@ GENERATE_TESTCASE(ABCABC_wsem)
#include "locking-selftest-rsem.h"
GENERATE_TESTCASE(ABCABC_rsem)
+#ifdef CONFIG_RT_MUTEXES
+#include "locking-selftest-rtmutex.h"
+GENERATE_TESTCASE(ABCABC_rtmutex);
+#endif
+
#undef E
/*
@@ -430,6 +487,11 @@ GENERATE_TESTCASE(ABBCCDDA_wsem)
#include "locking-selftest-rsem.h"
GENERATE_TESTCASE(ABBCCDDA_rsem)
+#ifdef CONFIG_RT_MUTEXES
+#include "locking-selftest-rtmutex.h"
+GENERATE_TESTCASE(ABBCCDDA_rtmutex);
+#endif
+
#undef E
/*
@@ -458,6 +520,11 @@ GENERATE_TESTCASE(ABCDBDDA_wsem)
#include "locking-selftest-rsem.h"
GENERATE_TESTCASE(ABCDBDDA_rsem)
+#ifdef CONFIG_RT_MUTEXES
+#include "locking-selftest-rtmutex.h"
+GENERATE_TESTCASE(ABCDBDDA_rtmutex);
+#endif
+
#undef E
/*
@@ -486,6 +553,11 @@ GENERATE_TESTCASE(ABCDBCDA_wsem)
#include "locking-selftest-rsem.h"
GENERATE_TESTCASE(ABCDBCDA_rsem)
+#ifdef CONFIG_RT_MUTEXES
+#include "locking-selftest-rtmutex.h"
+GENERATE_TESTCASE(ABCDBCDA_rtmutex);
+#endif
+
#undef E
/*
@@ -513,33 +585,10 @@ GENERATE_TESTCASE(double_unlock_wsem)
#include "locking-selftest-rsem.h"
GENERATE_TESTCASE(double_unlock_rsem)
-#undef E
-
-/*
- * Bad unlock ordering:
- */
-#define E() \
- \
- LOCK(A); \
- LOCK(B); \
- UNLOCK(A); /* fail */ \
- UNLOCK(B);
-
-/*
- * 6 testcases:
- */
-#include "locking-selftest-spin.h"
-GENERATE_TESTCASE(bad_unlock_order_spin)
-#include "locking-selftest-wlock.h"
-GENERATE_TESTCASE(bad_unlock_order_wlock)
-#include "locking-selftest-rlock.h"
-GENERATE_TESTCASE(bad_unlock_order_rlock)
-#include "locking-selftest-mutex.h"
-GENERATE_TESTCASE(bad_unlock_order_mutex)
-#include "locking-selftest-wsem.h"
-GENERATE_TESTCASE(bad_unlock_order_wsem)
-#include "locking-selftest-rsem.h"
-GENERATE_TESTCASE(bad_unlock_order_rsem)
+#ifdef CONFIG_RT_MUTEXES
+#include "locking-selftest-rtmutex.h"
+GENERATE_TESTCASE(double_unlock_rtmutex);
+#endif
#undef E
@@ -567,6 +616,11 @@ GENERATE_TESTCASE(init_held_wsem)
#include "locking-selftest-rsem.h"
GENERATE_TESTCASE(init_held_rsem)
+#ifdef CONFIG_RT_MUTEXES
+#include "locking-selftest-rtmutex.h"
+GENERATE_TESTCASE(init_held_rtmutex);
+#endif
+
#undef E
/*
@@ -916,6 +970,9 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
# define I_MUTEX(x) lockdep_reset_lock(&mutex_##x.dep_map)
# define I_RWSEM(x) lockdep_reset_lock(&rwsem_##x.dep_map)
# define I_WW(x) lockdep_reset_lock(&x.dep_map)
+#ifdef CONFIG_RT_MUTEXES
+# define I_RTMUTEX(x) lockdep_reset_lock(&rtmutex_##x.dep_map)
+#endif
#else
# define I_SPINLOCK(x)
# define I_RWLOCK(x)
@@ -924,12 +981,23 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
# define I_WW(x)
#endif
+#ifndef I_RTMUTEX
+# define I_RTMUTEX(x)
+#endif
+
+#ifdef CONFIG_RT_MUTEXES
+#define I2_RTMUTEX(x) rt_mutex_init(&rtmutex_##x)
+#else
+#define I2_RTMUTEX(x)
+#endif
+
#define I1(x) \
do { \
I_SPINLOCK(x); \
I_RWLOCK(x); \
I_MUTEX(x); \
I_RWSEM(x); \
+ I_RTMUTEX(x); \
} while (0)
#define I2(x) \
@@ -938,6 +1006,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
rwlock_init(&rwlock_##x); \
mutex_init(&mutex_##x); \
init_rwsem(&rwsem_##x); \
+ I2_RTMUTEX(x); \
} while (0)
static void reset_locks(void)
@@ -1013,6 +1082,12 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask)
reset_locks();
}
+#ifdef CONFIG_RT_MUTEXES
+#define dotest_rt(fn, e, m) dotest((fn), (e), (m))
+#else
+#define dotest_rt(fn, e, m)
+#endif
+
static inline void print_testname(const char *testname)
{
printk("%33s:", testname);
@@ -1050,6 +1125,7 @@ static inline void print_testname(const char *testname)
dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \
dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \
dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \
+ dotest_rt(name##_rtmutex, FAILURE, LOCKTYPE_RTMUTEX); \
pr_cont("\n");
#define DO_TESTCASE_6_SUCCESS(desc, name) \
@@ -1060,6 +1136,7 @@ static inline void print_testname(const char *testname)
dotest(name##_mutex, SUCCESS, LOCKTYPE_MUTEX); \
dotest(name##_wsem, SUCCESS, LOCKTYPE_RWSEM); \
dotest(name##_rsem, SUCCESS, LOCKTYPE_RWSEM); \
+ dotest_rt(name##_rtmutex, SUCCESS, LOCKTYPE_RTMUTEX); \
pr_cont("\n");
/*
@@ -1073,6 +1150,7 @@ static inline void print_testname(const char *testname)
dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \
dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \
dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \
+ dotest_rt(name##_rtmutex, FAILURE, LOCKTYPE_RTMUTEX); \
pr_cont("\n");
#define DO_TESTCASE_2I(desc, name, nr) \
@@ -1825,7 +1903,6 @@ void locking_selftest(void)
DO_TESTCASE_6R("A-B-C-D-B-C-D-A deadlock", ABCDBCDA);
DO_TESTCASE_6("double unlock", double_unlock);
DO_TESTCASE_6("initialize held", init_held);
- DO_TESTCASE_6_SUCCESS("bad unlock order", bad_unlock_order);
printk(" --------------------------------------------------------------------------\n");
print_testname("recursive read-lock");
diff --git a/lib/refcount.c b/lib/refcount.c
index 9f906783987e..5d0582a9480c 100644
--- a/lib/refcount.c
+++ b/lib/refcount.c
@@ -37,6 +37,8 @@
#include <linux/refcount.h>
#include <linux/bug.h>
+#ifdef CONFIG_REFCOUNT_FULL
+
/**
* refcount_add_not_zero - add a value to a refcount unless it is 0
* @i: the value to add to the refcount
@@ -225,6 +227,7 @@ void refcount_dec(refcount_t *r)
WARN_ONCE(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n");
}
EXPORT_SYMBOL(refcount_dec);
+#endif /* CONFIG_REFCOUNT_FULL */
/**
* refcount_dec_if_one - decrement a refcount if it is 1
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 690d75b132fa..2fb007be0212 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -28,7 +28,7 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
/*
* It is valid to assume CPU-locality during early bootup:
*/
- if (system_state != SYSTEM_RUNNING)
+ if (system_state < SYSTEM_SCHEDULING)
goto out;
/*
diff --git a/mm/Kconfig b/mm/Kconfig
index beb7a455915d..398b46064544 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,7 +137,7 @@ config HAVE_MEMBLOCK_NODE_MAP
config HAVE_MEMBLOCK_PHYS_MAP
bool
-config HAVE_GENERIC_RCU_GUP
+config HAVE_GENERIC_GUP
bool
config ARCH_DISCARD_MEMBLOCK
diff --git a/mm/filemap.c b/mm/filemap.c
index 742034e56100..aea58e983a73 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -800,10 +800,10 @@ struct wait_page_key {
struct wait_page_queue {
struct page *page;
int bit_nr;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
};
-static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
+static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
struct wait_page_key *key = arg;
struct wait_page_queue *wait_page
@@ -866,7 +866,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
struct page *page, int bit_nr, int state, bool lock)
{
struct wait_page_queue wait_page;
- wait_queue_t *wait = &wait_page.wait;
+ wait_queue_entry_t *wait = &wait_page.wait;
int ret = 0;
init_wait(wait);
@@ -877,9 +877,9 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
for (;;) {
spin_lock_irq(&q->lock);
- if (likely(list_empty(&wait->task_list))) {
+ if (likely(list_empty(&wait->entry))) {
if (lock)
- __add_wait_queue_tail_exclusive(q, wait);
+ __add_wait_queue_entry_tail_exclusive(q, wait);
else
__add_wait_queue(q, wait);
SetPageWaiters(page);
@@ -939,7 +939,7 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
*
* Add an arbitrary @waiter to the wait queue for the nominated @page.
*/
-void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
+void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
{
wait_queue_head_t *q = page_waitqueue(page);
unsigned long flags;
diff --git a/mm/gup.c b/mm/gup.c
index b3c7214d710d..3ab78dc3db7d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -387,11 +387,6 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
/* mlock all present pages, but do not fault in new pages */
if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
return -ENOENT;
- /* For mm_populate(), just skip the stack guard page. */
- if ((*flags & FOLL_POPULATE) &&
- (stack_guard_page_start(vma, address) ||
- stack_guard_page_end(vma, address + PAGE_SIZE)))
- return -ENOENT;
if (*flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
if (*flags & FOLL_REMOTE)
@@ -1151,7 +1146,7 @@ struct page *get_dump_page(unsigned long addr)
#endif /* CONFIG_ELF_CORE */
/*
- * Generic RCU Fast GUP
+ * Generic Fast GUP
*
* get_user_pages_fast attempts to pin user pages by walking the page
* tables directly and avoids taking locks. Thus the walker needs to be
@@ -1172,8 +1167,8 @@ struct page *get_dump_page(unsigned long addr)
* Before activating this code, please be aware that the following assumptions
* are currently made:
*
- * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
- * pages containing page tables.
+ * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
+ * free pages containing page tables or TLB flushing requires IPI broadcast.
*
* *) ptes can be read atomically by the architecture.
*
@@ -1183,7 +1178,7 @@ struct page *get_dump_page(unsigned long addr)
*
* This code is based heavily on the PowerPC implementation by Nick Piggin.
*/
-#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
+#ifdef CONFIG_HAVE_GENERIC_GUP
#ifndef gup_get_pte
/*
@@ -1673,4 +1668,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
return ret;
}
-#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
+#endif /* CONFIG_HAVE_GENERIC_GUP */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a84909cf20d3..88c6167f194d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1426,8 +1426,11 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
*/
if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
page = pmd_page(*vmf->pmd);
+ if (!get_page_unless_zero(page))
+ goto out_unlock;
spin_unlock(vmf->ptl);
wait_on_page_locked(page);
+ put_page(page);
goto out;
}
@@ -1459,9 +1462,12 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
+ page_nid = -1;
+ if (!get_page_unless_zero(page))
+ goto out_unlock;
spin_unlock(vmf->ptl);
wait_on_page_locked(page);
- page_nid = -1;
+ put_page(page);
goto out;
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 945fd1ca49b5..df4ebdb2b10a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -652,7 +652,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
- cond_resched();
}
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 94172089f52f..d75b38b66ef6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -170,7 +170,7 @@ struct mem_cgroup_event {
*/
poll_table pt;
wait_queue_head_t *wqh;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
struct work_struct remove;
};
@@ -1479,10 +1479,10 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
struct oom_wait_info {
struct mem_cgroup *memcg;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
};
-static int memcg_oom_wake_function(wait_queue_t *wait,
+static int memcg_oom_wake_function(wait_queue_entry_t *wait,
unsigned mode, int sync, void *arg)
{
struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
@@ -1570,7 +1570,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
owait.wait.flags = 0;
owait.wait.func = memcg_oom_wake_function;
owait.wait.private = current;
- INIT_LIST_HEAD(&owait.wait.task_list);
+ INIT_LIST_HEAD(&owait.wait.entry);
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
mem_cgroup_mark_under_oom(memcg);
@@ -3725,7 +3725,7 @@ static void memcg_event_remove(struct work_struct *work)
*
* Called with wqh->lock held and interrupts disabled.
*/
-static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
+static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
int sync, void *key)
{
struct mem_cgroup_event *event =
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 342fac9ba89b..ecc183fd94f3 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1184,7 +1184,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* page_remove_rmap() in try_to_unmap_one(). So to determine page status
* correctly, we save a copy of the page flags at this time.
*/
- page_flags = p->flags;
+ if (PageHuge(p))
+ page_flags = hpage->flags;
+ else
+ page_flags = p->flags;
/*
* unpoison always clear PG_hwpoison inside page lock
diff --git a/mm/memory.c b/mm/memory.c
index 2e65df1831d9..bb11c474857e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2855,40 +2855,6 @@ out_release:
}
/*
- * This is like a special single-page "expand_{down|up}wards()",
- * except we must first make sure that 'address{-|+}PAGE_SIZE'
- * doesn't hit another vma.
- */
-static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
-{
- address &= PAGE_MASK;
- if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
- struct vm_area_struct *prev = vma->vm_prev;
-
- /*
- * Is there a mapping abutting this one below?
- *
- * That's only ok if it's the same stack mapping
- * that has gotten split..
- */
- if (prev && prev->vm_end == address)
- return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
-
- return expand_downwards(vma, address - PAGE_SIZE);
- }
- if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
- struct vm_area_struct *next = vma->vm_next;
-
- /* As VM_GROWSDOWN but s/below/above/ */
- if (next && next->vm_start == address + PAGE_SIZE)
- return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
-
- return expand_upwards(vma, address + PAGE_SIZE);
- }
- return 0;
-}
-
-/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
@@ -2904,10 +2870,6 @@ static int do_anonymous_page(struct vm_fault *vmf)
if (vma->vm_flags & VM_SHARED)
return VM_FAULT_SIGBUS;
- /* Check if we need to add a guard page to the stack */
- if (check_stack_guard_page(vma, vmf->address) < 0)
- return VM_FAULT_SIGSEGV;
-
/*
* Use pte_alloc() instead of pte_alloc_map(). We can't run
* pte_offset_map() on pmds where a huge pmd might be created
diff --git a/mm/mempool.c b/mm/mempool.c
index 47a659dedd44..1c0294858527 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -312,7 +312,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
void *element;
unsigned long flags;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
gfp_t gfp_temp;
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
diff --git a/mm/mmap.c b/mm/mmap.c
index f82741e199c0..a5e3dcd75e79 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -183,6 +183,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
unsigned long retval;
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
+ struct vm_area_struct *next;
unsigned long min_brk;
bool populate;
LIST_HEAD(uf);
@@ -229,7 +230,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
}
/* Check against existing mmap mappings. */
- if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
+ next = find_vma(mm, oldbrk);
+ if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
goto out;
/* Ok, looks good - let it rip. */
@@ -253,10 +255,22 @@ out:
static long vma_compute_subtree_gap(struct vm_area_struct *vma)
{
- unsigned long max, subtree_gap;
- max = vma->vm_start;
- if (vma->vm_prev)
- max -= vma->vm_prev->vm_end;
+ unsigned long max, prev_end, subtree_gap;
+
+ /*
+ * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
+ * allow two stack_guard_gaps between them here, and when choosing
+ * an unmapped area; whereas when expanding we only require one.
+ * That's a little inconsistent, but keeps the code here simpler.
+ */
+ max = vm_start_gap(vma);
+ if (vma->vm_prev) {
+ prev_end = vm_end_gap(vma->vm_prev);
+ if (max > prev_end)
+ max -= prev_end;
+ else
+ max = 0;
+ }
if (vma->vm_rb.rb_left) {
subtree_gap = rb_entry(vma->vm_rb.rb_left,
struct vm_area_struct, vm_rb)->rb_subtree_gap;
@@ -352,7 +366,7 @@ static void validate_mm(struct mm_struct *mm)
anon_vma_unlock_read(anon_vma);
}
- highest_address = vma->vm_end;
+ highest_address = vm_end_gap(vma);
vma = vma->vm_next;
i++;
}
@@ -541,7 +555,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
if (vma->vm_next)
vma_gap_update(vma->vm_next);
else
- mm->highest_vm_end = vma->vm_end;
+ mm->highest_vm_end = vm_end_gap(vma);
/*
* vma->vm_prev wasn't known when we followed the rbtree to find the
@@ -856,7 +870,7 @@ again:
vma_gap_update(vma);
if (end_changed) {
if (!next)
- mm->highest_vm_end = end;
+ mm->highest_vm_end = vm_end_gap(vma);
else if (!adjust_next)
vma_gap_update(next);
}
@@ -941,7 +955,7 @@ again:
* mm->highest_vm_end doesn't need any update
* in remove_next == 1 case.
*/
- VM_WARN_ON(mm->highest_vm_end != end);
+ VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
}
}
if (insert && file)
@@ -1787,7 +1801,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
while (true) {
/* Visit left subtree if it looks promising */
- gap_end = vma->vm_start;
+ gap_end = vm_start_gap(vma);
if (gap_end >= low_limit && vma->vm_rb.rb_left) {
struct vm_area_struct *left =
rb_entry(vma->vm_rb.rb_left,
@@ -1798,12 +1812,13 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
}
}
- gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
+ gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
check_current:
/* Check if current node has a suitable gap */
if (gap_start > high_limit)
return -ENOMEM;
- if (gap_end >= low_limit && gap_end - gap_start >= length)
+ if (gap_end >= low_limit &&
+ gap_end > gap_start && gap_end - gap_start >= length)
goto found;
/* Visit right subtree if it looks promising */
@@ -1825,8 +1840,8 @@ check_current:
vma = rb_entry(rb_parent(prev),
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_left) {
- gap_start = vma->vm_prev->vm_end;
- gap_end = vma->vm_start;
+ gap_start = vm_end_gap(vma->vm_prev);
+ gap_end = vm_start_gap(vma);
goto check_current;
}
}
@@ -1890,7 +1905,7 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
while (true) {
/* Visit right subtree if it looks promising */
- gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
+ gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
if (gap_start <= high_limit && vma->vm_rb.rb_right) {
struct vm_area_struct *right =
rb_entry(vma->vm_rb.rb_right,
@@ -1903,10 +1918,11 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
check_current:
/* Check if current node has a suitable gap */
- gap_end = vma->vm_start;
+ gap_end = vm_start_gap(vma);
if (gap_end < low_limit)
return -ENOMEM;
- if (gap_start <= high_limit && gap_end - gap_start >= length)
+ if (gap_start <= high_limit &&
+ gap_end > gap_start && gap_end - gap_start >= length)
goto found;
/* Visit left subtree if it looks promising */
@@ -1929,7 +1945,7 @@ check_current:
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_right) {
gap_start = vma->vm_prev ?
- vma->vm_prev->vm_end : 0;
+ vm_end_gap(vma->vm_prev) : 0;
goto check_current;
}
}
@@ -1967,7 +1983,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma, *prev;
struct vm_unmapped_area_info info;
if (len > TASK_SIZE - mmap_min_addr)
@@ -1978,9 +1994,10 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
if (addr) {
addr = PAGE_ALIGN(addr);
- vma = find_vma(mm, addr);
+ vma = find_vma_prev(mm, addr, &prev);
if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)) &&
+ (!prev || addr >= vm_end_gap(prev)))
return addr;
}
@@ -2003,7 +2020,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
const unsigned long len, const unsigned long pgoff,
const unsigned long flags)
{
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma, *prev;
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
struct vm_unmapped_area_info info;
@@ -2018,9 +2035,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
/* requesting a specific address */
if (addr) {
addr = PAGE_ALIGN(addr);
- vma = find_vma(mm, addr);
+ vma = find_vma_prev(mm, addr, &prev);
if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)) &&
+ (!prev || addr >= vm_end_gap(prev)))
return addr;
}
@@ -2155,21 +2173,19 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
* update accounting. This is shared with both the
* grow-up and grow-down cases.
*/
-static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
+static int acct_stack_growth(struct vm_area_struct *vma,
+ unsigned long size, unsigned long grow)
{
struct mm_struct *mm = vma->vm_mm;
struct rlimit *rlim = current->signal->rlim;
- unsigned long new_start, actual_size;
+ unsigned long new_start;
/* address space limit tests */
if (!may_expand_vm(mm, vma->vm_flags, grow))
return -ENOMEM;
/* Stack limit test */
- actual_size = size;
- if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
- actual_size -= PAGE_SIZE;
- if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
+ if (size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
return -ENOMEM;
/* mlock limit tests */
@@ -2207,16 +2223,32 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *next;
+ unsigned long gap_addr;
int error = 0;
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
- /* Guard against wrapping around to address 0. */
- if (address < PAGE_ALIGN(address+4))
- address = PAGE_ALIGN(address+4);
- else
+ /* Guard against exceeding limits of the address space. */
+ address &= PAGE_MASK;
+ if (address >= TASK_SIZE)
return -ENOMEM;
+ address += PAGE_SIZE;
+
+ /* Enforce stack_guard_gap */
+ gap_addr = address + stack_guard_gap;
+
+ /* Guard against overflow */
+ if (gap_addr < address || gap_addr > TASK_SIZE)
+ gap_addr = TASK_SIZE;
+
+ next = vma->vm_next;
+ if (next && next->vm_start < gap_addr) {
+ if (!(next->vm_flags & VM_GROWSUP))
+ return -ENOMEM;
+ /* Check that both stack segments have the same anon_vma? */
+ }
/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma)))
@@ -2261,7 +2293,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (vma->vm_next)
vma_gap_update(vma->vm_next);
else
- mm->highest_vm_end = address;
+ mm->highest_vm_end = vm_end_gap(vma);
spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
@@ -2282,6 +2314,8 @@ int expand_downwards(struct vm_area_struct *vma,
unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *prev;
+ unsigned long gap_addr;
int error;
address &= PAGE_MASK;
@@ -2289,6 +2323,17 @@ int expand_downwards(struct vm_area_struct *vma,
if (error)
return error;
+ /* Enforce stack_guard_gap */
+ gap_addr = address - stack_guard_gap;
+ if (gap_addr > address)
+ return -ENOMEM;
+ prev = vma->vm_prev;
+ if (prev && prev->vm_end > gap_addr) {
+ if (!(prev->vm_flags & VM_GROWSDOWN))
+ return -ENOMEM;
+ /* Check that both stack segments have the same anon_vma? */
+ }
+
/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
@@ -2343,28 +2388,25 @@ int expand_downwards(struct vm_area_struct *vma,
return error;
}
-/*
- * Note how expand_stack() refuses to expand the stack all the way to
- * abut the next virtual mapping, *unless* that mapping itself is also
- * a stack mapping. We want to leave room for a guard page, after all
- * (the guard page itself is not added here, that is done by the
- * actual page faulting logic)
- *
- * This matches the behavior of the guard page logic (see mm/memory.c:
- * check_stack_guard_page()), which only allows the guard page to be
- * removed under these circumstances.
- */
+/* enforced gap between the expanding stack and other mappings. */
+unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
+
+static int __init cmdline_parse_stack_guard_gap(char *p)
+{
+ unsigned long val;
+ char *endptr;
+
+ val = simple_strtoul(p, &endptr, 10);
+ if (!*endptr)
+ stack_guard_gap = val << PAGE_SHIFT;
+
+ return 0;
+}
+__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
+
#ifdef CONFIG_STACK_GROWSUP
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
- struct vm_area_struct *next;
-
- address &= PAGE_MASK;
- next = vma->vm_next;
- if (next && next->vm_start == address + PAGE_SIZE) {
- if (!(next->vm_flags & VM_GROWSUP))
- return -ENOMEM;
- }
return expand_upwards(vma, address);
}
@@ -2386,14 +2428,6 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
#else
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
- struct vm_area_struct *prev;
-
- address &= PAGE_MASK;
- prev = vma->vm_prev;
- if (prev && prev->vm_end == address) {
- if (!(prev->vm_flags & VM_GROWSDOWN))
- return -ENOMEM;
- }
return expand_downwards(vma, address);
}
@@ -2491,7 +2525,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
vma->vm_prev = prev;
vma_gap_update(vma);
} else
- mm->highest_vm_end = prev ? prev->vm_end : 0;
+ mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
tail_vma->vm_next = NULL;
/* Kill the cache */
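
The mmap changes above replace the old single guard page with a configurable guard gap (stack_guard_gap, 256 pages by default, tunable via the stack_guard_gap= boot parameter) and fold it into all gap computations through vm_start_gap()/vm_end_gap(). Those helpers are not part of this hunk; the self-contained sketch below (sketch_* names, simplified flags) illustrates what they conceptually compute: the VMA boundary pushed outward by the guard gap on the side the stack grows toward, with over/underflow clamped.

#include <stdint.h>

#define SKETCH_VM_GROWSDOWN	0x1u
#define SKETCH_VM_GROWSUP	0x2u

struct sketch_vma {
	uintptr_t vm_start, vm_end;
	unsigned int vm_flags;
};

/* 256 pages of 4 KiB, mirroring the default in the patch. */
static uintptr_t sketch_stack_guard_gap = 256UL << 12;

static uintptr_t sketch_vm_start_gap(const struct sketch_vma *vma)
{
	uintptr_t start = vma->vm_start;

	if (vma->vm_flags & SKETCH_VM_GROWSDOWN) {
		if (start > sketch_stack_guard_gap)
			start -= sketch_stack_guard_gap;
		else
			start = 0;	/* clamp underflow */
	}
	return start;
}

static uintptr_t sketch_vm_end_gap(const struct sketch_vma *vma)
{
	uintptr_t end = vma->vm_end;

	if (vma->vm_flags & SKETCH_VM_GROWSUP) {
		uintptr_t grown = end + sketch_stack_guard_gap;

		if (grown > end)	/* only if it did not overflow */
			end = grown;
	}
	return end;
}
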
diff --git a/mm/rmap.c b/mm/rmap.c
index d405f0e0ee96..130c238fe384 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -579,25 +579,13 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
void try_to_unmap_flush(void)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
- int cpu;
if (!tlb_ubc->flush_required)
return;
- cpu = get_cpu();
-
- if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) {
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- local_flush_tlb();
- trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
- }
-
- if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids)
- flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL);
- cpumask_clear(&tlb_ubc->cpumask);
+ arch_tlbbatch_flush(&tlb_ubc->arch);
tlb_ubc->flush_required = false;
tlb_ubc->writable = false;
- put_cpu();
}
/* Flush iff there are potentially writable TLB entries that can race with IO */
@@ -613,7 +601,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
- cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
+ arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
tlb_ubc->flush_required = true;
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 391f2dcca727..9100c4952698 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1903,10 +1903,10 @@ unlock:
* entry unconditionally - even if something else had already woken the
* target.
*/
-static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int ret = default_wake_function(wait, mode, sync, key);
- list_del_init(&wait->task_list);
+ list_del_init(&wait->entry);
return ret;
}
@@ -2841,7 +2841,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
spin_lock(&inode->i_lock);
inode->i_private = NULL;
wake_up_all(&shmem_falloc_waitq);
- WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list));
+ WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
spin_unlock(&inode->i_lock);
error = 0;
goto out;
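Aside (not part of the patch): the wait_queue_t -> wait_queue_entry_t and task_list -> entry renames seen in the shmem hunk recur throughout this series. A hedged sketch of a converted custom wake function, where foo_wake_function is a made-up name:

#include <linux/wait.h>

/* sketch only; mirrors the pattern in synchronous_wake_function() above */
static int foo_wake_function(wait_queue_entry_t *wait, unsigned mode,
			     int sync, void *key)
{
	int ret = default_wake_function(wait, mode, sync, key);

	/* the list linkage member is now 'entry' rather than 'task_list' */
	list_del_init(&wait->entry);
	return ret;
}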
diff --git a/mm/slub.c b/mm/slub.c
index 7449593fca72..8addc535bcdc 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5625,6 +5625,28 @@ static char *create_unique_id(struct kmem_cache *s)
return name;
}
+static void sysfs_slab_remove_workfn(struct work_struct *work)
+{
+ struct kmem_cache *s =
+ container_of(work, struct kmem_cache, kobj_remove_work);
+
+ if (!s->kobj.state_in_sysfs)
+ /*
+ * For a memcg cache, this may be called during
+ * deactivation and again on shutdown. Remove only once.
+ * A cache is never shut down before deactivation is
+ * complete, so no need to worry about synchronization.
+ */
+ return;
+
+#ifdef CONFIG_MEMCG
+ kset_unregister(s->memcg_kset);
+#endif
+ kobject_uevent(&s->kobj, KOBJ_REMOVE);
+ kobject_del(&s->kobj);
+ kobject_put(&s->kobj);
+}
+
static int sysfs_slab_add(struct kmem_cache *s)
{
int err;
@@ -5632,6 +5654,8 @@ static int sysfs_slab_add(struct kmem_cache *s)
struct kset *kset = cache_kset(s);
int unmergeable = slab_unmergeable(s);
+ INIT_WORK(&s->kobj_remove_work, sysfs_slab_remove_workfn);
+
if (!kset) {
kobject_init(&s->kobj, &slab_ktype);
return 0;
@@ -5695,20 +5719,8 @@ static void sysfs_slab_remove(struct kmem_cache *s)
*/
return;
- if (!s->kobj.state_in_sysfs)
- /*
- * For a memcg cache, this may be called during
- * deactivation and again on shutdown. Remove only once.
- * A cache is never shut down before deactivation is
- * complete, so no need to worry about synchronization.
- */
- return;
-
-#ifdef CONFIG_MEMCG
- kset_unregister(s->memcg_kset);
-#endif
- kobject_uevent(&s->kobj, KOBJ_REMOVE);
- kobject_del(&s->kobj);
+ kobject_get(&s->kobj);
+ schedule_work(&s->kobj_remove_work);
}
void sysfs_slab_release(struct kmem_cache *s)
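Aside (not part of the patch): the slub hunk above moves the sysfs teardown into a work item; the caller takes a kobject reference before schedule_work() and the work function drops it after kobject_del(). A minimal sketch of the same defer-and-pin pattern for a hypothetical object:

#include <linux/kobject.h>
#include <linux/workqueue.h>

/* sketch; 'struct foo' is hypothetical, and remove_work is assumed to have
 * been set up with INIT_WORK(&f->remove_work, foo_remove_workfn) at init time
 */
struct foo {
	struct kobject kobj;
	struct work_struct remove_work;
};

static void foo_remove_workfn(struct work_struct *work)
{
	struct foo *f = container_of(work, struct foo, remove_work);

	kobject_del(&f->kobj);
	kobject_put(&f->kobj);		/* drop the reference taken below */
}

static void foo_schedule_remove(struct foo *f)
{
	kobject_get(&f->kobj);		/* keep the kobject alive until the work runs */
	schedule_work(&f->remove_work);
}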
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index ac6318a064d3..3405b4ee1757 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -48,6 +48,9 @@ static int swap_cgroup_prepare(int type)
if (!page)
goto not_enough_page;
ctrl->map[idx] = page;
+
+ if (!(idx % SWAP_CLUSTER_MAX))
+ cond_resched();
}
return 0;
not_enough_page:
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 34a1c3e46ed7..ecc97f74ab18 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -287,10 +287,21 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
if (p4d_none(*p4d))
return NULL;
pud = pud_offset(p4d, addr);
- if (pud_none(*pud))
+
+ /*
+ * Don't dereference bad PUD or PMD (below) entries. This will also
+ * identify huge mappings, which we may encounter on architectures
+ * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
+ * identified as vmalloc addresses by is_vmalloc_addr(), but are
+ * not [unambiguously] associated with a struct page, so there is
+ * no correct value to return for them.
+ */
+ WARN_ON_ONCE(pud_bad(*pud));
+ if (pud_none(*pud) || pud_bad(*pud))
return NULL;
pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd))
+ WARN_ON_ONCE(pmd_bad(*pmd));
+ if (pmd_none(*pmd) || pmd_bad(*pmd))
return NULL;
ptep = pte_offset_map(pmd, addr);
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 6063581f705c..ce0618bfa8d0 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -115,9 +115,9 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
unsigned long pressure = 0;
/*
- * reclaimed can be greater than scanned in cases
- * like THP, where the scanned is 1 and reclaimed
- * could be 512
+ * reclaimed can be greater than scanned for things such as reclaimed
+ * slab pages. shrink_node() just adds reclaimed pages without a
+ * related increment to scanned pages.
*/
if (reclaimed >= scanned)
goto out;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8ad39bbc79e6..c3c1c6ac62da 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3652,7 +3652,7 @@ int kswapd_run(int nid)
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
- BUG_ON(system_state == SYSTEM_BOOTING);
+ BUG_ON(system_state < SYSTEM_RUNNING);
pr_err("Failed to start kswapd on node %d\n", nid);
ret = PTR_ERR(pgdat->kswapd);
pgdat->kswapd = NULL;
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 467069b73ce1..9649579b5b9f 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -277,7 +277,8 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
return 0;
out_free_newdev:
- free_netdev(new_dev);
+ if (new_dev->reg_state == NETREG_UNINITIALIZED)
+ free_netdev(new_dev);
return err;
}
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 953b6728bd00..abc5f400fc71 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -813,7 +813,6 @@ static void vlan_dev_free(struct net_device *dev)
free_percpu(vlan->vlan_pcpu_stats);
vlan->vlan_pcpu_stats = NULL;
- free_netdev(dev);
}
void vlan_setup(struct net_device *dev)
@@ -826,7 +825,8 @@ void vlan_setup(struct net_device *dev)
netif_keep_dst(dev);
dev->netdev_ops = &vlan_netdev_ops;
- dev->destructor = vlan_dev_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = vlan_dev_free;
dev->ethtool_ops = &vlan_ethtool_ops;
dev->min_mtu = 0;
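Aside (not part of the patch): the vlan hunks show the conversion applied across drivers in this series. Instead of pointing dev->destructor at a function that frees private state and then calls free_netdev(), a driver now sets dev->needs_free_netdev and, when it has private cleanup, dev->priv_destructor; the core runs the destructor and frees the netdev itself (see the net/core/dev.c hunks further down). A minimal sketch for a hypothetical driver:

#include <linux/netdevice.h>

/* sketch; foo_dev_free() and foo_setup() are made-up driver functions */
static void foo_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);	/* private cleanup only, no free_netdev() */
}

static void foo_setup(struct net_device *dev)
{
	dev->needs_free_netdev = true;	/* core calls free_netdev() for us */
	dev->priv_destructor = foo_dev_free;
}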
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 7bc2208b6cc4..dca3cdd1a014 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -95,7 +95,7 @@ enum {
struct p9_poll_wait {
struct p9_conn *conn;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
wait_queue_head_t *wait_addr;
};
@@ -522,7 +522,7 @@ error:
clear_bit(Wworksched, &m->wsched);
}
-static int p9_pollwake(wait_queue_t *wait, unsigned int mode, int sync, void *key)
+static int p9_pollwake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
{
struct p9_poll_wait *pwait =
container_of(wait, struct p9_poll_wait, wait);
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 013e970eff39..000ca2f113ab 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -1064,8 +1064,9 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
skb_new->protocol = eth_type_trans(skb_new, soft_iface);
- soft_iface->stats.rx_packets++;
- soft_iface->stats.rx_bytes += skb->len + ETH_HLEN + hdr_size;
+ batadv_inc_counter(bat_priv, BATADV_CNT_RX);
+ batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
+ skb->len + ETH_HLEN + hdr_size);
netif_rx(skb_new);
batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP request replied locally\n");
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index e1ebe14ee2a6..ae9f4d37d34f 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -987,7 +987,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
batadv_dbg(BATADV_DBG_BLA, bat_priv,
"recv_unicast_packet(): Dropped unicast pkt received from another backbone gw %pM.\n",
orig_addr_gw);
- return NET_RX_DROP;
+ goto free_skb;
}
}
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index b25789abf7b9..10f7edfb176e 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -1034,8 +1034,6 @@ static void batadv_softif_free(struct net_device *dev)
* netdev and its private data (bat_priv)
*/
rcu_barrier();
-
- free_netdev(dev);
}
/**
@@ -1047,7 +1045,8 @@ static void batadv_softif_init_early(struct net_device *dev)
ether_setup(dev);
dev->netdev_ops = &batadv_netdev_ops;
- dev->destructor = batadv_softif_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = batadv_softif_free;
dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_NETNS_LOCAL;
dev->priv_flags |= IFF_NO_QUEUE;
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 608959989f8e..ab3b654b05cc 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -598,7 +598,7 @@ static void netdev_setup(struct net_device *dev)
dev->netdev_ops = &netdev_ops;
dev->header_ops = &header_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
}
static struct device_type bt_type = {
diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c
index fbf251fef70f..5c4808b3da2d 100644
--- a/net/bluetooth/bnep/core.c
+++ b/net/bluetooth/bnep/core.c
@@ -484,7 +484,7 @@ static int bnep_session(void *arg)
struct net_device *dev = s->dev;
struct sock *sk = s->sock->sk;
struct sk_buff *skb;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
BT_DBG("");
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index 9e59b6654126..14f7c8135c31 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -280,7 +280,7 @@ static int cmtp_session(void *arg)
struct cmtp_session *session = arg;
struct sock *sk = session->sock->sk;
struct sk_buff *skb;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
BT_DBG("session %p", session);
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 0bec4588c3c8..fc31161e98f2 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -1244,7 +1244,7 @@ static void hidp_session_run(struct hidp_session *session)
static int hidp_session_thread(void *arg)
{
struct hidp_session *session = arg;
- wait_queue_t ctrl_wait, intr_wait;
+ wait_queue_entry_t ctrl_wait, intr_wait;
BT_DBG("session %p", session);
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 430b53e7d941..f0f3447e8aa4 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -379,7 +379,7 @@ void br_dev_setup(struct net_device *dev)
ether_setup(dev);
dev->netdev_ops = &br_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->ethtool_ops = &br_ethtool_ops;
SET_NETDEV_DEVTYPE(dev, &br_type);
dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE;
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index adcad344c843..21f18ea2fce4 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -754,6 +754,10 @@ static int caif_connect(struct socket *sock, struct sockaddr *uaddr,
lock_sock(sk);
+ err = -EINVAL;
+ if (addr_len < offsetofend(struct sockaddr, sa_family))
+ goto out;
+
err = -EAFNOSUPPORT;
if (uaddr->sa_family != AF_CAIF)
goto out;
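Aside (not part of the patch): the caif_connect() hunk validates addr_len before reading uaddr->sa_family; offsetofend() yields the first byte past a member, so the check means "at least sa_family must have been supplied". A hedged sketch of the same validation in a hypothetical connect handler:

#include <linux/net.h>
#include <linux/socket.h>
#include <linux/stddef.h>

/* sketch; foo_connect() is hypothetical, real protocol work is elided */
static int foo_connect(struct socket *sock, struct sockaddr *uaddr,
		       int addr_len, int flags)
{
	/* don't dereference sa_family unless the caller actually passed it */
	if (addr_len < offsetofend(struct sockaddr, sa_family))
		return -EINVAL;

	if (uaddr->sa_family != AF_CAIF)
		return -EAFNOSUPPORT;

	return 0;
}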
diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c
index 59ce1fcc220c..71b6ab240dea 100644
--- a/net/caif/cfpkt_skbuff.c
+++ b/net/caif/cfpkt_skbuff.c
@@ -81,11 +81,7 @@ static struct cfpkt *cfpkt_create_pfx(u16 len, u16 pfx)
{
struct sk_buff *skb;
- if (likely(in_interrupt()))
- skb = alloc_skb(len + pfx, GFP_ATOMIC);
- else
- skb = alloc_skb(len + pfx, GFP_KERNEL);
-
+ skb = alloc_skb(len + pfx, GFP_ATOMIC);
if (unlikely(skb == NULL))
return NULL;
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 1816fc9f1ee7..fe3c53efb949 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -392,14 +392,14 @@ static void chnl_net_destructor(struct net_device *dev)
{
struct chnl_net *priv = netdev_priv(dev);
caif_free_client(&priv->chnl);
- free_netdev(dev);
}
static void ipcaif_net_setup(struct net_device *dev)
{
struct chnl_net *priv;
dev->netdev_ops = &netdev_ops;
- dev->destructor = chnl_net_destructor;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = chnl_net_destructor;
dev->flags |= IFF_NOARP;
dev->flags |= IFF_POINTOPOINT;
dev->mtu = GPRS_PDP_MTU;
diff --git a/net/can/af_can.c b/net/can/af_can.c
index b6406fe33c76..88edac0f3e36 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -872,8 +872,7 @@ static int can_notifier(struct notifier_block *nb, unsigned long msg,
static int can_pernet_init(struct net *net)
{
- net->can.can_rcvlists_lock =
- __SPIN_LOCK_UNLOCKED(net->can.can_rcvlists_lock);
+ spin_lock_init(&net->can.can_rcvlists_lock);
net->can.can_rx_alldev_list =
kzalloc(sizeof(struct dev_rcv_lists), GFP_KERNEL);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index db1866f2ffcf..34678828e2bb 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -68,7 +68,7 @@ static inline int connection_based(struct sock *sk)
return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}
-static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int sync,
+static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync,
void *key)
{
unsigned long bits = (unsigned long)key;
diff --git a/net/core/dev.c b/net/core/dev.c
index fca407b4a6ea..416137c64bf8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1253,8 +1253,9 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
if (!new_ifalias)
return -ENOMEM;
dev->ifalias = new_ifalias;
+ memcpy(dev->ifalias, alias, len);
+ dev->ifalias[len] = 0;
- strlcpy(dev->ifalias, alias, len+1);
return len;
}
@@ -4766,6 +4767,13 @@ struct packet_offload *gro_find_complete_by_type(__be16 type)
}
EXPORT_SYMBOL(gro_find_complete_by_type);
+static void napi_skb_free_stolen_head(struct sk_buff *skb)
+{
+ skb_dst_drop(skb);
+ secpath_reset(skb);
+ kmem_cache_free(skbuff_head_cache, skb);
+}
+
static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
switch (ret) {
@@ -4779,13 +4787,10 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
break;
case GRO_MERGED_FREE:
- if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
- skb_dst_drop(skb);
- secpath_reset(skb);
- kmem_cache_free(skbuff_head_cache, skb);
- } else {
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ napi_skb_free_stolen_head(skb);
+ else
__kfree_skb(skb);
- }
break;
case GRO_HELD:
@@ -4857,10 +4862,16 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
break;
case GRO_DROP:
- case GRO_MERGED_FREE:
napi_reuse_skb(napi, skb);
break;
+ case GRO_MERGED_FREE:
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ napi_skb_free_stolen_head(skb);
+ else
+ napi_reuse_skb(napi, skb);
+ break;
+
case GRO_MERGED:
case GRO_CONSUMED:
break;
@@ -4948,6 +4959,19 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
}
EXPORT_SYMBOL(__skb_gro_checksum_complete);
+static void net_rps_send_ipi(struct softnet_data *remsd)
+{
+#ifdef CONFIG_RPS
+ while (remsd) {
+ struct softnet_data *next = remsd->rps_ipi_next;
+
+ if (cpu_online(remsd->cpu))
+ smp_call_function_single_async(remsd->cpu, &remsd->csd);
+ remsd = next;
+ }
+#endif
+}
+
/*
* net_rps_action_and_irq_enable sends any pending IPI's for rps.
* Note: called with local irq disabled, but exits with local irq enabled.
@@ -4963,14 +4987,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
local_irq_enable();
/* Send pending IPI's to kick RPS processing on remote cpus. */
- while (remsd) {
- struct softnet_data *next = remsd->rps_ipi_next;
-
- if (cpu_online(remsd->cpu))
- smp_call_function_single_async(remsd->cpu,
- &remsd->csd);
- remsd = next;
- }
+ net_rps_send_ipi(remsd);
} else
#endif
local_irq_enable();
@@ -5199,8 +5216,6 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
if (rc == BUSY_POLL_BUDGET)
__napi_schedule(napi);
local_bh_enable();
- if (local_softirq_pending())
- do_softirq();
}
void napi_busy_loop(unsigned int napi_id,
@@ -7501,6 +7516,8 @@ out:
err_uninit:
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
+ if (dev->priv_destructor)
+ dev->priv_destructor(dev);
goto out;
}
EXPORT_SYMBOL(register_netdevice);
@@ -7708,8 +7725,10 @@ void netdev_run_todo(void)
WARN_ON(rcu_access_pointer(dev->ip6_ptr));
WARN_ON(dev->dn_ptr);
- if (dev->destructor)
- dev->destructor(dev);
+ if (dev->priv_destructor)
+ dev->priv_destructor(dev);
+ if (dev->needs_free_netdev)
+ free_netdev(dev);
/* Report a network device has been unregistered */
rtnl_lock();
@@ -7774,9 +7793,9 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
} else {
netdev_stats_to_stats64(storage, &dev->stats);
}
- storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
- storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
- storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
+ storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
+ storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
+ storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
return storage;
}
EXPORT_SYMBOL(dev_get_stats);
@@ -8192,7 +8211,7 @@ static int dev_cpu_dead(unsigned int oldcpu)
struct sk_buff **list_skb;
struct sk_buff *skb;
unsigned int cpu;
- struct softnet_data *sd, *oldsd;
+ struct softnet_data *sd, *oldsd, *remsd = NULL;
local_irq_disable();
cpu = smp_processor_id();
@@ -8233,6 +8252,13 @@ static int dev_cpu_dead(unsigned int oldcpu)
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
+#ifdef CONFIG_RPS
+ remsd = oldsd->rps_ipi_list;
+ oldsd->rps_ipi_list = NULL;
+#endif
+ /* send out pending IPI's on offline CPU */
+ net_rps_send_ipi(remsd);
+
/* Process offline CPU's input_pkt_queue */
while ((skb = __skb_dequeue(&oldsd->process_queue))) {
netif_rx_ni(skb);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index b94b1d293506..27fad31784a8 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -410,6 +410,22 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (cmd == SIOCGIFNAME)
return dev_ifname(net, (struct ifreq __user *)arg);
+ /*
+ * Take care of Wireless Extensions. Unfortunately struct iwreq
+	 * isn't a proper subset of struct ifreq (it's 8 bytes shorter)
+ * so we need to treat it specially, otherwise applications may
+ * fault if the struct they're passing happens to land at the
+ * end of a mapped page.
+ */
+ if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
+ struct iwreq iwr;
+
+ if (copy_from_user(&iwr, arg, sizeof(iwr)))
+ return -EFAULT;
+
+ return wext_handle_ioctl(net, &iwr, cmd, arg);
+ }
+
if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
return -EFAULT;
@@ -559,9 +575,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
ret = -EFAULT;
return ret;
}
- /* Take care of Wireless Extensions */
- if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
- return wext_handle_ioctl(net, &ifr, cmd, arg);
return -ENOTTY;
}
}
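Aside (not part of the patch): the dev_ioctl() change copies wireless-extension requests as a struct iwreq rather than a struct ifreq because iwreq is the shorter of the two; copying sizeof(struct ifreq) from a user pointer that actually refers to an iwreq at the end of a mapped page could fault, as the new comment explains. A hedged sketch of the copy, with copy_wext_request() as a made-up helper:

#include <linux/if.h>		/* struct ifreq */
#include <linux/wireless.h>	/* struct iwreq */
#include <linux/uaccess.h>
#include <linux/bug.h>

/* sketch; copies only the smaller iwreq so we never read past the user buffer */
static int copy_wext_request(void __user *arg, struct iwreq *iwr)
{
	BUILD_BUG_ON(sizeof(struct iwreq) > sizeof(struct ifreq));

	if (copy_from_user(iwr, arg, sizeof(*iwr)))
		return -EFAULT;
	return 0;
}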
diff --git a/net/core/dst.c b/net/core/dst.c
index 6192f11beec9..13ba4a090c41 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -469,6 +469,20 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event,
spin_lock_bh(&dst_garbage.lock);
dst = dst_garbage.list;
dst_garbage.list = NULL;
+ /* The code in dst_ifdown places a hold on the loopback device.
+ * If the gc entry processing is set to expire after a lengthy
+ * interval, this hold can cause netdev_wait_allrefs() to hang
+	 * out and wait for a long time -- until the loopback
+ * interface is released. If we're really unlucky, it'll emit
+ * pr_emerg messages to console too. Reset the interval here,
+ * so dst cleanups occur in a more timely fashion.
+ */
+ if (dst_garbage.timer_inc > DST_GC_INC) {
+ dst_garbage.timer_inc = DST_GC_INC;
+ dst_garbage.timer_expires = DST_GC_MIN;
+ mod_delayed_work(system_wq, &dst_gc_work,
+ dst_garbage.timer_expires);
+ }
spin_unlock_bh(&dst_garbage.lock);
if (last)
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index f21c4d3aeae0..3bba291c6c32 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -568,7 +568,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
struct fib_rules_ops *ops = NULL;
- struct fib_rule *rule, *tmp;
+ struct fib_rule *rule, *r;
struct nlattr *tb[FRA_MAX+1];
struct fib_kuid_range range;
int err = -EINVAL;
@@ -668,16 +668,23 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
/*
* Check if this rule is a target to any of them. If so,
+ * adjust to the next one with the same preference or
* disable them. As this operation is eventually very
- * expensive, it is only performed if goto rules have
- * actually been added.
+	 * expensive, it is only performed if goto rules (other than the
+	 * current rule, if it is itself a goto rule) have actually been added.
*/
if (ops->nr_goto_rules > 0) {
- list_for_each_entry(tmp, &ops->rules_list, list) {
- if (rtnl_dereference(tmp->ctarget) == rule) {
- RCU_INIT_POINTER(tmp->ctarget, NULL);
+ struct fib_rule *n;
+
+ n = list_next_entry(rule, list);
+ if (&n->list == &ops->rules_list || n->pref != rule->pref)
+ n = NULL;
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (rtnl_dereference(r->ctarget) != rule)
+ continue;
+ rcu_assign_pointer(r->ctarget, n);
+ if (!n)
ops->unresolved_rules++;
- }
}
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9e2c0a7cb325..467a2f4510a7 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -931,6 +931,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(1) /* IFLA_LINKMODE */
+ nla_total_size(4) /* IFLA_CARRIER_CHANGES */
+ nla_total_size(4) /* IFLA_LINK_NETNSID */
+ + nla_total_size(4) /* IFLA_GROUP */
+ nla_total_size(ext_filter_mask
& RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */
+ rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
@@ -1124,6 +1125,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
struct ifla_vf_mac vf_mac;
struct ifla_vf_info ivi;
+ memset(&ivi, 0, sizeof(ivi));
+
/* Not all SR-IOV capable drivers support the
* spoofcheck and "RSS query enable" query. Preset to
* -1 so the user space tool can detect that the driver
@@ -1132,7 +1135,6 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
ivi.spoofchk = -1;
ivi.rss_query_en = -1;
ivi.trusted = -1;
- memset(ivi.mac, 0, sizeof(ivi.mac));
/* The default value for VF link state is "auto"
* IFLA_VF_LINK_STATE_AUTO which equals zero
*/
@@ -1467,6 +1469,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_LINK_NETNSID] = { .type = NLA_S32 },
[IFLA_PROTO_DOWN] = { .type = NLA_U8 },
[IFLA_XDP] = { .type = NLA_NESTED },
+ [IFLA_GROUP] = { .type = NLA_U32 },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 4b9518a0d248..6f95612b4d32 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -188,12 +188,6 @@ static inline void dnrt_free(struct dn_route *rt)
call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
-static inline void dnrt_drop(struct dn_route *rt)
-{
- dst_release(&rt->dst);
- call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
static void dn_dst_check_expire(unsigned long dummy)
{
int i;
@@ -248,7 +242,7 @@ static int dn_dst_gc(struct dst_ops *ops)
}
*rtp = rt->dst.dn_next;
rt->dst.dn_next = NULL;
- dnrt_drop(rt);
+ dnrt_free(rt);
break;
}
spin_unlock_bh(&dn_rt_hash_table[i].lock);
@@ -350,7 +344,7 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou
dst_use(&rth->dst, now);
spin_unlock_bh(&dn_rt_hash_table[hash].lock);
- dnrt_drop(rt);
+ dst_free(&rt->dst);
*rp = rth;
return 0;
}
@@ -380,7 +374,7 @@ static void dn_run_flush(unsigned long dummy)
for(; rt; rt = next) {
next = rcu_dereference_raw(rt->dst.dn_next);
RCU_INIT_POINTER(rt->dst.dn_next, NULL);
- dst_free((struct dst_entry *)rt);
+ dnrt_free(rt);
}
nothing_to_declare:
@@ -1187,7 +1181,7 @@ make_route:
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
- rt = dst_alloc(&dn_dst_ops, dev_out, 1, DST_OBSOLETE_NONE, DST_HOST);
+ rt = dst_alloc(&dn_dst_ops, dev_out, 0, DST_OBSOLETE_NONE, DST_HOST);
if (rt == NULL)
goto e_nobufs;
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
index 1ed81ac6dd1a..aa8ffecc46a4 100644
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -102,7 +102,9 @@ static inline void dnrmg_receive_user_skb(struct sk_buff *skb)
{
struct nlmsghdr *nlh = nlmsg_hdr(skb);
- if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+ if (skb->len < sizeof(*nlh) ||
+ nlh->nlmsg_len < sizeof(*nlh) ||
+ skb->len < nlh->nlmsg_len)
return;
if (!netlink_capable(skb, CAP_NET_ADMIN))
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index c73160fb11e7..0a0a392dc2bd 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -378,7 +378,6 @@ static void hsr_dev_destroy(struct net_device *hsr_dev)
del_timer_sync(&hsr->announce_timer);
synchronize_rcu();
- free_netdev(hsr_dev);
}
static const struct net_device_ops hsr_device_ops = {
@@ -404,7 +403,8 @@ void hsr_dev_setup(struct net_device *dev)
SET_NETDEV_DEVTYPE(dev, &hsr_type);
dev->priv_flags |= IFF_NO_QUEUE;
- dev->destructor = hsr_dev_destroy;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = hsr_dev_destroy;
dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
NETIF_F_GSO_MASK | NETIF_F_HW_CSUM |
diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c
index 4ebe2aa3e7d3..04b5450c5a55 100644
--- a/net/hsr/hsr_forward.c
+++ b/net/hsr/hsr_forward.c
@@ -324,8 +324,7 @@ static int hsr_fill_frame_info(struct hsr_frame_info *frame,
unsigned long irqflags;
frame->is_supervision = is_supervision_frame(port->hsr, skb);
- frame->node_src = hsr_get_node(&port->hsr->node_db, skb,
- frame->is_supervision);
+ frame->node_src = hsr_get_node(port, skb, frame->is_supervision);
if (frame->node_src == NULL)
return -1; /* Unknown node and !is_supervision, or no mem */
diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
index 7ea925816f79..284a9b820df8 100644
--- a/net/hsr/hsr_framereg.c
+++ b/net/hsr/hsr_framereg.c
@@ -158,9 +158,10 @@ struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[],
/* Get the hsr_node from which 'skb' was sent.
*/
-struct hsr_node *hsr_get_node(struct list_head *node_db, struct sk_buff *skb,
+struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb,
bool is_sup)
{
+ struct list_head *node_db = &port->hsr->node_db;
struct hsr_node *node;
struct ethhdr *ethhdr;
u16 seq_out;
@@ -186,7 +187,11 @@ struct hsr_node *hsr_get_node(struct list_head *node_db, struct sk_buff *skb,
*/
seq_out = hsr_get_skb_sequence_nr(skb) - 1;
} else {
- WARN_ONCE(1, "%s: Non-HSR frame\n", __func__);
+			/* this is also called for frames from the master port, so
+			 * warn only for non-master ports
+ */
+ if (port->type != HSR_PT_MASTER)
+ WARN_ONCE(1, "%s: Non-HSR frame\n", __func__);
seq_out = HSR_SEQNR_START;
}
diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h
index 438b40f98f5a..4e04f0e868e9 100644
--- a/net/hsr/hsr_framereg.h
+++ b/net/hsr/hsr_framereg.h
@@ -18,7 +18,7 @@ struct hsr_node;
struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[],
u16 seq_out);
-struct hsr_node *hsr_get_node(struct list_head *node_db, struct sk_buff *skb,
+struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb,
bool is_sup);
void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr,
struct hsr_port *port);
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index d7efbf0dad20..0a866f332290 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -107,7 +107,7 @@ static void lowpan_setup(struct net_device *ldev)
ldev->netdev_ops = &lowpan_netdev_ops;
ldev->header_ops = &lowpan_header_ops;
- ldev->destructor = free_netdev;
+ ldev->needs_free_netdev = true;
ldev->features |= NETIF_F_NETNS_LOCAL;
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 43318b5f5647..9144fa7df2ad 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -657,8 +657,12 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
/* Needed by both icmp_global_allow and icmp_xmit_lock */
local_bh_disable();
- /* Check global sysctl_icmp_msgs_per_sec ratelimit */
- if (!icmpv4_global_allow(net, type, code))
+ /* Check global sysctl_icmp_msgs_per_sec ratelimit, unless
+	 * the incoming dev is loopback. If the outgoing dev changes to not be
+	 * loopback, the peer ratelimit still works (in icmpv4_xrlim_allow)
+ */
+ if (!(skb_in->dev && (skb_in->dev->flags&IFF_LOOPBACK)) &&
+ !icmpv4_global_allow(net, type, code))
goto out_bh_enable;
sk = icmp_xmit_lock(net);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 44fd86de2823..ec9a396fa466 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1112,6 +1112,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
pmc = kzalloc(sizeof(*pmc), GFP_KERNEL);
if (!pmc)
return;
+ spin_lock_init(&pmc->lock);
spin_lock_bh(&im->lock);
pmc->interface = im->interface;
in_dev_hold(in_dev);
@@ -2071,21 +2072,26 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
static void ip_mc_clear_src(struct ip_mc_list *pmc)
{
- struct ip_sf_list *psf, *nextpsf;
+ struct ip_sf_list *psf, *nextpsf, *tomb, *sources;
- for (psf = pmc->tomb; psf; psf = nextpsf) {
+ spin_lock_bh(&pmc->lock);
+ tomb = pmc->tomb;
+ pmc->tomb = NULL;
+ sources = pmc->sources;
+ pmc->sources = NULL;
+ pmc->sfmode = MCAST_EXCLUDE;
+ pmc->sfcount[MCAST_INCLUDE] = 0;
+ pmc->sfcount[MCAST_EXCLUDE] = 1;
+ spin_unlock_bh(&pmc->lock);
+
+ for (psf = tomb; psf; psf = nextpsf) {
nextpsf = psf->sf_next;
kfree(psf);
}
- pmc->tomb = NULL;
- for (psf = pmc->sources; psf; psf = nextpsf) {
+ for (psf = sources; psf; psf = nextpsf) {
nextpsf = psf->sf_next;
kfree(psf);
}
- pmc->sources = NULL;
- pmc->sfmode = MCAST_EXCLUDE;
- pmc->sfcount[MCAST_INCLUDE] = 0;
- pmc->sfcount[MCAST_EXCLUDE] = 1;
}
/* Join a multicast group
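Aside (not part of the patch): the ip_mc_clear_src() rework is a detach-then-free pattern: unlink the lists and reset counters while holding pmc->lock, then walk and kfree() the detached entries with the lock dropped. A minimal sketch of that shape for a hypothetical singly linked list:

#include <linux/spinlock.h>
#include <linux/slab.h>

/* sketch; 'struct item' and the caller-provided head/lock are hypothetical */
struct item {
	struct item *next;
};

static void clear_items(struct item **head, spinlock_t *lock)
{
	struct item *list, *next;

	spin_lock_bh(lock);
	list = *head;		/* detach under the lock ... */
	*head = NULL;
	spin_unlock_bh(lock);

	for (; list; list = next) {	/* ... and free outside it */
		next = list->next;
		kfree(list);
	}
}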
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7a3fd25e8913..532b36e9ce2a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -964,7 +964,8 @@ static int __ip_append_data(struct sock *sk,
csummode = CHECKSUM_PARTIAL;
cork->length += length;
- if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) &&
+ if ((((length + (skb ? skb->len : fragheaderlen)) > mtu) ||
+ (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
(sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index b878ecbc0608..129d1a3616f8 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -446,6 +446,8 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
return 0;
drop:
+ if (tun_dst)
+ dst_release((struct dst_entry *)tun_dst);
kfree_skb(skb);
return 0;
}
@@ -967,7 +969,6 @@ static void ip_tunnel_dev_free(struct net_device *dev)
gro_cells_destroy(&tunnel->gro_cells);
dst_cache_destroy(&tunnel->dst_cache);
free_percpu(dev->tstats);
- free_netdev(dev);
}
void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
@@ -1155,7 +1156,8 @@ int ip_tunnel_init(struct net_device *dev)
struct iphdr *iph = &tunnel->parms.iph;
int err;
- dev->destructor = ip_tunnel_dev_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ip_tunnel_dev_free;
dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
if (!dev->tstats)
return -ENOMEM;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 551de4d023a8..8ae425cad818 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -101,8 +101,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id);
static void ipmr_free_table(struct mr_table *mrt);
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
- struct sk_buff *skb, struct mfc_cache *cache,
- int local);
+ struct net_device *dev, struct sk_buff *skb,
+ struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct mr_table *mrt,
struct sk_buff *pkt, vifi_t vifi, int assert);
static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
@@ -501,7 +501,7 @@ static void reg_vif_setup(struct net_device *dev)
dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
dev->flags = IFF_NOARP;
dev->netdev_ops = &reg_vif_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->features |= NETIF_F_NETNS_LOCAL;
}
@@ -988,7 +988,7 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
} else {
- ip_mr_forward(net, mrt, skb, c, 0);
+ ip_mr_forward(net, mrt, skb->dev, skb, c, 0);
}
}
}
@@ -1073,7 +1073,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
/* Queue a packet for resolution. It gets locked cache entry! */
static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
- struct sk_buff *skb)
+ struct sk_buff *skb, struct net_device *dev)
{
const struct iphdr *iph = ip_hdr(skb);
struct mfc_cache *c;
@@ -1130,6 +1130,10 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
kfree_skb(skb);
err = -ENOBUFS;
} else {
+ if (dev) {
+ skb->dev = dev;
+ skb->skb_iif = dev->ifindex;
+ }
skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
err = 0;
}
@@ -1828,10 +1832,10 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
/* "local" means that we should preserve one skb (for local delivery) */
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
- struct sk_buff *skb, struct mfc_cache *cache,
- int local)
+ struct net_device *dev, struct sk_buff *skb,
+ struct mfc_cache *cache, int local)
{
- int true_vifi = ipmr_find_vif(mrt, skb->dev);
+ int true_vifi = ipmr_find_vif(mrt, dev);
int psend = -1;
int vif, ct;
@@ -1853,13 +1857,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
}
/* Wrong interface: drop packet and (maybe) send PIM assert. */
- if (mrt->vif_table[vif].dev != skb->dev) {
- struct net_device *mdev;
-
- mdev = l3mdev_master_dev_rcu(mrt->vif_table[vif].dev);
- if (mdev == skb->dev)
- goto forward;
-
+ if (mrt->vif_table[vif].dev != dev) {
if (rt_is_output_route(skb_rtable(skb))) {
/* It is our own packet, looped back.
* Very complicated situation...
@@ -2053,7 +2051,7 @@ int ip_mr_input(struct sk_buff *skb)
read_lock(&mrt_lock);
vif = ipmr_find_vif(mrt, dev);
if (vif >= 0) {
- int err2 = ipmr_cache_unresolved(mrt, vif, skb);
+ int err2 = ipmr_cache_unresolved(mrt, vif, skb, dev);
read_unlock(&mrt_lock);
return err2;
@@ -2064,7 +2062,7 @@ int ip_mr_input(struct sk_buff *skb)
}
read_lock(&mrt_lock);
- ip_mr_forward(net, mrt, skb, cache, local);
+ ip_mr_forward(net, mrt, dev, skb, cache, local);
read_unlock(&mrt_lock);
if (local)
@@ -2238,7 +2236,7 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
iph->saddr = saddr;
iph->daddr = daddr;
iph->version = 0;
- err = ipmr_cache_unresolved(mrt, vif, skb2);
+ err = ipmr_cache_unresolved(mrt, vif, skb2, dev);
read_unlock(&mrt_lock);
rcu_read_unlock();
return err;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b5ea036ca781..40aca7803cf2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2330,6 +2330,8 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_init_send_head(sk);
memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
__sk_dst_reset(sk);
+ dst_release(sk->sk_rx_dst);
+ sk->sk_rx_dst = NULL;
tcp_saved_syn_free(tp);
/* Clean up fastopen related fields */
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 6a4fb1e629fb..1d2dbace42ff 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -332,9 +332,9 @@ static void addrconf_mod_rs_timer(struct inet6_dev *idev,
static void addrconf_mod_dad_work(struct inet6_ifaddr *ifp,
unsigned long delay)
{
- if (!delayed_work_pending(&ifp->dad_work))
- in6_ifa_hold(ifp);
- mod_delayed_work(addrconf_wq, &ifp->dad_work, delay);
+ in6_ifa_hold(ifp);
+ if (mod_delayed_work(addrconf_wq, &ifp->dad_work, delay))
+ in6_ifa_put(ifp);
}
static int snmp6_alloc_dev(struct inet6_dev *idev)
@@ -3369,6 +3369,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct netdev_notifier_changeupper_info *info;
struct inet6_dev *idev = __in6_dev_get(dev);
+ struct net *net = dev_net(dev);
int run_pending = 0;
int err;
@@ -3384,7 +3385,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
case NETDEV_CHANGEMTU:
/* if MTU under IPV6_MIN_MTU stop IPv6 on this interface. */
if (dev->mtu < IPV6_MIN_MTU) {
- addrconf_ifdown(dev, 1);
+ addrconf_ifdown(dev, dev != net->loopback_dev);
break;
}
@@ -3500,7 +3501,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
* IPV6_MIN_MTU stop IPv6 on this interface.
*/
if (dev->mtu < IPV6_MIN_MTU)
- addrconf_ifdown(dev, 1);
+ addrconf_ifdown(dev, dev != net->loopback_dev);
}
break;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index e011122ebd43..5c786f5ab961 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -250,8 +250,14 @@ ipv4_connected:
*/
err = ip6_datagram_dst_update(sk, true);
- if (err)
+ if (err) {
+ /* Reset daddr and dport so that udp_v6_early_demux()
+ * fails to find this socket
+ */
+ memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr));
+ inet->inet_dport = 0;
goto out;
+ }
sk->sk_state = TCP_ESTABLISHED;
sk_set_txhash(sk);
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index d950d43ba255..f02f131f6435 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -30,6 +30,25 @@
#include <net/ipv6.h>
#include <linux/icmpv6.h>
+static __u16 esp6_nexthdr_esp_offset(struct ipv6hdr *ipv6_hdr, int nhlen)
+{
+ int off = sizeof(struct ipv6hdr);
+ struct ipv6_opt_hdr *exthdr;
+
+ if (likely(ipv6_hdr->nexthdr == NEXTHDR_ESP))
+ return offsetof(struct ipv6hdr, nexthdr);
+
+ while (off < nhlen) {
+ exthdr = (void *)ipv6_hdr + off;
+ if (exthdr->nexthdr == NEXTHDR_ESP)
+ return off;
+
+ off += ipv6_optlen(exthdr);
+ }
+
+ return 0;
+}
+
static struct sk_buff **esp6_gro_receive(struct sk_buff **head,
struct sk_buff *skb)
{
@@ -38,6 +57,7 @@ static struct sk_buff **esp6_gro_receive(struct sk_buff **head,
struct xfrm_state *x;
__be32 seq;
__be32 spi;
+ int nhoff;
int err;
skb_pull(skb, offset);
@@ -72,6 +92,11 @@ static struct sk_buff **esp6_gro_receive(struct sk_buff **head,
xo->flags |= XFRM_GRO;
+ nhoff = esp6_nexthdr_esp_offset(ipv6_hdr(skb), offset);
+ if (!nhoff)
+ goto out;
+
+ IP6CB(skb)->nhoff = nhoff;
XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index eea23b57c6a5..ec849d88a662 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -32,7 +32,6 @@ struct fib6_rule {
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
int flags, pol_lookup_t lookup)
{
- struct rt6_info *rt;
struct fib_lookup_arg arg = {
.lookup_ptr = lookup,
.flags = FIB_LOOKUP_NOREF,
@@ -44,21 +43,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
fib_rules_lookup(net->ipv6.fib6_rules_ops,
flowi6_to_flowi(fl6), flags, &arg);
- rt = arg.result;
+ if (arg.result)
+ return arg.result;
- if (!rt) {
- dst_hold(&net->ipv6.ip6_null_entry->dst);
- return &net->ipv6.ip6_null_entry->dst;
- }
-
- if (rt->rt6i_flags & RTF_REJECT &&
- rt->dst.error == -EAGAIN) {
- ip6_rt_put(rt);
- rt = net->ipv6.ip6_null_entry;
- dst_hold(&rt->dst);
- }
-
- return &rt->dst;
+ dst_hold(&net->ipv6.ip6_null_entry->dst);
+ return &net->ipv6.ip6_null_entry->dst;
}
static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
@@ -121,7 +110,8 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
flp6->saddr = saddr;
}
err = rt->dst.error;
- goto out;
+ if (err != -EAGAIN)
+ goto out;
}
again:
ip6_rt_put(rt);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 230b5aac9f03..8d7b113958b1 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -491,7 +491,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
local_bh_disable();
/* Check global sysctl_icmp_msgs_per_sec ratelimit */
- if (!icmpv6_global_allow(type))
+ if (!(skb->dev->flags&IFF_LOOPBACK) && !icmpv6_global_allow(type))
goto out_bh_enable;
mip6_addr_swap(skb);
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 2fd5ca151dcf..77f7f8c7d93d 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -62,6 +62,7 @@ static inline u32 ila_locator_hash(struct ila_locator loc)
{
u32 *v = (u32 *)loc.v32;
+ __ila_hash_secret_init();
return jhash_2words(v[0], v[1], hashrnd);
}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index d4bf2c68a545..e6b78ba0e636 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -289,8 +289,7 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
struct rt6_info *rt;
rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags);
- if (rt->rt6i_flags & RTF_REJECT &&
- rt->dst.error == -EAGAIN) {
+ if (rt->dst.error == -EAGAIN) {
ip6_rt_put(rt);
rt = net->ipv6.ip6_null_entry;
dst_hold(&rt->dst);
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 0c5b4caa1949..64eea3962733 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -991,13 +991,13 @@ static void ip6gre_dev_free(struct net_device *dev)
dst_cache_destroy(&t->dst_cache);
free_percpu(dev->tstats);
- free_netdev(dev);
}
static void ip6gre_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &ip6gre_netdev_ops;
- dev->destructor = ip6gre_dev_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ip6gre_dev_free;
dev->type = ARPHRD_IP6GRE;
@@ -1148,7 +1148,7 @@ static int __net_init ip6gre_init_net(struct net *net)
return 0;
err_reg_dev:
- ip6gre_dev_free(ign->fb_tunnel_dev);
+ free_netdev(ign->fb_tunnel_dev);
err_alloc_dev:
return err;
}
@@ -1300,7 +1300,8 @@ static void ip6gre_tap_setup(struct net_device *dev)
ether_setup(dev);
dev->netdev_ops = &ip6gre_tap_netdev_ops;
- dev->destructor = ip6gre_dev_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ip6gre_dev_free;
dev->features |= NETIF_F_NETNS_LOCAL;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index bf8a58a1c32d..1699acb2fa2c 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1390,7 +1390,7 @@ emsgsize:
*/
cork->length += length;
- if ((((length + fragheaderlen) > mtu) ||
+ if ((((length + (skb ? skb->len : headersize)) > mtu) ||
(skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 9b37f9747fc6..8c6c3c8e7eef 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -254,7 +254,6 @@ static void ip6_dev_free(struct net_device *dev)
gro_cells_destroy(&t->gro_cells);
dst_cache_destroy(&t->dst_cache);
free_percpu(dev->tstats);
- free_netdev(dev);
}
static int ip6_tnl_create2(struct net_device *dev)
@@ -322,7 +321,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p)
return t;
failed_free:
- ip6_dev_free(dev);
+ free_netdev(dev);
failed:
return ERR_PTR(err);
}
@@ -859,6 +858,8 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
return 0;
drop:
+ if (tun_dst)
+ dst_release((struct dst_entry *)tun_dst);
kfree_skb(skb);
return 0;
}
@@ -1247,7 +1248,7 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
fl6.flowi6_proto = IPPROTO_IPIP;
fl6.daddr = key->u.ipv6.dst;
fl6.flowlabel = key->label;
- dsfield = ip6_tclass(key->label);
+ dsfield = key->tos;
} else {
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
encap_limit = t->parms.encap_limit;
@@ -1318,7 +1319,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
fl6.flowi6_proto = IPPROTO_IPV6;
fl6.daddr = key->u.ipv6.dst;
fl6.flowlabel = key->label;
- dsfield = ip6_tclass(key->label);
+ dsfield = key->tos;
} else {
offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
/* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
@@ -1777,7 +1778,8 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
static void ip6_tnl_dev_setup(struct net_device *dev)
{
dev->netdev_ops = &ip6_tnl_netdev_ops;
- dev->destructor = ip6_dev_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ip6_dev_free;
dev->type = ARPHRD_TUNNEL6;
dev->flags |= IFF_NOARP;
@@ -2224,7 +2226,7 @@ static int __net_init ip6_tnl_init_net(struct net *net)
return 0;
err_register:
- ip6_dev_free(ip6n->fb_tnl_dev);
+ free_netdev(ip6n->fb_tnl_dev);
err_alloc_dev:
return err;
}
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index d67ef56454b2..837ea1eefe7f 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -180,7 +180,6 @@ vti6_tnl_unlink(struct vti6_net *ip6n, struct ip6_tnl *t)
static void vti6_dev_free(struct net_device *dev)
{
free_percpu(dev->tstats);
- free_netdev(dev);
}
static int vti6_tnl_create2(struct net_device *dev)
@@ -235,7 +234,7 @@ static struct ip6_tnl *vti6_tnl_create(struct net *net, struct __ip6_tnl_parm *p
return t;
failed_free:
- vti6_dev_free(dev);
+ free_netdev(dev);
failed:
return NULL;
}
@@ -842,7 +841,8 @@ static const struct net_device_ops vti6_netdev_ops = {
static void vti6_dev_setup(struct net_device *dev)
{
dev->netdev_ops = &vti6_netdev_ops;
- dev->destructor = vti6_dev_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = vti6_dev_free;
dev->type = ARPHRD_TUNNEL6;
dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr);
@@ -1100,7 +1100,7 @@ static int __net_init vti6_init_net(struct net *net)
return 0;
err_register:
- vti6_dev_free(ip6n->fb_tnl_dev);
+ free_netdev(ip6n->fb_tnl_dev);
err_alloc_dev:
return err;
}
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 374997d26488..2ecb39b943b5 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -733,7 +733,7 @@ static void reg_vif_setup(struct net_device *dev)
dev->mtu = 1500 - sizeof(struct ipv6hdr) - 8;
dev->flags = IFF_NOARP;
dev->netdev_ops = &reg_vif_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->features |= NETIF_F_NETNS_LOCAL;
}
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index cc8e3ae9ca73..e88bcb8ff0fd 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -219,7 +219,7 @@ static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib,
u64 buff64[SNMP_MIB_MAX];
int i;
- memset(buff64, 0, sizeof(unsigned long) * SNMP_MIB_MAX);
+ memset(buff64, 0, sizeof(u64) * SNMP_MIB_MAX);
snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff);
for (i = 0; itemlist[i].name; i++)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index dc61b0b5e64e..322bd62e688b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2804,6 +2804,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *arg)
if ((rt->dst.dev == dev || !dev) &&
rt != adn->net->ipv6.ip6_null_entry &&
(rt->rt6i_nsiblings == 0 ||
+ (dev && netdev_unregistering(dev)) ||
!rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
return -1;
@@ -3721,7 +3722,11 @@ static int ip6_route_dev_notify(struct notifier_block *this,
net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
- } else if (event == NETDEV_UNREGISTER) {
+ } else if (event == NETDEV_UNREGISTER &&
+ dev->reg_state != NETREG_UNREGISTERED) {
+		/* NETDEV_UNREGISTER could be fired multiple times by
+ * netdev_wait_allrefs(). Make sure we only call this once.
+ */
in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev);
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 61e5902f0687..f8ad15891cd7 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -265,7 +265,7 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net,
return nt;
failed_free:
- ipip6_dev_free(dev);
+ free_netdev(dev);
failed:
return NULL;
}
@@ -305,7 +305,7 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
* we try harder to allocate.
*/
kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ?
- kcalloc(cmax, sizeof(*kp), GFP_KERNEL) :
+ kcalloc(cmax, sizeof(*kp), GFP_KERNEL | __GFP_NOWARN) :
NULL;
rcu_read_lock();
@@ -1336,7 +1336,6 @@ static void ipip6_dev_free(struct net_device *dev)
dst_cache_destroy(&tunnel->dst_cache);
free_percpu(dev->tstats);
- free_netdev(dev);
}
#define SIT_FEATURES (NETIF_F_SG | \
@@ -1351,7 +1350,8 @@ static void ipip6_tunnel_setup(struct net_device *dev)
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
dev->netdev_ops = &ipip6_netdev_ops;
- dev->destructor = ipip6_dev_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ipip6_dev_free;
dev->type = ARPHRD_SIT;
dev->hard_header_len = LL_MAX_HEADER + t_hlen;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 06ec39b79609..75703fda23e7 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -879,7 +879,8 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net,
struct sock *sk;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
- if (INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif))
+ if (sk->sk_state == TCP_ESTABLISHED &&
+ INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif))
return sk;
/* Only check first socket in chain */
break;
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 08a807b29298..3ef5d913e7a3 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -43,8 +43,8 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
return 1;
#endif
- ipv6_hdr(skb)->payload_len = htons(skb->len);
__skb_push(skb, skb->data - skb_network_header(skb));
+ ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
if (xo && (xo->flags & XFRM_GRO)) {
skb_mac_header_rebuild(skb);
diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
index 74d09f91709e..3be852808a9d 100644
--- a/net/irda/irlan/irlan_eth.c
+++ b/net/irda/irlan/irlan_eth.c
@@ -65,7 +65,7 @@ static void irlan_eth_setup(struct net_device *dev)
ether_setup(dev);
dev->netdev_ops = &irlan_eth_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->min_mtu = 0;
dev->max_mtu = ETH_MAX_MTU;
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 512dc43d0ce6..b1432b668033 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1157,6 +1157,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
goto out;
}
+ err = -ENOBUFS;
key = ext_hdrs[SADB_EXT_KEY_AUTH - 1];
if (sa->sadb_sa_auth) {
int keysize = 0;
@@ -1168,8 +1169,10 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
if (key)
keysize = (key->sadb_key_bits + 7) / 8;
x->aalg = kmalloc(sizeof(*x->aalg) + keysize, GFP_KERNEL);
- if (!x->aalg)
+ if (!x->aalg) {
+ err = -ENOMEM;
goto out;
+ }
strcpy(x->aalg->alg_name, a->name);
x->aalg->alg_key_len = 0;
if (key) {
@@ -1188,8 +1191,10 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
goto out;
}
x->calg = kmalloc(sizeof(*x->calg), GFP_KERNEL);
- if (!x->calg)
+ if (!x->calg) {
+ err = -ENOMEM;
goto out;
+ }
strcpy(x->calg->alg_name, a->name);
x->props.calgo = sa->sadb_sa_encrypt;
} else {
@@ -1203,8 +1208,10 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
if (key)
keysize = (key->sadb_key_bits + 7) / 8;
x->ealg = kmalloc(sizeof(*x->ealg) + keysize, GFP_KERNEL);
- if (!x->ealg)
+ if (!x->ealg) {
+ err = -ENOMEM;
goto out;
+ }
strcpy(x->ealg->alg_name, a->name);
x->ealg->alg_key_len = 0;
if (key) {
@@ -1249,8 +1256,10 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
struct xfrm_encap_tmpl *natt;
x->encap = kmalloc(sizeof(*x->encap), GFP_KERNEL);
- if (!x->encap)
+ if (!x->encap) {
+ err = -ENOMEM;
goto out;
+ }
natt = x->encap;
n_type = ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1];
@@ -2755,6 +2764,8 @@ static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, const struct sad
int err, err2;
err = xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, true);
+ if (!err)
+ xfrm_garbage_collect(net);
err2 = unicast_flush_resp(sk, hdr);
if (err || err2) {
if (err == -ESRCH) /* empty table - old silent behavior */
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 8b21af7321b9..4de2ec94b08c 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -114,12 +114,13 @@ static void l2tp_eth_get_stats64(struct net_device *dev,
{
struct l2tp_eth *priv = netdev_priv(dev);
- stats->tx_bytes = atomic_long_read(&priv->tx_bytes);
- stats->tx_packets = atomic_long_read(&priv->tx_packets);
- stats->tx_dropped = atomic_long_read(&priv->tx_dropped);
- stats->rx_bytes = atomic_long_read(&priv->rx_bytes);
- stats->rx_packets = atomic_long_read(&priv->rx_packets);
- stats->rx_errors = atomic_long_read(&priv->rx_errors);
+ stats->tx_bytes = (unsigned long) atomic_long_read(&priv->tx_bytes);
+ stats->tx_packets = (unsigned long) atomic_long_read(&priv->tx_packets);
+ stats->tx_dropped = (unsigned long) atomic_long_read(&priv->tx_dropped);
+ stats->rx_bytes = (unsigned long) atomic_long_read(&priv->rx_bytes);
+ stats->rx_packets = (unsigned long) atomic_long_read(&priv->rx_packets);
+ stats->rx_errors = (unsigned long) atomic_long_read(&priv->rx_errors);
+
}
static const struct net_device_ops l2tp_eth_netdev_ops = {
@@ -141,7 +142,7 @@ static void l2tp_eth_dev_setup(struct net_device *dev)
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->features |= NETIF_F_LLTX;
dev->netdev_ops = &l2tp_eth_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
}
static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len)
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 6c2e6060cd54..4a388fe8c2d1 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -902,6 +902,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
default:
return -EINVAL;
}
+ sdata->u.ap.req_smps = sdata->smps_mode;
+
sdata->needed_rx_chains = sdata->local->rx_chains;
sdata->vif.bss_conf.beacon_int = params->beacon_interval;
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 665501ac358f..5e002f62c235 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1531,7 +1531,7 @@ ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status)
return true;
/* can't handle non-legacy preamble yet */
if (status->flag & RX_FLAG_MACTIME_PLCP_START &&
- status->encoding != RX_ENC_LEGACY)
+ status->encoding == RX_ENC_LEGACY)
return true;
return false;
}
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 8fae1a72e6a7..f5f50150ba1c 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1213,7 +1213,6 @@ static const struct net_device_ops ieee80211_monitorif_ops = {
static void ieee80211_if_free(struct net_device *dev)
{
free_percpu(dev->tstats);
- free_netdev(dev);
}
static void ieee80211_if_setup(struct net_device *dev)
@@ -1221,7 +1220,8 @@ static void ieee80211_if_setup(struct net_device *dev)
ether_setup(dev);
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->netdev_ops = &ieee80211_dataif_ops;
- dev->destructor = ieee80211_if_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ieee80211_if_free;
}
static void ieee80211_if_setup_no_queue(struct net_device *dev)
@@ -1816,6 +1816,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
ret = dev_alloc_name(ndev, ndev->name);
if (ret < 0) {
ieee80211_if_free(ndev);
+ free_netdev(ndev);
return ret;
}
@@ -1905,7 +1906,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
ret = register_netdevice(ndev);
if (ret) {
- ieee80211_if_free(ndev);
+ free_netdev(ndev);
return ret;
}
}
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 0ea9712bd99e..cc8e6ea1b27e 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -601,7 +601,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
struct ieee80211_supported_band *sband;
struct ieee80211_chanctx_conf *chanctx_conf;
struct ieee80211_channel *chan;
- u32 rate_flags, rates = 0;
+ u32 rates = 0;
sdata_assert_lock(sdata);
@@ -612,7 +612,6 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
return;
}
chan = chanctx_conf->def.chan;
- rate_flags = ieee80211_chandef_rate_flags(&chanctx_conf->def);
rcu_read_unlock();
sband = local->hw.wiphy->bands[chan->band];
shift = ieee80211_vif_get_shift(&sdata->vif);
@@ -636,9 +635,6 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
*/
rates_len = 0;
for (i = 0; i < sband->n_bitrates; i++) {
- if ((rate_flags & sband->bitrates[i].flags)
- != rate_flags)
- continue;
rates |= BIT(i);
rates_len++;
}
@@ -2818,7 +2814,7 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband,
u32 *rates, u32 *basic_rates,
bool *have_higher_than_11mbit,
int *min_rate, int *min_rate_index,
- int shift, u32 rate_flags)
+ int shift)
{
int i, j;
@@ -2846,8 +2842,6 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband,
int brate;
br = &sband->bitrates[j];
- if ((rate_flags & br->flags) != rate_flags)
- continue;
brate = DIV_ROUND_UP(br->bitrate, (1 << shift) * 5);
if (brate == rate) {
@@ -4398,40 +4392,32 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
return -ENOMEM;
}
- if (new_sta || override) {
- err = ieee80211_prep_channel(sdata, cbss);
- if (err) {
- if (new_sta)
- sta_info_free(local, new_sta);
- return -EINVAL;
- }
- }
-
+ /*
+ * Set up the information for the new channel before setting the
+ * new channel. We can't - completely race-free - change the basic
+ * rates bitmap and the channel (sband) that it refers to, but if
+ * we set it up before we at least avoid calling into the driver's
+ * bss_info_changed() method with invalid information (since we do
+ * call that from changing the channel - only for IDLE and perhaps
+ * some others, but ...).
+ *
+ * So to avoid that, just set up all the new information before the
+ * channel, but tell the driver to apply it only afterwards, since
+ * it might need the new channel for that.
+ */
if (new_sta) {
u32 rates = 0, basic_rates = 0;
bool have_higher_than_11mbit;
int min_rate = INT_MAX, min_rate_index = -1;
- struct ieee80211_chanctx_conf *chanctx_conf;
const struct cfg80211_bss_ies *ies;
int shift = ieee80211_vif_get_shift(&sdata->vif);
- u32 rate_flags;
-
- rcu_read_lock();
- chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
- if (WARN_ON(!chanctx_conf)) {
- rcu_read_unlock();
- sta_info_free(local, new_sta);
- return -EINVAL;
- }
- rate_flags = ieee80211_chandef_rate_flags(&chanctx_conf->def);
- rcu_read_unlock();
ieee80211_get_rates(sband, bss->supp_rates,
bss->supp_rates_len,
&rates, &basic_rates,
&have_higher_than_11mbit,
&min_rate, &min_rate_index,
- shift, rate_flags);
+ shift);
/*
* This used to be a workaround for basic rates missing
@@ -4489,8 +4475,22 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
sdata->vif.bss_conf.sync_dtim_count = 0;
}
rcu_read_unlock();
+ }
- /* tell driver about BSSID, basic rates and timing */
+ if (new_sta || override) {
+ err = ieee80211_prep_channel(sdata, cbss);
+ if (err) {
+ if (new_sta)
+ sta_info_free(local, new_sta);
+ return -EINVAL;
+ }
+ }
+
+ if (new_sta) {
+ /*
+ * tell driver about BSSID, basic rates and timing
+ * this was set up above, before setting the channel
+ */
ieee80211_bss_info_change_notify(sdata,
BSS_CHANGED_BSSID | BSS_CHANGED_BASIC_RATES |
BSS_CHANGED_BEACON_INT);
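In short, the reordering the comment above describes gives ieee80211_prep_connection() this shape (an outline of the call order, not the literal code):

if (new_sta)
	/* 1. compute rates/basic_rates and fill bss_conf - no driver callbacks yet */
if (new_sta || override)
	err = ieee80211_prep_channel(sdata, cbss);	/* 2. may switch channel and sband */
if (new_sta)
	ieee80211_bss_info_change_notify(sdata,		/* 3. apply what step 1 set up */
		BSS_CHANGED_BSSID | BSS_CHANGED_BASIC_RATES | BSS_CHANGED_BEACON_INT);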
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 1f75280ba26c..3674fe3d67dc 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1613,12 +1613,16 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
*/
if (!ieee80211_hw_check(&sta->local->hw, AP_LINK_PS) &&
!ieee80211_has_morefrags(hdr->frame_control) &&
+ !ieee80211_is_back_req(hdr->frame_control) &&
!(status->rx_flags & IEEE80211_RX_DEFERRED_RELEASE) &&
(rx->sdata->vif.type == NL80211_IFTYPE_AP ||
rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
- /* PM bit is only checked in frames where it isn't reserved,
+ /*
+ * PM bit is only checked in frames where it isn't reserved,
* in AP mode it's reserved in non-bufferable management frames
* (cf. IEEE 802.11-2012 8.2.4.1.7 Power Management field)
+ * BAR frames should be ignored as specified in
+ * IEEE 802.11-2012 10.2.1.2.
*/
(!ieee80211_is_mgmt(hdr->frame_control) ||
ieee80211_is_bufferable_mmpdu(hdr->frame_control))) {
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index c1ef22df865f..cc19614ff4e6 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -17,6 +17,7 @@
#include <asm/unaligned.h>
#include <net/mac80211.h>
#include <crypto/aes.h>
+#include <crypto/algapi.h>
#include "ieee80211_i.h"
#include "michael.h"
@@ -153,7 +154,7 @@ ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)
data_len = skb->len - hdrlen - MICHAEL_MIC_LEN;
key = &rx->key->conf.key[NL80211_TKIP_DATA_OFFSET_RX_MIC_KEY];
michael_mic(key, hdr, data, data_len, mic);
- if (memcmp(mic, data + data_len, MICHAEL_MIC_LEN) != 0)
+ if (crypto_memneq(mic, data + data_len, MICHAEL_MIC_LEN))
goto mic_fail;
/* remove Michael MIC from payload */
@@ -1048,7 +1049,7 @@ ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx)
bip_aad(skb, aad);
ieee80211_aes_cmac(key->u.aes_cmac.tfm, aad,
skb->data + 24, skb->len - 24, mic);
- if (memcmp(mic, mmie->mic, sizeof(mmie->mic)) != 0) {
+ if (crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) {
key->u.aes_cmac.icverrors++;
return RX_DROP_UNUSABLE;
}
@@ -1098,7 +1099,7 @@ ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx)
bip_aad(skb, aad);
ieee80211_aes_cmac_256(key->u.aes_cmac.tfm, aad,
skb->data + 24, skb->len - 24, mic);
- if (memcmp(mic, mmie->mic, sizeof(mmie->mic)) != 0) {
+ if (crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) {
key->u.aes_cmac.icverrors++;
return RX_DROP_UNUSABLE;
}
@@ -1202,7 +1203,7 @@ ieee80211_crypto_aes_gmac_decrypt(struct ieee80211_rx_data *rx)
if (ieee80211_aes_gmac(key->u.aes_gmac.tfm, aad, nonce,
skb->data + 24, skb->len - 24,
mic) < 0 ||
- memcmp(mic, mmie->mic, sizeof(mmie->mic)) != 0) {
+ crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) {
key->u.aes_gmac.icverrors++;
return RX_DROP_UNUSABLE;
}
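The memcmp()-to-crypto_memneq() conversions above matter because memcmp() may return as soon as the first byte differs, which can leak how much of a forged MIC was correct. A standalone sketch of the constant-time idea (illustrative only, not the kernel's crypto_memneq() implementation):

#include <stddef.h>

int example_memneq(const void *a, const void *b, size_t len)
{
	const unsigned char *pa = a, *pb = b;
	unsigned char diff = 0;
	size_t i;

	/* Accumulate differences instead of branching, so the run time does
	 * not depend on where the first mismatch occurs. */
	for (i = 0; i < len; i++)
		diff |= pa[i] ^ pb[i];

	return diff != 0;	/* non-zero means "not equal", like crypto_memneq() */
}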
diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c
index 06019dba4b10..bd88a9b80773 100644
--- a/net/mac802154/iface.c
+++ b/net/mac802154/iface.c
@@ -526,8 +526,6 @@ static void mac802154_wpan_free(struct net_device *dev)
struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
mac802154_llsec_destroy(&sdata->sec);
-
- free_netdev(dev);
}
static void ieee802154_if_setup(struct net_device *dev)
@@ -593,7 +591,8 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata,
sdata->dev->dev_addr);
sdata->dev->header_ops = &mac802154_header_ops;
- sdata->dev->destructor = mac802154_wpan_free;
+ sdata->dev->needs_free_netdev = true;
+ sdata->dev->priv_destructor = mac802154_wpan_free;
sdata->dev->netdev_ops = &mac802154_wpan_ops;
sdata->dev->ml_priv = &mac802154_mlme_wpan;
wpan_dev->promiscuous_mode = false;
@@ -608,7 +607,7 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata,
break;
case NL802154_IFTYPE_MONITOR:
- sdata->dev->destructor = free_netdev;
+ sdata->dev->needs_free_netdev = true;
sdata->dev->netdev_ops = &mac802154_monitor_ops;
wpan_dev->promiscuous_mode = true;
break;
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 89193a634da4..04a3128adcf0 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -94,7 +94,6 @@ static void internal_dev_destructor(struct net_device *dev)
struct vport *vport = ovs_internal_dev_get_vport(dev);
ovs_vport_free(vport);
- free_netdev(dev);
}
static void
@@ -156,7 +155,8 @@ static void do_setup(struct net_device *netdev)
netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH |
IFF_PHONY_HEADROOM | IFF_NO_QUEUE;
- netdev->destructor = internal_dev_destructor;
+ netdev->needs_free_netdev = true;
+ netdev->priv_destructor = internal_dev_destructor;
netdev->ethtool_ops = &internal_dev_ethtool_ops;
netdev->rtnl_link_ops = &internal_dev_link_ops;
diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c
index 21c28b51be94..2c9337946e30 100644
--- a/net/phonet/pep-gprs.c
+++ b/net/phonet/pep-gprs.c
@@ -236,7 +236,7 @@ static void gprs_setup(struct net_device *dev)
dev->tx_queue_len = 10;
dev->netdev_ops = &gprs_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
}
/*
diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c
index 0a4e28477ad9..54369225766e 100644
--- a/net/rxrpc/key.c
+++ b/net/rxrpc/key.c
@@ -217,7 +217,7 @@ static int rxrpc_krb5_decode_principal(struct krb5_principal *princ,
unsigned int *_toklen)
{
const __be32 *xdr = *_xdr;
- unsigned int toklen = *_toklen, n_parts, loop, tmp;
+ unsigned int toklen = *_toklen, n_parts, loop, tmp, paddedlen;
/* there must be at least one name, and at least #names+1 length
* words */
@@ -247,16 +247,16 @@ static int rxrpc_krb5_decode_principal(struct krb5_principal *princ,
toklen -= 4;
if (tmp <= 0 || tmp > AFSTOKEN_STRING_MAX)
return -EINVAL;
- if (tmp > toklen)
+ paddedlen = (tmp + 3) & ~3;
+ if (paddedlen > toklen)
return -EINVAL;
princ->name_parts[loop] = kmalloc(tmp + 1, GFP_KERNEL);
if (!princ->name_parts[loop])
return -ENOMEM;
memcpy(princ->name_parts[loop], xdr, tmp);
princ->name_parts[loop][tmp] = 0;
- tmp = (tmp + 3) & ~3;
- toklen -= tmp;
- xdr += tmp >> 2;
+ toklen -= paddedlen;
+ xdr += paddedlen >> 2;
}
if (toklen < 4)
@@ -265,16 +265,16 @@ static int rxrpc_krb5_decode_principal(struct krb5_principal *princ,
toklen -= 4;
if (tmp <= 0 || tmp > AFSTOKEN_K5_REALM_MAX)
return -EINVAL;
- if (tmp > toklen)
+ paddedlen = (tmp + 3) & ~3;
+ if (paddedlen > toklen)
return -EINVAL;
princ->realm = kmalloc(tmp + 1, GFP_KERNEL);
if (!princ->realm)
return -ENOMEM;
memcpy(princ->realm, xdr, tmp);
princ->realm[tmp] = 0;
- tmp = (tmp + 3) & ~3;
- toklen -= tmp;
- xdr += tmp >> 2;
+ toklen -= paddedlen;
+ xdr += paddedlen >> 2;
_debug("%s/...@%s", princ->name_parts[0], princ->realm);
@@ -293,7 +293,7 @@ static int rxrpc_krb5_decode_tagged_data(struct krb5_tagged_data *td,
unsigned int *_toklen)
{
const __be32 *xdr = *_xdr;
- unsigned int toklen = *_toklen, len;
+ unsigned int toklen = *_toklen, len, paddedlen;
/* there must be at least one tag and one length word */
if (toklen <= 8)
@@ -307,15 +307,17 @@ static int rxrpc_krb5_decode_tagged_data(struct krb5_tagged_data *td,
toklen -= 8;
if (len > max_data_size)
return -EINVAL;
+ paddedlen = (len + 3) & ~3;
+ if (paddedlen > toklen)
+ return -EINVAL;
td->data_len = len;
if (len > 0) {
td->data = kmemdup(xdr, len, GFP_KERNEL);
if (!td->data)
return -ENOMEM;
- len = (len + 3) & ~3;
- toklen -= len;
- xdr += len >> 2;
+ toklen -= paddedlen;
+ xdr += paddedlen >> 2;
}
_debug("tag %x len %x", td->tag, td->data_len);
@@ -387,7 +389,7 @@ static int rxrpc_krb5_decode_ticket(u8 **_ticket, u16 *_tktlen,
const __be32 **_xdr, unsigned int *_toklen)
{
const __be32 *xdr = *_xdr;
- unsigned int toklen = *_toklen, len;
+ unsigned int toklen = *_toklen, len, paddedlen;
/* there must be at least one length word */
if (toklen <= 4)
@@ -399,6 +401,9 @@ static int rxrpc_krb5_decode_ticket(u8 **_ticket, u16 *_tktlen,
toklen -= 4;
if (len > AFSTOKEN_K5_TIX_MAX)
return -EINVAL;
+ paddedlen = (len + 3) & ~3;
+ if (paddedlen > toklen)
+ return -EINVAL;
*_tktlen = len;
_debug("ticket len %u", len);
@@ -407,9 +412,8 @@ static int rxrpc_krb5_decode_ticket(u8 **_ticket, u16 *_tktlen,
*_ticket = kmemdup(xdr, len, GFP_KERNEL);
if (!*_ticket)
return -ENOMEM;
- len = (len + 3) & ~3;
- toklen -= len;
- xdr += len >> 2;
+ toklen -= paddedlen;
+ xdr += paddedlen >> 2;
}
*_xdr = xdr;
@@ -552,7 +556,7 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep)
{
const __be32 *xdr = prep->data, *token;
const char *cp;
- unsigned int len, tmp, loop, ntoken, toklen, sec_ix;
+ unsigned int len, paddedlen, loop, ntoken, toklen, sec_ix;
size_t datalen = prep->datalen;
int ret;
@@ -578,22 +582,21 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep)
if (len < 1 || len > AFSTOKEN_CELL_MAX)
goto not_xdr;
datalen -= 4;
- tmp = (len + 3) & ~3;
- if (tmp > datalen)
+ paddedlen = (len + 3) & ~3;
+ if (paddedlen > datalen)
goto not_xdr;
cp = (const char *) xdr;
for (loop = 0; loop < len; loop++)
if (!isprint(cp[loop]))
goto not_xdr;
- if (len < tmp)
- for (; loop < tmp; loop++)
- if (cp[loop])
- goto not_xdr;
+ for (; loop < paddedlen; loop++)
+ if (cp[loop])
+ goto not_xdr;
_debug("cellname: [%u/%u] '%*.*s'",
- len, tmp, len, len, (const char *) xdr);
- datalen -= tmp;
- xdr += tmp >> 2;
+ len, paddedlen, len, len, (const char *) xdr);
+ datalen -= paddedlen;
+ xdr += paddedlen >> 2;
/* get the token count */
if (datalen < 12)
@@ -614,10 +617,11 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep)
sec_ix = ntohl(*xdr);
datalen -= 4;
_debug("token: [%x/%zx] %x", toklen, datalen, sec_ix);
- if (toklen < 20 || toklen > datalen)
+ paddedlen = (toklen + 3) & ~3;
+ if (toklen < 20 || toklen > datalen || paddedlen > datalen)
goto not_xdr;
- datalen -= (toklen + 3) & ~3;
- xdr += (toklen + 3) >> 2;
+ datalen -= paddedlen;
+ xdr += paddedlen >> 2;
} while (--loop > 0);
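The rxrpc changes above all follow one pattern: validate the 4-byte-padded length against the remaining token before allocating or advancing, instead of checking the unpadded length and only padding afterwards. A self-contained sketch of that check (the names are illustrative, not the rxrpc code):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

int xdr_pull_blob(const uint8_t **pp, unsigned int *remain,
		  uint32_t len, uint8_t **out)
{
	unsigned int paddedlen = (len + 3) & ~3u;

	/* Reject before allocating: the data plus padding must still fit in
	 * what is left, and paddedlen < len catches a wrapped addition. */
	if (paddedlen < len || paddedlen > *remain)
		return -1;

	*out = malloc(len ? len : 1);
	if (!*out)
		return -1;
	memcpy(*out, *pp, len);

	*pp += paddedlen;	/* consume the data plus its padding */
	*remain -= paddedlen;
	return 0;
}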
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 164b5ac094be..7dc5892671c8 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -94,8 +94,10 @@ static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
k++;
}
- if (n)
+ if (n) {
+ err = -EINVAL;
goto err_out;
+ }
return keys_ex;
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index f42008b29311..b062bc80c7cb 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -132,21 +132,21 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
}
}
- spin_lock_bh(&police->tcf_lock);
if (est) {
err = gen_replace_estimator(&police->tcf_bstats, NULL,
&police->tcf_rate_est,
&police->tcf_lock,
NULL, est);
if (err)
- goto failure_unlock;
+ goto failure;
} else if (tb[TCA_POLICE_AVRATE] &&
(ret == ACT_P_CREATED ||
!gen_estimator_active(&police->tcf_rate_est))) {
err = -EINVAL;
- goto failure_unlock;
+ goto failure;
}
+ spin_lock_bh(&police->tcf_lock);
/* No failure allowed after this point */
police->tcfp_mtu = parm->mtu;
if (police->tcfp_mtu == 0) {
@@ -192,8 +192,6 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
return ret;
-failure_unlock:
- spin_unlock_bh(&police->tcf_lock);
failure:
qdisc_put_rtab(P_tab);
qdisc_put_rtab(R_tab);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index e88342fde1bc..cfdbfa18a95e 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1019,7 +1019,8 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
return sch;
}
/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
- ops->destroy(sch);
+ if (ops->destroy)
+ ops->destroy(sch);
err_out3:
dev_put(dev);
kfree((char *) sch - sch->padded);
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 8c589230794f..3dcd0ecf3d99 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -275,6 +275,7 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep)
if (sctp_sk(sk)->bind_hash)
sctp_put_port(sk);
+ sctp_sk(sk)->ep = NULL;
sock_put(sk);
}
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
index 048954eee984..9a647214a91e 100644
--- a/net/sctp/sctp_diag.c
+++ b/net/sctp/sctp_diag.c
@@ -278,7 +278,6 @@ out:
static int sctp_sock_dump(struct sock *sk, void *p)
{
- struct sctp_endpoint *ep = sctp_sk(sk)->ep;
struct sctp_comm_param *commp = p;
struct sk_buff *skb = commp->skb;
struct netlink_callback *cb = commp->cb;
@@ -287,7 +286,9 @@ static int sctp_sock_dump(struct sock *sk, void *p)
int err = 0;
lock_sock(sk);
- list_for_each_entry(assoc, &ep->asocs, asocs) {
+ if (!sctp_sk(sk)->ep)
+ goto release;
+ list_for_each_entry(assoc, &sctp_sk(sk)->ep->asocs, asocs) {
if (cb->args[4] < cb->args[1])
goto next;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f16c8d97b7f3..3a8318e518f1 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4622,13 +4622,13 @@ int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *),
for (head = sctp_ep_hashtable; hash < sctp_ep_hashsize;
hash++, head++) {
- read_lock(&head->lock);
+ read_lock_bh(&head->lock);
sctp_for_each_hentry(epb, &head->chain) {
err = cb(sctp_ep(epb), p);
if (err)
break;
}
- read_unlock(&head->lock);
+ read_unlock_bh(&head->lock);
}
return err;
@@ -4666,9 +4666,8 @@ int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *),
if (err)
return err;
- sctp_transport_get_idx(net, &hti, pos);
- obj = sctp_transport_get_next(net, &hti);
- for (; obj && !IS_ERR(obj); obj = sctp_transport_get_next(net, &hti)) {
+ obj = sctp_transport_get_idx(net, &hti, pos + 1);
+ for (; !IS_ERR_OR_NULL(obj); obj = sctp_transport_get_next(net, &hti)) {
struct sctp_transport *transport = obj;
if (!sctp_transport_hold(transport))
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 312ef7de57d7..ab3087687a32 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -508,7 +508,7 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err)
}
if (skb_cloned(_skb) &&
- pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_KERNEL))
+ pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC))
goto exit;
/* Now reverse the concerned fields */
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 6a7fe7660551..c77ced0109b7 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -343,7 +343,7 @@ found:
* are still connected to it and there's no way to inform "a polling
* implementation" that it should let go of a certain wait queue
*
- * In order to propagate a wake up, a wait_queue_t of the client
+ * In order to propagate a wake up, a wait_queue_entry_t of the client
* socket is enqueued on the peer_wait queue of the server socket
* whose wake function does a wake_up on the ordinary client socket
* wait queue. This connection is established whenever a write (or
@@ -352,7 +352,7 @@ found:
* was relayed.
*/
-static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags,
+static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
void *key)
{
struct unix_sock *u;
@@ -999,7 +999,8 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct path path = { };
err = -EINVAL;
- if (sunaddr->sun_family != AF_UNIX)
+ if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
+ sunaddr->sun_family != AF_UNIX)
goto out;
if (addr_len == sizeof(short)) {
@@ -1110,6 +1111,10 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
unsigned int hash;
int err;
+ err = -EINVAL;
+ if (alen < offsetofend(struct sockaddr, sa_family))
+ goto out;
+
if (addr->sa_family != AF_UNSPEC) {
err = unix_mkname(sunaddr, alen, &hash);
if (err < 0)
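The af_unix hunks above guard against short sockaddr buffers by comparing the caller-supplied length against offsetofend(..., sun_family) / the sa_family offset before reading the family field. A userspace sketch of the same check (the offsetofend() macro is reproduced here only for illustration):

#include <stddef.h>
#include <sys/socket.h>
#include <sys/un.h>

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

int unix_addr_ok(const struct sockaddr_un *sun, size_t addr_len)
{
	/* Too short to even contain sun_family: reject before dereferencing. */
	if (addr_len < offsetofend(struct sockaddr_un, sun_family))
		return 0;

	return sun->sun_family == AF_UNIX;
}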
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 1a4db6790e20..6cdb054484d6 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -914,13 +914,12 @@ int call_commit_handler(struct net_device *dev)
* Main IOCTl dispatcher.
* Check the type of IOCTL and call the appropriate wrapper...
*/
-static int wireless_process_ioctl(struct net *net, struct ifreq *ifr,
+static int wireless_process_ioctl(struct net *net, struct iwreq *iwr,
unsigned int cmd,
struct iw_request_info *info,
wext_ioctl_func standard,
wext_ioctl_func private)
{
- struct iwreq *iwr = (struct iwreq *) ifr;
struct net_device *dev;
iw_handler handler;
@@ -928,7 +927,7 @@ static int wireless_process_ioctl(struct net *net, struct ifreq *ifr,
* The copy_to/from_user() of ifr is also dealt with in there */
/* Make sure the device exist */
- if ((dev = __dev_get_by_name(net, ifr->ifr_name)) == NULL)
+ if ((dev = __dev_get_by_name(net, iwr->ifr_name)) == NULL)
return -ENODEV;
/* A bunch of special cases, then the generic case...
@@ -957,9 +956,6 @@ static int wireless_process_ioctl(struct net *net, struct ifreq *ifr,
else if (private)
return private(dev, iwr, cmd, info, handler);
}
- /* Old driver API : call driver ioctl handler */
- if (dev->netdev_ops->ndo_do_ioctl)
- return dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd);
return -EOPNOTSUPP;
}
@@ -977,7 +973,7 @@ static int wext_permission_check(unsigned int cmd)
}
/* entry point from dev ioctl */
-static int wext_ioctl_dispatch(struct net *net, struct ifreq *ifr,
+static int wext_ioctl_dispatch(struct net *net, struct iwreq *iwr,
unsigned int cmd, struct iw_request_info *info,
wext_ioctl_func standard,
wext_ioctl_func private)
@@ -987,9 +983,9 @@ static int wext_ioctl_dispatch(struct net *net, struct ifreq *ifr,
if (ret)
return ret;
- dev_load(net, ifr->ifr_name);
+ dev_load(net, iwr->ifr_name);
rtnl_lock();
- ret = wireless_process_ioctl(net, ifr, cmd, info, standard, private);
+ ret = wireless_process_ioctl(net, iwr, cmd, info, standard, private);
rtnl_unlock();
return ret;
@@ -1039,18 +1035,18 @@ static int ioctl_standard_call(struct net_device * dev,
}
-int wext_handle_ioctl(struct net *net, struct ifreq *ifr, unsigned int cmd,
+int wext_handle_ioctl(struct net *net, struct iwreq *iwr, unsigned int cmd,
void __user *arg)
{
struct iw_request_info info = { .cmd = cmd, .flags = 0 };
int ret;
- ret = wext_ioctl_dispatch(net, ifr, cmd, &info,
+ ret = wext_ioctl_dispatch(net, iwr, cmd, &info,
ioctl_standard_call,
ioctl_private_call);
if (ret >= 0 &&
IW_IS_GET(cmd) &&
- copy_to_user(arg, ifr, sizeof(struct iwreq)))
+ copy_to_user(arg, iwr, sizeof(struct iwreq)))
return -EFAULT;
return ret;
@@ -1107,7 +1103,7 @@ int compat_wext_handle_ioctl(struct net *net, unsigned int cmd,
info.cmd = cmd;
info.flags = IW_REQUEST_FLAG_COMPAT;
- ret = wext_ioctl_dispatch(net, (struct ifreq *) &iwr, cmd, &info,
+ ret = wext_ioctl_dispatch(net, &iwr, cmd, &info,
compat_standard_call,
compat_private_call);
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index abf81b329dc1..55b2ac300995 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -4,8 +4,7 @@
obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \
xfrm_input.o xfrm_output.o \
- xfrm_sysctl.o xfrm_replay.o
-obj-$(CONFIG_XFRM_OFFLOAD) += xfrm_device.o
+ xfrm_sysctl.o xfrm_replay.o xfrm_device.o
obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o
obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o
obj-$(CONFIG_XFRM_USER) += xfrm_user.o
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 574e6f32f94f..5aba03685d7d 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -22,6 +22,7 @@
#include <net/xfrm.h>
#include <linux/notifier.h>
+#ifdef CONFIG_XFRM_OFFLOAD
int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features)
{
int err;
@@ -137,6 +138,7 @@ ok:
return true;
}
EXPORT_SYMBOL_GPL(xfrm_dev_offload_ok);
+#endif
int xfrm_dev_register(struct net_device *dev)
{
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index ed4e52d95172..643a18f72032 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1006,10 +1006,6 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
err = -ESRCH;
out:
spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
-
- if (cnt)
- xfrm_garbage_collect(net);
-
return err;
}
EXPORT_SYMBOL(xfrm_policy_flush);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 38614df33ec8..86116e9aaf3d 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2027,6 +2027,7 @@ static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
return 0;
return err;
}
+ xfrm_garbage_collect(net);
c.data.type = type;
c.event = nlh->nlmsg_type;
diff --git a/scripts/Makefile.headersinst b/scripts/Makefile.headersinst
index ce753a408c56..c583a1e1bd3c 100644
--- a/scripts/Makefile.headersinst
+++ b/scripts/Makefile.headersinst
@@ -14,7 +14,15 @@ __headers:
include scripts/Kbuild.include
srcdir := $(srctree)/$(obj)
-subdirs := $(patsubst $(srcdir)/%/.,%,$(wildcard $(srcdir)/*/.))
+
+# When make is run under a fakechroot environment, the function
+# $(wildcard $(srcdir)/*/.) doesn't only return directories, but also regular
+# files. So, we are using a combination of sort/dir/wildcard which works
+# with fakechroot.
+subdirs := $(patsubst $(srcdir)/%/,%,\
+ $(filter-out $(srcdir)/,\
+ $(sort $(dir $(wildcard $(srcdir)/*/)))))
+
# caller may set destination dir (when installing to asm/)
_dst := $(if $(dst),$(dst),$(obj))
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 4b9569fa931b..c7e4d73fe1ce 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -5533,23 +5533,6 @@ sub process {
}
}
-# Check for expedited grace periods that interrupt non-idle non-nohz
-# online CPUs. These expedited can therefore degrade real-time response
-# if used carelessly, and should be avoided where not absolutely
-# needed. It is always OK to use synchronize_rcu_expedited() and
-# synchronize_sched_expedited() at boot time (before real-time applications
-# start) and in error situations where real-time response is compromised in
-# any case. Note that synchronize_srcu_expedited() does -not- interrupt
-# other CPUs, so don't warn on uses of synchronize_srcu_expedited().
-# Of course, nothing comes for free, and srcu_read_lock() and
-# srcu_read_unlock() do contain full memory barriers in payment for
-# synchronize_srcu_expedited() non-interruption properties.
- if ($line =~ /\b(synchronize_rcu_expedited|synchronize_sched_expedited)\(/) {
- WARN("EXPEDITED_RCU_GRACE_PERIOD",
- "expedited RCU grace periods should be avoided where they can degrade real-time response\n" . $herecurr);
-
- }
-
# check of hardware specific defines
if ($line =~ m@^.\s*\#\s*if.*\b(__i386__|__powerpc64__|__sun__|__s390x__)\b@ && $realfile !~ m@include/asm-@) {
CHK("ARCH_DEFINES",
diff --git a/scripts/genksyms/genksyms.h b/scripts/genksyms/genksyms.h
index 3bffdcaaa274..b724a0290c75 100644
--- a/scripts/genksyms/genksyms.h
+++ b/scripts/genksyms/genksyms.h
@@ -75,7 +75,7 @@ struct string_list *copy_list_range(struct string_list *start,
int yylex(void);
int yyparse(void);
-void error_with_pos(const char *, ...);
+void error_with_pos(const char *, ...) __attribute__ ((format(printf, 1, 2)));
/*----------------------------------------------------------------------*/
#define xmalloc(size) ({ void *__ptr = malloc(size); \
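Adding format(printf, 1, 2) to error_with_pos() above lets the compiler type-check the variadic arguments against the format string. A small standalone example of the effect (report() is a made-up function, not part of genksyms):

#include <stdarg.h>
#include <stdio.h>

void report(const char *fmt, ...) __attribute__((format(printf, 1, 2)));

void report(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);
}

/* report("symbol %s on line %d\n", name, 42);   - accepted
 * report("symbol %s on line %d\n", 42, name);   - now flagged by -Wformat
 */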
diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile
index 90a091b6ae4d..eb8144643b78 100644
--- a/scripts/kconfig/Makefile
+++ b/scripts/kconfig/Makefile
@@ -196,7 +196,7 @@ clean-files += config.pot linux.pot
# Check that we have the required ncurses stuff installed for lxdialog (menuconfig)
PHONY += $(obj)/dochecklxdialog
-$(addprefix $(obj)/,$(lxdialog)): $(obj)/dochecklxdialog
+$(addprefix $(obj)/, mconf.o $(lxdialog)): $(obj)/dochecklxdialog
$(obj)/dochecklxdialog:
$(Q)$(CONFIG_SHELL) $(check-lxdialog) -check $(HOSTCC) $(HOST_EXTRACFLAGS) $(HOSTLOADLIBES_mconf)
diff --git a/scripts/kconfig/nconf.c b/scripts/kconfig/nconf.c
index a9bc5334a478..003114779815 100644
--- a/scripts/kconfig/nconf.c
+++ b/scripts/kconfig/nconf.c
@@ -271,7 +271,7 @@ static struct mitem k_menu_items[MAX_MENU_ITEMS];
static int items_num;
static int global_exit;
/* the currently selected button */
-const char *current_instructions = menu_instructions;
+static const char *current_instructions = menu_instructions;
static char *dialog_input_result;
static int dialog_input_result_len;
@@ -305,7 +305,7 @@ struct function_keys {
};
static const int function_keys_num = 9;
-struct function_keys function_keys[] = {
+static struct function_keys function_keys[] = {
{
.key_str = "F1",
.func = "Help",
@@ -508,7 +508,7 @@ static int get_mext_match(const char *match_str, match_f flag)
index = (index + items_num) % items_num;
while (true) {
char *str = k_menu_items[index].str;
- if (strcasestr(str, match_str) != 0)
+ if (strcasestr(str, match_str) != NULL)
return index;
if (flag == FIND_NEXT_MATCH_UP ||
flag == MATCH_TINKER_PATTERN_UP)
@@ -1067,7 +1067,7 @@ static int do_match(int key, struct match_state *state, int *ans)
static void conf(struct menu *menu)
{
- struct menu *submenu = 0;
+ struct menu *submenu = NULL;
const char *prompt = menu_get_prompt(menu);
struct symbol *sym;
int res;
@@ -1234,7 +1234,7 @@ static void show_help(struct menu *menu)
static void conf_choice(struct menu *menu)
{
const char *prompt = _(menu_get_prompt(menu));
- struct menu *child = 0;
+ struct menu *child = NULL;
struct symbol *active;
int selected_index = 0;
int last_top_row = 0;
@@ -1456,7 +1456,7 @@ static void conf_save(void)
}
}
-void setup_windows(void)
+static void setup_windows(void)
{
int lines, columns;
diff --git a/scripts/kconfig/nconf.gui.c b/scripts/kconfig/nconf.gui.c
index 4b2f44c20caf..a64b1c31253e 100644
--- a/scripts/kconfig/nconf.gui.c
+++ b/scripts/kconfig/nconf.gui.c
@@ -129,7 +129,7 @@ static void no_colors_theme(void)
mkattrn(FUNCTION_TEXT, A_REVERSE);
}
-void set_colors()
+void set_colors(void)
{
start_color();
use_default_colors();
@@ -192,7 +192,7 @@ const char *get_line(const char *text, int line_no)
int lines = 0;
if (!text)
- return 0;
+ return NULL;
for (i = 0; text[i] != '\0' && lines < line_no; i++)
if (text[i] == '\n')
diff --git a/scripts/tags.sh b/scripts/tags.sh
index d661f2f3ef61..d23dcbf17457 100755
--- a/scripts/tags.sh
+++ b/scripts/tags.sh
@@ -106,6 +106,7 @@ all_compiled_sources()
case "$i" in
*.[cS])
j=${i/\.[cS]/\.o}
+ j="${j#$tree}"
if [ -e $j ]; then
echo $i
fi
diff --git a/security/keys/internal.h b/security/keys/internal.h
index c0f8682eba69..91bc6214ae57 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -13,6 +13,7 @@
#define _INTERNAL_H
#include <linux/sched.h>
+#include <linux/wait_bit.h>
#include <linux/cred.h>
#include <linux/key-type.h>
#include <linux/task_work.h>
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index e67a526d1f30..819fd6858b49 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1106,10 +1106,8 @@ static int selinux_parse_opts_str(char *options,
opts->mnt_opts_flags = kcalloc(NUM_SEL_MNT_OPTS, sizeof(int),
GFP_KERNEL);
- if (!opts->mnt_opts_flags) {
- kfree(opts->mnt_opts);
+ if (!opts->mnt_opts_flags)
goto out_err;
- }
if (fscontext) {
opts->mnt_opts[num_mnt_opts] = fscontext;
@@ -1132,6 +1130,7 @@ static int selinux_parse_opts_str(char *options,
return 0;
out_err:
+ security_free_mnt_opts(opts);
kfree(context);
kfree(defcontext);
kfree(fscontext);
diff --git a/sound/core/control.c b/sound/core/control.c
index c109b82eef4b..6362da17ac3f 100644
--- a/sound/core/control.c
+++ b/sound/core/control.c
@@ -1577,7 +1577,7 @@ static ssize_t snd_ctl_read(struct file *file, char __user *buffer,
struct snd_ctl_event ev;
struct snd_kctl_event *kev;
while (list_empty(&ctl->events)) {
- wait_queue_t wait;
+ wait_queue_entry_t wait;
if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) {
err = -EAGAIN;
goto __end_lock;
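The wait_queue_t occurrences in this and the following sound core files are part of the tree-wide rename to wait_queue_entry_t; the open-coded wait loops themselves are unchanged. A kernel-style sketch of that loop with the new type (a pattern illustration with made-up names, not a runnable program):

wait_queue_entry_t wait;			/* was: wait_queue_t */

init_waitqueue_entry(&wait, current);
add_wait_queue(&sleep_queue, &wait);		/* sleep_queue: illustrative name */
while (!data_available()) {			/* caller-specific condition */
	set_current_state(TASK_INTERRUPTIBLE);
	schedule();
}
__set_current_state(TASK_RUNNING);
remove_wait_queue(&sleep_queue, &wait);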
diff --git a/sound/core/hwdep.c b/sound/core/hwdep.c
index 9602a7e38d8a..a73baa1242be 100644
--- a/sound/core/hwdep.c
+++ b/sound/core/hwdep.c
@@ -85,7 +85,7 @@ static int snd_hwdep_open(struct inode *inode, struct file * file)
int major = imajor(inode);
struct snd_hwdep *hw;
int err;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
if (major == snd_major) {
hw = snd_lookup_minor_data(iminor(inode),
diff --git a/sound/core/init.c b/sound/core/init.c
index 6bda8436d765..d61d2b3cd521 100644
--- a/sound/core/init.c
+++ b/sound/core/init.c
@@ -989,7 +989,7 @@ EXPORT_SYMBOL(snd_card_file_remove);
*/
int snd_power_wait(struct snd_card *card, unsigned int power_state)
{
- wait_queue_t wait;
+ wait_queue_entry_t wait;
int result = 0;
/* fastpath */
diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c
index 36baf962f9b0..cd8b7bef8d06 100644
--- a/sound/core/oss/pcm_oss.c
+++ b/sound/core/oss/pcm_oss.c
@@ -1554,7 +1554,7 @@ static int snd_pcm_oss_sync1(struct snd_pcm_substream *substream, size_t size)
ssize_t result = 0;
snd_pcm_state_t state;
long res;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
runtime = substream->runtime;
init_waitqueue_entry(&wait, current);
@@ -2387,7 +2387,7 @@ static int snd_pcm_oss_open(struct inode *inode, struct file *file)
struct snd_pcm_oss_file *pcm_oss_file;
struct snd_pcm_oss_setup setup[2];
int nonblock;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
err = nonseekable_open(inode, file);
if (err < 0)
diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c
index 5088d4b8db22..877176067072 100644
--- a/sound/core/pcm_lib.c
+++ b/sound/core/pcm_lib.c
@@ -1904,7 +1904,7 @@ static int wait_for_avail(struct snd_pcm_substream *substream,
{
struct snd_pcm_runtime *runtime = substream->runtime;
int is_playback = substream->stream == SNDRV_PCM_STREAM_PLAYBACK;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
int err = 0;
snd_pcm_uframes_t avail = 0;
long wait_time, tout;
@@ -2492,7 +2492,7 @@ static int pcm_chmap_ctl_get(struct snd_kcontrol *kcontrol,
struct snd_pcm_substream *substream;
const struct snd_pcm_chmap_elem *map;
- if (snd_BUG_ON(!info->chmap))
+ if (!info->chmap)
return -EINVAL;
substream = snd_pcm_chmap_substream(info, idx);
if (!substream)
@@ -2524,7 +2524,7 @@ static int pcm_chmap_ctl_tlv(struct snd_kcontrol *kcontrol, int op_flag,
unsigned int __user *dst;
int c, count = 0;
- if (snd_BUG_ON(!info->chmap))
+ if (!info->chmap)
return -EINVAL;
if (size < 8)
return -ENOMEM;
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 13dec5ec93f2..faa2e2be6f2e 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -1652,7 +1652,7 @@ static int snd_pcm_drain(struct snd_pcm_substream *substream,
struct snd_card *card;
struct snd_pcm_runtime *runtime;
struct snd_pcm_substream *s;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
int result = 0;
int nonblock = 0;
@@ -2353,7 +2353,7 @@ static int snd_pcm_capture_open(struct inode *inode, struct file *file)
static int snd_pcm_open(struct file *file, struct snd_pcm *pcm, int stream)
{
int err;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
if (pcm == NULL) {
err = -ENODEV;
diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c
index ab890336175f..32588ad05653 100644
--- a/sound/core/rawmidi.c
+++ b/sound/core/rawmidi.c
@@ -368,7 +368,7 @@ static int snd_rawmidi_open(struct inode *inode, struct file *file)
int err;
struct snd_rawmidi *rmidi;
struct snd_rawmidi_file *rawmidi_file = NULL;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
if ((file->f_flags & O_APPEND) && !(file->f_flags & O_NONBLOCK))
return -EINVAL; /* invalid combination */
@@ -1002,7 +1002,7 @@ static ssize_t snd_rawmidi_read(struct file *file, char __user *buf, size_t coun
while (count > 0) {
spin_lock_irq(&runtime->lock);
while (!snd_rawmidi_ready(substream)) {
- wait_queue_t wait;
+ wait_queue_entry_t wait;
if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) {
spin_unlock_irq(&runtime->lock);
return result > 0 ? result : -EAGAIN;
@@ -1306,7 +1306,7 @@ static ssize_t snd_rawmidi_write(struct file *file, const char __user *buf,
while (count > 0) {
spin_lock_irq(&runtime->lock);
while (!snd_rawmidi_ready_append(substream, count)) {
- wait_queue_t wait;
+ wait_queue_entry_t wait;
if (file->f_flags & O_NONBLOCK) {
spin_unlock_irq(&runtime->lock);
return result > 0 ? result : -EAGAIN;
@@ -1338,7 +1338,7 @@ static ssize_t snd_rawmidi_write(struct file *file, const char __user *buf,
if (file->f_flags & O_DSYNC) {
spin_lock_irq(&runtime->lock);
while (runtime->avail != runtime->buffer_size) {
- wait_queue_t wait;
+ wait_queue_entry_t wait;
unsigned int last_avail = runtime->avail;
init_waitqueue_entry(&wait, current);
add_wait_queue(&runtime->sleep, &wait);
diff --git a/sound/core/seq/seq_fifo.c b/sound/core/seq/seq_fifo.c
index 01c4cfe30c9f..a8c2822e0198 100644
--- a/sound/core/seq/seq_fifo.c
+++ b/sound/core/seq/seq_fifo.c
@@ -179,7 +179,7 @@ int snd_seq_fifo_cell_out(struct snd_seq_fifo *f,
{
struct snd_seq_event_cell *cell;
unsigned long flags;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
if (snd_BUG_ON(!f))
return -EINVAL;
diff --git a/sound/core/seq/seq_memory.c b/sound/core/seq/seq_memory.c
index d4c61ec9be13..d6e9aacdc36b 100644
--- a/sound/core/seq/seq_memory.c
+++ b/sound/core/seq/seq_memory.c
@@ -227,7 +227,7 @@ static int snd_seq_cell_alloc(struct snd_seq_pool *pool,
struct snd_seq_event_cell *cell;
unsigned long flags;
int err = -EAGAIN;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
if (pool == NULL)
return -EINVAL;
diff --git a/sound/core/timer.c b/sound/core/timer.c
index cd67d1c12cf1..884c3066b028 100644
--- a/sound/core/timer.c
+++ b/sound/core/timer.c
@@ -1964,7 +1964,7 @@ static ssize_t snd_timer_user_read(struct file *file, char __user *buffer,
spin_lock_irq(&tu->qlock);
while ((long)count - result >= unit) {
while (!tu->qused) {
- wait_queue_t wait;
+ wait_queue_entry_t wait;
if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) {
err = -EAGAIN;
diff --git a/sound/firewire/amdtp-stream.c b/sound/firewire/amdtp-stream.c
index 9e6f54f8c45d..1e26854b3425 100644
--- a/sound/firewire/amdtp-stream.c
+++ b/sound/firewire/amdtp-stream.c
@@ -682,7 +682,9 @@ static void out_stream_callback(struct fw_iso_context *context, u32 tstamp,
cycle = increment_cycle_count(cycle, 1);
if (s->handle_packet(s, 0, cycle, i) < 0) {
s->packet_index = -1;
- amdtp_stream_pcm_abort(s);
+ if (in_interrupt())
+ amdtp_stream_pcm_abort(s);
+ WRITE_ONCE(s->pcm_buffer_pointer, SNDRV_PCM_POS_XRUN);
return;
}
}
@@ -734,7 +736,9 @@ static void in_stream_callback(struct fw_iso_context *context, u32 tstamp,
/* Queueing error or detecting invalid payload. */
if (i < packets) {
s->packet_index = -1;
- amdtp_stream_pcm_abort(s);
+ if (in_interrupt())
+ amdtp_stream_pcm_abort(s);
+ WRITE_ONCE(s->pcm_buffer_pointer, SNDRV_PCM_POS_XRUN);
return;
}
diff --git a/sound/firewire/amdtp-stream.h b/sound/firewire/amdtp-stream.h
index 7e8831722821..ea1a91e99875 100644
--- a/sound/firewire/amdtp-stream.h
+++ b/sound/firewire/amdtp-stream.h
@@ -135,7 +135,7 @@ struct amdtp_stream {
/* For a PCM substream processing. */
struct snd_pcm_substream *pcm;
struct tasklet_struct period_tasklet;
- unsigned int pcm_buffer_pointer;
+ snd_pcm_uframes_t pcm_buffer_pointer;
unsigned int pcm_period_pointer;
/* To wait for first packet. */
diff --git a/sound/isa/wavefront/wavefront_synth.c b/sound/isa/wavefront/wavefront_synth.c
index 4dae9ff9ef5a..0b1e4b34b299 100644
--- a/sound/isa/wavefront/wavefront_synth.c
+++ b/sound/isa/wavefront/wavefront_synth.c
@@ -1782,7 +1782,7 @@ wavefront_should_cause_interrupt (snd_wavefront_t *dev,
int val, int port, unsigned long timeout)
{
- wait_queue_t wait;
+ wait_queue_entry_t wait;
init_waitqueue_entry(&wait, current);
spin_lock_irq(&dev->irq_lock);
diff --git a/sound/pci/hda/hda_codec.h b/sound/pci/hda/hda_codec.h
index d6fb2d5d01a7..60ce1cfc300f 100644
--- a/sound/pci/hda/hda_codec.h
+++ b/sound/pci/hda/hda_codec.h
@@ -295,6 +295,8 @@ struct hda_codec {
#define list_for_each_codec(c, bus) \
list_for_each_entry(c, &(bus)->core.codec_list, core.list)
+#define list_for_each_codec_safe(c, n, bus) \
+ list_for_each_entry_safe(c, n, &(bus)->core.codec_list, core.list)
/* snd_hda_codec_read/write optional flags */
#define HDA_RW_NO_RESPONSE_FALLBACK (1 << 0)
diff --git a/sound/pci/hda/hda_controller.c b/sound/pci/hda/hda_controller.c
index 3715a5725613..1c60beb5b70a 100644
--- a/sound/pci/hda/hda_controller.c
+++ b/sound/pci/hda/hda_controller.c
@@ -1337,8 +1337,12 @@ EXPORT_SYMBOL_GPL(azx_probe_codecs);
/* configure each codec instance */
int azx_codec_configure(struct azx *chip)
{
- struct hda_codec *codec;
- list_for_each_codec(codec, &chip->bus) {
+ struct hda_codec *codec, *next;
+
+ /* use _safe version here since snd_hda_codec_configure() deregisters
+ * the device upon error and deletes itself from the bus list.
+ */
+ list_for_each_codec_safe(codec, next, &chip->bus) {
snd_hda_codec_configure(codec);
}
return 0;
diff --git a/sound/pci/hda/hda_generic.c b/sound/pci/hda/hda_generic.c
index 2842c82363c0..71545b56b4c8 100644
--- a/sound/pci/hda/hda_generic.c
+++ b/sound/pci/hda/hda_generic.c
@@ -3174,6 +3174,7 @@ static int check_dyn_adc_switch(struct hda_codec *codec)
spec->input_paths[i][nums]);
spec->input_paths[i][nums] =
spec->input_paths[i][n];
+ spec->input_paths[i][n] = 0;
}
}
nums++;
diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c
index 1770f085c2a6..01eb1dc7b5b3 100644
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -370,10 +370,12 @@ enum {
#define IS_KBL_LP(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0x9d71)
#define IS_KBL_H(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0xa2f0)
#define IS_BXT(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0x5a98)
+#define IS_BXT_T(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0x1a98)
#define IS_GLK(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0x3198)
-#define IS_SKL_PLUS(pci) (IS_SKL(pci) || IS_SKL_LP(pci) || IS_BXT(pci)) || \
- IS_KBL(pci) || IS_KBL_LP(pci) || IS_KBL_H(pci) || \
- IS_GLK(pci)
+#define IS_CFL(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0xa348)
+#define IS_SKL_PLUS(pci) (IS_SKL(pci) || IS_SKL_LP(pci) || IS_BXT(pci) || \
+ IS_BXT_T(pci) || IS_KBL(pci) || IS_KBL_LP(pci) || \
+ IS_KBL_H(pci) || IS_GLK(pci) || IS_CFL(pci))
static char *driver_short_names[] = {
[AZX_DRIVER_ICH] = "HDA Intel",
@@ -2378,6 +2380,9 @@ static const struct pci_device_id azx_ids[] = {
/* Kabylake-H */
{ PCI_DEVICE(0x8086, 0xa2f0),
.driver_data = AZX_DRIVER_PCH | AZX_DCAPS_INTEL_SKYLAKE },
+ /* Coffee Lake */

+ { PCI_DEVICE(0x8086, 0xa348),
+ .driver_data = AZX_DRIVER_PCH | AZX_DCAPS_INTEL_SKYLAKE},
/* Broxton-P(Apollolake) */
{ PCI_DEVICE(0x8086, 0x5a98),
.driver_data = AZX_DRIVER_PCH | AZX_DCAPS_INTEL_BROXTON },
diff --git a/sound/pci/mixart/mixart_core.c b/sound/pci/mixart/mixart_core.c
index dccf3db48fe0..8bf2ce32d4a8 100644
--- a/sound/pci/mixart/mixart_core.c
+++ b/sound/pci/mixart/mixart_core.c
@@ -239,7 +239,7 @@ int snd_mixart_send_msg(struct mixart_mgr *mgr, struct mixart_msg *request, int
struct mixart_msg resp;
u32 msg_frame = 0; /* set to 0, so it's no notification to wait for, but the answer */
int err;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
long timeout;
init_waitqueue_entry(&wait, current);
@@ -284,7 +284,7 @@ int snd_mixart_send_msg_wait_notif(struct mixart_mgr *mgr,
struct mixart_msg *request, u32 notif_event)
{
int err;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
long timeout;
if (snd_BUG_ON(!notif_event))
diff --git a/sound/pci/ymfpci/ymfpci_main.c b/sound/pci/ymfpci/ymfpci_main.c
index fe4ba463b57c..1114166c685c 100644
--- a/sound/pci/ymfpci/ymfpci_main.c
+++ b/sound/pci/ymfpci/ymfpci_main.c
@@ -781,7 +781,7 @@ static snd_pcm_uframes_t snd_ymfpci_capture_pointer(struct snd_pcm_substream *su
static void snd_ymfpci_irq_wait(struct snd_ymfpci *chip)
{
- wait_queue_t wait;
+ wait_queue_entry_t wait;
int loops = 4;
while (loops-- > 0) {
diff --git a/tools/Makefile b/tools/Makefile
index c8a90d01dd8e..221e1ce78b06 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -19,6 +19,7 @@ help:
@echo ' kvm_stat - top-like utility for displaying kvm statistics'
@echo ' leds - LEDs tools'
@echo ' lguest - a minimal 32-bit x86 hypervisor'
+ @echo ' liblockdep - user-space wrapper for kernel locking-validator'
@echo ' net - misc networking tools'
@echo ' perf - Linux performance measurement and analysis tool'
@echo ' selftests - various kernel selftests'
@@ -89,7 +90,7 @@ freefall: FORCE
kvm_stat: FORCE
$(call descend,kvm/$@)
-all: acpi cgroup cpupower gpio hv firewire lguest \
+all: acpi cgroup cpupower gpio hv firewire lguest liblockdep \
perf selftests turbostat usb \
virtio vm net x86_energy_perf_policy \
tmon freefall objtool kvm_stat
@@ -103,6 +104,9 @@ cpupower_install:
cgroup_install firewire_install gpio_install hv_install lguest_install perf_install usb_install virtio_install vm_install net_install objtool_install:
$(call descend,$(@:_install=),install)
+liblockdep_install:
+ $(call descend,lib/lockdep,install)
+
selftests_install:
$(call descend,testing/$(@:_install=),install)
@@ -119,7 +123,7 @@ kvm_stat_install:
$(call descend,kvm/$(@:_install=),install)
install: acpi_install cgroup_install cpupower_install gpio_install \
- hv_install firewire_install lguest_install \
+ hv_install firewire_install lguest_install liblockdep_install \
perf_install selftests_install turbostat_install usb_install \
virtio_install vm_install net_install x86_energy_perf_policy_install \
tmon_install freefall_install objtool_install kvm_stat_install
diff --git a/tools/include/asm/sections.h b/tools/include/asm/sections.h
new file mode 100644
index 000000000000..a80643d7a7f1
--- /dev/null
+++ b/tools/include/asm/sections.h
@@ -0,0 +1,4 @@
+#ifndef __TOOLS_INCLUDE_LINUX_ASM_SECTIONS_H
+#define __TOOLS_INCLUDE_LINUX_ASM_SECTIONS_H
+
+#endif /* __TOOLS_INCLUDE_LINUX_ASM_SECTIONS_H */
diff --git a/tools/include/linux/bitops.h b/tools/include/linux/bitops.h
index 1aecad369af5..969db1981868 100644
--- a/tools/include/linux/bitops.h
+++ b/tools/include/linux/bitops.h
@@ -61,4 +61,14 @@ static inline unsigned fls_long(unsigned long l)
return fls64(l);
}
+/**
+ * rol32 - rotate a 32-bit value left
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline __u32 rol32(__u32 word, unsigned int shift)
+{
+ return (word << shift) | (word >> ((-shift) & 31));
+}
+
#endif
diff --git a/tools/include/linux/compiler-gcc.h b/tools/include/linux/compiler-gcc.h
index 825d44f89a29..bd39b2090ad1 100644
--- a/tools/include/linux/compiler-gcc.h
+++ b/tools/include/linux/compiler-gcc.h
@@ -19,3 +19,13 @@
/* &a[0] degrades to a pointer: a different type from an array */
#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
+
+#define noinline __attribute__((noinline))
+
+#define __packed __attribute__((packed))
+
+#define __noreturn __attribute__((noreturn))
+
+#define __aligned(x) __attribute__((aligned(x)))
+#define __printf(a, b) __attribute__((format(printf, a, b)))
+#define __scanf(a, b) __attribute__((format(scanf, a, b)))
diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h
index 23299d7e7160..d7a5604c38d7 100644
--- a/tools/include/linux/compiler.h
+++ b/tools/include/linux/compiler.h
@@ -17,6 +17,10 @@
# define __always_inline inline __attribute__((always_inline))
#endif
+#ifndef noinline
+#define noinline
+#endif
+
/* Are two types/vars the same type (ignoring qualifiers)? */
#ifndef __same_type
# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
@@ -45,6 +49,10 @@
# define __maybe_unused __attribute__((unused))
#endif
+#ifndef __used
+# define __used __attribute__((__unused__))
+#endif
+
#ifndef __packed
# define __packed __attribute__((__packed__))
#endif
@@ -65,6 +73,14 @@
# define unlikely(x) __builtin_expect(!!(x), 0)
#endif
+#ifndef __init
+# define __init
+#endif
+
+#ifndef noinline
+# define noinline
+#endif
+
#define uninitialized_var(x) x = *(&(x))
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
diff --git a/tools/lib/lockdep/uinclude/linux/debug_locks.h b/tools/include/linux/debug_locks.h
index f38eb64df794..61cc7f501168 100644
--- a/tools/lib/lockdep/uinclude/linux/debug_locks.h
+++ b/tools/include/linux/debug_locks.h
@@ -3,8 +3,9 @@
#include <stddef.h>
#include <linux/compiler.h>
+#include <asm/bug.h>
-#define DEBUG_LOCKS_WARN_ON(x) (x)
+#define DEBUG_LOCKS_WARN_ON(x) WARN_ON(x)
extern bool debug_locks;
extern bool debug_locks_silent;
diff --git a/tools/include/linux/delay.h b/tools/include/linux/delay.h
new file mode 100644
index 000000000000..55aa4173af1f
--- /dev/null
+++ b/tools/include/linux/delay.h
@@ -0,0 +1,4 @@
+#ifndef _TOOLS_INCLUDE_LINUX_DELAY_H
+#define _TOOLS_INCLUDE_LINUX_DELAY_H
+
+#endif /* _TOOLS_INCLUDE_LINUX_DELAY_H */
diff --git a/tools/include/linux/err.h b/tools/include/linux/err.h
index bdc3dd8131d4..abf0478a8fb2 100644
--- a/tools/include/linux/err.h
+++ b/tools/include/linux/err.h
@@ -46,4 +46,9 @@ static inline bool __must_check IS_ERR(__force const void *ptr)
return IS_ERR_VALUE((unsigned long)ptr);
}
+static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr)
+{
+ return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr);
+}
+
#endif /* _LINUX_ERR_H */
diff --git a/tools/include/linux/ftrace.h b/tools/include/linux/ftrace.h
new file mode 100644
index 000000000000..949f541ce11e
--- /dev/null
+++ b/tools/include/linux/ftrace.h
@@ -0,0 +1,4 @@
+#ifndef _TOOLS_INCLUDE_LINUX_FTRACE_H
+#define _TOOLS_INCLUDE_LINUX_FTRACE_H
+
+#endif /* _TOOLS_INCLUDE_LINUX_FTRACE_H */
diff --git a/tools/include/linux/gfp.h b/tools/include/linux/gfp.h
new file mode 100644
index 000000000000..22030756fbc0
--- /dev/null
+++ b/tools/include/linux/gfp.h
@@ -0,0 +1,4 @@
+#ifndef _TOOLS_INCLUDE_LINUX_GFP_H
+#define _TOOLS_INCLUDE_LINUX_GFP_H
+
+#endif /* _TOOLS_INCLUDE_LINUX_GFP_H */
diff --git a/tools/lib/lockdep/uinclude/linux/hardirq.h b/tools/include/linux/hardirq.h
index c8f3f8f58729..c8f3f8f58729 100644
--- a/tools/lib/lockdep/uinclude/linux/hardirq.h
+++ b/tools/include/linux/hardirq.h
diff --git a/tools/include/linux/interrupt.h b/tools/include/linux/interrupt.h
new file mode 100644
index 000000000000..6be25bbdca9e
--- /dev/null
+++ b/tools/include/linux/interrupt.h
@@ -0,0 +1,4 @@
+#ifndef _TOOLS_INCLUDE_LINUX_INTERRUPT_H
+#define _TOOLS_INCLUDE_LINUX_INTERRUPT_H
+
+#endif /* _TOOLS_INCLUDE_LINUX_INTERRUPT_H */
diff --git a/tools/lib/lockdep/uinclude/linux/irqflags.h b/tools/include/linux/irqflags.h
index 6cc296f0fad0..df77669cfe1c 100644
--- a/tools/lib/lockdep/uinclude/linux/irqflags.h
+++ b/tools/include/linux/irqflags.h
@@ -17,19 +17,19 @@
#define raw_local_irq_disable() do { } while (0)
#define raw_local_irq_enable() do { } while (0)
#define raw_local_irq_save(flags) ((flags) = 0)
-#define raw_local_irq_restore(flags) do { } while (0)
+#define raw_local_irq_restore(flags) ((void)(flags))
#define raw_local_save_flags(flags) ((flags) = 0)
-#define raw_irqs_disabled_flags(flags) do { } while (0)
+#define raw_irqs_disabled_flags(flags) ((void)(flags))
#define raw_irqs_disabled() 0
#define raw_safe_halt()
#define local_irq_enable() do { } while (0)
#define local_irq_disable() do { } while (0)
#define local_irq_save(flags) ((flags) = 0)
-#define local_irq_restore(flags) do { } while (0)
+#define local_irq_restore(flags) ((void)(flags))
#define local_save_flags(flags) ((flags) = 0)
#define irqs_disabled() (1)
-#define irqs_disabled_flags(flags) (0)
+#define irqs_disabled_flags(flags) ((void)(flags), 0)
#define safe_halt() do { } while (0)
#define trace_lock_release(x, y)
diff --git a/tools/include/linux/jhash.h b/tools/include/linux/jhash.h
new file mode 100644
index 000000000000..348c6f47e4cc
--- /dev/null
+++ b/tools/include/linux/jhash.h
@@ -0,0 +1,175 @@
+#ifndef _LINUX_JHASH_H
+#define _LINUX_JHASH_H
+
+/* jhash.h: Jenkins hash support.
+ *
+ * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net)
+ *
+ * http://burtleburtle.net/bob/hash/
+ *
+ * These are the credits from Bob's sources:
+ *
+ * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+ *
+ * These are functions for producing 32-bit hashes for hash table lookup.
+ * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
+ * are externally useful functions. Routines to test the hash are included
+ * if SELF_TEST is defined. You can use this free for any purpose. It's in
+ * the public domain. It has no warranty.
+ *
+ * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu)
+ *
+ * I've modified Bob's hash to be useful in the Linux kernel, and
+ * any bugs present are my fault.
+ * Jozsef
+ */
+#include <linux/bitops.h>
+#include <linux/unaligned/packed_struct.h>
+
+/* Best hash sizes are of power of two */
+#define jhash_size(n) ((u32)1<<(n))
+/* Mask the hash value, i.e (value & jhash_mask(n)) instead of (value % n) */
+#define jhash_mask(n) (jhash_size(n)-1)
+
+/* __jhash_mix -- mix 3 32-bit values reversibly. */
+#define __jhash_mix(a, b, c) \
+{ \
+ a -= c; a ^= rol32(c, 4); c += b; \
+ b -= a; b ^= rol32(a, 6); a += c; \
+ c -= b; c ^= rol32(b, 8); b += a; \
+ a -= c; a ^= rol32(c, 16); c += b; \
+ b -= a; b ^= rol32(a, 19); a += c; \
+ c -= b; c ^= rol32(b, 4); b += a; \
+}
+
+/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */
+#define __jhash_final(a, b, c) \
+{ \
+ c ^= b; c -= rol32(b, 14); \
+ a ^= c; a -= rol32(c, 11); \
+ b ^= a; b -= rol32(a, 25); \
+ c ^= b; c -= rol32(b, 16); \
+ a ^= c; a -= rol32(c, 4); \
+ b ^= a; b -= rol32(a, 14); \
+ c ^= b; c -= rol32(b, 24); \
+}
+
+/* An arbitrary initial parameter */
+#define JHASH_INITVAL 0xdeadbeef
+
+/* jhash - hash an arbitrary key
+ * @k: sequence of bytes as key
+ * @length: the length of the key
+ * @initval: the previous hash, or an arbitrary value
+ *
+ * The generic version, hashes an arbitrary sequence of bytes.
+ * No alignment or length assumptions are made about the input key.
+ *
+ * Returns the hash value of the key. The result depends on endianness.
+ */
+static inline u32 jhash(const void *key, u32 length, u32 initval)
+{
+ u32 a, b, c;
+ const u8 *k = key;
+
+ /* Set up the internal state */
+ a = b = c = JHASH_INITVAL + length + initval;
+
+ /* All but the last block: affect some 32 bits of (a,b,c) */
+ while (length > 12) {
+ a += __get_unaligned_cpu32(k);
+ b += __get_unaligned_cpu32(k + 4);
+ c += __get_unaligned_cpu32(k + 8);
+ __jhash_mix(a, b, c);
+ length -= 12;
+ k += 12;
+ }
+ /* Last block: affect all 32 bits of (c) */
+ /* All the case statements fall through */
+ switch (length) {
+ case 12: c += (u32)k[11]<<24;
+ case 11: c += (u32)k[10]<<16;
+ case 10: c += (u32)k[9]<<8;
+ case 9: c += k[8];
+ case 8: b += (u32)k[7]<<24;
+ case 7: b += (u32)k[6]<<16;
+ case 6: b += (u32)k[5]<<8;
+ case 5: b += k[4];
+ case 4: a += (u32)k[3]<<24;
+ case 3: a += (u32)k[2]<<16;
+ case 2: a += (u32)k[1]<<8;
+ case 1: a += k[0];
+ __jhash_final(a, b, c);
+ case 0: /* Nothing left to add */
+ break;
+ }
+
+ return c;
+}
+
+/* jhash2 - hash an array of u32's
+ * @k: the key which must be an array of u32's
+ * @length: the number of u32's in the key
+ * @initval: the previous hash, or an arbitrary value
+ *
+ * Returns the hash value of the key.
+ */
+static inline u32 jhash2(const u32 *k, u32 length, u32 initval)
+{
+ u32 a, b, c;
+
+ /* Set up the internal state */
+ a = b = c = JHASH_INITVAL + (length<<2) + initval;
+
+ /* Handle most of the key */
+ while (length > 3) {
+ a += k[0];
+ b += k[1];
+ c += k[2];
+ __jhash_mix(a, b, c);
+ length -= 3;
+ k += 3;
+ }
+
+ /* Handle the last 3 u32's: all the case statements fall through */
+ switch (length) {
+ case 3: c += k[2];
+ case 2: b += k[1];
+ case 1: a += k[0];
+ __jhash_final(a, b, c);
+ case 0: /* Nothing left to add */
+ break;
+ }
+
+ return c;
+}
+
+
+/* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */
+static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
+{
+ a += initval;
+ b += initval;
+ c += initval;
+
+ __jhash_final(a, b, c);
+
+ return c;
+}
+
+static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval)
+{
+ return __jhash_nwords(a, b, c, initval + JHASH_INITVAL + (3 << 2));
+}
+
+static inline u32 jhash_2words(u32 a, u32 b, u32 initval)
+{
+ return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2));
+}
+
+static inline u32 jhash_1word(u32 a, u32 initval)
+{
+ return __jhash_nwords(a, 0, 0, initval + JHASH_INITVAL + (1 << 2));
+}
+
+#endif /* _LINUX_JHASH_H */
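A short usage sketch for the jhash helpers imported above (flow_key and the seed are illustrative): jhash() makes no alignment assumptions about the key, and jhash_size()/jhash_mask() are meant for power-of-two tables.

struct flow_key {
	u32 saddr, daddr;
	u16 sport, dport;
};

static inline u32 flow_hash(const struct flow_key *key, u32 seed,
			    unsigned int table_bits)
{
	u32 h = jhash(key, sizeof(*key), seed);

	/* (h & jhash_mask(n)) replaces the more expensive (h % table_size). */
	return h & jhash_mask(table_bits);
}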
diff --git a/tools/lib/lockdep/uinclude/linux/kallsyms.h b/tools/include/linux/kallsyms.h
index b0f2dbdf1a15..582cc1e5f3a4 100644
--- a/tools/lib/lockdep/uinclude/linux/kallsyms.h
+++ b/tools/include/linux/kallsyms.h
@@ -3,6 +3,7 @@
#include <linux/kernel.h>
#include <stdio.h>
+#include <unistd.h>
#define KSYM_NAME_LEN 128
@@ -24,7 +25,7 @@ static inline void print_ip_sym(unsigned long ip)
name = backtrace_symbols((void **)&ip, 1);
- printf("%s\n", *name);
+ dprintf(STDOUT_FILENO, "%s\n", *name);
free(name);
}
diff --git a/tools/lib/lockdep/uinclude/linux/kern_levels.h b/tools/include/linux/kern_levels.h
index 3b9bade28698..3b9bade28698 100644
--- a/tools/lib/lockdep/uinclude/linux/kern_levels.h
+++ b/tools/include/linux/kern_levels.h
diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h
index 73ccc48126bb..77d2e94ca5df 100644
--- a/tools/include/linux/kernel.h
+++ b/tools/include/linux/kernel.h
@@ -5,6 +5,8 @@
#include <stddef.h>
#include <assert.h>
#include <linux/compiler.h>
+#include <endian.h>
+#include <byteswap.h>
#ifndef UINT_MAX
#define UINT_MAX (~0U)
@@ -32,6 +34,7 @@
(type *)((char *)__mptr - offsetof(type, member)); })
#endif
+#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
#ifndef max
@@ -67,12 +70,33 @@
#endif
#endif
-/*
- * Both need more care to handle endianness
- * (Don't use bitmap_copy_le() for now)
- */
-#define cpu_to_le64(x) (x)
-#define cpu_to_le32(x) (x)
+#if __BYTE_ORDER == __BIG_ENDIAN
+#define cpu_to_le16 bswap_16
+#define cpu_to_le32 bswap_32
+#define cpu_to_le64 bswap_64
+#define le16_to_cpu bswap_16
+#define le32_to_cpu bswap_32
+#define le64_to_cpu bswap_64
+#define cpu_to_be16
+#define cpu_to_be32
+#define cpu_to_be64
+#define be16_to_cpu
+#define be32_to_cpu
+#define be64_to_cpu
+#else
+#define cpu_to_le16
+#define cpu_to_le32
+#define cpu_to_le64
+#define le16_to_cpu
+#define le32_to_cpu
+#define le64_to_cpu
+#define cpu_to_be16 bswap_16
+#define cpu_to_be32 bswap_32
+#define cpu_to_be64 bswap_64
+#define be16_to_cpu bswap_16
+#define be32_to_cpu bswap_32
+#define be64_to_cpu bswap_64
+#endif
int vscnprintf(char *buf, size_t size, const char *fmt, va_list args);
int scnprintf(char * buf, size_t size, const char * fmt, ...);
@@ -89,4 +113,7 @@ int scnprintf(char * buf, size_t size, const char * fmt, ...);
#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
#define round_down(x, y) ((x) & ~__round_mask(x, y))
+#define current_gfp_context(k) 0
+#define synchronize_sched()
+
#endif
diff --git a/tools/lib/lockdep/uinclude/linux/kmemcheck.h b/tools/include/linux/kmemcheck.h
index 94d598bc6abe..94d598bc6abe 100644
--- a/tools/lib/lockdep/uinclude/linux/kmemcheck.h
+++ b/tools/include/linux/kmemcheck.h
diff --git a/tools/include/linux/linkage.h b/tools/include/linux/linkage.h
new file mode 100644
index 000000000000..bc763d500262
--- /dev/null
+++ b/tools/include/linux/linkage.h
@@ -0,0 +1,4 @@
+#ifndef _TOOLS_INCLUDE_LINUX_LINKAGE_H
+#define _TOOLS_INCLUDE_LINUX_LINKAGE_H
+
+#endif /* _TOOLS_INCLUDE_LINUX_LINKAGE_H */
diff --git a/tools/lib/lockdep/uinclude/linux/lockdep.h b/tools/include/linux/lockdep.h
index c808c7d02d21..8da3e8effafa 100644
--- a/tools/lib/lockdep/uinclude/linux/lockdep.h
+++ b/tools/include/linux/lockdep.h
@@ -7,8 +7,15 @@
#include <limits.h>
#include <linux/utsname.h>
#include <linux/compiler.h>
+#include <linux/export.h>
+#include <linux/kern_levels.h>
+#include <linux/err.h>
+#include <linux/rcu.h>
+#include <linux/list.h>
+#include <linux/hardirq.h>
+#include <unistd.h>
-#define MAX_LOCK_DEPTH 2000UL
+#define MAX_LOCK_DEPTH 63UL
#define asmlinkage
#define __visible
@@ -29,31 +36,32 @@ extern struct task_struct *__curr(void);
#define current (__curr())
-#define debug_locks_off() 1
+static inline int debug_locks_off(void)
+{
+ return 1;
+}
+
#define task_pid_nr(tsk) ((tsk)->pid)
#define KSYM_NAME_LEN 128
-#define printk printf
+#define printk(...) dprintf(STDOUT_FILENO, __VA_ARGS__)
+#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
+#define pr_warn pr_err
#define list_del_rcu list_del
#define atomic_t unsigned long
#define atomic_inc(x) ((*(x))++)
-static struct new_utsname *init_utsname(void)
-{
- static struct new_utsname n = (struct new_utsname) {
- .release = "liblockdep",
- .version = LIBLOCKDEP_VERSION,
- };
-
- return &n;
-}
-
#define print_tainted() ""
#define static_obj(x) 1
#define debug_show_all_locks()
extern void debug_check_no_locks_held(void);
+static __used bool __is_kernel_percpu_address(unsigned long addr, void *can_addr)
+{
+ return false;
+}
+
#endif
diff --git a/tools/lib/lockdep/uinclude/linux/module.h b/tools/include/linux/module.h
index 09c7a7be8ccc..07055db296f3 100644
--- a/tools/lib/lockdep/uinclude/linux/module.h
+++ b/tools/include/linux/module.h
@@ -3,4 +3,9 @@
#define module_param(name, type, perm)
+static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+{
+ return false;
+}
+
#endif
diff --git a/tools/include/linux/mutex.h b/tools/include/linux/mutex.h
new file mode 100644
index 000000000000..a8180d25f2fc
--- /dev/null
+++ b/tools/include/linux/mutex.h
@@ -0,0 +1,4 @@
+#ifndef _TOOLS_INCLUDE_LINUX_MUTEX_H
+#define _TOOLS_INCLUDE_LINUX_MUTEX_H
+
+#endif /* _TOOLS_INCLUDE_LINUX_MUTEX_H */
diff --git a/tools/include/linux/proc_fs.h b/tools/include/linux/proc_fs.h
new file mode 100644
index 000000000000..8b3b03b64fda
--- /dev/null
+++ b/tools/include/linux/proc_fs.h
@@ -0,0 +1,4 @@
+#ifndef _TOOLS_INCLUDE_LINUX_PROC_FS_H
+#define _TOOLS_INCLUDE_LINUX_PROC_FS_H
+
+#endif /* _TOOLS_INCLUDE_LINUX_PROC_FS_H */
diff --git a/tools/lib/lockdep/uinclude/linux/rcu.h b/tools/include/linux/rcu.h
index 042ee8e463c9..5080649dad04 100644
--- a/tools/lib/lockdep/uinclude/linux/rcu.h
+++ b/tools/include/linux/rcu.h
@@ -18,4 +18,7 @@ static inline bool rcu_is_watching(void)
return false;
}
+#define rcu_assign_pointer(p, v) ((p) = (v))
+#define RCU_INIT_POINTER(p, v) p=(v)
+
#endif
diff --git a/tools/include/linux/sched/clock.h b/tools/include/linux/sched/clock.h
new file mode 100644
index 000000000000..5837d17c4182
--- /dev/null
+++ b/tools/include/linux/sched/clock.h
@@ -0,0 +1,4 @@
+#ifndef _TOOLS_PERF_LINUX_SCHED_CLOCK_H
+#define _TOOLS_PERF_LINUX_SCHED_CLOCK_H
+
+#endif /* _TOOLS_PERF_LINUX_SCHED_CLOCK_H */
diff --git a/tools/include/linux/sched/mm.h b/tools/include/linux/sched/mm.h
new file mode 100644
index 000000000000..c8d9f19c1f35
--- /dev/null
+++ b/tools/include/linux/sched/mm.h
@@ -0,0 +1,4 @@
+#ifndef _TOOLS_PERF_LINUX_SCHED_MM_H
+#define _TOOLS_PERF_LINUX_SCHED_MM_H
+
+#endif /* _TOOLS_PERF_LINUX_SCHED_MM_H */
diff --git a/tools/include/linux/sched/task.h b/tools/include/linux/sched/task.h
new file mode 100644
index 000000000000..a97890eca110
--- /dev/null
+++ b/tools/include/linux/sched/task.h
@@ -0,0 +1,4 @@
+#ifndef _TOOLS_PERF_LINUX_SCHED_TASK_H
+#define _TOOLS_PERF_LINUX_SCHED_TASK_H
+
+#endif /* _TOOLS_PERF_LINUX_SCHED_TASK_H */
diff --git a/tools/include/linux/seq_file.h b/tools/include/linux/seq_file.h
new file mode 100644
index 000000000000..102fd9217f1f
--- /dev/null
+++ b/tools/include/linux/seq_file.h
@@ -0,0 +1,4 @@
+#ifndef _TOOLS_INCLUDE_LINUX_SEQ_FILE_H
+#define _TOOLS_INCLUDE_LINUX_SEQ_FILE_H
+
+#endif /* _TOOLS_INCLUDE_LINUX_SEQ_FILE_H */
diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h
index 58397dcb19d6..417cda4f793f 100644
--- a/tools/include/linux/spinlock.h
+++ b/tools/include/linux/spinlock.h
@@ -1,5 +1,31 @@
+#ifndef __LINUX_SPINLOCK_H_
+#define __LINUX_SPINLOCK_H_
+
+#include <pthread.h>
+#include <stdbool.h>
+
#define spinlock_t pthread_mutex_t
#define DEFINE_SPINLOCK(x) pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER;
#define spin_lock_irqsave(x, f) (void)f, pthread_mutex_lock(x)
#define spin_unlock_irqrestore(x, f) (void)f, pthread_mutex_unlock(x)
+
+#define arch_spinlock_t pthread_mutex_t
+#define __ARCH_SPIN_LOCK_UNLOCKED PTHREAD_MUTEX_INITIALIZER
+
+static inline void arch_spin_lock(arch_spinlock_t *mutex)
+{
+ pthread_mutex_lock(mutex);
+}
+
+static inline void arch_spin_unlock(arch_spinlock_t *mutex)
+{
+ pthread_mutex_unlock(mutex);
+}
+
+static inline bool arch_spin_is_locked(arch_spinlock_t *mutex)
+{
+ return true;
+}
+
+#endif
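The userspace spinlock shim above maps both spinlock_t and arch_spinlock_t onto pthread mutexes; arch_spin_is_locked() is a stub that simply claims the lock is held, which appears to be sufficient for the way the lockdep core consumes it. A minimal sketch of the wrappers in use (names are illustrative):

	static arch_spinlock_t demo_lock = __ARCH_SPIN_LOCK_UNLOCKED;

	static void demo(void)
	{
		arch_spin_lock(&demo_lock);
		/* critical section */
		arch_spin_unlock(&demo_lock);
	}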
diff --git a/tools/lib/lockdep/uinclude/linux/stacktrace.h b/tools/include/linux/stacktrace.h
index 39aecc6b19d1..39aecc6b19d1 100644
--- a/tools/lib/lockdep/uinclude/linux/stacktrace.h
+++ b/tools/include/linux/stacktrace.h
diff --git a/tools/include/linux/unaligned/packed_struct.h b/tools/include/linux/unaligned/packed_struct.h
new file mode 100644
index 000000000000..c0d817de4df2
--- /dev/null
+++ b/tools/include/linux/unaligned/packed_struct.h
@@ -0,0 +1,46 @@
+#ifndef _LINUX_UNALIGNED_PACKED_STRUCT_H
+#define _LINUX_UNALIGNED_PACKED_STRUCT_H
+
+#include <linux/kernel.h>
+
+struct __una_u16 { u16 x; } __packed;
+struct __una_u32 { u32 x; } __packed;
+struct __una_u64 { u64 x; } __packed;
+
+static inline u16 __get_unaligned_cpu16(const void *p)
+{
+ const struct __una_u16 *ptr = (const struct __una_u16 *)p;
+ return ptr->x;
+}
+
+static inline u32 __get_unaligned_cpu32(const void *p)
+{
+ const struct __una_u32 *ptr = (const struct __una_u32 *)p;
+ return ptr->x;
+}
+
+static inline u64 __get_unaligned_cpu64(const void *p)
+{
+ const struct __una_u64 *ptr = (const struct __una_u64 *)p;
+ return ptr->x;
+}
+
+static inline void __put_unaligned_cpu16(u16 val, void *p)
+{
+ struct __una_u16 *ptr = (struct __una_u16 *)p;
+ ptr->x = val;
+}
+
+static inline void __put_unaligned_cpu32(u32 val, void *p)
+{
+ struct __una_u32 *ptr = (struct __una_u32 *)p;
+ ptr->x = val;
+}
+
+static inline void __put_unaligned_cpu64(u64 val, void *p)
+{
+ struct __una_u64 *ptr = (struct __una_u64 *)p;
+ ptr->x = val;
+}
+
+#endif /* _LINUX_UNALIGNED_PACKED_STRUCT_H */
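The helpers in this new header go through single-member __packed structs so the compiler generates loads and stores that are safe even when the pointer is not naturally aligned; the values are in CPU byte order (hence the _cpu suffix). A hedged usage sketch (the buffer layout is illustrative):

	/* Illustrative only: read a u32 length field at an arbitrary byte offset. */
	static u32 read_record_len(const unsigned char *buf, size_t off)
	{
		return __get_unaligned_cpu32(buf + off);
	}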
diff --git a/tools/include/trace/events/lock.h b/tools/include/trace/events/lock.h
new file mode 100644
index 000000000000..5b15fd5ee1af
--- /dev/null
+++ b/tools/include/trace/events/lock.h
@@ -0,0 +1,4 @@
+#ifndef _TOOLS_INCLUDE_TRACE_EVENTS_LOCK_H
+#define _TOOLS_INCLUDE_TRACE_EVENTS_LOCK_H
+
+#endif /* _TOOLS_INCLUDE_TRACE_EVENTS_LOCK_H */
diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c
index 809c7721cd24..a7ecf8f469f4 100644
--- a/tools/lib/api/fs/fs.c
+++ b/tools/lib/api/fs/fs.c
@@ -387,6 +387,22 @@ int filename__read_str(const char *filename, char **buf, size_t *sizep)
return err;
}
+int filename__write_int(const char *filename, int value)
+{
+ int fd = open(filename, O_WRONLY), err = -1;
+ char buf[64];
+
+ if (fd < 0)
+ return err;
+
+ sprintf(buf, "%d", value);
+ if (write(fd, buf, sizeof(buf)) == sizeof(buf))
+ err = 0;
+
+ close(fd);
+ return err;
+}
+
int procfs__read_str(const char *entry, char **buf, size_t *sizep)
{
char path[PATH_MAX];
@@ -480,3 +496,17 @@ int sysctl__read_int(const char *sysctl, int *value)
return filename__read_int(path, value);
}
+
+int sysfs__write_int(const char *entry, int value)
+{
+ char path[PATH_MAX];
+ const char *sysfs = sysfs__mountpoint();
+
+ if (!sysfs)
+ return -1;
+
+ if (snprintf(path, sizeof(path), "%s/%s", sysfs, entry) >= PATH_MAX)
+ return -1;
+
+ return filename__write_int(path, value);
+}
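sysfs__write_int() above mirrors the existing read helpers: it resolves the sysfs mountpoint, builds the full path and hands the formatted value to filename__write_int(), returning 0 on success and -1 on failure. A hedged usage sketch (the sysfs entry is only an example, not part of the patch):

	/* Illustrative only: turn a sysfs knob from tools code. */
	if (sysfs__write_int("devices/system/cpu/intel_pstate/no_turbo", 1) < 0)
		fprintf(stderr, "could not write sysfs entry\n");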
diff --git a/tools/lib/api/fs/fs.h b/tools/lib/api/fs/fs.h
index 956c21127d1e..45605348461e 100644
--- a/tools/lib/api/fs/fs.h
+++ b/tools/lib/api/fs/fs.h
@@ -31,6 +31,8 @@ int filename__read_int(const char *filename, int *value);
int filename__read_ull(const char *filename, unsigned long long *value);
int filename__read_str(const char *filename, char **buf, size_t *sizep);
+int filename__write_int(const char *filename, int value);
+
int procfs__read_str(const char *entry, char **buf, size_t *sizep);
int sysctl__read_int(const char *sysctl, int *value);
@@ -38,4 +40,6 @@ int sysfs__read_int(const char *entry, int *value);
int sysfs__read_ull(const char *entry, unsigned long long *value);
int sysfs__read_str(const char *entry, char **buf, size_t *sizep);
int sysfs__read_bool(const char *entry, bool *value);
+
+int sysfs__write_int(const char *entry, int value);
#endif /* __API_FS__ */
diff --git a/tools/lib/lockdep/Makefile b/tools/lib/lockdep/Makefile
index 3bc0ef9f8923..ed9ace59d112 100644
--- a/tools/lib/lockdep/Makefile
+++ b/tools/lib/lockdep/Makefile
@@ -79,6 +79,7 @@ INCLUDES = -I. -I./uinclude -I./include -I../../include $(CONFIG_INCLUDES)
# Set compile option CFLAGS if not set elsewhere
CFLAGS ?= -g -DCONFIG_LOCKDEP -DCONFIG_STACKTRACE -DCONFIG_PROVE_LOCKING -DBITS_PER_LONG=__WORDSIZE -DLIBLOCKDEP_VERSION='"$(LIBLOCKDEP_VERSION)"' -rdynamic -O0 -g
CFLAGS += -fPIC
+CFLAGS += -Wall
override CFLAGS += $(CONFIG_FLAGS) $(INCLUDES) $(PLUGIN_DIR_SQ)
@@ -100,7 +101,7 @@ include $(srctree)/tools/build/Makefile.include
do_compile_shared_library = \
($(print_shared_lib_compile) \
- $(CC) --shared $^ -o $@ -lpthread -ldl -Wl,-soname='"$@"';$(shell ln -sf $@ liblockdep.so))
+ $(CC) $(LDFLAGS) --shared $^ -o $@ -lpthread -ldl -Wl,-soname='$(@F)';$(shell ln -sf $(@F) $(@D)/liblockdep.so))
do_build_static_lib = \
($(print_static_lib_build) \
@@ -118,10 +119,10 @@ all_cmd: $(CMD_TARGETS)
$(LIB_IN): force
$(Q)$(MAKE) $(build)=liblockdep
-liblockdep.so.$(LIBLOCKDEP_VERSION): $(LIB_IN)
+$(OUTPUT)liblockdep.so.$(LIBLOCKDEP_VERSION): $(LIB_IN)
$(Q)$(do_compile_shared_library)
-liblockdep.a: $(LIB_IN)
+$(OUTPUT)liblockdep.a: $(LIB_IN)
$(Q)$(do_build_static_lib)
tags: force
@@ -149,7 +150,7 @@ install_lib: all_cmd
install: install_lib
clean:
- $(RM) *.o *~ $(TARGETS) *.a *liblockdep*.so* $(VERSION_FILES) .*.d .*.cmd
+ $(RM) $(OUTPUT)*.o *~ $(TARGETS) $(OUTPUT)*.a $(OUTPUT)*liblockdep*.so* $(VERSION_FILES) $(OUTPUT).*.d $(OUTPUT).*.cmd
$(RM) tags TAGS
PHONY += force
diff --git a/tools/lib/lockdep/lockdep.c b/tools/lib/lockdep/lockdep.c
index a0a2e3a266af..ced6d7443cea 100644
--- a/tools/lib/lockdep/lockdep.c
+++ b/tools/lib/lockdep/lockdep.c
@@ -1,8 +1,27 @@
#include <linux/lockdep.h>
+#include <stdlib.h>
/* Trivial API wrappers, we don't (yet) have RCU in user-space: */
#define hlist_for_each_entry_rcu hlist_for_each_entry
#define hlist_add_head_rcu hlist_add_head
#define hlist_del_rcu hlist_del
+#define list_for_each_entry_rcu list_for_each_entry
+#define list_add_tail_rcu list_add_tail
+
+u32 prandom_u32(void)
+{
+ /* Used only by lock_pin_lock() which is dead code */
+ abort();
+}
+
+static struct new_utsname *init_utsname(void)
+{
+ static struct new_utsname n = (struct new_utsname) {
+ .release = "liblockdep",
+ .version = LIBLOCKDEP_VERSION,
+ };
+
+ return &n;
+}
#include "../../../kernel/locking/lockdep.c"
diff --git a/tools/lib/lockdep/preload.c b/tools/lib/lockdep/preload.c
index 52844847569c..6a2d3c5d4e92 100644
--- a/tools/lib/lockdep/preload.c
+++ b/tools/lib/lockdep/preload.c
@@ -4,6 +4,7 @@
#include <dlfcn.h>
#include <stdlib.h>
#include <sysexits.h>
+#include <unistd.h>
#include "include/liblockdep/mutex.h"
#include "../../include/linux/rbtree.h"
@@ -122,8 +123,6 @@ static struct rb_node **__get_lock_node(void *lock, struct rb_node **parent)
#define LIBLOCKDEP_STATIC_ENTRIES 1024
#endif
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
-
static struct lock_lookup __locks[LIBLOCKDEP_STATIC_ENTRIES];
static int __locks_nr;
@@ -149,7 +148,7 @@ static struct lock_lookup *alloc_lock(void)
int idx = __locks_nr++;
if (idx >= ARRAY_SIZE(__locks)) {
- fprintf(stderr,
+ dprintf(STDERR_FILENO,
"LOCKDEP error: insufficient LIBLOCKDEP_STATIC_ENTRIES\n");
exit(EX_UNAVAILABLE);
}
diff --git a/tools/lib/lockdep/rbtree.c b/tools/lib/lockdep/rbtree.c
index f7f43033c8b7..297c304571f8 100644
--- a/tools/lib/lockdep/rbtree.c
+++ b/tools/lib/lockdep/rbtree.c
@@ -1 +1 @@
-#include "../../../lib/rbtree.c"
+#include "../../lib/rbtree.c"
diff --git a/tools/lib/lockdep/run_tests.sh b/tools/lib/lockdep/run_tests.sh
index 1069d96248c1..f9b94098fc98 100755
--- a/tools/lib/lockdep/run_tests.sh
+++ b/tools/lib/lockdep/run_tests.sh
@@ -4,9 +4,9 @@ make &> /dev/null
for i in `ls tests/*.c`; do
testname=$(basename "$i" .c)
- gcc -o tests/$testname -pthread -lpthread $i liblockdep.a -Iinclude -D__USE_LIBLOCKDEP &> /dev/null
+ gcc -o tests/$testname -pthread $i liblockdep.a -Iinclude -D__USE_LIBLOCKDEP &> /dev/null
echo -ne "$testname... "
- if [ $(timeout 1 ./tests/$testname | wc -l) -gt 0 ]; then
+ if [ $(timeout 1 ./tests/$testname 2>&1 | wc -l) -gt 0 ]; then
echo "PASSED!"
else
echo "FAILED!"
@@ -18,9 +18,9 @@ done
for i in `ls tests/*.c`; do
testname=$(basename "$i" .c)
- gcc -o tests/$testname -pthread -lpthread -Iinclude $i &> /dev/null
+ gcc -o tests/$testname -pthread -Iinclude $i &> /dev/null
echo -ne "(PRELOAD) $testname... "
- if [ $(timeout 1 ./lockdep ./tests/$testname | wc -l) -gt 0 ]; then
+ if [ $(timeout 1 ./lockdep ./tests/$testname 2>&1 | wc -l) -gt 0 ]; then
echo "PASSED!"
else
echo "FAILED!"
diff --git a/tools/lib/lockdep/uinclude/asm/hash.h b/tools/lib/lockdep/uinclude/asm/hash.h
deleted file mode 100644
index d82b170bb216..000000000000
--- a/tools/lib/lockdep/uinclude/asm/hash.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __ASM_GENERIC_HASH_H
-#define __ASM_GENERIC_HASH_H
-
-/* Stub */
-
-#endif /* __ASM_GENERIC_HASH_H */
diff --git a/tools/lib/lockdep/uinclude/asm/hweight.h b/tools/lib/lockdep/uinclude/asm/hweight.h
deleted file mode 100644
index fab00ff936d1..000000000000
--- a/tools/lib/lockdep/uinclude/asm/hweight.h
+++ /dev/null
@@ -1,3 +0,0 @@
-
-/* empty file */
-
diff --git a/tools/lib/lockdep/uinclude/asm/sections.h b/tools/lib/lockdep/uinclude/asm/sections.h
deleted file mode 100644
index fab00ff936d1..000000000000
--- a/tools/lib/lockdep/uinclude/asm/sections.h
+++ /dev/null
@@ -1,3 +0,0 @@
-
-/* empty file */
-
diff --git a/tools/lib/lockdep/uinclude/linux/bitops.h b/tools/lib/lockdep/uinclude/linux/bitops.h
deleted file mode 100644
index fab00ff936d1..000000000000
--- a/tools/lib/lockdep/uinclude/linux/bitops.h
+++ /dev/null
@@ -1,3 +0,0 @@
-
-/* empty file */
-
diff --git a/tools/lib/lockdep/uinclude/linux/compiler.h b/tools/lib/lockdep/uinclude/linux/compiler.h
deleted file mode 100644
index fd3e56a83fc2..000000000000
--- a/tools/lib/lockdep/uinclude/linux/compiler.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef _LIBLOCKDEP_LINUX_COMPILER_H_
-#define _LIBLOCKDEP_LINUX_COMPILER_H_
-
-#define __used __attribute__((__unused__))
-#define unlikely
-#define READ_ONCE(x) (x)
-#define WRITE_ONCE(x, val) x=(val)
-#define RCU_INIT_POINTER(p, v) p=(v)
-
-#endif
diff --git a/tools/lib/lockdep/uinclude/linux/delay.h b/tools/lib/lockdep/uinclude/linux/delay.h
deleted file mode 100644
index fab00ff936d1..000000000000
--- a/tools/lib/lockdep/uinclude/linux/delay.h
+++ /dev/null
@@ -1,3 +0,0 @@
-
-/* empty file */
-
diff --git a/tools/lib/lockdep/uinclude/linux/ftrace.h b/tools/lib/lockdep/uinclude/linux/ftrace.h
deleted file mode 100644
index fab00ff936d1..000000000000
--- a/tools/lib/lockdep/uinclude/linux/ftrace.h
+++ /dev/null
@@ -1,3 +0,0 @@
-
-/* empty file */
-
diff --git a/tools/lib/lockdep/uinclude/linux/gfp.h b/tools/lib/lockdep/uinclude/linux/gfp.h
deleted file mode 100644
index fab00ff936d1..000000000000
--- a/tools/lib/lockdep/uinclude/linux/gfp.h
+++ /dev/null
@@ -1,3 +0,0 @@
-
-/* empty file */
-
diff --git a/tools/lib/lockdep/uinclude/linux/hash.h b/tools/lib/lockdep/uinclude/linux/hash.h
deleted file mode 100644
index 0f8479858dc0..000000000000
--- a/tools/lib/lockdep/uinclude/linux/hash.h
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../include/linux/hash.h"
diff --git a/tools/lib/lockdep/uinclude/linux/interrupt.h b/tools/lib/lockdep/uinclude/linux/interrupt.h
deleted file mode 100644
index fab00ff936d1..000000000000
--- a/tools/lib/lockdep/uinclude/linux/interrupt.h
+++ /dev/null
@@ -1,3 +0,0 @@
-
-/* empty file */
-
diff --git a/tools/lib/lockdep/uinclude/linux/kernel.h b/tools/lib/lockdep/uinclude/linux/kernel.h
deleted file mode 100644
index 276c7a8b2ed1..000000000000
--- a/tools/lib/lockdep/uinclude/linux/kernel.h
+++ /dev/null
@@ -1,47 +0,0 @@
-#ifndef _LIBLOCKDEP_LINUX_KERNEL_H_
-#define _LIBLOCKDEP_LINUX_KERNEL_H_
-
-#include <linux/export.h>
-#include <linux/types.h>
-#include <linux/rcu.h>
-#include <linux/hardirq.h>
-#include <linux/kern_levels.h>
-
-#ifndef container_of
-#define container_of(ptr, type, member) ({ \
- const typeof(((type *)0)->member) * __mptr = (ptr); \
- (type *)((char *)__mptr - offsetof(type, member)); })
-#endif
-
-#define max(x, y) ({ \
- typeof(x) _max1 = (x); \
- typeof(y) _max2 = (y); \
- (void) (&_max1 == &_max2); \
- _max1 > _max2 ? _max1 : _max2; })
-
-#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
-#define WARN_ON(x) (x)
-#define WARN_ON_ONCE(x) (x)
-#define likely(x) (x)
-#define WARN(x, y...) (x)
-#define uninitialized_var(x) x
-#define __init
-#define noinline
-#define list_add_tail_rcu list_add_tail
-#define list_for_each_entry_rcu list_for_each_entry
-#define barrier()
-#define synchronize_sched()
-
-#ifndef CALLER_ADDR0
-#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
-#endif
-
-#ifndef _RET_IP_
-#define _RET_IP_ CALLER_ADDR0
-#endif
-
-#ifndef _THIS_IP_
-#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; })
-#endif
-
-#endif
diff --git a/tools/lib/lockdep/uinclude/linux/linkage.h b/tools/lib/lockdep/uinclude/linux/linkage.h
deleted file mode 100644
index fab00ff936d1..000000000000
--- a/tools/lib/lockdep/uinclude/linux/linkage.h
+++ /dev/null
@@ -1,3 +0,0 @@
-
-/* empty file */
-
diff --git a/tools/lib/lockdep/uinclude/linux/list.h b/tools/lib/lockdep/uinclude/linux/list.h
deleted file mode 100644
index 6e9ef31ed82e..000000000000
--- a/tools/lib/lockdep/uinclude/linux/list.h
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../include/linux/list.h"
diff --git a/tools/lib/lockdep/uinclude/linux/mutex.h b/tools/lib/lockdep/uinclude/linux/mutex.h
deleted file mode 100644
index fab00ff936d1..000000000000
--- a/tools/lib/lockdep/uinclude/linux/mutex.h
+++ /dev/null
@@ -1,3 +0,0 @@
-
-/* empty file */
-
diff --git a/tools/lib/lockdep/uinclude/linux/poison.h b/tools/lib/lockdep/uinclude/linux/poison.h
deleted file mode 100644
index 0c27bdf14233..000000000000
--- a/tools/lib/lockdep/uinclude/linux/poison.h
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../include/linux/poison.h"
diff --git a/tools/lib/lockdep/uinclude/linux/prefetch.h b/tools/lib/lockdep/uinclude/linux/prefetch.h
deleted file mode 100644
index d73fe6f850ac..000000000000
--- a/tools/lib/lockdep/uinclude/linux/prefetch.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _LIBLOCKDEP_LINUX_PREFETCH_H_
-#define _LIBLOCKDEP_LINUX_PREFETCH_H
-
-static inline void prefetch(void *a __attribute__((unused))) { }
-
-#endif
diff --git a/tools/lib/lockdep/uinclude/linux/proc_fs.h b/tools/lib/lockdep/uinclude/linux/proc_fs.h
deleted file mode 100644
index fab00ff936d1..000000000000
--- a/tools/lib/lockdep/uinclude/linux/proc_fs.h
+++ /dev/null
@@ -1,3 +0,0 @@
-
-/* empty file */
-
diff --git a/tools/lib/lockdep/uinclude/linux/rbtree_augmented.h b/tools/lib/lockdep/uinclude/linux/rbtree_augmented.h
deleted file mode 100644
index c3759477379c..000000000000
--- a/tools/lib/lockdep/uinclude/linux/rbtree_augmented.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#define __always_inline
-#include "../../../include/linux/rbtree_augmented.h"
diff --git a/tools/lib/lockdep/uinclude/linux/seq_file.h b/tools/lib/lockdep/uinclude/linux/seq_file.h
deleted file mode 100644
index fab00ff936d1..000000000000
--- a/tools/lib/lockdep/uinclude/linux/seq_file.h
+++ /dev/null
@@ -1,3 +0,0 @@
-
-/* empty file */
-
diff --git a/tools/lib/lockdep/uinclude/linux/spinlock.h b/tools/lib/lockdep/uinclude/linux/spinlock.h
deleted file mode 100644
index 68c1aa2bcba5..000000000000
--- a/tools/lib/lockdep/uinclude/linux/spinlock.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef _LIBLOCKDEP_SPINLOCK_H_
-#define _LIBLOCKDEP_SPINLOCK_H_
-
-#include <pthread.h>
-#include <stdbool.h>
-
-#define arch_spinlock_t pthread_mutex_t
-#define __ARCH_SPIN_LOCK_UNLOCKED PTHREAD_MUTEX_INITIALIZER
-
-static inline void arch_spin_lock(arch_spinlock_t *mutex)
-{
- pthread_mutex_lock(mutex);
-}
-
-static inline void arch_spin_unlock(arch_spinlock_t *mutex)
-{
- pthread_mutex_unlock(mutex);
-}
-
-static inline bool arch_spin_is_locked(arch_spinlock_t *mutex)
-{
- return true;
-}
-
-#endif
diff --git a/tools/lib/lockdep/uinclude/linux/stringify.h b/tools/lib/lockdep/uinclude/linux/stringify.h
deleted file mode 100644
index 05dfcd1ac118..000000000000
--- a/tools/lib/lockdep/uinclude/linux/stringify.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _LIBLOCKDEP_LINUX_STRINGIFY_H_
-#define _LIBLOCKDEP_LINUX_STRINGIFY_H_
-
-#define __stringify_1(x...) #x
-#define __stringify(x...) __stringify_1(x)
-
-#endif
diff --git a/tools/lib/lockdep/uinclude/trace/events/lock.h b/tools/lib/lockdep/uinclude/trace/events/lock.h
deleted file mode 100644
index fab00ff936d1..000000000000
--- a/tools/lib/lockdep/uinclude/trace/events/lock.h
+++ /dev/null
@@ -1,3 +0,0 @@
-
-/* empty file */
-
diff --git a/tools/objtool/Build b/tools/objtool/Build
index d6cdece5e58b..6f2e1987c4d9 100644
--- a/tools/objtool/Build
+++ b/tools/objtool/Build
@@ -1,5 +1,6 @@
objtool-y += arch/$(SRCARCH)/
objtool-y += builtin-check.o
+objtool-y += check.o
objtool-y += elf.o
objtool-y += special.o
objtool-y += objtool.o
diff --git a/tools/objtool/Documentation/stack-validation.txt b/tools/objtool/Documentation/stack-validation.txt
index 55a60d331f47..17c1195f11f4 100644
--- a/tools/objtool/Documentation/stack-validation.txt
+++ b/tools/objtool/Documentation/stack-validation.txt
@@ -127,28 +127,13 @@ b) 100% reliable stack traces for DWARF enabled kernels
c) Higher live patching compatibility rate
- (NOTE: This is not yet implemented)
-
- Currently with CONFIG_LIVEPATCH there's a basic live patching
- framework which is safe for roughly 85-90% of "security" fixes. But
- patches can't have complex features like function dependency or
- prototype changes, or data structure changes.
-
- There's a strong need to support patches which have the more complex
- features so that the patch compatibility rate for security fixes can
- eventually approach something resembling 100%. To achieve that, a
- "consistency model" is needed, which allows tasks to be safely
- transitioned from an unpatched state to a patched state.
-
- One of the key requirements of the currently proposed livepatch
- consistency model [*] is that it needs to walk the stack of each
- sleeping task to determine if it can be transitioned to the patched
- state. If objtool can ensure that stack traces are reliable, this
- consistency model can be used and the live patching compatibility
- rate can be improved significantly.
-
- [*] https://lkml.kernel.org/r/cover.1423499826.git.jpoimboe@redhat.com
+ Livepatch has an optional "consistency model", which is needed for
+ more complex patches. In order for the consistency model to work,
+ stack traces need to be reliable (or an unreliable condition needs to
+ be detectable). Objtool makes that possible.
+ For more details, see the livepatch documentation in the Linux kernel
+ source tree at Documentation/livepatch/livepatch.txt.
Rules
-----
@@ -201,80 +186,84 @@ To achieve the validation, objtool enforces the following rules:
return normally.
-Errors in .S files
-------------------
+Objtool warnings
+----------------
-If you're getting an error in a compiled .S file which you don't
-understand, first make sure that the affected code follows the above
-rules.
+For asm files, if you're getting an error which doesn't make sense,
+first make sure that the affected code follows the above rules.
+
+For C files, the common culprits are inline asm statements and calls to
+"noreturn" functions. See below for more details.
+
+Another possible cause for errors in C code is if the Makefile removes
+-fno-omit-frame-pointer or adds -fomit-frame-pointer to the gcc options.
Here are some examples of common warnings reported by objtool, what
they mean, and suggestions for how to fix them.
-1. asm_file.o: warning: objtool: func()+0x128: call without frame pointer save/setup
+1. file.o: warning: objtool: func()+0x128: call without frame pointer save/setup
The func() function made a function call without first saving and/or
- updating the frame pointer.
-
- If func() is indeed a callable function, add proper frame pointer
- logic using the FRAME_BEGIN and FRAME_END macros. Otherwise, remove
- its ELF function annotation by changing ENDPROC to END.
+ updating the frame pointer, and CONFIG_FRAME_POINTER is enabled.
- If you're getting this error in a .c file, see the "Errors in .c
- files" section.
+ If the error is for an asm file, and func() is indeed a callable
+ function, add proper frame pointer logic using the FRAME_BEGIN and
+ FRAME_END macros. Otherwise, if it's not a callable function, remove
+ its ELF function annotation by changing ENDPROC to END, and instead
+ use the manual CFI hint macros in asm/undwarf.h.
+ If it's a GCC-compiled .c file, the error may be because the function
+ uses an inline asm() statement which has a "call" instruction. An
+ asm() statement with a call instruction must declare the use of the
+ stack pointer in its output operand. For example, on x86_64:
-2. asm_file.o: warning: objtool: .text+0x53: return instruction outside of a callable function
-
- A return instruction was detected, but objtool couldn't find a way
- for a callable function to reach the instruction.
+ register void *__sp asm("rsp");
+ asm volatile("call func" : "+r" (__sp));
- If the return instruction is inside (or reachable from) a callable
- function, the function needs to be annotated with the ENTRY/ENDPROC
- macros.
+ Otherwise the stack frame may not get created before the call.
- If you _really_ need a return instruction outside of a function, and
- are 100% sure that it won't affect stack traces, you can tell
- objtool to ignore it. See the "Adding exceptions" section below.
+2. file.o: warning: objtool: .text+0x53: unreachable instruction
-3. asm_file.o: warning: objtool: func()+0x9: function has unreachable instruction
+ Objtool couldn't find a code path to reach the instruction.
- The instruction lives inside of a callable function, but there's no
- possible control flow path from the beginning of the function to the
- instruction.
+ If the error is for an asm file, and the instruction is inside (or
+ reachable from) a callable function, the function should be annotated
+ with the ENTRY/ENDPROC macros (ENDPROC is the important one).
+ Otherwise, the code should probably be annotated with the CFI hint
+ macros in asm/undwarf.h so objtool and the unwinder can know the
+ stack state associated with the code.
- If the instruction is actually needed, and it's actually in a
- callable function, ensure that its function is properly annotated
- with ENTRY/ENDPROC.
+ If you're 100% sure the code won't affect stack traces, or if you're
+ a just a bad person, you can tell objtool to ignore it. See the
+ "Adding exceptions" section below.
If it's not actually in a callable function (e.g. kernel entry code),
change ENDPROC to END.
-4. asm_file.o: warning: objtool: func(): can't find starting instruction
+4. file.o: warning: objtool: func(): can't find starting instruction
or
- asm_file.o: warning: objtool: func()+0x11dd: can't decode instruction
+ file.o: warning: objtool: func()+0x11dd: can't decode instruction
- Did you put data in a text section? If so, that can confuse
+ Does the file have data in a text section? If so, that can confuse
objtool's instruction decoder. Move the data to a more appropriate
section like .data or .rodata.
-5. asm_file.o: warning: objtool: func()+0x6: kernel entry/exit from callable instruction
-
- This is a kernel entry/exit instruction like sysenter or sysret.
- Such instructions aren't allowed in a callable function, and are most
- likely part of the kernel entry code.
+5. file.o: warning: objtool: func()+0x6: unsupported instruction in callable function
- If the instruction isn't actually in a callable function, change
- ENDPROC to END.
+ This is a kernel entry/exit instruction like sysenter or iret. Such
+ instructions aren't allowed in a callable function, and are most
+ likely part of the kernel entry code. They should usually not have
+ the callable function annotation (ENDPROC) and should always be
+ annotated with the CFI hint macros in asm/undwarf.h.
-6. asm_file.o: warning: objtool: func()+0x26: sibling call from callable instruction with changed frame pointer
+6. file.o: warning: objtool: func()+0x26: sibling call from callable instruction with modified stack frame
- This is a dynamic jump or a jump to an undefined symbol. Stacktool
+ This is a dynamic jump or a jump to an undefined symbol. Objtool
assumed it's a sibling call and detected that the frame pointer
wasn't first restored to its original state.
@@ -282,24 +271,28 @@ they mean, and suggestions for how to fix them.
destination code to the local file.
If the instruction is not actually in a callable function (e.g.
- kernel entry code), change ENDPROC to END.
+ kernel entry code), change ENDPROC to END and annotate manually with
+ the CFI hint macros in asm/undwarf.h.
-7. asm_file: warning: objtool: func()+0x5c: frame pointer state mismatch
+7. file: warning: objtool: func()+0x5c: stack state mismatch
The instruction's frame pointer state is inconsistent, depending on
which execution path was taken to reach the instruction.
- Make sure the function pushes and sets up the frame pointer (for
- x86_64, this means rbp) at the beginning of the function and pops it
- at the end of the function. Also make sure that no other code in the
- function touches the frame pointer.
+ Make sure that, when CONFIG_FRAME_POINTER is enabled, the function
+ pushes and sets up the frame pointer (for x86_64, this means rbp) at
+ the beginning of the function and pops it at the end of the function.
+ Also make sure that no other code in the function touches the frame
+ pointer.
+ Another possibility is that the code has some asm or inline asm which
+ does some unusual things to the stack or the frame pointer. In such
+ cases it's probably appropriate to use the CFI hint macros in
+ asm/undwarf.h.
-Errors in .c files
-------------------
-1. c_file.o: warning: objtool: funcA() falls through to next function funcB()
+8. file.o: warning: objtool: funcA() falls through to next function funcB()
This means that funcA() doesn't end with a return instruction or an
unconditional jump, and that objtool has determined that the function
@@ -318,22 +311,6 @@ Errors in .c files
might be corrupt due to a gcc bug. For more details, see:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70646
-2. If you're getting any other objtool error in a compiled .c file, it
- may be because the file uses an asm() statement which has a "call"
- instruction. An asm() statement with a call instruction must declare
- the use of the stack pointer in its output operand. For example, on
- x86_64:
-
- register void *__sp asm("rsp");
- asm volatile("call func" : "+r" (__sp));
-
- Otherwise the stack frame may not get created before the call.
-
-3. Another possible cause for errors in C code is if the Makefile removes
- -fno-omit-frame-pointer or adds -fomit-frame-pointer to the gcc options.
-
-Also see the above section for .S file errors for more information what
-the individual error messages mean.
If the error doesn't seem to make sense, it could be a bug in objtool.
Feel free to ask the objtool maintainer for help.
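As a concrete illustration of the fix suggested for warning 1 in asm files, a callable assembly function is typically wrapped with the frame-pointer macros, roughly like this (a minimal sketch assuming <linux/linkage.h> and <asm/frame.h> are included and do_thing is a hypothetical callee):

	ENTRY(my_asm_helper)
		FRAME_BEGIN
		call	do_thing
		FRAME_END
		ret
	ENDPROC(my_asm_helper)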
diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index 27e019c09bd2..0e2765e243c0 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -25,7 +25,7 @@ OBJTOOL_IN := $(OBJTOOL)-in.o
all: $(OBJTOOL)
INCLUDES := -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi
-CFLAGS += -Wall -Werror $(EXTRA_WARNINGS) -fomit-frame-pointer -O2 -g $(INCLUDES)
+CFLAGS += -Wall -Werror $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -fomit-frame-pointer -O2 -g $(INCLUDES)
LDFLAGS += -lelf $(LIBSUBCMD)
# Allow old libelf to be used:
diff --git a/tools/objtool/arch.h b/tools/objtool/arch.h
index a59e061c0b4a..21aeca874edb 100644
--- a/tools/objtool/arch.h
+++ b/tools/objtool/arch.h
@@ -19,25 +19,63 @@
#define _ARCH_H
#include <stdbool.h>
+#include <linux/list.h>
#include "elf.h"
+#include "cfi.h"
-#define INSN_FP_SAVE 1
-#define INSN_FP_SETUP 2
-#define INSN_FP_RESTORE 3
-#define INSN_JUMP_CONDITIONAL 4
-#define INSN_JUMP_UNCONDITIONAL 5
-#define INSN_JUMP_DYNAMIC 6
-#define INSN_CALL 7
-#define INSN_CALL_DYNAMIC 8
-#define INSN_RETURN 9
-#define INSN_CONTEXT_SWITCH 10
-#define INSN_NOP 11
-#define INSN_OTHER 12
+#define INSN_JUMP_CONDITIONAL 1
+#define INSN_JUMP_UNCONDITIONAL 2
+#define INSN_JUMP_DYNAMIC 3
+#define INSN_CALL 4
+#define INSN_CALL_DYNAMIC 5
+#define INSN_RETURN 6
+#define INSN_CONTEXT_SWITCH 7
+#define INSN_STACK 8
+#define INSN_NOP 9
+#define INSN_OTHER 10
#define INSN_LAST INSN_OTHER
+enum op_dest_type {
+ OP_DEST_REG,
+ OP_DEST_REG_INDIRECT,
+ OP_DEST_MEM,
+ OP_DEST_PUSH,
+ OP_DEST_LEAVE,
+};
+
+struct op_dest {
+ enum op_dest_type type;
+ unsigned char reg;
+ int offset;
+};
+
+enum op_src_type {
+ OP_SRC_REG,
+ OP_SRC_REG_INDIRECT,
+ OP_SRC_CONST,
+ OP_SRC_POP,
+ OP_SRC_ADD,
+ OP_SRC_AND,
+};
+
+struct op_src {
+ enum op_src_type type;
+ unsigned char reg;
+ int offset;
+};
+
+struct stack_op {
+ struct op_dest dest;
+ struct op_src src;
+};
+
+void arch_initial_func_cfi_state(struct cfi_state *state);
+
int arch_decode_instruction(struct elf *elf, struct section *sec,
unsigned long offset, unsigned int maxlen,
unsigned int *len, unsigned char *type,
- unsigned long *displacement);
+ unsigned long *immediate, struct stack_op *op);
+
+bool arch_callee_saved_reg(unsigned char reg);
#endif /* _ARCH_H */
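The struct stack_op introduced above lets the arch decoder report every stack-modifying instruction as a generic source/destination pair that the checker can interpret without knowing x86 encodings. As a rough illustration (matching the 0x50 ... 0x57 case in the x86 decoder below), a plain "push %rbp" would be described as:

	/* Illustrative only: how "push %rbp" is reported (type INSN_STACK). */
	struct stack_op op = {
		.src  = { .type = OP_SRC_REG,  .reg = CFI_BP },
		.dest = { .type = OP_DEST_PUSH },
	};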
diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index 6ac99e3266eb..a36c2eba64e7 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -27,6 +27,17 @@
#include "../../arch.h"
#include "../../warn.h"
+static unsigned char op_to_cfi_reg[][2] = {
+ {CFI_AX, CFI_R8},
+ {CFI_CX, CFI_R9},
+ {CFI_DX, CFI_R10},
+ {CFI_BX, CFI_R11},
+ {CFI_SP, CFI_R12},
+ {CFI_BP, CFI_R13},
+ {CFI_SI, CFI_R14},
+ {CFI_DI, CFI_R15},
+};
+
static int is_x86_64(struct elf *elf)
{
switch (elf->ehdr.e_machine) {
@@ -40,24 +51,50 @@ static int is_x86_64(struct elf *elf)
}
}
+bool arch_callee_saved_reg(unsigned char reg)
+{
+ switch (reg) {
+ case CFI_BP:
+ case CFI_BX:
+ case CFI_R12:
+ case CFI_R13:
+ case CFI_R14:
+ case CFI_R15:
+ return true;
+
+ case CFI_AX:
+ case CFI_CX:
+ case CFI_DX:
+ case CFI_SI:
+ case CFI_DI:
+ case CFI_SP:
+ case CFI_R8:
+ case CFI_R9:
+ case CFI_R10:
+ case CFI_R11:
+ case CFI_RA:
+ default:
+ return false;
+ }
+}
+
int arch_decode_instruction(struct elf *elf, struct section *sec,
unsigned long offset, unsigned int maxlen,
unsigned int *len, unsigned char *type,
- unsigned long *immediate)
+ unsigned long *immediate, struct stack_op *op)
{
struct insn insn;
- int x86_64;
- unsigned char op1, op2, ext;
+ int x86_64, sign;
+ unsigned char op1, op2, rex = 0, rex_b = 0, rex_r = 0, rex_w = 0,
+ modrm = 0, modrm_mod = 0, modrm_rm = 0, modrm_reg = 0,
+ sib = 0;
x86_64 = is_x86_64(elf);
if (x86_64 == -1)
return -1;
- insn_init(&insn, (void *)(sec->data + offset), maxlen, x86_64);
+ insn_init(&insn, sec->data->d_buf + offset, maxlen, x86_64);
insn_get_length(&insn);
- insn_get_opcode(&insn);
- insn_get_modrm(&insn);
- insn_get_immediate(&insn);
if (!insn_complete(&insn)) {
WARN_FUNC("can't decode instruction", sec, offset);
@@ -73,67 +110,323 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
op1 = insn.opcode.bytes[0];
op2 = insn.opcode.bytes[1];
+ if (insn.rex_prefix.nbytes) {
+ rex = insn.rex_prefix.bytes[0];
+ rex_w = X86_REX_W(rex) >> 3;
+ rex_r = X86_REX_R(rex) >> 2;
+ rex_b = X86_REX_B(rex);
+ }
+
+ if (insn.modrm.nbytes) {
+ modrm = insn.modrm.bytes[0];
+ modrm_mod = X86_MODRM_MOD(modrm);
+ modrm_reg = X86_MODRM_REG(modrm);
+ modrm_rm = X86_MODRM_RM(modrm);
+ }
+
+ if (insn.sib.nbytes)
+ sib = insn.sib.bytes[0];
+
switch (op1) {
- case 0x55:
- if (!insn.rex_prefix.nbytes)
- /* push rbp */
- *type = INSN_FP_SAVE;
+
+ case 0x1:
+ case 0x29:
+ if (rex_w && !rex_b && modrm_mod == 3 && modrm_rm == 4) {
+
+ /* add/sub reg, %rsp */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_ADD;
+ op->src.reg = op_to_cfi_reg[modrm_reg][rex_r];
+ op->dest.type = OP_SRC_REG;
+ op->dest.reg = CFI_SP;
+ }
+ break;
+
+ case 0x50 ... 0x57:
+
+ /* push reg */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_REG;
+ op->src.reg = op_to_cfi_reg[op1 & 0x7][rex_b];
+ op->dest.type = OP_DEST_PUSH;
+
break;
- case 0x5d:
- if (!insn.rex_prefix.nbytes)
- /* pop rbp */
- *type = INSN_FP_RESTORE;
+ case 0x58 ... 0x5f:
+
+ /* pop reg */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_POP;
+ op->dest.type = OP_DEST_REG;
+ op->dest.reg = op_to_cfi_reg[op1 & 0x7][rex_b];
+
+ break;
+
+ case 0x68:
+ case 0x6a:
+ /* push immediate */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_CONST;
+ op->dest.type = OP_DEST_PUSH;
break;
case 0x70 ... 0x7f:
*type = INSN_JUMP_CONDITIONAL;
break;
+ case 0x81:
+ case 0x83:
+ if (rex != 0x48)
+ break;
+
+ if (modrm == 0xe4) {
+ /* and imm, %rsp */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_AND;
+ op->src.reg = CFI_SP;
+ op->src.offset = insn.immediate.value;
+ op->dest.type = OP_DEST_REG;
+ op->dest.reg = CFI_SP;
+ break;
+ }
+
+ if (modrm == 0xc4)
+ sign = 1;
+ else if (modrm == 0xec)
+ sign = -1;
+ else
+ break;
+
+ /* add/sub imm, %rsp */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_ADD;
+ op->src.reg = CFI_SP;
+ op->src.offset = insn.immediate.value * sign;
+ op->dest.type = OP_DEST_REG;
+ op->dest.reg = CFI_SP;
+ break;
+
case 0x89:
- if (insn.rex_prefix.nbytes == 1 &&
- insn.rex_prefix.bytes[0] == 0x48 &&
- insn.modrm.nbytes && insn.modrm.bytes[0] == 0xe5)
- /* mov rsp, rbp */
- *type = INSN_FP_SETUP;
+ if (rex == 0x48 && modrm == 0xe5) {
+
+ /* mov %rsp, %rbp */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_REG;
+ op->src.reg = CFI_SP;
+ op->dest.type = OP_DEST_REG;
+ op->dest.reg = CFI_BP;
+ break;
+ }
+ /* fallthrough */
+ case 0x88:
+ if (!rex_b &&
+ (modrm_mod == 1 || modrm_mod == 2) && modrm_rm == 5) {
+
+ /* mov reg, disp(%rbp) */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_REG;
+ op->src.reg = op_to_cfi_reg[modrm_reg][rex_r];
+ op->dest.type = OP_DEST_REG_INDIRECT;
+ op->dest.reg = CFI_BP;
+ op->dest.offset = insn.displacement.value;
+
+ } else if (rex_w && !rex_b && modrm_rm == 4 && sib == 0x24) {
+
+ /* mov reg, disp(%rsp) */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_REG;
+ op->src.reg = op_to_cfi_reg[modrm_reg][rex_r];
+ op->dest.type = OP_DEST_REG_INDIRECT;
+ op->dest.reg = CFI_SP;
+ op->dest.offset = insn.displacement.value;
+ }
+
+ break;
+
+ case 0x8b:
+ if (rex_w && !rex_b && modrm_mod == 1 && modrm_rm == 5) {
+
+ /* mov disp(%rbp), reg */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_REG_INDIRECT;
+ op->src.reg = CFI_BP;
+ op->src.offset = insn.displacement.value;
+ op->dest.type = OP_DEST_REG;
+ op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r];
+
+ } else if (rex_w && !rex_b && sib == 0x24 &&
+ modrm_mod != 3 && modrm_rm == 4) {
+
+ /* mov disp(%rsp), reg */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_REG_INDIRECT;
+ op->src.reg = CFI_SP;
+ op->src.offset = insn.displacement.value;
+ op->dest.type = OP_DEST_REG;
+ op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r];
+ }
+
break;
case 0x8d:
- if (insn.rex_prefix.nbytes &&
- insn.rex_prefix.bytes[0] == 0x48 &&
- insn.modrm.nbytes && insn.modrm.bytes[0] == 0x2c &&
- insn.sib.nbytes && insn.sib.bytes[0] == 0x24)
- /* lea %(rsp), %rbp */
- *type = INSN_FP_SETUP;
+ if (rex == 0x48 && modrm == 0x65) {
+
+ /* lea -disp(%rbp), %rsp */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_ADD;
+ op->src.reg = CFI_BP;
+ op->src.offset = insn.displacement.value;
+ op->dest.type = OP_DEST_REG;
+ op->dest.reg = CFI_SP;
+ break;
+ }
+
+ if (rex == 0x4c && modrm == 0x54 && sib == 0x24 &&
+ insn.displacement.value == 8) {
+
+ /*
+ * lea 0x8(%rsp), %r10
+ *
+ * Here r10 is the "drap" pointer, used as a stack
+ * pointer helper when the stack gets realigned.
+ */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_ADD;
+ op->src.reg = CFI_SP;
+ op->src.offset = 8;
+ op->dest.type = OP_DEST_REG;
+ op->dest.reg = CFI_R10;
+ break;
+ }
+
+ if (rex == 0x4c && modrm == 0x6c && sib == 0x24 &&
+ insn.displacement.value == 16) {
+
+ /*
+ * lea 0x10(%rsp), %r13
+ *
+ * Here r13 is the "drap" pointer, used as a stack
+ * pointer helper when the stack gets realigned.
+ */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_ADD;
+ op->src.reg = CFI_SP;
+ op->src.offset = 16;
+ op->dest.type = OP_DEST_REG;
+ op->dest.reg = CFI_R13;
+ break;
+ }
+
+ if (rex == 0x49 && modrm == 0x62 &&
+ insn.displacement.value == -8) {
+
+ /*
+ * lea -0x8(%r10), %rsp
+ *
+ * Restoring rsp back to its original value after a
+ * stack realignment.
+ */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_ADD;
+ op->src.reg = CFI_R10;
+ op->src.offset = -8;
+ op->dest.type = OP_DEST_REG;
+ op->dest.reg = CFI_SP;
+ break;
+ }
+
+ if (rex == 0x49 && modrm == 0x65 &&
+ insn.displacement.value == -16) {
+
+ /*
+ * lea -0x10(%r13), %rsp
+ *
+ * Restoring rsp back to its original value after a
+ * stack realignment.
+ */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_ADD;
+ op->src.reg = CFI_R13;
+ op->src.offset = -16;
+ op->dest.type = OP_DEST_REG;
+ op->dest.reg = CFI_SP;
+ break;
+ }
+
+ break;
+
+ case 0x8f:
+ /* pop to mem */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_POP;
+ op->dest.type = OP_DEST_MEM;
break;
case 0x90:
*type = INSN_NOP;
break;
+ case 0x9c:
+ /* pushf */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_CONST;
+ op->dest.type = OP_DEST_PUSH;
+ break;
+
+ case 0x9d:
+ /* popf */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_POP;
+ op->dest.type = OP_DEST_MEM;
+ break;
+
case 0x0f:
+
if (op2 >= 0x80 && op2 <= 0x8f)
*type = INSN_JUMP_CONDITIONAL;
else if (op2 == 0x05 || op2 == 0x07 || op2 == 0x34 ||
op2 == 0x35)
+
/* sysenter, sysret */
*type = INSN_CONTEXT_SWITCH;
+
else if (op2 == 0x0d || op2 == 0x1f)
+
/* nopl/nopw */
*type = INSN_NOP;
- else if (op2 == 0x01 && insn.modrm.nbytes &&
- (insn.modrm.bytes[0] == 0xc2 ||
- insn.modrm.bytes[0] == 0xd8))
- /* vmlaunch, vmrun */
- *type = INSN_CONTEXT_SWITCH;
+
+ else if (op2 == 0xa0 || op2 == 0xa8) {
+
+ /* push fs/gs */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_CONST;
+ op->dest.type = OP_DEST_PUSH;
+
+ } else if (op2 == 0xa1 || op2 == 0xa9) {
+
+ /* pop fs/gs */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_POP;
+ op->dest.type = OP_DEST_MEM;
+ }
break;
- case 0xc9: /* leave */
- *type = INSN_FP_RESTORE;
+ case 0xc9:
+ /*
+ * leave
+ *
+ * equivalent to:
+ * mov bp, sp
+ * pop bp
+ */
+ *type = INSN_STACK;
+ op->dest.type = OP_DEST_LEAVE;
+
break;
- case 0xe3: /* jecxz/jrcxz */
+ case 0xe3:
+ /* jecxz/jrcxz */
*type = INSN_JUMP_CONDITIONAL;
break;
@@ -158,14 +451,27 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
break;
case 0xff:
- ext = X86_MODRM_REG(insn.modrm.bytes[0]);
- if (ext == 2 || ext == 3)
+ if (modrm_reg == 2 || modrm_reg == 3)
+
*type = INSN_CALL_DYNAMIC;
- else if (ext == 4)
+
+ else if (modrm_reg == 4)
+
*type = INSN_JUMP_DYNAMIC;
- else if (ext == 5) /*jmpf */
+
+ else if (modrm_reg == 5)
+
+ /* jmpf */
*type = INSN_CONTEXT_SWITCH;
+ else if (modrm_reg == 6) {
+
+ /* push from mem */
+ *type = INSN_STACK;
+ op->src.type = OP_SRC_CONST;
+ op->dest.type = OP_DEST_PUSH;
+ }
+
break;
default:
@@ -176,3 +482,21 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
return 0;
}
+
+void arch_initial_func_cfi_state(struct cfi_state *state)
+{
+ int i;
+
+ for (i = 0; i < CFI_NUM_REGS; i++) {
+ state->regs[i].base = CFI_UNDEFINED;
+ state->regs[i].offset = 0;
+ }
+
+ /* initial CFA (call frame address) */
+ state->cfa.base = CFI_SP;
+ state->cfa.offset = 8;
+
+ /* initial RA (return address) */
+ state->regs[16].base = CFI_CFA;
+ state->regs[16].offset = -8;
+}
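arch_initial_func_cfi_state() above encodes the x86-64 situation at the first instruction of a function: the CALL that got us here has just pushed the return address, so the call frame address (the caller's %rsp before the call) is the current %rsp plus 8, and the return address itself sits at CFA - 8, which is what the RA register slot (index 16) records. With made-up numbers:

	/* Illustrative stack layout at function entry (addresses invented): */
	/*   0xffffc90000013f20   CFA = caller's %rsp before the call        */
	/*   0xffffc90000013f18   current %rsp, holds return address = CFA-8 */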
diff --git a/tools/objtool/arch/x86/insn/x86-opcode-map.txt b/tools/objtool/arch/x86/insn/x86-opcode-map.txt
index 767be7c76034..12e377184ee4 100644
--- a/tools/objtool/arch/x86/insn/x86-opcode-map.txt
+++ b/tools/objtool/arch/x86/insn/x86-opcode-map.txt
@@ -1009,7 +1009,7 @@ GrpTable: Grp15
1: fxstor | RDGSBASE Ry (F3),(11B)
2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B)
3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B)
-4: XSAVE
+4: XSAVE | ptwrite Ey (F3),(11B)
5: XRSTOR | lfence (11B)
6: XSAVEOPT | clwb (66) | mfence (11B)
7: clflush | clflushopt (66) | sfence (11B)
diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c
index 282a60368b14..365c34ecab26 100644
--- a/tools/objtool/builtin-check.c
+++ b/tools/objtool/builtin-check.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2015 Josh Poimboeuf <jpoimboe@redhat.com>
+ * Copyright (C) 2015-2017 Josh Poimboeuf <jpoimboe@redhat.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -25,1286 +25,32 @@
* For more information, see tools/objtool/Documentation/stack-validation.txt.
*/
-#include <string.h>
-#include <stdlib.h>
#include <subcmd/parse-options.h>
-
#include "builtin.h"
-#include "elf.h"
-#include "special.h"
-#include "arch.h"
-#include "warn.h"
-
-#include <linux/hashtable.h>
-#include <linux/kernel.h>
-
-#define STATE_FP_SAVED 0x1
-#define STATE_FP_SETUP 0x2
-#define STATE_FENTRY 0x4
-
-struct instruction {
- struct list_head list;
- struct hlist_node hash;
- struct section *sec;
- unsigned long offset;
- unsigned int len, state;
- unsigned char type;
- unsigned long immediate;
- bool alt_group, visited, dead_end;
- struct symbol *call_dest;
- struct instruction *jump_dest;
- struct list_head alts;
- struct symbol *func;
-};
-
-struct alternative {
- struct list_head list;
- struct instruction *insn;
-};
-
-struct objtool_file {
- struct elf *elf;
- struct list_head insn_list;
- DECLARE_HASHTABLE(insn_hash, 16);
- struct section *rodata, *whitelist;
- bool ignore_unreachables, c_file;
-};
-
-const char *objname;
-static bool nofp;
-
-static struct instruction *find_insn(struct objtool_file *file,
- struct section *sec, unsigned long offset)
-{
- struct instruction *insn;
-
- hash_for_each_possible(file->insn_hash, insn, hash, offset)
- if (insn->sec == sec && insn->offset == offset)
- return insn;
-
- return NULL;
-}
-
-static struct instruction *next_insn_same_sec(struct objtool_file *file,
- struct instruction *insn)
-{
- struct instruction *next = list_next_entry(insn, list);
-
- if (&next->list == &file->insn_list || next->sec != insn->sec)
- return NULL;
-
- return next;
-}
-
-static bool gcov_enabled(struct objtool_file *file)
-{
- struct section *sec;
- struct symbol *sym;
-
- list_for_each_entry(sec, &file->elf->sections, list)
- list_for_each_entry(sym, &sec->symbol_list, list)
- if (!strncmp(sym->name, "__gcov_.", 8))
- return true;
-
- return false;
-}
-
-#define for_each_insn(file, insn) \
- list_for_each_entry(insn, &file->insn_list, list)
-
-#define func_for_each_insn(file, func, insn) \
- for (insn = find_insn(file, func->sec, func->offset); \
- insn && &insn->list != &file->insn_list && \
- insn->sec == func->sec && \
- insn->offset < func->offset + func->len; \
- insn = list_next_entry(insn, list))
-
-#define func_for_each_insn_continue_reverse(file, func, insn) \
- for (insn = list_prev_entry(insn, list); \
- &insn->list != &file->insn_list && \
- insn->sec == func->sec && insn->offset >= func->offset; \
- insn = list_prev_entry(insn, list))
-
-#define sec_for_each_insn_from(file, insn) \
- for (; insn; insn = next_insn_same_sec(file, insn))
-
-
-/*
- * Check if the function has been manually whitelisted with the
- * STACK_FRAME_NON_STANDARD macro, or if it should be automatically whitelisted
- * due to its use of a context switching instruction.
- */
-static bool ignore_func(struct objtool_file *file, struct symbol *func)
-{
- struct rela *rela;
- struct instruction *insn;
-
- /* check for STACK_FRAME_NON_STANDARD */
- if (file->whitelist && file->whitelist->rela)
- list_for_each_entry(rela, &file->whitelist->rela->rela_list, list) {
- if (rela->sym->type == STT_SECTION &&
- rela->sym->sec == func->sec &&
- rela->addend == func->offset)
- return true;
- if (rela->sym->type == STT_FUNC && rela->sym == func)
- return true;
- }
-
- /* check if it has a context switching instruction */
- func_for_each_insn(file, func, insn)
- if (insn->type == INSN_CONTEXT_SWITCH)
- return true;
-
- return false;
-}
-
-/*
- * This checks to see if the given function is a "noreturn" function.
- *
- * For global functions which are outside the scope of this object file, we
- * have to keep a manual list of them.
- *
- * For local functions, we have to detect them manually by simply looking for
- * the lack of a return instruction.
- *
- * Returns:
- * -1: error
- * 0: no dead end
- * 1: dead end
- */
-static int __dead_end_function(struct objtool_file *file, struct symbol *func,
- int recursion)
-{
- int i;
- struct instruction *insn;
- bool empty = true;
-
- /*
- * Unfortunately these have to be hard coded because the noreturn
- * attribute isn't provided in ELF data.
- */
- static const char * const global_noreturns[] = {
- "__stack_chk_fail",
- "panic",
- "do_exit",
- "do_task_dead",
- "__module_put_and_exit",
- "complete_and_exit",
- "kvm_spurious_fault",
- "__reiserfs_panic",
- "lbug_with_loc"
- };
-
- if (func->bind == STB_WEAK)
- return 0;
-
- if (func->bind == STB_GLOBAL)
- for (i = 0; i < ARRAY_SIZE(global_noreturns); i++)
- if (!strcmp(func->name, global_noreturns[i]))
- return 1;
-
- if (!func->sec)
- return 0;
-
- func_for_each_insn(file, func, insn) {
- empty = false;
-
- if (insn->type == INSN_RETURN)
- return 0;
- }
-
- if (empty)
- return 0;
-
- /*
- * A function can have a sibling call instead of a return. In that
- * case, the function's dead-end status depends on whether the target
- * of the sibling call returns.
- */
- func_for_each_insn(file, func, insn) {
- if (insn->sec != func->sec ||
- insn->offset >= func->offset + func->len)
- break;
-
- if (insn->type == INSN_JUMP_UNCONDITIONAL) {
- struct instruction *dest = insn->jump_dest;
- struct symbol *dest_func;
-
- if (!dest)
- /* sibling call to another file */
- return 0;
-
- if (dest->sec != func->sec ||
- dest->offset < func->offset ||
- dest->offset >= func->offset + func->len) {
- /* local sibling call */
- dest_func = find_symbol_by_offset(dest->sec,
- dest->offset);
- if (!dest_func)
- continue;
-
- if (recursion == 5) {
- WARN_FUNC("infinite recursion (objtool bug!)",
- dest->sec, dest->offset);
- return -1;
- }
-
- return __dead_end_function(file, dest_func,
- recursion + 1);
- }
- }
-
- if (insn->type == INSN_JUMP_DYNAMIC && list_empty(&insn->alts))
- /* sibling call */
- return 0;
- }
-
- return 1;
-}
-
-static int dead_end_function(struct objtool_file *file, struct symbol *func)
-{
- return __dead_end_function(file, func, 0);
-}
-
-/*
- * Call the arch-specific instruction decoder for all the instructions and add
- * them to the global instruction list.
- */
-static int decode_instructions(struct objtool_file *file)
-{
- struct section *sec;
- struct symbol *func;
- unsigned long offset;
- struct instruction *insn;
- int ret;
-
- list_for_each_entry(sec, &file->elf->sections, list) {
-
- if (!(sec->sh.sh_flags & SHF_EXECINSTR))
- continue;
-
- for (offset = 0; offset < sec->len; offset += insn->len) {
- insn = malloc(sizeof(*insn));
- memset(insn, 0, sizeof(*insn));
-
- INIT_LIST_HEAD(&insn->alts);
- insn->sec = sec;
- insn->offset = offset;
-
- ret = arch_decode_instruction(file->elf, sec, offset,
- sec->len - offset,
- &insn->len, &insn->type,
- &insn->immediate);
- if (ret)
- return ret;
-
- if (!insn->type || insn->type > INSN_LAST) {
- WARN_FUNC("invalid instruction type %d",
- insn->sec, insn->offset, insn->type);
- return -1;
- }
-
- hash_add(file->insn_hash, &insn->hash, insn->offset);
- list_add_tail(&insn->list, &file->insn_list);
- }
-
- list_for_each_entry(func, &sec->symbol_list, list) {
- if (func->type != STT_FUNC)
- continue;
-
- if (!find_insn(file, sec, func->offset)) {
- WARN("%s(): can't find starting instruction",
- func->name);
- return -1;
- }
-
- func_for_each_insn(file, func, insn)
- if (!insn->func)
- insn->func = func;
- }
- }
-
- return 0;
-}
-
-/*
- * Find all uses of the unreachable() macro, which are code path dead ends.
- */
-static int add_dead_ends(struct objtool_file *file)
-{
- struct section *sec;
- struct rela *rela;
- struct instruction *insn;
- bool found;
-
- sec = find_section_by_name(file->elf, ".rela.discard.unreachable");
- if (!sec)
- return 0;
-
- list_for_each_entry(rela, &sec->rela_list, list) {
- if (rela->sym->type != STT_SECTION) {
- WARN("unexpected relocation symbol type in %s", sec->name);
- return -1;
- }
- insn = find_insn(file, rela->sym->sec, rela->addend);
- if (insn)
- insn = list_prev_entry(insn, list);
- else if (rela->addend == rela->sym->sec->len) {
- found = false;
- list_for_each_entry_reverse(insn, &file->insn_list, list) {
- if (insn->sec == rela->sym->sec) {
- found = true;
- break;
- }
- }
-
- if (!found) {
- WARN("can't find unreachable insn at %s+0x%x",
- rela->sym->sec->name, rela->addend);
- return -1;
- }
- } else {
- WARN("can't find unreachable insn at %s+0x%x",
- rela->sym->sec->name, rela->addend);
- return -1;
- }
-
- insn->dead_end = true;
- }
-
- return 0;
-}
-
-/*
- * Warnings shouldn't be reported for ignored functions.
- */
-static void add_ignores(struct objtool_file *file)
-{
- struct instruction *insn;
- struct section *sec;
- struct symbol *func;
-
- list_for_each_entry(sec, &file->elf->sections, list) {
- list_for_each_entry(func, &sec->symbol_list, list) {
- if (func->type != STT_FUNC)
- continue;
-
- if (!ignore_func(file, func))
- continue;
-
- func_for_each_insn(file, func, insn)
- insn->visited = true;
- }
- }
-}
-
-/*
- * Find the destination instructions for all jumps.
- */
-static int add_jump_destinations(struct objtool_file *file)
-{
- struct instruction *insn;
- struct rela *rela;
- struct section *dest_sec;
- unsigned long dest_off;
-
- for_each_insn(file, insn) {
- if (insn->type != INSN_JUMP_CONDITIONAL &&
- insn->type != INSN_JUMP_UNCONDITIONAL)
- continue;
-
- /* skip ignores */
- if (insn->visited)
- continue;
-
- rela = find_rela_by_dest_range(insn->sec, insn->offset,
- insn->len);
- if (!rela) {
- dest_sec = insn->sec;
- dest_off = insn->offset + insn->len + insn->immediate;
- } else if (rela->sym->type == STT_SECTION) {
- dest_sec = rela->sym->sec;
- dest_off = rela->addend + 4;
- } else if (rela->sym->sec->idx) {
- dest_sec = rela->sym->sec;
- dest_off = rela->sym->sym.st_value + rela->addend + 4;
- } else {
- /* sibling call */
- insn->jump_dest = 0;
- continue;
- }
-
- insn->jump_dest = find_insn(file, dest_sec, dest_off);
- if (!insn->jump_dest) {
-
- /*
- * This is a special case where an alt instruction
- * jumps past the end of the section. These are
- * handled later in handle_group_alt().
- */
- if (!strcmp(insn->sec->name, ".altinstr_replacement"))
- continue;
-
- WARN_FUNC("can't find jump dest instruction at %s+0x%lx",
- insn->sec, insn->offset, dest_sec->name,
- dest_off);
- return -1;
- }
- }
-
- return 0;
-}
-
-/*
- * Find the destination instructions for all calls.
- */
-static int add_call_destinations(struct objtool_file *file)
-{
- struct instruction *insn;
- unsigned long dest_off;
- struct rela *rela;
-
- for_each_insn(file, insn) {
- if (insn->type != INSN_CALL)
- continue;
-
- rela = find_rela_by_dest_range(insn->sec, insn->offset,
- insn->len);
- if (!rela) {
- dest_off = insn->offset + insn->len + insn->immediate;
- insn->call_dest = find_symbol_by_offset(insn->sec,
- dest_off);
- if (!insn->call_dest) {
- WARN_FUNC("can't find call dest symbol at offset 0x%lx",
- insn->sec, insn->offset, dest_off);
- return -1;
- }
- } else if (rela->sym->type == STT_SECTION) {
- insn->call_dest = find_symbol_by_offset(rela->sym->sec,
- rela->addend+4);
- if (!insn->call_dest ||
- insn->call_dest->type != STT_FUNC) {
- WARN_FUNC("can't find call dest symbol at %s+0x%x",
- insn->sec, insn->offset,
- rela->sym->sec->name,
- rela->addend + 4);
- return -1;
- }
- } else
- insn->call_dest = rela->sym;
- }
-
- return 0;
-}
-
-/*
- * The .alternatives section requires some extra special care, over and above
- * what other special sections require:
- *
- * 1. Because alternatives are patched in-place, we need to insert a fake jump
- * instruction at the end so that validate_branch() skips all the original
- * replaced instructions when validating the new instruction path.
- *
- * 2. An added wrinkle is that the new instruction length might be zero. In
- * that case the old instructions are replaced with noops. We simulate that
- * by creating a fake jump as the only new instruction.
- *
- * 3. In some cases, the alternative section includes an instruction which
- * conditionally jumps to the _end_ of the entry. We have to modify these
- * jumps' destinations to point back to .text rather than the end of the
- * entry in .altinstr_replacement.
- *
- * 4. It has been requested that we don't validate the !POPCNT feature path
- * which is a "very very small percentage of machines".
- */
-static int handle_group_alt(struct objtool_file *file,
- struct special_alt *special_alt,
- struct instruction *orig_insn,
- struct instruction **new_insn)
-{
- struct instruction *last_orig_insn, *last_new_insn, *insn, *fake_jump;
- unsigned long dest_off;
-
- last_orig_insn = NULL;
- insn = orig_insn;
- sec_for_each_insn_from(file, insn) {
- if (insn->offset >= special_alt->orig_off + special_alt->orig_len)
- break;
-
- if (special_alt->skip_orig)
- insn->type = INSN_NOP;
-
- insn->alt_group = true;
- last_orig_insn = insn;
- }
-
- if (!next_insn_same_sec(file, last_orig_insn)) {
- WARN("%s: don't know how to handle alternatives at end of section",
- special_alt->orig_sec->name);
- return -1;
- }
-
- fake_jump = malloc(sizeof(*fake_jump));
- if (!fake_jump) {
- WARN("malloc failed");
- return -1;
- }
- memset(fake_jump, 0, sizeof(*fake_jump));
- INIT_LIST_HEAD(&fake_jump->alts);
- fake_jump->sec = special_alt->new_sec;
- fake_jump->offset = -1;
- fake_jump->type = INSN_JUMP_UNCONDITIONAL;
- fake_jump->jump_dest = list_next_entry(last_orig_insn, list);
-
- if (!special_alt->new_len) {
- *new_insn = fake_jump;
- return 0;
- }
-
- last_new_insn = NULL;
- insn = *new_insn;
- sec_for_each_insn_from(file, insn) {
- if (insn->offset >= special_alt->new_off + special_alt->new_len)
- break;
-
- last_new_insn = insn;
-
- if (insn->type != INSN_JUMP_CONDITIONAL &&
- insn->type != INSN_JUMP_UNCONDITIONAL)
- continue;
-
- if (!insn->immediate)
- continue;
-
- dest_off = insn->offset + insn->len + insn->immediate;
- if (dest_off == special_alt->new_off + special_alt->new_len)
- insn->jump_dest = fake_jump;
-
- if (!insn->jump_dest) {
- WARN_FUNC("can't find alternative jump destination",
- insn->sec, insn->offset);
- return -1;
- }
- }
-
- if (!last_new_insn) {
- WARN_FUNC("can't find last new alternative instruction",
- special_alt->new_sec, special_alt->new_off);
- return -1;
- }
-
- list_add(&fake_jump->list, &last_new_insn->list);
-
- return 0;
-}
-
-/*
- * A jump table entry can either convert a nop to a jump or a jump to a nop.
- * If the original instruction is a jump, make the alt entry an effective nop
- * by just skipping the original instruction.
- */
-static int handle_jump_alt(struct objtool_file *file,
- struct special_alt *special_alt,
- struct instruction *orig_insn,
- struct instruction **new_insn)
-{
- if (orig_insn->type == INSN_NOP)
- return 0;
-
- if (orig_insn->type != INSN_JUMP_UNCONDITIONAL) {
- WARN_FUNC("unsupported instruction at jump label",
- orig_insn->sec, orig_insn->offset);
- return -1;
- }
-
- *new_insn = list_next_entry(orig_insn, list);
- return 0;
-}
-
-/*
- * Read all the special sections which have alternate instructions which can be
- * patched in or redirected to at runtime. Each instruction having alternate
- * instruction(s) has them added to its insn->alts list, which will be
- * traversed in validate_branch().
- */
-static int add_special_section_alts(struct objtool_file *file)
-{
- struct list_head special_alts;
- struct instruction *orig_insn, *new_insn;
- struct special_alt *special_alt, *tmp;
- struct alternative *alt;
- int ret;
-
- ret = special_get_alts(file->elf, &special_alts);
- if (ret)
- return ret;
-
- list_for_each_entry_safe(special_alt, tmp, &special_alts, list) {
- alt = malloc(sizeof(*alt));
- if (!alt) {
- WARN("malloc failed");
- ret = -1;
- goto out;
- }
-
- orig_insn = find_insn(file, special_alt->orig_sec,
- special_alt->orig_off);
- if (!orig_insn) {
- WARN_FUNC("special: can't find orig instruction",
- special_alt->orig_sec, special_alt->orig_off);
- ret = -1;
- goto out;
- }
+#include "check.h"
- new_insn = NULL;
- if (!special_alt->group || special_alt->new_len) {
- new_insn = find_insn(file, special_alt->new_sec,
- special_alt->new_off);
- if (!new_insn) {
- WARN_FUNC("special: can't find new instruction",
- special_alt->new_sec,
- special_alt->new_off);
- ret = -1;
- goto out;
- }
- }
+bool nofp;
- if (special_alt->group) {
- ret = handle_group_alt(file, special_alt, orig_insn,
- &new_insn);
- if (ret)
- goto out;
- } else if (special_alt->jump_or_nop) {
- ret = handle_jump_alt(file, special_alt, orig_insn,
- &new_insn);
- if (ret)
- goto out;
- }
-
- alt->insn = new_insn;
- list_add_tail(&alt->list, &orig_insn->alts);
-
- list_del(&special_alt->list);
- free(special_alt);
- }
-
-out:
- return ret;
-}
-
-static int add_switch_table(struct objtool_file *file, struct symbol *func,
- struct instruction *insn, struct rela *table,
- struct rela *next_table)
-{
- struct rela *rela = table;
- struct instruction *alt_insn;
- struct alternative *alt;
-
- list_for_each_entry_from(rela, &file->rodata->rela->rela_list, list) {
- if (rela == next_table)
- break;
-
- if (rela->sym->sec != insn->sec ||
- rela->addend <= func->offset ||
- rela->addend >= func->offset + func->len)
- break;
-
- alt_insn = find_insn(file, insn->sec, rela->addend);
- if (!alt_insn) {
- WARN("%s: can't find instruction at %s+0x%x",
- file->rodata->rela->name, insn->sec->name,
- rela->addend);
- return -1;
- }
-
- alt = malloc(sizeof(*alt));
- if (!alt) {
- WARN("malloc failed");
- return -1;
- }
-
- alt->insn = alt_insn;
- list_add_tail(&alt->list, &insn->alts);
- }
-
- return 0;
-}
-
-/*
- * find_switch_table() - Given a dynamic jump, find the switch jump table in
- * .rodata associated with it.
- *
- * There are 3 basic patterns:
- *
- * 1. jmpq *[rodata addr](,%reg,8)
- *
- * This is the most common case by far. It jumps to an address in a simple
- * jump table which is stored in .rodata.
- *
- * 2. jmpq *[rodata addr](%rip)
- *
- * This is caused by a rare GCC quirk, currently only seen in three driver
- * functions in the kernel, only with certain obscure non-distro configs.
- *
- * As part of an optimization, GCC makes a copy of an existing switch jump
- * table, modifies it, and then hard-codes the jump (albeit with an indirect
- * jump) to use a single entry in the table. The rest of the jump table and
- * some of its jump targets remain as dead code.
- *
- * In such a case we can just crudely ignore all unreachable instruction
- * warnings for the entire object file. Ideally we would just ignore them
- * for the function, but that would require redesigning the code quite a
- * bit. And honestly that's just not worth doing: unreachable instruction
- * warnings are of questionable value anyway, and this is such a rare issue.
- *
- * 3. mov [rodata addr],%reg1
- * ... some instructions ...
- * jmpq *(%reg1,%reg2,8)
- *
- * This is a fairly uncommon pattern which is new for GCC 6. As of this
- * writing, there are 11 occurrences of it in the allmodconfig kernel.
- *
- * TODO: Once we have DWARF CFI and smarter instruction decoding logic,
- * ensure the same register is used in the mov and jump instructions.
- */
-static struct rela *find_switch_table(struct objtool_file *file,
- struct symbol *func,
- struct instruction *insn)
-{
- struct rela *text_rela, *rodata_rela;
- struct instruction *orig_insn = insn;
-
- text_rela = find_rela_by_dest_range(insn->sec, insn->offset, insn->len);
- if (text_rela && text_rela->sym == file->rodata->sym) {
- /* case 1 */
- rodata_rela = find_rela_by_dest(file->rodata,
- text_rela->addend);
- if (rodata_rela)
- return rodata_rela;
-
- /* case 2 */
- rodata_rela = find_rela_by_dest(file->rodata,
- text_rela->addend + 4);
- if (!rodata_rela)
- return NULL;
- file->ignore_unreachables = true;
- return rodata_rela;
- }
-
- /* case 3 */
- func_for_each_insn_continue_reverse(file, func, insn) {
- if (insn->type == INSN_JUMP_DYNAMIC)
- break;
-
- /* allow small jumps within the range */
- if (insn->type == INSN_JUMP_UNCONDITIONAL &&
- insn->jump_dest &&
- (insn->jump_dest->offset <= insn->offset ||
- insn->jump_dest->offset > orig_insn->offset))
- break;
-
- /* look for a relocation which references .rodata */
- text_rela = find_rela_by_dest_range(insn->sec, insn->offset,
- insn->len);
- if (!text_rela || text_rela->sym != file->rodata->sym)
- continue;
-
- /*
- * Make sure the .rodata address isn't associated with a
- * symbol. gcc jump tables are anonymous data.
- */
- if (find_symbol_containing(file->rodata, text_rela->addend))
- continue;
-
- return find_rela_by_dest(file->rodata, text_rela->addend);
- }
-
- return NULL;
-}
-
-static int add_func_switch_tables(struct objtool_file *file,
- struct symbol *func)
-{
- struct instruction *insn, *prev_jump = NULL;
- struct rela *rela, *prev_rela = NULL;
- int ret;
-
- func_for_each_insn(file, func, insn) {
- if (insn->type != INSN_JUMP_DYNAMIC)
- continue;
-
- rela = find_switch_table(file, func, insn);
- if (!rela)
- continue;
-
- /*
- * We found a switch table, but we don't know yet how big it
- * is. Don't add it until we reach the end of the function or
- * the beginning of another switch table in the same function.
- */
- if (prev_jump) {
- ret = add_switch_table(file, func, prev_jump, prev_rela,
- rela);
- if (ret)
- return ret;
- }
-
- prev_jump = insn;
- prev_rela = rela;
- }
-
- if (prev_jump) {
- ret = add_switch_table(file, func, prev_jump, prev_rela, NULL);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/*
- * For some switch statements, gcc generates a jump table in the .rodata
- * section which contains a list of addresses within the function to jump to.
- * This finds these jump tables and adds them to the insn->alts lists.
- */
-static int add_switch_table_alts(struct objtool_file *file)
-{
- struct section *sec;
- struct symbol *func;
- int ret;
-
- if (!file->rodata || !file->rodata->rela)
- return 0;
-
- list_for_each_entry(sec, &file->elf->sections, list) {
- list_for_each_entry(func, &sec->symbol_list, list) {
- if (func->type != STT_FUNC)
- continue;
-
- ret = add_func_switch_tables(file, func);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
-static int decode_sections(struct objtool_file *file)
-{
- int ret;
-
- ret = decode_instructions(file);
- if (ret)
- return ret;
-
- ret = add_dead_ends(file);
- if (ret)
- return ret;
-
- add_ignores(file);
-
- ret = add_jump_destinations(file);
- if (ret)
- return ret;
-
- ret = add_call_destinations(file);
- if (ret)
- return ret;
-
- ret = add_special_section_alts(file);
- if (ret)
- return ret;
-
- ret = add_switch_table_alts(file);
- if (ret)
- return ret;
-
- return 0;
-}
-
-static bool is_fentry_call(struct instruction *insn)
-{
- if (insn->type == INSN_CALL &&
- insn->call_dest->type == STT_NOTYPE &&
- !strcmp(insn->call_dest->name, "__fentry__"))
- return true;
-
- return false;
-}
-
-static bool has_modified_stack_frame(struct instruction *insn)
-{
- return (insn->state & STATE_FP_SAVED) ||
- (insn->state & STATE_FP_SETUP);
-}
-
-static bool has_valid_stack_frame(struct instruction *insn)
-{
- return (insn->state & STATE_FP_SAVED) &&
- (insn->state & STATE_FP_SETUP);
-}
-
-static unsigned int frame_state(unsigned long state)
-{
- return (state & (STATE_FP_SAVED | STATE_FP_SETUP));
-}
-
-/*
- * Follow the branch starting at the given instruction, and recursively follow
- * any other branches (jumps). Meanwhile, track the frame pointer state at
- * each instruction and validate all the rules described in
- * tools/objtool/Documentation/stack-validation.txt.
- */
-static int validate_branch(struct objtool_file *file,
- struct instruction *first, unsigned char first_state)
-{
- struct alternative *alt;
- struct instruction *insn;
- struct section *sec;
- struct symbol *func = NULL;
- unsigned char state;
- int ret;
-
- insn = first;
- sec = insn->sec;
- state = first_state;
-
- if (insn->alt_group && list_empty(&insn->alts)) {
- WARN_FUNC("don't know how to handle branch to middle of alternative instruction group",
- sec, insn->offset);
- return 1;
- }
-
- while (1) {
- if (file->c_file && insn->func) {
- if (func && func != insn->func) {
- WARN("%s() falls through to next function %s()",
- func->name, insn->func->name);
- return 1;
- }
-
- func = insn->func;
- }
-
- if (insn->visited) {
- if (frame_state(insn->state) != frame_state(state)) {
- WARN_FUNC("frame pointer state mismatch",
- sec, insn->offset);
- return 1;
- }
-
- return 0;
- }
-
- insn->visited = true;
- insn->state = state;
-
- list_for_each_entry(alt, &insn->alts, list) {
- ret = validate_branch(file, alt->insn, state);
- if (ret)
- return 1;
- }
-
- switch (insn->type) {
-
- case INSN_FP_SAVE:
- if (!nofp) {
- if (state & STATE_FP_SAVED) {
- WARN_FUNC("duplicate frame pointer save",
- sec, insn->offset);
- return 1;
- }
- state |= STATE_FP_SAVED;
- }
- break;
-
- case INSN_FP_SETUP:
- if (!nofp) {
- if (state & STATE_FP_SETUP) {
- WARN_FUNC("duplicate frame pointer setup",
- sec, insn->offset);
- return 1;
- }
- state |= STATE_FP_SETUP;
- }
- break;
-
- case INSN_FP_RESTORE:
- if (!nofp) {
- if (has_valid_stack_frame(insn))
- state &= ~STATE_FP_SETUP;
-
- state &= ~STATE_FP_SAVED;
- }
- break;
-
- case INSN_RETURN:
- if (!nofp && has_modified_stack_frame(insn)) {
- WARN_FUNC("return without frame pointer restore",
- sec, insn->offset);
- return 1;
- }
- return 0;
-
- case INSN_CALL:
- if (is_fentry_call(insn)) {
- state |= STATE_FENTRY;
- break;
- }
-
- ret = dead_end_function(file, insn->call_dest);
- if (ret == 1)
- return 0;
- if (ret == -1)
- return 1;
-
- /* fallthrough */
- case INSN_CALL_DYNAMIC:
- if (!nofp && !has_valid_stack_frame(insn)) {
- WARN_FUNC("call without frame pointer save/setup",
- sec, insn->offset);
- return 1;
- }
- break;
-
- case INSN_JUMP_CONDITIONAL:
- case INSN_JUMP_UNCONDITIONAL:
- if (insn->jump_dest) {
- ret = validate_branch(file, insn->jump_dest,
- state);
- if (ret)
- return 1;
- } else if (has_modified_stack_frame(insn)) {
- WARN_FUNC("sibling call from callable instruction with changed frame pointer",
- sec, insn->offset);
- return 1;
- } /* else it's a sibling call */
-
- if (insn->type == INSN_JUMP_UNCONDITIONAL)
- return 0;
-
- break;
-
- case INSN_JUMP_DYNAMIC:
- if (list_empty(&insn->alts) &&
- has_modified_stack_frame(insn)) {
- WARN_FUNC("sibling call from callable instruction with changed frame pointer",
- sec, insn->offset);
- return 1;
- }
-
- return 0;
-
- default:
- break;
- }
-
- if (insn->dead_end)
- return 0;
-
- insn = next_insn_same_sec(file, insn);
- if (!insn) {
- WARN("%s: unexpected end of section", sec->name);
- return 1;
- }
- }
-
- return 0;
-}
-
-static bool is_kasan_insn(struct instruction *insn)
-{
- return (insn->type == INSN_CALL &&
- !strcmp(insn->call_dest->name, "__asan_handle_no_return"));
-}
-
-static bool is_ubsan_insn(struct instruction *insn)
-{
- return (insn->type == INSN_CALL &&
- !strcmp(insn->call_dest->name,
- "__ubsan_handle_builtin_unreachable"));
-}
-
-static bool ignore_unreachable_insn(struct symbol *func,
- struct instruction *insn)
-{
- int i;
-
- if (insn->type == INSN_NOP)
- return true;
-
- /*
- * Check if this (or a subsequent) instruction is related to
- * CONFIG_UBSAN or CONFIG_KASAN.
- *
- * End the search at 5 instructions to avoid going into the weeds.
- */
- for (i = 0; i < 5; i++) {
-
- if (is_kasan_insn(insn) || is_ubsan_insn(insn))
- return true;
-
- if (insn->type == INSN_JUMP_UNCONDITIONAL && insn->jump_dest) {
- insn = insn->jump_dest;
- continue;
- }
-
- if (insn->offset + insn->len >= func->offset + func->len)
- break;
- insn = list_next_entry(insn, list);
- }
-
- return false;
-}
-
-static int validate_functions(struct objtool_file *file)
-{
- struct section *sec;
- struct symbol *func;
- struct instruction *insn;
- int ret, warnings = 0;
-
- list_for_each_entry(sec, &file->elf->sections, list) {
- list_for_each_entry(func, &sec->symbol_list, list) {
- if (func->type != STT_FUNC)
- continue;
-
- insn = find_insn(file, sec, func->offset);
- if (!insn)
- continue;
-
- ret = validate_branch(file, insn, 0);
- warnings += ret;
- }
- }
-
- list_for_each_entry(sec, &file->elf->sections, list) {
- list_for_each_entry(func, &sec->symbol_list, list) {
- if (func->type != STT_FUNC)
- continue;
-
- func_for_each_insn(file, func, insn) {
- if (insn->visited)
- continue;
-
- insn->visited = true;
-
- if (file->ignore_unreachables || warnings ||
- ignore_unreachable_insn(func, insn))
- continue;
-
- /*
- * gcov produces a lot of unreachable
- * instructions. If we get an unreachable
- * warning and the file has gcov enabled, just
- * ignore it, and all other such warnings for
- * the file.
- */
- if (!file->ignore_unreachables &&
- gcov_enabled(file)) {
- file->ignore_unreachables = true;
- continue;
- }
-
- WARN_FUNC("function has unreachable instruction", insn->sec, insn->offset);
- warnings++;
- }
- }
- }
-
- return warnings;
-}
-
-static int validate_uncallable_instructions(struct objtool_file *file)
-{
- struct instruction *insn;
- int warnings = 0;
-
- for_each_insn(file, insn) {
- if (!insn->visited && insn->type == INSN_RETURN) {
- WARN_FUNC("return instruction outside of a callable function",
- insn->sec, insn->offset);
- warnings++;
- }
- }
-
- return warnings;
-}
-
-static void cleanup(struct objtool_file *file)
-{
- struct instruction *insn, *tmpinsn;
- struct alternative *alt, *tmpalt;
-
- list_for_each_entry_safe(insn, tmpinsn, &file->insn_list, list) {
- list_for_each_entry_safe(alt, tmpalt, &insn->alts, list) {
- list_del(&alt->list);
- free(alt);
- }
- list_del(&insn->list);
- hash_del(&insn->hash);
- free(insn);
- }
- elf_close(file->elf);
-}
-
-const char * const check_usage[] = {
+static const char * const check_usage[] = {
"objtool check [<options>] file.o",
NULL,
};
+const struct option check_options[] = {
+ OPT_BOOLEAN('f', "no-fp", &nofp, "Skip frame pointer validation"),
+ OPT_END(),
+};
+
int cmd_check(int argc, const char **argv)
{
- struct objtool_file file;
- int ret, warnings = 0;
-
- const struct option options[] = {
- OPT_BOOLEAN('f', "no-fp", &nofp, "Skip frame pointer validation"),
- OPT_END(),
- };
+ const char *objname;
- argc = parse_options(argc, argv, options, check_usage, 0);
+ argc = parse_options(argc, argv, check_options, check_usage, 0);
if (argc != 1)
- usage_with_options(check_usage, options);
+ usage_with_options(check_usage, check_options);
objname = argv[0];
- file.elf = elf_open(objname);
- if (!file.elf) {
- fprintf(stderr, "error reading elf file %s\n", objname);
- return 1;
- }
-
- INIT_LIST_HEAD(&file.insn_list);
- hash_init(file.insn_hash);
- file.whitelist = find_section_by_name(file.elf, ".discard.func_stack_frame_non_standard");
- file.rodata = find_section_by_name(file.elf, ".rodata");
- file.ignore_unreachables = false;
- file.c_file = find_section_by_name(file.elf, ".comment");
-
- ret = decode_sections(&file);
- if (ret < 0)
- goto out;
- warnings += ret;
-
- ret = validate_functions(&file);
- if (ret < 0)
- goto out;
- warnings += ret;
-
- ret = validate_uncallable_instructions(&file);
- if (ret < 0)
- goto out;
- warnings += ret;
-
-out:
- cleanup(&file);
-
- /* ignore warnings for now until we get all the code cleaned up */
- if (ret || warnings)
- return 0;
- return 0;
+ return check(objname, nofp);
}
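
With this change the subcommand is reduced to option parsing; all of the
decoding and validation work moves behind a single entry point in check.c.
Going only by the call above, the interface exported by the new check.h is
presumably along these lines (a sketch of the assumed prototype, not the
verbatim header):

	/*
	 * Assumed shape of the new entry point: validate one object file,
	 * optionally skipping frame-pointer validation.  The exact return
	 * convention is not visible in this hunk; cmd_check() simply
	 * propagates it as the subcommand's return value.
	 */
	int check(const char *objname, bool nofp);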
diff --git a/tools/objtool/cfi.h b/tools/objtool/cfi.h
new file mode 100644
index 000000000000..443ab2c69992
--- /dev/null
+++ b/tools/objtool/cfi.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2015-2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _OBJTOOL_CFI_H
+#define _OBJTOOL_CFI_H
+
+#define CFI_UNDEFINED -1
+#define CFI_CFA -2
+#define CFI_SP_INDIRECT -3
+#define CFI_BP_INDIRECT -4
+
+#define CFI_AX 0
+#define CFI_DX 1
+#define CFI_CX 2
+#define CFI_BX 3
+#define CFI_SI 4
+#define CFI_DI 5
+#define CFI_BP 6
+#define CFI_SP 7
+#define CFI_R8 8
+#define CFI_R9 9
+#define CFI_R10 10
+#define CFI_R11 11
+#define CFI_R12 12
+#define CFI_R13 13
+#define CFI_R14 14
+#define CFI_R15 15
+#define CFI_RA 16
+#define CFI_NUM_REGS 17
+
+struct cfi_reg {
+ int base;
+ int offset;
+};
+
+struct cfi_state {
+ struct cfi_reg cfa;
+ struct cfi_reg regs[CFI_NUM_REGS];
+};
+
+#endif /* _OBJTOOL_CFI_H */
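
The two structures above are the whole register-tracking vocabulary: cfa
records what the Canonical Frame Address is currently based on (a register
plus an offset), and regs[] records where each callee-saved register was
stashed relative to it.  On x86-64 the state at function entry would plausibly
look like the sketch below; the real initial state is filled in by the arch
code and merely mirrors the calling convention (return address already pushed,
nothing else tracked):

	struct cfi_state entry;
	int i;

	for (i = 0; i < CFI_NUM_REGS; i++) {
		entry.regs[i].base = CFI_UNDEFINED;	/* not tracked yet */
		entry.regs[i].offset = 0;
	}

	entry.cfa.base = CFI_SP;		/* CFA = %rsp + 8: only the   */
	entry.cfa.offset = 8;			/* return address is on stack */

	entry.regs[CFI_RA].base = CFI_CFA;	/* return address at CFA - 8  */
	entry.regs[CFI_RA].offset = -8;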
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
new file mode 100644
index 000000000000..fea222192c57
--- /dev/null
+++ b/tools/objtool/check.c
@@ -0,0 +1,1655 @@
+/*
+ * Copyright (C) 2015-2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <string.h>
+#include <stdlib.h>
+
+#include "check.h"
+#include "elf.h"
+#include "special.h"
+#include "arch.h"
+#include "warn.h"
+
+#include <linux/hashtable.h>
+#include <linux/kernel.h>
+
+struct alternative {
+ struct list_head list;
+ struct instruction *insn;
+};
+
+const char *objname;
+static bool nofp;
+struct cfi_state initial_func_cfi;
+
+static struct instruction *find_insn(struct objtool_file *file,
+ struct section *sec, unsigned long offset)
+{
+ struct instruction *insn;
+
+ hash_for_each_possible(file->insn_hash, insn, hash, offset)
+ if (insn->sec == sec && insn->offset == offset)
+ return insn;
+
+ return NULL;
+}
+
+static struct instruction *next_insn_same_sec(struct objtool_file *file,
+ struct instruction *insn)
+{
+ struct instruction *next = list_next_entry(insn, list);
+
+ if (!next || &next->list == &file->insn_list || next->sec != insn->sec)
+ return NULL;
+
+ return next;
+}
+
+static bool gcov_enabled(struct objtool_file *file)
+{
+ struct section *sec;
+ struct symbol *sym;
+
+ for_each_sec(file, sec)
+ list_for_each_entry(sym, &sec->symbol_list, list)
+ if (!strncmp(sym->name, "__gcov_.", 8))
+ return true;
+
+ return false;
+}
+
+#define func_for_each_insn(file, func, insn) \
+ for (insn = find_insn(file, func->sec, func->offset); \
+ insn && &insn->list != &file->insn_list && \
+ insn->sec == func->sec && \
+ insn->offset < func->offset + func->len; \
+ insn = list_next_entry(insn, list))
+
+#define func_for_each_insn_continue_reverse(file, func, insn) \
+ for (insn = list_prev_entry(insn, list); \
+ &insn->list != &file->insn_list && \
+ insn->sec == func->sec && insn->offset >= func->offset; \
+ insn = list_prev_entry(insn, list))
+
+#define sec_for_each_insn_from(file, insn) \
+ for (; insn; insn = next_insn_same_sec(file, insn))
+
+#define sec_for_each_insn_continue(file, insn) \
+ for (insn = next_insn_same_sec(file, insn); insn; \
+ insn = next_insn_same_sec(file, insn))
+
+/*
+ * Check if the function has been manually whitelisted with the
+ * STACK_FRAME_NON_STANDARD macro, or if it should be automatically whitelisted
+ * due to its use of a context switching instruction.
+ */
+static bool ignore_func(struct objtool_file *file, struct symbol *func)
+{
+ struct rela *rela;
+ struct instruction *insn;
+
+ /* check for STACK_FRAME_NON_STANDARD */
+ if (file->whitelist && file->whitelist->rela)
+ list_for_each_entry(rela, &file->whitelist->rela->rela_list, list) {
+ if (rela->sym->type == STT_SECTION &&
+ rela->sym->sec == func->sec &&
+ rela->addend == func->offset)
+ return true;
+ if (rela->sym->type == STT_FUNC && rela->sym == func)
+ return true;
+ }
+
+ /* check if it has a context switching instruction */
+ func_for_each_insn(file, func, insn)
+ if (insn->type == INSN_CONTEXT_SWITCH)
+ return true;
+
+ return false;
+}
+
+/*
+ * This checks to see if the given function is a "noreturn" function.
+ *
+ * For global functions which are outside the scope of this object file, we
+ * have to keep a manual list of them.
+ *
+ * For local functions, we have to detect them manually by simply looking for
+ * the lack of a return instruction.
+ *
+ * Returns:
+ * -1: error
+ * 0: no dead end
+ * 1: dead end
+ */
+static int __dead_end_function(struct objtool_file *file, struct symbol *func,
+ int recursion)
+{
+ int i;
+ struct instruction *insn;
+ bool empty = true;
+
+ /*
+ * Unfortunately these have to be hard coded because the noreturn
+ * attribute isn't provided in ELF data.
+ */
+ static const char * const global_noreturns[] = {
+ "__stack_chk_fail",
+ "panic",
+ "do_exit",
+ "do_task_dead",
+ "__module_put_and_exit",
+ "complete_and_exit",
+ "kvm_spurious_fault",
+ "__reiserfs_panic",
+ "lbug_with_loc",
+ "fortify_panic",
+ };
+
+ if (func->bind == STB_WEAK)
+ return 0;
+
+ if (func->bind == STB_GLOBAL)
+ for (i = 0; i < ARRAY_SIZE(global_noreturns); i++)
+ if (!strcmp(func->name, global_noreturns[i]))
+ return 1;
+
+ if (!func->sec)
+ return 0;
+
+ func_for_each_insn(file, func, insn) {
+ empty = false;
+
+ if (insn->type == INSN_RETURN)
+ return 0;
+ }
+
+ if (empty)
+ return 0;
+
+ /*
+ * A function can have a sibling call instead of a return. In that
+ * case, the function's dead-end status depends on whether the target
+ * of the sibling call returns.
+ */
+ func_for_each_insn(file, func, insn) {
+ if (insn->sec != func->sec ||
+ insn->offset >= func->offset + func->len)
+ break;
+
+ if (insn->type == INSN_JUMP_UNCONDITIONAL) {
+ struct instruction *dest = insn->jump_dest;
+ struct symbol *dest_func;
+
+ if (!dest)
+ /* sibling call to another file */
+ return 0;
+
+ if (dest->sec != func->sec ||
+ dest->offset < func->offset ||
+ dest->offset >= func->offset + func->len) {
+ /* local sibling call */
+ dest_func = find_symbol_by_offset(dest->sec,
+ dest->offset);
+ if (!dest_func)
+ continue;
+
+ if (recursion == 5) {
+ WARN_FUNC("infinite recursion (objtool bug!)",
+ dest->sec, dest->offset);
+ return -1;
+ }
+
+ return __dead_end_function(file, dest_func,
+ recursion + 1);
+ }
+ }
+
+ if (insn->type == INSN_JUMP_DYNAMIC && list_empty(&insn->alts))
+ /* sibling call */
+ return 0;
+ }
+
+ return 1;
+}
+
+static int dead_end_function(struct objtool_file *file, struct symbol *func)
+{
+ return __dead_end_function(file, func, 0);
+}
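
As a concrete illustration of the local-function case: with C like the snippet
below, the compiler already knows panic() cannot return, so the helper is
emitted without any ret instruction, and the scan above classifies it as a
dead end even though nothing in the ELF data marks it noreturn (the helper
name is made up for the example):

	static void die_badly(const char *msg)
	{
		/* panic() is on the global_noreturns list above */
		panic("fatal objtool example: %s\n", msg);
		/* no return instruction is emitted after this point */
	}

Any call to die_badly() is then itself a code-path dead end as far as
validate_branch() is concerned.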
+
+static void clear_insn_state(struct insn_state *state)
+{
+ int i;
+
+ memset(state, 0, sizeof(*state));
+ state->cfa.base = CFI_UNDEFINED;
+ for (i = 0; i < CFI_NUM_REGS; i++)
+ state->regs[i].base = CFI_UNDEFINED;
+ state->drap_reg = CFI_UNDEFINED;
+}
+
+/*
+ * Call the arch-specific instruction decoder for all the instructions and add
+ * them to the global instruction list.
+ */
+static int decode_instructions(struct objtool_file *file)
+{
+ struct section *sec;
+ struct symbol *func;
+ unsigned long offset;
+ struct instruction *insn;
+ int ret;
+
+ for_each_sec(file, sec) {
+
+ if (!(sec->sh.sh_flags & SHF_EXECINSTR))
+ continue;
+
+ for (offset = 0; offset < sec->len; offset += insn->len) {
+ insn = malloc(sizeof(*insn));
+ if (!insn) {
+ WARN("malloc failed");
+ return -1;
+ }
+ memset(insn, 0, sizeof(*insn));
+ INIT_LIST_HEAD(&insn->alts);
+ clear_insn_state(&insn->state);
+
+ insn->sec = sec;
+ insn->offset = offset;
+
+ ret = arch_decode_instruction(file->elf, sec, offset,
+ sec->len - offset,
+ &insn->len, &insn->type,
+ &insn->immediate,
+ &insn->stack_op);
+ if (ret)
+ return ret;
+
+ if (!insn->type || insn->type > INSN_LAST) {
+ WARN_FUNC("invalid instruction type %d",
+ insn->sec, insn->offset, insn->type);
+ return -1;
+ }
+
+ hash_add(file->insn_hash, &insn->hash, insn->offset);
+ list_add_tail(&insn->list, &file->insn_list);
+ }
+
+ list_for_each_entry(func, &sec->symbol_list, list) {
+ if (func->type != STT_FUNC)
+ continue;
+
+ if (!find_insn(file, sec, func->offset)) {
+ WARN("%s(): can't find starting instruction",
+ func->name);
+ return -1;
+ }
+
+ func_for_each_insn(file, func, insn)
+ if (!insn->func)
+ insn->func = func;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Find all uses of the unreachable() macro, which are code path dead ends.
+ */
+static int add_dead_ends(struct objtool_file *file)
+{
+ struct section *sec;
+ struct rela *rela;
+ struct instruction *insn;
+ bool found;
+
+ sec = find_section_by_name(file->elf, ".rela.discard.unreachable");
+ if (!sec)
+ return 0;
+
+ list_for_each_entry(rela, &sec->rela_list, list) {
+ if (rela->sym->type != STT_SECTION) {
+ WARN("unexpected relocation symbol type in %s", sec->name);
+ return -1;
+ }
+ insn = find_insn(file, rela->sym->sec, rela->addend);
+ if (insn)
+ insn = list_prev_entry(insn, list);
+ else if (rela->addend == rela->sym->sec->len) {
+ found = false;
+ list_for_each_entry_reverse(insn, &file->insn_list, list) {
+ if (insn->sec == rela->sym->sec) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ WARN("can't find unreachable insn at %s+0x%x",
+ rela->sym->sec->name, rela->addend);
+ return -1;
+ }
+ } else {
+ WARN("can't find unreachable insn at %s+0x%x",
+ rela->sym->sec->name, rela->addend);
+ return -1;
+ }
+
+ insn->dead_end = true;
+ }
+
+ return 0;
+}
+
+/*
+ * Warnings shouldn't be reported for ignored functions.
+ */
+static void add_ignores(struct objtool_file *file)
+{
+ struct instruction *insn;
+ struct section *sec;
+ struct symbol *func;
+
+ for_each_sec(file, sec) {
+ list_for_each_entry(func, &sec->symbol_list, list) {
+ if (func->type != STT_FUNC)
+ continue;
+
+ if (!ignore_func(file, func))
+ continue;
+
+ func_for_each_insn(file, func, insn)
+ insn->ignore = true;
+ }
+ }
+}
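
For reference, the manual whitelist mentioned above works by recording the
function's address in the .discard.func_stack_frame_non_standard section that
ignore_func() walks; the kernel-side STACK_FRAME_NON_STANDARD macro is roughly
of the following shape (a sketch, the authoritative definition lives in
include/linux/frame.h):

	#define STACK_FRAME_NON_STANDARD(func)					\
		static void *__func_stack_frame_non_standard_##func __used	\
			__attribute__((section(".discard.func_stack_frame_non_standard"))) \
			= func

A typical use is a STACK_FRAME_NON_STANDARD(some_asm_heavy_function); line next
to the function definition, after which every instruction in that function is
marked ignore and none of the warnings below apply to it.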
+
+/*
+ * Find the destination instructions for all jumps.
+ */
+static int add_jump_destinations(struct objtool_file *file)
+{
+ struct instruction *insn;
+ struct rela *rela;
+ struct section *dest_sec;
+ unsigned long dest_off;
+
+ for_each_insn(file, insn) {
+ if (insn->type != INSN_JUMP_CONDITIONAL &&
+ insn->type != INSN_JUMP_UNCONDITIONAL)
+ continue;
+
+ if (insn->ignore)
+ continue;
+
+ rela = find_rela_by_dest_range(insn->sec, insn->offset,
+ insn->len);
+ if (!rela) {
+ dest_sec = insn->sec;
+ dest_off = insn->offset + insn->len + insn->immediate;
+ } else if (rela->sym->type == STT_SECTION) {
+ dest_sec = rela->sym->sec;
+ dest_off = rela->addend + 4;
+ } else if (rela->sym->sec->idx) {
+ dest_sec = rela->sym->sec;
+ dest_off = rela->sym->sym.st_value + rela->addend + 4;
+ } else {
+ /* sibling call */
+ insn->jump_dest = 0;
+ continue;
+ }
+
+ insn->jump_dest = find_insn(file, dest_sec, dest_off);
+ if (!insn->jump_dest) {
+
+ /*
+ * This is a special case where an alt instruction
+ * jumps past the end of the section. These are
+ * handled later in handle_group_alt().
+ */
+ if (!strcmp(insn->sec->name, ".altinstr_replacement"))
+ continue;
+
+ WARN_FUNC("can't find jump dest instruction at %s+0x%lx",
+ insn->sec, insn->offset, dest_sec->name,
+ dest_off);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Find the destination instructions for all calls.
+ */
+static int add_call_destinations(struct objtool_file *file)
+{
+ struct instruction *insn;
+ unsigned long dest_off;
+ struct rela *rela;
+
+ for_each_insn(file, insn) {
+ if (insn->type != INSN_CALL)
+ continue;
+
+ rela = find_rela_by_dest_range(insn->sec, insn->offset,
+ insn->len);
+ if (!rela) {
+ dest_off = insn->offset + insn->len + insn->immediate;
+ insn->call_dest = find_symbol_by_offset(insn->sec,
+ dest_off);
+ if (!insn->call_dest) {
+ WARN_FUNC("can't find call dest symbol at offset 0x%lx",
+ insn->sec, insn->offset, dest_off);
+ return -1;
+ }
+ } else if (rela->sym->type == STT_SECTION) {
+ insn->call_dest = find_symbol_by_offset(rela->sym->sec,
+ rela->addend+4);
+ if (!insn->call_dest ||
+ insn->call_dest->type != STT_FUNC) {
+ WARN_FUNC("can't find call dest symbol at %s+0x%x",
+ insn->sec, insn->offset,
+ rela->sym->sec->name,
+ rela->addend + 4);
+ return -1;
+ }
+ } else
+ insn->call_dest = rela->sym;
+ }
+
+ return 0;
+}
+
+/*
+ * The .alternatives section requires some extra special care, over and above
+ * what other special sections require:
+ *
+ * 1. Because alternatives are patched in-place, we need to insert a fake jump
+ * instruction at the end so that validate_branch() skips all the original
+ * replaced instructions when validating the new instruction path.
+ *
+ * 2. An added wrinkle is that the new instruction length might be zero. In
+ * that case the old instructions are replaced with noops. We simulate that
+ * by creating a fake jump as the only new instruction.
+ *
+ * 3. In some cases, the alternative section includes an instruction which
+ * conditionally jumps to the _end_ of the entry. We have to modify these
+ * jumps' destinations to point back to .text rather than the end of the
+ * entry in .altinstr_replacement.
+ *
+ * 4. It has been requested that we don't validate the !POPCNT feature path
+ * which is a "very very small percentage of machines".
+ */
+static int handle_group_alt(struct objtool_file *file,
+ struct special_alt *special_alt,
+ struct instruction *orig_insn,
+ struct instruction **new_insn)
+{
+ struct instruction *last_orig_insn, *last_new_insn, *insn, *fake_jump;
+ unsigned long dest_off;
+
+ last_orig_insn = NULL;
+ insn = orig_insn;
+ sec_for_each_insn_from(file, insn) {
+ if (insn->offset >= special_alt->orig_off + special_alt->orig_len)
+ break;
+
+ if (special_alt->skip_orig)
+ insn->type = INSN_NOP;
+
+ insn->alt_group = true;
+ last_orig_insn = insn;
+ }
+
+ if (!next_insn_same_sec(file, last_orig_insn)) {
+ WARN("%s: don't know how to handle alternatives at end of section",
+ special_alt->orig_sec->name);
+ return -1;
+ }
+
+ fake_jump = malloc(sizeof(*fake_jump));
+ if (!fake_jump) {
+ WARN("malloc failed");
+ return -1;
+ }
+ memset(fake_jump, 0, sizeof(*fake_jump));
+ INIT_LIST_HEAD(&fake_jump->alts);
+ clear_insn_state(&fake_jump->state);
+
+ fake_jump->sec = special_alt->new_sec;
+ fake_jump->offset = -1;
+ fake_jump->type = INSN_JUMP_UNCONDITIONAL;
+ fake_jump->jump_dest = list_next_entry(last_orig_insn, list);
+ fake_jump->ignore = true;
+
+ if (!special_alt->new_len) {
+ *new_insn = fake_jump;
+ return 0;
+ }
+
+ last_new_insn = NULL;
+ insn = *new_insn;
+ sec_for_each_insn_from(file, insn) {
+ if (insn->offset >= special_alt->new_off + special_alt->new_len)
+ break;
+
+ last_new_insn = insn;
+
+ if (insn->type != INSN_JUMP_CONDITIONAL &&
+ insn->type != INSN_JUMP_UNCONDITIONAL)
+ continue;
+
+ if (!insn->immediate)
+ continue;
+
+ dest_off = insn->offset + insn->len + insn->immediate;
+ if (dest_off == special_alt->new_off + special_alt->new_len)
+ insn->jump_dest = fake_jump;
+
+ if (!insn->jump_dest) {
+ WARN_FUNC("can't find alternative jump destination",
+ insn->sec, insn->offset);
+ return -1;
+ }
+ }
+
+ if (!last_new_insn) {
+ WARN_FUNC("can't find last new alternative instruction",
+ special_alt->new_sec, special_alt->new_off);
+ return -1;
+ }
+
+ list_add(&fake_jump->list, &last_new_insn->list);
+
+ return 0;
+}
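
The effect of the fake jump can be pictured as below: the replacement path is
forced to fall back into .text immediately after the patched region, so
validate_branch() never runs off the end of .altinstr_replacement and never
walks the original and replacement bytes as one stream (layout is illustrative
only):

	.text (original)                  .altinstr_replacement
	  insn A    in alt group            insn A'
	  insn B    in alt group            insn B'
	  insn C    resumes here  <-------  fake JMP appended by objtool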
+
+/*
+ * A jump table entry can either convert a nop to a jump or a jump to a nop.
+ * If the original instruction is a jump, make the alt entry an effective nop
+ * by just skipping the original instruction.
+ */
+static int handle_jump_alt(struct objtool_file *file,
+ struct special_alt *special_alt,
+ struct instruction *orig_insn,
+ struct instruction **new_insn)
+{
+ if (orig_insn->type == INSN_NOP)
+ return 0;
+
+ if (orig_insn->type != INSN_JUMP_UNCONDITIONAL) {
+ WARN_FUNC("unsupported instruction at jump label",
+ orig_insn->sec, orig_insn->offset);
+ return -1;
+ }
+
+ *new_insn = list_next_entry(orig_insn, list);
+ return 0;
+}
+
+/*
+ * Read all the special sections which have alternate instructions which can be
+ * patched in or redirected to at runtime. Each instruction having alternate
+ * instruction(s) has them added to its insn->alts list, which will be
+ * traversed in validate_branch().
+ */
+static int add_special_section_alts(struct objtool_file *file)
+{
+ struct list_head special_alts;
+ struct instruction *orig_insn, *new_insn;
+ struct special_alt *special_alt, *tmp;
+ struct alternative *alt;
+ int ret;
+
+ ret = special_get_alts(file->elf, &special_alts);
+ if (ret)
+ return ret;
+
+ list_for_each_entry_safe(special_alt, tmp, &special_alts, list) {
+ alt = malloc(sizeof(*alt));
+ if (!alt) {
+ WARN("malloc failed");
+ ret = -1;
+ goto out;
+ }
+
+ orig_insn = find_insn(file, special_alt->orig_sec,
+ special_alt->orig_off);
+ if (!orig_insn) {
+ WARN_FUNC("special: can't find orig instruction",
+ special_alt->orig_sec, special_alt->orig_off);
+ ret = -1;
+ goto out;
+ }
+
+ new_insn = NULL;
+ if (!special_alt->group || special_alt->new_len) {
+ new_insn = find_insn(file, special_alt->new_sec,
+ special_alt->new_off);
+ if (!new_insn) {
+ WARN_FUNC("special: can't find new instruction",
+ special_alt->new_sec,
+ special_alt->new_off);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ if (special_alt->group) {
+ ret = handle_group_alt(file, special_alt, orig_insn,
+ &new_insn);
+ if (ret)
+ goto out;
+ } else if (special_alt->jump_or_nop) {
+ ret = handle_jump_alt(file, special_alt, orig_insn,
+ &new_insn);
+ if (ret)
+ goto out;
+ }
+
+ alt->insn = new_insn;
+ list_add_tail(&alt->list, &orig_insn->alts);
+
+ list_del(&special_alt->list);
+ free(special_alt);
+ }
+
+out:
+ return ret;
+}
+
+static int add_switch_table(struct objtool_file *file, struct symbol *func,
+ struct instruction *insn, struct rela *table,
+ struct rela *next_table)
+{
+ struct rela *rela = table;
+ struct instruction *alt_insn;
+ struct alternative *alt;
+
+ list_for_each_entry_from(rela, &file->rodata->rela->rela_list, list) {
+ if (rela == next_table)
+ break;
+
+ if (rela->sym->sec != insn->sec ||
+ rela->addend <= func->offset ||
+ rela->addend >= func->offset + func->len)
+ break;
+
+ alt_insn = find_insn(file, insn->sec, rela->addend);
+ if (!alt_insn) {
+ WARN("%s: can't find instruction at %s+0x%x",
+ file->rodata->rela->name, insn->sec->name,
+ rela->addend);
+ return -1;
+ }
+
+ alt = malloc(sizeof(*alt));
+ if (!alt) {
+ WARN("malloc failed");
+ return -1;
+ }
+
+ alt->insn = alt_insn;
+ list_add_tail(&alt->list, &insn->alts);
+ }
+
+ return 0;
+}
+
+/*
+ * find_switch_table() - Given a dynamic jump, find the switch jump table in
+ * .rodata associated with it.
+ *
+ * There are 3 basic patterns:
+ *
+ * 1. jmpq *[rodata addr](,%reg,8)
+ *
+ * This is the most common case by far. It jumps to an address in a simple
+ * jump table which is stored in .rodata.
+ *
+ * 2. jmpq *[rodata addr](%rip)
+ *
+ * This is caused by a rare GCC quirk, currently only seen in three driver
+ * functions in the kernel, only with certain obscure non-distro configs.
+ *
+ * As part of an optimization, GCC makes a copy of an existing switch jump
+ * table, modifies it, and then hard-codes the jump (albeit with an indirect
+ * jump) to use a single entry in the table. The rest of the jump table and
+ * some of its jump targets remain as dead code.
+ *
+ * In such a case we can just crudely ignore all unreachable instruction
+ * warnings for the entire object file. Ideally we would just ignore them
+ * for the function, but that would require redesigning the code quite a
+ * bit. And honestly that's just not worth doing: unreachable instruction
+ * warnings are of questionable value anyway, and this is such a rare issue.
+ *
+ * 3. mov [rodata addr],%reg1
+ * ... some instructions ...
+ * jmpq *(%reg1,%reg2,8)
+ *
+ * This is a fairly uncommon pattern which is new for GCC 6. As of this
+ * writing, there are 11 occurrences of it in the allmodconfig kernel.
+ *
+ * TODO: Once we have DWARF CFI and smarter instruction decoding logic,
+ * ensure the same register is used in the mov and jump instructions.
+ */
+static struct rela *find_switch_table(struct objtool_file *file,
+ struct symbol *func,
+ struct instruction *insn)
+{
+ struct rela *text_rela, *rodata_rela;
+ struct instruction *orig_insn = insn;
+
+ text_rela = find_rela_by_dest_range(insn->sec, insn->offset, insn->len);
+ if (text_rela && text_rela->sym == file->rodata->sym) {
+ /* case 1 */
+ rodata_rela = find_rela_by_dest(file->rodata,
+ text_rela->addend);
+ if (rodata_rela)
+ return rodata_rela;
+
+ /* case 2 */
+ rodata_rela = find_rela_by_dest(file->rodata,
+ text_rela->addend + 4);
+ if (!rodata_rela)
+ return NULL;
+ file->ignore_unreachables = true;
+ return rodata_rela;
+ }
+
+ /* case 3 */
+ func_for_each_insn_continue_reverse(file, func, insn) {
+ if (insn->type == INSN_JUMP_DYNAMIC)
+ break;
+
+ /* allow small jumps within the range */
+ if (insn->type == INSN_JUMP_UNCONDITIONAL &&
+ insn->jump_dest &&
+ (insn->jump_dest->offset <= insn->offset ||
+ insn->jump_dest->offset > orig_insn->offset))
+ break;
+
+ /* look for a relocation which references .rodata */
+ text_rela = find_rela_by_dest_range(insn->sec, insn->offset,
+ insn->len);
+ if (!text_rela || text_rela->sym != file->rodata->sym)
+ continue;
+
+ /*
+ * Make sure the .rodata address isn't associated with a
+ * symbol. gcc jump tables are anonymous data.
+ */
+ if (find_symbol_containing(file->rodata, text_rela->addend))
+ continue;
+
+ return find_rela_by_dest(file->rodata, text_rela->addend);
+ }
+
+ return NULL;
+}
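
For pattern 1 above, the kind of C that typically produces it is a dense
switch over a small integer range, for example (illustrative code, not taken
from the kernel):

	extern int handle0(void), handle1(void), handle2(void),
		   handle3(void), handle4(void), handle5(void);

	int classify(int type)
	{
		switch (type) {
		case 0: return handle0();
		case 1: return handle1();
		case 2: return handle2();
		case 3: return handle3();
		case 4: return handle4();
		case 5: return handle5();
		default: return -1;
		}
	}

With enough densely packed cases, GCC emits an anonymous table of code
addresses in .rodata and a single "jmpq *table(,%reg,8)" through it, which is
exactly the indirect jump plus .rodata relocation pair that find_switch_table()
and add_switch_table() stitch back into insn->alts.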
+
+static int add_func_switch_tables(struct objtool_file *file,
+ struct symbol *func)
+{
+ struct instruction *insn, *prev_jump = NULL;
+ struct rela *rela, *prev_rela = NULL;
+ int ret;
+
+ func_for_each_insn(file, func, insn) {
+ if (insn->type != INSN_JUMP_DYNAMIC)
+ continue;
+
+ rela = find_switch_table(file, func, insn);
+ if (!rela)
+ continue;
+
+ /*
+ * We found a switch table, but we don't know yet how big it
+ * is. Don't add it until we reach the end of the function or
+ * the beginning of another switch table in the same function.
+ */
+ if (prev_jump) {
+ ret = add_switch_table(file, func, prev_jump, prev_rela,
+ rela);
+ if (ret)
+ return ret;
+ }
+
+ prev_jump = insn;
+ prev_rela = rela;
+ }
+
+ if (prev_jump) {
+ ret = add_switch_table(file, func, prev_jump, prev_rela, NULL);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * For some switch statements, gcc generates a jump table in the .rodata
+ * section which contains a list of addresses within the function to jump to.
+ * This finds these jump tables and adds them to the insn->alts lists.
+ */
+static int add_switch_table_alts(struct objtool_file *file)
+{
+ struct section *sec;
+ struct symbol *func;
+ int ret;
+
+ if (!file->rodata || !file->rodata->rela)
+ return 0;
+
+ for_each_sec(file, sec) {
+ list_for_each_entry(func, &sec->symbol_list, list) {
+ if (func->type != STT_FUNC)
+ continue;
+
+ ret = add_func_switch_tables(file, func);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int decode_sections(struct objtool_file *file)
+{
+ int ret;
+
+ ret = decode_instructions(file);
+ if (ret)
+ return ret;
+
+ ret = add_dead_ends(file);
+ if (ret)
+ return ret;
+
+ add_ignores(file);
+
+ ret = add_jump_destinations(file);
+ if (ret)
+ return ret;
+
+ ret = add_call_destinations(file);
+ if (ret)
+ return ret;
+
+ ret = add_special_section_alts(file);
+ if (ret)
+ return ret;
+
+ ret = add_switch_table_alts(file);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static bool is_fentry_call(struct instruction *insn)
+{
+ if (insn->type == INSN_CALL &&
+ insn->call_dest->type == STT_NOTYPE &&
+ !strcmp(insn->call_dest->name, "__fentry__"))
+ return true;
+
+ return false;
+}
+
+static bool has_modified_stack_frame(struct insn_state *state)
+{
+ int i;
+
+ if (state->cfa.base != initial_func_cfi.cfa.base ||
+ state->cfa.offset != initial_func_cfi.cfa.offset ||
+ state->stack_size != initial_func_cfi.cfa.offset ||
+ state->drap)
+ return true;
+
+ for (i = 0; i < CFI_NUM_REGS; i++)
+ if (state->regs[i].base != initial_func_cfi.regs[i].base ||
+ state->regs[i].offset != initial_func_cfi.regs[i].offset)
+ return true;
+
+ return false;
+}
+
+static bool has_valid_stack_frame(struct insn_state *state)
+{
+ if (state->cfa.base == CFI_BP && state->regs[CFI_BP].base == CFI_CFA &&
+ state->regs[CFI_BP].offset == -16)
+ return true;
+
+ if (state->drap && state->regs[CFI_BP].base == CFI_BP)
+ return true;
+
+ return false;
+}
+
+static void save_reg(struct insn_state *state, unsigned char reg, int base,
+ int offset)
+{
+ if ((arch_callee_saved_reg(reg) ||
+ (state->drap && reg == state->drap_reg)) &&
+ state->regs[reg].base == CFI_UNDEFINED) {
+ state->regs[reg].base = base;
+ state->regs[reg].offset = offset;
+ }
+}
+
+static void restore_reg(struct insn_state *state, unsigned char reg)
+{
+ state->regs[reg].base = CFI_UNDEFINED;
+ state->regs[reg].offset = 0;
+}
+
+/*
+ * A note about DRAP stack alignment:
+ *
+ * GCC has the concept of a DRAP register, which is used to help keep track of
+ * the stack pointer when aligning the stack. r10 or r13 is used as the DRAP
+ * register. The typical DRAP pattern is:
+ *
+ * 4c 8d 54 24 08 lea 0x8(%rsp),%r10
+ * 48 83 e4 c0 and $0xffffffffffffffc0,%rsp
+ * 41 ff 72 f8 pushq -0x8(%r10)
+ * 55 push %rbp
+ * 48 89 e5 mov %rsp,%rbp
+ * (more pushes)
+ * 41 52 push %r10
+ * ...
+ * 41 5a pop %r10
+ * (more pops)
+ * 5d pop %rbp
+ * 49 8d 62 f8 lea -0x8(%r10),%rsp
+ * c3 retq
+ *
+ * There are some variations in the epilogues, like:
+ *
+ * 5b pop %rbx
+ * 41 5a pop %r10
+ * 41 5c pop %r12
+ * 41 5d pop %r13
+ * 41 5e pop %r14
+ * c9 leaveq
+ * 49 8d 62 f8 lea -0x8(%r10),%rsp
+ * c3 retq
+ *
+ * and:
+ *
+ * 4c 8b 55 e8 mov -0x18(%rbp),%r10
+ * 48 8b 5d e0 mov -0x20(%rbp),%rbx
+ * 4c 8b 65 f0 mov -0x10(%rbp),%r12
+ * 4c 8b 6d f8 mov -0x8(%rbp),%r13
+ * c9 leaveq
+ * 49 8d 62 f8 lea -0x8(%r10),%rsp
+ * c3 retq
+ *
+ * Sometimes r13 is used as the DRAP register, in which case it's saved and
+ * restored beforehand:
+ *
+ * 41 55 push %r13
+ * 4c 8d 6c 24 10 lea 0x10(%rsp),%r13
+ * 48 83 e4 f0 and $0xfffffffffffffff0,%rsp
+ * ...
+ * 49 8d 65 f0 lea -0x10(%r13),%rsp
+ * 41 5d pop %r13
+ * c3 retq
+ */
+static int update_insn_state(struct instruction *insn, struct insn_state *state)
+{
+ struct stack_op *op = &insn->stack_op;
+ struct cfi_reg *cfa = &state->cfa;
+ struct cfi_reg *regs = state->regs;
+
+ /* stack operations don't make sense with an undefined CFA */
+ if (cfa->base == CFI_UNDEFINED) {
+ if (insn->func) {
+ WARN_FUNC("undefined stack state", insn->sec, insn->offset);
+ return -1;
+ }
+ return 0;
+ }
+
+ switch (op->dest.type) {
+
+ case OP_DEST_REG:
+ switch (op->src.type) {
+
+ case OP_SRC_REG:
+ if (cfa->base == op->src.reg && cfa->base == CFI_SP &&
+ op->dest.reg == CFI_BP && regs[CFI_BP].base == CFI_CFA &&
+ regs[CFI_BP].offset == -cfa->offset) {
+
+ /* mov %rsp, %rbp */
+ cfa->base = op->dest.reg;
+ state->bp_scratch = false;
+ } else if (state->drap) {
+
+ /* drap: mov %rsp, %rbp */
+ regs[CFI_BP].base = CFI_BP;
+ regs[CFI_BP].offset = -state->stack_size;
+ state->bp_scratch = false;
+ } else if (!nofp) {
+
+ WARN_FUNC("unknown stack-related register move",
+ insn->sec, insn->offset);
+ return -1;
+ }
+
+ break;
+
+ case OP_SRC_ADD:
+ if (op->dest.reg == CFI_SP && op->src.reg == CFI_SP) {
+
+ /* add imm, %rsp */
+ state->stack_size -= op->src.offset;
+ if (cfa->base == CFI_SP)
+ cfa->offset -= op->src.offset;
+ break;
+ }
+
+ if (op->dest.reg == CFI_SP && op->src.reg == CFI_BP) {
+
+ /* lea disp(%rbp), %rsp */
+ state->stack_size = -(op->src.offset + regs[CFI_BP].offset);
+ break;
+ }
+
+ if (op->dest.reg != CFI_BP && op->src.reg == CFI_SP &&
+ cfa->base == CFI_SP) {
+
+ /* drap: lea disp(%rsp), %drap */
+ state->drap_reg = op->dest.reg;
+ break;
+ }
+
+ if (state->drap && op->dest.reg == CFI_SP &&
+ op->src.reg == state->drap_reg) {
+
+ /* drap: lea disp(%drap), %rsp */
+ cfa->base = CFI_SP;
+ cfa->offset = state->stack_size = -op->src.offset;
+ state->drap_reg = CFI_UNDEFINED;
+ state->drap = false;
+ break;
+ }
+
+ if (op->dest.reg == state->cfa.base) {
+ WARN_FUNC("unsupported stack register modification",
+ insn->sec, insn->offset);
+ return -1;
+ }
+
+ break;
+
+ case OP_SRC_AND:
+ if (op->dest.reg != CFI_SP ||
+ (state->drap_reg != CFI_UNDEFINED && cfa->base != CFI_SP) ||
+ (state->drap_reg == CFI_UNDEFINED && cfa->base != CFI_BP)) {
+ WARN_FUNC("unsupported stack pointer realignment",
+ insn->sec, insn->offset);
+ return -1;
+ }
+
+ if (state->drap_reg != CFI_UNDEFINED) {
+ /* drap: and imm, %rsp */
+ cfa->base = state->drap_reg;
+ cfa->offset = state->stack_size = 0;
+ state->drap = true;
+
+ }
+
+ /*
+ * Older versions of GCC (4.8ish) realign the stack
+ * without DRAP, with a frame pointer.
+ */
+
+ break;
+
+ case OP_SRC_POP:
+ if (!state->drap && op->dest.type == OP_DEST_REG &&
+ op->dest.reg == cfa->base) {
+
+ /* pop %rbp */
+ cfa->base = CFI_SP;
+ }
+
+ if (regs[op->dest.reg].offset == -state->stack_size) {
+
+ if (state->drap && cfa->base == CFI_BP_INDIRECT &&
+ op->dest.type == OP_DEST_REG &&
+ op->dest.reg == state->drap_reg) {
+
+ /* drap: pop %drap */
+ cfa->base = state->drap_reg;
+ cfa->offset = 0;
+ }
+
+ restore_reg(state, op->dest.reg);
+ }
+
+ state->stack_size -= 8;
+ if (cfa->base == CFI_SP)
+ cfa->offset -= 8;
+
+ break;
+
+ case OP_SRC_REG_INDIRECT:
+ if (state->drap && op->src.reg == CFI_BP &&
+ op->src.offset == regs[op->dest.reg].offset) {
+
+ /* drap: mov disp(%rbp), %reg */
+ if (op->dest.reg == state->drap_reg) {
+ cfa->base = state->drap_reg;
+ cfa->offset = 0;
+ }
+
+ restore_reg(state, op->dest.reg);
+
+ } else if (op->src.reg == cfa->base &&
+ op->src.offset == regs[op->dest.reg].offset + cfa->offset) {
+
+ /* mov disp(%rbp), %reg */
+ /* mov disp(%rsp), %reg */
+ restore_reg(state, op->dest.reg);
+ }
+
+ break;
+
+ default:
+ WARN_FUNC("unknown stack-related instruction",
+ insn->sec, insn->offset);
+ return -1;
+ }
+
+ break;
+
+ case OP_DEST_PUSH:
+ state->stack_size += 8;
+ if (cfa->base == CFI_SP)
+ cfa->offset += 8;
+
+ if (op->src.type != OP_SRC_REG)
+ break;
+
+ if (state->drap) {
+ if (op->src.reg == cfa->base && op->src.reg == state->drap_reg) {
+
+ /* drap: push %drap */
+ cfa->base = CFI_BP_INDIRECT;
+ cfa->offset = -state->stack_size;
+
+ /* save drap so we know when to undefine it */
+ save_reg(state, op->src.reg, CFI_CFA, -state->stack_size);
+
+ } else if (op->src.reg == CFI_BP && cfa->base == state->drap_reg) {
+
+ /* drap: push %rbp */
+ state->stack_size = 0;
+
+ } else if (regs[op->src.reg].base == CFI_UNDEFINED) {
+
+ /* drap: push %reg */
+ save_reg(state, op->src.reg, CFI_BP, -state->stack_size);
+ }
+
+ } else {
+
+ /* push %reg */
+ save_reg(state, op->src.reg, CFI_CFA, -state->stack_size);
+ }
+
+ /* detect when asm code uses rbp as a scratch register */
+ if (!nofp && insn->func && op->src.reg == CFI_BP &&
+ cfa->base != CFI_BP)
+ state->bp_scratch = true;
+ break;
+
+ case OP_DEST_REG_INDIRECT:
+
+ if (state->drap) {
+ if (op->src.reg == cfa->base && op->src.reg == state->drap_reg) {
+
+ /* drap: mov %drap, disp(%rbp) */
+ cfa->base = CFI_BP_INDIRECT;
+ cfa->offset = op->dest.offset;
+
+ /* save drap so we know when to undefine it */
+ save_reg(state, op->src.reg, CFI_CFA, op->dest.offset);
+ }
+
+ else if (regs[op->src.reg].base == CFI_UNDEFINED) {
+
+ /* drap: mov reg, disp(%rbp) */
+ save_reg(state, op->src.reg, CFI_BP, op->dest.offset);
+ }
+
+ } else if (op->dest.reg == cfa->base) {
+
+ /* mov reg, disp(%rbp) */
+ /* mov reg, disp(%rsp) */
+ save_reg(state, op->src.reg, CFI_CFA,
+ op->dest.offset - state->cfa.offset);
+ }
+
+ break;
+
+ case OP_DEST_LEAVE:
+ if ((!state->drap && cfa->base != CFI_BP) ||
+ (state->drap && cfa->base != state->drap_reg)) {
+ WARN_FUNC("leave instruction with modified stack frame",
+ insn->sec, insn->offset);
+ return -1;
+ }
+
+ /* leave (mov %rbp, %rsp; pop %rbp) */
+
+ state->stack_size = -state->regs[CFI_BP].offset - 8;
+ restore_reg(state, CFI_BP);
+
+ if (!state->drap) {
+ cfa->base = CFI_SP;
+ cfa->offset -= 8;
+ }
+
+ break;
+
+ case OP_DEST_MEM:
+ if (op->src.type != OP_SRC_POP) {
+ WARN_FUNC("unknown stack-related memory operation",
+ insn->sec, insn->offset);
+ return -1;
+ }
+
+ /* pop mem */
+ state->stack_size -= 8;
+ if (cfa->base == CFI_SP)
+ cfa->offset -= 8;
+
+ break;
+
+ default:
+ WARN_FUNC("unknown stack-related instruction",
+ insn->sec, insn->offset);
+ return -1;
+ }
+
+ return 0;
+}
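
To make the bookkeeping concrete, here is how the standard frame-pointer
prologue walks through this function, assuming the usual x86-64 entry state
(cfa = %rsp + 8, stack_size = 8, %rbp not yet tracked); the numbers follow
directly from the OP_DEST_PUSH and register-move cases above:

	push %rbp        ->  stack_size = 16, cfa = (CFI_SP, 16),
	                     regs[CFI_BP] = (CFI_CFA, -16)   /* old %rbp at CFA-16 */
	mov  %rsp, %rbp  ->  cfa = (CFI_BP, 16)

which is exactly the shape that has_valid_stack_frame() accepts: a CFA based
on %rbp with the caller's %rbp saved 16 bytes below the CFA.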
+
+static bool insn_state_match(struct instruction *insn, struct insn_state *state)
+{
+ struct insn_state *state1 = &insn->state, *state2 = state;
+ int i;
+
+ if (memcmp(&state1->cfa, &state2->cfa, sizeof(state1->cfa))) {
+ WARN_FUNC("stack state mismatch: cfa1=%d%+d cfa2=%d%+d",
+ insn->sec, insn->offset,
+ state1->cfa.base, state1->cfa.offset,
+ state2->cfa.base, state2->cfa.offset);
+
+ } else if (memcmp(&state1->regs, &state2->regs, sizeof(state1->regs))) {
+ for (i = 0; i < CFI_NUM_REGS; i++) {
+ if (!memcmp(&state1->regs[i], &state2->regs[i],
+ sizeof(struct cfi_reg)))
+ continue;
+
+ WARN_FUNC("stack state mismatch: reg1[%d]=%d%+d reg2[%d]=%d%+d",
+ insn->sec, insn->offset,
+ i, state1->regs[i].base, state1->regs[i].offset,
+ i, state2->regs[i].base, state2->regs[i].offset);
+ break;
+ }
+
+ } else if (state1->drap != state2->drap ||
+ (state1->drap && state1->drap_reg != state2->drap_reg)) {
+ WARN_FUNC("stack state mismatch: drap1=%d(%d) drap2=%d(%d)",
+ insn->sec, insn->offset,
+ state1->drap, state1->drap_reg,
+ state2->drap, state2->drap_reg);
+
+ } else
+ return true;
+
+ return false;
+}
+
+/*
+ * Follow the branch starting at the given instruction, and recursively follow
+ * any other branches (jumps). Meanwhile, track the frame pointer state at
+ * each instruction and validate all the rules described in
+ * tools/objtool/Documentation/stack-validation.txt.
+ */
+static int validate_branch(struct objtool_file *file, struct instruction *first,
+ struct insn_state state)
+{
+ struct alternative *alt;
+ struct instruction *insn;
+ struct section *sec;
+ struct symbol *func = NULL;
+ int ret;
+
+ insn = first;
+ sec = insn->sec;
+
+ if (insn->alt_group && list_empty(&insn->alts)) {
+ WARN_FUNC("don't know how to handle branch to middle of alternative instruction group",
+ sec, insn->offset);
+ return -1;
+ }
+
+ while (1) {
+ if (file->c_file && insn->func) {
+ if (func && func != insn->func) {
+ WARN("%s() falls through to next function %s()",
+ func->name, insn->func->name);
+ return 1;
+ }
+ }
+
+ func = insn->func;
+
+ if (insn->visited) {
+			if (!insn_state_match(insn, &state))
+ return 1;
+
+ return 0;
+ }
+
+ insn->state = state;
+
+ insn->visited = true;
+
+ list_for_each_entry(alt, &insn->alts, list) {
+ ret = validate_branch(file, alt->insn, state);
+ if (ret)
+ return 1;
+ }
+
+ switch (insn->type) {
+
+ case INSN_RETURN:
+ if (func && has_modified_stack_frame(&state)) {
+ WARN_FUNC("return with modified stack frame",
+ sec, insn->offset);
+ return 1;
+ }
+
+ if (state.bp_scratch) {
+ WARN("%s uses BP as a scratch register",
+ insn->func->name);
+ return 1;
+ }
+
+ return 0;
+
+ case INSN_CALL:
+ if (is_fentry_call(insn))
+ break;
+
+ ret = dead_end_function(file, insn->call_dest);
+ if (ret == 1)
+ return 0;
+ if (ret == -1)
+ return 1;
+
+ /* fallthrough */
+ case INSN_CALL_DYNAMIC:
+ if (!nofp && func && !has_valid_stack_frame(&state)) {
+ WARN_FUNC("call without frame pointer save/setup",
+ sec, insn->offset);
+ return 1;
+ }
+ break;
+
+ case INSN_JUMP_CONDITIONAL:
+ case INSN_JUMP_UNCONDITIONAL:
+ if (insn->jump_dest) {
+ ret = validate_branch(file, insn->jump_dest,
+ state);
+ if (ret)
+ return 1;
+ } else if (func && has_modified_stack_frame(&state)) {
+ WARN_FUNC("sibling call from callable instruction with modified stack frame",
+ sec, insn->offset);
+ return 1;
+ } /* else it's a sibling call */
+
+ if (insn->type == INSN_JUMP_UNCONDITIONAL)
+ return 0;
+
+ break;
+
+ case INSN_JUMP_DYNAMIC:
+ if (func && list_empty(&insn->alts) &&
+ has_modified_stack_frame(&state)) {
+ WARN_FUNC("sibling call from callable instruction with modified stack frame",
+ sec, insn->offset);
+ return 1;
+ }
+
+ return 0;
+
+ case INSN_STACK:
+ if (update_insn_state(insn, &state))
+ return -1;
+
+ break;
+
+ default:
+ break;
+ }
+
+ if (insn->dead_end)
+ return 0;
+
+ insn = next_insn_same_sec(file, insn);
+ if (!insn) {
+ WARN("%s: unexpected end of section", sec->name);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static bool is_kasan_insn(struct instruction *insn)
+{
+ return (insn->type == INSN_CALL &&
+ !strcmp(insn->call_dest->name, "__asan_handle_no_return"));
+}
+
+static bool is_ubsan_insn(struct instruction *insn)
+{
+ return (insn->type == INSN_CALL &&
+ !strcmp(insn->call_dest->name,
+ "__ubsan_handle_builtin_unreachable"));
+}
+
+static bool ignore_unreachable_insn(struct instruction *insn)
+{
+ int i;
+
+ if (insn->ignore || insn->type == INSN_NOP)
+ return true;
+
+ /*
+ * Ignore any unused exceptions. This can happen when a whitelisted
+ * function has an exception table entry.
+ */
+ if (!strcmp(insn->sec->name, ".fixup"))
+ return true;
+
+ /*
+ * Check if this (or a subsequent) instruction is related to
+ * CONFIG_UBSAN or CONFIG_KASAN.
+ *
+ * End the search at 5 instructions to avoid going into the weeds.
+ */
+ if (!insn->func)
+ return false;
+ for (i = 0; i < 5; i++) {
+
+ if (is_kasan_insn(insn) || is_ubsan_insn(insn))
+ return true;
+
+ if (insn->type == INSN_JUMP_UNCONDITIONAL && insn->jump_dest) {
+ insn = insn->jump_dest;
+ continue;
+ }
+
+ if (insn->offset + insn->len >= insn->func->offset + insn->func->len)
+ break;
+ insn = list_next_entry(insn, list);
+ }
+
+ return false;
+}
+
+static int validate_functions(struct objtool_file *file)
+{
+ struct section *sec;
+ struct symbol *func;
+ struct instruction *insn;
+ struct insn_state state;
+ int ret, warnings = 0;
+
+ clear_insn_state(&state);
+
+ state.cfa = initial_func_cfi.cfa;
+ memcpy(&state.regs, &initial_func_cfi.regs,
+ CFI_NUM_REGS * sizeof(struct cfi_reg));
+ state.stack_size = initial_func_cfi.cfa.offset;
+
+ for_each_sec(file, sec) {
+ list_for_each_entry(func, &sec->symbol_list, list) {
+ if (func->type != STT_FUNC)
+ continue;
+
+ insn = find_insn(file, sec, func->offset);
+ if (!insn || insn->ignore)
+ continue;
+
+ ret = validate_branch(file, insn, state);
+ warnings += ret;
+ }
+ }
+
+ return warnings;
+}
+
+static int validate_reachable_instructions(struct objtool_file *file)
+{
+ struct instruction *insn;
+
+ if (file->ignore_unreachables)
+ return 0;
+
+ for_each_insn(file, insn) {
+ if (insn->visited || ignore_unreachable_insn(insn))
+ continue;
+
+ /*
+ * gcov produces a lot of unreachable instructions. If we get
+ * an unreachable warning and the file has gcov enabled, just
+ * ignore it, and all other such warnings for the file. Do
+ * this here because this is an expensive function.
+ */
+ if (gcov_enabled(file))
+ return 0;
+
+ WARN_FUNC("unreachable instruction", insn->sec, insn->offset);
+ return 1;
+ }
+
+ return 0;
+}
+
+static void cleanup(struct objtool_file *file)
+{
+ struct instruction *insn, *tmpinsn;
+ struct alternative *alt, *tmpalt;
+
+ list_for_each_entry_safe(insn, tmpinsn, &file->insn_list, list) {
+ list_for_each_entry_safe(alt, tmpalt, &insn->alts, list) {
+ list_del(&alt->list);
+ free(alt);
+ }
+ list_del(&insn->list);
+ hash_del(&insn->hash);
+ free(insn);
+ }
+ elf_close(file->elf);
+}
+
+int check(const char *_objname, bool _nofp)
+{
+ struct objtool_file file;
+ int ret, warnings = 0;
+
+ objname = _objname;
+ nofp = _nofp;
+
+ file.elf = elf_open(objname);
+ if (!file.elf)
+ return 1;
+
+ INIT_LIST_HEAD(&file.insn_list);
+ hash_init(file.insn_hash);
+ file.whitelist = find_section_by_name(file.elf, ".discard.func_stack_frame_non_standard");
+ file.rodata = find_section_by_name(file.elf, ".rodata");
+ file.ignore_unreachables = false;
+ file.c_file = find_section_by_name(file.elf, ".comment");
+
+ arch_initial_func_cfi_state(&initial_func_cfi);
+
+ ret = decode_sections(&file);
+ if (ret < 0)
+ goto out;
+ warnings += ret;
+
+ if (list_empty(&file.insn_list))
+ goto out;
+
+ ret = validate_functions(&file);
+ if (ret < 0)
+ goto out;
+ warnings += ret;
+
+ if (!warnings) {
+ ret = validate_reachable_instructions(&file);
+ if (ret < 0)
+ goto out;
+ warnings += ret;
+ }
+
+out:
+ cleanup(&file);
+
+ /* ignore warnings for now until we get all the code cleaned up */
+ if (ret || warnings)
+ return 0;
+ return 0;
+}
diff --git a/tools/objtool/check.h b/tools/objtool/check.h
new file mode 100644
index 000000000000..da85f5b00ec6
--- /dev/null
+++ b/tools/objtool/check.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _CHECK_H
+#define _CHECK_H
+
+#include <stdbool.h>
+#include "elf.h"
+#include "cfi.h"
+#include "arch.h"
+#include <linux/hashtable.h>
+
+struct insn_state {
+ struct cfi_reg cfa;
+ struct cfi_reg regs[CFI_NUM_REGS];
+ int stack_size;
+ bool bp_scratch;
+ bool drap;
+ int drap_reg;
+};
+
+struct instruction {
+ struct list_head list;
+ struct hlist_node hash;
+ struct section *sec;
+ unsigned long offset;
+ unsigned int len;
+ unsigned char type;
+ unsigned long immediate;
+ bool alt_group, visited, dead_end, ignore;
+ struct symbol *call_dest;
+ struct instruction *jump_dest;
+ struct list_head alts;
+ struct symbol *func;
+ struct stack_op stack_op;
+ struct insn_state state;
+};
+
+struct objtool_file {
+ struct elf *elf;
+ struct list_head insn_list;
+ DECLARE_HASHTABLE(insn_hash, 16);
+ struct section *rodata, *whitelist;
+ bool ignore_unreachables, c_file;
+};
+
+int check(const char *objname, bool nofp);
+
+#define for_each_insn(file, insn) \
+ list_for_each_entry(insn, &file->insn_list, list)
+
+#endif /* _CHECK_H */
diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index d897702ce742..1a7e8aa2af58 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -37,6 +37,9 @@
#define ELF_C_READ_MMAP ELF_C_READ
#endif
+#define WARN_ELF(format, ...) \
+ WARN(format ": %s", ##__VA_ARGS__, elf_errmsg(-1))
+
struct section *find_section_by_name(struct elf *elf, const char *name)
{
struct section *sec;
@@ -139,12 +142,12 @@ static int read_sections(struct elf *elf)
int i;
if (elf_getshdrnum(elf->elf, &sections_nr)) {
- perror("elf_getshdrnum");
+ WARN_ELF("elf_getshdrnum");
return -1;
}
if (elf_getshdrstrndx(elf->elf, &shstrndx)) {
- perror("elf_getshdrstrndx");
+ WARN_ELF("elf_getshdrstrndx");
return -1;
}
@@ -165,37 +168,36 @@ static int read_sections(struct elf *elf)
s = elf_getscn(elf->elf, i);
if (!s) {
- perror("elf_getscn");
+ WARN_ELF("elf_getscn");
return -1;
}
sec->idx = elf_ndxscn(s);
if (!gelf_getshdr(s, &sec->sh)) {
- perror("gelf_getshdr");
+ WARN_ELF("gelf_getshdr");
return -1;
}
sec->name = elf_strptr(elf->elf, shstrndx, sec->sh.sh_name);
if (!sec->name) {
- perror("elf_strptr");
+ WARN_ELF("elf_strptr");
return -1;
}
- sec->elf_data = elf_getdata(s, NULL);
- if (!sec->elf_data) {
- perror("elf_getdata");
+ sec->data = elf_getdata(s, NULL);
+ if (!sec->data) {
+ WARN_ELF("elf_getdata");
return -1;
}
- if (sec->elf_data->d_off != 0 ||
- sec->elf_data->d_size != sec->sh.sh_size) {
+ if (sec->data->d_off != 0 ||
+ sec->data->d_size != sec->sh.sh_size) {
WARN("unexpected data attributes for %s", sec->name);
return -1;
}
- sec->data = (unsigned long)sec->elf_data->d_buf;
- sec->len = sec->elf_data->d_size;
+ sec->len = sec->data->d_size;
}
/* sanity check, one more call to elf_nextscn() should return NULL */
@@ -232,15 +234,15 @@ static int read_symbols(struct elf *elf)
sym->idx = i;
- if (!gelf_getsym(symtab->elf_data, i, &sym->sym)) {
- perror("gelf_getsym");
+ if (!gelf_getsym(symtab->data, i, &sym->sym)) {
+ WARN_ELF("gelf_getsym");
goto err;
}
sym->name = elf_strptr(elf->elf, symtab->sh.sh_link,
sym->sym.st_name);
if (!sym->name) {
- perror("elf_strptr");
+ WARN_ELF("elf_strptr");
goto err;
}
@@ -322,8 +324,8 @@ static int read_relas(struct elf *elf)
}
memset(rela, 0, sizeof(*rela));
- if (!gelf_getrela(sec->elf_data, i, &rela->rela)) {
- perror("gelf_getrela");
+ if (!gelf_getrela(sec->data, i, &rela->rela)) {
+ WARN_ELF("gelf_getrela");
return -1;
}
@@ -362,12 +364,6 @@ struct elf *elf_open(const char *name)
INIT_LIST_HEAD(&elf->sections);
- elf->name = strdup(name);
- if (!elf->name) {
- perror("strdup");
- goto err;
- }
-
elf->fd = open(name, O_RDONLY);
if (elf->fd == -1) {
perror("open");
@@ -376,12 +372,12 @@ struct elf *elf_open(const char *name)
elf->elf = elf_begin(elf->fd, ELF_C_READ_MMAP, NULL);
if (!elf->elf) {
- perror("elf_begin");
+ WARN_ELF("elf_begin");
goto err;
}
if (!gelf_getehdr(elf->elf, &elf->ehdr)) {
- perror("gelf_getehdr");
+ WARN_ELF("gelf_getehdr");
goto err;
}
@@ -407,6 +403,12 @@ void elf_close(struct elf *elf)
struct symbol *sym, *tmpsym;
struct rela *rela, *tmprela;
+ if (elf->elf)
+ elf_end(elf->elf);
+
+ if (elf->fd > 0)
+ close(elf->fd);
+
list_for_each_entry_safe(sec, tmpsec, &elf->sections, list) {
list_for_each_entry_safe(sym, tmpsym, &sec->symbol_list, list) {
list_del(&sym->list);
@@ -421,11 +423,6 @@ void elf_close(struct elf *elf)
list_del(&sec->list);
free(sec);
}
- if (elf->name)
- free(elf->name);
- if (elf->fd > 0)
- close(elf->fd);
- if (elf->elf)
- elf_end(elf->elf);
+
free(elf);
}
diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h
index 731973e1a3f5..343968b778cb 100644
--- a/tools/objtool/elf.h
+++ b/tools/objtool/elf.h
@@ -37,10 +37,9 @@ struct section {
DECLARE_HASHTABLE(rela_hash, 16);
struct section *base, *rela;
struct symbol *sym;
- Elf_Data *elf_data;
+ Elf_Data *data;
char *name;
int idx;
- unsigned long data;
unsigned int len;
};
@@ -86,6 +85,7 @@ struct rela *find_rela_by_dest_range(struct section *sec, unsigned long offset,
struct symbol *find_containing_func(struct section *sec, unsigned long offset);
void elf_close(struct elf *elf);
-
+#define for_each_sec(file, sec) \
+ list_for_each_entry(sec, &file->elf->sections, list)
#endif /* _OBJTOOL_ELF_H */
diff --git a/tools/objtool/special.c b/tools/objtool/special.c
index bff8abb3a4aa..84f001d52322 100644
--- a/tools/objtool/special.c
+++ b/tools/objtool/special.c
@@ -91,16 +91,16 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry,
alt->jump_or_nop = entry->jump_or_nop;
if (alt->group) {
- alt->orig_len = *(unsigned char *)(sec->data + offset +
+ alt->orig_len = *(unsigned char *)(sec->data->d_buf + offset +
entry->orig_len);
- alt->new_len = *(unsigned char *)(sec->data + offset +
+ alt->new_len = *(unsigned char *)(sec->data->d_buf + offset +
entry->new_len);
}
if (entry->feature) {
unsigned short feature;
- feature = *(unsigned short *)(sec->data + offset +
+ feature = *(unsigned short *)(sec->data->d_buf + offset +
entry->feature);
/*
diff --git a/tools/objtool/warn.h b/tools/objtool/warn.h
index ac7e07523e84..afd9f7a05f6d 100644
--- a/tools/objtool/warn.h
+++ b/tools/objtool/warn.h
@@ -18,6 +18,13 @@
#ifndef _WARN_H
#define _WARN_H
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "elf.h"
+
extern const char *objname;
static inline char *offstr(struct section *sec, unsigned long offset)
@@ -57,4 +64,7 @@ static inline char *offstr(struct section *sec, unsigned long offset)
free(_str); \
})
+#define WARN_ELF(format, ...) \
+ WARN(format ": %s", ##__VA_ARGS__, elf_errmsg(-1))
+
#endif /* _WARN_H */
diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt
index b0b3007d3c9c..4b6cdbf8f935 100644
--- a/tools/perf/Documentation/intel-pt.txt
+++ b/tools/perf/Documentation/intel-pt.txt
@@ -108,6 +108,9 @@ approach is available to export the data to a postgresql database. Refer to
script export-to-postgresql.py for more details, and to script
call-graph-from-postgresql.py for an example of using the database.
+There is also script intel-pt-events.py which provides an example of how to
+unpack the raw data for power events and PTWRITE.
+
As mentioned above, it is easy to capture too much data. One way to limit the
data captured is to use 'snapshot' mode which is explained further below.
Refer to 'new snapshot option' and 'Intel PT modes of operation' further below.
@@ -364,6 +367,42 @@ cyc_thresh Specifies how frequently CYC packets are produced - see cyc
CYC packets are not requested by default.
+pt Specifies pass-through which enables the 'branch' config term.
+
+ The default config selects 'pt' if it is available, so a user will
+ never need to specify this term.
+
+branch Enable branch tracing. Branch tracing is enabled by default so to
+ disable branch tracing use 'branch=0'.
+
+ The default config selects 'branch' if it is available.
+
+ptw Enable PTWRITE packets which are produced when a ptwrite instruction
+ is executed.
+
+ Support for this feature is indicated by:
+
+ /sys/bus/event_source/devices/intel_pt/caps/ptwrite
+
+ which contains "1" if the feature is supported and
+ "0" otherwise.
+
+fup_on_ptw Enable a FUP packet to follow the PTWRITE packet. The FUP packet
+ provides the address of the ptwrite instruction. In the absence of
+ fup_on_ptw, the decoder will use the address of the previous branch
+ if branch tracing is enabled, otherwise the address will be zero.
+ Note that fup_on_ptw will work even when branch tracing is disabled.
+
+pwr_evt Enable power events. The power events provide information about
+ changes to the CPU C-state.
+
+ Support for this feature is indicated by:
+
+ /sys/bus/event_source/devices/intel_pt/caps/power_event_trace
+
+ which contains "1" if the feature is supported and
+ "0" otherwise.
+
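+For example, the following illustrative command (assuming the ptwrite and
+power_event_trace capabilities above read "1", and using a placeholder
+workload name) enables PTWRITE and power event packets on top of the
+default branch tracing:
+
+   perf record -e intel_pt/ptw,fup_on_ptw,pwr_evt/u -- ./my_workload
+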
new snapshot option
-------------------
@@ -674,13 +713,15 @@ Having no option is the same as
which, in turn, is the same as
- --itrace=ibxe
+ --itrace=ibxwpe
The letters are:
i synthesize "instructions" events
b synthesize "branches" events
x synthesize "transactions" events
+ w synthesize "ptwrite" events
+ p synthesize "power" events
c synthesize branches events (calls only)
r synthesize branches events (returns only)
e synthesize tracing error events
@@ -699,7 +740,40 @@ and "r" can be combined to get calls and returns.
'flags' field can be used in perf script to determine whether the event is a
transaction start, commit or abort.
-Error events are new. They show where the decoder lost the trace. Error events
+Note that "instructions", "branches" and "transactions" events depend on code
+flow packets which can be disabled by using the config term "branch=0". Refer
+to the config terms section above.
+
+"ptwrite" events record the payload of the ptwrite instruction and whether
+"fup_on_ptw" was used. "ptwrite" events depend on PTWRITE packets which are
+recorded only if the "ptw" config term was used. Refer to the config terms
+section above. perf script "synth" field displays "ptwrite" information like
+this: "ip: 0 payload: 0x123456789abcdef0" where "ip" is 1 if "fup_on_ptw" was
+used.
+
+"Power" events correspond to power event packets and CBR (core-to-bus ratio)
+packets. While CBR packets are always recorded when tracing is enabled, power
+event packets are recorded only if the "pwr_evt" config term was used. Refer to
+the config terms section above. The power events record information about
+C-state changes, whereas CBR is indicative of CPU frequency. perf script
+"event,synth" fields display information like this:
+ cbr: cbr: 22 freq: 2189 MHz (200%)
+ mwait: hints: 0x60 extensions: 0x1
+ pwre: hw: 0 cstate: 2 sub-cstate: 0
+ exstop: ip: 1
+ pwrx: deepest cstate: 2 last cstate: 2 wake reason: 0x4
+Where:
+ "cbr" includes the frequency and the percentage of maximum non-turbo
+ "mwait" shows mwait hints and extensions
+ "pwre" shows C-state transitions (to a C-state deeper than C0) and
+ whether initiated by hardware
+ "exstop" indicates execution stopped and whether the IP was recorded
+ exactly,
+ "pwrx" indicates return to C0
+For more details refer to the Intel 64 and IA-32 Architectures Software
+Developer Manuals.
+
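+As an illustration, a perf.data file recorded with the "ptw" and "pwr_evt"
+config terms could be inspected with something like:
+
+   perf script --itrace=wpe -F time,event,synth
+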
+Error events show where the decoder lost the trace. Error events
are quite important. Users must know if what they are seeing is a complete
picture or not.
diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt
index e2a4c5e0dbe5..a3abe04c779d 100644
--- a/tools/perf/Documentation/itrace.txt
+++ b/tools/perf/Documentation/itrace.txt
@@ -3,13 +3,15 @@
c synthesize branches events (calls only)
r synthesize branches events (returns only)
x synthesize transactions events
+ w synthesize ptwrite events
+ p synthesize power events
e synthesize error events
d create a debug log
g synthesize a call chain (use with i or x)
l synthesize last branch entries (use with i or x)
s skip initial number of events
- The default is all events i.e. the same as --itrace=ibxe
+ The default is all events i.e. the same as --itrace=ibxwpe
In addition, the period (default 100000) for instructions events
can be specified in units of:
@@ -26,8 +28,8 @@
Also the number of last branch entries (default 64, max. 1024) for
instructions or transactions events can be specified.
- It is also possible to skip events generated (instructions, branches, transactions)
- at the beginning. This is useful to ignore initialization code.
+ It is also possible to skip events generated (instructions, branches, transactions,
+ ptwrite, power) at the beginning. This is useful to ignore initialization code.
--itrace=i0nss1000000
diff --git a/tools/perf/Documentation/perf-ftrace.txt b/tools/perf/Documentation/perf-ftrace.txt
index 6e6a8b22c859..721a447f046e 100644
--- a/tools/perf/Documentation/perf-ftrace.txt
+++ b/tools/perf/Documentation/perf-ftrace.txt
@@ -48,6 +48,39 @@ OPTIONS
Ranges of CPUs are specified with -: 0-2.
Default is to trace on all online CPUs.
+-T::
+--trace-funcs=::
+ Only trace functions given by the argument. Multiple functions
+ can be given by using this option more than once. The function
+ argument also can be a glob pattern. It will be passed to
+ 'set_ftrace_filter' in tracefs.
+
+-N::
+--notrace-funcs=::
+ Do not trace functions given by the argument. Like -T option,
+ this can be used more than once to specify multiple functions
+ (or glob patterns). It will be passed to 'set_ftrace_notrace'
+ in tracefs.
+
+-G::
+--graph-funcs=::
+ Set graph filter on the given function (or a glob pattern).
+ This is useful for the function_graph tracer only and enables
+ tracing for functions executed from the given function.
+ This can be used more than once to specify multiple functions.
+ It will be passed to 'set_graph_function' in tracefs.
+
+-g::
+--nograph-funcs=::
+ Set graph notrace filter on the given function (or a glob pattern).
+ Like -G option, this is useful for the function_graph tracer only
+ and disables tracing for functions executed from the given function.
+ This can be used more than once to specify multiple functions.
+ It will be passed to 'set_graph_notrace' in tracefs.
+
+-D::
+--graph-depth=::
+ Set max depth for function graph tracer to follow
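+
+A hypothetical example that traces only vfs_read() and its callees down to
+two levels for a short workload (function names vary by kernel version):
+
+   perf ftrace -G vfs_read -D 2 -- cat /etc/hostname
+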
SEE ALSO
--------
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 3517e204a2b3..5ee8796be96e 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -116,8 +116,9 @@ OPTIONS
--fields::
Comma separated list of fields to print. Options are:
comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
- srcline, period, iregs, brstack, brstacksym, flags, bpf-output, brstackinsn,
- callindent, insn, insnlen. Field list can be prepended with the type, trace, sw or hw,
+ srcline, period, iregs, brstack, brstacksym, flags, bpf-output, brstackinsn, brstackoff,
+ callindent, insn, insnlen, synth.
+ Field list can be prepended with the type, trace, sw or hw,
to indicate to which event type the field list applies.
e.g., -F sw:comm,tid,time,ip,sym and -F trace:time,cpu,trace
@@ -130,6 +131,14 @@ OPTIONS
i.e., the specified fields apply to all event types if the type string
is not given.
+ In addition to overriding fields, it is also possible to add or remove
+ fields from the defaults. For example
+
+ -F -cpu,+insn
+
+ removes the cpu field and adds the insn field. Adding/removing fields
+ cannot be mixed with normal overriding.
+
The arguments are processed in the order received. A later usage can
reset a prior request. e.g.:
@@ -185,6 +194,9 @@ OPTIONS
instruction bytes and the instruction length of the current
instruction.
+ The synth field is used by synthesized events which may be created during
+ Instruction Trace decoding.
+
Finally, a user may not set fields to none for all event types.
i.e., -F "" is not allowed.
@@ -203,6 +215,8 @@ OPTIONS
is printed. This is the full execution path leading to the sample. This is only supported when the
sample was recorded with perf record -b or -j any.
+ The brstackoff field will print an offset into a specific dso/binary.
+
-k::
--vmlinux=<file>::
vmlinux pathname
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index bd0e4417f2be..698076313606 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -239,6 +239,20 @@ taskset.
--no-merge::
Do not merge results from same PMUs.
+--smi-cost::
+Measure SMI cost if msr/aperf/ and msr/smi/ events are supported.
+
+During the measurement, /sys/devices/cpu/freeze_on_smi will be set to
+freeze core counters on SMI.
+The aperf counter will not be affected by the setting.
+The cost of SMI can be measured as (aperf - unhalted core cycles).
+
+In practice, the percentage of SMI cycles is very useful for
+performance-oriented analysis. --metric-only will be applied by default.
+The output is SMI cycles%, which equals (aperf - unhalted core cycles) / aperf.
+
+Users who want to get the actual values can apply --no-metric-only.
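+
+A minimal illustrative run (the reported values are platform-dependent):
+
+   $ perf stat -a --smi-cost -- sleep 1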
+
EXAMPLES
--------
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 8354d04b392f..bdf0e87f9b29 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -19,18 +19,18 @@ CFLAGS := $(EXTRA_CFLAGS) $(EXTRA_WARNINGS)
include $(srctree)/tools/scripts/Makefile.arch
-$(call detected_var,ARCH)
+$(call detected_var,SRCARCH)
NO_PERF_REGS := 1
# Additional ARCH settings for ppc
-ifeq ($(ARCH),powerpc)
+ifeq ($(SRCARCH),powerpc)
NO_PERF_REGS := 0
LIBUNWIND_LIBS := -lunwind -lunwind-ppc64
endif
# Additional ARCH settings for x86
-ifeq ($(ARCH),x86)
+ifeq ($(SRCARCH),x86)
$(call detected,CONFIG_X86)
ifeq (${IS_64_BIT}, 1)
CFLAGS += -DHAVE_ARCH_X86_64_SUPPORT -DHAVE_SYSCALL_TABLE -I$(OUTPUT)arch/x86/include/generated
@@ -43,12 +43,12 @@ ifeq ($(ARCH),x86)
NO_PERF_REGS := 0
endif
-ifeq ($(ARCH),arm)
+ifeq ($(SRCARCH),arm)
NO_PERF_REGS := 0
LIBUNWIND_LIBS = -lunwind -lunwind-arm
endif
-ifeq ($(ARCH),arm64)
+ifeq ($(SRCARCH),arm64)
NO_PERF_REGS := 0
LIBUNWIND_LIBS = -lunwind -lunwind-aarch64
endif
@@ -61,7 +61,7 @@ endif
# Disable it on all other architectures in case libdw unwind
# support is detected in system. Add supported architectures
# to the check.
-ifneq ($(ARCH),$(filter $(ARCH),x86 arm))
+ifneq ($(SRCARCH),$(filter $(SRCARCH),x86 arm powerpc))
NO_LIBDW_DWARF_UNWIND := 1
endif
@@ -115,9 +115,9 @@ endif
FEATURE_CHECK_CFLAGS-libbabeltrace := $(LIBBABELTRACE_CFLAGS)
FEATURE_CHECK_LDFLAGS-libbabeltrace := $(LIBBABELTRACE_LDFLAGS) -lbabeltrace-ctf
-FEATURE_CHECK_CFLAGS-bpf = -I. -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(ARCH)/include/uapi -I$(srctree)/tools/include/uapi
+FEATURE_CHECK_CFLAGS-bpf = -I. -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(SRCARCH)/include/uapi -I$(srctree)/tools/include/uapi
# include ARCH specific config
--include $(src-perf)/arch/$(ARCH)/Makefile
+-include $(src-perf)/arch/$(SRCARCH)/Makefile
ifdef PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET
CFLAGS += -DHAVE_ARCH_REGS_QUERY_REGISTER_OFFSET
@@ -228,12 +228,12 @@ ifeq ($(DEBUG),0)
endif
INC_FLAGS += -I$(src-perf)/util/include
-INC_FLAGS += -I$(src-perf)/arch/$(ARCH)/include
+INC_FLAGS += -I$(src-perf)/arch/$(SRCARCH)/include
INC_FLAGS += -I$(srctree)/tools/include/uapi
INC_FLAGS += -I$(srctree)/tools/include/
-INC_FLAGS += -I$(srctree)/tools/arch/$(ARCH)/include/uapi
-INC_FLAGS += -I$(srctree)/tools/arch/$(ARCH)/include/
-INC_FLAGS += -I$(srctree)/tools/arch/$(ARCH)/
+INC_FLAGS += -I$(srctree)/tools/arch/$(SRCARCH)/include/uapi
+INC_FLAGS += -I$(srctree)/tools/arch/$(SRCARCH)/include/
+INC_FLAGS += -I$(srctree)/tools/arch/$(SRCARCH)/
# $(obj-perf) for generated common-cmds.h
# $(obj-perf)/util for generated bison/flex headers
@@ -355,7 +355,7 @@ ifndef NO_LIBELF
ifndef NO_DWARF
ifeq ($(origin PERF_HAVE_DWARF_REGS), undefined)
- msg := $(warning DWARF register mappings have not been defined for architecture $(ARCH), DWARF support disabled);
+ msg := $(warning DWARF register mappings have not been defined for architecture $(SRCARCH), DWARF support disabled);
NO_DWARF := 1
else
CFLAGS += -DHAVE_DWARF_SUPPORT $(LIBDW_CFLAGS)
@@ -380,7 +380,7 @@ ifndef NO_LIBELF
CFLAGS += -DHAVE_BPF_PROLOGUE
$(call detected,CONFIG_BPF_PROLOGUE)
else
- msg := $(warning BPF prologue is not supported by architecture $(ARCH), missing regs_query_register_offset());
+ msg := $(warning BPF prologue is not supported by architecture $(SRCARCH), missing regs_query_register_offset());
endif
else
msg := $(warning DWARF support is off, BPF prologue is disabled);
@@ -406,7 +406,7 @@ ifdef PERF_HAVE_JITDUMP
endif
endif
-ifeq ($(ARCH),powerpc)
+ifeq ($(SRCARCH),powerpc)
ifndef NO_DWARF
CFLAGS += -DHAVE_SKIP_CALLCHAIN_IDX
endif
@@ -487,7 +487,7 @@ else
endif
ifndef NO_LOCAL_LIBUNWIND
- ifeq ($(ARCH),$(filter $(ARCH),arm arm64))
+ ifeq ($(SRCARCH),$(filter $(SRCARCH),arm arm64))
$(call feature_check,libunwind-debug-frame)
ifneq ($(feature-libunwind-debug-frame), 1)
msg := $(warning No debug_frame support found in libunwind);
@@ -740,7 +740,7 @@ ifeq (${IS_64_BIT}, 1)
NO_PERF_READ_VDSO32 := 1
endif
endif
- ifneq ($(ARCH), x86)
+ ifneq ($(SRCARCH), x86)
NO_PERF_READ_VDSOX32 := 1
endif
ifndef NO_PERF_READ_VDSOX32
@@ -769,7 +769,7 @@ ifdef LIBBABELTRACE
endif
ifndef NO_AUXTRACE
- ifeq ($(ARCH),x86)
+ ifeq ($(SRCARCH),x86)
ifeq ($(feature-get_cpuid), 0)
msg := $(warning Your gcc lacks the __get_cpuid() builtin, disables support for auxtrace/Intel PT, please install a newer gcc);
NO_AUXTRACE := 1
@@ -872,7 +872,7 @@ sysconfdir = $(prefix)/etc
ETC_PERFCONFIG = etc/perfconfig
endif
ifndef lib
-ifeq ($(ARCH)$(IS_64_BIT), x861)
+ifeq ($(SRCARCH)$(IS_64_BIT), x861)
lib = lib64
else
lib = lib
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 79fe31f20a17..5008f51a08a2 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -226,7 +226,7 @@ endif
ifeq ($(config),0)
include $(srctree)/tools/scripts/Makefile.arch
--include arch/$(ARCH)/Makefile
+-include arch/$(SRCARCH)/Makefile
endif
# The FEATURE_DUMP_EXPORT holds location of the actual
diff --git a/tools/perf/arch/Build b/tools/perf/arch/Build
index 109eb75cf7de..d9b6af837c7d 100644
--- a/tools/perf/arch/Build
+++ b/tools/perf/arch/Build
@@ -1,2 +1,2 @@
libperf-y += common.o
-libperf-y += $(ARCH)/
+libperf-y += $(SRCARCH)/
diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c
index 29361d9b635a..7ce3d1a25133 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -17,6 +17,7 @@
#include <api/fs/fs.h>
#include <linux/bitops.h>
+#include <linux/compiler.h>
#include <linux/coresight-pmu.h>
#include <linux/kernel.h>
#include <linux/log2.h>
@@ -202,19 +203,18 @@ static int cs_etm_recording_options(struct auxtrace_record *itr,
pr_debug2("%s snapshot size: %zu\n", CORESIGHT_ETM_PMU_NAME,
opts->auxtrace_snapshot_size);
- if (cs_etm_evsel) {
- /*
- * To obtain the auxtrace buffer file descriptor, the auxtrace
- * event must come first.
- */
- perf_evlist__to_front(evlist, cs_etm_evsel);
- /*
- * In the case of per-cpu mmaps, we need the CPU on the
- * AUX event.
- */
- if (!cpu_map__empty(cpus))
- perf_evsel__set_sample_bit(cs_etm_evsel, CPU);
- }
+ /*
+ * To obtain the auxtrace buffer file descriptor, the auxtrace
+ * event must come first.
+ */
+ perf_evlist__to_front(evlist, cs_etm_evsel);
+
+ /*
+ * In the case of per-cpu mmaps, we need the CPU on the
+ * AUX event.
+ */
+ if (!cpu_map__empty(cpus))
+ perf_evsel__set_sample_bit(cs_etm_evsel, CPU);
/* Add dummy event to keep tracking */
if (opts->full_auxtrace) {
@@ -583,8 +583,7 @@ static FILE *cs_device__open_file(const char *name)
}
-static __attribute__((format(printf, 2, 3)))
-int cs_device__print_file(const char *name, const char *fmt, ...)
+static int __printf(2, 3) cs_device__print_file(const char *name, const char *fmt, ...)
{
va_list args;
FILE *file;
diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build
index 90ad64b231cd..2e6595310420 100644
--- a/tools/perf/arch/powerpc/util/Build
+++ b/tools/perf/arch/powerpc/util/Build
@@ -5,4 +5,6 @@ libperf-y += perf_regs.o
libperf-$(CONFIG_DWARF) += dwarf-regs.o
libperf-$(CONFIG_DWARF) += skip-callchain-idx.o
+
libperf-$(CONFIG_LIBUNWIND) += unwind-libunwind.o
+libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
diff --git a/tools/perf/arch/powerpc/util/unwind-libdw.c b/tools/perf/arch/powerpc/util/unwind-libdw.c
new file mode 100644
index 000000000000..3a24b3c43273
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/unwind-libdw.c
@@ -0,0 +1,73 @@
+#include <elfutils/libdwfl.h>
+#include "../../util/unwind-libdw.h"
+#include "../../util/perf_regs.h"
+#include "../../util/event.h"
+
+/* See backends/ppc_initreg.c and backends/ppc_regs.c in elfutils. */
+static const int special_regs[3][2] = {
+ { 65, PERF_REG_POWERPC_LINK },
+ { 101, PERF_REG_POWERPC_XER },
+ { 109, PERF_REG_POWERPC_CTR },
+};
+
+bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
+{
+ struct unwind_info *ui = arg;
+ struct regs_dump *user_regs = &ui->sample->user_regs;
+ Dwarf_Word dwarf_regs[32], dwarf_nip;
+ size_t i;
+
+#define REG(r) ({ \
+ Dwarf_Word val = 0; \
+ perf_reg_value(&val, user_regs, PERF_REG_POWERPC_##r); \
+ val; \
+})
+
+ dwarf_regs[0] = REG(R0);
+ dwarf_regs[1] = REG(R1);
+ dwarf_regs[2] = REG(R2);
+ dwarf_regs[3] = REG(R3);
+ dwarf_regs[4] = REG(R4);
+ dwarf_regs[5] = REG(R5);
+ dwarf_regs[6] = REG(R6);
+ dwarf_regs[7] = REG(R7);
+ dwarf_regs[8] = REG(R8);
+ dwarf_regs[9] = REG(R9);
+ dwarf_regs[10] = REG(R10);
+ dwarf_regs[11] = REG(R11);
+ dwarf_regs[12] = REG(R12);
+ dwarf_regs[13] = REG(R13);
+ dwarf_regs[14] = REG(R14);
+ dwarf_regs[15] = REG(R15);
+ dwarf_regs[16] = REG(R16);
+ dwarf_regs[17] = REG(R17);
+ dwarf_regs[18] = REG(R18);
+ dwarf_regs[19] = REG(R19);
+ dwarf_regs[20] = REG(R20);
+ dwarf_regs[21] = REG(R21);
+ dwarf_regs[22] = REG(R22);
+ dwarf_regs[23] = REG(R23);
+ dwarf_regs[24] = REG(R24);
+ dwarf_regs[25] = REG(R25);
+ dwarf_regs[26] = REG(R26);
+ dwarf_regs[27] = REG(R27);
+ dwarf_regs[28] = REG(R28);
+ dwarf_regs[29] = REG(R29);
+ dwarf_regs[30] = REG(R30);
+ dwarf_regs[31] = REG(R31);
+ if (!dwfl_thread_state_registers(thread, 0, 32, dwarf_regs))
+ return false;
+
+ dwarf_nip = REG(NIP);
+ dwfl_thread_state_register_pc(thread, dwarf_nip);
+ for (i = 0; i < ARRAY_SIZE(special_regs); i++) {
+ Dwarf_Word val = 0;
+ perf_reg_value(&val, user_regs, special_regs[i][1]);
+ if (!dwfl_thread_state_registers(thread,
+ special_regs[i][0], 1,
+ &val))
+ return false;
+ }
+
+ return true;
+}
diff --git a/tools/perf/arch/x86/tests/insn-x86-dat-32.c b/tools/perf/arch/x86/tests/insn-x86-dat-32.c
index 0f196eec9f48..3cbf6fad169f 100644
--- a/tools/perf/arch/x86/tests/insn-x86-dat-32.c
+++ b/tools/perf/arch/x86/tests/insn-x86-dat-32.c
@@ -1664,3 +1664,15 @@
"0f c7 1d 78 56 34 12 \txrstors 0x12345678",},
{{0x0f, 0xc7, 0x9c, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "",
"0f c7 9c c8 78 56 34 12 \txrstors 0x12345678(%eax,%ecx,8)",},
+{{0xf3, 0x0f, 0xae, 0x20, }, 4, 0, "", "",
+"f3 0f ae 20 \tptwritel (%eax)",},
+{{0xf3, 0x0f, 0xae, 0x25, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "",
+"f3 0f ae 25 78 56 34 12 \tptwritel 0x12345678",},
+{{0xf3, 0x0f, 0xae, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "",
+"f3 0f ae a4 c8 78 56 34 12 \tptwritel 0x12345678(%eax,%ecx,8)",},
+{{0xf3, 0x0f, 0xae, 0x20, }, 4, 0, "", "",
+"f3 0f ae 20 \tptwritel (%eax)",},
+{{0xf3, 0x0f, 0xae, 0x25, 0x78, 0x56, 0x34, 0x12, }, 8, 0, "", "",
+"f3 0f ae 25 78 56 34 12 \tptwritel 0x12345678",},
+{{0xf3, 0x0f, 0xae, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "",
+"f3 0f ae a4 c8 78 56 34 12 \tptwritel 0x12345678(%eax,%ecx,8)",},
diff --git a/tools/perf/arch/x86/tests/insn-x86-dat-64.c b/tools/perf/arch/x86/tests/insn-x86-dat-64.c
index af25bc8240d0..aa512fa944dd 100644
--- a/tools/perf/arch/x86/tests/insn-x86-dat-64.c
+++ b/tools/perf/arch/x86/tests/insn-x86-dat-64.c
@@ -1696,3 +1696,33 @@
"0f c7 9c c8 78 56 34 12 \txrstors 0x12345678(%rax,%rcx,8)",},
{{0x41, 0x0f, 0xc7, 0x9c, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "",
"41 0f c7 9c c8 78 56 34 12 \txrstors 0x12345678(%r8,%rcx,8)",},
+{{0xf3, 0x0f, 0xae, 0x20, }, 4, 0, "", "",
+"f3 0f ae 20 \tptwritel (%rax)",},
+{{0xf3, 0x41, 0x0f, 0xae, 0x20, }, 5, 0, "", "",
+"f3 41 0f ae 20 \tptwritel (%r8)",},
+{{0xf3, 0x0f, 0xae, 0x24, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "",
+"f3 0f ae 24 25 78 56 34 12 \tptwritel 0x12345678",},
+{{0xf3, 0x0f, 0xae, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "",
+"f3 0f ae a4 c8 78 56 34 12 \tptwritel 0x12345678(%rax,%rcx,8)",},
+{{0xf3, 0x41, 0x0f, 0xae, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "",
+"f3 41 0f ae a4 c8 78 56 34 12 \tptwritel 0x12345678(%r8,%rcx,8)",},
+{{0xf3, 0x0f, 0xae, 0x20, }, 4, 0, "", "",
+"f3 0f ae 20 \tptwritel (%rax)",},
+{{0xf3, 0x41, 0x0f, 0xae, 0x20, }, 5, 0, "", "",
+"f3 41 0f ae 20 \tptwritel (%r8)",},
+{{0xf3, 0x0f, 0xae, 0x24, 0x25, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "",
+"f3 0f ae 24 25 78 56 34 12 \tptwritel 0x12345678",},
+{{0xf3, 0x0f, 0xae, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 9, 0, "", "",
+"f3 0f ae a4 c8 78 56 34 12 \tptwritel 0x12345678(%rax,%rcx,8)",},
+{{0xf3, 0x41, 0x0f, 0xae, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "",
+"f3 41 0f ae a4 c8 78 56 34 12 \tptwritel 0x12345678(%r8,%rcx,8)",},
+{{0xf3, 0x48, 0x0f, 0xae, 0x20, }, 5, 0, "", "",
+"f3 48 0f ae 20 \tptwriteq (%rax)",},
+{{0xf3, 0x49, 0x0f, 0xae, 0x20, }, 5, 0, "", "",
+"f3 49 0f ae 20 \tptwriteq (%r8)",},
+{{0xf3, 0x48, 0x0f, 0xae, 0x24, 0x25, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "",
+"f3 48 0f ae 24 25 78 56 34 12 \tptwriteq 0x12345678",},
+{{0xf3, 0x48, 0x0f, 0xae, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "",
+"f3 48 0f ae a4 c8 78 56 34 12 \tptwriteq 0x12345678(%rax,%rcx,8)",},
+{{0xf3, 0x49, 0x0f, 0xae, 0xa4, 0xc8, 0x78, 0x56, 0x34, 0x12, }, 10, 0, "", "",
+"f3 49 0f ae a4 c8 78 56 34 12 \tptwriteq 0x12345678(%r8,%rcx,8)",},
diff --git a/tools/perf/arch/x86/tests/insn-x86-dat-src.c b/tools/perf/arch/x86/tests/insn-x86-dat-src.c
index 979487dae8d4..6cdb65d25b79 100644
--- a/tools/perf/arch/x86/tests/insn-x86-dat-src.c
+++ b/tools/perf/arch/x86/tests/insn-x86-dat-src.c
@@ -1343,6 +1343,26 @@ int main(void)
asm volatile("xrstors 0x12345678(%rax,%rcx,8)");
asm volatile("xrstors 0x12345678(%r8,%rcx,8)");
+ /* ptwrite */
+
+ asm volatile("ptwrite (%rax)");
+ asm volatile("ptwrite (%r8)");
+ asm volatile("ptwrite (0x12345678)");
+ asm volatile("ptwrite 0x12345678(%rax,%rcx,8)");
+ asm volatile("ptwrite 0x12345678(%r8,%rcx,8)");
+
+ asm volatile("ptwritel (%rax)");
+ asm volatile("ptwritel (%r8)");
+ asm volatile("ptwritel (0x12345678)");
+ asm volatile("ptwritel 0x12345678(%rax,%rcx,8)");
+ asm volatile("ptwritel 0x12345678(%r8,%rcx,8)");
+
+ asm volatile("ptwriteq (%rax)");
+ asm volatile("ptwriteq (%r8)");
+ asm volatile("ptwriteq (0x12345678)");
+ asm volatile("ptwriteq 0x12345678(%rax,%rcx,8)");
+ asm volatile("ptwriteq 0x12345678(%r8,%rcx,8)");
+
#else /* #ifdef __x86_64__ */
/* bound r32, mem (same op code as EVEX prefix) */
@@ -2653,6 +2673,16 @@ int main(void)
asm volatile("xrstors (0x12345678)");
asm volatile("xrstors 0x12345678(%eax,%ecx,8)");
+ /* ptwrite */
+
+ asm volatile("ptwrite (%eax)");
+ asm volatile("ptwrite (0x12345678)");
+ asm volatile("ptwrite 0x12345678(%eax,%ecx,8)");
+
+ asm volatile("ptwritel (%eax)");
+ asm volatile("ptwritel (0x12345678)");
+ asm volatile("ptwritel 0x12345678(%eax,%ecx,8)");
+
#endif /* #ifndef __x86_64__ */
/* Following line is a marker for the awk script - do not change */
diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c
index af2bce7a2cd6..781df40b2966 100644
--- a/tools/perf/arch/x86/util/intel-bts.c
+++ b/tools/perf/arch/x86/util/intel-bts.c
@@ -35,10 +35,6 @@
#define KiB_MASK(x) (KiB(x) - 1)
#define MiB_MASK(x) (MiB(x) - 1)
-#define INTEL_BTS_DFLT_SAMPLE_SIZE KiB(4)
-
-#define INTEL_BTS_MAX_SAMPLE_SIZE KiB(60)
-
struct intel_bts_snapshot_ref {
void *ref_buf;
size_t ref_offset;
diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c
index f630de0206a1..9535be57033f 100644
--- a/tools/perf/arch/x86/util/intel-pt.c
+++ b/tools/perf/arch/x86/util/intel-pt.c
@@ -40,10 +40,6 @@
#define KiB_MASK(x) (KiB(x) - 1)
#define MiB_MASK(x) (MiB(x) - 1)
-#define INTEL_PT_DEFAULT_SAMPLE_SIZE KiB(4)
-
-#define INTEL_PT_MAX_SAMPLE_SIZE KiB(60)
-
#define INTEL_PT_PSB_PERIOD_NEAR 256
struct intel_pt_snapshot_ref {
@@ -196,6 +192,7 @@ static u64 intel_pt_default_config(struct perf_pmu *intel_pt_pmu)
int psb_cyc, psb_periods, psb_period;
int pos = 0;
u64 config;
+ char c;
pos += scnprintf(buf + pos, sizeof(buf) - pos, "tsc");
@@ -229,6 +226,10 @@ static u64 intel_pt_default_config(struct perf_pmu *intel_pt_pmu)
}
}
+ if (perf_pmu__scan_file(intel_pt_pmu, "format/pt", "%c", &c) == 1 &&
+ perf_pmu__scan_file(intel_pt_pmu, "format/branch", "%c", &c) == 1)
+ pos += scnprintf(buf + pos, sizeof(buf) - pos, ",pt,branch");
+
pr_debug2("%s default config: %s\n", intel_pt_pmu->name, buf);
intel_pt_parse_terms(&intel_pt_pmu->format, buf, &config);
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index 27de0c8c5c19..469d65b21122 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -700,7 +700,7 @@ static inline uint32_t lfsr_32(uint32_t lfsr)
* kernel (KSM, zero page, etc.) cannot optimize away RAM
* accesses:
*/
-static inline u64 access_data(u64 *data __attribute__((unused)), u64 val)
+static inline u64 access_data(u64 *data, u64 val)
{
if (g->p.data_reads)
val += *data;
diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
index 620a467ee304..475999e48f66 100644
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@@ -1725,10 +1725,10 @@ static int c2c_hists__init_sort(struct perf_hpp_list *hpp_list, char *name)
tok; tok = strtok_r(NULL, ", ", &tmp)) { \
ret = _fn(hpp_list, tok); \
if (ret == -EINVAL) { \
- error("Invalid --fields key: `%s'", tok); \
+ pr_err("Invalid --fields key: `%s'", tok); \
break; \
} else if (ret == -ESRCH) { \
- error("Unknown --fields key: `%s'", tok); \
+ pr_err("Unknown --fields key: `%s'", tok); \
break; \
} \
} \
diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c
index 80668fa7556e..ece45582a48d 100644
--- a/tools/perf/builtin-config.c
+++ b/tools/perf/builtin-config.c
@@ -156,7 +156,7 @@ static int parse_config_arg(char *arg, char **var, char **value)
int cmd_config(int argc, const char **argv)
{
- int i, ret = 0;
+ int i, ret = -1;
struct perf_config_set *set;
char *user_config = mkpath("%s/.perfconfig", getenv("HOME"));
const char *config_filename;
@@ -186,10 +186,8 @@ int cmd_config(int argc, const char **argv)
* because of reinitializing with options config file location.
*/
set = perf_config_set__new();
- if (!set) {
- ret = -1;
+ if (!set)
goto out_err;
- }
switch (actions) {
case ACTION_LIST:
@@ -197,41 +195,54 @@ int cmd_config(int argc, const char **argv)
pr_err("Error: takes no arguments\n");
parse_options_usage(config_usage, config_options, "l", 1);
} else {
- ret = show_config(set);
- if (ret < 0)
+ if (show_config(set) < 0) {
pr_err("Nothing configured, "
"please check your %s \n", config_filename);
+ goto out_err;
+ }
}
break;
default:
- if (argc) {
- for (i = 0; argv[i]; i++) {
- char *var, *value;
- char *arg = strdup(argv[i]);
-
- if (!arg) {
- pr_err("%s: strdup failed\n", __func__);
- ret = -1;
- break;
- }
+ if (!argc) {
+ usage_with_options(config_usage, config_options);
+ break;
+ }
- if (parse_config_arg(arg, &var, &value) < 0) {
- free(arg);
- ret = -1;
- break;
- }
+ for (i = 0; argv[i]; i++) {
+ char *var, *value;
+ char *arg = strdup(argv[i]);
+
+ if (!arg) {
+ pr_err("%s: strdup failed\n", __func__);
+ goto out_err;
+ }
- if (value == NULL)
- ret = show_spec_config(set, var);
- else
- ret = set_config(set, config_filename, var, value);
+ if (parse_config_arg(arg, &var, &value) < 0) {
free(arg);
+ goto out_err;
}
- } else
- usage_with_options(config_usage, config_options);
+
+ if (value == NULL) {
+ if (show_spec_config(set, var) < 0) {
+ pr_err("%s is not configured: %s\n",
+ var, config_filename);
+ free(arg);
+ goto out_err;
+ }
+ } else {
+ if (set_config(set, config_filename, var, value) < 0) {
+ pr_err("Failed to set '%s=%s' on %s\n",
+ var, value, config_filename);
+ free(arg);
+ goto out_err;
+ }
+ }
+ free(arg);
+ }
}
- perf_config_set__delete(set);
+ ret = 0;
out_err:
+ perf_config_set__delete(set);
return ret;
}
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index eec5df80f5a3..0cd4cf6a344b 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -1302,7 +1302,10 @@ static int diff__config(const char *var, const char *value,
void *cb __maybe_unused)
{
if (!strcmp(var, "diff.order")) {
- sort_compute = perf_config_int(var, value);
+ int ret;
+ if (perf_config_int(&ret, var, value) < 0)
+ return -1;
+ sort_compute = ret;
return 0;
}
if (!strcmp(var, "diff.compute")) {
diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
index 9e0b35cd0eea..dd26c62c9893 100644
--- a/tools/perf/builtin-ftrace.c
+++ b/tools/perf/builtin-ftrace.c
@@ -28,9 +28,19 @@
#define DEFAULT_TRACER "function_graph"
struct perf_ftrace {
- struct perf_evlist *evlist;
- struct target target;
- const char *tracer;
+ struct perf_evlist *evlist;
+ struct target target;
+ const char *tracer;
+ struct list_head filters;
+ struct list_head notrace;
+ struct list_head graph_funcs;
+ struct list_head nograph_funcs;
+ int graph_depth;
+};
+
+struct filter_entry {
+ struct list_head list;
+ char name[];
};
static bool done;
@@ -61,6 +71,7 @@ static int __write_tracing_file(const char *name, const char *val, bool append)
int fd, ret = -1;
ssize_t size = strlen(val);
int flags = O_WRONLY;
+ char errbuf[512];
file = get_tracing_file(name);
if (!file) {
@@ -75,14 +86,16 @@ static int __write_tracing_file(const char *name, const char *val, bool append)
fd = open(file, flags);
if (fd < 0) {
- pr_debug("cannot open tracing file: %s\n", name);
+ pr_debug("cannot open tracing file: %s: %s\n",
+ name, str_error_r(errno, errbuf, sizeof(errbuf)));
goto out;
}
if (write(fd, val, size) == size)
ret = 0;
else
- pr_debug("write '%s' to tracing/%s failed\n", val, name);
+ pr_debug("write '%s' to tracing/%s failed: %s\n",
+ val, name, str_error_r(errno, errbuf, sizeof(errbuf)));
close(fd);
out:
@@ -101,6 +114,7 @@ static int append_tracing_file(const char *name, const char *val)
}
static int reset_tracing_cpu(void);
+static void reset_tracing_filters(void);
static int reset_tracing_files(struct perf_ftrace *ftrace __maybe_unused)
{
@@ -116,6 +130,10 @@ static int reset_tracing_files(struct perf_ftrace *ftrace __maybe_unused)
if (reset_tracing_cpu() < 0)
return -1;
+ if (write_tracing_file("max_graph_depth", "0") < 0)
+ return -1;
+
+ reset_tracing_filters();
return 0;
}
@@ -181,6 +199,68 @@ static int reset_tracing_cpu(void)
return ret;
}
+static int __set_tracing_filter(const char *filter_file, struct list_head *funcs)
+{
+ struct filter_entry *pos;
+
+ list_for_each_entry(pos, funcs, list) {
+ if (append_tracing_file(filter_file, pos->name) < 0)
+ return -1;
+ }
+
+ return 0;
+}
+
+static int set_tracing_filters(struct perf_ftrace *ftrace)
+{
+ int ret;
+
+ ret = __set_tracing_filter("set_ftrace_filter", &ftrace->filters);
+ if (ret < 0)
+ return ret;
+
+ ret = __set_tracing_filter("set_ftrace_notrace", &ftrace->notrace);
+ if (ret < 0)
+ return ret;
+
+ ret = __set_tracing_filter("set_graph_function", &ftrace->graph_funcs);
+ if (ret < 0)
+ return ret;
+
+ /* old kernels do not have this filter */
+ __set_tracing_filter("set_graph_notrace", &ftrace->nograph_funcs);
+
+ return ret;
+}
+
+static void reset_tracing_filters(void)
+{
+ write_tracing_file("set_ftrace_filter", " ");
+ write_tracing_file("set_ftrace_notrace", " ");
+ write_tracing_file("set_graph_function", " ");
+ write_tracing_file("set_graph_notrace", " ");
+}
+
+static int set_tracing_depth(struct perf_ftrace *ftrace)
+{
+ char buf[16];
+
+ if (ftrace->graph_depth == 0)
+ return 0;
+
+ if (ftrace->graph_depth < 0) {
+ pr_err("invalid graph depth: %d\n", ftrace->graph_depth);
+ return -1;
+ }
+
+ snprintf(buf, sizeof(buf), "%d", ftrace->graph_depth);
+
+ if (write_tracing_file("max_graph_depth", buf) < 0)
+ return -1;
+
+ return 0;
+}
+
static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
{
char *trace_file;
@@ -223,11 +303,23 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
goto out_reset;
}
+ if (set_tracing_filters(ftrace) < 0) {
+ pr_err("failed to set tracing filters\n");
+ goto out_reset;
+ }
+
+ if (set_tracing_depth(ftrace) < 0) {
+ pr_err("failed to set graph depth\n");
+ goto out_reset;
+ }
+
if (write_tracing_file("current_tracer", ftrace->tracer) < 0) {
pr_err("failed to set current_tracer to %s\n", ftrace->tracer);
goto out_reset;
}
+ setup_pager();
+
trace_file = get_tracing_file("trace_pipe");
if (!trace_file) {
pr_err("failed to open trace_pipe\n");
@@ -251,8 +343,6 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
goto out_close_fd;
}
- setup_pager();
-
perf_evlist__start_workload(ftrace->evlist);
while (!done) {
@@ -307,6 +397,32 @@ static int perf_ftrace_config(const char *var, const char *value, void *cb)
return -1;
}
+static int parse_filter_func(const struct option *opt, const char *str,
+ int unset __maybe_unused)
+{
+ struct list_head *head = opt->value;
+ struct filter_entry *entry;
+
+ entry = malloc(sizeof(*entry) + strlen(str) + 1);
+ if (entry == NULL)
+ return -ENOMEM;
+
+ strcpy(entry->name, str);
+ list_add_tail(&entry->list, head);
+
+ return 0;
+}
+
+static void delete_filter_func(struct list_head *head)
+{
+ struct filter_entry *pos, *tmp;
+
+ list_for_each_entry_safe(pos, tmp, head, list) {
+ list_del(&pos->list);
+ free(pos);
+ }
+}
+
int cmd_ftrace(int argc, const char **argv)
{
int ret;
@@ -330,9 +446,24 @@ int cmd_ftrace(int argc, const char **argv)
"system-wide collection from all CPUs"),
OPT_STRING('C', "cpu", &ftrace.target.cpu_list, "cpu",
"list of cpus to monitor"),
+ OPT_CALLBACK('T', "trace-funcs", &ftrace.filters, "func",
+ "trace given functions only", parse_filter_func),
+ OPT_CALLBACK('N', "notrace-funcs", &ftrace.notrace, "func",
+ "do not trace given functions", parse_filter_func),
+ OPT_CALLBACK('G', "graph-funcs", &ftrace.graph_funcs, "func",
+ "Set graph filter on given functions", parse_filter_func),
+ OPT_CALLBACK('g', "nograph-funcs", &ftrace.nograph_funcs, "func",
+ "Set nograph filter on given functions", parse_filter_func),
+ OPT_INTEGER('D', "graph-depth", &ftrace.graph_depth,
+ "Max depth for function graph tracer"),
OPT_END()
};
+ INIT_LIST_HEAD(&ftrace.filters);
+ INIT_LIST_HEAD(&ftrace.notrace);
+ INIT_LIST_HEAD(&ftrace.graph_funcs);
+ INIT_LIST_HEAD(&ftrace.nograph_funcs);
+
ret = perf_config(perf_ftrace_config, &ftrace);
if (ret < 0)
return -1;
@@ -348,12 +479,14 @@ int cmd_ftrace(int argc, const char **argv)
target__strerror(&ftrace.target, ret, errbuf, 512);
pr_err("%s\n", errbuf);
- return -EINVAL;
+ goto out_delete_filters;
}
ftrace.evlist = perf_evlist__new();
- if (ftrace.evlist == NULL)
- return -ENOMEM;
+ if (ftrace.evlist == NULL) {
+ ret = -ENOMEM;
+ goto out_delete_filters;
+ }
ret = perf_evlist__create_maps(ftrace.evlist, &ftrace.target);
if (ret < 0)
@@ -364,5 +497,11 @@ int cmd_ftrace(int argc, const char **argv)
out_delete_evlist:
perf_evlist__delete(ftrace.evlist);
+out_delete_filters:
+ delete_filter_func(&ftrace.filters);
+ delete_filter_func(&ftrace.notrace);
+ delete_filter_func(&ftrace.graph_funcs);
+ delete_filter_func(&ftrace.nograph_funcs);
+
return ret;
}
diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c
index 492f8e14ab09..530a7f2fa0f3 100644
--- a/tools/perf/builtin-help.c
+++ b/tools/perf/builtin-help.c
@@ -108,10 +108,14 @@ out:
return ret;
}
-static void exec_woman_emacs(const char *path, const char *page)
+static void exec_failed(const char *cmd)
{
char sbuf[STRERR_BUFSIZE];
+ pr_warning("failed to exec '%s': %s", cmd, str_error_r(errno, sbuf, sizeof(sbuf)));
+}
+static void exec_woman_emacs(const char *path, const char *page)
+{
if (!check_emacsclient_version()) {
/* This works only with emacsclient version >= 22. */
char *man_page;
@@ -122,8 +126,7 @@ static void exec_woman_emacs(const char *path, const char *page)
execlp(path, "emacsclient", "-e", man_page, NULL);
free(man_page);
}
- warning("failed to exec '%s': %s", path,
- str_error_r(errno, sbuf, sizeof(sbuf)));
+ exec_failed(path);
}
}
@@ -134,7 +137,6 @@ static void exec_man_konqueror(const char *path, const char *page)
if (display && *display) {
char *man_page;
const char *filename = "kfmclient";
- char sbuf[STRERR_BUFSIZE];
/* It's simpler to launch konqueror using kfmclient. */
if (path) {
@@ -155,33 +157,27 @@ static void exec_man_konqueror(const char *path, const char *page)
execlp(path, filename, "newTab", man_page, NULL);
free(man_page);
}
- warning("failed to exec '%s': %s", path,
- str_error_r(errno, sbuf, sizeof(sbuf)));
+ exec_failed(path);
}
}
static void exec_man_man(const char *path, const char *page)
{
- char sbuf[STRERR_BUFSIZE];
-
if (!path)
path = "man";
execlp(path, "man", page, NULL);
- warning("failed to exec '%s': %s", path,
- str_error_r(errno, sbuf, sizeof(sbuf)));
+ exec_failed(path);
}
static void exec_man_cmd(const char *cmd, const char *page)
{
- char sbuf[STRERR_BUFSIZE];
char *shell_cmd;
if (asprintf(&shell_cmd, "%s %s", cmd, page) > 0) {
execl("/bin/sh", "sh", "-c", shell_cmd, NULL);
free(shell_cmd);
}
- warning("failed to exec '%s': %s", cmd,
- str_error_r(errno, sbuf, sizeof(sbuf)));
+ exec_failed(cmd);
}
static void add_man_viewer(const char *name)
@@ -214,6 +210,12 @@ static void do_add_man_viewer_info(const char *name,
man_viewer_info_list = new;
}
+static void unsupported_man_viewer(const char *name, const char *var)
+{
+ pr_warning("'%s': path for unsupported man viewer.\n"
+ "Please consider using 'man.<tool>.%s' instead.", name, var);
+}
+
static int add_man_viewer_path(const char *name,
size_t len,
const char *value)
@@ -221,9 +223,7 @@ static int add_man_viewer_path(const char *name,
if (supported_man_viewer(name, len))
do_add_man_viewer_info(name, len, value);
else
- warning("'%s': path for unsupported man viewer.\n"
- "Please consider using 'man.<tool>.cmd' instead.",
- name);
+ unsupported_man_viewer(name, "cmd");
return 0;
}
@@ -233,9 +233,7 @@ static int add_man_viewer_cmd(const char *name,
const char *value)
{
if (supported_man_viewer(name, len))
- warning("'%s': cmd for supported man viewer.\n"
- "Please consider using 'man.<tool>.path' instead.",
- name);
+ unsupported_man_viewer(name, "path");
else
do_add_man_viewer_info(name, len, value);
@@ -247,8 +245,10 @@ static int add_man_viewer_info(const char *var, const char *value)
const char *name = var + 4;
const char *subkey = strrchr(name, '.');
- if (!subkey)
- return error("Config with no key for man viewer: %s", name);
+ if (!subkey) {
+ pr_err("Config with no key for man viewer: %s", name);
+ return -1;
+ }
if (!strcmp(subkey, ".path")) {
if (!value)
@@ -261,7 +261,7 @@ static int add_man_viewer_info(const char *var, const char *value)
return add_man_viewer_cmd(name, subkey - name, value);
}
- warning("'%s': unsupported man viewer sub key.", subkey);
+ pr_warning("'%s': unsupported man viewer sub key.", subkey);
return 0;
}
@@ -332,7 +332,7 @@ static void setup_man_path(void)
setenv("MANPATH", new_path, 1);
free(new_path);
} else {
- error("Unable to setup man path");
+ pr_err("Unable to setup man path");
}
}
@@ -349,7 +349,7 @@ static void exec_viewer(const char *name, const char *page)
else if (info)
exec_man_cmd(info, page);
else
- warning("'%s': unknown man viewer.", name);
+ pr_warning("'%s': unknown man viewer.", name);
}
static int show_man_page(const char *perf_cmd)
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 9409c9464667..0a8a1c45af87 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -1715,7 +1715,7 @@ static int setup_slab_sorting(struct list_head *sort_list, const char *arg)
if (!tok)
break;
if (slab_sort_dimension__add(tok, sort_list) < 0) {
- error("Unknown slab --sort key: '%s'", tok);
+ pr_err("Unknown slab --sort key: '%s'", tok);
free(str);
return -1;
}
@@ -1741,7 +1741,7 @@ static int setup_page_sorting(struct list_head *sort_list, const char *arg)
if (!tok)
break;
if (page_sort_dimension__add(tok, sort_list) < 0) {
- error("Unknown page --sort key: '%s'", tok);
+ pr_err("Unknown page --sort key: '%s'", tok);
free(str);
return -1;
}
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index ee7d0a82ccd0..17a14bcce34a 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -453,7 +453,7 @@ try_again:
}
if (perf_evlist__apply_filters(evlist, &pos)) {
- error("failed to set filter \"%s\" on event %s with %d (%s)\n",
+ pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
pos->filter, perf_evsel__name(pos), errno,
str_error_r(errno, msg, sizeof(msg)));
rc = -1;
@@ -461,7 +461,7 @@ try_again:
}
if (perf_evlist__apply_drv_configs(evlist, &pos, &err_term)) {
- error("failed to set config \"%s\" on event %s with %d (%s)\n",
+ pr_err("failed to set config \"%s\" on event %s with %d (%s)\n",
err_term->val.drv_cfg, perf_evsel__name(pos), errno,
str_error_r(errno, msg, sizeof(msg)));
rc = -1;
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 22478ff2b706..79a33eb1a10d 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -94,10 +94,9 @@ static int report__config(const char *var, const char *value, void *cb)
symbol_conf.cumulate_callchain = perf_config_bool(var, value);
return 0;
}
- if (!strcmp(var, "report.queue-size")) {
- rep->queue_size = perf_config_u64(var, value);
- return 0;
- }
+ if (!strcmp(var, "report.queue-size"))
+ return perf_config_u64(&rep->queue_size, var, value);
+
if (!strcmp(var, "report.sort_order")) {
default_sort_order = strdup(value);
return 0;
@@ -558,6 +557,7 @@ static int __cmd_report(struct report *rep)
ui__error("failed to set cpu bitmap\n");
return ret;
}
+ session->itrace_synth_opts->cpu_bitmap = rep->cpu_bitmap;
}
if (rep->show_threads) {
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 39996c53995a..322b4def8411 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -2066,7 +2066,7 @@ static void save_task_callchain(struct perf_sched *sched,
if (thread__resolve_callchain(thread, cursor, evsel, sample,
NULL, NULL, sched->max_stack + 2) != 0) {
if (verbose > 0)
- error("Failed to resolve callchain. Skipping\n");
+ pr_err("Failed to resolve callchain. Skipping\n");
return;
}
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 4761b0d7fcb5..83cdc0a61fd6 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -85,6 +85,8 @@ enum perf_output_field {
PERF_OUTPUT_INSN = 1U << 21,
PERF_OUTPUT_INSNLEN = 1U << 22,
PERF_OUTPUT_BRSTACKINSN = 1U << 23,
+ PERF_OUTPUT_BRSTACKOFF = 1U << 24,
+ PERF_OUTPUT_SYNTH = 1U << 25,
};
struct output_option {
@@ -115,6 +117,13 @@ struct output_option {
{.str = "insn", .field = PERF_OUTPUT_INSN},
{.str = "insnlen", .field = PERF_OUTPUT_INSNLEN},
{.str = "brstackinsn", .field = PERF_OUTPUT_BRSTACKINSN},
+ {.str = "brstackoff", .field = PERF_OUTPUT_BRSTACKOFF},
+ {.str = "synth", .field = PERF_OUTPUT_SYNTH},
+};
+
+enum {
+ OUTPUT_TYPE_SYNTH = PERF_TYPE_MAX,
+ OUTPUT_TYPE_MAX
};
/* default set to maintain compatibility with current format */
@@ -124,7 +133,7 @@ static struct {
unsigned int print_ip_opts;
u64 fields;
u64 invalid_fields;
-} output[PERF_TYPE_MAX] = {
+} output[OUTPUT_TYPE_MAX] = {
[PERF_TYPE_HARDWARE] = {
.user_set = false,
@@ -182,12 +191,44 @@ static struct {
.invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT,
},
+
+ [OUTPUT_TYPE_SYNTH] = {
+ .user_set = false,
+
+ .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID |
+ PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
+ PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP |
+ PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
+ PERF_OUTPUT_SYNTH,
+
+ .invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT,
+ },
};
+static inline int output_type(unsigned int type)
+{
+ switch (type) {
+ case PERF_TYPE_SYNTH:
+ return OUTPUT_TYPE_SYNTH;
+ default:
+ return type;
+ }
+}
+
+static inline unsigned int attr_type(unsigned int type)
+{
+ switch (type) {
+ case OUTPUT_TYPE_SYNTH:
+ return PERF_TYPE_SYNTH;
+ default:
+ return type;
+ }
+}
+
static bool output_set_by_user(void)
{
int j;
- for (j = 0; j < PERF_TYPE_MAX; ++j) {
+ for (j = 0; j < OUTPUT_TYPE_MAX; ++j) {
if (output[j].user_set)
return true;
}
@@ -208,7 +249,7 @@ static const char *output_field2str(enum perf_output_field field)
return str;
}
-#define PRINT_FIELD(x) (output[attr->type].fields & PERF_OUTPUT_##x)
+#define PRINT_FIELD(x) (output[output_type(attr->type)].fields & PERF_OUTPUT_##x)
static int perf_evsel__do_check_stype(struct perf_evsel *evsel,
u64 sample_type, const char *sample_msg,
@@ -216,7 +257,7 @@ static int perf_evsel__do_check_stype(struct perf_evsel *evsel,
bool allow_user_set)
{
struct perf_event_attr *attr = &evsel->attr;
- int type = attr->type;
+ int type = output_type(attr->type);
const char *evname;
if (attr->sample_type & sample_type)
@@ -298,10 +339,10 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
"selected.\n");
return -EINVAL;
}
- if (PRINT_FIELD(DSO) && !PRINT_FIELD(IP) && !PRINT_FIELD(ADDR)) {
- pr_err("Display of DSO requested but neither sample IP nor "
- "sample address\nis selected. Hence, no addresses to convert "
- "to DSO.\n");
+ if (PRINT_FIELD(DSO) && !PRINT_FIELD(IP) && !PRINT_FIELD(ADDR) &&
+ !PRINT_FIELD(BRSTACK) && !PRINT_FIELD(BRSTACKSYM) && !PRINT_FIELD(BRSTACKOFF)) {
+ pr_err("Display of DSO requested but no address to convert. Select\n"
+ "sample IP, sample address, brstack, brstacksym, or brstackoff.\n");
return -EINVAL;
}
if (PRINT_FIELD(SRCLINE) && !PRINT_FIELD(IP)) {
@@ -346,7 +387,7 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
static void set_print_ip_opts(struct perf_event_attr *attr)
{
- unsigned int type = attr->type;
+ unsigned int type = output_type(attr->type);
output[type].print_ip_opts = 0;
if (PRINT_FIELD(IP))
@@ -374,16 +415,17 @@ static int perf_session__check_output_opt(struct perf_session *session)
unsigned int j;
struct perf_evsel *evsel;
- for (j = 0; j < PERF_TYPE_MAX; ++j) {
- evsel = perf_session__find_first_evtype(session, j);
+ for (j = 0; j < OUTPUT_TYPE_MAX; ++j) {
+ evsel = perf_session__find_first_evtype(session, attr_type(j));
/*
* even if fields is set to 0 (ie., show nothing) event must
* exist if user explicitly includes it on the command line
*/
- if (!evsel && output[j].user_set && !output[j].wildcard_set) {
+ if (!evsel && output[j].user_set && !output[j].wildcard_set &&
+ j != OUTPUT_TYPE_SYNTH) {
pr_err("%s events do not exist. "
- "Remove corresponding -f option to proceed.\n",
+ "Remove corresponding -F option to proceed.\n",
event_type(j));
return -1;
}
@@ -514,18 +556,43 @@ mispred_str(struct branch_entry *br)
return br->flags.predicted ? 'P' : 'M';
}
-static void print_sample_brstack(struct perf_sample *sample)
+static void print_sample_brstack(struct perf_sample *sample,
+ struct thread *thread,
+ struct perf_event_attr *attr)
{
struct branch_stack *br = sample->branch_stack;
- u64 i;
+ struct addr_location alf, alt;
+ u64 i, from, to;
if (!(br && br->nr))
return;
for (i = 0; i < br->nr; i++) {
- printf(" 0x%"PRIx64"/0x%"PRIx64"/%c/%c/%c/%d ",
- br->entries[i].from,
- br->entries[i].to,
+ from = br->entries[i].from;
+ to = br->entries[i].to;
+
+ if (PRINT_FIELD(DSO)) {
+ memset(&alf, 0, sizeof(alf));
+ memset(&alt, 0, sizeof(alt));
+ thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, from, &alf);
+ thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, to, &alt);
+ }
+
+ printf("0x%"PRIx64, from);
+ if (PRINT_FIELD(DSO)) {
+ printf("(");
+ map__fprintf_dsoname(alf.map, stdout);
+ printf(")");
+ }
+
+ printf("/0x%"PRIx64, to);
+ if (PRINT_FIELD(DSO)) {
+ printf("(");
+ map__fprintf_dsoname(alt.map, stdout);
+ printf(")");
+ }
+
+ printf("/%c/%c/%c/%d ",
mispred_str( br->entries + i),
br->entries[i].flags.in_tx? 'X' : '-',
br->entries[i].flags.abort? 'A' : '-',
@@ -534,7 +601,8 @@ static void print_sample_brstack(struct perf_sample *sample)
}
static void print_sample_brstacksym(struct perf_sample *sample,
- struct thread *thread)
+ struct thread *thread,
+ struct perf_event_attr *attr)
{
struct branch_stack *br = sample->branch_stack;
struct addr_location alf, alt;
@@ -559,8 +627,18 @@ static void print_sample_brstacksym(struct perf_sample *sample,
alt.sym = map__find_symbol(alt.map, alt.addr);
symbol__fprintf_symname_offs(alf.sym, &alf, stdout);
+ if (PRINT_FIELD(DSO)) {
+ printf("(");
+ map__fprintf_dsoname(alf.map, stdout);
+ printf(")");
+ }
putchar('/');
symbol__fprintf_symname_offs(alt.sym, &alt, stdout);
+ if (PRINT_FIELD(DSO)) {
+ printf("(");
+ map__fprintf_dsoname(alt.map, stdout);
+ printf(")");
+ }
printf("/%c/%c/%c/%d ",
mispred_str( br->entries + i),
br->entries[i].flags.in_tx? 'X' : '-',
@@ -569,6 +647,51 @@ static void print_sample_brstacksym(struct perf_sample *sample,
}
}
+static void print_sample_brstackoff(struct perf_sample *sample,
+ struct thread *thread,
+ struct perf_event_attr *attr)
+{
+ struct branch_stack *br = sample->branch_stack;
+ struct addr_location alf, alt;
+ u64 i, from, to;
+
+ if (!(br && br->nr))
+ return;
+
+ for (i = 0; i < br->nr; i++) {
+
+ memset(&alf, 0, sizeof(alf));
+ memset(&alt, 0, sizeof(alt));
+ from = br->entries[i].from;
+ to = br->entries[i].to;
+
+ thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, from, &alf);
+ if (alf.map && !alf.map->dso->adjust_symbols)
+ from = map__map_ip(alf.map, from);
+
+ thread__find_addr_map(thread, sample->cpumode, MAP__FUNCTION, to, &alt);
+ if (alt.map && !alt.map->dso->adjust_symbols)
+ to = map__map_ip(alt.map, to);
+
+ printf("0x%"PRIx64, from);
+ if (PRINT_FIELD(DSO)) {
+ printf("(");
+ map__fprintf_dsoname(alf.map, stdout);
+ printf(")");
+ }
+ printf("/0x%"PRIx64, to);
+ if (PRINT_FIELD(DSO)) {
+ printf("(");
+ map__fprintf_dsoname(alt.map, stdout);
+ printf(")");
+ }
+ printf("/%c/%c/%c/%d ",
+ mispred_str(br->entries + i),
+ br->entries[i].flags.in_tx ? 'X' : '-',
+ br->entries[i].flags.abort ? 'A' : '-',
+ br->entries[i].flags.cycles);
+ }
+}
#define MAXBB 16384UL
static int grab_bb(u8 *buffer, u64 start, u64 end,
@@ -906,6 +1029,7 @@ static void print_sample_bts(struct perf_sample *sample,
struct machine *machine)
{
struct perf_event_attr *attr = &evsel->attr;
+ unsigned int type = output_type(attr->type);
bool print_srcline_last = false;
if (PRINT_FIELD(CALLINDENT))
@@ -913,7 +1037,7 @@ static void print_sample_bts(struct perf_sample *sample,
/* print branch_from information */
if (PRINT_FIELD(IP)) {
- unsigned int print_opts = output[attr->type].print_ip_opts;
+ unsigned int print_opts = output[type].print_ip_opts;
struct callchain_cursor *cursor = NULL;
if (symbol_conf.use_callchain && sample->callchain &&
@@ -936,7 +1060,7 @@ static void print_sample_bts(struct perf_sample *sample,
/* print branch_to information */
if (PRINT_FIELD(ADDR) ||
((evsel->attr.sample_type & PERF_SAMPLE_ADDR) &&
- !output[attr->type].user_set)) {
+ !output[type].user_set)) {
printf(" => ");
print_sample_addr(sample, thread, attr);
}
@@ -1079,6 +1203,127 @@ static void print_sample_bpf_output(struct perf_sample *sample)
(char *)(sample->raw_data));
}
+static void print_sample_spacing(int len, int spacing)
+{
+ if (len > 0 && len < spacing)
+ printf("%*s", spacing - len, "");
+}
+
+static void print_sample_pt_spacing(int len)
+{
+ print_sample_spacing(len, 34);
+}
+
+static void print_sample_synth_ptwrite(struct perf_sample *sample)
+{
+ struct perf_synth_intel_ptwrite *data = perf_sample__synth_ptr(sample);
+ int len;
+
+ if (perf_sample__bad_synth_size(sample, *data))
+ return;
+
+ len = printf(" IP: %u payload: %#" PRIx64 " ",
+ data->ip, le64_to_cpu(data->payload));
+ print_sample_pt_spacing(len);
+}
+
+static void print_sample_synth_mwait(struct perf_sample *sample)
+{
+ struct perf_synth_intel_mwait *data = perf_sample__synth_ptr(sample);
+ int len;
+
+ if (perf_sample__bad_synth_size(sample, *data))
+ return;
+
+ len = printf(" hints: %#x extensions: %#x ",
+ data->hints, data->extensions);
+ print_sample_pt_spacing(len);
+}
+
+static void print_sample_synth_pwre(struct perf_sample *sample)
+{
+ struct perf_synth_intel_pwre *data = perf_sample__synth_ptr(sample);
+ int len;
+
+ if (perf_sample__bad_synth_size(sample, *data))
+ return;
+
+ len = printf(" hw: %u cstate: %u sub-cstate: %u ",
+ data->hw, data->cstate, data->subcstate);
+ print_sample_pt_spacing(len);
+}
+
+static void print_sample_synth_exstop(struct perf_sample *sample)
+{
+ struct perf_synth_intel_exstop *data = perf_sample__synth_ptr(sample);
+ int len;
+
+ if (perf_sample__bad_synth_size(sample, *data))
+ return;
+
+ len = printf(" IP: %u ", data->ip);
+ print_sample_pt_spacing(len);
+}
+
+static void print_sample_synth_pwrx(struct perf_sample *sample)
+{
+ struct perf_synth_intel_pwrx *data = perf_sample__synth_ptr(sample);
+ int len;
+
+ if (perf_sample__bad_synth_size(sample, *data))
+ return;
+
+ len = printf(" deepest cstate: %u last cstate: %u wake reason: %#x ",
+ data->deepest_cstate, data->last_cstate,
+ data->wake_reason);
+ print_sample_pt_spacing(len);
+}
+
+static void print_sample_synth_cbr(struct perf_sample *sample)
+{
+ struct perf_synth_intel_cbr *data = perf_sample__synth_ptr(sample);
+ unsigned int percent, freq;
+ int len;
+
+ if (perf_sample__bad_synth_size(sample, *data))
+ return;
+
+ freq = (le32_to_cpu(data->freq) + 500) / 1000;
+ len = printf(" cbr: %2u freq: %4u MHz ", data->cbr, freq);
+ if (data->max_nonturbo) {
+ percent = (5 + (1000 * data->cbr) / data->max_nonturbo) / 10;
+ len += printf("(%3u%%) ", percent);
+ }
+ print_sample_pt_spacing(len);
+}
+
+static void print_sample_synth(struct perf_sample *sample,
+ struct perf_evsel *evsel)
+{
+ switch (evsel->attr.config) {
+ case PERF_SYNTH_INTEL_PTWRITE:
+ print_sample_synth_ptwrite(sample);
+ break;
+ case PERF_SYNTH_INTEL_MWAIT:
+ print_sample_synth_mwait(sample);
+ break;
+ case PERF_SYNTH_INTEL_PWRE:
+ print_sample_synth_pwre(sample);
+ break;
+ case PERF_SYNTH_INTEL_EXSTOP:
+ print_sample_synth_exstop(sample);
+ break;
+ case PERF_SYNTH_INTEL_PWRX:
+ print_sample_synth_pwrx(sample);
+ break;
+ case PERF_SYNTH_INTEL_CBR:
+ print_sample_synth_cbr(sample);
+ break;
+ default:
+ break;
+ }
+}
+
struct perf_script {
struct perf_tool tool;
struct perf_session *session;
@@ -1132,8 +1377,9 @@ static void process_event(struct perf_script *script,
{
struct thread *thread = al->thread;
struct perf_event_attr *attr = &evsel->attr;
+ unsigned int type = output_type(attr->type);
- if (output[attr->type].fields == 0)
+ if (output[type].fields == 0)
return;
print_sample_start(sample, thread, evsel);
@@ -1162,6 +1408,10 @@ static void process_event(struct perf_script *script,
if (PRINT_FIELD(TRACE))
event_format__print(evsel->tp_format, sample->cpu,
sample->raw_data, sample->raw_size);
+
+ if (attr->type == PERF_TYPE_SYNTH && PRINT_FIELD(SYNTH))
+ print_sample_synth(sample, evsel);
+
if (PRINT_FIELD(ADDR))
print_sample_addr(sample, thread, attr);
@@ -1180,16 +1430,18 @@ static void process_event(struct perf_script *script,
cursor = &callchain_cursor;
putchar(cursor ? '\n' : ' ');
- sample__fprintf_sym(sample, al, 0, output[attr->type].print_ip_opts, cursor, stdout);
+ sample__fprintf_sym(sample, al, 0, output[type].print_ip_opts, cursor, stdout);
}
if (PRINT_FIELD(IREGS))
print_sample_iregs(sample, attr);
if (PRINT_FIELD(BRSTACK))
- print_sample_brstack(sample);
+ print_sample_brstack(sample, thread, attr);
else if (PRINT_FIELD(BRSTACKSYM))
- print_sample_brstacksym(sample, thread);
+ print_sample_brstacksym(sample, thread, attr);
+ else if (PRINT_FIELD(BRSTACKOFF))
+ print_sample_brstackoff(sample, thread, attr);
if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
print_sample_bpf_output(sample);
@@ -1325,7 +1577,8 @@ static int process_attr(struct perf_tool *tool, union perf_event *event,
evlist = *pevlist;
evsel = perf_evlist__last(*pevlist);
- if (evsel->attr.type >= PERF_TYPE_MAX)
+ if (evsel->attr.type >= PERF_TYPE_MAX &&
+ evsel->attr.type != PERF_TYPE_SYNTH)
return 0;
evlist__for_each_entry(evlist, pos) {
@@ -1727,6 +1980,7 @@ static int parse_output_fields(const struct option *opt __maybe_unused,
int rc = 0;
char *str = strdup(arg);
int type = -1;
+ enum { DEFAULT, SET, ADD, REMOVE } change = DEFAULT;
if (!str)
return -ENOMEM;
@@ -1749,6 +2003,8 @@ static int parse_output_fields(const struct option *opt __maybe_unused,
type = PERF_TYPE_RAW;
else if (!strcmp(str, "break"))
type = PERF_TYPE_BREAKPOINT;
+ else if (!strcmp(str, "synth"))
+ type = OUTPUT_TYPE_SYNTH;
else {
fprintf(stderr, "Invalid event type in field string.\n");
rc = -EINVAL;
@@ -1772,23 +2028,44 @@ static int parse_output_fields(const struct option *opt __maybe_unused,
goto out;
}
+ /* Don't override defaults for +- */
+ if (strchr(str, '+') || strchr(str, '-'))
+ goto parse;
+
if (output_set_by_user())
pr_warning("Overriding previous field request for all events.\n");
- for (j = 0; j < PERF_TYPE_MAX; ++j) {
+ for (j = 0; j < OUTPUT_TYPE_MAX; ++j) {
output[j].fields = 0;
output[j].user_set = true;
output[j].wildcard_set = true;
}
}
+parse:
for (tok = strtok_r(tok, ",", &strtok_saveptr); tok; tok = strtok_r(NULL, ",", &strtok_saveptr)) {
+ if (*tok == '+') {
+ if (change == SET)
+ goto out_badmix;
+ change = ADD;
+ tok++;
+ } else if (*tok == '-') {
+ if (change == SET)
+ goto out_badmix;
+ change = REMOVE;
+ tok++;
+ } else {
+ if (change != SET && change != DEFAULT)
+ goto out_badmix;
+ change = SET;
+ }
+
for (i = 0; i < imax; ++i) {
if (strcmp(tok, all_output_options[i].str) == 0)
break;
}
if (i == imax && strcmp(tok, "flags") == 0) {
- print_flags = true;
+ print_flags = change == REMOVE ? false : true;
continue;
}
if (i == imax) {
@@ -1801,12 +2078,16 @@ static int parse_output_fields(const struct option *opt __maybe_unused,
/* add user option to all events types for
* which it is valid
*/
- for (j = 0; j < PERF_TYPE_MAX; ++j) {
+ for (j = 0; j < OUTPUT_TYPE_MAX; ++j) {
if (output[j].invalid_fields & all_output_options[i].field) {
pr_warning("\'%s\' not valid for %s events. Ignoring.\n",
all_output_options[i].str, event_type(j));
- } else
- output[j].fields |= all_output_options[i].field;
+ } else {
+ if (change == REMOVE)
+ output[j].fields &= ~all_output_options[i].field;
+ else
+ output[j].fields |= all_output_options[i].field;
+ }
}
} else {
if (output[type].invalid_fields & all_output_options[i].field) {
@@ -1826,7 +2107,11 @@ static int parse_output_fields(const struct option *opt __maybe_unused,
"Events will not be displayed.\n", event_type(type));
}
}
+ goto out;
+out_badmix:
+ fprintf(stderr, "Cannot mix +-field with overridden fields\n");
+ rc = -EINVAL;
out:
free(str);
return rc;
@@ -2444,10 +2729,11 @@ int cmd_script(int argc, const char **argv)
symbol__config_symfs),
OPT_CALLBACK('F', "fields", NULL, "str",
"comma separated output fields prepend with 'type:'. "
- "Valid types: hw,sw,trace,raw. "
+ "+field to add and -field to remove."
+ "Valid types: hw,sw,trace,raw,synth. "
"Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
"addr,symoff,period,iregs,brstack,brstacksym,flags,"
- "bpf-output,callindent,insn,insnlen,brstackinsn",
+ "bpf-output,callindent,insn,insnlen,brstackinsn,synth",
parse_output_fields),
OPT_BOOLEAN('a', "all-cpus", &system_wide,
"system-wide collection from all CPUs"),
@@ -2706,6 +2992,7 @@ int cmd_script(int argc, const char **argv)
err = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap);
if (err < 0)
goto out_delete;
+ itrace_synth_opts.cpu_bitmap = cpu_bitmap;
}
if (!no_callchain)
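Note on the new -F syntax wired up above: a leading '+' or '-' on a field name adds it to or removes it from the current selection (the defaults, or whatever was set earlier) instead of replacing it, e.g. '-F +brstackoff,-cpu'; mixing plain field names with '+'/'-' in the same string is rejected with "Cannot mix +-field with overridden fields". 'synth' doubles as a new event type ('-F synth:...') and as a field that prints the synthesized Intel PT payload decoded by print_sample_synth().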
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index ad9324d1daf9..48ac53b199fc 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -86,6 +86,7 @@
#define DEFAULT_SEPARATOR " "
#define CNTR_NOT_SUPPORTED "<not supported>"
#define CNTR_NOT_COUNTED "<not counted>"
+#define FREEZE_ON_SMI_PATH "devices/cpu/freeze_on_smi"
static void print_counters(struct timespec *ts, int argc, const char **argv);
@@ -122,6 +123,14 @@ static const char * topdown_attrs[] = {
NULL,
};
+static const char *smi_cost_attrs = {
+ "{"
+ "msr/aperf/,"
+ "msr/smi/,"
+ "cycles"
+ "}"
+};
+
static struct perf_evlist *evsel_list;
static struct target target = {
@@ -137,6 +146,8 @@ static bool null_run = false;
static int detailed_run = 0;
static bool transaction_run;
static bool topdown_run = false;
+static bool smi_cost = false;
+static bool smi_reset = false;
static bool big_num = true;
static int big_num_opt = -1;
static const char *csv_sep = NULL;
@@ -625,14 +636,14 @@ try_again:
}
if (perf_evlist__apply_filters(evsel_list, &counter)) {
- error("failed to set filter \"%s\" on event %s with %d (%s)\n",
+ pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
counter->filter, perf_evsel__name(counter), errno,
str_error_r(errno, msg, sizeof(msg)));
return -1;
}
if (perf_evlist__apply_drv_configs(evsel_list, &counter, &err_term)) {
- error("failed to set config \"%s\" on event %s with %d (%s)\n",
+ pr_err("failed to set config \"%s\" on event %s with %d (%s)\n",
err_term->val.drv_cfg, perf_evsel__name(counter), errno,
str_error_r(errno, msg, sizeof(msg)));
return -1;
@@ -1782,6 +1793,8 @@ static const struct option stat_options[] = {
"Only print computed metrics. No raw values", enable_metric_only),
OPT_BOOLEAN(0, "topdown", &topdown_run,
"measure topdown level 1 statistics"),
+ OPT_BOOLEAN(0, "smi-cost", &smi_cost,
+ "measure SMI cost"),
OPT_END()
};
@@ -2160,6 +2173,39 @@ static int add_default_attributes(void)
return 0;
}
+ if (smi_cost) {
+ int smi;
+
+ if (sysfs__read_int(FREEZE_ON_SMI_PATH, &smi) < 0) {
+ fprintf(stderr, "freeze_on_smi is not supported.\n");
+ return -1;
+ }
+
+ if (!smi) {
+ if (sysfs__write_int(FREEZE_ON_SMI_PATH, 1) < 0) {
+ fprintf(stderr, "Failed to set freeze_on_smi.\n");
+ return -1;
+ }
+ smi_reset = true;
+ }
+
+ if (pmu_have_event("msr", "aperf") &&
+ pmu_have_event("msr", "smi")) {
+ if (!force_metric_only)
+ metric_only = true;
+ err = parse_events(evsel_list, smi_cost_attrs, NULL);
+ } else {
+ fprintf(stderr, "To measure SMI cost, it needs "
+ "msr/aperf/, msr/smi/ and cpu/cycles/ support\n");
+ return -1;
+ }
+ if (err) {
+ fprintf(stderr, "Cannot set up SMI cost events\n");
+ return -1;
+ }
+ return 0;
+ }
+
if (topdown_run) {
char *str = NULL;
bool warn = false;
@@ -2742,6 +2788,9 @@ int cmd_stat(int argc, const char **argv)
perf_stat__exit_aggr_mode();
perf_evlist__free_stats(evsel_list);
out:
+ if (smi_cost && smi_reset)
+ sysfs__write_int(FREEZE_ON_SMI_PATH, 0);
+
perf_evlist__delete(evsel_list);
return status;
}
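The --smi-cost hunks above only set up the event group and the freeze_on_smi toggle; the percentage that perf stat prints is derived from the collected counts elsewhere (in the stat-shadow code, which this diff does not touch). A hedged sketch of that arithmetic, assuming freeze_on_smi is 1 so that 'cycles' stops counting while the CPU is in SMM:

    /*
     * Hypothetical helper, not part of this patch: estimate the share of
     * reference cycles (msr/aperf/) spent handling SMIs.
     */
    static double smi_cycles_pct(double aperf, double cycles, double smi_num)
    {
            if (!smi_num || !aperf)
                    return 0.0;
            /* with freeze_on_smi=1, 'cycles' excludes time spent in SMM */
            return 100.0 * (aperf - cycles) / aperf;
    }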
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 10b6362ca0bf..6052376634c0 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -134,7 +134,7 @@ static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he)
return err;
}
- err = symbol__disassemble(sym, map, NULL, 0);
+ err = symbol__disassemble(sym, map, NULL, 0, NULL);
if (err == 0) {
out_assign:
top->sym_filter_entry = he;
@@ -958,7 +958,7 @@ static int __cmd_top(struct perf_top *top)
ret = perf_evlist__apply_drv_configs(evlist, &pos, &err_term);
if (ret) {
- error("failed to set config \"%s\" on event %s with %d (%s)\n",
+ pr_err("failed to set config \"%s\" on event %s with %d (%s)\n",
err_term->val.drv_cfg, perf_evsel__name(pos), errno,
str_error_r(errno, msg, sizeof(msg)));
goto out_delete;
diff --git a/tools/perf/jvmti/jvmti_agent.c b/tools/perf/jvmti/jvmti_agent.c
index e9651a9d670e..cf36de7ea255 100644
--- a/tools/perf/jvmti/jvmti_agent.c
+++ b/tools/perf/jvmti/jvmti_agent.c
@@ -304,7 +304,7 @@ jvmti_close(void *agent)
FILE *fp = agent;
if (!fp) {
- warnx("jvmti: incalid fd in close_agent");
+ warnx("jvmti: invalid fd in close_agent");
return -1;
}
diff --git a/tools/perf/jvmti/jvmti_agent.h b/tools/perf/jvmti/jvmti_agent.h
index bedf5d0ba9ff..c53a41f48b63 100644
--- a/tools/perf/jvmti/jvmti_agent.h
+++ b/tools/perf/jvmti/jvmti_agent.h
@@ -5,8 +5,6 @@
#include <stdint.h>
#include <jvmti.h>
-#define __unused __attribute__((unused))
-
#if defined(__cplusplus)
extern "C" {
#endif
diff --git a/tools/perf/jvmti/libjvmti.c b/tools/perf/jvmti/libjvmti.c
index 5612641c69b4..6d710904c837 100644
--- a/tools/perf/jvmti/libjvmti.c
+++ b/tools/perf/jvmti/libjvmti.c
@@ -1,3 +1,4 @@
+#include <linux/compiler.h>
#include <sys/types.h>
#include <stdio.h>
#include <string.h>
@@ -238,7 +239,7 @@ code_generated_cb(jvmtiEnv *jvmti,
}
JNIEXPORT jint JNICALL
-Agent_OnLoad(JavaVM *jvm, char *options, void *reserved __unused)
+Agent_OnLoad(JavaVM *jvm, char *options, void *reserved __maybe_unused)
{
jvmtiEventCallbacks cb;
jvmtiCapabilities caps1;
@@ -313,7 +314,7 @@ Agent_OnLoad(JavaVM *jvm, char *options, void *reserved __unused)
}
JNIEXPORT void JNICALL
-Agent_OnUnload(JavaVM *jvm __unused)
+Agent_OnUnload(JavaVM *jvm __maybe_unused)
{
int ret;
diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build
index 9213a1273697..999a4e878162 100644
--- a/tools/perf/pmu-events/Build
+++ b/tools/perf/pmu-events/Build
@@ -2,7 +2,7 @@ hostprogs := jevents
jevents-y += json.o jsmn.o jevents.o
pmu-events-y += pmu-events.o
-JDIR = pmu-events/arch/$(ARCH)
+JDIR = pmu-events/arch/$(SRCARCH)
JSON = $(shell [ -d $(JDIR) ] && \
find $(JDIR) -name '*.json' -o -name 'mapfile.csv')
#
@@ -10,4 +10,4 @@ JSON = $(shell [ -d $(JDIR) ] && \
# directory and create tables in pmu-events.c.
#
$(OUTPUT)pmu-events/pmu-events.c: $(JSON) $(JEVENTS)
- $(Q)$(call echo-cmd,gen)$(JEVENTS) $(ARCH) pmu-events/arch $(OUTPUT)pmu-events/pmu-events.c $(V)
+ $(Q)$(call echo-cmd,gen)$(JEVENTS) $(SRCARCH) pmu-events/arch $(OUTPUT)pmu-events/pmu-events.c $(V)
diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c
index baa073f38334..bd0aabb2bd0f 100644
--- a/tools/perf/pmu-events/jevents.c
+++ b/tools/perf/pmu-events/jevents.c
@@ -48,10 +48,6 @@
#include "json.h"
#include "jevents.h"
-#ifndef __maybe_unused
-#define __maybe_unused __attribute__((unused))
-#endif
-
int verbose;
char *prog;
diff --git a/tools/perf/scripts/python/bin/intel-pt-events-record b/tools/perf/scripts/python/bin/intel-pt-events-record
new file mode 100644
index 000000000000..10fe2b6977d4
--- /dev/null
+++ b/tools/perf/scripts/python/bin/intel-pt-events-record
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+#
+# print Intel PT Power Events and PTWRITE. The intel_pt PMU event needs
+# to be specified with appropriate config terms.
+#
+if ! echo "$@" | grep -q intel_pt ; then
+ echo "Options must include the Intel PT event e.g. -e intel_pt/pwr_evt,ptw/"
+ echo "and for power events it probably needs to be system wide i.e. -a option"
+ echo "For example: -a -e intel_pt/pwr_evt,branch=0/ sleep 1"
+ exit 1
+fi
+perf record $@
diff --git a/tools/perf/scripts/python/bin/intel-pt-events-report b/tools/perf/scripts/python/bin/intel-pt-events-report
new file mode 100644
index 000000000000..9a9c92fcd026
--- /dev/null
+++ b/tools/perf/scripts/python/bin/intel-pt-events-report
@@ -0,0 +1,3 @@
+#!/bin/bash
+# description: print Intel PT Power Events and PTWRITE
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/intel-pt-events.py
\ No newline at end of file
diff --git a/tools/perf/scripts/python/intel-pt-events.py b/tools/perf/scripts/python/intel-pt-events.py
new file mode 100644
index 000000000000..b19172d673af
--- /dev/null
+++ b/tools/perf/scripts/python/intel-pt-events.py
@@ -0,0 +1,128 @@
+# intel-pt-events.py: Print Intel PT Power Events and PTWRITE
+# Copyright (c) 2017, Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms and conditions of the GNU General Public License,
+# version 2, as published by the Free Software Foundation.
+#
+# This program is distributed in the hope it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+# more details.
+
+import os
+import sys
+import struct
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+ '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+# These perf imports are not used at present
+#from perf_trace_context import *
+#from Core import *
+
+def trace_begin():
+ print "Intel PT Power Events and PTWRITE"
+
+def trace_end():
+ print "End"
+
+def trace_unhandled(event_name, context, event_fields_dict):
+ print ' '.join(['%s=%s' % (k, str(v)) for k, v in sorted(event_fields_dict.items())])
+
+def print_ptwrite(raw_buf):
+ data = struct.unpack_from("<IQ", raw_buf)
+ flags = data[0]
+ payload = data[1]
+ exact_ip = flags & 1
+ print "IP: %u payload: %#x" % (exact_ip, payload),
+
+def print_cbr(raw_buf):
+ data = struct.unpack_from("<BBBBII", raw_buf)
+ cbr = data[0]
+ f = (data[4] + 500) / 1000
+ p = ((cbr * 1000 / data[2]) + 5) / 10
+ print "%3u freq: %4u MHz (%3u%%)" % (cbr, f, p),
+
+def print_mwait(raw_buf):
+ data = struct.unpack_from("<IQ", raw_buf)
+ payload = data[1]
+ hints = payload & 0xff
+ extensions = (payload >> 32) & 0x3
+ print "hints: %#x extensions: %#x" % (hints, extensions),
+
+def print_pwre(raw_buf):
+ data = struct.unpack_from("<IQ", raw_buf)
+ payload = data[1]
+ hw = (payload >> 7) & 1
+ cstate = (payload >> 12) & 0xf
+ subcstate = (payload >> 8) & 0xf
+ print "hw: %u cstate: %u sub-cstate: %u" % (hw, cstate, subcstate),
+
+def print_exstop(raw_buf):
+ data = struct.unpack_from("<I", raw_buf)
+ flags = data[0]
+ exact_ip = flags & 1
+ print "IP: %u" % (exact_ip),
+
+def print_pwrx(raw_buf):
+ data = struct.unpack_from("<IQ", raw_buf)
+ payload = data[1]
+ deepest_cstate = payload & 0xf
+ last_cstate = (payload >> 4) & 0xf
+ wake_reason = (payload >> 8) & 0xf
+ print "deepest cstate: %u last cstate: %u wake reason: %#x" % (deepest_cstate, last_cstate, wake_reason),
+
+def print_common_start(comm, sample, name):
+ ts = sample["time"]
+ cpu = sample["cpu"]
+ pid = sample["pid"]
+ tid = sample["tid"]
+ print "%16s %5u/%-5u [%03u] %9u.%09u %7s:" % (comm, pid, tid, cpu, ts / 1000000000, ts %1000000000, name),
+
+def print_common_ip(sample, symbol, dso):
+ ip = sample["ip"]
+ print "%16x %s (%s)" % (ip, symbol, dso)
+
+def process_event(param_dict):
+ event_attr = param_dict["attr"]
+ sample = param_dict["sample"]
+ raw_buf = param_dict["raw_buf"]
+ comm = param_dict["comm"]
+ name = param_dict["ev_name"]
+
+ # Symbol and dso info are not always resolved
+ if (param_dict.has_key("dso")):
+ dso = param_dict["dso"]
+ else:
+ dso = "[unknown]"
+
+ if (param_dict.has_key("symbol")):
+ symbol = param_dict["symbol"]
+ else:
+ symbol = "[unknown]"
+
+ if name == "ptwrite":
+ print_common_start(comm, sample, name)
+ print_ptwrite(raw_buf)
+ print_common_ip(sample, symbol, dso)
+ elif name == "cbr":
+ print_common_start(comm, sample, name)
+ print_cbr(raw_buf)
+ print_common_ip(sample, symbol, dso)
+ elif name == "mwait":
+ print_common_start(comm, sample, name)
+ print_mwait(raw_buf)
+ print_common_ip(sample, symbol, dso)
+ elif name == "pwre":
+ print_common_start(comm, sample, name)
+ print_pwre(raw_buf)
+ print_common_ip(sample, symbol, dso)
+ elif name == "exstop":
+ print_common_start(comm, sample, name)
+ print_exstop(raw_buf)
+ print_common_ip(sample, symbol, dso)
+ elif name == "pwrx":
+ print_common_start(comm, sample, name)
+ print_pwrx(raw_buf)
+ print_common_ip(sample, symbol, dso)
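The struct.unpack_from() format strings above mirror the raw layouts added to util/event.h later in this patch: the synthesized sample's raw data starts at the 32-bit flags/reserved word (the 4 padding bytes are not part of raw_size), so "<IQ" reads that word followed by the 64-bit payload with no alignment padding, and "<BBBBII" reads the CBR byte fields (cbr, reserved, max_nonturbo, reserved) followed by the 32-bit freq and reserved words.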
diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build
index af58ebc243ef..84222bdb8689 100644
--- a/tools/perf/tests/Build
+++ b/tools/perf/tests/Build
@@ -75,7 +75,7 @@ $(OUTPUT)tests/llvm-src-relocation.c: tests/bpf-script-test-relocation.c tests/B
$(Q)sed -e 's/"/\\"/g' -e 's/\(.*\)/"\1\\n"/g' $< >> $@
$(Q)echo ';' >> $@
-ifeq ($(ARCH),$(filter $(ARCH),x86 arm arm64 powerpc))
+ifeq ($(SRCARCH),$(filter $(SRCARCH),x86 arm arm64 powerpc))
perf-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o
endif
diff --git a/tools/perf/tests/attr.c b/tools/perf/tests/attr.c
index 0dd77494bb58..0e77b2cf61ec 100644
--- a/tools/perf/tests/attr.c
+++ b/tools/perf/tests/attr.c
@@ -18,6 +18,7 @@
* permissions. All the event text files are stored there.
*/
+#include <debug.h>
#include <errno.h>
#include <inttypes.h>
#include <stdlib.h>
@@ -29,14 +30,11 @@
#include <sys/stat.h>
#include <unistd.h>
#include "../perf.h"
-#include "util.h"
#include <subcmd/exec-cmd.h>
#include "tests.h"
#define ENV "PERF_TEST_ATTR"
-extern int verbose;
-
static char *dir;
void test_attr__init(void)
@@ -138,8 +136,10 @@ void test_attr__open(struct perf_event_attr *attr, pid_t pid, int cpu,
{
int errno_saved = errno;
- if (store_event(attr, pid, cpu, fd, group_fd, flags))
- die("test attr FAILED");
+ if (store_event(attr, pid, cpu, fd, group_fd, flags)) {
+ pr_err("test attr FAILED");
+ exit(128);
+ }
errno = errno_saved;
}
diff --git a/tools/perf/tests/attr.py b/tools/perf/tests/attr.py
index 1091bd47adfd..cdf21a9d0c35 100644
--- a/tools/perf/tests/attr.py
+++ b/tools/perf/tests/attr.py
@@ -16,6 +16,13 @@ class Fail(Exception):
def getMsg(self):
return '\'%s\' - %s' % (self.test.path, self.msg)
+class Notest(Exception):
+ def __init__(self, test, arch):
+ self.arch = arch
+ self.test = test
+ def getMsg(self):
+ return '[%s] \'%s\'' % (self.arch, self.test.path)
+
class Unsup(Exception):
def __init__(self, test):
self.test = test
@@ -112,6 +119,9 @@ class Event(dict):
# 'command' - perf command name
# 'args' - special command arguments
# 'ret' - expected command return value (0 by default)
+# 'arch' - architecture specific test (optional)
+# comma separated list, ! at the beginning
+# negates it.
#
# [eventX:base]
# - one or multiple instances in file
@@ -134,6 +144,12 @@ class Test(object):
except:
self.ret = 0
+ try:
+ self.arch = parser.get('config', 'arch')
+ log.warning("test limitation '%s'" % self.arch)
+ except:
+ self.arch = ''
+
self.expect = {}
self.result = {}
log.debug(" loading expected events");
@@ -145,6 +161,31 @@ class Test(object):
else:
return True
+ def skip_test(self, myarch):
+ # If architecture not set always run test
+ if self.arch == '':
+ # log.warning("test for arch %s is ok" % myarch)
+ return False
+
+ # Allow multiple values in assignment separated by ','
+ arch_list = self.arch.split(',')
+
+ # Handle negated list such as !s390x,ppc
+ if arch_list[0][0] == '!':
+ arch_list[0] = arch_list[0][1:]
+ log.warning("excluded architecture list %s" % arch_list)
+ for arch_item in arch_list:
+ # log.warning("test for %s arch is %s" % (arch_item, myarch))
+ if arch_item == myarch:
+ return True
+ return False
+
+ for arch_item in arch_list:
+ # log.warning("test for architecture '%s' current '%s'" % (arch_item, myarch))
+ if arch_item == myarch:
+ return False
+ return True
+
def load_events(self, path, events):
parser_event = ConfigParser.SafeConfigParser()
parser_event.read(path)
@@ -168,6 +209,11 @@ class Test(object):
events[section] = e
def run_cmd(self, tempdir):
+ junk1, junk2, junk3, junk4, myarch = (os.uname())
+
+ if self.skip_test(myarch):
+ raise Notest(self, myarch)
+
cmd = "PERF_TEST_ATTR=%s %s %s -o %s/perf.data %s" % (tempdir,
self.perf, self.command, tempdir, self.args)
ret = os.WEXITSTATUS(os.system(cmd))
@@ -265,6 +311,8 @@ def run_tests(options):
Test(f, options).run()
except Unsup, obj:
log.warning("unsupp %s" % obj.getMsg())
+ except Notest, obj:
+ log.warning("skipped %s" % obj.getMsg())
def setup_log(verbose):
global log
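In practice the new key goes in a test file's [config] section, e.g. 'arch = x86_64' to run the test only on x86_64, or 'arch = !s390x,ppc' to run it everywhere except those architectures; the value is matched against the machine field returned by os.uname().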
diff --git a/tools/perf/tests/bp_signal.c b/tools/perf/tests/bp_signal.c
index 8ba2c4618fe9..39bbb97cd30a 100644
--- a/tools/perf/tests/bp_signal.c
+++ b/tools/perf/tests/bp_signal.c
@@ -62,8 +62,7 @@ static void __test_function(volatile long *ptr)
}
#endif
-__attribute__ ((noinline))
-static int test_function(void)
+static noinline int test_function(void)
{
__test_function(&the_var);
the_var++;
diff --git a/tools/perf/tests/bp_signal_overflow.c b/tools/perf/tests/bp_signal_overflow.c
index 89f92fa67cc4..3b1ac6f31b15 100644
--- a/tools/perf/tests/bp_signal_overflow.c
+++ b/tools/perf/tests/bp_signal_overflow.c
@@ -28,8 +28,7 @@
static int overflows;
-__attribute__ ((noinline))
-static int test_function(void)
+static noinline int test_function(void)
{
return time(NULL);
}
diff --git a/tools/perf/tests/bpf-script-test-prologue.c b/tools/perf/tests/bpf-script-test-prologue.c
index 7230e62c70fc..b4ebc75e25ae 100644
--- a/tools/perf/tests/bpf-script-test-prologue.c
+++ b/tools/perf/tests/bpf-script-test-prologue.c
@@ -10,6 +10,15 @@
#include <uapi/linux/fs.h>
+/*
+ * If CONFIG_PROFILE_ALL_BRANCHES is selected,
+ * 'if' is redefined after including the kernel header.
+ * Recover 'if' for BPF object code.
+ */
+#ifdef if
+# undef if
+#endif
+
#define FMODE_READ 0x1
#define FMODE_WRITE 0x2
diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c
index dfe5c89e2049..3e56d08f7995 100644
--- a/tools/perf/tests/dwarf-unwind.c
+++ b/tools/perf/tests/dwarf-unwind.c
@@ -76,8 +76,7 @@ static int unwind_entry(struct unwind_entry *entry, void *arg)
return strcmp((const char *) symbol, funcs[idx]);
}
-__attribute__ ((noinline))
-static int unwind_thread(struct thread *thread)
+static noinline int unwind_thread(struct thread *thread)
{
struct perf_sample sample;
unsigned long cnt = 0;
@@ -108,8 +107,7 @@ static int unwind_thread(struct thread *thread)
static int global_unwind_retval = -INT_MAX;
-__attribute__ ((noinline))
-static int compare(void *p1, void *p2)
+static noinline int compare(void *p1, void *p2)
{
/* Any possible value should be 'thread' */
struct thread *thread = *(struct thread **)p1;
@@ -128,8 +126,7 @@ static int compare(void *p1, void *p2)
return p1 - p2;
}
-__attribute__ ((noinline))
-static int krava_3(struct thread *thread)
+static noinline int krava_3(struct thread *thread)
{
struct thread *array[2] = {thread, thread};
void *fp = &bsearch;
@@ -147,14 +144,12 @@ static int krava_3(struct thread *thread)
return global_unwind_retval;
}
-__attribute__ ((noinline))
-static int krava_2(struct thread *thread)
+static noinline int krava_2(struct thread *thread)
{
return krava_3(thread);
}
-__attribute__ ((noinline))
-static int krava_1(struct thread *thread)
+static noinline int krava_1(struct thread *thread)
{
return krava_2(thread);
}
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index 7fad885491c5..812a053d1941 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -1810,17 +1810,6 @@ static int test_pmu_events(void)
return ret;
}
-static void debug_warn(const char *warn, va_list params)
-{
- char msg[1024];
-
- if (verbose <= 0)
- return;
-
- vsnprintf(msg, sizeof(msg), warn, params);
- fprintf(stderr, " Warning: %s\n", msg);
-}
-
int test__parse_events(int subtest __maybe_unused)
{
int ret1, ret2 = 0;
@@ -1832,8 +1821,6 @@ do { \
ret2 = ret1; \
} while (0)
- set_warning_routine(debug_warn);
-
TEST_EVENTS(test__events);
if (test_pmu())
diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c
index 32873ec91a4e..cf00ebad2ef5 100644
--- a/tools/perf/tests/task-exit.c
+++ b/tools/perf/tests/task-exit.c
@@ -83,7 +83,7 @@ int test__task_exit(int subtest __maybe_unused)
evsel = perf_evlist__first(evlist);
evsel->attr.task = 1;
- evsel->attr.sample_freq = 0;
+ evsel->attr.sample_freq = 1;
evsel->attr.inherit = 0;
evsel->attr.watermark = 0;
evsel->attr.wakeup_events = 1;
diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index d990ad08a3c6..27f41f28dcb4 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -46,12 +46,15 @@ static struct annotate_browser_opt {
.jump_arrows = true,
};
+struct arch;
+
struct annotate_browser {
struct ui_browser b;
struct rb_root entries;
struct rb_node *curr_hot;
struct disasm_line *selection;
struct disasm_line **offsets;
+ struct arch *arch;
int nr_events;
u64 start;
int nr_asm_entries;
@@ -125,43 +128,57 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int
int i, pcnt_width = annotate_browser__pcnt_width(ab);
double percent_max = 0.0;
char bf[256];
+ bool show_title = false;
for (i = 0; i < ab->nr_events; i++) {
if (bdl->samples[i].percent > percent_max)
percent_max = bdl->samples[i].percent;
}
+ if ((row == 0) && (dl->offset == -1 || percent_max == 0.0)) {
+ if (ab->have_cycles) {
+ if (dl->ipc == 0.0 && dl->cycles == 0)
+ show_title = true;
+ } else
+ show_title = true;
+ }
+
if (dl->offset != -1 && percent_max != 0.0) {
- if (percent_max != 0.0) {
- for (i = 0; i < ab->nr_events; i++) {
- ui_browser__set_percent_color(browser,
- bdl->samples[i].percent,
- current_entry);
- if (annotate_browser__opts.show_total_period) {
- ui_browser__printf(browser, "%6" PRIu64 " ",
- bdl->samples[i].nr);
- } else {
- ui_browser__printf(browser, "%6.2f ",
- bdl->samples[i].percent);
- }
+ for (i = 0; i < ab->nr_events; i++) {
+ ui_browser__set_percent_color(browser,
+ bdl->samples[i].percent,
+ current_entry);
+ if (annotate_browser__opts.show_total_period) {
+ ui_browser__printf(browser, "%6" PRIu64 " ",
+ bdl->samples[i].nr);
+ } else {
+ ui_browser__printf(browser, "%6.2f ",
+ bdl->samples[i].percent);
}
- } else {
- ui_browser__write_nstring(browser, " ", 7 * ab->nr_events);
}
} else {
ui_browser__set_percent_color(browser, 0, current_entry);
- ui_browser__write_nstring(browser, " ", 7 * ab->nr_events);
+
+ if (!show_title)
+ ui_browser__write_nstring(browser, " ", 7 * ab->nr_events);
+ else
+ ui_browser__printf(browser, "%*s", 7, "Percent");
}
if (ab->have_cycles) {
if (dl->ipc)
ui_browser__printf(browser, "%*.2f ", IPC_WIDTH - 1, dl->ipc);
- else
+ else if (!show_title)
ui_browser__write_nstring(browser, " ", IPC_WIDTH);
+ else
+ ui_browser__printf(browser, "%*s ", IPC_WIDTH - 1, "IPC");
+
if (dl->cycles)
ui_browser__printf(browser, "%*" PRIu64 " ",
CYCLES_WIDTH - 1, dl->cycles);
- else
+ else if (!show_title)
ui_browser__write_nstring(browser, " ", CYCLES_WIDTH);
+ else
+ ui_browser__printf(browser, "%*s ", CYCLES_WIDTH - 1, "Cycle");
}
SLsmg_write_char(' ');
@@ -1056,7 +1073,8 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map,
(nr_pcnt - 1);
}
- err = symbol__disassemble(sym, map, perf_evsel__env_arch(evsel), sizeof_bdl);
+ err = symbol__disassemble(sym, map, perf_evsel__env_arch(evsel),
+ sizeof_bdl, &browser.arch);
if (err) {
char msg[BUFSIZ];
symbol__strerror_disassemble(sym, map, err, msg, sizeof(msg));
diff --git a/tools/perf/ui/gtk/annotate.c b/tools/perf/ui/gtk/annotate.c
index e99ba86158d2..d903fd493416 100644
--- a/tools/perf/ui/gtk/annotate.c
+++ b/tools/perf/ui/gtk/annotate.c
@@ -168,7 +168,8 @@ static int symbol__gtk_annotate(struct symbol *sym, struct map *map,
if (map->dso->annotate_warned)
return -1;
- err = symbol__disassemble(sym, map, perf_evsel__env_arch(evsel), 0);
+ err = symbol__disassemble(sym, map, perf_evsel__env_arch(evsel),
+ 0, NULL);
if (err) {
char msg[BUFSIZ];
symbol__strerror_disassemble(sym, map, err, msg, sizeof(msg));
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index ddbd56df9187..be1caabb9290 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -1379,7 +1379,9 @@ static const char *annotate__norm_arch(const char *arch_name)
return normalize_arch((char *)arch_name);
}
-int symbol__disassemble(struct symbol *sym, struct map *map, const char *arch_name, size_t privsize)
+int symbol__disassemble(struct symbol *sym, struct map *map,
+ const char *arch_name, size_t privsize,
+ struct arch **parch)
{
struct dso *dso = map->dso;
char command[PATH_MAX * 2];
@@ -1405,6 +1407,9 @@ int symbol__disassemble(struct symbol *sym, struct map *map, const char *arch_na
if (arch == NULL)
return -ENOTSUP;
+ if (parch)
+ *parch = arch;
+
if (arch->init) {
err = arch->init(arch);
if (err) {
@@ -1901,7 +1906,8 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map,
struct rb_root source_line = RB_ROOT;
u64 len;
- if (symbol__disassemble(sym, map, perf_evsel__env_arch(evsel), 0) < 0)
+ if (symbol__disassemble(sym, map, perf_evsel__env_arch(evsel),
+ 0, NULL) < 0)
return -1;
len = symbol__size(sym);
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 948aa8e6fd39..21055034aedd 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -158,7 +158,9 @@ int hist_entry__inc_addr_samples(struct hist_entry *he, int evidx, u64 addr);
int symbol__alloc_hist(struct symbol *sym);
void symbol__annotate_zero_histograms(struct symbol *sym);
-int symbol__disassemble(struct symbol *sym, struct map *map, const char *arch_name, size_t privsize);
+int symbol__disassemble(struct symbol *sym, struct map *map,
+ const char *arch_name, size_t privsize,
+ struct arch **parch);
enum symbol_disassemble_errno {
SYMBOL_ANNOTATE_ERRNO__SUCCESS = 0,
diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c
index 0daf63b9ee3e..5547457566a7 100644
--- a/tools/perf/util/auxtrace.c
+++ b/tools/perf/util/auxtrace.c
@@ -322,6 +322,13 @@ static int auxtrace_queues__add_event_buffer(struct auxtrace_queues *queues,
return auxtrace_queues__add_buffer(queues, idx, buffer);
}
+static bool filter_cpu(struct perf_session *session, int cpu)
+{
+ unsigned long *cpu_bitmap = session->itrace_synth_opts->cpu_bitmap;
+
+ return cpu_bitmap && cpu != -1 && !test_bit(cpu, cpu_bitmap);
+}
+
int auxtrace_queues__add_event(struct auxtrace_queues *queues,
struct perf_session *session,
union perf_event *event, off_t data_offset,
@@ -331,6 +338,9 @@ int auxtrace_queues__add_event(struct auxtrace_queues *queues,
unsigned int idx;
int err;
+ if (filter_cpu(session, event->auxtrace.cpu))
+ return 0;
+
buffer = zalloc(sizeof(struct auxtrace_buffer));
if (!buffer)
return -ENOMEM;
@@ -947,6 +957,8 @@ void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts)
synth_opts->instructions = true;
synth_opts->branches = true;
synth_opts->transactions = true;
+ synth_opts->ptwrites = true;
+ synth_opts->pwr_events = true;
synth_opts->errors = true;
synth_opts->period_type = PERF_ITRACE_DEFAULT_PERIOD_TYPE;
synth_opts->period = PERF_ITRACE_DEFAULT_PERIOD;
@@ -1030,6 +1042,12 @@ int itrace_parse_synth_opts(const struct option *opt, const char *str,
case 'x':
synth_opts->transactions = true;
break;
+ case 'w':
+ synth_opts->ptwrites = true;
+ break;
+ case 'p':
+ synth_opts->pwr_events = true;
+ break;
case 'e':
synth_opts->errors = true;
break;
diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h
index 9f0de72d58e2..33b5e6cdf38c 100644
--- a/tools/perf/util/auxtrace.h
+++ b/tools/perf/util/auxtrace.h
@@ -59,6 +59,8 @@ enum itrace_period_type {
* @instructions: whether to synthesize 'instructions' events
* @branches: whether to synthesize 'branches' events
* @transactions: whether to synthesize events for transactions
+ * @ptwrites: whether to synthesize events for ptwrites
+ * @pwr_events: whether to synthesize power events
* @errors: whether to synthesize decoder error events
* @dont_decode: whether to skip decoding entirely
* @log: write a decoding log
@@ -72,6 +74,7 @@ enum itrace_period_type {
* @period: 'instructions' events period
* @period_type: 'instructions' events period type
* @initial_skip: skip N events at the beginning.
+ * @cpu_bitmap: CPUs for which to synthesize events, or NULL for all
*/
struct itrace_synth_opts {
bool set;
@@ -79,6 +82,8 @@ struct itrace_synth_opts {
bool instructions;
bool branches;
bool transactions;
+ bool ptwrites;
+ bool pwr_events;
bool errors;
bool dont_decode;
bool log;
@@ -92,6 +97,7 @@ struct itrace_synth_opts {
unsigned long long period;
enum itrace_period_type period_type;
unsigned long initial_skip;
+ unsigned long *cpu_bitmap;
};
/**
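Taken together, the auxtrace changes mean --itrace gains 'w' (synthesize ptwrite events) and 'p' (synthesize power events), both part of the default set, and that a --cpu list passed to perf script or perf report is now honoured while queuing AUX trace data: auxtrace_queues__add_event() drops buffers for CPUs whose bit is not set in cpu_bitmap.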
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h
index 0328f297a748..0175765c05b9 100644
--- a/tools/perf/util/cache.h
+++ b/tools/perf/util/cache.h
@@ -5,6 +5,7 @@
#include <subcmd/pager.h>
#include "../ui/ui.h"
+#include <linux/compiler.h>
#include <linux/string.h>
#define CMD_EXEC_PATH "--exec-path"
@@ -24,6 +25,6 @@ static inline int is_absolute_path(const char *path)
return path[0] == '/';
}
-char *mkpath(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
+char *mkpath(const char *fmt, ...) __printf(1, 2);
#endif /* __PERF_CACHE_H */
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index 8d724f0fa5a8..31a7dea248d0 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -335,32 +335,42 @@ static int perf_parse_long(const char *value, long *ret)
return 0;
}
-static void die_bad_config(const char *name)
+static void bad_config(const char *name)
{
if (config_file_name)
- die("bad config value for '%s' in %s", name, config_file_name);
- die("bad config value for '%s'", name);
+ pr_warning("bad config value for '%s' in %s, ignoring...\n", name, config_file_name);
+ else
+ pr_warning("bad config value for '%s', ignoring...\n", name);
}
-u64 perf_config_u64(const char *name, const char *value)
+int perf_config_u64(u64 *dest, const char *name, const char *value)
{
long long ret = 0;
- if (!perf_parse_llong(value, &ret))
- die_bad_config(name);
- return (u64) ret;
+ if (!perf_parse_llong(value, &ret)) {
+ bad_config(name);
+ return -1;
+ }
+
+ *dest = ret;
+ return 0;
}
-int perf_config_int(const char *name, const char *value)
+int perf_config_int(int *dest, const char *name, const char *value)
{
long ret = 0;
- if (!perf_parse_long(value, &ret))
- die_bad_config(name);
- return ret;
+ if (!perf_parse_long(value, &ret)) {
+ bad_config(name);
+ return -1;
+ }
+ *dest = ret;
+ return 0;
}
static int perf_config_bool_or_int(const char *name, const char *value, int *is_bool)
{
+ int ret;
+
*is_bool = 1;
if (!value)
return 1;
@@ -371,7 +381,7 @@ static int perf_config_bool_or_int(const char *name, const char *value, int *is_
if (!strcasecmp(value, "false") || !strcasecmp(value, "no") || !strcasecmp(value, "off"))
return 0;
*is_bool = 0;
- return perf_config_int(name, value);
+ return perf_config_int(&ret, name, value) < 0 ? -1 : ret;
}
int perf_config_bool(const char *name, const char *value)
@@ -657,8 +667,7 @@ static int perf_config_set__init(struct perf_config_set *set)
user_config = strdup(mkpath("%s/.perfconfig", home));
if (user_config == NULL) {
- warning("Not enough memory to process %s/.perfconfig, "
- "ignoring it.", home);
+ pr_warning("Not enough memory to process %s/.perfconfig, ignoring it.", home);
goto out;
}
@@ -671,8 +680,7 @@ static int perf_config_set__init(struct perf_config_set *set)
ret = 0;
if (st.st_uid && (st.st_uid != geteuid())) {
- warning("File %s not owned by current user or root, "
- "ignoring it.", user_config);
+ pr_warning("File %s not owned by current user or root, ignoring it.", user_config);
goto out_free;
}
@@ -795,7 +803,8 @@ void perf_config_set__delete(struct perf_config_set *set)
*/
int config_error_nonbool(const char *var)
{
- return error("Missing value for '%s'", var);
+ pr_err("Missing value for '%s'", var);
+ return -1;
}
void set_buildid_dir(const char *dir)
diff --git a/tools/perf/util/config.h b/tools/perf/util/config.h
index 1a59a6b43f8b..b6bb11f3f165 100644
--- a/tools/perf/util/config.h
+++ b/tools/perf/util/config.h
@@ -27,8 +27,8 @@ extern const char *config_exclusive_filename;
typedef int (*config_fn_t)(const char *, const char *, void *);
int perf_default_config(const char *, const char *, void *);
int perf_config(config_fn_t fn, void *);
-int perf_config_int(const char *, const char *);
-u64 perf_config_u64(const char *, const char *);
+int perf_config_int(int *dest, const char *, const char *);
+int perf_config_u64(u64 *dest, const char *, const char *);
int perf_config_bool(const char *, const char *);
int config_error_nonbool(const char *);
const char *perf_etc_perfconfig(void);
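The config helpers change calling convention here: perf_config_int() and perf_config_u64() now write the parsed value through a destination pointer and return 0 or -1 instead of die()ing on a malformed value, so callers such as 'return perf_config_int(&autocorrect, var, value);' simply propagate the error and leave the old value untouched.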
diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
index 89d50318833d..3149b70799fd 100644
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -1444,10 +1444,8 @@ static int convert__config(const char *var, const char *value, void *cb)
{
struct convert *c = cb;
- if (!strcmp(var, "convert.queue-size")) {
- c->queue_size = perf_config_u64(var, value);
- return 0;
- }
+ if (!strcmp(var, "convert.queue-size"))
+ return perf_config_u64(&c->queue_size, var, value);
return 0;
}
diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h
index 8a23ea1a71c7..c818bdb1c1ab 100644
--- a/tools/perf/util/debug.h
+++ b/tools/perf/util/debug.h
@@ -4,6 +4,7 @@
#include <stdbool.h>
#include <string.h>
+#include <linux/compiler.h>
#include "event.h"
#include "../ui/helpline.h"
#include "../ui/progress.h"
@@ -40,16 +41,16 @@ extern int debug_data_convert;
#define STRERR_BUFSIZE 128 /* For the buffer size of str_error_r */
-int dump_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
+int dump_printf(const char *fmt, ...) __printf(1, 2);
void trace_event(union perf_event *event);
-int ui__error(const char *format, ...) __attribute__((format(printf, 1, 2)));
-int ui__warning(const char *format, ...) __attribute__((format(printf, 1, 2)));
+int ui__error(const char *format, ...) __printf(1, 2);
+int ui__warning(const char *format, ...) __printf(1, 2);
void pr_stat(const char *fmt, ...);
-int eprintf(int level, int var, const char *fmt, ...) __attribute__((format(printf, 3, 4)));
-int eprintf_time(int level, int var, u64 t, const char *fmt, ...) __attribute__((format(printf, 4, 5)));
+int eprintf(int level, int var, const char *fmt, ...) __printf(3, 4);
+int eprintf_time(int level, int var, u64 t, const char *fmt, ...) __printf(4, 5);
int veprintf(int level, int var, const char *fmt, va_list args);
int perf_debug_option(const char *str);
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 7c3fa1c8cbcd..9967c87af7a6 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -252,6 +252,127 @@ enum auxtrace_error_type {
PERF_AUXTRACE_ERROR_MAX
};
+/* Attribute type for custom synthesized events */
+#define PERF_TYPE_SYNTH (INT_MAX + 1U)
+
+/* Attribute config for custom synthesized events */
+enum perf_synth_id {
+ PERF_SYNTH_INTEL_PTWRITE,
+ PERF_SYNTH_INTEL_MWAIT,
+ PERF_SYNTH_INTEL_PWRE,
+ PERF_SYNTH_INTEL_EXSTOP,
+ PERF_SYNTH_INTEL_PWRX,
+ PERF_SYNTH_INTEL_CBR,
+};
+
+/*
+ * Raw data formats for synthesized events. Note that 4 bytes of padding are
+ * present to match the 'size' member of PERF_SAMPLE_RAW data which is always
+ * 8-byte aligned. That means we must dereference raw_data with an offset of 4.
+ * Refer to perf_sample__synth_ptr() and perf_synth__raw_data(). It also means
+ * the structure sizes are 4 bytes bigger than the raw_size; refer to
+ * perf_synth__raw_size().
+ */
+
+struct perf_synth_intel_ptwrite {
+ u32 padding;
+ union {
+ struct {
+ u32 ip : 1,
+ reserved : 31;
+ };
+ u32 flags;
+ };
+ u64 payload;
+};
+
+struct perf_synth_intel_mwait {
+ u32 padding;
+ u32 reserved;
+ union {
+ struct {
+ u64 hints : 8,
+ reserved1 : 24,
+ extensions : 2,
+ reserved2 : 30;
+ };
+ u64 payload;
+ };
+};
+
+struct perf_synth_intel_pwre {
+ u32 padding;
+ u32 reserved;
+ union {
+ struct {
+ u64 reserved1 : 7,
+ hw : 1,
+ subcstate : 4,
+ cstate : 4,
+ reserved2 : 48;
+ };
+ u64 payload;
+ };
+};
+
+struct perf_synth_intel_exstop {
+ u32 padding;
+ union {
+ struct {
+ u32 ip : 1,
+ reserved : 31;
+ };
+ u32 flags;
+ };
+};
+
+struct perf_synth_intel_pwrx {
+ u32 padding;
+ u32 reserved;
+ union {
+ struct {
+ u64 deepest_cstate : 4,
+ last_cstate : 4,
+ wake_reason : 4,
+ reserved1 : 52;
+ };
+ u64 payload;
+ };
+};
+
+struct perf_synth_intel_cbr {
+ u32 padding;
+ union {
+ struct {
+ u32 cbr : 8,
+ reserved1 : 8,
+ max_nonturbo : 8,
+ reserved2 : 8;
+ };
+ u32 flags;
+ };
+ u32 freq;
+ u32 reserved3;
+};
+
+/*
+ * raw_data is always 4 bytes from an 8-byte boundary, so subtract 4 to get
+ * 8-byte alignment.
+ */
+static inline void *perf_sample__synth_ptr(struct perf_sample *sample)
+{
+ return sample->raw_data - 4;
+}
+
+static inline void *perf_synth__raw_data(void *p)
+{
+ return p + 4;
+}
+
+#define perf_synth__raw_size(d) (sizeof(d) - 4)
+
+#define perf_sample__bad_synth_size(s, d) ((s)->raw_size < sizeof(d) - 4)
+
/*
* The kernel collects the number of events it couldn't send in a stretch and
* when possible sends this number in a PERF_RECORD_LOST event. The number of
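A hedged usage sketch of the helpers above; it mirrors print_sample_synth_ptwrite() from the builtin-script.c hunk earlier in this patch and assumes the struct and macro definitions just added:

    static void dump_ptwrite(struct perf_sample *sample)
    {
            /* raw_data - 4 puts the payload back on an 8-byte boundary */
            struct perf_synth_intel_ptwrite *data = perf_sample__synth_ptr(sample);

            /* raw_size excludes the 4 padding bytes, hence the sizeof() - 4 check */
            if (perf_sample__bad_synth_size(sample, *data))
                    return;

            printf("IP: %u payload: %#" PRIx64 "\n",
                   data->ip, le64_to_cpu(data->payload));
    }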
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 94cea4398a13..8d601fbdd8d6 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -1,6 +1,7 @@
#ifndef __PERF_EVLIST_H
#define __PERF_EVLIST_H 1
+#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/refcount.h>
#include <linux/list.h>
@@ -34,7 +35,7 @@ struct perf_mmap {
refcount_t refcnt;
u64 prev;
struct auxtrace_mmap auxtrace_mmap;
- char event_copy[PERF_SAMPLE_MAX_SIZE] __attribute__((aligned(8)));
+ char event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8);
};
static inline size_t
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index e4f7902d5afa..6f4882f8d61f 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -11,13 +11,17 @@
#include <errno.h>
#include <inttypes.h>
#include <linux/bitops.h>
+#include <api/fs/fs.h>
#include <api/fs/tracing_path.h>
#include <traceevent/event-parse.h>
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
+#include <linux/compiler.h>
#include <linux/err.h>
#include <sys/ioctl.h>
#include <sys/resource.h>
+#include <sys/types.h>
+#include <dirent.h>
#include "asm/bug.h"
#include "callchain.h"
#include "cgroup.h"
@@ -273,8 +277,20 @@ struct perf_evsel *perf_evsel__new_cycles(void)
struct perf_evsel *evsel;
event_attr_init(&attr);
+ /*
+ * Unnamed union members are not supported as named struct member
+ * initializers in older compilers such as gcc 4.4.7.
+ *
+ * Just for probing the precise_ip:
+ */
+ attr.sample_period = 1;
perf_event_attr__set_max_precise_ip(&attr);
+ /*
+ * Now let the usual logic that sets up the perf_event_attr defaults
+ * kick in when we return and before perf_evsel__open() is called.
+ */
+ attr.sample_period = 0;
evsel = perf_evsel__new(&attr);
if (evsel == NULL)
@@ -1429,7 +1445,7 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
}
static int __open_attr__fprintf(FILE *fp, const char *name, const char *val,
- void *priv __attribute__((unused)))
+ void *priv __maybe_unused)
{
return fprintf(fp, " %-32s %s\n", name, val);
}
@@ -2459,6 +2475,42 @@ bool perf_evsel__fallback(struct perf_evsel *evsel, int err,
return false;
}
+static bool find_process(const char *name)
+{
+ size_t len = strlen(name);
+ DIR *dir;
+ struct dirent *d;
+ int ret = -1;
+
+ dir = opendir(procfs__mountpoint());
+ if (!dir)
+ return false;
+
+ /* Walk through the directory. */
+ while (ret && (d = readdir(dir)) != NULL) {
+ char path[PATH_MAX];
+ char *data;
+ size_t size;
+
+ if ((d->d_type != DT_DIR) ||
+ !strcmp(".", d->d_name) ||
+ !strcmp("..", d->d_name))
+ continue;
+
+ scnprintf(path, sizeof(path), "%s/%s/comm",
+ procfs__mountpoint(), d->d_name);
+
+ if (filename__read_str(path, &data, &size))
+ continue;
+
+ ret = strncmp(name, data, len);
+ free(data);
+ }
+
+ closedir(dir);
+ return ret ? false : true;
+}
+
int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
int err, char *msg, size_t size)
{
diff --git a/tools/perf/util/genelf_debug.c b/tools/perf/util/genelf_debug.c
index 5980f7d256b1..40789d8603d0 100644
--- a/tools/perf/util/genelf_debug.c
+++ b/tools/perf/util/genelf_debug.c
@@ -11,6 +11,7 @@
* @remark Copyright 2007 OProfile authors
* @author Philippe Elie
*/
+#include <linux/compiler.h>
#include <sys/types.h>
#include <stdio.h>
#include <getopt.h>
@@ -125,7 +126,7 @@ struct debug_line_header {
* and filesize, last entry is followed by an empty string.
*/
/* follow the first program statement */
-} __attribute__((packed));
+} __packed;
/* DWARF 2 spec talks only about one possible compilation unit header while
* binutils can handle two flavours of dwarf 2, 32 and 64 bits, this is not
@@ -138,7 +139,7 @@ struct compilation_unit_header {
uhalf version;
uword debug_abbrev_offset;
ubyte pointer_size;
-} __attribute__((packed));
+} __packed;
#define DW_LNS_num_opcode (DW_LNS_set_isa + 1)
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 5cac8d5e009a..76ed7d03e500 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -8,6 +8,7 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
+#include <linux/compiler.h>
#include <linux/list.h>
#include <linux/kernel.h>
#include <linux/bitops.h>
@@ -841,7 +842,7 @@ static int write_group_desc(int fd, struct perf_header *h __maybe_unused,
/*
* default get_cpuid(): nothing gets recorded
- * actual implementation must be in arch/$(ARCH)/util/header.c
+ * actual implementation must be in arch/$(SRCARCH)/util/header.c
*/
int __weak get_cpuid(char *buffer __maybe_unused, size_t sz __maybe_unused)
{
@@ -1274,7 +1275,7 @@ error:
}
static int __desc_attr__fprintf(FILE *fp, const char *name, const char *val,
- void *priv __attribute__((unused)))
+ void *priv __maybe_unused)
{
return fprintf(fp, ", %s = %s", name, val);
}
diff --git a/tools/perf/util/help-unknown-cmd.c b/tools/perf/util/help-unknown-cmd.c
index 1c88ad6425b8..15b95300d7f3 100644
--- a/tools/perf/util/help-unknown-cmd.c
+++ b/tools/perf/util/help-unknown-cmd.c
@@ -12,7 +12,7 @@ static int perf_unknown_cmd_config(const char *var, const char *value,
void *cb __maybe_unused)
{
if (!strcmp(var, "help.autocorrect"))
- autocorrect = perf_config_int(var,value);
+ return perf_config_int(&autocorrect, var,value);
return 0;
}
diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c
index b2834ac7b1f5..218ee2bac9a5 100644
--- a/tools/perf/util/intel-bts.c
+++ b/tools/perf/util/intel-bts.c
@@ -866,8 +866,6 @@ static void intel_bts_print_info(u64 *arr, int start, int finish)
fprintf(stdout, intel_bts_info_fmts[i], arr[i]);
}
-u64 intel_bts_auxtrace_info_priv[INTEL_BTS_AUXTRACE_PRIV_SIZE];
-
int intel_bts_process_auxtrace_info(union perf_event *event,
struct perf_session *session)
{
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
index 7cf7f7aca4d2..aa1593ce551d 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
@@ -64,6 +64,25 @@ enum intel_pt_pkt_state {
INTEL_PT_STATE_FUP_NO_TIP,
};
+static inline bool intel_pt_sample_time(enum intel_pt_pkt_state pkt_state)
+{
+ switch (pkt_state) {
+ case INTEL_PT_STATE_NO_PSB:
+ case INTEL_PT_STATE_NO_IP:
+ case INTEL_PT_STATE_ERR_RESYNC:
+ case INTEL_PT_STATE_IN_SYNC:
+ case INTEL_PT_STATE_TNT:
+ return true;
+ case INTEL_PT_STATE_TIP:
+ case INTEL_PT_STATE_TIP_PGD:
+ case INTEL_PT_STATE_FUP:
+ case INTEL_PT_STATE_FUP_NO_TIP:
+ return false;
+ default:
+ return true;
+ };
+}
+
#ifdef INTEL_PT_STRICT
#define INTEL_PT_STATE_ERR1 INTEL_PT_STATE_NO_PSB
#define INTEL_PT_STATE_ERR2 INTEL_PT_STATE_NO_PSB
@@ -87,11 +106,13 @@ struct intel_pt_decoder {
const unsigned char *buf;
size_t len;
bool return_compression;
+ bool branch_enable;
bool mtc_insn;
bool pge;
bool have_tma;
bool have_cyc;
bool fixup_last_mtc;
+ bool have_last_ip;
uint64_t pos;
uint64_t last_ip;
uint64_t ip;
@@ -99,6 +120,7 @@ struct intel_pt_decoder {
uint64_t timestamp;
uint64_t tsc_timestamp;
uint64_t ref_timestamp;
+ uint64_t sample_timestamp;
uint64_t ret_addr;
uint64_t ctc_timestamp;
uint64_t ctc_delta;
@@ -119,6 +141,7 @@ struct intel_pt_decoder {
int pkt_len;
int last_packet_type;
unsigned int cbr;
+ unsigned int cbr_seen;
unsigned int max_non_turbo_ratio;
double max_non_turbo_ratio_fp;
double cbr_cyc_to_tsc;
@@ -136,9 +159,18 @@ struct intel_pt_decoder {
bool continuous_period;
bool overflow;
bool set_fup_tx_flags;
+ bool set_fup_ptw;
+ bool set_fup_mwait;
+ bool set_fup_pwre;
+ bool set_fup_exstop;
unsigned int fup_tx_flags;
unsigned int tx_flags;
+ uint64_t fup_ptw_payload;
+ uint64_t fup_mwait_payload;
+ uint64_t fup_pwre_payload;
+ uint64_t cbr_payload;
uint64_t timestamp_insn_cnt;
+ uint64_t sample_insn_cnt;
uint64_t stuck_ip;
int no_progress;
int stuck_ip_prd;
@@ -192,6 +224,7 @@ struct intel_pt_decoder *intel_pt_decoder_new(struct intel_pt_params *params)
decoder->pgd_ip = params->pgd_ip;
decoder->data = params->data;
decoder->return_compression = params->return_compression;
+ decoder->branch_enable = params->branch_enable;
decoder->period = params->period;
decoder->period_type = params->period_type;
@@ -398,6 +431,7 @@ static uint64_t intel_pt_calc_ip(const struct intel_pt_pkt *packet,
static inline void intel_pt_set_last_ip(struct intel_pt_decoder *decoder)
{
decoder->last_ip = intel_pt_calc_ip(&decoder->packet, decoder->last_ip);
+ decoder->have_last_ip = true;
}
static inline void intel_pt_set_ip(struct intel_pt_decoder *decoder)
@@ -635,6 +669,8 @@ static int intel_pt_calc_cyc_cb(struct intel_pt_pkt_info *pkt_info)
case INTEL_PT_PAD:
case INTEL_PT_VMCS:
case INTEL_PT_MNT:
+ case INTEL_PT_PTWRITE:
+ case INTEL_PT_PTWRITE_IP:
return 0;
case INTEL_PT_MTC:
@@ -675,6 +711,12 @@ static int intel_pt_calc_cyc_cb(struct intel_pt_pkt_info *pkt_info)
break;
case INTEL_PT_TSC:
+ /*
+ * For now, do not support using TSC packets - refer to
+ * intel_pt_calc_cyc_to_tsc().
+ */
+ if (data->from_mtc)
+ return 1;
timestamp = pkt_info->packet.payload |
(data->timestamp & (0xffULL << 56));
if (data->from_mtc && timestamp < data->timestamp &&
@@ -733,6 +775,11 @@ static int intel_pt_calc_cyc_cb(struct intel_pt_pkt_info *pkt_info)
case INTEL_PT_TIP_PGD:
case INTEL_PT_TRACESTOP:
+ case INTEL_PT_EXSTOP:
+ case INTEL_PT_EXSTOP_IP:
+ case INTEL_PT_MWAIT:
+ case INTEL_PT_PWRE:
+ case INTEL_PT_PWRX:
case INTEL_PT_OVF:
case INTEL_PT_BAD: /* Does not happen */
default:
@@ -787,6 +834,14 @@ static void intel_pt_calc_cyc_to_tsc(struct intel_pt_decoder *decoder,
.cbr_cyc_to_tsc = 0,
};
+ /*
+ * For now, do not support using TSC packets, for at least the following reasons:
+ * 1) timing might have stopped
+ * 2) TSC packets within PSB+ can slip against CYC packets
+ */
+ if (!from_mtc)
+ return;
+
intel_pt_pkt_lookahead(decoder, intel_pt_calc_cyc_cb, &data);
}
@@ -898,6 +953,7 @@ static int intel_pt_walk_insn(struct intel_pt_decoder *decoder,
decoder->tot_insn_cnt += insn_cnt;
decoder->timestamp_insn_cnt += insn_cnt;
+ decoder->sample_insn_cnt += insn_cnt;
decoder->period_insn_cnt += insn_cnt;
if (err) {
@@ -990,6 +1046,57 @@ out_no_progress:
return err;
}
+static bool intel_pt_fup_event(struct intel_pt_decoder *decoder)
+{
+ bool ret = false;
+
+ if (decoder->set_fup_tx_flags) {
+ decoder->set_fup_tx_flags = false;
+ decoder->tx_flags = decoder->fup_tx_flags;
+ decoder->state.type = INTEL_PT_TRANSACTION;
+ decoder->state.from_ip = decoder->ip;
+ decoder->state.to_ip = 0;
+ decoder->state.flags = decoder->fup_tx_flags;
+ return true;
+ }
+ if (decoder->set_fup_ptw) {
+ decoder->set_fup_ptw = false;
+ decoder->state.type = INTEL_PT_PTW;
+ decoder->state.flags |= INTEL_PT_FUP_IP;
+ decoder->state.from_ip = decoder->ip;
+ decoder->state.to_ip = 0;
+ decoder->state.ptw_payload = decoder->fup_ptw_payload;
+ return true;
+ }
+ if (decoder->set_fup_mwait) {
+ decoder->set_fup_mwait = false;
+ decoder->state.type = INTEL_PT_MWAIT_OP;
+ decoder->state.from_ip = decoder->ip;
+ decoder->state.to_ip = 0;
+ decoder->state.mwait_payload = decoder->fup_mwait_payload;
+ ret = true;
+ }
+ if (decoder->set_fup_pwre) {
+ decoder->set_fup_pwre = false;
+ decoder->state.type |= INTEL_PT_PWR_ENTRY;
+ decoder->state.type &= ~INTEL_PT_BRANCH;
+ decoder->state.from_ip = decoder->ip;
+ decoder->state.to_ip = 0;
+ decoder->state.pwre_payload = decoder->fup_pwre_payload;
+ ret = true;
+ }
+ if (decoder->set_fup_exstop) {
+ decoder->set_fup_exstop = false;
+ decoder->state.type |= INTEL_PT_EX_STOP;
+ decoder->state.type &= ~INTEL_PT_BRANCH;
+ decoder->state.flags |= INTEL_PT_FUP_IP;
+ decoder->state.from_ip = decoder->ip;
+ decoder->state.to_ip = 0;
+ ret = true;
+ }
+ return ret;
+}
+
static int intel_pt_walk_fup(struct intel_pt_decoder *decoder)
{
struct intel_pt_insn intel_pt_insn;
@@ -1003,15 +1110,8 @@ static int intel_pt_walk_fup(struct intel_pt_decoder *decoder)
if (err == INTEL_PT_RETURN)
return 0;
if (err == -EAGAIN) {
- if (decoder->set_fup_tx_flags) {
- decoder->set_fup_tx_flags = false;
- decoder->tx_flags = decoder->fup_tx_flags;
- decoder->state.type = INTEL_PT_TRANSACTION;
- decoder->state.from_ip = decoder->ip;
- decoder->state.to_ip = 0;
- decoder->state.flags = decoder->fup_tx_flags;
+ if (intel_pt_fup_event(decoder))
return 0;
- }
return err;
}
decoder->set_fup_tx_flags = false;
@@ -1360,7 +1460,9 @@ static void intel_pt_calc_mtc_timestamp(struct intel_pt_decoder *decoder)
static void intel_pt_calc_cbr(struct intel_pt_decoder *decoder)
{
- unsigned int cbr = decoder->packet.payload;
+ unsigned int cbr = decoder->packet.payload & 0xff;
+
+ decoder->cbr_payload = decoder->packet.payload;
if (decoder->cbr == cbr)
return;
@@ -1417,6 +1519,13 @@ static int intel_pt_walk_psbend(struct intel_pt_decoder *decoder)
case INTEL_PT_TRACESTOP:
case INTEL_PT_BAD:
case INTEL_PT_PSB:
+ case INTEL_PT_PTWRITE:
+ case INTEL_PT_PTWRITE_IP:
+ case INTEL_PT_EXSTOP:
+ case INTEL_PT_EXSTOP_IP:
+ case INTEL_PT_MWAIT:
+ case INTEL_PT_PWRE:
+ case INTEL_PT_PWRX:
decoder->have_tma = false;
intel_pt_log("ERROR: Unexpected packet\n");
return -EAGAIN;
@@ -1446,7 +1555,8 @@ static int intel_pt_walk_psbend(struct intel_pt_decoder *decoder)
case INTEL_PT_FUP:
decoder->pge = true;
- intel_pt_set_last_ip(decoder);
+ if (decoder->packet.count)
+ intel_pt_set_last_ip(decoder);
break;
case INTEL_PT_MODE_TSX:
@@ -1497,6 +1607,13 @@ static int intel_pt_walk_fup_tip(struct intel_pt_decoder *decoder)
case INTEL_PT_MODE_TSX:
case INTEL_PT_BAD:
case INTEL_PT_PSBEND:
+ case INTEL_PT_PTWRITE:
+ case INTEL_PT_PTWRITE_IP:
+ case INTEL_PT_EXSTOP:
+ case INTEL_PT_EXSTOP_IP:
+ case INTEL_PT_MWAIT:
+ case INTEL_PT_PWRE:
+ case INTEL_PT_PWRX:
intel_pt_log("ERROR: Missing TIP after FUP\n");
decoder->pkt_state = INTEL_PT_STATE_ERR3;
return -ENOENT;
@@ -1625,6 +1742,15 @@ next:
break;
}
intel_pt_set_last_ip(decoder);
+ if (!decoder->branch_enable) {
+ decoder->ip = decoder->last_ip;
+ if (intel_pt_fup_event(decoder))
+ return 0;
+ no_tip = false;
+ break;
+ }
+ if (decoder->set_fup_mwait)
+ no_tip = true;
err = intel_pt_walk_fup(decoder);
if (err != -EAGAIN) {
if (err)
@@ -1650,6 +1776,8 @@ next:
break;
case INTEL_PT_PSB:
+ decoder->last_ip = 0;
+ decoder->have_last_ip = true;
intel_pt_clear_stack(&decoder->stack);
err = intel_pt_walk_psbend(decoder);
if (err == -EAGAIN)
@@ -1696,6 +1824,16 @@ next:
case INTEL_PT_CBR:
intel_pt_calc_cbr(decoder);
+ if (!decoder->branch_enable &&
+ decoder->cbr != decoder->cbr_seen) {
+ decoder->cbr_seen = decoder->cbr;
+ decoder->state.type = INTEL_PT_CBR_CHG;
+ decoder->state.from_ip = decoder->ip;
+ decoder->state.to_ip = 0;
+ decoder->state.cbr_payload =
+ decoder->packet.payload;
+ return 0;
+ }
break;
case INTEL_PT_MODE_EXEC:
@@ -1722,6 +1860,71 @@ next:
case INTEL_PT_PAD:
break;
+ case INTEL_PT_PTWRITE_IP:
+ decoder->fup_ptw_payload = decoder->packet.payload;
+ err = intel_pt_get_next_packet(decoder);
+ if (err)
+ return err;
+ if (decoder->packet.type == INTEL_PT_FUP) {
+ decoder->set_fup_ptw = true;
+ no_tip = true;
+ } else {
+ intel_pt_log_at("ERROR: Missing FUP after PTWRITE",
+ decoder->pos);
+ }
+ goto next;
+
+ case INTEL_PT_PTWRITE:
+ decoder->state.type = INTEL_PT_PTW;
+ decoder->state.from_ip = decoder->ip;
+ decoder->state.to_ip = 0;
+ decoder->state.ptw_payload = decoder->packet.payload;
+ return 0;
+
+ case INTEL_PT_MWAIT:
+ decoder->fup_mwait_payload = decoder->packet.payload;
+ decoder->set_fup_mwait = true;
+ break;
+
+ case INTEL_PT_PWRE:
+ if (decoder->set_fup_mwait) {
+ decoder->fup_pwre_payload =
+ decoder->packet.payload;
+ decoder->set_fup_pwre = true;
+ break;
+ }
+ decoder->state.type = INTEL_PT_PWR_ENTRY;
+ decoder->state.from_ip = decoder->ip;
+ decoder->state.to_ip = 0;
+ decoder->state.pwrx_payload = decoder->packet.payload;
+ return 0;
+
+ case INTEL_PT_EXSTOP_IP:
+ err = intel_pt_get_next_packet(decoder);
+ if (err)
+ return err;
+ if (decoder->packet.type == INTEL_PT_FUP) {
+ decoder->set_fup_exstop = true;
+ no_tip = true;
+ } else {
+ intel_pt_log_at("ERROR: Missing FUP after EXSTOP",
+ decoder->pos);
+ }
+ goto next;
+
+ case INTEL_PT_EXSTOP:
+ decoder->state.type = INTEL_PT_EX_STOP;
+ decoder->state.from_ip = decoder->ip;
+ decoder->state.to_ip = 0;
+ return 0;
+
+ case INTEL_PT_PWRX:
+ decoder->state.type = INTEL_PT_PWR_EXIT;
+ decoder->state.from_ip = decoder->ip;
+ decoder->state.to_ip = 0;
+ decoder->state.pwrx_payload = decoder->packet.payload;
+ return 0;
+
default:
return intel_pt_bug(decoder);
}
@@ -1730,8 +1933,9 @@ next:
static inline bool intel_pt_have_ip(struct intel_pt_decoder *decoder)
{
- return decoder->last_ip || decoder->packet.count == 0 ||
- decoder->packet.count == 3 || decoder->packet.count == 6;
+ return decoder->packet.count &&
+ (decoder->have_last_ip || decoder->packet.count == 3 ||
+ decoder->packet.count == 6);
}
/* Walk PSB+ packets to get in sync. */
@@ -1750,6 +1954,13 @@ static int intel_pt_walk_psb(struct intel_pt_decoder *decoder)
__fallthrough;
case INTEL_PT_TIP_PGE:
case INTEL_PT_TIP:
+ case INTEL_PT_PTWRITE:
+ case INTEL_PT_PTWRITE_IP:
+ case INTEL_PT_EXSTOP:
+ case INTEL_PT_EXSTOP_IP:
+ case INTEL_PT_MWAIT:
+ case INTEL_PT_PWRE:
+ case INTEL_PT_PWRX:
intel_pt_log("ERROR: Unexpected packet\n");
return -ENOENT;
@@ -1854,14 +2065,10 @@ static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder)
break;
case INTEL_PT_FUP:
- if (decoder->overflow) {
- if (intel_pt_have_ip(decoder))
- intel_pt_set_ip(decoder);
- if (decoder->ip)
- return 0;
- }
- if (decoder->packet.count)
- intel_pt_set_last_ip(decoder);
+ if (intel_pt_have_ip(decoder))
+ intel_pt_set_ip(decoder);
+ if (decoder->ip)
+ return 0;
break;
case INTEL_PT_MTC:
@@ -1910,6 +2117,9 @@ static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder)
break;
case INTEL_PT_PSB:
+ decoder->last_ip = 0;
+ decoder->have_last_ip = true;
+ intel_pt_clear_stack(&decoder->stack);
err = intel_pt_walk_psb(decoder);
if (err)
return err;
@@ -1925,6 +2135,13 @@ static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder)
case INTEL_PT_VMCS:
case INTEL_PT_MNT:
case INTEL_PT_PAD:
+ case INTEL_PT_PTWRITE:
+ case INTEL_PT_PTWRITE_IP:
+ case INTEL_PT_EXSTOP:
+ case INTEL_PT_EXSTOP_IP:
+ case INTEL_PT_MWAIT:
+ case INTEL_PT_PWRE:
+ case INTEL_PT_PWRX:
default:
break;
}
@@ -1935,6 +2152,19 @@ static int intel_pt_sync_ip(struct intel_pt_decoder *decoder)
{
int err;
+ decoder->set_fup_tx_flags = false;
+ decoder->set_fup_ptw = false;
+ decoder->set_fup_mwait = false;
+ decoder->set_fup_pwre = false;
+ decoder->set_fup_exstop = false;
+
+ if (!decoder->branch_enable) {
+ decoder->pkt_state = INTEL_PT_STATE_IN_SYNC;
+ decoder->overflow = false;
+ decoder->state.type = 0; /* Do not have a sample */
+ return 0;
+ }
+
intel_pt_log("Scanning for full IP\n");
err = intel_pt_walk_to_ip(decoder);
if (err)
@@ -2043,6 +2273,7 @@ static int intel_pt_sync(struct intel_pt_decoder *decoder)
decoder->pge = false;
decoder->continuous_period = false;
+ decoder->have_last_ip = false;
decoder->last_ip = 0;
decoder->ip = 0;
intel_pt_clear_stack(&decoder->stack);
@@ -2051,6 +2282,7 @@ static int intel_pt_sync(struct intel_pt_decoder *decoder)
if (err)
return err;
+ decoder->have_last_ip = true;
decoder->pkt_state = INTEL_PT_STATE_NO_IP;
err = intel_pt_walk_psb(decoder);
@@ -2069,7 +2301,7 @@ static int intel_pt_sync(struct intel_pt_decoder *decoder)
static uint64_t intel_pt_est_timestamp(struct intel_pt_decoder *decoder)
{
- uint64_t est = decoder->timestamp_insn_cnt << 1;
+ uint64_t est = decoder->sample_insn_cnt << 1;
if (!decoder->cbr || !decoder->max_non_turbo_ratio)
goto out;
@@ -2077,7 +2309,7 @@ static uint64_t intel_pt_est_timestamp(struct intel_pt_decoder *decoder)
est *= decoder->max_non_turbo_ratio;
est /= decoder->cbr;
out:
- return decoder->timestamp + est;
+ return decoder->sample_timestamp + est;
}
const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder)
@@ -2093,8 +2325,10 @@ const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder)
err = intel_pt_sync(decoder);
break;
case INTEL_PT_STATE_NO_IP:
+ decoder->have_last_ip = false;
decoder->last_ip = 0;
- /* Fall through */
+ decoder->ip = 0;
+ __fallthrough;
case INTEL_PT_STATE_ERR_RESYNC:
err = intel_pt_sync_ip(decoder);
break;
@@ -2130,15 +2364,29 @@ const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder)
}
} while (err == -ENOLINK);
- decoder->state.err = err ? intel_pt_ext_err(err) : 0;
- decoder->state.timestamp = decoder->timestamp;
+ if (err) {
+ decoder->state.err = intel_pt_ext_err(err);
+ decoder->state.from_ip = decoder->ip;
+ decoder->sample_timestamp = decoder->timestamp;
+ decoder->sample_insn_cnt = decoder->timestamp_insn_cnt;
+ } else {
+ decoder->state.err = 0;
+ if (decoder->cbr != decoder->cbr_seen && decoder->state.type) {
+ decoder->cbr_seen = decoder->cbr;
+ decoder->state.type |= INTEL_PT_CBR_CHG;
+ decoder->state.cbr_payload = decoder->cbr_payload;
+ }
+ if (intel_pt_sample_time(decoder->pkt_state)) {
+ decoder->sample_timestamp = decoder->timestamp;
+ decoder->sample_insn_cnt = decoder->timestamp_insn_cnt;
+ }
+ }
+
+ decoder->state.timestamp = decoder->sample_timestamp;
decoder->state.est_timestamp = intel_pt_est_timestamp(decoder);
decoder->state.cr3 = decoder->cr3;
decoder->state.tot_insn_cnt = decoder->tot_insn_cnt;
- if (err)
- decoder->state.from_ip = decoder->ip;
-
return &decoder->state;
}
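With the switch to sample_timestamp/sample_insn_cnt, intel_pt_est_timestamp() projects forward from the last point where timing was reliable: it treats sample_insn_cnt * 2 as a rough core-cycle count and converts core cycles to TSC ticks by scaling with max_non_turbo_ratio / cbr. A worked example under those assumptions:

	sample_insn_cnt = 1000, max_non_turbo_ratio = 24, cbr = 32
	est = (1000 << 1) * 24 / 32 = 1500 TSC ticks
	est_timestamp = sample_timestamp + 1500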
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h
index e90619a43c0c..921b22e8ca0e 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h
@@ -25,11 +25,18 @@
#define INTEL_PT_IN_TX (1 << 0)
#define INTEL_PT_ABORT_TX (1 << 1)
#define INTEL_PT_ASYNC (1 << 2)
+#define INTEL_PT_FUP_IP (1 << 3)
enum intel_pt_sample_type {
INTEL_PT_BRANCH = 1 << 0,
INTEL_PT_INSTRUCTION = 1 << 1,
INTEL_PT_TRANSACTION = 1 << 2,
+ INTEL_PT_PTW = 1 << 3,
+ INTEL_PT_MWAIT_OP = 1 << 4,
+ INTEL_PT_PWR_ENTRY = 1 << 5,
+ INTEL_PT_EX_STOP = 1 << 6,
+ INTEL_PT_PWR_EXIT = 1 << 7,
+ INTEL_PT_CBR_CHG = 1 << 8,
};
enum intel_pt_period_type {
@@ -63,6 +70,11 @@ struct intel_pt_state {
uint64_t timestamp;
uint64_t est_timestamp;
uint64_t trace_nr;
+ uint64_t ptw_payload;
+ uint64_t mwait_payload;
+ uint64_t pwre_payload;
+ uint64_t pwrx_payload;
+ uint64_t cbr_payload;
uint32_t flags;
enum intel_pt_insn_op insn_op;
int insn_len;
@@ -87,6 +99,7 @@ struct intel_pt_params {
bool (*pgd_ip)(uint64_t ip, void *data);
void *data;
bool return_compression;
+ bool branch_enable;
uint64_t period;
enum intel_pt_period_type period_type;
unsigned max_non_turbo_ratio;
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-log.h b/tools/perf/util/intel-pt-decoder/intel-pt-log.h
index debe751dc3d6..45b64f93f358 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-log.h
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-log.h
@@ -16,6 +16,7 @@
#ifndef INCLUDE__INTEL_PT_LOG_H__
#define INCLUDE__INTEL_PT_LOG_H__
+#include <linux/compiler.h>
#include <stdint.h>
#include <inttypes.h>
@@ -34,8 +35,7 @@ void __intel_pt_log_insn(struct intel_pt_insn *intel_pt_insn, uint64_t ip);
void __intel_pt_log_insn_no_data(struct intel_pt_insn *intel_pt_insn,
uint64_t ip);
-__attribute__((format(printf, 1, 2)))
-void __intel_pt_log(const char *fmt, ...);
+void __intel_pt_log(const char *fmt, ...) __printf(1, 2);
#define intel_pt_log(fmt, ...) \
do { \
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c
index 7528ae4f7e28..ba4c9dd18643 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c
@@ -64,6 +64,13 @@ static const char * const packet_name[] = {
[INTEL_PT_PIP] = "PIP",
[INTEL_PT_OVF] = "OVF",
[INTEL_PT_MNT] = "MNT",
+ [INTEL_PT_PTWRITE] = "PTWRITE",
+ [INTEL_PT_PTWRITE_IP] = "PTWRITE",
+ [INTEL_PT_EXSTOP] = "EXSTOP",
+ [INTEL_PT_EXSTOP_IP] = "EXSTOP",
+ [INTEL_PT_MWAIT] = "MWAIT",
+ [INTEL_PT_PWRE] = "PWRE",
+ [INTEL_PT_PWRX] = "PWRX",
};
const char *intel_pt_pkt_name(enum intel_pt_pkt_type type)
@@ -123,7 +130,7 @@ static int intel_pt_get_cbr(const unsigned char *buf, size_t len,
if (len < 4)
return INTEL_PT_NEED_MORE_BYTES;
packet->type = INTEL_PT_CBR;
- packet->payload = buf[2];
+ packet->payload = le16_to_cpu(*(uint16_t *)(buf + 2));
return 4;
}
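Going by the layout this decoder assumes, the CBR packet is 4 bytes: 0x02, 0x03, the core:bus ratio in byte 2 and a reserved byte 3. Reading the full 16-bit payload (rather than just buf[2]) lets intel_pt_calc_cbr() keep the whole value in cbr_payload for the synthesized cbr event while masking with 0xff for the ratio itself. For example:

	/* Hypothetical CBR packet: */
	unsigned char buf[] = { 0x02, 0x03, 0x20, 0x00 };
	/* payload = 0x0020, cbr = payload & 0xff = 32 (core:bus ratio) */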
@@ -217,12 +224,80 @@ static int intel_pt_get_3byte(const unsigned char *buf, size_t len,
}
}
+static int intel_pt_get_ptwrite(const unsigned char *buf, size_t len,
+ struct intel_pt_pkt *packet)
+{
+ packet->count = (buf[1] >> 5) & 0x3;
+ packet->type = buf[1] & BIT(7) ? INTEL_PT_PTWRITE_IP :
+ INTEL_PT_PTWRITE;
+
+ switch (packet->count) {
+ case 0:
+ if (len < 6)
+ return INTEL_PT_NEED_MORE_BYTES;
+ packet->payload = le32_to_cpu(*(uint32_t *)(buf + 2));
+ return 6;
+ case 1:
+ if (len < 10)
+ return INTEL_PT_NEED_MORE_BYTES;
+ packet->payload = le64_to_cpu(*(uint64_t *)(buf + 2));
+ return 10;
+ default:
+ return INTEL_PT_BAD_PACKET;
+ }
+}
+
+static int intel_pt_get_exstop(struct intel_pt_pkt *packet)
+{
+ packet->type = INTEL_PT_EXSTOP;
+ return 2;
+}
+
+static int intel_pt_get_exstop_ip(struct intel_pt_pkt *packet)
+{
+ packet->type = INTEL_PT_EXSTOP_IP;
+ return 2;
+}
+
+static int intel_pt_get_mwait(const unsigned char *buf, size_t len,
+ struct intel_pt_pkt *packet)
+{
+ if (len < 10)
+ return INTEL_PT_NEED_MORE_BYTES;
+ packet->type = INTEL_PT_MWAIT;
+ packet->payload = le64_to_cpu(*(uint64_t *)(buf + 2));
+ return 10;
+}
+
+static int intel_pt_get_pwre(const unsigned char *buf, size_t len,
+ struct intel_pt_pkt *packet)
+{
+ if (len < 4)
+ return INTEL_PT_NEED_MORE_BYTES;
+ packet->type = INTEL_PT_PWRE;
+ memcpy_le64(&packet->payload, buf + 2, 2);
+ return 4;
+}
+
+static int intel_pt_get_pwrx(const unsigned char *buf, size_t len,
+ struct intel_pt_pkt *packet)
+{
+ if (len < 7)
+ return INTEL_PT_NEED_MORE_BYTES;
+ packet->type = INTEL_PT_PWRX;
+ memcpy_le64(&packet->payload, buf + 2, 5);
+ return 7;
+}
+
static int intel_pt_get_ext(const unsigned char *buf, size_t len,
struct intel_pt_pkt *packet)
{
if (len < 2)
return INTEL_PT_NEED_MORE_BYTES;
+ if ((buf[1] & 0x1f) == 0x12)
+ return intel_pt_get_ptwrite(buf, len, packet);
+
switch (buf[1]) {
case 0xa3: /* Long TNT */
return intel_pt_get_long_tnt(buf, len, packet);
@@ -244,6 +319,16 @@ static int intel_pt_get_ext(const unsigned char *buf, size_t len,
return intel_pt_get_tma(buf, len, packet);
case 0xC3: /* 3-byte header */
return intel_pt_get_3byte(buf, len, packet);
+ case 0x62: /* EXSTOP no IP */
+ return intel_pt_get_exstop(packet);
+ case 0xE2: /* EXSTOP with IP */
+ return intel_pt_get_exstop_ip(packet);
+ case 0xC2: /* MWAIT */
+ return intel_pt_get_mwait(buf, len, packet);
+ case 0x22: /* PWRE */
+ return intel_pt_get_pwre(buf, len, packet);
+ case 0xA2: /* PWRX */
+ return intel_pt_get_pwrx(buf, len, packet);
default:
return INTEL_PT_BAD_PACKET;
}
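The new PTWRITE handling keys off byte 1 of the extended packet: bits 4:0 are 0x12 (hence the 0x1f mask in intel_pt_get_ext()), bits 6:5 encode the payload size (0 = 32-bit payload, 6-byte packet; 1 = 64-bit payload, 10-byte packet) and bit 7 selects the IP variant, where a FUP carrying the source IP follows. A hypothetical packet, decoded per the code above:

	unsigned char buf[] = { 0x02, 0x12, 0x78, 0x56, 0x34, 0x12 };
	/* buf[1] = 0x12: bits 4:0 = 0x12, bits 6:5 = 0 (32-bit payload), bit 7 = 0 (no IP) */
	/* intel_pt_get_ptwrite(): type = INTEL_PT_PTWRITE, count = 0, payload = 0x12345678, returns 6 */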
@@ -522,6 +607,29 @@ int intel_pt_pkt_desc(const struct intel_pt_pkt *packet, char *buf,
ret = snprintf(buf, buf_len, "%s 0x%llx (NR=%d)",
name, payload, nr);
return ret;
+ case INTEL_PT_PTWRITE:
+ return snprintf(buf, buf_len, "%s 0x%llx IP:0", name, payload);
+ case INTEL_PT_PTWRITE_IP:
+ return snprintf(buf, buf_len, "%s 0x%llx IP:1", name, payload);
+ case INTEL_PT_EXSTOP:
+ return snprintf(buf, buf_len, "%s IP:0", name);
+ case INTEL_PT_EXSTOP_IP:
+ return snprintf(buf, buf_len, "%s IP:1", name);
+ case INTEL_PT_MWAIT:
+ return snprintf(buf, buf_len, "%s 0x%llx Hints 0x%x Extensions 0x%x",
+ name, payload, (unsigned int)(payload & 0xff),
+ (unsigned int)((payload >> 32) & 0x3));
+ case INTEL_PT_PWRE:
+ return snprintf(buf, buf_len, "%s 0x%llx HW:%u CState:%u Sub-CState:%u",
+ name, payload, !!(payload & 0x80),
+ (unsigned int)((payload >> 12) & 0xf),
+ (unsigned int)((payload >> 8) & 0xf));
+ case INTEL_PT_PWRX:
+ return snprintf(buf, buf_len, "%s 0x%llx Last CState:%u Deepest CState:%u Wake Reason 0x%x",
+ name, payload,
+ (unsigned int)((payload >> 4) & 0xf),
+ (unsigned int)(payload & 0xf),
+ (unsigned int)((payload >> 8) & 0xf));
default:
break;
}
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h
index 781bb79883bd..73ddc3a88d07 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h
@@ -52,6 +52,13 @@ enum intel_pt_pkt_type {
INTEL_PT_PIP,
INTEL_PT_OVF,
INTEL_PT_MNT,
+ INTEL_PT_PTWRITE,
+ INTEL_PT_PTWRITE_IP,
+ INTEL_PT_EXSTOP,
+ INTEL_PT_EXSTOP_IP,
+ INTEL_PT_MWAIT,
+ INTEL_PT_PWRE,
+ INTEL_PT_PWRX,
};
struct intel_pt_pkt {
diff --git a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt
index 767be7c76034..12e377184ee4 100644
--- a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt
+++ b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt
@@ -1009,7 +1009,7 @@ GrpTable: Grp15
1: fxstor | RDGSBASE Ry (F3),(11B)
2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B)
3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B)
-4: XSAVE
+4: XSAVE | ptwrite Ey (F3),(11B)
5: XRSTOR | lfence (11B)
6: XSAVEOPT | clwb (66) | mfence (11B)
7: clflush | clflushopt (66) | sfence (11B)
diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index 4c7718f87a08..b58f9fd1e2ee 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -81,7 +81,6 @@ struct intel_pt {
bool sample_instructions;
u64 instructions_sample_type;
- u64 instructions_sample_period;
u64 instructions_id;
bool sample_branches;
@@ -93,6 +92,18 @@ struct intel_pt {
u64 transactions_sample_type;
u64 transactions_id;
+ bool sample_ptwrites;
+ u64 ptwrites_sample_type;
+ u64 ptwrites_id;
+
+ bool sample_pwr_events;
+ u64 pwr_events_sample_type;
+ u64 mwait_id;
+ u64 pwre_id;
+ u64 exstop_id;
+ u64 pwrx_id;
+ u64 cbr_id;
+
bool synth_needs_swap;
u64 tsc_bit;
@@ -103,6 +114,7 @@ struct intel_pt {
u64 cyc_bit;
u64 noretcomp_bit;
unsigned max_non_turbo_ratio;
+ unsigned cbr2khz;
unsigned long num_events;
@@ -668,6 +680,19 @@ static bool intel_pt_return_compression(struct intel_pt *pt)
return true;
}
+static bool intel_pt_branch_enable(struct intel_pt *pt)
+{
+ struct perf_evsel *evsel;
+ u64 config;
+
+ evlist__for_each_entry(pt->session->evlist, evsel) {
+ if (intel_pt_get_config(pt, &evsel->attr, &config) &&
+ (config & 1) && !(config & 0x2000))
+ return false;
+ }
+ return true;
+}
+
static unsigned int intel_pt_mtc_period(struct intel_pt *pt)
{
struct perf_evsel *evsel;
@@ -799,6 +824,7 @@ static struct intel_pt_queue *intel_pt_alloc_queue(struct intel_pt *pt,
params.walk_insn = intel_pt_walk_next_insn;
params.data = ptq;
params.return_compression = intel_pt_return_compression(pt);
+ params.branch_enable = intel_pt_branch_enable(pt);
params.max_non_turbo_ratio = pt->max_non_turbo_ratio;
params.mtc_period = intel_pt_mtc_period(pt);
params.tsc_ctc_ratio_n = pt->tsc_ctc_ratio_n;
@@ -1044,6 +1070,36 @@ static void intel_pt_update_last_branch_rb(struct intel_pt_queue *ptq)
bs->nr += 1;
}
+static inline bool intel_pt_skip_event(struct intel_pt *pt)
+{
+ return pt->synth_opts.initial_skip &&
+ pt->num_events++ < pt->synth_opts.initial_skip;
+}
+
+static void intel_pt_prep_b_sample(struct intel_pt *pt,
+ struct intel_pt_queue *ptq,
+ union perf_event *event,
+ struct perf_sample *sample)
+{
+ event->sample.header.type = PERF_RECORD_SAMPLE;
+ event->sample.header.misc = PERF_RECORD_MISC_USER;
+ event->sample.header.size = sizeof(struct perf_event_header);
+
+ if (!pt->timeless_decoding)
+ sample->time = tsc_to_perf_time(ptq->timestamp, &pt->tc);
+
+ sample->cpumode = PERF_RECORD_MISC_USER;
+ sample->ip = ptq->state->from_ip;
+ sample->pid = ptq->pid;
+ sample->tid = ptq->tid;
+ sample->addr = ptq->state->to_ip;
+ sample->period = 1;
+ sample->cpu = ptq->cpu;
+ sample->flags = ptq->flags;
+ sample->insn_len = ptq->insn_len;
+ memcpy(sample->insn, ptq->insn, INTEL_PT_INSN_BUF_SZ);
+}
+
static int intel_pt_inject_event(union perf_event *event,
struct perf_sample *sample, u64 type,
bool swapped)
@@ -1052,9 +1108,35 @@ static int intel_pt_inject_event(union perf_event *event,
return perf_event__synthesize_sample(event, type, 0, sample, swapped);
}
-static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
+static inline int intel_pt_opt_inject(struct intel_pt *pt,
+ union perf_event *event,
+ struct perf_sample *sample, u64 type)
+{
+ if (!pt->synth_opts.inject)
+ return 0;
+
+ return intel_pt_inject_event(event, sample, type, pt->synth_needs_swap);
+}
+
+static int intel_pt_deliver_synth_b_event(struct intel_pt *pt,
+ union perf_event *event,
+ struct perf_sample *sample, u64 type)
{
int ret;
+
+ ret = intel_pt_opt_inject(pt, event, sample, type);
+ if (ret)
+ return ret;
+
+ ret = perf_session__deliver_synth_event(pt->session, event, sample);
+ if (ret)
+ pr_err("Intel PT: failed to deliver event, error %d\n", ret);
+
+ return ret;
+}
+
+static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
+{
struct intel_pt *pt = ptq->pt;
union perf_event *event = ptq->event_buf;
struct perf_sample sample = { .ip = 0, };
@@ -1066,29 +1148,13 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
if (pt->branches_filter && !(pt->branches_filter & ptq->flags))
return 0;
- if (pt->synth_opts.initial_skip &&
- pt->num_events++ < pt->synth_opts.initial_skip)
+ if (intel_pt_skip_event(pt))
return 0;
- event->sample.header.type = PERF_RECORD_SAMPLE;
- event->sample.header.misc = PERF_RECORD_MISC_USER;
- event->sample.header.size = sizeof(struct perf_event_header);
+ intel_pt_prep_b_sample(pt, ptq, event, &sample);
- if (!pt->timeless_decoding)
- sample.time = tsc_to_perf_time(ptq->timestamp, &pt->tc);
-
- sample.cpumode = PERF_RECORD_MISC_USER;
- sample.ip = ptq->state->from_ip;
- sample.pid = ptq->pid;
- sample.tid = ptq->tid;
- sample.addr = ptq->state->to_ip;
sample.id = ptq->pt->branches_id;
sample.stream_id = ptq->pt->branches_id;
- sample.period = 1;
- sample.cpu = ptq->cpu;
- sample.flags = ptq->flags;
- sample.insn_len = ptq->insn_len;
- memcpy(sample.insn, ptq->insn, INTEL_PT_INSN_BUF_SZ);
/*
* perf report cannot handle events without a branch stack when using
@@ -1105,144 +1171,251 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
sample.branch_stack = (struct branch_stack *)&dummy_bs;
}
- if (pt->synth_opts.inject) {
- ret = intel_pt_inject_event(event, &sample,
- pt->branches_sample_type,
- pt->synth_needs_swap);
- if (ret)
- return ret;
+ return intel_pt_deliver_synth_b_event(pt, event, &sample,
+ pt->branches_sample_type);
+}
+
+static void intel_pt_prep_sample(struct intel_pt *pt,
+ struct intel_pt_queue *ptq,
+ union perf_event *event,
+ struct perf_sample *sample)
+{
+ intel_pt_prep_b_sample(pt, ptq, event, sample);
+
+ if (pt->synth_opts.callchain) {
+ thread_stack__sample(ptq->thread, ptq->chain,
+ pt->synth_opts.callchain_sz, sample->ip);
+ sample->callchain = ptq->chain;
}
- ret = perf_session__deliver_synth_event(pt->session, event, &sample);
- if (ret)
- pr_err("Intel Processor Trace: failed to deliver branch event, error %d\n",
- ret);
+ if (pt->synth_opts.last_branch) {
+ intel_pt_copy_last_branch_rb(ptq);
+ sample->branch_stack = ptq->last_branch;
+ }
+}
+
+static inline int intel_pt_deliver_synth_event(struct intel_pt *pt,
+ struct intel_pt_queue *ptq,
+ union perf_event *event,
+ struct perf_sample *sample,
+ u64 type)
+{
+ int ret;
+
+ ret = intel_pt_deliver_synth_b_event(pt, event, sample, type);
+
+ if (pt->synth_opts.last_branch)
+ intel_pt_reset_last_branch_rb(ptq);
return ret;
}
static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq)
{
- int ret;
struct intel_pt *pt = ptq->pt;
union perf_event *event = ptq->event_buf;
struct perf_sample sample = { .ip = 0, };
- if (pt->synth_opts.initial_skip &&
- pt->num_events++ < pt->synth_opts.initial_skip)
+ if (intel_pt_skip_event(pt))
return 0;
- event->sample.header.type = PERF_RECORD_SAMPLE;
- event->sample.header.misc = PERF_RECORD_MISC_USER;
- event->sample.header.size = sizeof(struct perf_event_header);
-
- if (!pt->timeless_decoding)
- sample.time = tsc_to_perf_time(ptq->timestamp, &pt->tc);
+ intel_pt_prep_sample(pt, ptq, event, &sample);
- sample.cpumode = PERF_RECORD_MISC_USER;
- sample.ip = ptq->state->from_ip;
- sample.pid = ptq->pid;
- sample.tid = ptq->tid;
- sample.addr = ptq->state->to_ip;
sample.id = ptq->pt->instructions_id;
sample.stream_id = ptq->pt->instructions_id;
sample.period = ptq->state->tot_insn_cnt - ptq->last_insn_cnt;
- sample.cpu = ptq->cpu;
- sample.flags = ptq->flags;
- sample.insn_len = ptq->insn_len;
- memcpy(sample.insn, ptq->insn, INTEL_PT_INSN_BUF_SZ);
ptq->last_insn_cnt = ptq->state->tot_insn_cnt;
- if (pt->synth_opts.callchain) {
- thread_stack__sample(ptq->thread, ptq->chain,
- pt->synth_opts.callchain_sz, sample.ip);
- sample.callchain = ptq->chain;
- }
+ return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
+ pt->instructions_sample_type);
+}
- if (pt->synth_opts.last_branch) {
- intel_pt_copy_last_branch_rb(ptq);
- sample.branch_stack = ptq->last_branch;
- }
+static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq)
+{
+ struct intel_pt *pt = ptq->pt;
+ union perf_event *event = ptq->event_buf;
+ struct perf_sample sample = { .ip = 0, };
- if (pt->synth_opts.inject) {
- ret = intel_pt_inject_event(event, &sample,
- pt->instructions_sample_type,
- pt->synth_needs_swap);
- if (ret)
- return ret;
- }
+ if (intel_pt_skip_event(pt))
+ return 0;
- ret = perf_session__deliver_synth_event(pt->session, event, &sample);
- if (ret)
- pr_err("Intel Processor Trace: failed to deliver instruction event, error %d\n",
- ret);
+ intel_pt_prep_sample(pt, ptq, event, &sample);
- if (pt->synth_opts.last_branch)
- intel_pt_reset_last_branch_rb(ptq);
+ sample.id = ptq->pt->transactions_id;
+ sample.stream_id = ptq->pt->transactions_id;
- return ret;
+ return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
+ pt->transactions_sample_type);
}
-static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq)
+static void intel_pt_prep_p_sample(struct intel_pt *pt,
+ struct intel_pt_queue *ptq,
+ union perf_event *event,
+ struct perf_sample *sample)
+{
+ intel_pt_prep_sample(pt, ptq, event, sample);
+
+ /*
+ * Zero IP is used to mean "trace start" but that is not the case for
+ * power or PTWRITE events with no IP, so clear the flags.
+ */
+ if (!sample->ip)
+ sample->flags = 0;
+}
+
+static int intel_pt_synth_ptwrite_sample(struct intel_pt_queue *ptq)
{
- int ret;
struct intel_pt *pt = ptq->pt;
union perf_event *event = ptq->event_buf;
struct perf_sample sample = { .ip = 0, };
+ struct perf_synth_intel_ptwrite raw;
- if (pt->synth_opts.initial_skip &&
- pt->num_events++ < pt->synth_opts.initial_skip)
+ if (intel_pt_skip_event(pt))
return 0;
- event->sample.header.type = PERF_RECORD_SAMPLE;
- event->sample.header.misc = PERF_RECORD_MISC_USER;
- event->sample.header.size = sizeof(struct perf_event_header);
+ intel_pt_prep_p_sample(pt, ptq, event, &sample);
- if (!pt->timeless_decoding)
- sample.time = tsc_to_perf_time(ptq->timestamp, &pt->tc);
+ sample.id = ptq->pt->ptwrites_id;
+ sample.stream_id = ptq->pt->ptwrites_id;
- sample.cpumode = PERF_RECORD_MISC_USER;
- sample.ip = ptq->state->from_ip;
- sample.pid = ptq->pid;
- sample.tid = ptq->tid;
- sample.addr = ptq->state->to_ip;
- sample.id = ptq->pt->transactions_id;
- sample.stream_id = ptq->pt->transactions_id;
- sample.period = 1;
- sample.cpu = ptq->cpu;
- sample.flags = ptq->flags;
- sample.insn_len = ptq->insn_len;
- memcpy(sample.insn, ptq->insn, INTEL_PT_INSN_BUF_SZ);
+ raw.flags = 0;
+ raw.ip = !!(ptq->state->flags & INTEL_PT_FUP_IP);
+ raw.payload = cpu_to_le64(ptq->state->ptw_payload);
- if (pt->synth_opts.callchain) {
- thread_stack__sample(ptq->thread, ptq->chain,
- pt->synth_opts.callchain_sz, sample.ip);
- sample.callchain = ptq->chain;
- }
+ sample.raw_size = perf_synth__raw_size(raw);
+ sample.raw_data = perf_synth__raw_data(&raw);
- if (pt->synth_opts.last_branch) {
- intel_pt_copy_last_branch_rb(ptq);
- sample.branch_stack = ptq->last_branch;
- }
+ return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
+ pt->ptwrites_sample_type);
+}
- if (pt->synth_opts.inject) {
- ret = intel_pt_inject_event(event, &sample,
- pt->transactions_sample_type,
- pt->synth_needs_swap);
- if (ret)
- return ret;
- }
+static int intel_pt_synth_cbr_sample(struct intel_pt_queue *ptq)
+{
+ struct intel_pt *pt = ptq->pt;
+ union perf_event *event = ptq->event_buf;
+ struct perf_sample sample = { .ip = 0, };
+ struct perf_synth_intel_cbr raw;
+ u32 flags;
- ret = perf_session__deliver_synth_event(pt->session, event, &sample);
- if (ret)
- pr_err("Intel Processor Trace: failed to deliver transaction event, error %d\n",
- ret);
+ if (intel_pt_skip_event(pt))
+ return 0;
- if (pt->synth_opts.last_branch)
- intel_pt_reset_last_branch_rb(ptq);
+ intel_pt_prep_p_sample(pt, ptq, event, &sample);
- return ret;
+ sample.id = ptq->pt->cbr_id;
+ sample.stream_id = ptq->pt->cbr_id;
+
+ flags = (u16)ptq->state->cbr_payload | (pt->max_non_turbo_ratio << 16);
+ raw.flags = cpu_to_le32(flags);
+ raw.freq = cpu_to_le32(raw.cbr * pt->cbr2khz);
+ raw.reserved3 = 0;
+
+ sample.raw_size = perf_synth__raw_size(raw);
+ sample.raw_data = perf_synth__raw_data(&raw);
+
+ return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
+ pt->pwr_events_sample_type);
+}
+
+static int intel_pt_synth_mwait_sample(struct intel_pt_queue *ptq)
+{
+ struct intel_pt *pt = ptq->pt;
+ union perf_event *event = ptq->event_buf;
+ struct perf_sample sample = { .ip = 0, };
+ struct perf_synth_intel_mwait raw;
+
+ if (intel_pt_skip_event(pt))
+ return 0;
+
+ intel_pt_prep_p_sample(pt, ptq, event, &sample);
+
+ sample.id = ptq->pt->mwait_id;
+ sample.stream_id = ptq->pt->mwait_id;
+
+ raw.reserved = 0;
+ raw.payload = cpu_to_le64(ptq->state->mwait_payload);
+
+ sample.raw_size = perf_synth__raw_size(raw);
+ sample.raw_data = perf_synth__raw_data(&raw);
+
+ return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
+ pt->pwr_events_sample_type);
+}
+
+static int intel_pt_synth_pwre_sample(struct intel_pt_queue *ptq)
+{
+ struct intel_pt *pt = ptq->pt;
+ union perf_event *event = ptq->event_buf;
+ struct perf_sample sample = { .ip = 0, };
+ struct perf_synth_intel_pwre raw;
+
+ if (intel_pt_skip_event(pt))
+ return 0;
+
+ intel_pt_prep_p_sample(pt, ptq, event, &sample);
+
+ sample.id = ptq->pt->pwre_id;
+ sample.stream_id = ptq->pt->pwre_id;
+
+ raw.reserved = 0;
+ raw.payload = cpu_to_le64(ptq->state->pwre_payload);
+
+ sample.raw_size = perf_synth__raw_size(raw);
+ sample.raw_data = perf_synth__raw_data(&raw);
+
+ return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
+ pt->pwr_events_sample_type);
+}
+
+static int intel_pt_synth_exstop_sample(struct intel_pt_queue *ptq)
+{
+ struct intel_pt *pt = ptq->pt;
+ union perf_event *event = ptq->event_buf;
+ struct perf_sample sample = { .ip = 0, };
+ struct perf_synth_intel_exstop raw;
+
+ if (intel_pt_skip_event(pt))
+ return 0;
+
+ intel_pt_prep_p_sample(pt, ptq, event, &sample);
+
+ sample.id = ptq->pt->exstop_id;
+ sample.stream_id = ptq->pt->exstop_id;
+
+ raw.flags = 0;
+ raw.ip = !!(ptq->state->flags & INTEL_PT_FUP_IP);
+
+ sample.raw_size = perf_synth__raw_size(raw);
+ sample.raw_data = perf_synth__raw_data(&raw);
+
+ return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
+ pt->pwr_events_sample_type);
+}
+
+static int intel_pt_synth_pwrx_sample(struct intel_pt_queue *ptq)
+{
+ struct intel_pt *pt = ptq->pt;
+ union perf_event *event = ptq->event_buf;
+ struct perf_sample sample = { .ip = 0, };
+ struct perf_synth_intel_pwrx raw;
+
+ if (intel_pt_skip_event(pt))
+ return 0;
+
+ intel_pt_prep_p_sample(pt, ptq, event, &sample);
+
+ sample.id = ptq->pt->pwrx_id;
+ sample.stream_id = ptq->pt->pwrx_id;
+
+ raw.reserved = 0;
+ raw.payload = cpu_to_le64(ptq->state->pwrx_payload);
+
+ sample.raw_size = perf_synth__raw_size(raw);
+ sample.raw_data = perf_synth__raw_data(&raw);
+
+ return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
+ pt->pwr_events_sample_type);
}
static int intel_pt_synth_error(struct intel_pt *pt, int code, int cpu,
@@ -1296,6 +1469,10 @@ static inline bool intel_pt_is_switch_ip(struct intel_pt_queue *ptq, u64 ip)
PERF_IP_FLAG_INTERRUPT | PERF_IP_FLAG_TX_ABORT));
}
+#define INTEL_PT_PWR_EVT (INTEL_PT_MWAIT_OP | INTEL_PT_PWR_ENTRY | \
+ INTEL_PT_EX_STOP | INTEL_PT_PWR_EXIT | \
+ INTEL_PT_CBR_CHG)
+
static int intel_pt_sample(struct intel_pt_queue *ptq)
{
const struct intel_pt_state *state = ptq->state;
@@ -1307,24 +1484,52 @@ static int intel_pt_sample(struct intel_pt_queue *ptq)
ptq->have_sample = false;
- if (pt->sample_instructions &&
- (state->type & INTEL_PT_INSTRUCTION) &&
- (!pt->synth_opts.initial_skip ||
- pt->num_events++ >= pt->synth_opts.initial_skip)) {
+ if (pt->sample_pwr_events && (state->type & INTEL_PT_PWR_EVT)) {
+ if (state->type & INTEL_PT_CBR_CHG) {
+ err = intel_pt_synth_cbr_sample(ptq);
+ if (err)
+ return err;
+ }
+ if (state->type & INTEL_PT_MWAIT_OP) {
+ err = intel_pt_synth_mwait_sample(ptq);
+ if (err)
+ return err;
+ }
+ if (state->type & INTEL_PT_PWR_ENTRY) {
+ err = intel_pt_synth_pwre_sample(ptq);
+ if (err)
+ return err;
+ }
+ if (state->type & INTEL_PT_EX_STOP) {
+ err = intel_pt_synth_exstop_sample(ptq);
+ if (err)
+ return err;
+ }
+ if (state->type & INTEL_PT_PWR_EXIT) {
+ err = intel_pt_synth_pwrx_sample(ptq);
+ if (err)
+ return err;
+ }
+ }
+
+ if (pt->sample_instructions && (state->type & INTEL_PT_INSTRUCTION)) {
err = intel_pt_synth_instruction_sample(ptq);
if (err)
return err;
}
- if (pt->sample_transactions &&
- (state->type & INTEL_PT_TRANSACTION) &&
- (!pt->synth_opts.initial_skip ||
- pt->num_events++ >= pt->synth_opts.initial_skip)) {
+ if (pt->sample_transactions && (state->type & INTEL_PT_TRANSACTION)) {
err = intel_pt_synth_transaction_sample(ptq);
if (err)
return err;
}
+ if (pt->sample_ptwrites && (state->type & INTEL_PT_PTW)) {
+ err = intel_pt_synth_ptwrite_sample(ptq);
+ if (err)
+ return err;
+ }
+
if (!(state->type & INTEL_PT_BRANCH))
return 0;
@@ -1925,36 +2130,65 @@ static int intel_pt_event_synth(struct perf_tool *tool,
NULL);
}
-static int intel_pt_synth_event(struct perf_session *session,
+static int intel_pt_synth_event(struct perf_session *session, const char *name,
struct perf_event_attr *attr, u64 id)
{
struct intel_pt_synth intel_pt_synth;
+ int err;
+
+ pr_debug("Synthesizing '%s' event with id %" PRIu64 " sample type %#" PRIx64 "\n",
+ name, id, (u64)attr->sample_type);
memset(&intel_pt_synth, 0, sizeof(struct intel_pt_synth));
intel_pt_synth.session = session;
- return perf_event__synthesize_attr(&intel_pt_synth.dummy_tool, attr, 1,
- &id, intel_pt_event_synth);
+ err = perf_event__synthesize_attr(&intel_pt_synth.dummy_tool, attr, 1,
+ &id, intel_pt_event_synth);
+ if (err)
+ pr_err("%s: failed to synthesize '%s' event type\n",
+ __func__, name);
+
+ return err;
}
-static int intel_pt_synth_events(struct intel_pt *pt,
- struct perf_session *session)
+static void intel_pt_set_event_name(struct perf_evlist *evlist, u64 id,
+ const char *name)
{
- struct perf_evlist *evlist = session->evlist;
struct perf_evsel *evsel;
- struct perf_event_attr attr;
- bool found = false;
- u64 id;
- int err;
evlist__for_each_entry(evlist, evsel) {
- if (evsel->attr.type == pt->pmu_type && evsel->ids) {
- found = true;
+ if (evsel->id && evsel->id[0] == id) {
+ if (evsel->name)
+ zfree(&evsel->name);
+ evsel->name = strdup(name);
break;
}
}
+}
- if (!found) {
+static struct perf_evsel *intel_pt_evsel(struct intel_pt *pt,
+ struct perf_evlist *evlist)
+{
+ struct perf_evsel *evsel;
+
+ evlist__for_each_entry(evlist, evsel) {
+ if (evsel->attr.type == pt->pmu_type && evsel->ids)
+ return evsel;
+ }
+
+ return NULL;
+}
+
+static int intel_pt_synth_events(struct intel_pt *pt,
+ struct perf_session *session)
+{
+ struct perf_evlist *evlist = session->evlist;
+ struct perf_evsel *evsel = intel_pt_evsel(pt, evlist);
+ struct perf_event_attr attr;
+ u64 id;
+ int err;
+
+ if (!evsel) {
pr_debug("There are no selected events with Intel Processor Trace data\n");
return 0;
}
@@ -1983,6 +2217,25 @@ static int intel_pt_synth_events(struct intel_pt *pt,
if (!id)
id = 1;
+ if (pt->synth_opts.branches) {
+ attr.config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS;
+ attr.sample_period = 1;
+ attr.sample_type |= PERF_SAMPLE_ADDR;
+ err = intel_pt_synth_event(session, "branches", &attr, id);
+ if (err)
+ return err;
+ pt->sample_branches = true;
+ pt->branches_sample_type = attr.sample_type;
+ pt->branches_id = id;
+ id += 1;
+ attr.sample_type &= ~(u64)PERF_SAMPLE_ADDR;
+ }
+
+ if (pt->synth_opts.callchain)
+ attr.sample_type |= PERF_SAMPLE_CALLCHAIN;
+ if (pt->synth_opts.last_branch)
+ attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
+
if (pt->synth_opts.instructions) {
attr.config = PERF_COUNT_HW_INSTRUCTIONS;
if (pt->synth_opts.period_type == PERF_ITRACE_PERIOD_NANOSECS)
@@ -1990,70 +2243,90 @@ static int intel_pt_synth_events(struct intel_pt *pt,
intel_pt_ns_to_ticks(pt, pt->synth_opts.period);
else
attr.sample_period = pt->synth_opts.period;
- pt->instructions_sample_period = attr.sample_period;
- if (pt->synth_opts.callchain)
- attr.sample_type |= PERF_SAMPLE_CALLCHAIN;
- if (pt->synth_opts.last_branch)
- attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
- pr_debug("Synthesizing 'instructions' event with id %" PRIu64 " sample type %#" PRIx64 "\n",
- id, (u64)attr.sample_type);
- err = intel_pt_synth_event(session, &attr, id);
- if (err) {
- pr_err("%s: failed to synthesize 'instructions' event type\n",
- __func__);
+ err = intel_pt_synth_event(session, "instructions", &attr, id);
+ if (err)
return err;
- }
pt->sample_instructions = true;
pt->instructions_sample_type = attr.sample_type;
pt->instructions_id = id;
id += 1;
}
+ attr.sample_type &= ~(u64)PERF_SAMPLE_PERIOD;
+ attr.sample_period = 1;
+
if (pt->synth_opts.transactions) {
attr.config = PERF_COUNT_HW_INSTRUCTIONS;
- attr.sample_period = 1;
- if (pt->synth_opts.callchain)
- attr.sample_type |= PERF_SAMPLE_CALLCHAIN;
- if (pt->synth_opts.last_branch)
- attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
- pr_debug("Synthesizing 'transactions' event with id %" PRIu64 " sample type %#" PRIx64 "\n",
- id, (u64)attr.sample_type);
- err = intel_pt_synth_event(session, &attr, id);
- if (err) {
- pr_err("%s: failed to synthesize 'transactions' event type\n",
- __func__);
+ err = intel_pt_synth_event(session, "transactions", &attr, id);
+ if (err)
return err;
- }
pt->sample_transactions = true;
+ pt->transactions_sample_type = attr.sample_type;
pt->transactions_id = id;
+ intel_pt_set_event_name(evlist, id, "transactions");
id += 1;
- evlist__for_each_entry(evlist, evsel) {
- if (evsel->id && evsel->id[0] == pt->transactions_id) {
- if (evsel->name)
- zfree(&evsel->name);
- evsel->name = strdup("transactions");
- break;
- }
- }
}
- if (pt->synth_opts.branches) {
- attr.config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS;
- attr.sample_period = 1;
- attr.sample_type |= PERF_SAMPLE_ADDR;
- attr.sample_type &= ~(u64)PERF_SAMPLE_CALLCHAIN;
- attr.sample_type &= ~(u64)PERF_SAMPLE_BRANCH_STACK;
- pr_debug("Synthesizing 'branches' event with id %" PRIu64 " sample type %#" PRIx64 "\n",
- id, (u64)attr.sample_type);
- err = intel_pt_synth_event(session, &attr, id);
- if (err) {
- pr_err("%s: failed to synthesize 'branches' event type\n",
- __func__);
+ attr.type = PERF_TYPE_SYNTH;
+ attr.sample_type |= PERF_SAMPLE_RAW;
+
+ if (pt->synth_opts.ptwrites) {
+ attr.config = PERF_SYNTH_INTEL_PTWRITE;
+ err = intel_pt_synth_event(session, "ptwrite", &attr, id);
+ if (err)
return err;
- }
- pt->sample_branches = true;
- pt->branches_sample_type = attr.sample_type;
- pt->branches_id = id;
+ pt->sample_ptwrites = true;
+ pt->ptwrites_sample_type = attr.sample_type;
+ pt->ptwrites_id = id;
+ intel_pt_set_event_name(evlist, id, "ptwrite");
+ id += 1;
+ }
+
+ if (pt->synth_opts.pwr_events) {
+ pt->sample_pwr_events = true;
+ pt->pwr_events_sample_type = attr.sample_type;
+
+ attr.config = PERF_SYNTH_INTEL_CBR;
+ err = intel_pt_synth_event(session, "cbr", &attr, id);
+ if (err)
+ return err;
+ pt->cbr_id = id;
+ intel_pt_set_event_name(evlist, id, "cbr");
+ id += 1;
+ }
+
+ if (pt->synth_opts.pwr_events && (evsel->attr.config & 0x10)) {
+ attr.config = PERF_SYNTH_INTEL_MWAIT;
+ err = intel_pt_synth_event(session, "mwait", &attr, id);
+ if (err)
+ return err;
+ pt->mwait_id = id;
+ intel_pt_set_event_name(evlist, id, "mwait");
+ id += 1;
+
+ attr.config = PERF_SYNTH_INTEL_PWRE;
+ err = intel_pt_synth_event(session, "pwre", &attr, id);
+ if (err)
+ return err;
+ pt->pwre_id = id;
+ intel_pt_set_event_name(evlist, id, "pwre");
+ id += 1;
+
+ attr.config = PERF_SYNTH_INTEL_EXSTOP;
+ err = intel_pt_synth_event(session, "exstop", &attr, id);
+ if (err)
+ return err;
+ pt->exstop_id = id;
+ intel_pt_set_event_name(evlist, id, "exstop");
+ id += 1;
+
+ attr.config = PERF_SYNTH_INTEL_PWRX;
+ err = intel_pt_synth_event(session, "pwrx", &attr, id);
+ if (err)
+ return err;
+ pt->pwrx_id = id;
+ intel_pt_set_event_name(evlist, id, "pwrx");
+ id += 1;
}
pt->synth_needs_swap = evsel->needs_swap;
@@ -2322,6 +2595,7 @@ int intel_pt_process_auxtrace_info(union perf_event *event,
intel_pt_log("TSC frequency %"PRIu64"\n", tsc_freq);
intel_pt_log("Maximum non-turbo ratio %u\n",
pt->max_non_turbo_ratio);
+ pt->cbr2khz = tsc_freq / pt->max_non_turbo_ratio / 1000;
}
if (pt->synth_opts.calls)
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index d7f31cb0a4cb..5de2b86b9880 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1209,10 +1209,12 @@ int machine__create_kernel_maps(struct machine *machine)
*/
map_groups__fixup_end(&machine->kmaps);
- if (machine__get_running_kernel_start(machine, &name, &addr)) {
- } else if (maps__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps, name, addr)) {
- machine__destroy_kernel_maps(machine);
- return -1;
+ if (!machine__get_running_kernel_start(machine, &name, &addr)) {
+ if (name &&
+ maps__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps, name, addr)) {
+ machine__destroy_kernel_maps(machine);
+ return -1;
+ }
}
return 0;
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index ea7f450dc609..389e9729331f 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -2,6 +2,7 @@
#define __PMU_H
#include <linux/bitmap.h>
+#include <linux/compiler.h>
#include <linux/perf_event.h>
#include <stdbool.h>
#include "evsel.h"
@@ -83,8 +84,7 @@ void print_pmu_events(const char *event_glob, bool name_only, bool quiet,
bool long_desc, bool details_flag);
bool pmu_have_event(const char *pname, const char *name);
-int perf_pmu__scan_file(struct perf_pmu *pmu, const char *name, const char *fmt,
- ...) __attribute__((format(scanf, 3, 4)));
+int perf_pmu__scan_file(struct perf_pmu *pmu, const char *name, const char *fmt, ...) __scanf(3, 4);
int perf_pmu__test(void);
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 84e7e698411e..a2670e9d652d 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -619,7 +619,7 @@ static int post_process_probe_trace_point(struct probe_trace_point *tp,
struct map *map, unsigned long offs)
{
struct symbol *sym;
- u64 addr = tp->address + tp->offset - offs;
+ u64 addr = tp->address - offs;
sym = map__find_symbol(map, addr);
if (!sym)
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index 373842656fb6..5812947418dd 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -1,6 +1,7 @@
#ifndef _PROBE_EVENT_H
#define _PROBE_EVENT_H
+#include <linux/compiler.h>
#include <stdbool.h>
#include "intlist.h"
@@ -171,8 +172,7 @@ void arch__fix_tev_from_maps(struct perf_probe_event *pev,
struct symbol *sym);
/* If there is no space to write, returns -E2BIG. */
-int e_snprintf(char *str, size_t size, const char *format, ...)
- __attribute__((format(printf, 3, 4)));
+int e_snprintf(char *str, size_t size, const char *format, ...) __printf(3, 4);
/* Maximum index number of event-name postfix */
#define MAX_EVENT_INDEX 1024
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 40de3cb40d21..57b7a00e6f16 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -28,6 +28,7 @@
#include <stdbool.h>
#include <errno.h>
#include <linux/bitmap.h>
+#include <linux/compiler.h>
#include <linux/time64.h>
#include "../../perf.h"
@@ -84,7 +85,7 @@ struct tables {
static struct tables tables_global;
-static void handler_call_die(const char *handler_name) NORETURN;
+static void handler_call_die(const char *handler_name) __noreturn;
static void handler_call_die(const char *handler_name)
{
PyErr_Print();
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 7dc1096264c5..d19c40a81040 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -2035,7 +2035,7 @@ int perf_session__cpu_bitmap(struct perf_session *session,
if (!(evsel->attr.sample_type & PERF_SAMPLE_CPU)) {
pr_err("File does not contain CPU events. "
- "Remove -c option to proceed.\n");
+ "Remove -C option to proceed.\n");
return -1;
}
}
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 5762ae4e9e91..8b327c955a4f 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -2532,12 +2532,12 @@ static int setup_sort_list(struct perf_hpp_list *list, char *str,
ret = sort_dimension__add(list, tok, evlist, level);
if (ret == -EINVAL) {
if (!cacheline_size && !strncasecmp(tok, "dcacheline", strlen(tok)))
- error("The \"dcacheline\" --sort key needs to know the cacheline size and it couldn't be determined on this system");
+ pr_err("The \"dcacheline\" --sort key needs to know the cacheline size and it couldn't be determined on this system");
else
- error("Invalid --sort key: `%s'", tok);
+ pr_err("Invalid --sort key: `%s'", tok);
break;
} else if (ret == -ESRCH) {
- error("Unknown --sort key: `%s'", tok);
+ pr_err("Unknown --sort key: `%s'", tok);
break;
}
}
@@ -2594,7 +2594,7 @@ static int setup_sort_order(struct perf_evlist *evlist)
return 0;
if (sort_order[1] == '\0') {
- error("Invalid --sort key: `+'");
+ pr_err("Invalid --sort key: `+'");
return -EINVAL;
}
@@ -2604,7 +2604,7 @@ static int setup_sort_order(struct perf_evlist *evlist)
*/
if (asprintf(&new_sort_order, "%s,%s",
get_default_sort_order(evlist), sort_order + 1) < 0) {
- error("Not enough memory to set up --sort");
+ pr_err("Not enough memory to set up --sort");
return -ENOMEM;
}
@@ -2668,7 +2668,7 @@ static int __setup_sorting(struct perf_evlist *evlist)
str = strdup(sort_keys);
if (str == NULL) {
- error("Not enough memory to setup sort keys");
+ pr_err("Not enough memory to setup sort keys");
return -ENOMEM;
}
@@ -2678,7 +2678,7 @@ static int __setup_sorting(struct perf_evlist *evlist)
if (!is_strict_order(field_order)) {
str = setup_overhead(str);
if (str == NULL) {
- error("Not enough memory to setup overhead keys");
+ pr_err("Not enough memory to setup overhead keys");
return -ENOMEM;
}
}
@@ -2834,10 +2834,10 @@ static int setup_output_list(struct perf_hpp_list *list, char *str)
tok; tok = strtok_r(NULL, ", ", &tmp)) {
ret = output_field_add(list, tok);
if (ret == -EINVAL) {
- error("Invalid --fields key: `%s'", tok);
+ pr_err("Invalid --fields key: `%s'", tok);
break;
} else if (ret == -ESRCH) {
- error("Unknown --fields key: `%s'", tok);
+ pr_err("Unknown --fields key: `%s'", tok);
break;
}
}
@@ -2877,7 +2877,7 @@ static int __setup_output_field(void)
strp = str = strdup(field_order);
if (str == NULL) {
- error("Not enough memory to setup output fields");
+ pr_err("Not enough memory to setup output fields");
return -ENOMEM;
}
@@ -2885,7 +2885,7 @@ static int __setup_output_field(void)
strp++;
if (!strlen(strp)) {
- error("Invalid --fields key: `+'");
+ pr_err("Invalid --fields key: `+'");
goto out;
}
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index ac10cc675d39..719d6cb86952 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -44,6 +44,8 @@ static struct stats runtime_topdown_slots_issued[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_slots_retired[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_fetch_bubbles[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_recovery_bubbles[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_smi_num_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_aperf_stats[NUM_CTX][MAX_NR_CPUS];
static struct rblist runtime_saved_values;
static bool have_frontend_stalled;
@@ -157,6 +159,8 @@ void perf_stat__reset_shadow_stats(void)
memset(runtime_topdown_slots_issued, 0, sizeof(runtime_topdown_slots_issued));
memset(runtime_topdown_fetch_bubbles, 0, sizeof(runtime_topdown_fetch_bubbles));
memset(runtime_topdown_recovery_bubbles, 0, sizeof(runtime_topdown_recovery_bubbles));
+ memset(runtime_smi_num_stats, 0, sizeof(runtime_smi_num_stats));
+ memset(runtime_aperf_stats, 0, sizeof(runtime_aperf_stats));
next = rb_first(&runtime_saved_values.entries);
while (next) {
@@ -217,6 +221,10 @@ void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count,
update_stats(&runtime_dtlb_cache_stats[ctx][cpu], count[0]);
else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
update_stats(&runtime_itlb_cache_stats[ctx][cpu], count[0]);
+ else if (perf_stat_evsel__is(counter, SMI_NUM))
+ update_stats(&runtime_smi_num_stats[ctx][cpu], count[0]);
+ else if (perf_stat_evsel__is(counter, APERF))
+ update_stats(&runtime_aperf_stats[ctx][cpu], count[0]);
if (counter->collect_stat) {
struct saved_value *v = saved_value_lookup(counter, cpu, ctx,
@@ -592,6 +600,29 @@ static double td_be_bound(int ctx, int cpu)
return sanitize_val(1.0 - sum);
}
+static void print_smi_cost(int cpu, struct perf_evsel *evsel,
+ struct perf_stat_output_ctx *out)
+{
+ double smi_num, aperf, cycles, cost = 0.0;
+ int ctx = evsel_context(evsel);
+ const char *color = NULL;
+
+ smi_num = avg_stats(&runtime_smi_num_stats[ctx][cpu]);
+ aperf = avg_stats(&runtime_aperf_stats[ctx][cpu]);
+ cycles = avg_stats(&runtime_cycles_stats[ctx][cpu]);
+
+ if ((cycles == 0) || (aperf == 0))
+ return;
+
+ if (smi_num)
+ cost = (aperf - cycles) / aperf * 100.00;
+
+ if (cost > 10)
+ color = PERF_COLOR_RED;
+ out->print_metric(out->ctx, color, "%8.1f%%", "SMI cycles%", cost);
+ out->print_metric(out->ctx, NULL, "%4.0f", "SMI#", smi_num);
+}
+
void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
double avg, int cpu,
struct perf_stat_output_ctx *out)
@@ -825,6 +856,8 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
}
snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
print_metric(ctxp, NULL, "%8.3f", unit_buf, ratio);
+ } else if (perf_stat_evsel__is(evsel, SMI_NUM)) {
+ print_smi_cost(cpu, evsel, out);
} else {
print_metric(ctxp, NULL, NULL, NULL, 0);
}
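
Aside (not part of the patch): the new print_smi_cost() helper above treats the gap between the msr/aperf/ count and unhalted core cycles as time spent in SMM whenever any SMIs were counted. A minimal standalone sketch of that arithmetic follows; all counter values are made-up examples, and the helper name is only illustrative.

/*
 * Sketch of the SMI-cost arithmetic used by print_smi_cost(); the
 * aperf/cycles/smi_num values below are hypothetical, not measured.
 */
#include <stdio.h>

int main(void)
{
	double aperf   = 1000000.0;	/* msr/aperf/ count (hypothetical) */
	double cycles  =  950000.0;	/* unhalted core cycles (hypothetical) */
	double smi_num =       3.0;	/* msr/smi/ count (hypothetical) */
	double cost    =       0.0;

	/* Cycles APERF saw but the cycles event did not are attributed to SMM. */
	if (aperf != 0.0 && cycles != 0.0 && smi_num != 0.0)
		cost = (aperf - cycles) / aperf * 100.0;

	printf("SMI cycles%%: %8.1f%%  SMI#: %4.0f\n", cost, smi_num);
	return 0;
}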
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index c58174443dc1..53b9a994a3dc 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -86,6 +86,8 @@ static const char *id_str[PERF_STAT_EVSEL_ID__MAX] = {
ID(TOPDOWN_SLOTS_RETIRED, topdown-slots-retired),
ID(TOPDOWN_FETCH_BUBBLES, topdown-fetch-bubbles),
ID(TOPDOWN_RECOVERY_BUBBLES, topdown-recovery-bubbles),
+ ID(SMI_NUM, msr/smi/),
+ ID(APERF, msr/aperf/),
};
#undef ID
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 0a65ae23f495..7522bf10b03e 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -22,6 +22,8 @@ enum perf_stat_evsel_id {
PERF_STAT_EVSEL_ID__TOPDOWN_SLOTS_RETIRED,
PERF_STAT_EVSEL_ID__TOPDOWN_FETCH_BUBBLES,
PERF_STAT_EVSEL_ID__TOPDOWN_RECOVERY_BUBBLES,
+ PERF_STAT_EVSEL_ID__SMI_NUM,
+ PERF_STAT_EVSEL_ID__APERF,
PERF_STAT_EVSEL_ID__MAX,
};
diff --git a/tools/perf/util/strbuf.h b/tools/perf/util/strbuf.h
index 318424ea561d..802d743378af 100644
--- a/tools/perf/util/strbuf.h
+++ b/tools/perf/util/strbuf.h
@@ -42,6 +42,7 @@
#include <stdarg.h>
#include <stddef.h>
#include <string.h>
+#include <linux/compiler.h>
#include <sys/types.h>
extern char strbuf_slopbuf[];
@@ -85,8 +86,7 @@ static inline int strbuf_addstr(struct strbuf *sb, const char *s) {
return strbuf_add(sb, s, strlen(s));
}
-__attribute__((format(printf,2,3)))
-int strbuf_addf(struct strbuf *sb, const char *fmt, ...);
+int strbuf_addf(struct strbuf *sb, const char *fmt, ...) __printf(2, 3);
/* XXX: if read fails, any partial read is undone */
ssize_t strbuf_read(struct strbuf *, int fd, ssize_t hint);
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 746bbee645d9..e0a6e9a6a053 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -24,7 +24,7 @@
#include <errno.h>
#include "../perf.h"
-#include "util.h"
+#include "debug.h"
#include "trace-event.h"
#include "sane_ctype.h"
@@ -150,7 +150,7 @@ void parse_ftrace_printk(struct pevent *pevent,
while (line) {
addr_str = strtok_r(line, ":", &fmt);
if (!addr_str) {
- warning("printk format with empty entry");
+ pr_warning("printk format with empty entry");
break;
}
addr = strtoull(addr_str, NULL, 16);
diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index da45c4be5fb3..7755a5e0fe5e 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -178,6 +178,14 @@ frame_callback(Dwfl_Frame *state, void *arg)
Dwarf_Addr pc;
bool isactivation;
+ if (!dwfl_frame_pc(state, &pc, NULL)) {
+ pr_err("%s", dwfl_errmsg(-1));
+ return DWARF_CB_ABORT;
+ }
+
+ // report the module before we query for isactivation
+ report_module(pc, ui);
+
if (!dwfl_frame_pc(state, &pc, &isactivation)) {
pr_err("%s", dwfl_errmsg(-1));
return DWARF_CB_ABORT;
diff --git a/tools/perf/util/usage.c b/tools/perf/util/usage.c
index 996046a66fe5..6cc9d9888ce0 100644
--- a/tools/perf/util/usage.c
+++ b/tools/perf/util/usage.c
@@ -9,75 +9,17 @@
#include "util.h"
#include "debug.h"
-static void report(const char *prefix, const char *err, va_list params)
-{
- char msg[1024];
- vsnprintf(msg, sizeof(msg), err, params);
- fprintf(stderr, " %s%s\n", prefix, msg);
-}
-
-static NORETURN void usage_builtin(const char *err)
+static __noreturn void usage_builtin(const char *err)
{
fprintf(stderr, "\n Usage: %s\n", err);
exit(129);
}
-static NORETURN void die_builtin(const char *err, va_list params)
-{
- report(" Fatal: ", err, params);
- exit(128);
-}
-
-static void error_builtin(const char *err, va_list params)
-{
- report(" Error: ", err, params);
-}
-
-static void warn_builtin(const char *warn, va_list params)
-{
- report(" Warning: ", warn, params);
-}
-
/* If we are in a dlopen()ed .so write to a global variable would segfault
* (ugh), so keep things static. */
-static void (*usage_routine)(const char *err) NORETURN = usage_builtin;
-static void (*error_routine)(const char *err, va_list params) = error_builtin;
-static void (*warn_routine)(const char *err, va_list params) = warn_builtin;
-
-void set_warning_routine(void (*routine)(const char *err, va_list params))
-{
- warn_routine = routine;
-}
+static void (*usage_routine)(const char *err) __noreturn = usage_builtin;
void usage(const char *err)
{
usage_routine(err);
}
-
-void die(const char *err, ...)
-{
- va_list params;
-
- va_start(params, err);
- die_builtin(err, params);
- va_end(params);
-}
-
-int error(const char *err, ...)
-{
- va_list params;
-
- va_start(params, err);
- error_routine(err, params);
- va_end(params);
- return -1;
-}
-
-void warning(const char *warn, ...)
-{
- va_list params;
-
- va_start(params, warn);
- warn_routine(warn, params);
- va_end(params);
-}
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index 28c9f335006c..988111e0bab5 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -343,43 +343,6 @@ int perf_event_paranoid(void)
return value;
}
-
-bool find_process(const char *name)
-{
- size_t len = strlen(name);
- DIR *dir;
- struct dirent *d;
- int ret = -1;
-
- dir = opendir(procfs__mountpoint());
- if (!dir)
- return false;
-
- /* Walk through the directory. */
- while (ret && (d = readdir(dir)) != NULL) {
- char path[PATH_MAX];
- char *data;
- size_t size;
-
- if ((d->d_type != DT_DIR) ||
- !strcmp(".", d->d_name) ||
- !strcmp("..", d->d_name))
- continue;
-
- scnprintf(path, sizeof(path), "%s/%s/comm",
- procfs__mountpoint(), d->d_name);
-
- if (filename__read_str(path, &data, &size))
- continue;
-
- ret = strncmp(name, data, len);
- free(data);
- }
-
- closedir(dir);
- return ret ? false : true;
-}
-
static int
fetch_ubuntu_kernel_version(unsigned int *puint)
{
@@ -387,8 +350,12 @@ fetch_ubuntu_kernel_version(unsigned int *puint)
size_t line_len = 0;
char *ptr, *line = NULL;
int version, patchlevel, sublevel, err;
- FILE *vsig = fopen("/proc/version_signature", "r");
+ FILE *vsig;
+
+ if (!puint)
+ return 0;
+ vsig = fopen("/proc/version_signature", "r");
if (!vsig) {
pr_debug("Open /proc/version_signature failed: %s\n",
strerror(errno));
@@ -418,8 +385,7 @@ fetch_ubuntu_kernel_version(unsigned int *puint)
goto errout;
}
- if (puint)
- *puint = (version << 16) + (patchlevel << 8) + sublevel;
+ *puint = (version << 16) + (patchlevel << 8) + sublevel;
err = 0;
errout:
free(line);
@@ -446,6 +412,9 @@ fetch_kernel_version(unsigned int *puint, char *str,
str[str_size - 1] = '\0';
}
+ if (!puint || int_ver_ready)
+ return 0;
+
err = sscanf(utsname.release, "%d.%d.%d",
&version, &patchlevel, &sublevel);
@@ -455,8 +424,7 @@ fetch_kernel_version(unsigned int *puint, char *str,
return -1;
}
- if (puint && !int_ver_ready)
- *puint = (version << 16) + (patchlevel << 8) + sublevel;
+ *puint = (version << 16) + (patchlevel << 8) + sublevel;
return 0;
}
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 5dfb9bb6482d..2c9e58a45310 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -1,7 +1,6 @@
#ifndef GIT_COMPAT_UTIL_H
#define GIT_COMPAT_UTIL_H
-#define _ALL_SOURCE 1
#define _BSD_SOURCE 1
/* glibc 2.20 deprecates _BSD_SOURCE in favour of _DEFAULT_SOURCE */
#define _DEFAULT_SOURCE 1
@@ -11,24 +10,12 @@
#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
+#include <linux/compiler.h>
#include <linux/types.h>
-#ifdef __GNUC__
-#define NORETURN __attribute__((__noreturn__))
-#else
-#define NORETURN
-#ifndef __attribute__
-#define __attribute__(x)
-#endif
-#endif
-
/* General helper functions */
-void usage(const char *err) NORETURN;
-void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2)));
-int error(const char *err, ...) __attribute__((format (printf, 1, 2)));
-void warning(const char *err, ...) __attribute__((format (printf, 1, 2)));
-
-void set_warning_routine(void (*routine)(const char *err, va_list params));
+void usage(const char *err) __noreturn;
+void die(const char *err, ...) __noreturn __printf(1, 2);
static inline void *zalloc(size_t size)
{
@@ -57,8 +44,6 @@ int hex2u64(const char *ptr, u64 *val);
extern unsigned int page_size;
extern int cacheline_size;
-bool find_process(const char *name);
-
int fetch_kernel_version(unsigned int *puint,
char *str, size_t str_sz);
#define KVER_VERSION(x) (((x) >> 16) & 0xff)
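
Aside (not part of the patch): fetch_kernel_version() packs the version triple into one integer as (version << 16) + (patchlevel << 8) + sublevel, which KVER_VERSION()-style macros then unpack. A small sketch is below; KVER_PATCHLEVEL()/KVER_SUBLEVEL() are assumed here to mirror the KVER_VERSION() definition shown above, and the 4.12.0 value is just an example.

/*
 * Sketch of the kernel-version packing used by fetch_kernel_version().
 * Only KVER_VERSION() appears in the hunk above; the other two macros
 * are assumed to follow the same pattern.
 */
#include <stdio.h>

#define KVER_VERSION(x)    (((x) >> 16) & 0xff)
#define KVER_PATCHLEVEL(x) (((x) >> 8) & 0xff)
#define KVER_SUBLEVEL(x)   ((x) & 0xff)

int main(void)
{
	unsigned int version = 4, patchlevel = 12, sublevel = 0;	/* e.g. 4.12.0 */
	unsigned int packed = (version << 16) + (patchlevel << 8) + sublevel;

	printf("%u.%u.%u -> 0x%06x\n", KVER_VERSION(packed),
	       KVER_PATCHLEVEL(packed), KVER_SUBLEVEL(packed), packed);
	return 0;
}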
diff --git a/tools/testing/selftests/bpf/bpf_endian.h b/tools/testing/selftests/bpf/bpf_endian.h
index 19d0604f8694..487cbfb89beb 100644
--- a/tools/testing/selftests/bpf/bpf_endian.h
+++ b/tools/testing/selftests/bpf/bpf_endian.h
@@ -1,23 +1,42 @@
#ifndef __BPF_ENDIAN__
#define __BPF_ENDIAN__
-#include <asm/byteorder.h>
+#include <linux/swab.h>
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-# define __bpf_ntohs(x) __builtin_bswap16(x)
-# define __bpf_htons(x) __builtin_bswap16(x)
-#elif __BYTE_ORDER == __BIG_ENDIAN
-# define __bpf_ntohs(x) (x)
-# define __bpf_htons(x) (x)
+/* LLVM's BPF target selects the endianness of the CPU
+ * it compiles on, or the user specifies (bpfel/bpfeb),
+ * respectively. The used __BYTE_ORDER__ is defined by
+ * the compiler, we cannot rely on __BYTE_ORDER from
+ * libc headers, since it doesn't reflect the actual
+ * requested byte order.
+ *
+ * Note, LLVM's BPF target has different __builtin_bswapX()
+ * semantics. It does map to BPF_ALU | BPF_END | BPF_TO_BE
+ * in bpfel and bpfeb case, which means below, that we map
+ * to cpu_to_be16(). We could use it unconditionally in BPF
+ * case, but better not rely on it, so that this header here
+ * can be used from application and BPF program side, which
+ * use different targets.
+ */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+# define __bpf_ntohs(x) __builtin_bswap16(x)
+# define __bpf_htons(x) __builtin_bswap16(x)
+# define __bpf_constant_ntohs(x) ___constant_swab16(x)
+# define __bpf_constant_htons(x) ___constant_swab16(x)
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+# define __bpf_ntohs(x) (x)
+# define __bpf_htons(x) (x)
+# define __bpf_constant_ntohs(x) (x)
+# define __bpf_constant_htons(x) (x)
#else
-# error "Fix your __BYTE_ORDER?!"
+# error "Fix your compiler's __BYTE_ORDER__?!"
#endif
#define bpf_htons(x) \
(__builtin_constant_p(x) ? \
- __constant_htons(x) : __bpf_htons(x))
+ __bpf_constant_htons(x) : __bpf_htons(x))
#define bpf_ntohs(x) \
(__builtin_constant_p(x) ? \
- __constant_ntohs(x) : __bpf_ntohs(x))
+ __bpf_constant_ntohs(x) : __bpf_ntohs(x))
-#endif
+#endif /* __BPF_ENDIAN__ */
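
Aside (not part of the patch): the reworked bpf_htons()/bpf_ntohs() macros use __builtin_constant_p() to pick a constant-foldable byte swap for compile-time constants and __builtin_bswap16() otherwise. The host-side sketch below mimics that split for a little-endian build; the __example_* names are invented for illustration and are not from the header.

/*
 * Host-side sketch of the constant/runtime split in bpf_htons(),
 * assuming a little-endian host; not a BPF program.
 */
#include <stdio.h>
#include <stdint.h>

#define __example_constant_swab16(x) \
	((uint16_t)((((x) & 0x00ff) << 8) | (((x) & 0xff00) >> 8)))

#define example_htons(x)				\
	(__builtin_constant_p(x) ?			\
	 __example_constant_swab16(x) : __builtin_bswap16(x))

int main(void)
{
	uint16_t port = 80;

	/* 80 is a compile-time constant, so the swab can fold at build time. */
	printf("constant: 0x%04x\n", example_htons(80));
	/* 'port' is a runtime value, so __builtin_bswap16() is used instead. */
	printf("runtime:  0x%04x\n", example_htons(port));
	return 0;
}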
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index cabb19b1e371..0ff8c55c0464 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -3749,6 +3749,72 @@ static struct bpf_test tests[] = {
.errstr = "invalid bpf_context access",
},
{
+ "leak pointer into ctx 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_STX_XADD(BPF_DW, BPF_REG_1, BPF_REG_2,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map1 = { 2 },
+ .errstr_unpriv = "R2 leaks addr into mem",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ },
+ {
+ "leak pointer into ctx 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_STX_XADD(BPF_DW, BPF_REG_1, BPF_REG_10,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R10 leaks addr into mem",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ },
+ {
+ "leak pointer into ctx 3",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map1 = { 1 },
+ .errstr_unpriv = "R2 leaks addr into ctx",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ },
+ {
+ "leak pointer into map val",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0),
+ BPF_STX_XADD(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map1 = { 4 },
+ .errstr_unpriv = "R6 leaks addr into mem",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ },
+ {
"helper access to map: full range",
.insns = {
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
diff --git a/tools/testing/selftests/ntb/ntb_test.sh b/tools/testing/selftests/ntb/ntb_test.sh
index a676d3eefefb..13f5198ba0ee 100755
--- a/tools/testing/selftests/ntb/ntb_test.sh
+++ b/tools/testing/selftests/ntb/ntb_test.sh
@@ -305,7 +305,7 @@ function perf_test()
echo "Running remote perf test $WITH DMA"
write_file "" $REMOTE_PERF/run
echo -n " "
- read_file $LOCAL_PERF/run
+ read_file $REMOTE_PERF/run
echo " Passed"
_modprobe -r ntb_perf
diff --git a/tools/testing/selftests/rcutorture/bin/configcheck.sh b/tools/testing/selftests/rcutorture/bin/configcheck.sh
index eee31e261bf7..70fca318a82b 100755
--- a/tools/testing/selftests/rcutorture/bin/configcheck.sh
+++ b/tools/testing/selftests/rcutorture/bin/configcheck.sh
@@ -27,7 +27,7 @@ cat $1 > $T/.config
cat $2 | sed -e 's/\(.*\)=n/# \1 is not set/' -e 's/^#CHECK#//' |
awk '
-BEGIN {
+{
print "if grep -q \"" $0 "\" < '"$T/.config"'";
print "then";
print "\t:";
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-build.sh b/tools/testing/selftests/rcutorture/bin/kvm-build.sh
index 00cb0db2643d..c29f2ec0bf9f 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-build.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-build.sh
@@ -45,7 +45,7 @@ T=/tmp/test-linux.sh.$$
trap 'rm -rf $T' 0
mkdir $T
-grep -v 'CONFIG_[A-Z]*_TORTURE_TEST' < ${config_template} > $T/config
+grep -v 'CONFIG_[A-Z]*_TORTURE_TEST=' < ${config_template} > $T/config
cat << ___EOF___ >> $T/config
CONFIG_INITRAMFS_SOURCE="$TORTURE_INITRD"
CONFIG_VIRTIO_PCI=y
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 3b3c1b693ee1..50091de3a911 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -296,10 +296,7 @@ if test -d .git
then
git status >> $resdir/$ds/testid.txt
git rev-parse HEAD >> $resdir/$ds/testid.txt
- if ! git diff HEAD > $T/git-diff 2>&1
- then
- cp $T/git-diff $resdir/$ds
- fi
+ git diff HEAD >> $resdir/$ds/testid.txt
fi
___EOF___
awk < $T/cfgcpu.pack \
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST
index a3a1a05a2b5c..6a0b9f69faad 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST
+++ b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST
@@ -9,6 +9,8 @@ TREE08
TREE09
SRCU-N
SRCU-P
+SRCU-t
+SRCU-u
TINY01
TINY02
TASKS01
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-C.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-C.boot
new file mode 100644
index 000000000000..84a7d51b7481
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-C.boot
@@ -0,0 +1 @@
+rcutorture.torture_type=srcud
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N
index 1a087c3c8bb8..2da8b49589a0 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N
@@ -5,4 +5,4 @@ CONFIG_HOTPLUG_CPU=y
CONFIG_PREEMPT_NONE=y
CONFIG_PREEMPT_VOLUNTARY=n
CONFIG_PREEMPT=n
-CONFIG_RCU_EXPERT=y
+#CHECK#CONFIG_RCU_EXPERT=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P
index 4837430a71c0..ab7ccd38232b 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P
@@ -2,7 +2,11 @@ CONFIG_RCU_TRACE=n
CONFIG_SMP=y
CONFIG_NR_CPUS=8
CONFIG_HOTPLUG_CPU=y
+CONFIG_RCU_EXPERT=y
+CONFIG_RCU_FANOUT=2
+CONFIG_RCU_FANOUT_LEAF=2
CONFIG_PREEMPT_NONE=n
CONFIG_PREEMPT_VOLUNTARY=n
CONFIG_PREEMPT=y
-#CHECK#CONFIG_RCU_EXPERT=n
+CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_PROVE_LOCKING=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t
new file mode 100644
index 000000000000..6c78022c8cd8
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t
@@ -0,0 +1,10 @@
+CONFIG_SMP=n
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+#CHECK#CONFIG_TINY_SRCU=y
+CONFIG_RCU_TRACE=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_DEBUG_ATOMIC_SLEEP=y
+#CHECK#CONFIG_PREEMPT_COUNT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot
new file mode 100644
index 000000000000..238bfe3bd0cc
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot
@@ -0,0 +1 @@
+rcutorture.torture_type=srcu
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u
new file mode 100644
index 000000000000..6bc24e99862f
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u
@@ -0,0 +1,9 @@
+CONFIG_SMP=n
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+#CHECK#CONFIG_TINY_SRCU=y
+CONFIG_RCU_TRACE=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_PREEMPT_COUNT=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot
new file mode 100644
index 000000000000..84a7d51b7481
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot
@@ -0,0 +1 @@
+rcutorture.torture_type=srcud
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TINY02 b/tools/testing/selftests/rcutorture/configs/rcu/TINY02
index a59f7686e219..d8674264318d 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TINY02
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TINY02
@@ -6,10 +6,9 @@ CONFIG_PREEMPT=n
CONFIG_HZ_PERIODIC=y
CONFIG_NO_HZ_IDLE=n
CONFIG_NO_HZ_FULL=n
-CONFIG_RCU_TRACE=y
CONFIG_PROVE_LOCKING=y
-CONFIG_PROVE_RCU_REPEATEDLY=y
#CHECK#CONFIG_PROVE_RCU=y
CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_DEBUG_OBJECTS=y
CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
-CONFIG_PREEMPT_COUNT=y
+CONFIG_DEBUG_ATOMIC_SLEEP=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 b/tools/testing/selftests/rcutorture/configs/rcu/TREE01
index 359cb258f639..b5b53973c01e 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01
@@ -10,12 +10,9 @@ CONFIG_RCU_FAST_NO_HZ=y
CONFIG_RCU_TRACE=y
CONFIG_HOTPLUG_CPU=y
CONFIG_MAXSMP=y
+CONFIG_CPUMASK_OFFSTACK=y
CONFIG_RCU_NOCB_CPU=y
-CONFIG_RCU_NOCB_CPU_ZERO=y
CONFIG_DEBUG_LOCK_ALLOC=n
CONFIG_RCU_BOOST=n
CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
CONFIG_RCU_EXPERT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y
-CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot
index adc3abc82fb8..1d14e1383016 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot
@@ -1 +1,5 @@
rcutorture.torture_type=rcu_bh maxcpus=8
+rcutree.gp_preinit_delay=3
+rcutree.gp_init_delay=3
+rcutree.gp_cleanup_delay=3
+rcu_nocbs=0
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02 b/tools/testing/selftests/rcutorture/configs/rcu/TREE02
index c1ab5926568b..35e639e39366 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE02
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE02
@@ -18,9 +18,6 @@ CONFIG_RCU_NOCB_CPU=n
CONFIG_DEBUG_LOCK_ALLOC=y
CONFIG_PROVE_LOCKING=n
CONFIG_RCU_BOOST=n
-CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
CONFIG_RCU_EXPERT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y
-CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y
+CONFIG_DEBUG_OBJECTS=y
CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03 b/tools/testing/selftests/rcutorture/configs/rcu/TREE03
index 3b93ee544e70..2dc31b16e506 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03
@@ -14,9 +14,5 @@ CONFIG_RCU_FANOUT_LEAF=2
CONFIG_RCU_NOCB_CPU=n
CONFIG_DEBUG_LOCK_ALLOC=n
CONFIG_RCU_BOOST=y
-CONFIG_RCU_KTHREAD_PRIO=2
CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
CONFIG_RCU_EXPERT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y
-CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot
index 120c0c88d100..5d2cc0bd50a0 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot
@@ -1 +1,5 @@
rcutorture.onoff_interval=1 rcutorture.onoff_holdoff=30
+rcutree.gp_preinit_delay=3
+rcutree.gp_init_delay=3
+rcutree.gp_cleanup_delay=3
+rcutree.kthread_prio=2
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04
index 5af758e783c7..27d22695d64c 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04
@@ -15,11 +15,7 @@ CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
CONFIG_RCU_FANOUT=4
CONFIG_RCU_FANOUT_LEAF=3
-CONFIG_RCU_NOCB_CPU=n
CONFIG_DEBUG_LOCK_ALLOC=n
CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
CONFIG_RCU_EXPERT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y
-CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y
CONFIG_RCU_EQS_DEBUG=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05 b/tools/testing/selftests/rcutorture/configs/rcu/TREE05
index d4cdc0d74e16..2dde0d9964e3 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE05
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05
@@ -13,12 +13,8 @@ CONFIG_HOTPLUG_CPU=y
CONFIG_RCU_FANOUT=6
CONFIG_RCU_FANOUT_LEAF=6
CONFIG_RCU_NOCB_CPU=y
-CONFIG_RCU_NOCB_CPU_NONE=y
CONFIG_DEBUG_LOCK_ALLOC=y
CONFIG_PROVE_LOCKING=y
#CHECK#CONFIG_PROVE_RCU=y
CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
CONFIG_RCU_EXPERT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y
-CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot
index 15b3e1a86f74..c7fd050dfcd9 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot
@@ -1,2 +1,5 @@
rcutorture.torture_type=sched
rcupdate.rcu_self_test_sched=1
+rcutree.gp_preinit_delay=3
+rcutree.gp_init_delay=3
+rcutree.gp_cleanup_delay=3
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE06 b/tools/testing/selftests/rcutorture/configs/rcu/TREE06
index 4cb02bd28f08..05a4eec3f27b 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE06
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE06
@@ -18,8 +18,6 @@ CONFIG_RCU_NOCB_CPU=n
CONFIG_DEBUG_LOCK_ALLOC=y
CONFIG_PROVE_LOCKING=y
#CHECK#CONFIG_PROVE_RCU=y
+CONFIG_DEBUG_OBJECTS=y
CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
CONFIG_RCU_EXPERT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y
-CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot
index dd90f28ed700..ad18b52a2cad 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot
@@ -2,3 +2,6 @@ rcupdate.rcu_self_test=1
rcupdate.rcu_self_test_bh=1
rcupdate.rcu_self_test_sched=1
rcutree.rcu_fanout_exact=1
+rcutree.gp_preinit_delay=3
+rcutree.gp_init_delay=3
+rcutree.gp_cleanup_delay=3
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 b/tools/testing/selftests/rcutorture/configs/rcu/TREE07
index b12a3ea1867e..0f4759f4232e 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07
@@ -1,6 +1,5 @@
CONFIG_SMP=y
CONFIG_NR_CPUS=16
-CONFIG_CPUMASK_OFFSTACK=y
CONFIG_PREEMPT_NONE=y
CONFIG_PREEMPT_VOLUNTARY=n
CONFIG_PREEMPT=n
@@ -9,16 +8,11 @@ CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=n
CONFIG_NO_HZ_FULL=y
CONFIG_NO_HZ_FULL_ALL=n
-CONFIG_NO_HZ_FULL_SYSIDLE=y
CONFIG_RCU_FAST_NO_HZ=n
CONFIG_RCU_TRACE=y
CONFIG_HOTPLUG_CPU=y
CONFIG_RCU_FANOUT=2
CONFIG_RCU_FANOUT_LEAF=2
-CONFIG_RCU_NOCB_CPU=n
CONFIG_DEBUG_LOCK_ALLOC=n
CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
CONFIG_RCU_EXPERT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y
-CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08 b/tools/testing/selftests/rcutorture/configs/rcu/TREE08
index 099cc63c6a3b..fb1c763c10c5 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08
@@ -15,7 +15,6 @@ CONFIG_HIBERNATION=n
CONFIG_RCU_FANOUT=3
CONFIG_RCU_FANOUT_LEAF=2
CONFIG_RCU_NOCB_CPU=y
-CONFIG_RCU_NOCB_CPU_ALL=y
CONFIG_DEBUG_LOCK_ALLOC=n
CONFIG_PROVE_LOCKING=n
CONFIG_RCU_BOOST=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T b/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T
deleted file mode 100644
index 2ad13f0d29cc..000000000000
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T
+++ /dev/null
@@ -1,21 +0,0 @@
-CONFIG_SMP=y
-CONFIG_NR_CPUS=16
-CONFIG_PREEMPT_NONE=n
-CONFIG_PREEMPT_VOLUNTARY=n
-CONFIG_PREEMPT=y
-#CHECK#CONFIG_PREEMPT_RCU=y
-CONFIG_HZ_PERIODIC=n
-CONFIG_NO_HZ_IDLE=y
-CONFIG_NO_HZ_FULL=n
-CONFIG_RCU_FAST_NO_HZ=n
-CONFIG_RCU_TRACE=y
-CONFIG_HOTPLUG_CPU=n
-CONFIG_SUSPEND=n
-CONFIG_HIBERNATION=n
-CONFIG_RCU_FANOUT=3
-CONFIG_RCU_FANOUT_LEAF=2
-CONFIG_RCU_NOCB_CPU=y
-CONFIG_RCU_NOCB_CPU_ALL=y
-CONFIG_DEBUG_LOCK_ALLOC=n
-CONFIG_RCU_BOOST=n
-CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot
index fb066dc82769..1bd8efc4141e 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot
@@ -2,3 +2,4 @@ rcutorture.torture_type=sched
rcupdate.rcu_self_test=1
rcupdate.rcu_self_test_sched=1
rcutree.rcu_fanout_exact=1
+rcu_nocbs=0-7
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T b/tools/testing/selftests/rcutorture/configs/rcuperf/TINY
index 917d2517b5b5..fb05ef5279b4 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T
+++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TINY
@@ -1,21 +1,16 @@
-CONFIG_SMP=y
-CONFIG_NR_CPUS=8
-CONFIG_PREEMPT_NONE=n
+CONFIG_SMP=n
+CONFIG_PREEMPT_NONE=y
CONFIG_PREEMPT_VOLUNTARY=n
-CONFIG_PREEMPT=y
-#CHECK#CONFIG_PREEMPT_RCU=y
+CONFIG_PREEMPT=n
+#CHECK#CONFIG_TINY_RCU=y
CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=y
CONFIG_NO_HZ_FULL=n
CONFIG_RCU_FAST_NO_HZ=n
-CONFIG_RCU_TRACE=y
-CONFIG_HOTPLUG_CPU=n
-CONFIG_SUSPEND=n
-CONFIG_HIBERNATION=n
-CONFIG_RCU_FANOUT=3
-CONFIG_RCU_FANOUT_LEAF=3
CONFIG_RCU_NOCB_CPU=n
-CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_DEBUG_LOCK_ALLOC=n
CONFIG_PROVE_LOCKING=n
CONFIG_RCU_BOOST=n
CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
+CONFIG_RCU_TRACE=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE
index a312f671a29a..721cfda76ab2 100644
--- a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE
+++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE
@@ -7,7 +7,6 @@ CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=y
CONFIG_NO_HZ_FULL=n
CONFIG_RCU_FAST_NO_HZ=n
-CONFIG_RCU_TRACE=n
CONFIG_HOTPLUG_CPU=n
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54
index 985fb170d13c..7629f5dd73b2 100644
--- a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54
+++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54
@@ -8,7 +8,6 @@ CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=y
CONFIG_NO_HZ_FULL=n
CONFIG_RCU_FAST_NO_HZ=n
-CONFIG_RCU_TRACE=n
CONFIG_HOTPLUG_CPU=n
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
diff --git a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt
index 24396ae8355b..a75b16991a92 100644
--- a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt
+++ b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt
@@ -18,7 +18,6 @@ CONFIG_PROVE_RCU
In common code tested by TREE_RCU test cases.
-CONFIG_NO_HZ_FULL_SYSIDLE
CONFIG_RCU_NOCB_CPU
Meaningless for TINY_RCU.
diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
index 364801b1a230..9ad3f89c8dc7 100644
--- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
+++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
@@ -9,28 +9,20 @@ CONFIG_DEBUG_OBJECTS_RCU_HEAD -- Do one.
CONFIG_HOTPLUG_CPU -- Do half. (Every second.)
CONFIG_HZ_PERIODIC -- Do one.
CONFIG_NO_HZ_IDLE -- Do those not otherwise specified. (Groups of two.)
-CONFIG_NO_HZ_FULL -- Do two, one with CONFIG_NO_HZ_FULL_SYSIDLE.
-CONFIG_NO_HZ_FULL_SYSIDLE -- Do one.
+CONFIG_NO_HZ_FULL -- Do two, one with partial CPU enablement.
CONFIG_PREEMPT -- Do half. (First three and #8.)
CONFIG_PROVE_LOCKING -- Do several, covering CONFIG_DEBUG_LOCK_ALLOC=y and not.
CONFIG_PROVE_RCU -- Hardwired to CONFIG_PROVE_LOCKING.
-CONFIG_PROVE_RCU_REPEATEDLY -- Do one.
CONFIG_RCU_BOOST -- one of PREEMPT_RCU.
-CONFIG_RCU_KTHREAD_PRIO -- set to 2 for _BOOST testing.
CONFIG_RCU_FANOUT -- Cover hierarchy, but overlap with others.
CONFIG_RCU_FANOUT_LEAF -- Do one non-default.
-CONFIG_RCU_FAST_NO_HZ -- Do one, but not with CONFIG_RCU_NOCB_CPU_ALL.
-CONFIG_RCU_NOCB_CPU -- Do three, see below.
-CONFIG_RCU_NOCB_CPU_ALL -- Do one.
-CONFIG_RCU_NOCB_CPU_NONE -- Do one.
-CONFIG_RCU_NOCB_CPU_ZERO -- Do one.
+CONFIG_RCU_FAST_NO_HZ -- Do one, but not with all nohz_full CPUs.
+CONFIG_RCU_NOCB_CPU -- Do three, one with no rcu_nocbs CPUs, one with
+ rcu_nocbs=0, and one with all rcu_nocbs CPUs.
CONFIG_RCU_TRACE -- Do half.
CONFIG_SMP -- Need one !SMP for PREEMPT_RCU.
CONFIG_RCU_EXPERT=n -- Do a few, but these have to be vanilla configurations.
CONFIG_RCU_EQS_DEBUG -- Do at least one for CONFIG_NO_HZ_FULL and not.
-CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP -- Do for all but a couple TREE scenarios.
-CONFIG_RCU_TORTURE_TEST_SLOW_INIT -- Do for all but a couple TREE scenarios.
-CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT -- Do for all but a couple TREE scenarios.
RCU-bh: Do one with PREEMPT and one with !PREEMPT.
RCU-sched: Do one with PREEMPT but not BOOST.
@@ -52,10 +44,6 @@ CONFIG_64BIT
Used only to check CONFIG_RCU_FANOUT value, inspection suffices.
-CONFIG_NO_HZ_FULL_SYSIDLE_SMALL
-
- Defer until Frederic uses this.
-
CONFIG_PREEMPT_COUNT
CONFIG_PREEMPT_RCU
@@ -78,30 +66,16 @@ CONFIG_RCU_TORTURE_TEST_RUNNABLE
Always used in KVM testing.
-CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY
-CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY
-CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY
-
- Inspection suffices, ignore.
-
CONFIG_PREEMPT_RCU
CONFIG_TREE_RCU
CONFIG_TINY_RCU
These are controlled by CONFIG_PREEMPT and/or CONFIG_SMP.
-CONFIG_SPARSE_RCU_POINTER
-
- Makes sense only for sparse runs, not for kernel builds.
-
CONFIG_SRCU
CONFIG_TASKS_RCU
Selected by CONFIG_RCU_TORTURE_TEST, so cannot disable.
-CONFIG_RCU_TRACE
-
- Implied by CONFIG_RCU_TRACE for Tree RCU.
-
boot parameters ignored: TBD
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/modify_srcu.awk b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/modify_srcu.awk
index 8ff89043d0a9..c9e8bc5082a7 100755
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/modify_srcu.awk
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/modify_srcu.awk
@@ -1,4 +1,4 @@
-#!/bin/awk -f
+#!/usr/bin/awk -f
# Modify SRCU for formal verification. The first argument should be srcu.h and
# the second should be srcu.c. Outputs modified srcu.h and srcu.c into the
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index a8d540398bbd..9120edf3c94b 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -184,7 +184,7 @@ int __attribute__((weak)) kvm_arch_set_irq_inatomic(
* Called with wqh->lock held and interrupts disabled
*/
static int
-irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
+irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
struct kvm_kernel_irqfd *irqfd =
container_of(wait, struct kvm_kernel_irqfd, wait);